1,从 https://pvp.qq.com/web201605/js/herolist.json 获取英雄描述文件herolist.json
2,假定英雄类型标识如下
hero_type =["全部","战士","法师","坦克","刺客","射手","辅助"]
3,从 https://pvp.qq.com/web201605/herolist.shtml 获取英雄网页列表,并且提取英雄名称及头像链接
4,从网页提取英雄名称和头像链接信息
5,合并两个信息并构建英雄详细信息数据列表,例如
[['嫦娥', '法师|战士|坦克', '寒月公主|露花倒影', 'game.gtimg.cn/images/yxzj/img201606/heroimg/515/515.jpg'], ['上官婉儿', '法师|全部|刺客', '惊鸿之笔|修竹墨客', 'game.gtimg.cn/images/yxzj/img201606/heroimg/513/513.jpg']]
6,构建易于搜索的数据结构,并设计函数,实现搜索函数
  def lookup(index,keyword):
          pass
import json
from bs4 import BeautifulSoup as bs
import requests
# Download the hero description file (herolist.json) from the official site.
r = requests.get('https://pvp.qq.com/web201605/js/herolist.json')
# Fail on an HTTP error status instead of parsing/caching an error page.
r.raise_for_status()
# Parse once up front so a non-JSON response raises here, before it is cached.
hero_list = json.loads(r.text)
# Discard the parsed value; hero_list is repopulated from the cache file below.
hero_list = None

# Save: write the raw response text to a local cache file.
with open("all_hero.json", 'wt', encoding="utf-8") as fd:
    fd.write(r.text)

# Restore: reload the hero list from the cached file.
with open('all_hero.json', encoding="utf-8") as json_data:
    hero_list = json.load(json_data)
def search_for_hero_info(name=None):
    """Return the first hero record whose "cname" equals *name*.

    Scans the module-level ``hero_list`` in order; records that have no
    "cname" key are ignored. Returns None when no record matches.
    """
    matches = (
        record
        for record in hero_list
        if "cname" in record and record["cname"] == name
    )
    return next(matches, None)
# Display names for hero types; build_hero_type() indexes this list with the
# numeric type fields of a hero record.
hero_type =["全部","战士","法师","坦克","刺客","射手","辅助"]
from selenium import webdriver

# NOTE(review): newer Selenium releases take the driver path via a Service
# object rather than a positional argument — confirm against the installed version.
browser = webdriver.Chrome('./chromedriver')
try:
    browser.get("https://pvp.qq.com/web201605/herolist.shtml")
    html = browser.page_source
finally:
    # Always shut the browser down, even if loading the page fails.
    browser.quit()
# Save the page HTML to a local file.
with open("hero_web.html", 'w', encoding="utf-8") as fd:
    fd.write(html)
# Restore the saved HTML from the local file.
hero_html = None
with open("hero_web.html", 'r', encoding="utf-8") as fd:
    hero_html = fd.read()
def build_hero_type(hero):
    """Build a '|'-separated string of the hero's type names.

    Checks the "hero_type", "new_type" and "hero_type2" fields of *hero*, in
    that order. Each field that is present is used as an index into the
    module-level ``hero_type`` list, and the resulting names are joined
    with '|'.

    Args:
        hero: a hero record (mapping) holding the numeric type fields.

    Returns:
        The joined type names, or an empty string when none of the three
        fields is present.
    """
    type_fields = ("hero_type", "new_type", "hero_type2")
    return '|'.join(
        hero_type[hero[field]] for field in type_fields if field in hero
    )
#[hero_name, hero_type, hero_skin, hero_url]
def merge_hero_info(hero_html, hero_json):
    """Merge the heroes scraped from the web page with their JSON details.

    Args:
        hero_html: iterable of ``[hero_name, hero_url]`` pairs.
        hero_json: the parsed hero list. NOTE(review): not read here — the
            details are looked up through search_for_hero_info(), which uses
            the module-level hero list.

    Returns:
        A list with one ``[hero_name, hero_type, hero_skin, hero_url]`` entry
        per item of *hero_html*. Type and skin are empty strings when no
        details are found for the hero; skin is empty when the record has no
        "skin_name" field.
    """
    all_heros = []
    for hero in hero_html:
        name, url = hero[0], hero[1]
        hero_detail = search_for_hero_info(name)
        if hero_detail is None:
            # No JSON record for this name: keep the hero (still searchable
            # by name) instead of failing on the missing details.
            all_heros.append([name, "", "", url])
            continue
        # Trim stray newlines and single quotes around the skin names.
        skin = hero_detail.get("skin_name", "").strip("\n'")
        all_heros.append([name, build_hero_type(hero_detail), skin, url])
    return all_heros
# Parse the hero list page and locate the <ul class="herolist"> element.
hero_soup = bs(html, 'lxml')
hero_html_list = hero_soup.find("ul", class_="herolist")
all_hero_list = hero_html_list.find_all("li")
# One [name, avatar_url] pair per <li>; "/" is stripped from both ends of the img src.
gen_heros = [[info.text, info.img["src"].strip("/")] for info in all_hero_list]
# Combine the scraped pairs with the JSON details.
combined_heros = merge_hero_info(gen_heros, hero_list)
## build up index
add_to_index
```
index is
[
?
[<keyword>,[<hero_detail>,...]],
[<keyword>,[<hero_detail>,...]]...
?
]
keyword string
a url
```
[
[unit,factor],[]
]
?
[
?
[keyword, [
['百里玄策', '刺客|全部', '嚣狂之镰|威尼斯狂欢', 'http://game.gtimg.cn/images/yxzj/img201606/heroimg/195/195.jpg'],
['百里玄策', '刺客|全部', '嚣狂之镰|威尼斯狂欢', 'http://game.gtimg.cn/images/yxzj/img201606/heroimg/195/195.jpg'],
['百里玄策', '刺客|全部', '嚣狂之镰|威尼斯狂欢', 'http://game.gtimg.cn/images/yxzj/img201606/heroimg/195/195.jpg']]],
?
]
## use list for storage
['百里玄策', '刺客|全部', '嚣狂之镰|威尼斯狂欢', 'http://game.gtimg.cn/images/yxzj/img201606/heroimg/195/195.jpg']
?
?
add_to_index
index is [
[<keyword>,[<hero_detail>,...]], [<keyword>,[<hero_detail>,...]]...
] keyword string a url
[ [unit,factor],[] ]
[
[keyword, [ ['百里玄策', '刺客|全部', '嚣狂之镰|威尼斯狂欢', 'http://game.gtimg.cn/images/yxzj/img201606/heroimg/195/195.jpg'], ['百里玄策', '刺客|全部', '嚣狂之镰|威尼斯狂欢', 'http://game.gtimg.cn/images/yxzj/img201606/heroimg/195/195.jpg'], ['百里玄策', '刺客|全部', '嚣狂之镰|威尼斯狂欢', 'http://game.gtimg.cn/images/yxzj/img201606/heroimg/195/195.jpg']]],
]
['百里玄策', '刺客|全部', '嚣狂之镰|威尼斯狂欢', 'http://game.gtimg.cn/images/yxzj/img201606/heroimg/195/195.jpg']
def get_keywords_array(hero):
    """Return the list of search keywords for one hero entry.

    Args:
        hero: ``[hero_name, hero_type, hero_skin, hero_url]``.

    Returns:
        The hero name followed by the individual type names and skin names
        (the type and skin fields are split on '|'). Empty or None fields
        contribute nothing; the URL is not used as a keyword.
    """
    name, types, skins = hero[0], hero[1], hero[2]
    keywords = []
    if name:
        keywords.append(name)
    if types:
        keywords += types.split('|')
    if skins:
        keywords += skins.split('|')
    return keywords
def add_to_index(index, keyword, info):
    """
    Record *info* under *keyword* in the search index.

    The index is a list of ``[keyword, [info, ...]]`` pairs. When a pair for
    *keyword* already exists, *info* is appended to its list; otherwise a new
    pair is appended to the end of the index. The index is modified in place.
    """
    existing = next((pair for pair in index if pair[0] == keyword), None)
    if existing is None:
        # First time this keyword is seen: start a new pair for it.
        index.append([keyword, [info]])
    else:
        existing[1].append(info)
def build_up_index(index_array):
    """
    Populate *index_array* with search entries for every hero.

    Walks the module-level ``combined_heros`` list, derives each hero's
    keywords with get_keywords_array(), and registers the hero under every
    keyword through add_to_index(). *index_array* is modified in place and
    nothing is returned.
    """
    for hero_info in combined_heros:
        for key in get_keywords_array(hero_info):
            add_to_index(index_array, key, hero_info)
# lookup information by keywords
def lookup(index,keyword):
    """
    Look up *keyword* in the search index.

    Args:
        index: list of ``[keyword, [hero_detail, ...]]`` pairs.
        keyword: the exact keyword string to search for.

    Returns:
        The list of hero details stored under *keyword*, or an empty list
        when the keyword is not in the index (including an empty index).
    """
    for entry in index:
        if entry[0] == keyword:
            return entry[1]
    # Not found: report "no results" rather than the last keyword scanned,
    # which also avoids touching an unbound loop variable on an empty index.
    return []
# Build the search index, then run a sample lookup by hero name.
search_index=[]
build_up_index(search_index)
lookup(search_index,"苏烈")
原文:https://www.cnblogs.com/Lilwhat/p/12431056.html