首页 > 其他 > 详细

柯林斯

时间:2020-05-08 11:26:18      阅读:52      评论:0      收藏:0      [点我收藏+]

原文地址:https://www.cnblogs.com/dylan9/p/9207366.html

python代码:

 1 # 关于线程以及进程的使用
 2 #文件名:sample.py
 3 import time
 4 
 5 import requests
 6 from lxml import etree
 7 from multiprocessing.dummy import Pool
 8 headers = {
 9     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
10 }
11 
12 # url = "https://www.collinsdictionary.com/zh/browse/english/"
13 #
14 # page_text = requests.get(url=url, headers=headers).text
15 #
16 # tree = etree.HTML(page_text)
17 #
18 # li_list = tree.xpath("//ul[@class=‘bLtr‘]/li/a/@href")[1:]
19 pool = Pool(20)
20 
21 li_list = [https://www.collinsdictionary.com/zh/browse/english/words-starting-with-a, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-b, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-c, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-d, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-e, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-f, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-g, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-h, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-i, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-j, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-k, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-l, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-m, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-n, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-o, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-p, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-q, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-r, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-s, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-t, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-u, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-v, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-w, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-x, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-y, https://www.collinsdictionary.com/zh/browse/english/words-starting-with-z]
22 
23 # li_list = ["https://www.collinsdictionary.com/zh/browse/english/words-starting-with-a"]
24 
25 deep_url_list = []
26 
27 start = time.time()
28 
29 def get_urls(url):
30     page_text2 = requests.get(url=url, headers=headers).text
31     tree2 = etree.HTML(page_text2)
32     url_list = tree2.xpath("//ul[@class=‘columns2 bL‘]/li/a/@href")
33     deep_url_list.extend(url_list)
34 
35 
36 def get_data(url):
37     page_text3 = requests.get(url=url, headers=headers).text
38     tree3 = etree.HTML(page_text3)
39     data_li_list = tree3.xpath("//ul[@class=‘columns2 bL‘]/li")
40     for li in data_li_list:
41         data = li.xpath(./a/text())[0]
42         with open("word2.txt", "a", encoding="utf-8") as f:
43             f.write(data + \n)
44 
45 
46 pool.map(get_urls, li_list)
47 result = pool.map_async(get_data, deep_url_list)
48 result.wait()
49 print("执行完毕")
50 print("耗时:", time.time()-start)

Windows下安装Python,安装pip、安装requests包,结果没有用

 1 import re
 2 import random
 3 import requests
 4 
 5 from bs4 import BeautifulSoup
 6 from concurrent.futures import ThreadPoolExecutor
 7 from multiprocessing import cpu_count
 8 
 9 # ------------------------- 制作英文词典 --------------------------------------
10 
11 rex = re.compile(r[-&()/\.]+)
12 
13 
14 def bar(url):
15     response = requests.get(url=url)
16     soup = BeautifulSoup(response.text, html.parser)
17     ul_obj = soup.find(name=ul, attrs={class, columns2 browse-list})
18     return ul_obj.find_all(name=a)
19 
20 
21 def worker(url):
22     """
23         拿到具体的连接,https://www.collinsdictionary.com/browse/english/words-starting-with-a
24         如上链接,是所有以a开头的单词集合
25     """
26     a_list = bar(url=https://www.collinsdictionary.com/browse/english/words-starting-with-{}.format(url[0]))
27     for item in a_list:
28         for i in bar(item.get(href)):
29             res = i.text
30             if not re.findall(rex, res) and len(res) > 2:
31                 print(res)
32                 url[1].write({}\n.format(res))
33 
34 
35 def spider_collins():
36     """
37         爬取柯林斯网站所有的单词,链接深度共三层,
38         第一层获取24个字母的连接,
39         第二层获取以字母开头的所有短语或单词,
40         第三层,就是具体的一个个单词了
41     """
42     f = open(w.txt, a, encoding=utf8)
43     t = ThreadPoolExecutor(cpu_count() * 5)
44     for i in range(ord(a), ord(z) + 1):  # 97 ~ 122
45         t.submit(worker, (chr(i), f))
46         # break
47     t.shutdown()
48     f.close()

差不多一个意思吧,还要消化下

pip install requests 

等待系统自动加载安装。 

柯林斯

原文:https://www.cnblogs.com/guochaoxxl/p/12848973.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!