
Crawling every article from a school's official website

Posted: 2020-07-05 19:48:42

Step 1: import the packages

import requests
from lxml import etree
from threading import Thread
from queue import Queue
import time
import redis
import re
from bs4 import BeautifulSoup
import tqdm
import os

Set up the Redis connection

pool = redis.ConnectionPool(host="localhost", port=6379, db=num, password="psd")  # num: the Redis database index (value not shown in the post)
redis = redis.StrictRedis(connection_pool=pool)  # rebinds the name redis to the client; the hset/hget calls below go through it
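
Note that StrictRedis hands back raw bytes unless decode_responses=True is passed to the pool, which is why the methods below call .decode("utf8") on everything read out of Redis. A tiny illustration (the key and value here are made up):

redis.hset("url_title", "some_url", "['example title']")
print(redis.hget("url_title", "some_url"))                 # b"['example title']"  (bytes, not str)
print(redis.hget("url_title", "some_url").decode("utf8"))  # ['example title']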

The initialization method

    def __init__(self, url):
        self.index_urls = set()      # first-level category links found on the home page
        self.two_index_urls = set()  # second-level category links
        self.url_title = {}
        self.base_url = url          # site root the crawler starts from
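
The post only shows the methods; presumably they all sit inside a single crawler class. A minimal sketch of that wrapper (the class name SchoolSpider is an assumption, not from the original):

class SchoolSpider:
    """Crawls the category pages of the school site and stores everything in Redis."""
    # __init__ above and get_head_index_url(), get_two_index_url(), ...,
    # run() below are all assumed to be methods of this class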

Get the category links from the home page

    def get_head_index_url(self):
        url = self.base_url
        response = requests.get(url)
        html = response.text
        page = etree.HTML(html)
        contents = page.xpath('//a[contains(@href, "Category_")]/@href')
        for i in contents:
            self.index_urls.add(i)

Get the second-level links under those first-level links

    def get_two_index_url(self):
        for i in self.index_urls:
            url = self.base_url + i
            response = requests.get(url)
            html = response.text
            page = etree.HTML(html)
            contents = page.xpath('//a[contains(@href, "Category_")]/@href')
            for href in contents:
                self.two_index_urls.add(href)
        self.index_urls |= self.two_index_urls

Fetch each collected link and record which ones are usable

Each link is requested and the article titles on it (extracted with //em/a/text()) are stored in the Redis hash url_title; links with no titles are weeded out in the next step.

    def get_all_url_title_redis(self):
        for i in self.index_urls:
            url = self.base_url[:-1] + i
            try:
                response = requests.get(url, timeout=5)
                time.sleep(0.5)
                html = response.text
                page = etree.HTML(html)
                title = page.xpath('//em/a/text()')
                redis.hset("url_title", url, str(title))
                print(url, "over")
            except Exception as e:
                print(e)
                print("{}  get wrong!".format(url))

Extract the usable links and expand their pagination

An entry whose stored title is the two-character string "[]" had no articles and is skipped. For each remaining link, the page count shown in the site's "共...页" pager is read and every Index_N.aspx page URL is generated.

    def get_all_url_from_redis_set(self):
        urls = redis.hkeys("url_title")
        for i in urls:
            # a stored title of "[]" (length 2) means the page had no articles
            if len(redis.hget("url_title", i)) != 2:
                redis.hset("can_use_urls", i.decode("utf8"), redis.hget("url_title", i))
                print("set {} ok!".format(i.decode("utf8")))

    def get_all_split_url_to_redis(self):
        all_page_num = 0
        for i in redis.hkeys("can_use_urls"):
            all_page_num += 1
            head_url = i.decode("utf8")
            print(head_url)
            base_url = head_url[:len(head_url) - len("Index.aspx")]
            modol_url = base_url + "Index_{}" + ".aspx"
            response = requests.get(head_url, timeout=5)
            time.sleep(0.5)
            html = response.text
            page = etree.HTML(html)
            url_details = page.xpath('//span[@class="disabled"]/text()')
            if not url_details:
                continue
            max_page = re.search("/共(.*?)页", str(url_details)).group(1)
            urls = [head_url]
            for i in range(2, int(max_page) + 1):
                urls.append(modol_url.format(i))
                all_page_num += 1
            redis.hset("all_urls", head_url, str(urls))
        print("all page :{}".format(all_page_num))
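
A quick illustration of the pagination pattern modol_url builds above (the category path here is hypothetical):

head_url = "http://www.lzlqc.com/xwzx/Index.aspx"        # hypothetical category front page
base_url = head_url[:len(head_url) - len("Index.aspx")]  # "http://www.lzlqc.com/xwzx/"
modol_url = base_url + "Index_{}" + ".aspx"
print(modol_url.format(2))                               # http://www.lzlqc.com/xwzx/Index_2.aspx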

From the links collected earlier, gather the link of every article page

    def get_all_pag_url_to_redis(self):
        values = redis.hkeys("all_urls")
        urls = set()
        page_num = 0
        urls_num = 0
        for url in values:
            url = url.decode("utf8")
            split_urls = redis.hget("all_urls", url).decode("utf8")
            for i in eval(split_urls):
                try:
                    response = requests.get(i, timeout=5)
                    time.sleep(0.5)
                    html = response.text
                    page = etree.HTML(html)
                    page_urls = page.xpath("//li/a[contains(@href, 'Item')]/@href")
                    for page_url in page_urls:
                        urls.add(page_url)
                        print("{} add over".format(page_url))
                        urls_num += 1
                    print("{} already get all url".format(i))
                except Exception as e:
                    print(e)
                    print(i)
                    print(url)
                    continue
                page_num += 1

        print("{} page get!".format(page_num))
        print("{} url get!".format(urls_num))
        url_s = ""
        for i in urls:
            url_s += "," + i
            print(i)
        redis.hset("all_splite_url", str(urls), url_s)

Fetch every article page from the links just collected and save its content

For each page, the title, author, publish date, click count, and body text are parsed out, written to a text file under a folder tree that mirrors the site's breadcrumb, and also stored in the Redis hash contents.

    def get_all_conten(self):
        urls = redis.hvals("all_splite_url")
        urls = urls[0].decode("utf8").split(",")
        base_url = "http://www.lzlqc.com"
        all_page = 0
        get_page = 0
        for ur in tqdm.tqdm(urls):
            url = base_url + ur
            try:
                response = requests.get(url, timeout=5)
                time.sleep(0.5)
                html = response.text
                page = etree.HTML(html)
                path = page.xpath('//em/a/text()|//em/text()')
                clict_num = 0
                path_s = "\\"
                path_s += "".join([i + "\\" for i in path])
                soup = BeautifulSoup(html, "html.parser")
                title = soup.find(name="div", attrs={"class": "article_infoTitle"}).find(name="span").find(
                    name="font").string
                author = soup.find(name="div", attrs={"class": "article_info"}).find(
                    name="span").find(name="font")
                author = str(author)
                release_time = re.search("发布时间:(.*?日)", author).group(1)
                author = re.search(">(.*?点击数:)", author).group(1)
                content = soup.find(name="div", attrs="article_content_list")
                content = re.sub("<[^>]+>", "", str(content))
                clict = requests.get(
                    base_url + page.xpath('//div[@class="article_info"]/span/font/script/@src')[0]).text
                clict_num = re.findall("'(.*?)'", clict)[0]
                author += clict_num
                abspath = os.getcwd()
                abspath_s = abspath + "\\gets" + path_s
                # print(abspath_s[:-1])
                if os.path.isdir(abspath_s[:-1]):
                    pass
                    # print(abspath_s[:-1])
                else:
                    os.makedirs(abspath_s[:-1])
                # print(path_s)
                file_name = release_time + "-----" + title
                with open(abspath_s + file_name + ".txt", "a", encoding="utf8") as p:
                    p.write(title + "\n")
                    p.write(author)
                    p.write(content)
                    p.write("Change Time:{}".format(time.asctime()))
                redis.hset("contents", str(url), title + author + content)
                get_page += 1
                # print(abspath_s + title)
            except Exception as e:
                print(e)
                print("url :{} get some wrong!!!!!!!!".format(url))
                with open("wrong.txt", "a", encoding="utf8") as P:
                    P.write(url + "\n")
                all_page += 1
                continue
        print("{} all page num".format(all_page))
        print("{} get page num".format(get_page))

Tie everything together in a run() method

    def run(self):
        self.get_head_index_url()
        self.get_two_index_url()
        self.get_all_url_title_redis()
        self.get_all_url_from_redis_set()
        self.get_all_split_url_to_redis()
        self.get_all_pag_url_to_redis()  # collects the article links that get_all_conten reads back from all_splite_url
        self.get_all_conten()
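
A minimal usage sketch, assuming the class wrapper suggested earlier; the trailing slash matches how base_url is concatenated in get_two_index_url and sliced in get_all_url_title_redis:

if __name__ == "__main__":
    spider = SchoolSpider("http://www.lzlqc.com/")  # class name and constructor argument are assumptions
    spider.run()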

Since no proxy IPs were available, multithreading was not used, so the crawl is slow.

It took roughly three hours to pull a little over 12,000 pages.
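
For reference, the unused Thread and Queue imports could drive a simple worker pool like the sketch below; this is not part of the original crawler, and without proxy IPs it would only hit the site harder:

def fetch_worker(task_queue, results):
    # pull URLs off the queue until a None sentinel arrives
    while True:
        url = task_queue.get()
        if url is None:
            break
        try:
            results[url] = requests.get(url, timeout=5).text
        except Exception as e:
            print("{} failed: {}".format(url, e))

def fetch_all(urls, worker_count=4):
    # uses requests, Thread, and Queue already imported at the top of the script
    task_queue = Queue()
    results = {}
    workers = [Thread(target=fetch_worker, args=(task_queue, results)) for _ in range(worker_count)]
    for w in workers:
        w.start()
    for url in urls:
        task_queue.put(url)
    for _ in workers:
        task_queue.put(None)  # one sentinel per worker so every loop exits
    for w in workers:
        w.join()
    return results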

 


Original post: https://www.cnblogs.com/argos/p/13250300.html
