1. Gushiwen (gushiwen.org) poetry crawler
import requests, re

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
}

my_poetic_list = []

def get_poetics(my_url):
    text = requests.get(url=my_url, headers=headers).text
    # Poem title: the <b> inside each <div class="cont"> block
    titles = re.findall(r'<div class="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    # Text of the first link in each poem's "source" line
    years = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    # Text of the second link in the "source" line
    potes = re.findall(r'<p class="source">.*?<a.*?>.*?</a>.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    # Poem body: the inner HTML of <div class="contson">
    poetic = re.findall(r'<div class="contson".*?>(.*?)</div>', text, re.DOTALL)
    poetic_list = []
    for i in poetic:
        # Strip leftover tags, newlines and full-width spaces from the body
        i = re.sub(r"<.*?>", '', i).replace("\n", '').replace("\u3000", '')
        poetic_list.append(i)
    for key, value in enumerate(titles):
        my_poetic = {}
        my_poetic["title"] = titles[key]
        my_poetic["year"] = years[key]
        my_poetic["pote"] = potes[key]
        my_poetic["poetics"] = poetic_list[key]
        my_poetic_list.append(my_poetic)

if __name__ == '__main__':
    for i in range(0, 11):
        url = "https://www.gushiwen.org/default_{}.aspx".format(i)
        get_poetics(url)
    for i in my_poetic_list:
        print(i)
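To sanity-check what the patterns above capture, here is a small standalone test against a hand-written fragment shaped like the selectors the script targets. The fragment is only an illustration and is not guaranteed to match the real gushiwen.org markup:

import re

sample = '''
<div class="cont">
  <p><a href="#"><b>静夜思</b></a></p>
  <p class="source"><a href="#">李白</a><a href="#">唐代</a></p>
  <div class="contson" id="c1">床前明月光，<br/>疑是地上霜。</div>
</div>
'''

# Same patterns as in get_poetics, applied to the toy fragment
print(re.findall(r'<div class="cont">.*?<b>(.*?)</b>', sample, re.DOTALL))      # ['静夜思']
print(re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', sample, re.DOTALL))   # ['李白']
print(re.findall(r'<div class="contson".*?>(.*?)</div>', sample, re.DOTALL))    # body, <br/> still inside

The last result still contains the <br/> tag; in the full script those leftovers are removed by the re.sub(r"<.*?>", '', ...) cleanup step.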
2. Qiushibaike (QSBK) jokes example
import re, requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
}

my_lovehhy = []

def get_acticles(url):
    text = requests.get(url=url, headers=headers).text
    # Post title: the link text inside each <h3>
    titles = re.findall(r'<h3.*?><a.*?>(.*?)</a>', text, re.DOTALL)
    # Post body: the contents of <div id="endtext">
    articles = re.findall(r'<div id="endtext">(.*?)</div>', text, re.DOTALL)
    for key, article in enumerate(articles):
        # Strip tags and full-width spaces from the body
        article = re.sub(r"<.*?>", '', article).replace("\u3000", '')
        lovehhy = {}
        lovehhy["title"] = titles[key]
        lovehhy["content"] = article
        my_lovehhy.append(lovehhy)

if __name__ == '__main__':
    for i in range(10):
        url = "http://www.lovehhy.net/Joke/Detail/QSBK/{}".format(i)
        get_acticles(url)
    for i in my_lovehhy:
        print(i)
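Printing each dict is enough for a quick check; if you want to keep the scraped results, one option is to dump the list to a JSON file at the end of the __main__ block (the filename lovehhy.json is just an example):

import json

with open("lovehhy.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps the Chinese text readable in the output file
    json.dump(my_lovehhy, f, ensure_ascii=False, indent=2)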
Original post: https://www.cnblogs.com/win0211/p/12091295.html