"""Scrape book listings from Douban book tags and save them to a CSV file.

For each tag in the first row of the tag table on
https://book.douban.com/tag/ the script fetches the first two result pages
(20 books per page, ~480 books total for the test run), parses
title / author / publish date / price / rating / blurb, and writes all
records to one CSV file.
"""
import os  # kept: reserved for the planned per-category file feature (original note)
import re
import csv
import random
import time

import requests
from bs4 import BeautifulSoup

TAG_INDEX_URL = 'https://book.douban.com/tag/'
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36')
# Keeps only the leading numeric part of a price string such as "39.50元".
PRICE_RE = re.compile(r'(\d+\.\d{0,3}).*')
# CSV columns: title, author, publish date, price, rating, blurb.
FIELDNAMES = ['书名', '作者', '上市时间', '价格', '书籍评分', '内容简介']


def fetch_tag_links():
    """Return absolute category URLs from the first row of the tag table."""
    response = requests.get(TAG_INDEX_URL, headers={'User-Agent': USER_AGENT})
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    anchors = soup.select(
        '#content > div > div.article > div:nth-child(2) > div > table '
        '> tbody > tr:nth-child(1) > td > a')
    return ['https://book.douban.com' + a['href'] for a in anchors]


def parse_book(name_tag, detail_tag, score_tag, brief_tag):
    """Build one record dict from the four parsed tags.

    Raises:
        IndexError: when the pub line or the price does not match the
            expected "author / publisher / date / price" layout; callers
            skip such entries.
    """
    title = ''.join(name_tag.get_text().split())  # remove ALL whitespace
    parts = detail_tag.get_text().split('/')
    author = parts[0].strip()
    pub_date = parts[-2].strip()  # IndexError if fewer than two fields
    price = PRICE_RE.findall(parts[-1].strip())[0]  # IndexError if no digits
    return {
        '书名': title,
        '作者': author,
        '上市时间': pub_date,
        '价格': price,
        '书籍评分': score_tag.get_text(),
        '内容简介': brief_tag.get_text(),
    }


def scrape_tag(url):
    """Yield book records from the first two result pages of one tag."""
    headers = {
        'Referer': 'https://www.baidu.com/',
        'User-Agent': USER_AGENT,
    }
    for start in range(0, 40, 20):  # pages 1 and 2 (20 books each)
        page = requests.get(url, params={'start': start, 'type': 'T'},
                            headers=headers)
        page.encoding = 'utf-8'
        soup = BeautifulSoup(page.text, 'html.parser')
        names = soup.select('#subject_list > ul > li > div.info > h2 > a')
        details = soup.select('#subject_list > ul > li > div.info > div.pub')
        scores = soup.select('#subject_list > ul > li > div.info '
                             '> div.star.clearfix > span.rating_nums')
        briefs = soup.select('#subject_list > ul > li > div.info > p')
        for tags in zip(names, details, scores, briefs):
            try:
                record = parse_book(*tags)
            except IndexError as exc:
                print('IndexError:', exc)  # malformed entry — skip it
                continue
            print(record['书名'])
            yield record
            time.sleep(random.random() * 3)  # random pause to be polite


def write_csv(records, path='flieName.csv'):
    """Write all records to *path*.

    NOTE: the original output name is kept ("flieName" looks like a typo
    of "fileName") so downstream consumers keep working.
    newline='' prevents blank rows on Windows (csv module requirement).
    """
    with open(path, 'w', newline='', encoding='utf-8', errors='ignore') as f:
        writer = csv.DictWriter(f, FIELDNAMES)
        writer.writeheader()
        writer.writerows(records)


def main():
    """Collect books from every first-row tag, then write them once."""
    books = []
    for link in fetch_tag_links():
        books.extend(scrape_tag(link))
    write_csv(books)


if __name__ == '__main__':
    main()
# TODO: add a rotating proxy (IP) pool to avoid rate limiting — not yet implemented
# Source: https://www.cnblogs.com/yueyuecong/p/11491085.html