爬虫小练习：分别用面向对象和函数式编程两种写法爬取豆瓣电影

时间：2019-09-10 23:01:27 阅读：132 评论：0 收藏：0 [点我收藏+]

 class Grasp:
     """Scrape the Douban Top 250 movie list and export it to an ``.xls`` file.

     Creating an instance downloads every list page immediately; ``run`` then
     writes the collected data with ``xlwt``.

     The names ``urlopen``, ``et`` and ``xlwt`` must be available at module
     level (presumably ``urllib.request.urlopen``, ``lxml.etree`` and the
     ``xlwt`` package -- the imports are not part of this listing; confirm).

     Attributes:
         pages: One ``(res, dicr, cri, score)`` tuple of raw XPath results per
             downloaded page, in download order.
         url, html, htmlobj, res, dicr, cri, score: State of the most recently
             downloaded page.
     """

     # Number of list pages to request.
     PAGE_COUNT = 10
     # Step of the ``start`` query parameter between two pages.
     PAGE_SIZE = 25
     # Cell text used when a film has no quote.
     MISSING_QUOTE = '没有介绍'
     # Where ``run`` saves the workbook.
     OUTPUT_PATH = './豆瓣电影/movie.xls'

     def __init__(self):
         """Download every list page and keep its raw XPath results."""
         # Each page is appended here; the per-page attributes assigned in the
         # loop are overwritten on every iteration and only describe the last
         # page once the loop has finished.
         self.pages = []
         for i in range(self.PAGE_COUNT):
             self.url = f"https://movie.douban.com/top250?start={self.PAGE_SIZE * i}&filter="
             # Close the HTTP response as soon as the body has been read.
             with urlopen(self.url) as response:
                 self.html = response.read().decode()
             self.htmlobj = et.HTML(self.html)
             # Film titles.
             self.res = self.htmlobj.xpath("//div[@class ='hd']/a/span[@class='title'][1]/text()")
             # Text nodes of the first <p> in each 'bd' div; every second entry
             # is read as the director line (see getDir).
             self.dicr = self.htmlobj.xpath("//div[@class ='bd']/p[1]/text()")
             # One-line quotes.
             self.cri = self.htmlobj.xpath("//p[@class ='quote']/span//text()")
             # Ratings.
             self.score = self.htmlobj.xpath("//div[@class ='star']/span[@class='rating_num']/text()")
             self.pages.append((self.res, self.dicr, self.cri, self.score))

     @staticmethod
     def _clean(text):
         """Return ``text`` stripped, with spaces and slashes removed."""
         return ''.join(text).strip().replace(' ', '').replace('/', '')

     def getName(self):
         """Return the cleaned film titles, one list per downloaded page."""
         return [[self._clean(title) for title in res] for res, _, _, _ in self.pages]

     def getDir(self):
         """Return the director text of each film, one list per downloaded page.

         Only every second ``dicr`` entry is used. Each one is stripped, has
         its slashes removed and is cut at the first comma.
         """
         return [
             [''.join(line).strip().replace('/', '').split(',')[0] for line in dicr[::2]]
             for _, dicr, _, _ in self.pages
         ]

     def getScore(self):
         """Return the cleaned rating strings, one list per downloaded page."""
         return [[self._clean(value) for value in score] for _, _, _, score in self.pages]

     def getCri(self):
         """Return the stripped quotes, one list per downloaded page.

         A page's list may be shorter than its title list when some films have
         no quote.
         """
         return [[''.join(quote).strip() for quote in cri] for _, _, cri, _ in self.pages]

     def run(self):
         """Write all scraped pages to ``OUTPUT_PATH``.

         Row ``i`` holds page ``i``; film ``j`` of that page occupies columns
         ``4*j`` to ``4*j + 3`` (title, director, quote, score).

         Raises:
             IndexError: If a page yields fewer director or score entries than
                 titles.
         """
         import os

         wb = xlwt.Workbook(encoding='utf-8')
         ws = wb.add_sheet('豆瓣电影')
         pages = zip(self.getName(), self.getDir(), self.getCri(), self.getScore())
         for i, (names, directors, quotes, scores) in enumerate(pages):
             for j, name in enumerate(names):
                 col = 4 * j
                 ws.write(i, col, name)
                 ws.write(i, col + 1, directors[j])
                 # NOTE(review): quotes are matched to films by position only;
                 # a film without a quote presumably shifts the later quotes
                 # onto the wrong films -- verify against the page markup.
                 ws.write(i, col + 2, quotes[j] if j < len(quotes) else self.MISSING_QUOTE)
                 ws.write(i, col + 3, scores[j])
         # Saving fails when the target directory is missing, so create it.
         os.makedirs(os.path.dirname(self.OUTPUT_PATH), exist_ok=True)
         wb.save(self.OUTPUT_PATH)
34 
if __name__ == "__main__":
    # Constructing Grasp downloads the list pages; run() then writes the
    # workbook. Guarded so that importing this module does not start a scrape.
    r = Grasp()
    r.run()
37 
38 
39 
def reader():
    """Scrape the Douban Top 250 movie list and save it as an ``.xls`` file.

    Ten list pages are downloaded. Row ``i`` of the sheet holds page ``i``;
    film ``j`` of that page occupies columns ``4*j`` to ``4*j + 3`` (title,
    director, quote, score). The workbook is saved to
    ``./豆瓣电影/movie.xls``.

    The names ``urlopen``, ``et`` and ``xlwt`` must be available at module
    level (presumably ``urllib.request.urlopen``, ``lxml.etree`` and the
    ``xlwt`` package -- the imports are not part of this listing; confirm).

    Raises:
        IndexError: If a page yields fewer director or score entries than
            titles.
    """
    import os

    wb = xlwt.Workbook(encoding='utf-8')
    ws = wb.add_sheet('豆瓣电影')
    for i in range(0, 10):
        url = f"https://movie.douban.com/top250?start={25*i}&filter="
        # Close the HTTP response as soon as the body has been read.
        with urlopen(url) as response:
            html = response.read().decode()
        htmlobj = et.HTML(html)
        # Film titles.
        res = htmlobj.xpath("//div[@class ='hd']/a/span[@class='title'][1]/text()")
        # Text nodes of the first <p> in each 'bd' div; every second entry is
        # read as the director line.
        dicr = htmlobj.xpath("//div[@class ='bd']/p[1]/text()")
        # One-line quotes.
        cri = htmlobj.xpath("//p[@class ='quote']/span//text()")
        # Ratings.
        score = htmlobj.xpath("//div[@class ='star']/span[@class='rating_num']/text()")
        for j, title in enumerate(res):
            col = 4 * j
            director = ''.join(dicr[j * 2]).strip().replace(' ', '').replace('/', '')
            # Keep the text before the first '主'. partition() returns the
            # whole string when the marker is absent, whereas slicing with
            # find()'s -1 would drop the last character.
            director = director.partition('主')[0]
            ws.write(i, col, ''.join(title).strip().replace(' ', '').replace('/', ''))
            ws.write(i, col + 1, director)
            # NOTE(review): quotes are matched to films by position only; a
            # film without a quote presumably shifts the later quotes onto
            # the wrong films -- verify against the page markup.
            quote = ''.join(cri[j]).strip() if j < len(cri) else '没有介绍'
            ws.write(i, col + 2, quote)
            ws.write(i, col + 3, ''.join(score[j]).strip().replace(' ', '').replace('/', ''))
    # Saving fails when the target directory is missing, so create it first.
    os.makedirs('./豆瓣电影', exist_ok=True)
    wb.save('./豆瓣电影/movie.xls')
61 
if __name__ == "__main__":
    # Run the scrape only when executed as a script, not when imported.
    reader()
将爬取的数据存储为表格
代码中的 ''.join(...)、strip()、replace() 等操作是为了将数据转换为单纯的字符串，除去特殊字符和空格，便于查看数据
此外，需要注意的是xpath获取的是一个列表，可以用列表的方法进行操作，不需要进行多余的转化

原文：https://www.cnblogs.com/superSmall/p/11502872.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)