class Grasp:
    """Scrape the Douban Top 250 movie list and export it to an .xls file.

    Creating an instance downloads the ten list pages immediately; call
    ``run`` afterwards to write the collected data to disk.

    Attributes:
        pages: one ``(res, dicr, cri, score)`` tuple per downloaded page, in
            download order.
        url, html, htmlobj, res, dicr, cri, score: values of the most recently
            downloaded page (i.e. the last page once ``__init__`` returns).
    """

    def __init__(self):
        """Download and parse all ten pages (25 movies per page)."""
        # Keep every page's results; storing them only in self.res & co. would
        # leave nothing but the last page for run() to export.
        self.pages = []
        for i in range(0, 10):
            self.url = f"https://movie.douban.com/top250?start={25*i}&filter="
            # Close the HTTP response as soon as the body has been read.
            with urlopen(self.url) as response:
                self.html = response.read().decode()
            self.htmlobj = et.HTML(self.html)
            self.res = self.htmlobj.xpath("//div[@class ='hd']/a/span[@class='title'][1]/text()")  # movie title
            self.dicr = self.htmlobj.xpath("//div[@class ='bd']/p[1]/text()")  # director line
            self.cri = self.htmlobj.xpath("//p[@class ='quote']/span//text()")  # one-line introduction
            self.score = self.htmlobj.xpath("//div[@class ='star']/span[@class='rating_num']/text()")  # rating
            self.pages.append((self.res, self.dicr, self.cri, self.score))
            # Debug output: the page's director text, cleaned, as a one-item list.
            print((''.join(self.dicr)).strip().replace(' ', '').replace('/', '').split(',')[0:1])

    def getName(self):
        """Placeholder; currently does nothing and returns None."""

    def getDir(self):
        """Placeholder; currently does nothing and returns None."""

    def getScore(self):
        """Placeholder; currently does nothing and returns None."""

    def getCri(self):
        """Placeholder; currently does nothing and returns None."""

    def run(self):
        """Write every downloaded page to ``./豆瓣电影/movie.xls``.

        Sheet layout: row ``i`` holds page ``i``; movie ``j`` of that page
        occupies columns ``4*j`` to ``4*j + 3`` (title, director line,
        introduction, rating).

        Raises:
            IndexError: if a page yields fewer director or rating entries
                than titles.
        """
        import os

        wb = xlwt.Workbook(encoding='utf-8')
        ws = wb.add_sheet('豆瓣电影')
        for i, (res, dicr, cri, score) in enumerate(self.pages):
            for j, title in enumerate(res):
                col = 4 * j
                ws.write(i, col, ''.join(title).strip().replace(' ', '').replace('/', ''))
                # Assumes every movie contributes exactly two text nodes to
                # dicr, the first being the director line - TODO confirm.
                # [0] rather than [0:1]: a cell value must be a string, not a list.
                ws.write(i, col + 1, ''.join(dicr[j * 2]).strip().replace('/', '').split(',')[0])
                # NOTE(review): introductions are matched by position only, so
                # a movie without one would shift those of the movies after it.
                try:
                    intro = ''.join(cri[j]).strip()
                except IndexError:
                    intro = '没有介绍'
                ws.write(i, col + 2, intro)
                ws.write(i, col + 3, ''.join(score[j]).strip().replace(' ', '').replace('/', ''))
        # The target directory may not exist yet; create it before saving.
        os.makedirs('./豆瓣电影', exist_ok=True)
        wb.save('./豆瓣电影/movie.xls')
# Script entry: download every page, then export the workbook.
r = Grasp()
r.run()
def reader():
    """Scrape the Douban Top 250 list and save it to ``./豆瓣电影/movie.xls``.

    Downloads the ten list pages (25 movies per page) and writes them to a
    single sheet: row ``i`` holds page ``i``; movie ``j`` of that page
    occupies columns ``4*j`` to ``4*j + 3`` (title, director, introduction,
    rating).

    Raises:
        IndexError: if a page yields fewer director or rating entries than
            titles.
    """
    import os

    wb = xlwt.Workbook(encoding='utf-8')
    ws = wb.add_sheet('豆瓣电影')
    for i in range(0, 10):
        url = f"https://movie.douban.com/top250?start={25*i}&filter="
        # Close the HTTP response as soon as the body has been read.
        with urlopen(url) as response:
            html = response.read().decode()
        htmlobj = et.HTML(html)
        res = htmlobj.xpath("//div[@class ='hd']/a/span[@class='title'][1]/text()")  # movie title
        dicr = htmlobj.xpath("//div[@class ='bd']/p[1]/text()")  # director line
        cri = htmlobj.xpath("//p[@class ='quote']/span//text()")  # one-line introduction
        score = htmlobj.xpath("//div[@class ='star']/span[@class='rating_num']/text()")  # rating
        for j, title in enumerate(res):
            col = 4 * j
            # Assumes every movie contributes exactly two text nodes to dicr,
            # the first being the director line - TODO confirm.
            director_line = ''.join(dicr[j * 2]).strip().replace(' ', '').replace('/', '')
            # Keep only the text before the first '主' (presumably the start
            # of the cast label - verify against the page). partition() keeps
            # the whole line when the marker is absent, whereas slicing with
            # find() == -1 would drop the last character.
            director = director_line.partition('主')[0]
            ws.write(i, col, ''.join(title).strip().replace(' ', '').replace('/', ''))
            ws.write(i, col + 1, director)
            # NOTE(review): introductions are matched by position only, so a
            # movie without one would shift those of the movies after it.
            try:
                intro = ''.join(cri[j]).strip()
            except IndexError:
                intro = '没有介绍'
            ws.write(i, col + 2, intro)
            ws.write(i, col + 3, ''.join(score[j]).strip().replace(' ', '').replace('/', ''))
    # The target directory may not exist yet; create it before saving.
    os.makedirs('./豆瓣电影', exist_ok=True)
    wb.save('./豆瓣电影/movie.xls')
# Script entry: scrape all pages and write the workbook via reader().
reader()
将爬取的数据存储为表格
''.join() 这类处理是为了将数据转换为单纯的字符串,并除去特殊字符和空格,便于查看数据。
此外,需要注意的是xpath获取的是一个列表,可以用列表的方法进行操作,不需要进行多余的转化
原文:https://www.cnblogs.com/superSmall/p/11502872.html