首页 > 其他 > 详细

爬虫实战1 - 爬取ZOL壁纸

时间:2021-04-10 16:33:26      阅读:9      评论:0      收藏:0      [点我收藏+]
import os
import time
import urllib.request as ur


 6 def url_open(url):
 7     # url = url.replace(‘ ‘, ‘%20‘)
 8     req = ur.Request(url)
 9     response = ur.urlopen(req)
10     html = response.read()
11     return html
12 
13 
def save_img(down_url, name, save_dir=None):
    """Download ``down_url`` and store it as ``<name>.jpg``.

    Args:
        down_url: Direct URL of the image to download.
        name: File name without extension; ``.jpg`` is appended.
        save_dir: Directory to write into. When omitted, the original
            hard-coded location is used so existing callers keep their
            behavior.

    Raises:
        OSError: If the target file cannot be opened or written.
        urllib.error.URLError: Propagated from ``url_open`` when the
            download fails.
    """
    if save_dir is None:
        # Legacy default: plain concatenation, exactly as the script has
        # always built the path (the prefix already ends with a separator).
        target = (
            "C:\\Users\\路朝阳\\PycharmProjects\\pythonProject1\\my_img\\"
            + name
            + ".jpg"
        )
    else:
        target = os.path.join(save_dir, name + ".jpg")

    # Download first: a failed request must not leave an empty file behind.
    html_download = url_open(down_url)

    # The context manager closes the file even if write() raises.
    with open(target, "wb") as f:
        f.write(html_download)


def change_url(url, para_count_web):
    """Return the listing URL for page number ``para_count_web``.

    The listing URL is expected to end in ``_<page>.html``; the page part is
    replaced with ``para_count_web``. Page 1 is returned unchanged.

    Args:
        url: URL of the first listing page, e.g. ``.../hot_1.html``.
        para_count_web: 1-based page number to build the URL for.

    Returns:
        The URL of the requested page. For pages other than 1 the new URL is
        also printed.

    Raises:
        ValueError: If ``url`` contains no underscore, so there is no page
            suffix to replace.
    """
    if para_count_web == 1:
        return url

    # Split on the LAST underscore: the page suffix is at the end of the URL,
    # and earlier path segments may legitimately contain underscores.
    prefix, separator, _old_suffix = url.rpartition("_")
    if not separator:
        raise ValueError("URL has no '_<page>.html' suffix: %r" % (url,))

    url_changed = prefix + "_" + str(para_count_web) + ".html"
    print(url_changed)
    return url_changed


# Crawl driver: walks the first MAX_PAGES pages of the ZOL "hot" wallpaper
# listing, follows every album link to its 1920x1080 picture page, and saves
# the .jpg found there via save_img().

# Text markers searched for in the downloaded HTML. The offsets used below are
# derived from these with len() instead of hand-counted constants.
ALBUM_MARKER = '<a class="pic" href="/bizhi/'
RESOLUTION_MARKER = 'id="1920x1080" href="/showpic/1920x1080_'
IMG_MARKER = 'img src="https:'
IMG_SRC_PREFIX_LEN = len('img src="')  # skip up to the start of "https:"
MAX_PAGES = 5  # stop after this many listing pages

count = 0  # images saved so far; also the numeric suffix of the next file name
count_web = 1  # listing page currently being scanned (1-based)
continue_location = 0  # offset in the listing page where the next search starts
natual_url = 'https://desk.zol.com.cn/1920x1080/hot_1.html'  # ZOL desktop wallpapers - hot list
html1 = None  # decoded listing page; None means "download it before searching"

while True:
    # ============== Step 1: find the next album on the listing page ==============
    if html1 is None:
        # Fetch each listing page once, not once per image.
        next_natual_url = change_url(natual_url, count_web)
        # Encodings differ between sites; check which one the target site uses.
        html1 = url_open(next_natual_url).decode('gbk')

    find_result = html1.find(ALBUM_MARKER, continue_location)
    # Once every album on this page has been visited, move on to the next
    # page (bounded by MAX_PAGES).
    if find_result == -1:
        count_web += 1
        if count_web > MAX_PAGES:
            break
        # Reset the search offset, otherwise the new page would be searched
        # starting from where the previous page ended.
        continue_location = 0
        html1 = None
        continue

    a = find_result + len(ALBUM_MARKER)
    b = html1.find('.html', a, a + 100)
    if b == -1:
        # Malformed link: resume right after this anchor. Storing -1 would
        # make the next search start at the end of the page and silently
        # skip every remaining album.
        continue_location = a
        continue
    continue_location = b
    img_series_id = html1[a:b]  # id of the next album
    print(img_series_id)

    # ============== Step 2: find the 1920x1080 picture id in the album ==============
    url2 = 'https://desk.zol.com.cn/bizhi/' + img_series_id + '.html'

    html2 = url_open(url2).decode('gbk')

    resolution_judge = html2.find(RESOLUTION_MARKER)
    # Skip albums that do not offer the resolution we want.
    if resolution_judge == -1:
        continue

    a_html2 = resolution_judge + len(RESOLUTION_MARKER)
    b_html2 = html2.find('.html', a_html2, a_html2 + 100)
    if b_html2 == -1:
        # No terminating ".html": the slice below would be garbage.
        continue
    img_id = html2[a_html2:b_html2]
    print(img_id)  # id of the picture at 1920x1080

    # ============== Step 3: extract the direct image URL ==============
    url3 = 'https://desk.zol.com.cn/showpic/1920x1080_' + img_id + '.html'
    html3 = url_open(url3).decode('gbk')
    img_tag = html3.find(IMG_MARKER)
    if img_tag == -1:
        # No image tag at all; adding the prefix length to -1 would point
        # into unrelated text.
        continue
    a_html3 = img_tag + IMG_SRC_PREFIX_LEN
    # Only .jpg images are handled; anything else (e.g. png) is skipped.
    png_judge = html3.find('.jpg', a_html3, a_html3 + 255)
    if png_judge == -1:
        continue
    b_html3 = png_judge + len('.jpg')
    img_download_url = html3[a_html3:b_html3]

    # ============== Step 4: download and save ==============
    save_img(img_download_url, 'photo-pc' + str(count))
    count += 1
    time.sleep(1)  # pause between downloads

 

爬虫实战1 - 爬取ZOL壁纸

原文:https://www.cnblogs.com/vosoland/p/14640619.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!