import urllib.request as ur
import os
import time


def url_open(url):
    # url = url.replace(' ', '%20')
    req = ur.Request(url)
    response = ur.urlopen(req)
    html = response.read()
    return html


def save_img(down_url, name):
    f = open('C:\\Users\\路朝阳\\PycharmProjects\\pythonProject1\\my_img\\' + name + '.jpg', 'wb')
    html_download = url_open(down_url)
    f.write(html_download)
    f.close()


def change_url(url, para_count_web):
    if para_count_web == 1:
        return url
    else:
        para_url1, para_url2 = url.split('_', 1)
        url_changed = para_url1 + '_' + str(para_count_web) + '.html'
        print(url_changed)
        return url_changed


count = 0
count_web = 1
continue_location = 0
natual_url = 'https://desk.zol.com.cn/1920x1080/hot_1.html'  # ZOL desktop wallpapers - hot list

while True:
    # ============== Step 1 ==============
    next_natual_url = change_url(natual_url, count_web)
    html1 = url_open(next_natual_url).decode('gbk')  # sites use different encodings; check which one this site uses

    find_result = html1.find('<a class="pic" href="/bizhi/', continue_location)
    # once every image on this page has been scraped, leave this iteration and move on to the next page
    # (while also capping count_web)
    if find_result == -1:
        count_web += 1
        if count_web > 5:
            break
        else:
            continue_location = 0  # reset the search start position to 0, otherwise the next page would still be searched from the end
            continue

    a = find_result + 28
    b = html1.find('.html', a, a + 100)
    continue_location = b
    img_series_id = html1[a:b]  # id of the next image set
    print(img_series_id)

    # ============== Step 2 ==============
    url2 = 'https://desk.zol.com.cn/bizhi/' + img_series_id + '.html'

    html2 = url_open(url2).decode('gbk')

    resolution_judge = html2.find('id="1920x1080" href="/showpic/1920x1080_')
    # if this image set does not offer the resolution we need, skip this iteration
    if resolution_judge == -1:
        continue

    a_html2 = resolution_judge + 40
    b_html2 = html2.find('.html', a_html2, a_html2 + 100)
    img_id = html2[a_html2:b_html2]
    print(img_id)  # id of the individual image (1920x1080 resolution)

    # ============== Step 3 ==============
    url3 = 'https://desk.zol.com.cn/showpic/1920x1080_' + img_id + '.html'
    html3 = url_open(url3).decode('gbk')
    a_html3 = html3.find('img src="https:') + 9
    # note: a PNG image has no '.jpg' in its URL, so skip it to avoid an error
    png_judge = html3.find('.jpg', a_html3, a_html3 + 255)
    if png_judge == -1:
        continue
    b_html3 = png_judge + 4
    img_download_url = html3[a_html3:b_html3]

    # ============== Step 4 ==============
    save_img(img_download_url, 'photo-pc' + str(count))
    count += 1
    if count_web > 5:
        break
    time.sleep(1)
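Two things are most likely to trip up anyone re-running the script: the hard-coded Windows save path in save_img (the folder must already exist) and the fact that some sites reject urllib's default User-Agent. Below is a minimal sketch of a more portable variant of those two helpers; the relative my_img folder and the 'Mozilla/5.0' header value are my own assumptions, not part of the original post.

import os
import urllib.request as ur

SAVE_DIR = 'my_img'  # hypothetical relative folder instead of the hard-coded absolute path
os.makedirs(SAVE_DIR, exist_ok=True)  # create the folder up front so open(..., 'wb') cannot fail


def url_open(url):
    # send a browser-like User-Agent, since some sites block urllib's default one (assumption)
    req = ur.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with ur.urlopen(req) as response:
        return response.read()


def save_img(down_url, name):
    # join the path portably instead of concatenating a Windows-specific string
    with open(os.path.join(SAVE_DIR, name + '.jpg'), 'wb') as f:
        f.write(url_open(down_url))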
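For reference, the Step 1 extraction done above with find() and hand-counted offsets can also be written as a regular expression. The sketch below is an alternative, not how the original post does it, and it assumes the listing page is still GBK-encoded and still marks each image set with an <a class="pic" href="/bizhi/...html"> link.

import re
import urllib.request as ur

# pull every image-set id from one listing page at once
listing_html = ur.urlopen('https://desk.zol.com.cn/1920x1080/hot_1.html').read().decode('gbk')
series_ids = re.findall(r'<a class="pic" href="/bizhi/([^"]+?)\.html"', listing_html)
print(series_ids)  # the same ids the while-loop above extracts one at a time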
Original post: https://www.cnblogs.com/vosoland/p/14640619.html