import urllib.request as ur
import os
import time


def url_open(url):
    # url = url.replace(' ', '%20')
    req = ur.Request(url)
    response = ur.urlopen(req)
    html = response.read()
    return html


def save_img(down_url, name):
    f = open('C:\\Users\\路朝阳\\PycharmProjects\\pythonProject1\\my_img\\' + name + '.jpg', 'wb')
    html_download = url_open(down_url)
    f.write(html_download)
    f.close()


def change_url(url, para_count_web):
    if para_count_web == 1:
        return url
    else:
        para_url1, para_url2 = url.split('_', 1)
        url_changed = para_url1 + '_' + str(para_count_web) + '.html'
        print(url_changed)
        return url_changed


count = 0
count_web = 1
continue_location = 0
natual_url = 'https://desk.zol.com.cn/1920x1080/hot_1.html'  # ZOL desktop wallpapers - hot list

while True:
    # ============== Step 1 ==============
    next_natual_url = change_url(natual_url, count_web)
    html1 = url_open(next_natual_url).decode('gbk')  # sites use different encodings; check which one this site uses

    find_result = html1.find('<a class="pic" href="/bizhi/', continue_location)
    # once every image on this page has been scraped, leave this iteration and move on to the next page
    # (while also capping count_web)
    if find_result == -1:
        count_web += 1
        if count_web > 5:
            break
        else:
            continue_location = 0  # reset the search start position to 0, otherwise the next page would still be searched from the end
            continue

    a = find_result + 28
    b = html1.find('.html', a, a + 100)
    continue_location = b
    img_series_id = html1[a:b]  # id of the next image set
    print(img_series_id)

    # ============== Step 2 ==============
    url2 = 'https://desk.zol.com.cn/bizhi/' + img_series_id + '.html'

    html2 = url_open(url2).decode('gbk')

    resolution_judge = html2.find('id="1920x1080" href="/showpic/1920x1080_')
    # if this image set does not offer the resolution we need, skip this iteration
    if resolution_judge == -1:
        continue

    a_html2 = resolution_judge + 40
    b_html2 = html2.find('.html', a_html2, a_html2 + 100)
    img_id = html2[a_html2:b_html2]
    print(img_id)  # id of the individual image (1920x1080 resolution)

    # ============== Step 3 ==============
    url3 = 'https://desk.zol.com.cn/showpic/1920x1080_' + img_id + '.html'
    html3 = url_open(url3).decode('gbk')
    a_html3 = html3.find('img src="https:') + 9
    # note: a PNG image has no '.jpg' in its URL, so skip it to avoid an error
    png_judge = html3.find('.jpg', a_html3, a_html3 + 255)
    if png_judge == -1:
        continue
    b_html3 = png_judge + 4
    img_download_url = html3[a_html3:b_html3]

    # ============== Step 4 ==============
    save_img(img_download_url, 'photo-pc' + str(count))
    count += 1
    if count_web > 5:
        break
    time.sleep(1)
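Two things are most likely to trip up anyone re-running the script: the hard-coded Windows save path in save_img (the folder must already exist) and the fact that some sites reject urllib's default User-Agent. Below is a minimal sketch of a more portable variant of those two helpers; the relative my_img folder and the 'Mozilla/5.0' header value are my own assumptions, not part of the original post.

import os
import urllib.request as ur

SAVE_DIR = 'my_img'  # hypothetical relative folder instead of the hard-coded absolute path
os.makedirs(SAVE_DIR, exist_ok=True)  # create the folder up front so open(..., 'wb') cannot fail


def url_open(url):
    # send a browser-like User-Agent, since some sites block urllib's default one (assumption)
    req = ur.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with ur.urlopen(req) as response:
        return response.read()


def save_img(down_url, name):
    # join the path portably instead of concatenating a Windows-specific string
    with open(os.path.join(SAVE_DIR, name + '.jpg'), 'wb') as f:
        f.write(url_open(down_url))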
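For reference, the Step 1 extraction done above with find() and hand-counted offsets can also be written as a regular expression. The sketch below is an alternative, not how the original post does it, and it assumes the listing page is still GBK-encoded and still marks each image set with an <a class="pic" href="/bizhi/...html"> link.

import re
import urllib.request as ur

# pull every image-set id from one listing page at once
listing_html = ur.urlopen('https://desk.zol.com.cn/1920x1080/hot_1.html').read().decode('gbk')
series_ids = re.findall(r'<a class="pic" href="/bizhi/([^"]+?)\.html"', listing_html)
print(series_ids)  # the same ids the while-loop above extracts one at a time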
Original post: https://www.cnblogs.com/vosoland/p/14640619.html