Code:
#-*- coding: UTF-8 -*-
import requests
import os
from bs4 import BeautifulSoup
import urllib
# Listing index of the campus-photo section to crawl.
start_url = 'http://www.521609.com/meinvxiaohua/'
# Browser-like User-Agent so the site does not reject scripted requests.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
# Directory the downloaded images are written to.
saved_path = r'C:\Users\zhoutiax\Desktop\xiaohua'
# Running image number used for output filenames; incremented by crawl().
x = 1
def crawl(url):
    """Download every gallery image linked from one listing page.

    Fetches *url*, finds all <img> tags whose ``src`` starts with
    ``/uploads`` (the site's image path; other imgs are logos/navigation),
    and saves each one under ``saved_path`` as ``<x>.jpg``, where the
    module-level counter ``x`` supplies a running number.

    Args:
        url: Absolute URL of one listing page on www.521609.com.

    Side effects:
        Creates ``saved_path`` if missing, writes image files into it,
        and increments the global counter ``x``.
    """
    global x
    # Send the browser-like headers so the site serves the real page.
    page = requests.get(url, headers=headers, timeout=20)
    soup = BeautifulSoup(page.text, "html.parser")
    # exist_ok avoids a race/crash if the directory already exists.
    os.makedirs(saved_path, exist_ok=True)
    for img_tag in soup.find_all("img"):
        src = img_tag.get("src", "")
        if not src.startswith("/uploads"):
            continue  # skip logos, banners and other non-gallery images
        # src is site-relative; prepend scheme+host taken from the page URL
        # (everything before "/m" of ".../meinvxiaohua/").
        img_url = url.split("/m")[0] + src
        img_data = requests.get(img_url, headers=headers, timeout=20).content
        # Save into saved_path (the original hardcoded the same path again).
        with open(os.path.join(saved_path, "%d.jpg" % x), "wb") as f:
            f.write(img_data)
        x += 1
if __name__ == '__main__':
    # Listing pages are named list12<page>.html; crawl pages 1 through 4.
    for page in range(1, 5):
        page_url = start_url + "list12%d.html" % page
        print(page_url)  # progress indicator
        crawl(page_url)
Original article: https://www.cnblogs.com/nevermore29/p/9606035.html