wb_id is the id of the Weibo post; in the page's HTML it is stored in the mid attribute of the post's div.
# image area, multiple images
self.multi_media_xpath = "//div[@mid='{}' and not(@minfo)]//div[@class='WB_detail']//div[@class='media_box']/ul/li/img/@src"
# image area, single image
self.single_media_xpath = "//div[@mid='{}' and not(@minfo)]//div[@class='WB_detail']//div[@class='media_box']/ul/li//img/@src"
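The two templates imply the markup shape the spider expects: a post div carrying the mid attribute, a WB_detail block, and a media_box list of img nodes. As a quick illustration (not from the original post; the mid value and image URLs below are invented), a minimal fragment parsed with lxml shows what the multi-image selector returns:

# Illustration only: a tiny HTML fragment shaped like the markup the XPath
# templates expect; the mid value and image URLs are made up for the example.
from lxml import etree

sample_html = """
<div mid="4236498270458910">
  <div class="WB_detail">
    <div class="media_box">
      <ul>
        <li><img src="//wx1.sinaimg.cn/thumb150/abc.jpg"/></li>
        <li><img src="//wx1.sinaimg.cn/thumb150/def.jpg"/></li>
      </ul>
    </div>
  </div>
</div>
"""
root = etree.HTML(sample_html)
xpath = "//div[@mid='{}' and not(@minfo)]//div[@class='WB_detail']//div[@class='media_box']/ul/li/img/@src"
print(root.xpath(xpath.format("4236498270458910")))
# -> ['//wx1.sinaimg.cn/thumb150/abc.jpg', '//wx1.sinaimg.cn/thumb150/def.jpg']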
@decorator
def get_img_list(self, root, wb_id):
    # check whether the post has a single image or several
    imgurllist = []
    single_img_node_list = root.xpath(self.single_media_xpath.format(wb_id))
    # multi-image list; does not include 360 long images
    multi_img_node_list = root.xpath(self.multi_media_xpath.format(wb_id))
    if len(multi_img_node_list) > 1:
        # multi-image posts link thumb150 thumbnails; swap in the larger mw690 size
        imgurllist = ["http:" + i.replace("thumb150", "mw690") for i in multi_img_node_list]
        return imgurllist
    elif single_img_node_list:
        # single-image posts use the orj360 size in the link
        imgurllist = ["http:" + i.replace("orj360", "mw690") for i in single_img_node_list]
    else:
        print("This post has no images")
    return imgurllist
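A hypothetical call site, only for illustration: the methods are assumed to live on a spider class (named WeiboImageSpider here, which is not a name taken from the original post), and html_text is assumed to hold an already-fetched Weibo page.

# Hypothetical usage; WeiboImageSpider, html_text and the mid value are assumptions.
from lxml import etree

root = etree.HTML(html_text)   # html_text: page HTML fetched elsewhere
spider = WeiboImageSpider()
img_urls = spider.get_img_list(root, "4236498270458910")
print(img_urls)                # e.g. ['http://wx1.sinaimg.cn/mw690/abc.jpg', ...]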
def save_imge(self, url, id_path, retry=1):
    if retry > 3:
        print("Retried more than three times; giving up on this image")
        return None
    filepath = id_path
    urlname = url.split('/')[-1]
    filename = os.path.join(filepath, urlname)
    if not os.path.exists(filepath):
        os.makedirs(filepath)
    if not os.path.exists(filename):
        try:
            ir = requests.get(url, timeout=10)
            print("Downloading url", url, "id", id_path)
            with open(filename, "wb") as fs:
                fs.write(ir.content)
        except Exception:
            # on failure, back off and retry recursively (at most three retries)
            time.sleep(3)
            print("Image download timed out, retrying; attempt", retry)
            self.save_imge(url, id_path, retry + 1)
    else:
        print("Image already exists")
Original article: https://www.cnblogs.com/c-x-a/p/9146192.html