"""Crawl listing pages 1-3 of ygdy8.net and print details of each linked page.

For every detail page linked from a listing page, the script prints the
page URL, the heading text, and the URL of the first image on the page.
"""
from lxml import etree
import requests

# Site root; listing pages link to detail pages with root-relative hrefs.
baseurl0 = "https://www.ygdy8.net"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36"
}
# Seconds to wait for a response; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10


def get_page():
    """Process listing pages 1 through 3 by passing each URL to get_urls()."""
    pageurl = "https://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"
    for x in range(1, 4):
        get_urls(pageurl.format(x))


def get_urls(baseurl):
    """Fetch one listing page and process every detail link found in it.

    Args:
        baseurl: URL of the listing page to download.

    Each href found inside a ``<table class="tbspan">`` is prefixed with
    ``baseurl0``, printed, and handed to get_detalis_urls().
    """
    resp = requests.get(baseurl, headers=headers, timeout=REQUEST_TIMEOUT)
    html = etree.HTML(resp.text)
    # XPath string literals must use plain ASCII quotes.
    hrefs = html.xpath("//table[@class='tbspan']//a[@href]/@href")
    for href in hrefs:
        ul = baseurl0 + href
        print(ul)
        get_detalis_urls(ul)


def get_detalis_urls(url):
    """Fetch one detail page and print its heading text and first image URL.

    Args:
        url: Absolute URL of the detail page.

    Prints the list of text nodes matched by ``//div/h1/font[@color]``,
    then the ``src`` of the first ``<img>`` (or ``None`` when the page has
    no image), then a separator line.
    """
    resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Decode the raw bytes as GBK explicitly instead of using resp.text.
    result = resp.content.decode("gbk")
    html = etree.HTML(result)
    uls = html.xpath("//div/h1/font[@color]/text()")
    print(uls)
    images = html.xpath("//img[@src]/@src")
    # Guard against pages without any <img>; indexing [0] would raise IndexError.
    uls2 = images[0] if images else None
    print(uls2)
    print("---------------------------------------")


if __name__ == "__main__":
    get_page()
# Original article: https://www.cnblogs.com/jswf/p/12292232.html