"""Scratch pad for trying XPath expressions with Scrapy's ``Selector``.

The script builds an in-memory ``HtmlResponse`` from a small HTML fixture so
selectors can be exercised without any network access.  Every example below
is left commented out on purpose: uncomment one at a time and run the file to
see what it prints.
"""
# HtmlXPathSelector is only referenced by the first commented-out example.
# NOTE(review): it is the legacy selector API and newer Scrapy releases
# deprecate/remove it -- confirm against the installed Scrapy version.
from scrapy.selector import Selector, HtmlXPathSelector
from scrapy.http import HtmlResponse

# HTML fixture: three <li><a> links (two of them carrying an id) followed by
# one <a> inside a <div>.
html = """<!DOCTYPE html>
<html>
    <head lang="en">
        <meta charset="UTF-8">
        <title></title>
    </head>
    <body>
        <ul>
            <li class="item-"><a id='i1' href="link.html">first item</a></li>
            <li class="item-0"><a id='i2' href="llink.html">first item</a></li>
            <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
        </ul>
        <div><a href="llink2.html">second item</a></div>
    </body>
</html>
"""

# Wrap the fixture in a response object; the URL is a placeholder and is
# never fetched.
response = HtmlResponse(url='http://example.com', body=html, encoding='utf-8')

# --- Legacy selector API ----------------------------------------------------
# hxs = HtmlXPathSelector(response)
# print(hxs)

# --- Selecting elements -----------------------------------------------------
# Every <a> element in the document.
# hxs = Selector(response=response).xpath('//a')
# print(hxs)

# Every <a> that is the second <a> child of its parent (none in this
# fixture, where each <a> is its parent's only <a>).
# hxs = Selector(response=response).xpath('//a[2]')
# print(hxs)

# --- Filtering on attributes ------------------------------------------------
# <a> elements that have an id attribute.
# hxs = Selector(response=response).xpath('//a[@id]')
# print(hxs)

# <a> elements whose id is exactly "i1".
# hxs = Selector(response=response).xpath('//a[@id="i1"]')
# print(hxs)

# Stacked predicates are AND-ed together.
# hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')
# print(hxs)

# href contains the substring "link".
# hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')
# print(hxs)

# href starts with "link" (only link.html in this fixture).
# hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')
# print(hxs)

# --- Regular-expression predicates (re:test) ---------------------------------
# Raw strings keep the backslash in "\d" from being read as a Python escape.
# hxs = Selector(response=response).xpath(r'//a[re:test(@id, "i\d+")]')
# print(hxs)

# Text of the matching links, extracted as a list of strings.
# hxs = Selector(response=response).xpath(r'//a[re:test(@id, "i\d+")]/text()').extract()
# print(hxs)

# href values of the matching links.
# hxs = Selector(response=response).xpath(r'//a[re:test(@id, "i\d+")]/@href').extract()
# print(hxs)

# --- Absolute paths and extracting values -----------------------------------
# hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
# print(hxs)

# extract_first() returns only the first match instead of a list.
# hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()
# print(hxs)

# --- Relative queries on each selected node ----------------------------------
# ul_list = Selector(response=response).xpath('//body/ul/li')
# for item in ul_list:
#     v = item.xpath('./a/span')
#     # or
#     # v = item.xpath('a/span')
#     # or
#     # NOTE: '*/a/span' steps through any child element first, so it looks
#     # one level deeper than the two forms above.
#     # v = item.xpath('*/a/span')
#     print(v)
# Source (original post): https://www.cnblogs.com/BensonChang/p/9222243.html