LinkExtractor (the link extractor):
Helps us extract the specified links from a response object.
Usage:
Code example:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor

class ACrawlspiderSpider(scrapy.Spider):
    name = '_crawlspider'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://pic.netbian.com/']

    def parse(self, response):
        # Each assignment below just demonstrates one parameter;
        # only the last LinkExtractor is the one actually used by extract_links().
        # link = LinkExtractor(restrict_xpaths='//ul/li')
        # allow: accepts a regex or a list of regexes; extracts links that match
        link = LinkExtractor(allow=r'tupian.+\.html')
        # deny: the opposite of allow; excludes links that match the regex
        link = LinkExtractor(deny=r'tupian.+\.html')
        # allow_domains: only extract links under the given domain(s)
        link = LinkExtractor(allow_domains='pic.netbian.com')
        # deny_domains: drop links under the given domain(s)
        link = LinkExtractor(deny_domains='pic.netbian.com')
        # restrict_xpaths: accepts XPath expression(s); only extract links inside the matched regions
        link = LinkExtractor(restrict_xpaths='//ul/li')
        # restrict_css: same idea, but with CSS selectors
        links = link.extract_links(response)
        for link in links:
            print(link.url, link.text)
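A quick way to try these parameters without running a full crawl is to build an HtmlResponse by hand and pass it to a LinkExtractor. The sketch below is not from the original post; the HTML body is invented for illustration.

# Minimal sketch (assumption: a hand-built HtmlResponse is enough to exercise extract_links()).
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

html = (b'<ul>'
        b'<li><a href="/tupian/123.html">pic 123</a></li>'
        b'<li><a href="/about.html">about</a></li>'
        b'</ul>')
response = HtmlResponse(url='http://pic.netbian.com/', body=html, encoding='utf-8')

link = LinkExtractor(allow=r'tupian.+\.html')
for l in link.extract_links(response):
    # extract_links() returns scrapy.link.Link objects with url and text attributes.
    print(l.url, l.text)
# Only /tupian/123.html matches the allow pattern; /about.html is filtered out.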
Source: https://www.cnblogs.com/zhangjian0092/p/11704660.html