import scrapy


class BooksSpider(scrapy.Spider):
    """Scrape the quotes listed on http://lab.scrapyd.cn/.

    Each quote found on a page is appended to ``mingyan.csv`` as a small
    text record (quote text, author, list of tags), and the "next page"
    link is followed until no further page is linked.
    """

    # Spider identifier.
    name = "books"

    # Starting point(s) of the crawl.
    start_urls = ["http://lab.scrapyd.cn/"]

    # Disabled variant that accepts a spider argument, e.g.
    # ``scrapy crawl <spider name> -a tag=励志``, and crawls only that tag:
    #
    #     def start_requests(self):
    #         url = "http://lab.scrapyd.cn/"
    #         tag = getattr(self, "tag", None)
    #         if tag:
    #             url = url + "tag/" + tag
    #         yield scrapy.Request(url, self.parse)

    @staticmethod
    def _format_record(info):
        """Return the text record written to the output file for one quote.

        Args:
            info: Selector for a single ``div.quote`` element.

        Returns:
            str: quote text, author and ``str()`` of the tag list, joined
            with the separators the output file has always used.
        """
        # default="" keeps a quote with a missing text/author node from
        # raising TypeError in the concatenation below.
        mingyan = info.css(".text::text").extract_first(default="")
        author = info.css(".author::text").extract_first(default="")
        tags = info.css(".tags .tag ::text").extract()
        # NOTE(review): the lone "\r" between author and tags looks like a
        # typo for "\r\n"; it is preserved so existing output stays the same.
        return mingyan + "\r\n" + author + "\r" + str(tags) + "\r\n"

    def parse(self, response):
        """Store every quote on the page and follow the pagination link.

        Args:
            response: The downloaded page handed over by Scrapy.

        Yields:
            scrapy.Request: A request for the next page (handled by this
            same method) when the page links to one.
        """
        records = [
            self._format_record(info) for info in response.css("div.quote")
        ]

        # Open the file once per page rather than once per quote, and only
        # when there is something to write so no empty file is created.
        if records:
            with open("mingyan.csv", "a+", encoding="utf-8") as f:
                f.write("".join(records))

        next_url = response.css("li.next a::attr(href)").extract_first()
        if next_url:
            # The href may be relative; resolve it against the current page.
            yield scrapy.Request(
                response.urljoin(next_url), callback=self.parse
            )
scrapy爬取http://lab.scrapyd.cn/
原文:https://www.cnblogs.com/lizhihoublog/p/12551396.html