使用 Scrapy 爬取 http://lab.scrapyd.cn/

时间：2020-03-23 13:46:19 阅读：80 评论：0 收藏：0 [点我收藏+]

import scrapy

class BooksSpider(scrapy.Spider):
    """Crawl the quotes published on http://lab.scrapyd.cn/.

    Each ``div.quote`` element on a page yields one record (quote text,
    author, list of tags) that is appended to ``mingyan.csv`` in the current
    working directory. After a page is processed the spider follows the
    ``li.next`` pagination link, if the page has one.
    """

    # Spider identifier, i.e. the name passed to ``scrapy crawl``.
    name = 'books'

    # Crawl entry point.
    start_urls = [
        'http://lab.scrapyd.cn/'
    ]

    # Disabled variant kept from the original notes: a spider that accepts a
    # command-line argument and starts from a tag page instead of the home
    # page. The notes invoke it as ``scrapy crawl argsSpider -a tag=励志``
    # (with this class the spider name would be ``books``).
    #
    #     def start_requests(self):
    #         url = 'http://lab.scrapyd.cn/'
    #         tag = getattr(self, 'tag', None)
    #         if tag:
    #             url = url + 'tag/' + tag
    #         yield scrapy.Request(url, self.parse)

    def parse(self, response):
        """Store every quote on the page, then request the next page.

        Args:
            response: the downloaded page; only its ``css()`` and
                ``urljoin()`` methods are used here.

        Yields:
            A ``scrapy.Request`` for the next page, handled by this same
            method, when the page contains a ``li.next a`` link.
        """
        records = []
        for quote in response.css('div.quote'):
            # extract_first() gives None when the selector matches nothing;
            # fall back to '' so one incomplete quote cannot abort the page
            # with a TypeError during string concatenation.
            text = quote.css('.text::text').extract_first() or ''
            author = quote.css('.author::text').extract_first() or ''
            tags = quote.css('.tags .tag ::text').extract()
            # Record layout is unchanged from the original output file.
            records.append(text + '\r\n' + author + '\r' + str(tags) + '\r\n')

        # Open the output file once per page rather than once per quote, and
        # not at all when the page has no quotes.
        if records:
            with open('mingyan.csv', 'a+', encoding='utf-8') as f:
                f.writelines(records)

        next_url = response.css('li.next a::attr(href)').extract_first()
        if next_url:
            # The href may be relative; resolve it against the current page.
            url = response.urljoin(next_url)
            yield scrapy.Request(url, callback=self.parse)

原文：https://www.cnblogs.com/lizhihoublog/p/12551396.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)