# Quick-reference notes: Django and Scrapy command-line basics, followed by a
# minimal Scrapy project ("xdb") that persists scraped links via a pipeline.
#
# NOTE: the code sections below belong to separate files of the Scrapy
# project; each "# ---- name.py ----" header marks where one file starts.

# ---------------------------------------------------------------------------
# Django
# ---------------------------------------------------------------------------
# Create the project:
#     django-admin startproject mysite
#     cd mysite
# Create apps:
#     python manage.py startapp app01
#     python manage.py startapp app02
# Start the project:
#     python manage.py runserver

# ---------------------------------------------------------------------------
# Scrapy
# ---------------------------------------------------------------------------
# Create the project (argument: project name):
#     scrapy startproject xdb
#     cd xdb
# Create spiders (arguments: spider name, site address):
#     scrapy genspider chouti chouti.com
#     scrapy genspider cnblogs cnblogs.com
# Run a spider:
#     scrapy crawl chouti
#     scrapy crawl chouti --nolog
#
# Persistence: pipelines


# ---- pipelines.py ----
class XdbPipeline(object):
    """Append the ``href`` of every scraped item to a text file, one per line.

    The output path is read from the ``HREF_FILE_PATH`` setting in
    ``from_crawler``. NOTE(review): that setting does not appear in the
    settings.py excerpt below; it must be defined there, otherwise ``path``
    is ``None`` and ``open_spider`` fails.
    """

    def __init__(self, path):
        """Store the output path; the file itself is opened in ``open_spider``."""
        self.f = None
        self.path = path

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's ``HREF_FILE_PATH`` setting."""
        path = crawler.settings.get('HREF_FILE_PATH')
        return cls(path)

    def open_spider(self, spider):
        """Open the output file in append mode when the spider starts."""
        # Explicit encoding so the output does not depend on the platform
        # default when hrefs contain non-ASCII characters.
        self.f = open(self.path, 'a+', encoding='utf-8')

    def process_item(self, item, spider):
        """Write the item's ``href`` followed by a newline and return the item.

        Items without an ``href`` are passed through without writing anything;
        the spider yields ``href=None`` when no matching anchor is found, and
        concatenating ``None`` with a string would raise ``TypeError``.
        """
        # Debug aid: print(item.get("text"))
        href = item.get('href')
        if href is not None:
            self.f.write(href + '\n')
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        # Guard against the file never having been opened.
        if self.f is not None:
            self.f.close()
            self.f = None


# ---- settings.py ----
ITEM_PIPELINES = {
    # The smaller the number, the higher the priority; range 0--1000.
    'xdb.pipelines.XdbPipeline': 300,
}


# ---- items.py ----
import scrapy


class XdbItem(scrapy.Item):
    """Item holding the text and the link target of one scraped anchor."""

    text = scrapy.Field()
    href = scrapy.Field()


# ---- chouti.py ----
import scrapy
from xdb.items import XdbItem


class ChoutiSpider(scrapy.Spider):
    """Spider named ``chouti`` that starts from http://chouti.com/."""

    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://chouti.com/']

    def parse(self, response):
        """Yield one ``XdbItem`` per ``link-detail`` div in the response.

        ``text`` and ``href`` come from the div's direct ``a`` child; either
        value is ``None`` when the corresponding XPath matches nothing.
        """
        content_list = response.xpath(
            '//div[@class="link-con"]//div[@class="link-detail"]'
        )
        for item in content_list:
            text = item.xpath('./a/text()').extract_first()
            href = item.xpath('./a/@href').extract_first()
            yield XdbItem(text=text, href=href)
原文:https://www.cnblogs.com/xiongfanyong/p/13063205.html