# items.py
import scrapy


class CDErshouFang(scrapy.Item):
    """Beike second-hand housing listing."""
    house_name = scrapy.Field()            # community (xiaoqu) name
    house_address = scrapy.Field()         # community address
    house_info = scrapy.Field()            # listing info: floor, build year, layout, floor area, orientation
    release_time = scrapy.Field()          # publish time
    house_tags = scrapy.Field()            # tags
    price = scrapy.Field()                 # average price
    total_price = scrapy.Field()           # total price
    details = scrapy.Field()               # detail page - district / area
    trading_ownership = scrapy.Field()     # transaction ownership
    commodity_use = scrapy.Field()         # property usage
    house_year = scrapy.Field()            # years of ownership
    property = scrapy.Field()              # property rights
    mortgage_information = scrapy.Field()  # mortgage information
    room_spare = scrapy.Field()            # housing certificate copy
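A CDErshouFang instance behaves like a dict restricted to the declared fields, which is what the pipelines below rely on when they do item["..."] lookups and dict(item). A minimal usage sketch (the values are made up):

# Sketch only: shows the dict-like behaviour of the item class above
from scrapystudy.items import CDErshouFang

item = CDErshouFang()
item["house_name"] = "某小区"
item["total_price"] = "120万"
print(dict(item))         # only the fields that were set appear in the dict
print(item.get("price"))  # unset fields read as None via .get(); item["price"] would raise KeyError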
# spiders/cd_ershoufang.py
import scrapy
from scrapystudy.items import CDErshouFang


class CdErshoufangSpider(scrapy.Spider):
    name = 'cd_ershoufang'
    allowed_domains = ['cd.ke.com']
    start_urls = ['https://cd.ke.com/ershoufang/']

    def start_requests(self):
        """Override start_requests; the crawl starts from the URLs generated here."""
        for page in range(1, 100):
            url = self.start_urls[0] + 'pg' + str(page) + '/'
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        set_select = response.css('.info')  # NOTE: if this locator is wrong, all the extraction below is affected
        for cle in set_select:
            item = CDErshouFang()
            # cle is a Selector over the downloaded HTML of one listing card; it cannot be interacted with like a live page
            house_name = cle.css('.title a::text').extract_first()
            house_address = cle.css('.positionInfo a::text').extract_first()
            house_info = cle.css('.houseInfo::text').extract()[1].replace(' ', '').replace('\n', '')
            release_time = cle.css('.followInfo::text').extract()[1].replace(' ', '').replace('\n', '')
            price_total = cle.css('.priceInfo .totalPrice span::text').extract_first()
            if price_total is not None:
                price_total = price_total + '万'
            price = cle.css('.unitPrice span::text').extract_first()
            # house_tags = cle.css('.info .address .tag span::text').extract()
            item["house_name"] = house_name
            item["house_address"] = house_address
            item["house_info"] = house_info
            item["release_time"] = release_time
            item["total_price"] = price_total
            item["price"] = price
            # item["house_tags"] = house_tags
            details_page_url = cle.css('.title a::attr(href)').extract_first()  # link to the detail page
            # meta: a dict of data passed along with the Request; the next callback reads it from response.meta
            if details_page_url:
                yield scrapy.Request(url=details_page_url, callback=self.details, meta={'item': item})

    def details(self, response):
        """Extract data from the detail page."""
        area = response.xpath('//span[@class="info"]/a[1]/text()').extract_first()  # district
        details = response.xpath('//span[@class="info"]/a[last()]/text()').extract_first()
        if area is not None and details is not None:
            details = area + ' ' + details
        # default='' guards against missing nodes so .strip() does not fail on None
        trading_ownership = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[2]/text()').extract_first(default='').strip()            # transaction ownership
        commodity_use = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[4]/text()').extract_first(default='').strip()                # property usage
        house_year = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[5]/text()').extract_first(default='').strip()                   # years of ownership
        property = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[6]/text()').extract_first(default='').strip()                     # property rights
        mortgage_information = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[7]/span[2]/text()').extract_first(default='').strip() # mortgage information
        room_spare = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[8]/text()').extract_first(default='').strip()                   # housing certificate copy
        item = response.meta['item']  # the item carried over from the list page
        item["details"] = details
        item["trading_ownership"] = trading_ownership
        item["commodity_use"] = commodity_use
        item["house_year"] = house_year
        item["property"] = property
        item["mortgage_information"] = mortgage_information
        item["room_spare"] = room_spare
        yield item
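The meta dict above is how the partially filled item travels from parse to the details callback. On Scrapy 1.7+ the same hand-off can be done with cb_kwargs, which keeps user data out of response.meta. A minimal sketch of that alternative (not the post's code, with the selectors trimmed down):

import scrapy


class CbKwargsSketchSpider(scrapy.Spider):
    """Hypothetical minimal spider showing the cb_kwargs hand-off (Scrapy 1.7+)."""
    name = 'cd_ershoufang_cbkwargs'
    allowed_domains = ['cd.ke.com']
    start_urls = ['https://cd.ke.com/ershoufang/']

    def parse(self, response):
        for cle in response.css('.info'):
            item = {'house_name': cle.css('.title a::text').get()}
            url = cle.css('.title a::attr(href)').get()
            if url:
                # the item arrives in the callback as a keyword argument instead of via response.meta
                yield response.follow(url, callback=self.details, cb_kwargs={'item': item})

    def details(self, response, item):
        item['details'] = response.xpath('//span[@class="info"]/a[1]/text()').get()
        yield item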
# pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import os
import time
import logging

import pymongo
import yaml
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

logger = logging.getLogger(__name__)


class TextPipeline:
    """To use a pipeline it must be registered first: add it to ITEM_PIPELINES in settings.py."""

    def __init__(self):
        self.limit = 50

    def process_item(self, item, spider):
        """Trim over-long titles."""
        if item['title']:
            if len(item['title']) > self.limit:
                item['title'] = item['title'][0:self.limit].rstrip() + '...'
            return item
        else:
            raise DropItem('Missing Text')


class MongoPipeline(object):
    """Store the data in MongoDB."""

    def __init__(self, mongo_url, mongo_db):
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Read the global configuration from settings.py."""
        return cls(
            mongo_url=crawler.settings.get('MONGO_URL'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))  # insert_one() replaces the deprecated insert()
        return item

    def close_spider(self, spider):
        self.client.close()


class SaveBeikePipeline(object):
    """Save crawled data to YAML / JSON files."""

    def open_spider(self, spider):
        """Called when the spider opens; open or create the output file here."""
        filetime = time.strftime("%Y%m%d")
        filepath = os.path.join(os.path.dirname(__file__), 'spiderSaveFile')
        if not os.path.exists(filepath):
            os.mkdir(filepath)
        # spider_file = os.path.join(filepath, '{}.yaml'.format(self.__class__.__name__))  # self.__class__.__name__ gives the class name
        spider_file = os.path.join(filepath, '{}.yaml'.format(filetime))
        try:
            self.f = open(spider_file, mode='w', encoding='utf-8')
        except Exception as e:
            logger.error(e)

    def process_item(self, item, spider):
        """Process one item."""
        data = dict()
        data["小区名称"] = item["house_name"]
        data["在售状态"] = item["on_sale"]
        data["房屋类型"] = item["house_type"]
        data["小区地址"] = item["address"]
        data["房屋户型"] = item["door_module"]
        data["建筑面积"] = item["area"]
        data["价格"] = item["price"]
        data["总价/套"] = item["total_price"]
        data["附近设施"] = item["tags"]
        # self.f.write(str(data) + '\n')
        # sort_keys=False keeps the dumped keys in their original insertion order
        spider_data = yaml.dump(data, allow_unicode=True, width=1000, sort_keys=False)
        self.f.write(spider_data + '*'.center(50, '-') + '\n')
        return item

    def close_spider(self, spider):
        """Called when the crawl finishes; closes the file."""
        self.f.close()


class SaveCDershouFangPipeline(object):
    """Save crawled data to YAML."""

    def open_spider(self, spider):
        """Called when the spider opens; open or create the output file here."""
        filetime = time.strftime("%Y%m%d")
        filepath = os.path.join(os.path.dirname(__file__), 'spiderSaveFile')
        if not os.path.exists(filepath):
            os.mkdir(filepath)
        # spider_file = os.path.join(filepath, '{}.yaml'.format(self.__class__.__name__))  # self.__class__.__name__ gives the class name
        spider_file = os.path.join(filepath, 'cd_ershoufang{}.yaml'.format(filetime))
        try:
            self.f = open(spider_file, mode='w', encoding='utf-8')
        except Exception as e:
            logger.error(e)

    def process_item(self, item, spider):
        """Process one item."""
        data = dict()
        data["小区名称"] = item["house_name"]
        data["小区地址"] = item["house_address"]
        data["房子信息"] = item["house_info"]
        data["发布时间"] = item["release_time"]
        data["总价/套"] = item["total_price"]
        data["均价"] = item["price"]
        # data["标签"] = item["house_tags"]
        data["所在区域"] = item["details"]
        data["交易权属"] = item["trading_ownership"]
        data["房屋用途"] = item["commodity_use"]
        data["房屋年限"] = item["house_year"]
        data["产权所属"] = item["property"]
        data["抵押信息"] = item["mortgage_information"]
        data["房本备件"] = item["room_spare"]
        # sort_keys=False keeps the dumped keys in their original insertion order (the default, True, sorts them)
        spider_data = yaml.dump(data, allow_unicode=True, width=1000, sort_keys=False)
        self.f.write(spider_data + '*'.center(60, '-') + '\n')
        return item

    def close_spider(self, spider):
        """Called when the crawl finishes; closes the file."""
        self.f.close()
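If MongoPipeline is switched on in ITEM_PIPELINES, each item is written to a collection named after the item class (item.__class__.__name__, i.e. CDErshouFang). A quick way to eyeball what was stored, assuming the MONGO_URL / MONGO_DB values from settings.py below:

# Sketch only: inspect a few documents written by MongoPipeline
import pymongo

client = pymongo.MongoClient("localhost")   # mirrors MONGO_URL in settings.py
db = client["mydb"]                         # mirrors MONGO_DB in settings.py
for doc in db["CDErshouFang"].find().limit(3):  # first few stored listings
    print(doc)
client.close()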
# middlewares.py
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class ScrapystudySpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ScrapystudyDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # request.cookie = {
        #     "Cookie": "__mta=108386109.1609123577452.1610351007435.1610351353409.13; __mta=108386109.1609123577452.1610351353409.1610362706394.14; uuid_n_v=v1; _lxsdk_cuid=176a73d3e42c8-057a36937583e8-c791039-149c48-176a73d3e42c8; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; uuid=DF86446053FA11EBBFA05D0E1C80A5E52BD1299115184C8C837F6324366BFFA0; _csrf=1d012800348e02304158b04bcaacdb15959e3482e6847893721b340ca6f29323; lt=8kvWp1o5sQYEgkrZTHbti6H0uI8AAAAAhgwAADxF8ufwXVyR4TU3_BGMHAKsB_TA6toYFjxg-m34Z43vNJlCb9Bv05PqTeelhSHITw; lt.sig=iPSGNXFnd3jV3SEy7wzqa0L_QOw; uid=2829236546; uid.sig=fiHM__7YgLUMEaZ05TkEQaVApbs; _lxsdk=DF86446053FA11EBBFA05D0E1C80A5E52BD1299115184C8C837F6324366BFFA0; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1609123577,1609148969,1610350992,1610362253; __mta=108386109.1609123577452.1610362628562.1610362689900.15; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1610362706; _lxsdk_s=176f0edcffa-620-f33-c24%7C%7C53",
        #     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
        # }

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


# import logging
# class ProxyMiddleware(object):
#     """Proxy-setting middleware."""
#     logger = logging.getLogger(__name__)
#
#     def process_request(self, request, spider):
#         self.logger.debug("Using Proxy")
#         request.meta["proxy"] = "http://125.87.105.4:49713"


from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from scrapy.http import HtmlResponse
from selenium.webdriver.chrome.options import Options
import logging
import time

logger = logging.getLogger(__name__)


class SeleniumMiddleware(object):
    # Open question from the original post: how can multiple HtmlResponse objects be handed to the spider for parsing?

    def process_request(self, request, spider):
        url = request.url
        opt = Options()
        opt.add_argument('--headless')
        # create the Chrome browser instance (the headless options must be passed in, otherwise a window opens)
        browser = webdriver.Chrome(options=opt)
        wait = WebDriverWait(browser, 10)
        browser.get(url)
        htmls = []  # unused; left over from the attempt to collect several pages (see the open question above)
        for page in range(2, 3):
            try:
                next_page = wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "body > div.page-container.clearfix > div.page-box > a.next")))
                next_page.click()
                # wait until the highlighted page number matches the page we just clicked to
                wait.until(EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, "body > div.page-container.clearfix > div.page-box > span.active"), str(page)))
            except TimeoutException:
                continue
        browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        time.sleep(2)
        html = browser.page_source  # source of the rendered page
        logger.info("Fetched URL: " + request.url)
        # browser.quit()
        return HtmlResponse(url=request.url, body=html, request=request, encoding='utf-8')
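On the open question in the comment above: process_request in a downloader middleware can return at most one Response per Request, so a single rendered HtmlResponse is all the spider will ever see from it. A common workaround (a sketch under that assumption, not the post's own solution) is to let the spider schedule one Request per list page, as CdErshoufangSpider already does with its pg{n} URLs, and have the middleware render only the requests that ask for it:

# Sketch only: render requests flagged with meta={'use_selenium': True}; everything else
# falls through to Scrapy's normal downloader. Class name is hypothetical.
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class SelectiveSeleniumMiddleware(object):

    def process_request(self, request, spider):
        if not request.meta.get('use_selenium'):
            return None  # let Scrapy download this request as usual
        opt = Options()
        opt.add_argument('--headless')
        browser = webdriver.Chrome(options=opt)
        try:
            browser.get(request.url)
            body = browser.page_source
        finally:
            browser.quit()  # always release the browser, even if rendering fails
        return HtmlResponse(url=request.url, body=body, request=request, encoding='utf-8')

The spider would then yield scrapy.Request(url, meta={'use_selenium': True}, callback=...) only for pages that actually need JavaScript rendering, and plain Requests for everything else.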
# settings.py
# Scrapy settings for scrapystudy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapystudy'

SPIDER_MODULES = ['scrapystudy.spiders']
NEWSPIDER_MODULE = 'scrapystudy.spiders'

MONGO_URL = "localhost"
MONGO_DB = "mydb"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapystudy (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'scrapystudy.middlewares.MyFirstSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'scrapystudy.middlewares.SeleniumMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# The numbers (300, 400, ...) set the execution order: the lower the number, the higher the priority and the earlier the pipeline runs
ITEM_PIPELINES = {
    'scrapystudy.pipelines.SaveCDershouFangPipeline': 600,
    # 'scrapystudy.pipelines.TextPipeline': 300,
    # 'scrapystudy.pipelines.MongoPipeline': 400,
    # 'scrapystudy.pipelines.SaveBeikePipeline': 500,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
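With SaveCDershouFangPipeline registered above, the crawl is normally started with "scrapy crawl cd_ershoufang" from the project root, and the dated YAML output lands in a spiderSaveFile directory next to pipelines.py. The same crawl can also be started programmatically; a sketch, assuming the spider module lives at scrapystudy/spiders/cd_ershoufang.py:

# Sketch only: run the spider from a script instead of the scrapy CLI
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from scrapystudy.spiders.cd_ershoufang import CdErshoufangSpider  # assumed module path

process = CrawlerProcess(get_project_settings())  # picks up settings.py, including ITEM_PIPELINES
process.crawl(CdErshoufangSpider)
process.start()  # blocks until the crawl finishes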
Original post: https://www.cnblogs.com/yzmPython/p/14371529.html