Goal: crawl the NetEase News homepage (https://news.163.com/) and collect the articles from four sections: 国内 (Domestic), 国际 (International), 军事 (Military) and 航空 (Aviation). Each article is saved to a file named after the pattern 网易新闻/版块名/新闻标题.txt (NetEase News/section name/article title.txt), with the article body as the file content.
Inspecting the homepage, the section pages and the article detail pages shows that only the headline lists on the four section pages are loaded dynamically via JavaScript, so that part will be fetched with Selenium + ChromeDriver.
The overall strategy for combining Selenium + ChromeDriver with Scrapy is: keep a single Chrome browser instance on the spider, let a downloader middleware intercept the requests for the four section pages, render and scroll them in the browser, and hand the rendered HTML back to Scrapy as an HtmlResponse. All other requests (the homepage and the detail pages) go through Scrapy's normal downloader, and the browser is closed when the spider closes.
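The snippets below assume a standard Scrapy project named wangyiPro with a spider called wangyi. Only the module paths wangyiPro.items, wangyiPro.middlewares, wangyiPro.pipelines and wangyiPro.spiders are given by the code; the concrete file names (e.g. wangyi.py) are my assumption. Such a project is typically generated with scrapy startproject wangyiPro followed by scrapy genspider wangyi news.163.com, and laid out roughly like this:

wangyiPro/
├── scrapy.cfg
└── wangyiPro/
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── wangyi.py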
items.py defines the three fields each item carries:

import scrapy


class WangyiproItem(scrapy.Item):
    path = scrapy.Field()     # directory the article will be written to
    title = scrapy.Field()    # article title
    content = scrapy.Field()  # article body
The spider, under wangyiPro/spiders/:

import os

import scrapy
from selenium import webdriver

from wangyiPro.items import WangyiproItem


class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    allowed_domains = ['163.com']
    start_urls = ['https://news.163.com/']
    model_urls = []  # urls of the four sections: 国内, 国际, 军事, 航空
    bro = webdriver.Chrome()  # shared browser instance, reused by the downloader middleware

    def parse(self, response):
        '''Parse the homepage.'''
        # create the top-level directory 网易新闻
        if not os.path.exists('网易新闻'):
            os.mkdir('网易新闻')
        li_list = response.xpath('//div[@class="ns_area list"]/ul/li')
        index_list = [2, 3, 5, 6]  # positions of the 国内, 国际, 军事 and 航空 entries in the nav bar
        for index in index_list:
            li = li_list[index]
            # url and name of this section
            model_url = li.xpath('./a/@href').extract_first()
            model_name = li.xpath('./a/text()').extract_first()
            self.model_urls.append(model_url)
            # directory the section's articles will be written to
            path = '网易新闻/' + model_name
            # create a sub-directory named after the section under 网易新闻
            if not os.path.exists(path):
                os.mkdir(path)
            yield scrapy.Request(url=model_url, callback=self.parse_model, meta={'path': path})

    def parse_model(self, response):
        '''Parse a section page (its body comes from Selenium, see the middleware).'''
        path = response.meta.get('path')
        div_list = response.xpath('//div[contains(@class,"data_row")]')
        for div in div_list:
            detail_title = div.xpath('.//h3/a/text()').extract_first()
            detail_url = div.xpath('.//h3/a/@href').extract_first()
            # instantiate a fresh item for every article
            item = WangyiproItem()
            item['title'] = detail_title
            item['path'] = path
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
        '''Parse an article detail page.'''
        item = response.meta.get('item')
        content_list = response.css('.post_body p::text').extract()
        content = ''.join(content_list)
        item['content'] = content
        yield item

    @staticmethod
    def close(spider, reason):
        '''Quit the browser when the spider is closed.'''
        spider.bro.quit()
Note: never instantiate the item object ahead of time in the parse callback; a separate item must be instantiated for every detail page, as the sketch below illustrates.
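A minimal sketch of the anti-pattern that note warns against (hypothetical code, not part of the project): if the item is created once before the loop, every scheduled request carries a reference to the same object, and the callbacks end up overwriting each other's fields.

    def parse_model(self, response):
        '''WRONG: one item is shared by all of the scheduled detail requests.'''
        path = response.meta.get('path')
        item = WangyiproItem()  # created once, outside the loop
        for div in response.xpath('//div[contains(@class,"data_row")]'):
            # each iteration overwrites the same item's fields
            item['title'] = div.xpath('.//h3/a/text()').extract_first()
            item['path'] = path
            detail_url = div.xpath('.//h3/a/@href').extract_first()
            # every pending request holds a reference to the one item, so by the
            # time the detail pages are parsed the title no longer matches the
            # article being processed
            yield scrapy.Request(url=detail_url, callback=self.parse_detail,
                                 meta={'item': item})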
middlewares.py contains two downloader middlewares: one rotates the User-Agent, the other intercepts the four section-page requests and renders them in the browser:

from time import sleep

from fake_useragent import UserAgent
from scrapy.http import HtmlResponse
from selenium.webdriver.common.by import By


class RandomUserAgentMiddleware:
    def process_request(self, request, spider):
        '''Attach a random User-Agent to every outgoing request.'''
        ua = UserAgent()
        request.headers['User-Agent'] = ua.random


class WangyiproDownloaderMiddleware:
    def process_request(self, request, spider):
        '''Intercept the requests for the four section pages.'''
        if request.url in spider.model_urls:
            bro = spider.bro
            bro.get(request.url)
            sleep(3)
            # keep scrolling until the page height stops growing
            current_height = bro.execute_script('return document.body.scrollHeight;')
            while True:
                bro.execute_script('window.scrollTo(0,document.body.scrollHeight);')
                sleep(3)
                new_height = bro.execute_script('return document.body.scrollHeight;')
                if new_height == current_height:
                    break
                current_height = new_height
            # once at the bottom, some sections need a click on the
            # '加载更多' (load more) button to load the remaining headlines
            load_more_ele = bro.find_elements(By.CSS_SELECTOR, '.post_addmore > span')
            try:
                if load_more_ele:
                    load_more_ele[0].click()
                    sleep(2)
            except Exception:
                pass
            # hand the rendered page back to Scrapy as the response
            return HtmlResponse(url=bro.current_url, body=bro.page_source,
                                request=request, encoding='utf-8')
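Requests whose URL is not in spider.model_urls fall through this process_request (it implicitly returns None), so the homepage and the article detail pages are still downloaded by Scrapy's regular downloader; only the four JavaScript-heavy section pages go through the browser.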
pipelines.py writes each article to 网易新闻/版块名/新闻标题.txt:

class WangyiproPipeline:
    def process_item(self, item, spider):
        file_name = item['path'] + '/' + item['title'] + '.txt'
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write(item['content'])
        return item
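One practical caveat the pipeline above does not handle: article titles can contain characters that are illegal in file names (e.g. / or ?), and the title or body can be None if an XPath misses. A minimal defensive variant, as a sketch (the re.sub pattern and the skip-on-missing-data behaviour are my additions, not part of the original pipeline):

import re


class WangyiproPipeline:
    def process_item(self, item, spider):
        title = item.get('title')
        if not title or not item.get('content'):
            return item  # skip articles whose title or body was not extracted
        # replace characters that are not allowed in file names
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', title).strip()
        file_name = item['path'] + '/' + safe_title + '.txt'
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write(item['content'])
        return item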
Finally, the relevant settings in settings.py:

BOT_NAME = 'wangyiPro'

SPIDER_MODULES = ['wangyiPro.spiders']
NEWSPIDER_MODULE = 'wangyiPro.spiders'

ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 2  # download delay in seconds

# enable the two downloader middlewares
DOWNLOADER_MIDDLEWARES = {
    'wangyiPro.middlewares.RandomUserAgentMiddleware': 600,
    'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
}

# enable the item pipeline
ITEM_PIPELINES = {
    'wangyiPro.pipelines.WangyiproPipeline': 300,
}

# only show errors in the log
LOG_LEVEL = 'ERROR'
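With all five files in place, the crawl is started from the project root with scrapy crawl wangyi. Note that the spider builds the 网易新闻 directory tree with relative paths, so the text files end up under whatever directory the command is run from.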
Original post: https://www.cnblogs.com/eliwang/p/14843571.html