I have tested quite a few cnblogs page styles and found that blogs styled like 长书 (the style my own blog uses) cannot be scraped this way, because the markup tags are different. For any other style, all you need to do is change the value of user in bky.py (the 'username' string) to the username of the blog you want to crawl, for example:
user = "whz0215".
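If you are not sure whether a target blog uses the default markup, a quick check like the one below can save a wasted crawl. This is only a sketch and not part of the project: it assumes the requests library is available and simply reuses the same XPath the spider relies on.

import requests
from scrapy import Selector

def uses_default_theme(user):
    # fetch the first listing page and look for the div.forFlow / div.day
    # structure that the spider's XPath expects
    html = requests.get(
        "https://www.cnblogs.com/%s/default.html?page=1" % user,
        headers={"User-Agent": "Mozilla/5.0"},
    ).text
    return len(Selector(text=html).xpath('//div[@class="forFlow"]/div[@class="day"]')) > 0

print(uses_default_theme("whz0215"))  # True means the spider should be able to parse it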
The code is as follows:
bky.py under spiders:
import re

import scrapy
from scrapy import Request, Selector

from bokeyuan.items import BokeyuanItem


class BkySpider(scrapy.Spider):
    name = 'bky'
    allowed_domains = ['cnblogs.com']
    start_urls = ['https://cnblogs.com/']
    # username of the blog to crawl
    user = "username"
    cur_page = 1
    url = "https://www.cnblogs.com/%s/default.html?page=%s"

    def start_requests(self):
        url = self.url % (self.user, self.cur_page)
        yield Request(url, callback=self.parse)

    def parse(self, response):
        selector = Selector(text=response.text)
        # every post on a listing page sits in a div.day inside div.forFlow
        one_page = selector.xpath('//div[@class="forFlow"]/div[@class="day"]')
        for each in one_page:
            title = each.xpath('div[@class="postTitle"]/a[@class="postTitle2"]/text()').extract_first()
            sec_title = each.xpath('div[@class="postCon"]/div[@class="c_b_p_desc"]/text()').extract_first()
            detail_url = each.xpath('div[@class="postTitle"]/a/@href').extract_first()
            # footer line: "posted @ <date> <time> <author> 阅读(n) 评论(n)"
            desc = each.xpath('div[@class="postDesc"]/text()').extract_first()
            if desc:
                split_desc = desc.strip().split()
                post_time = split_desc[2] + " " + split_desc[3]
                postor = split_desc[4]
                read = re.search(r"(\d+)", split_desc[5]).group(1)
                comment = re.search(r"(\d+)", split_desc[6]).group(1)
                item = BokeyuanItem()
                item["title"] = title
                item["sec_title"] = sec_title
                item["post_time"] = post_time
                item["postor"] = postor
                item["read"] = read
                item["comment"] = comment
                item["detail_url"] = detail_url
                yield item
        if self.cur_page > 1:
            # stop when the pager no longer contains a numbered link
            if_next = selector.xpath('//div[@class="pager"]/a[last()]').extract_first()
            if not if_next or not re.search(r'(\d+)', if_next):
                return
        self.cur_page += 1
        yield Request(url=self.url % (self.user, self.cur_page), callback=self.parse)
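For reference, the postDesc footer that parse() splits above looks roughly like the string below (the values are made up for illustration); that layout is why indices 2 through 6 are used:

desc = "posted @ 2018-09-22 10:30 whz0215 阅读(12) 评论(0)"
parts = desc.strip().split()
# parts -> ['posted', '@', '2018-09-22', '10:30', 'whz0215', '阅读(12)', '评论(0)']
post_time = parts[2] + " " + parts[3]  # '2018-09-22 10:30'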
pipelines.py
import pymongo


class BokeyuanPipeline(object):
    def process_item(self, item, spider):
        return item


class MongoPipeline(object):
    collection_name = 'whz'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # connection info comes from settings.py; the database name
        # falls back to 'bky' when MONGO_DATABASE is not set
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'bky'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # truncate over-long summaries before storing
        if item.get("sec_title") and len(item["sec_title"]) > 50:
            item["sec_title"] = item["sec_title"][:50] + "..."
        self.db[self.collection_name].insert_one(dict(item))
        return item
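To confirm that items actually reached MongoDB, a quick query along these lines works (assuming the defaults used above: a local MongoDB, database 'bky', collection 'whz', and pymongo 3.7+ for count_documents):

import pymongo

client = pymongo.MongoClient('localhost')
collection = client['bky']['whz']
print(collection.count_documents({}))  # number of posts stored so far
for doc in collection.find().limit(3):
    print(doc.get('title'), doc.get('read'), doc.get('comment'))
client.close()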
items.py
from scrapy import Item, Field


class BokeyuanItem(Item):
    # fields stored for each blog post
    title = Field()
    sec_title = Field()    # post summary
    post_time = Field()
    postor = Field()       # author
    read = Field()         # view count
    comment = Field()      # comment count
    detail_url = Field()
settings.py
Change:
ROBOTSTXT_OBEY = False
Uncomment the following blocks and edit them to:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'Referer': 'https://www.cnblogs.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}
ITEM_PIPELINES = {
    'bokeyuan.pipelines.BokeyuanPipeline': 300,
    'bokeyuan.pipelines.MongoPipeline': 301,
}
Add (the names must match what MongoPipeline.from_crawler reads):
MONGO_URI = 'localhost'
MONGO_DATABASE = 'dbname'
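With the settings in place, the crawler is started from the project root in the usual Scrapy way; the -o option is optional and just writes the items to a JSON file in addition to MongoDB:

scrapy crawl bky -o posts.json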
Original post: https://www.cnblogs.com/whz0215/p/9694048.html