I have tested quite a few cnblogs page styles and found that blogs styled like 长书 (the style my own blog uses) cannot be scraped this way, because the markup tags are different. For any other style, all you need to do is change the value of user in bky.py (the 'username' string) to the username of the blog you want to crawl, for example:
user = "whz0215".
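If you are not sure whether a target blog uses the default markup, a quick check like the one below can save a wasted crawl. This is only a sketch and not part of the project: it assumes the requests library is available and simply reuses the same XPath the spider relies on.

import requests
from scrapy import Selector

def uses_default_theme(user):
    # fetch the first listing page and look for the div.forFlow / div.day
    # structure that the spider's XPath expects
    html = requests.get(
        "https://www.cnblogs.com/%s/default.html?page=1" % user,
        headers={"User-Agent": "Mozilla/5.0"},
    ).text
    return len(Selector(text=html).xpath('//div[@class="forFlow"]/div[@class="day"]')) > 0

print(uses_default_theme("whz0215"))  # True means the spider should be able to parse it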
The code is as follows:
bky.py under spiders:
import re

import scrapy
from scrapy import Request, Selector

from bokeyuan.items import BokeyuanItem


class BkySpider(scrapy.Spider):
    name = 'bky'
    allowed_domains = ['cnblogs.com']
    start_urls = ['https://cnblogs.com/']
    # username of the blog to crawl
    user = "username"
    cur_page = 1
    url = "https://www.cnblogs.com/%s/default.html?page=%s"

    def start_requests(self):
        url = self.url % (self.user, self.cur_page)
        yield Request(url, callback=self.parse)

    def parse(self, response):
        selector = Selector(text=response.text)
        # every post on a listing page sits in a div.day inside div.forFlow
        one_page = selector.xpath('//div[@class="forFlow"]/div[@class="day"]')
        for each in one_page:
            title = each.xpath('div[@class="postTitle"]/a[@class="postTitle2"]/text()').extract_first()
            sec_title = each.xpath('div[@class="postCon"]/div[@class="c_b_p_desc"]/text()').extract_first()
            detail_url = each.xpath('div[@class="postTitle"]/a/@href').extract_first()
            # footer line: "posted @ <date> <time> <author> 阅读(n) 评论(n)"
            desc = each.xpath('div[@class="postDesc"]/text()').extract_first()
            if desc:
                split_desc = desc.strip().split()
                post_time = split_desc[2] + " " + split_desc[3]
                postor = split_desc[4]
                read = re.search(r"(\d+)", split_desc[5]).group(1)
                comment = re.search(r"(\d+)", split_desc[6]).group(1)
                item = BokeyuanItem()
                item["title"] = title
                item["sec_title"] = sec_title
                item["post_time"] = post_time
                item["postor"] = postor
                item["read"] = read
                item["comment"] = comment
                item["detail_url"] = detail_url
                yield item
        if self.cur_page > 1:
            # stop when the pager no longer contains a numbered link
            if_next = selector.xpath('//div[@class="pager"]/a[last()]').extract_first()
            if not if_next or not re.search(r'(\d+)', if_next):
                return
        self.cur_page += 1
        yield Request(url=self.url % (self.user, self.cur_page), callback=self.parse)
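For reference, the postDesc footer that parse() splits above looks roughly like the string below (the values are made up for illustration); that layout is why indices 2 through 6 are used:

desc = "posted @ 2018-09-22 10:30 whz0215 阅读(12) 评论(0)"
parts = desc.strip().split()
# parts -> ['posted', '@', '2018-09-22', '10:30', 'whz0215', '阅读(12)', '评论(0)']
post_time = parts[2] + " " + parts[3]  # '2018-09-22 10:30'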
pipelines.py
import pymongo


class BokeyuanPipeline(object):
    def process_item(self, item, spider):
        return item


class MongoPipeline(object):
    collection_name = 'whz'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # connection info comes from settings.py; the database name
        # falls back to 'bky' when MONGO_DATABASE is not set
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'bky'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # truncate over-long summaries before storing
        if item.get("sec_title") and len(item["sec_title"]) > 50:
            item["sec_title"] = item["sec_title"][:50] + "..."
        self.db[self.collection_name].insert_one(dict(item))
        return item
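To confirm that items actually reached MongoDB, a quick query along these lines works (assuming the defaults used above: a local MongoDB, database 'bky', collection 'whz', and pymongo 3.7+ for count_documents):

import pymongo

client = pymongo.MongoClient('localhost')
collection = client['bky']['whz']
print(collection.count_documents({}))  # number of posts stored so far
for doc in collection.find().limit(3):
    print(doc.get('title'), doc.get('read'), doc.get('comment'))
client.close()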
items.py
from scrapy import Item, Field


class BokeyuanItem(Item):
    # fields stored for each blog post
    title = Field()
    sec_title = Field()    # post summary
    post_time = Field()
    postor = Field()       # author
    read = Field()         # view count
    comment = Field()      # comment count
    detail_url = Field()
settings.py
Change:
ROBOTSTXT_OBEY = False
Uncomment the following blocks and edit them to:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'Referer': 'https://www.cnblogs.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}
ITEM_PIPELINES = {
    'bokeyuan.pipelines.BokeyuanPipeline': 300,
    'bokeyuan.pipelines.MongoPipeline': 301,
}
Add (the names must match what MongoPipeline.from_crawler reads):
MONGO_URI = 'localhost'
MONGO_DATABASE = 'dbname'
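With the settings in place, the crawler is started from the project root in the usual Scrapy way; the -o option is optional and just writes the items to a JSON file in addition to MongoDB:

scrapy crawl bky -o posts.json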
Original post: https://www.cnblogs.com/whz0215/p/9694048.html