
Scrapy practice: full-site crawl of xpc (xinpianchang)

# spider file
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy import Request
import json
import string
import random
from xpc.items import PostItem, CommentItem, CopyItem  # multiple item classes

def strip(s):
    # if s exists, strip surrounding whitespace; otherwise return an empty string
    if s:
        return s.strip()
    return ""


# scrapy.Request and scrapy.FormRequest keep (and resend) cookies by default
# the simulated login itself is done outside Scrapy, directly with the requests module
cookies = dict(
    Authorization='4F635191B0602B5D3B06024483B0602AAF8B06023C2F6259656D'
)
# the cookie above is returned by the site: log in first and copy it from the browser
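
# Hedged sketch of the "log in with requests, then reuse the cookies in Scrapy" idea
# mentioned above. The login URL and form field names below are hypothetical
# placeholders, not xinpianchang's documented endpoint: copy the real ones from the
# browser's network panel, or simply log in manually and paste the Authorization
# cookie as done above.
import requests

def fetch_login_cookies(username, password):
    session = requests.Session()
    # hypothetical endpoint and form fields -- adjust to what the browser actually sends
    session.post("https://www.xinpianchang.com/login",
                 data={"username": username, "password": password})
    # every cookie set by the server during login is now in the session jar,
    # ready to be passed to scrapy.Request(cookies=...)
    return session.cookies.get_dict()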

# generate a random 26-character session id from lowercase letters and digits
def gen_sessionid():
    return "".join(random.choices(string.ascii_lowercase + string.digits, k=26))


class XinpianchangSpider(scrapy.Spider):
    name = 'XinPianChang'
    allowed_domains = ['xinpianchang.com', 'openapi-vtom.vmovier.com']
    start_urls = ['https://www.xinpianchang.com/channel/index/sort-like?from=tabArticle']
    # suppose we start from page 21: requests then need to carry cookies, and the cookies set
    # above are no longer enough on their own (the site returns 4 cookies), so start_requests
    # has to be overridden
    # start_urls = ['https://www.xinpianchang.com/channel/index/type-/sort-like/duration_type-0/resolution_type-/page-21']
    page_count = 0

    # override start_requests from the parent class; by default it sends GET requests to the URLs in start_urls
    # def start_requests(self):
    #     for url in self.start_urls:
    #         # data = {
    #         #     "kw": "cat"
    #         # }
    #         # to send a POST request, use FormRequest
    #         # yield scrapy.FormRequest(url=url, callback=self.parse, formdata=data)
    #
    #         c = cookies.copy()
    #         c.update(PHPSESSID=gen_sessionid(),
    #                  SERVER_ID='b52601c8-285bdd26',
    #                  channel_page='apU%3D')
    #         yield Request(url, cookies=c, dont_filter=True)


    def parse(self, response):
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)
        self.page_count += 1
        if self.page_count >= 100:
            # rotate the session id every 100 list pages
            cookies.update(PHPSESSID=gen_sessionid())
            self.page_count = 0

        url_list = response.xpath('//ul[@class="video-list"]/li/@data-articleid').extract()
        for pid in url_list:
            detail_url = 'https://www.xinpianchang.com/a{}?from=ArticleList'.format(pid)
            # print(detail_url)
            request = response.follow(detail_url, callback=self.parse_post)
            request.meta['pid'] = pid
            yield request  # request the detail page of each post

        pages = response.xpath('//div[@class="page"]/a/@href').extract()
        for page_url in pages:
            # print("list pagination url", page_url)  # page_url is a relative, incomplete path
            yield response.follow(page_url, self.parse, cookies=cookies)

    def parse_post(self, response):
        pid = response.meta['pid']
        post = PostItem()
        post['pid'] = pid
        post['title'] = response.xpath('//div[@class="title-wrap"]/h3/text()').get()
        # video_url = 'https://openapi-vtom.vmovier.com/v3/video/5E34203E92450?expand=resource&usage=xpc_web'
        # response.text is the raw HTML source returned by the page
        vid = re.findall(r'vid: "(.*?)",', response.text)[0]
        # print(vid)
        video_url = 'https://openapi-vtom.vmovier.com/v3/video/{}?expand=resource&usage=xpc_web'.format(vid)
        cates = response.xpath('//span[@class="cate v-center"]/a/text()').extract()
        post['category'] = ''.join([cate.strip() for cate in cates])
        post['create_time'] = response.xpath('//span[contains(@class,"update-time")]/i/text()').get()
        post['play_count'] = response.xpath('//i[contains(@class,"play-counts")]/text()').get()
        desc_lst = response.xpath('//p[contains(@class,"desc")]//text()').extract()
        post['desc'] = ' '.join([i.strip() for i in desc_lst])

        # extra step: the playable video URL has to be fetched from this separate JSON API
        request = Request(video_url, callback=self.parse_video)
        # pass the partially filled post item to the next callback via meta (request-to-request data passing)
        request.meta['post'] = post
        yield request

        # comment API, e.g. 'https://app.xinpianchang.com/comments?resource_id=10664352&type=article&page=1&per_page=24'
        comment_url = "https://app.xinpianchang.com/comments?resource_id={}&type=article&page=1&per_page=24".format(
            pid)
        request = Request(comment_url, callback=self.parse_comment)
        # pass pid to the next callback via meta
        request.meta['pid'] = pid
        yield request

        # creator (author) page links
        creator_list = response.xpath('//div[@class="filmplay-creator right-section"]/ul[@class="creator-list"]/li')
        composer_url = 'https://www.xinpianchang.com/u{}?from=articleList'
        # cid = response.xpath('//div[@class="filmplay-creator right-section"]/ul[@class="creator-list"]/li/a/@data-userid')
        for creator in creator_list:
            cid = creator.xpath('./a/@data-userid').get()
            composer_url = 'https://www.xinpianchang.com/u{}?from=articleList'.format(cid)
            request = response.follow(composer_url, self.parse_composer)
            request.meta['cid'] = cid
            # once the cookies have been rotated, stop merging them into the headers so
            # requests do not accumulate a long trail of stale cookies
            request.meta['dont_merge_cookies'] = True
            yield request

            # creator-to-video (copyright) relationship
            cr = CopyItem()
            cr['pid'] = pid
            cr['cid'] = cid
            cr['pcid'] = pid + cid
            cr['role'] = creator.xpath('./div[@class="creator-info"]/span/text()').get()
            # print("cr", cr)
            yield cr

    def parse_video(self, response):  # this response body is JSON
        post = response.meta['post']
        # parse the returned JSON first (Scrapy >= 2.2 also offers response.json() for this)
        result = json.loads(response.text)
        post['video_url'] = result['data']['resource']['default']['url']
        # the finished item goes straight to the pipelines
        yield post

    def parse_comment(self, response):
        result = json.loads(response.text)
        for c in result['data']['list']:
            comment = CommentItem()
            comment['uname'] = c['userInfo']['username']
            comment['user_id'] = c['userInfo']['id']
            # comment['user_page'] = c['userInfo']['web_url']
            comment['content'] = c['content']
            comment['content_id'] = c['id']
            print(comment)
            yield comment

        # follow the next page if there is one
        if result['data']['next_page_url']:
            next_page = 'https://app.xinpianchang.com' + result['data']['next_page_url']
            # print("next_page", next_page)
            yield response.follow(next_page, self.parse_comment)

    def parse_composer(self, response):
        pass
# settings file
# -*- coding: utf-8 -*-

# Scrapy settings for xpc project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'xpc'

SPIDER_MODULES = ['xpc.spiders']
NEWSPIDER_MODULE = 'xpc.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'xpc (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# set COOKIES_ENABLED = True when passing custom cookies per request (Request(cookies=...))
# set COOKIES_ENABLED = False to send the cookie defined in the settings headers as-is
COOKIES_ENABLED = True
COOKIES_DEBUG = True  # log detailed cookie information for every request/response

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'xpc.middlewares.XpcSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'xpc.middlewares.XpcDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'xpc.pipelines.XpcPipeline': 300,  # lower number = higher priority
    # 'xpc.pipelines.MysqlPipeline': 301,
    # 'xpc.pipelines.RedisPipeline': 302,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = False   # True caches pages already visited, so no real request is sent again
# HTTPCACHE_ENABLED = True

# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# log level: INFO, DEBUG, ERROR
LOG_LEVEL = 'DEBUG'
# items file
# -*- coding: utf-8 -*-
import scrapy

class PostItem(scrapy.Item):
    # stores info about one post (video)
    # custom class attribute: with several tables, each item class gets its own table_name
    table_name = 'posts'

    # data fields
    pid = scrapy.Field()
    title = scrapy.Field()
    category = scrapy.Field()
    create_time = scrapy.Field()
    play_count = scrapy.Field()
    desc = scrapy.Field()
    video_url = scrapy.Field()


class CommentItem(scrapy.Item):
    # stores one comment
    table_name = 'comments'
    content_id = scrapy.Field()
    pid = scrapy.Field()
    cid = scrapy.Field()
    uname = scrapy.Field()
    user_id = scrapy.Field()
    content = scrapy.Field()
    user_page = scrapy.Field()


class CopyItem(scrapy.Item):
    # stores the creator-to-video (copyright) relationship
    table_name = 'copyrights'

    pcid = scrapy.Field()  # primary key of the table
    pid = scrapy.Field()
    cid = scrapy.Field()
    role = scrapy.Field()
# pipelines file
# -*- coding: utf-8 -*-

import csv
import json
import os

from xpc.items import PostItem, CommentItem, CopyItem
import pymysql
from redis import Redis

class XpcPipeline(object):
    def __init__(self):
        # directory that contains this file
        store_file = os.path.dirname(__file__) + '/xpc.csv'
        # open the output file
        self.file = open(store_file, 'w', newline="")
        # csv writer
        self.writer = csv.writer(self.file)

    def open_spider(self, spider):
        print("pipeline: spider opened......")
  

    # handling several different item types in one pipeline
    def process_item(self, item, spider):
        if isinstance(item, PostItem):
            print("post item:", item)
        elif isinstance(item, CommentItem):
            print("comment item:", item)
        elif isinstance(item, CopyItem):
            print("copyright item:", item)
        return item  # hand the item on to the next pipeline class

    def close_spider(self, spider):
        print("pipeline: spider closed......")


# MySQL pipeline
class MysqlPipeline(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='',
            db='test_db',
            charset='utf8'
        )
        print("MySQL connected")

    def process_item(self, item, spider):
        # note: the 'author'/'content' fields come from a generic example and do not match
        # this project's items; interpolating values into the SQL string is also unsafe --
        # see the parameterized sketch after this file
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(
                'insert into test_db values("%s", "%s")' % (item['author'], item['content']))
            self.conn.commit()
        except Exception as e:
            print("insert failed:", e)
            print("rolling back")
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        print("closing MySQL connection")
        self.cursor.close()
        self.conn.close()


# Redis pipeline
class RedisPipeline(object):
    conn = None

    def open_spider(self, spider):
        self.conn = Redis(
            host='127.0.0.1',
            port=6379
        )
        print("Redis connected")

    def process_item(self, item, spider):
        dic = {
            "author": item["author"],
            "content": item["content"]
        }
        # redis-py cannot store a dict directly, so serialize it first
        self.conn.lpush("queue_name", json.dumps(dic))
        return item

    def close_spider(self, spider):
        print("closing Redis connection")
        self.conn.close()
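
The MysqlPipeline above interpolates values straight into the SQL string and hard-codes two columns, so it only works for one item shape and is open to SQL injection. Below is a minimal sketch, not part of the original post, of how the table_name attribute defined on each item class could drive one generic, parameterized MySQL pipeline; the connection parameters are the same placeholders used above, and the tables are assumed to already exist with columns named after the item fields.

# hedged sketch: one generic MySQL pipeline driven by item.table_name (assumed schema)
import pymysql


class GenericMysqlPipeline(object):
    def open_spider(self, spider):
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                                    password='', db='test_db', charset='utf8')

    def process_item(self, item, spider):
        table = getattr(item, 'table_name', None)
        if not table:
            return item  # items without a table_name pass through untouched
        keys = list(item.keys())  # only the fields actually populated on this item
        sql = 'insert into {} ({}) values ({})'.format(
            table, ', '.join(keys), ', '.join(['%s'] * len(keys)))
        with self.conn.cursor() as cursor:
            try:
                # values are passed separately, so pymysql escapes them safely
                cursor.execute(sql, [item[k] for k in keys])
                self.conn.commit()
            except Exception as e:
                spider.logger.error('insert into %s failed: %s', table, e)
                self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.conn.close()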

 


Original post: https://www.cnblogs.com/kenD/p/12304304.html
