爬取YY评级信息

时间：2019-10-27 10:30:29 阅读：126 评论：0 收藏：0 [点我收藏+]
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File  : 爬取YY评级基本信息.py
# @Author: lattesea
# @Date  : 2019/10/7
# @Desc  :
import requests
import json
import csv
from fake_useragent import UserAgent
import time
import random


class YYpingjiSpider(object):
    """Crawl issuer rating data from the RatingDog (YY rating) JSON API.

    The search endpoint is paged to collect ``(IssuerID, IssuerType)`` pairs.
    For each pair the matching detail endpoint is queried and the flattened
    record is appended to ``1001.csv`` (issuer type ``'产业'``) or
    ``1002.csv`` (issuer type ``'城投'``) in the current working directory.
    """

    # Must equal the ``limit=`` value baked into ``self.url``. The search
    # offset advances by this amount so that no result page is skipped.
    PAGE_SIZE = 10
    # Exclusive upper bound of the search offset (hard-coded crawl size).
    MAX_OFFSET = 4631
    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 10

    # Fields read from the top level of a detail record (both issuer types).
    _COMMON_FIELDS = (
        'IssuerName', 'CorporateRating', 'RatingAgency', 'Holder', 'Industry',
        'Nature', 'YYRating', 'IssuerType', 'CreditAnalysis',
    )
    # Fields read from the nested ``CyExtendInfo`` object (IssuerType=1001).
    _EXTEND_FIELDS_1001 = (
        'YYIndustry', 'YYIndustryId', 'IndustrylStatus',
        'ShareholderBackground', 'OperatingStatus', 'FinancialStatus', 'Focus',
    )
    # Fields read from the nested ``CtExtendInfo`` object (IssuerType=1002).
    _EXTEND_FIELDS_1002 = (
        'PlatformImportance', 'PrincipalBusiness', 'GDP', 'Revenue',
        'YYRatio', 'IssuerCity', 'ADLevel',
    )

    # CSV column order for each output file. Every column appears once.
    FIELDS_1001 = _COMMON_FIELDS + _EXTEND_FIELDS_1001
    FIELDS_1002 = _COMMON_FIELDS + _EXTEND_FIELDS_1002

    def __init__(self):
        """Set the endpoint URL templates; no network access happens here."""
        self.url = 'https://api.ratingdog.cn/v1/search?limit=10&offset={}&type=3&qtext=&filter=%7B%7D&_=1570391570681'
        self.url2 = 'https://api.ratingdog.cn/v1/GetIssuerInfo?IssuerID={}&IssuerType=1001'
        self.url3 = 'https://api.ratingdog.cn/v1/GetIssuerInfo?IssuerID={}&IssuerType=1002'
        # Created lazily on the first request and then reused.
        self._ua = None

    def get_headers(self):
        """Return request headers with a freshly randomized User-Agent."""
        if getattr(self, '_ua', None) is None:
            self._ua = UserAgent()
        return {
            "Accept": "application/json, text/plain, */*",
            "Origin": "https://www.ratingdog.cn",
            "Referer": "https://www.ratingdog.cn/",
            "Sec-Fetch-Mode": "cors",
            "User-Agent": self._ua.random,
        }

    def _get_json(self, url):
        """GET *url* and return the decoded JSON body.

        Raises ``requests.HTTPError`` on a non-2xx response instead of trying
        to parse an error page as JSON.
        """
        response = requests.get(
            url=url, headers=self.get_headers(), timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response.json()

    @classmethod
    def _flatten_issuer(cls, rows, extend_key, extend_fields):
        """Flatten one detail record into a single-level dict.

        :param rows: the ``rows`` object of a detail response (a dict).
        :param extend_key: key of the nested type-specific object.
        :param extend_fields: names to copy out of that nested object.
        :return: dict with the common fields followed by *extend_fields*;
            a field absent from the response maps to ``None``.
        """
        rows = rows or {}
        extend = rows.get(extend_key) or {}
        record = {field: rows.get(field) for field in cls._COMMON_FIELDS}
        record.update((field, extend.get(field)) for field in extend_fields)
        return record

    def parse_IssuerID_IssuerType(self, url):
        """Fetch one search page and return its ``(IssuerID, IssuerType)`` pairs."""
        payload = self._get_json(url)
        IssuerID_list = [
            (row['IssuerID'], row['IssuerType']) for row in payload['rows']
        ]
        print(IssuerID_list)
        return IssuerID_list

    def parse_basic_message_1002(self, IssuerID):
        """Fetch and flatten the IssuerType=1002 detail record of *IssuerID*."""
        payload = self._get_json(self.url3.format(IssuerID))
        basic_message = self._flatten_issuer(
            payload.get('rows'), 'CtExtendInfo', self._EXTEND_FIELDS_1002
        )
        print(basic_message)
        return basic_message

    def parse_basic_message_1001(self, IssuerID):
        """Fetch and flatten the IssuerType=1001 detail record of *IssuerID*."""
        payload = self._get_json(self.url2.format(IssuerID))
        basic_message = self._flatten_issuer(
            payload.get('rows'), 'CyExtendInfo', self._EXTEND_FIELDS_1001
        )
        print(basic_message)
        return basic_message

    @staticmethod
    def _append_csv_row(path, fieldnames, result):
        """Append *result* as one row to the CSV file at *path*.

        The encoding is pinned to UTF-8 so the output does not depend on the
        platform default. No header row is written.
        """
        with open(path, 'a', newline='', encoding='utf-8') as f:
            csv.DictWriter(f, fieldnames).writerow(result)

    def save_csv_1001(self, result):
        """Append one IssuerType=1001 record to ``1001.csv``."""
        self._append_csv_row('1001.csv', self.FIELDS_1001, result)

    def save_csv_1002(self, result):
        """Append one IssuerType=1002 record to ``1002.csv``."""
        self._append_csv_row('1002.csv', self.FIELDS_1002, result)

    def _page_offsets(self):
        """Return the search offsets to request, one per result page."""
        return range(0, self.MAX_OFFSET, self.PAGE_SIZE)

    def run(self):
        """Crawl every search page and store each issuer's detail record."""
        for offset in self._page_offsets():
            page = self.parse_IssuerID_IssuerType(self.url.format(offset))
            for issuer_id, issuer_type in page:
                if issuer_type == '产业':
                    self.save_csv_1001(self.parse_basic_message_1001(issuer_id))
                elif issuer_type == '城投':
                    self.save_csv_1002(self.parse_basic_message_1002(issuer_id))
                # Random pause between issuers to keep the request rate low.
                time.sleep(random.uniform(1, 4))


# Script entry point: run the full crawl only when executed directly.
if __name__ == '__main__':
    spider = YYpingjiSpider()
    spider.run()
注意：该网站对访问频率有限制，请求过于频繁会导致账号被封。
爬取YY评级信息
原文：https://www.cnblogs.com/lattesea/p/11746480.html
踩
(0)
评论一句话评论（0）
分享档案
更多>
2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)