"""Fetch a music.163.com user's follower list through the encrypted ``weapi``.

The request body is a small JSON document that is AES-CBC encrypted twice
(first with a fixed key, then with a random 16-character key); the random key
is then encrypted with textbook RSA and sent alongside as ``encSecKey``.
Each page of followers is appended to a local text file.
"""
import base64
import codecs
import json
import math
import random
import time

import requests
import scrapy  # continuous writing
from Crypto.Cipher import AES
from lxml import etree

# Seconds to wait for the server before giving up on a request, so a stalled
# connection cannot hang a long-running crawl forever.
REQUEST_TIMEOUT = 30


class niub:
    """Builds the ``params`` / ``encSecKey`` form fields for weapi requests."""

    def __init__(self):
        # Fixed AES key used for the first encryption pass.
        self.key = '0CoJUm6Qyw8W8jud'
        # Hex modulus used by _RSAencrypt.
        self.f = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
        # Hex public exponent used by _RSAencrypt.
        self.e = '010001'
        self.singer_id = '1411492497'
        self.post_url1 = 'https://music.163.com/weapi/user/getfolloweds?csrf_token='
        self.post_url2 = 'https://music.163.com/weapi/v1/play/record?csrf_token='

    def _generate_random_strs(self, length):
        """Return ``length`` random characters drawn from ``[a-zA-Z0-9]``.

        A non-positive ``length`` yields an empty string.
        """
        alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
        return "".join(random.choices(alphabet, k=max(length, 0)))

    def _AESencrypt(self, msg, key):
        """AES-CBC encrypt ``msg`` with ``key`` and return Base64 text.

        Args:
            msg: Plain text to encrypt.
            key: Text key; its UTF-8 encoding is handed to ``AES.new``.

        Returns:
            The ciphertext, Base64-encoded and decoded to ``str``.
        """
        data = msg.encode('utf8')
        # Pad up to a multiple of 16 bytes; every pad byte holds the pad
        # length. The pad is computed on the encoded bytes (not on the
        # character count) so non-ASCII input stays block-aligned.
        padding = 16 - len(data) % 16
        data += bytes([padding]) * padding
        # Initialisation vector for CBC mode (must be 16 bytes).
        iv = '0102030405060708'
        cipher = AES.new(key.encode('utf-8'), AES.MODE_CBC, iv.encode('utf-8'))
        encrypted = cipher.encrypt(data)
        return base64.b64encode(encrypted).decode('utf-8')

    def _RSAencrypt(self, randomstrs, key, f):
        """Encrypt ``randomstrs`` with unpadded RSA.

        Args:
            randomstrs: Text to encrypt; it is reversed before encoding.
            key: Public exponent as a hex string.
            f: Modulus as a hex string.

        Returns:
            The result as lowercase hex, left-padded with zeros to 256 chars.
        """
        # The string is reversed before being turned into a number.
        text = bytes(randomstrs[::-1], 'utf-8')
        base = int(codecs.encode(text, encoding='hex'), 16)
        # Three-argument pow performs modular exponentiation without
        # materialising the enormous intermediate ``base ** exponent``.
        seckey = pow(base, int(key, 16), int(f, 16))
        return format(seckey, 'x').zfill(256)

    def _get_params1(self, page):
        """Build the encrypted form fields for one page of followers.

        Args:
            page: 1-based page number; each page covers 20 entries.

        Returns:
            A ``(params, encSecKey)`` tuple of strings.
        """
        offset = (page - 1) * 20
        msg = (
            '{"userId": "' + self.singer_id + '", "offset":' + str(offset)
            + ', "total": "false", "limit": "20", "csrf_token": ""}'
        )
        # First AES pass uses the fixed key.
        enctext = self._AESencrypt(msg, self.key)
        # Second AES pass uses a fresh random 16-character key; its output is
        # the value of ``params``.
        random_key = self._generate_random_strs(16)
        encText = self._AESencrypt(enctext, random_key)
        # The random key itself is RSA-encrypted to produce ``encSecKey``.
        encSecKey = self._RSAencrypt(random_key, self.e, self.f)
        return encText, encSecKey

    def start_requests(self):
        """Yield the form-data dict for each page in ``range(1, 2)``."""
        for page in range(1, 2):
            params, encSecKey = self._get_params1(page)
            yield {
                'params': params,
                'encSecKey': encSecKey,
            }


# Unused leftovers from the original script; kept so the module's top-level
# names do not change.
msg = '{"userId": "1411492497", "offset":"1", "total": "false", "limit": "20", "csrf_token": ""}'
key = '0CoJUm6Qyw8W8jud'


def _format_followers(payload):
    """Render every entry of ``payload['followeds']`` as one ';'-separated line."""
    return ''.join(
        f"名字;{user['nickname']};动态;{user['eventCount']};"
        f"关注;{user['follows']};粉丝;{user['followeds']}\n"
        for user in payload['followeds']
    )


def lianjie(params, encSecKey):
    """POST one encrypted follower request and return the formatted rows.

    Args:
        params: Encrypted ``params`` form field.
        encSecKey: Encrypted ``encSecKey`` form field.

    Returns:
        One text line per follower (name, event count, follows, followers),
        concatenated into a single string.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        KeyError: If the response JSON has no ``followeds`` entry.
    """
    # No explicit Content-Length: requests computes it from the encoded body.
    headers = {
        'Host': 'music.163.com',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': '*/*',
        'Origin': 'https://music.163.com',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'Referer': 'https://music.163.com/user/fans?id=1411492497',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    url = 'https://music.163.com/weapi/user/getfolloweds?csrf_token='
    data = {
        'params': params,
        'encSecKey': encSecKey,
    }
    # NOTE(review): verify=False disables TLS certificate verification; it is
    # kept to preserve existing behaviour, but confirm it is really required.
    r = requests.post(
        url=url,
        data=data,
        headers=headers,
        verify=False,
        timeout=REQUEST_TIMEOUT,
    )
    r.raise_for_status()
    return _format_followers(json.loads(r.text))


def writeInfomation(contont):
    """Append ``contont`` to the output text file and print the elapsed time.

    The elapsed time is measured from the module-level ``start`` timestamp,
    which is only set when the file runs as a script; when it is absent the
    timing line is skipped instead of raising ``NameError``.
    """
    with open("鱿小鱼.txt", "a+", encoding='utf-8') as f:
        f.write(contont)
    started = globals().get('start')
    if started is not None:
        print(time.time() - started)


def main(offset):
    """Fetch follower page ``offset`` (1-based) and append it to the file."""
    signer = niub()
    params, enc_sec_key = signer._get_params1(offset)
    # A single request per page; the response is written straight to disk.
    writeInfomation(lianjie(params, enc_sec_key))


if __name__ == '__main__':
    start = time.time()
    for i in range(1, 3):
        main(offset=i)
这种方法在抓取10页以内时比第一种方法慢,但是可以持续抓取上万页,每页耗时约3秒;其他两种方法做不到这一点。
原文:https://www.cnblogs.com/aotumandaren/p/13761929.html