Recently I noticed that a certain headlines site (Toutiao) has dropped its encrypted '_signature' parameter, so its news data can be scraped again. PS: this post is for technical sharing only; I accept no responsibility for how it is used.
Maybe their traffic was too low, or maybe being invincible got lonely; who knows why, but scraping works now. This post walks through scraping the articles published by a news account on Toutiao.
First, inspect the account's page. The data is loaded asynchronously, so there is nothing for it but to analyze the API.
Endpoint: https://www.toutiao.com/c/user/article/, a GET request, which requires the following parameters:
page_type: 1
user_id: 4377795668
max_behot_time: 0
count: 20
as: A1652D5B5BF77CE
cp: 5DBB37970C6E3E1
_signature: eMm5vRAXJXW9rOEwX0M2KXjJua  # this encrypted parameter is no longer required
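Before digging into the JS, a quick probe of the endpoint helps confirm it answers without _signature. Here is a minimal sketch; the as/cp values are the captured samples above, and since they are derived from the request timestamp they may well be rejected as stale (how to generate fresh ones is worked out below):

import requests

# Probe the endpoint with the captured sample parameters.
params = {
    'page_type': 1,
    'user_id': 4377795668,     # sample account id from the captured request
    'max_behot_time': 0,
    'count': 20,
    'as': 'A1652D5B5BF77CE',   # captured sample value, time-derived
    'cp': '5DBB37970C6E3E1',   # captured sample value, time-derived
}
resp = requests.get('https://www.toutiao.com/c/user/article/',
                    params=params, headers={'user-agent': 'Mozilla/5.0'})
print(resp.json().get('message'))  # 'success' when the parameters are accepted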
So we have to work out how these parameters are built. Searching the site's JS for the endpoint and the parameter names turns up this snippet:
function t(i, t) {
    if (!e.isBanned && !m) {
        m = !0;
        var o = n();
        http({
            url: u, // u = /c/user/article/
            method: "get",
            data: o,
            type: h,
            success: function(e) {
                "success" === e.message && ("refresh" === i && (d.list = []),
                v == e.page_type && (d.list = d.list.concat(s(e.data)),
                p[1 * v] = !!e.has_more,
                a(),
                d.params.max_behot_time = e.next && e.next.max_behot_time || 0))
            },
            complete: function() {
                d.empty = 0 === d.list.length,
                m = !1,
                t && t(),
                d.update()
            }
        })
    }
}
From this snippet we can see that the request payload is data: o with o = n(), so the function n() is the key. We can also see that the max_behot_time for the next page comes from the response JSON at ['next']['max_behot_time'].
Now let's analyze n():
function n() {
    var e, i = ascp.getHoney();
    return e = _.extend({}, d.params, {
        as: i.as,
        cp: i.cp
    })
}
!function(e) {
    var i = {};
    i.getHoney = function() {
        var e = Math.floor((new Date).getTime() / 1e3)
          , i = e.toString(16).toUpperCase()
          , t = md5(e).toString().toUpperCase();
        if (8 != i.length)
            return {
                as: "479BB4B7254C150",
                cp: "7E0AC8874BB0985"
            };
        for (var o = t.slice(0, 5), n = t.slice(-5), s = "", a = 0; 5 > a; a++)
            s += o[a] + i[a];
        for (var l = "", r = 0; 5 > r; r++)
            l += i[r + 3] + n[r];
        return {
            as: "A1" + s + i.slice(-3),
            cp: i.slice(0, 3) + l + "E1"
        }
    }
    ,
    e.ascp = i
}(window, document),
This snippet shows how as and cp are built: take the current Unix timestamp in seconds, render it as an uppercase hex string i (normally 8 characters), and compute the uppercase MD5 hash t of the timestamp. Then as is "A1" plus the first 5 characters of t interleaved with the first 5 of i, plus the last 3 of i; cp is the first 3 characters of i, then characters 4-8 of i interleaved with the last 5 of t, plus "E1". Reproducing this in Python:
# Build the request parameters
def create_data(self, max_behot_time):
    md5 = hashlib.md5()
    e = math.floor(time.time())   # current Unix timestamp in seconds
    i = hex(e)[2:].upper()        # uppercase hex of the timestamp, like e.toString(16).toUpperCase()
    md5.update(str(e).encode('utf-8'))
    ret = md5.hexdigest()
    t = ret.upper()               # uppercase MD5 of the timestamp
    o = t[0:5]
    n = t[-5:]
    a = ''
    for s in range(5):
        a += o[s] + i[s]
    r = ''
    for l in range(5):
        r += i[l + 3] + n[l]
    data = {
        'page_type': 1,
        'user_id': self.mid,
        'max_behot_time': max_behot_time,
        'count': 20,
        'as': 'A1' + a + i[-3:],
        'cp': i[0:3] + r + 'E1',
        # '_signature': '1uOraRARi2EThvPkAv1vPtbjq3',  # the once-formidable _signature, no longer needed
    }
    return data
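As a quick sanity check (a standalone sketch, not part of the original post), we can generate a pair and confirm that both as and cp come out 15 characters long, the same shape as the captured samples:

import hashlib
import math
import time

# Standalone version of the as/cp generation, for verification only.
def make_as_cp():
    e = math.floor(time.time())
    i = hex(e)[2:].upper()                                  # 8 hex chars for current epochs
    t = hashlib.md5(str(e).encode('utf-8')).hexdigest().upper()
    a = ''.join(t[k] + i[k] for k in range(5))              # interleave t[0:5] with i[0:5]
    r = ''.join(i[k + 3] + t[-5:][k] for k in range(5))     # interleave i[3:8] with t[-5:]
    return 'A1' + a + i[-3:], i[0:3] + r + 'E1'

as_, cp = make_as_cp()
print(as_, cp)
assert len(as_) == 15 and len(cp) == 15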
With the request parameters in hand, we can scrape the data. The complete script:
import time
import math
import pymongo
import requests
import hashlib
import threading
from queue import Queue
import pandas as pd
from fake_useragent import UserAgent
class PaTouTiao(object):
    def __init__(self, mid, user_agent):
        self.mid = mid
        self.url = 'https://www.toutiao.com/c/user/article/'
        self.headers = {
            'referer': 'https://www.toutiao.com/c/user/{}'.format(self.mid),
            'user-agent': user_agent,
        }
        # Connect to the local MongoDB instance
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.db = self.client['TouTiaoNews']
    # Build the request parameters
    def create_data(self, max_behot_time):
        md5 = hashlib.md5()
        e = math.floor(time.time())
        i = hex(e)[2:].upper()
        md5.update(str(e).encode('utf-8'))
        ret = md5.hexdigest()
        t = ret.upper()
        o = t[0:5]
        n = t[-5:]
        a = ''
        for s in range(5):
            a += o[s] + i[s]
        r = ''
        for l in range(5):
            r += i[l + 3] + n[l]
        data = {
            'page_type': 1,
            'user_id': self.mid,
            'max_behot_time': max_behot_time,
            'count': 20,
            'as': 'A1' + a + i[-3:],
            'cp': i[0:3] + r + 'E1',
            # '_signature': '1uOraRARi2EThvPkAv1vPtbjq3',  # no longer needed
        }
        return data
    # Fetch one page of data
    def get_response(self, max_behot_time):
        data = self.create_data(max_behot_time)
        response = requests.get(self.url, params=data, headers=self.headers)
        ret = response.json()
        max_behot_time = ret['next']['max_behot_time']  # max_behot_time for the next request
        news_list = ret['data']  # news_list is a <class 'list'> of article dicts
        return max_behot_time, news_list

    # Save the data into the local MongoDB database
    def save_file(self, n, news_list):
        for news in news_list:
            self.db['xin_news'].insert_one(news)
        print('Page {} of news saved successfully'.format(n))
def build_mid():
    # Read the account mids from the spreadsheet
    df = pd.read_excel('./toutiaohao.xlsx')
    url_list = df.url
    mid_list = []
    for url in url_list:
        if pd.notna(url):
            mid = url.rsplit('=', 1)[1]
            mid_list.append(mid)
    return mid_list
def task(q, user_agent):
    while not q.empty():
        mid = q.get()
        print(mid)
        patiao = PaTouTiao(mid, user_agent)
        # Scrape the first twenty pages of news
        max_behot_time = 0
        for i in range(20):
            max_behot_time, news_list = patiao.get_response(max_behot_time)
            patiao.save_file(i + 1, news_list)
if __name__ == '__main__':
    mid_list = build_mid()
    user_agent = UserAgent()  # instantiate UserAgent
    q = Queue()
    for mid in mid_list:
        q.put(mid)
    threading_list = []
    for i in range(4):
        # Start four worker threads
        # user_agent.random generates a random User-Agent string
        t = threading.Thread(target=task, args=(q, user_agent.random))
        threading_list.append(t)
        t.start()
    for t in threading_list:
        t.join()
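One caveat worth adding (a sketch, not from the original post): the JS checks e.has_more, so when an account runs out of articles the response may carry has_more == false and the 'next' key may be missing, in which case ret['next']['max_behot_time'] raises a KeyError. A defensive drop-in variant for the PaTouTiao class:

    # Defensive variant of get_response (a sketch): stop paginating gracefully
    # when the response signals that there is nothing more to fetch.
    def get_response_safe(self, max_behot_time):
        data = self.create_data(max_behot_time)
        response = requests.get(self.url, params=data, headers=self.headers)
        ret = response.json()
        news_list = ret.get('data') or []
        # 'next' may be absent on the last page; fall back to 0 so the caller can stop
        next_time = (ret.get('next') or {}).get('max_behot_time', 0)
        return next_time, news_list

The loop in task() could then break as soon as next_time is 0 or news_list comes back empty.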
Original post: https://www.cnblogs.com/bestwishfang/p/11957456.html