A certain Toutiao drops the encrypted parameter '_signature'


Recently, I noticed that a certain Toutiao removed the encrypted parameter '_signature', which makes it possible to scrape the related news again. PS: This post is for technical sharing only; I accept no responsibility for how it is used.

Maybe the traffic was too low, or maybe it's lonely at the top; I have no idea why, but either way the data can be scraped now. This post walks through scraping the news published by one of Toutiao's news accounts.

First, inspect the account's page. The data is loaded asynchronously, so there is nothing for it but to analyze the API.

Endpoint: https://www.toutiao.com/c/user/article/, a GET request, which requires the following parameters:

page_type: 1
user_id: 4377795668
max_behot_time: 0
count: 20
as: A1652D5B5BF77CE
cp: 5DBB37970C6E3E1
_signature: eMm5vRAXJXW9rOEwX0M2KXjJua  # this encrypted parameter is gone now
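
For reference, the request itself is simple once the parameters exist. A minimal sketch, using the sample as/cp values above (they expire within moments, so a real run needs freshly generated ones, derived below):

    import requests

    # Minimal request sketch; the as/cp sample values expire quickly,
    # so a real run must generate fresh ones (see the derivation below).
    params = {
        'page_type': 1,
        'user_id': 4377795668,
        'max_behot_time': 0,
        'count': 20,
        'as': 'A1652D5B5BF77CE',
        'cp': '5DBB37970C6E3E1',
    }
    resp = requests.get('https://www.toutiao.com/c/user/article/', params=params)
    print(resp.json().get('message'))  # 'success' when the parameters are accepted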

So those parameters have to be worked out. Searching the page's JS for the endpoint and parameter names turns up:

    function t(i, t) {
        if (!e.isBanned && !m) {
            m = !0;
            var o = n();  // n() builds the request parameters (see below)
            http({
                url: u,  // u = /c/user/article/
                method: "get",
                data: o,
                type: h,
                success: function(e) {
                    "success" === e.message && ("refresh" === i && (d.list = []),
                    v == e.page_type && (d.list = d.list.concat(s(e.data)),
                    p[1 * v] = !!e.has_more,
                    a(),
                    // next page's max_behot_time, falling back to 0
                    d.params.max_behot_time = e.next && e.next.max_behot_time || 0))
                },
                complete: function() {
                    d.empty = 0 === d.list.length,
                    m = !1,
                    t && t(),
                    d.update()
                }
            })
        }
    }

From this JS, the request payload is data: o with o = n(), so the function n() is the key. It also shows that the next page's max_behot_time parameter comes from ['next']['max_behot_time'] in the response JSON.
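
So pagination just means feeding that value back into the next request. A tiny sketch of the JSON navigation, using a made-up response shape, mirroring the JS fallback e.next && e.next.max_behot_time || 0:

    # Made-up response shape, only to show the navigation
    ret = {'next': {'max_behot_time': 1575000000}, 'has_more': True}

    # Mirrors the JS: e.next && e.next.max_behot_time || 0
    max_behot_time = ret.get('next', {}).get('max_behot_time', 0)
    print(max_behot_time)  # goes into the next request's parameters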

Now for n():

    function n() {
        var e, i = ascp.getHoney();
        return e = _.extend({}, d.params, {
            as: i.as,
            cp: i.cp
        })
    }

!function(e) {
    var i = {};
    i.getHoney = function() {
        var e = Math.floor((new Date).getTime() / 1e3)
          , i = e.toString(16).toUpperCase()
          , t = md5(e).toString().toUpperCase();
        if (8 != i.length)
            return {
                as: "479BB4B7254C150",
                cp: "7E0AC8874BB0985"
            };
        for (var o = t.slice(0, 5), n = t.slice(-5), s = "", a = 0; 5 > a; a++)
            s += o[a] + i[a];
        for (var l = "", r = 0; 5 > r; r++)
            l += i[r + 3] + n[r];
        return {
            as: "A1" + s + i.slice(-3),
            cp: i.slice(0, 3) + l + "E1"
        }
    }
    ,
    e.ascp = i
}(window, document),

This JS gives us the recipe for the as and cp parameters. Here is a Python port that builds the request data:

    # Build the request parameters
    def create_data(self, max_behot_time):
        md5 = hashlib.md5()
        e = math.floor(time.time())
        i = hex(e)[2:].upper()  # 8 hex chars for current timestamps, so the JS length guard is skipped
        md5.update(str(e).encode('utf-8'))
        ret = md5.hexdigest()
        t = ret.upper()
        o = t[0:5]
        n = t[-5:]
        a = ''
        for s in range(5):
            a += o[s] + i[s]

        r = ''
        for l in range(5):
            r += i[l + 3] + n[l]

        data = {
            'page_type': 1,
            'user_id': self.mid,
            'max_behot_time': max_behot_time,
            'count': 20,
            'as': 'A1' + a + i[-3:],
            'cp': i[0:3] + r + 'E1',
            # '_signature': '1uOraRARi2EThvPkAv1vPtbjq3',  # the formidable _signature, no longer needed
        }
        return data
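
A quick sanity check on the port: both values should come out 15 characters long, the same shape as the hard-coded fallbacks in the JS ("479BB4B7254C150" / "7E0AC8874BB0985" are both 15 characters). A condensed standalone copy of the logic, for verification only:

    import hashlib
    import math
    import time

    def make_as_cp():
        """Condensed copy of the as/cp logic above, for verification only."""
        e = math.floor(time.time())
        i = hex(e)[2:].upper()  # 8 hex chars for current timestamps
        t = hashlib.md5(str(e).encode('utf-8')).hexdigest().upper()
        a = ''.join(t[k] + i[k] for k in range(5))           # interleave t[:5] with i[:5]
        r = ''.join(i[k + 3] + t[-5:][k] for k in range(5))  # interleave i[3:8] with t[-5:]
        return 'A1' + a + i[-3:], i[0:3] + r + 'E1'

    as_, cp = make_as_cp()  # 'as' is a Python keyword, hence 'as_'
    assert len(as_) == 15 and len(cp) == 15  # same shape as the JS fallbacks
    print(as_, cp)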

With the request parameters in place, we can scrape the data. The full script:

import time
import math
import pymongo
import requests
import hashlib
import threading
from queue import Queue, Empty
import pandas as pd
from fake_useragent import UserAgent


class PaTouTiao(object):
    def __init__(self, mid, user_agent):
        self.mid = mid
        self.url = 'https://www.toutiao.com/c/user/article/'
        self.headers = {
            'referer': 'https://www.toutiao.com/c/user/{}'.format(self.mid),
            'user-agent': user_agent,
        }
        # Connect to local MongoDB
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.db = self.client['TouTiaoNews']

    # Build the request parameters
    def create_data(self, max_behot_time):
        md5 = hashlib.md5()
        e = math.floor(time.time())
        i = hex(e)[2:].upper()  # 8 hex chars for current timestamps, so the JS length guard is skipped
        md5.update(str(e).encode('utf-8'))
        ret = md5.hexdigest()
        t = ret.upper()
        o = t[0:5]
        n = t[-5:]
        a = ''
        for s in range(5):
            a += o[s] + i[s]

        r = ''
        for l in range(5):
            r += i[l + 3] + n[l]

        data = {
            'page_type': 1,
            'user_id': self.mid,
            'max_behot_time': max_behot_time,
            'count': 20,
            'as': 'A1' + a + i[-3:],
            'cp': i[0:3] + r + 'E1',
            # '_signature': '1uOraRARi2EThvPkAv1vPtbjq3',  # the formidable _signature, no longer needed
        }
        return data

    # Request one page of data
    def get_response(self, max_behot_time):
        data = self.create_data(max_behot_time)
        response = requests.get(self.url, params=data, headers=self.headers)
        ret = response.json()
        # max_behot_time for the next request; mirrors the JS fallback to 0
        max_behot_time = ret.get('next', {}).get('max_behot_time', 0)
        news_list = ret.get('data', [])  # a list of article dicts
        return max_behot_time, news_list

    # Save the data into the local MongoDB database
    def save_file(self, n, news_list):
        for news in news_list:
            self.db['xin_news'].insert_one(news)
        print("Page {} of news saved".format(n))


def build_mid():
    # Get the account mids from the spreadsheet of account URLs
    df = pd.read_excel('./toutiaohao.xlsx')
    url_list = df.url
    mid_list = []
    for url in url_list:
        if pd.notna(url):
            mid = url.rsplit('=', 1)[1]
            mid_list.append(mid)

    return mid_list


def task(q, user_agent):
    while True:
        # get_nowait avoids the race between q.empty() and q.get()
        # when several threads drain the same queue
        try:
            mid = q.get_nowait()
        except Empty:
            break
        print(mid)
        patiao = PaTouTiao(mid, user_agent)
        # Scrape the first twenty pages of news
        max_behot_time = 0
        for i in range(20):
            max_behot_time, news_list = patiao.get_response(max_behot_time)
            if not news_list:  # stop once the account has no more articles
                break
            patiao.save_file(i + 1, news_list)


if __name__ == '__main__':
    mid_list = build_mid()
    user_agent = UserAgent()  # instantiate UserAgent
    q = Queue()
    for mid in mid_list:
        q.put(mid)

    threading_list = []
    for i in range(4):
        # start four threads
        # user_agent.random picks one random user agent per thread
        t = threading.Thread(target=task, args=(q, user_agent.random))
        threading_list.append(t)
        t.start()

    for t in threading_list:
        t.join()
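
To run it, toutiaohao.xlsx just needs a url column whose values end in the account id after an '='; the four threads drain the shared queue of mids, and each account's articles land in the xin_news collection of the TouTiaoNews database.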

 

Original post: https://www.cnblogs.com/bestwishfang/p/11957456.html
