练习4-今日头条爬取

时间：2021-05-16 14:28:00 阅读：13 评论：0 收藏：0 [点我收藏+]

import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import re,os
from hashlib import md5

def get_page(page_num, search_id, timeout=10):
    """Fetch one page of Toutiao search results for the keyword '街拍'.

    Args:
        page_num: Zero-based page index. Page 0 is requested with the
            initial-search parameters; any other page uses the pagination
            parameters.
        search_id: Value sent as ``search_id`` on pagination requests. It is
            not sent for page 0.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the status code is 200; ``None`` when
        the request fails or any other status code is returned.
    """
    if page_num == 0:
        params = {
            'dvpf': 'pc',
            'source': 'input',
            'keyword': '街拍',
        }
    else:
        params = {
            'keyword': '街拍',
            'pd': 'synthesis',
            'source': 'pagination',
            'dvpf': 'pc',
            'aid': 4916,
            'page_num': page_num,
            'search_id': search_id,
        }
    url = 'https://so.toutiao.com/search?' + urlencode(params)
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print('ERROR1:', e)
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_pg(html):
    """Yield the ``src`` of each ``<img>`` found under an ``.abs-fill`` element.

    Args:
        html: Markup of a results page. ``None`` or an empty string (what a
            failed page fetch produces) yields nothing.

    Yields:
        Every non-empty ``src`` attribute value; each one is also printed.
    """
    if not html:
        # Nothing to parse; avoid handing None/'' to the HTML parser.
        return
    doc = pq(html)
    for img in doc('.abs-fill img').items():
        src = img.attr('src')
        if not src:
            # An <img> without a src cannot be downloaded; skip it.
            continue
        print(src)
        yield src


def save_img(img, save_dir=r'D:\pycharm_projects\街拍'):
    """Download one image and store it under ``save_dir``.

    The file is named after the MD5 hex digest of the downloaded bytes with a
    ``.jpg`` extension, so identical content is written only once.

    Args:
        img: URL of the image to download.
        save_dir: Directory that receives the file. It is created, together
            with any missing parent directories, when it does not exist.

    Raises:
        OSError: If ``save_dir`` cannot be created.
    """
    # exist_ok avoids the check-then-create race, and makedirs also creates
    # missing parents (os.mkdir would fail on them).
    os.makedirs(save_dir, exist_ok=True)
    try:
        response = requests.get(img, timeout=10)
        if response.status_code != 200:
            return
        content = response.content
        file_path = os.path.join(save_dir, md5(content).hexdigest() + '.jpg')
        if os.path.exists(file_path):
            print('alredy download')
            return
        with open(file_path, 'wb') as f:
            f.write(content)
    except (requests.RequestException, OSError) as e:
        # Best effort: report the failed download/write and carry on.
        print('ERROR2:', e)


def _search_id_from_href(href):
    """Return the ``search_id`` query value found in ``href``, or ``None``."""
    if not href:
        return None
    # Stop at the next '&' so later query parameters are not captured too.
    match = re.search(r'search_id=([^&]+)', href)
    return match.group(1) if match else None


def main(pages=2):
    """Crawl result pages and save every image found on them.

    Page 0 is fetched first; the ``search_id`` needed for pagination is read
    from the href of the element matched by
    ``.result-content:last-child a:first-child`` on that page.

    Args:
        pages: Number of result pages to crawl, starting at page 0.
    """
    search_id = ''
    for page_num in range(pages):
        if page_num > 0 and not search_id:
            print('ERROR3: no search_id available, cannot request page', page_num)
            break
        html = get_page(page_num, search_id)
        if not html:
            # get_page returns None on failure; skip this page.
            print('ERROR3: page', page_num, 'could not be fetched')
            continue
        if page_num == 0:
            href = pq(html)('.result-content:last-child a:first-child').attr('href')
            search_id = _search_id_from_href(href) or ''
        for img in parse_pg(html):
            print(img)
            save_img(img)
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

练习4-今日头条爬取

原文：https://www.cnblogs.com/tingshu/p/14773354.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)