首页 > 其他 > 详细

练习4-今日头条爬取

时间:2021-05-16 14:28:00      阅读:9      评论:0      收藏:0      [点我收藏+]
import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import re,os
from hashlib import md5

def get_page(page_num, search_id, timeout=10):
    """Fetch one page of Toutiao search results for the keyword '街拍'.

    Args:
        page_num: Page index. Page 0 is requested with the initial-search
            query parameters; every other page uses the pagination
            parameters, which carry ``page_num`` and ``search_id``.
        search_id: Value sent as the ``search_id`` query parameter on
            paginated requests. Unused when ``page_num`` is 0.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl forever.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` on any other status code or when the request fails.
    """
    if page_num == 0:
        params = {
            'dvpf': 'pc',
            'source': 'input',
            'keyword': '街拍',
        }
    else:
        params = {
            'keyword': '街拍',
            'pd': 'synthesis',
            'source': 'pagination',
            'dvpf': 'pc',
            'aid': 4916,
            'page_num': page_num,
            'search_id': search_id,
        }
    url = 'https://so.toutiao.com/search?' + urlencode(params)
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Best-effort crawl: report the failure and let the caller decide.
        print('ERROR1:', e)
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_pg(html):
    """Yield the ``src`` URL of every image found in a result page.

    Images are selected with the CSS selector ``.abs-fill img``.

    Args:
        html: HTML text of a search-result page. A falsy value (``None`` or
            an empty string, e.g. after a failed download) yields nothing.

    Yields:
        The ``src`` attribute of each matching ``<img>`` element. Elements
        without a ``src`` attribute are skipped.
    """
    if not html:
        return
    doc = pq(html)
    for img in doc('.abs-fill img').items():
        src = img.attr('src')
        if src:
            yield src


def save_img(img, save_dir=r'D:\pycharm_projects\街拍'):
    """Download one image and store it under ``save_dir``.

    The file is named after the MD5 hex digest of its content, with a
    ``.jpg`` extension, so the same image downloaded twice is written once.

    Args:
        img: URL of the image. A falsy value is ignored; a protocol-relative
            URL (starting with ``//``) is fetched over HTTPS.
        save_dir: Directory that receives the files. It is created,
            including missing parent directories, when it does not exist.

    Returns:
        None. Request and file-system errors inside the download step are
        reported on stdout and otherwise ignored.
    """
    if not img:
        return
    if img.startswith('//'):
        # requests rejects URLs without a scheme.
        img = 'https:' + img
    os.makedirs(save_dir, exist_ok=True)
    try:
        response = requests.get(img, timeout=10)
        if response.status_code != 200:
            return
        file_name = md5(response.content).hexdigest() + '.jpg'
        file_path = os.path.join(save_dir, file_name)
        if os.path.exists(file_path):
            print('alredy download')
            return
        with open(file_path, 'wb') as f:
            f.write(response.content)
    except (requests.RequestException, OSError) as e:
        print('ERROR2:', e)


def _extract_search_id(html):
    """Return the ``search_id`` found in the page's pagination link, or ''."""
    href = pq(html)('.result-content:last-child a:first-child').attr('href')
    # [^&#]+ stops at the next query parameter or fragment; the previous
    # greedy (.*) swallowed everything up to the end of the URL.
    match = re.search(r'search_id=([^&#]+)', href or '')
    return match.group(1) if match else ''


def main():
    """Crawl the first two result pages and save every image found.

    Page 0 is fetched first; the ``search_id`` needed for pagination is read
    from a link on that page and passed to the request for page 1. The crawl
    stops early when page 0 cannot be downloaded or yields no ``search_id``.
    """
    search_id = ''
    for i in range(2):
        html = get_page(i, search_id)
        if not html:
            print('ERROR3: no HTML returned for page', i)
            if i == 0:
                # Without the first page there is no search_id to paginate.
                return
            continue
        if i == 0:
            search_id = _extract_search_id(html)
        for img in parse_pg(html):
            print(img)
            save_img(img)
        if not search_id:
            print('ERROR3: search_id not found, stopping pagination')
            return

# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

练习4-今日头条爬取

原文:https://www.cnblogs.com/tingshu/p/14773354.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!