首页 > 编程语言 > 详细

python爬虫糗事百科

时间:2021-03-15 19:36:24      阅读:30      评论:0      收藏:0      [点我收藏+]
import re
from lxml import etree
import requests
from bs4 import BeautifulSoup
import requests
# Page to scrape.
url = "https://www.qiushibaike.com/imgrank/"

# Headers sent with the request; the User-Agent presents the script as a
# desktop Chrome browser.
headers = {
    "RequestURL": "https://eclick.baidu.com/fp.htm?br=2&fp=8D1371255901FBD7974323B7D8E17C98&fp2=E8ECE829116D278272FB03F89C616E7E&ci=&bi=&im=0&wf=0&ct=2011&bp=&m=&t=0&ft=&_=1606374621886",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
}

# NOTE(review): Requests picks a proxy by URL scheme, so this "http" entry is
# presumably not applied to the https:// url above -- confirm the intent.
proxies = {
    "http": "124.205.155.147:9090",
}

# Without a timeout, requests.get() can block forever on a dead host or proxy.
rep = requests.get(url, headers=headers, proxies=proxies, timeout=10)

# Decode the response body as UTF-8 and parse it into an lxml element tree.
html = etree.HTML(rep.content.decode("utf-8"))
# print(html.xpath("//div[contains(@class,‘article block untagged mb15‘)][1]//div[@class=‘content‘]/span")[0].text)
# print(div_tags)
print("==================男生数据=====================")
# Select every post container on the parsed page.
div_tag = html.xpath("//div[contains(@class,'article block untagged mb15')]")
# NOTE(review): every matched post is printed in this section; only the age
# line depends on the presence of the "manIcon" element -- confirm whether
# posts without it should be skipped.
for i in div_tag:
    # All lookups are relative to the current post (leading "."), and each
    # returns a possibly empty list of matching elements.
    t = i.xpath(".//div[@class='articleGender manIcon']")
    temp_xpath = i.xpath(".//div[@class='content']/span")
    name_xpath = i.xpath(".//h2")
    ping_xpath = i.xpath(".//div[@class='stats']//span[@class='stats-vote']//i[@class='number']")
    pingshus_xpath = i.xpath(".//div[@class='stats']//span[@class='stats-comments']//i[@class='number']")
    img_xpath = i.xpath(".//div[@class='thumb']//a//img")
    # An element's .text is None when it has no leading text, so fall back to
    # "" before concatenating or calling str methods on it.
    if name_xpath:
        print("\n" + "姓名:" + (name_xpath[0].text or "").replace("\n", ""))
    if t:
        print("男年龄:" + (t[0].text or ""))
    if temp_xpath:
        print("内容:" + (temp_xpath[0].text or "").replace("\n", ""))
    if ping_xpath:
        print("好笑:" + (ping_xpath[0].text or ""))
    if pingshus_xpath:
        print("评论数:" + (pingshus_xpath[0].text or ""))
    if img_xpath:
        # The "http:" scheme is prepended to the src value before printing;
        # skip the line when the <img> carries no src attribute.
        src = img_xpath[0].attrib.get("src")
        if src:
            print("图片:" + "http:" + src)

print("\n")
print("==================女生数据=====================")
# Select every post container on the parsed page.
div_tag = html.xpath("//div[contains(@class,'article block untagged mb15')]")
# NOTE(review): every matched post is printed in this section; only the age
# line depends on the presence of the "womenIcon" element -- confirm whether
# posts without it should be skipped.
for i in div_tag:
    # All lookups are relative to the current post (leading "."), and each
    # returns a possibly empty list of matching elements.
    t2 = i.xpath(".//div[@class='articleGender womenIcon']")
    temp_xpath = i.xpath(".//div[@class='content']/span")
    name_xpath = i.xpath(".//h2")
    ping_xpath2 = i.xpath(".//div[@class='stats']//span[@class='stats-vote']//i[@class='number']")
    pingshus2_xpath = i.xpath(".//div[@class='stats']//span[@class='stats-comments']//i[@class='number']")
    img_xpath2 = i.xpath(".//div[@class='thumb']//a//img")
    # An element's .text is None when it has no leading text, so fall back to
    # "" before concatenating or calling str methods on it.
    if name_xpath:
        print("\n" + "姓名:" + (name_xpath[0].text or "").replace("\n", ""))
    if t2:
        print("女年龄:" + (t2[0].text or ""))
    if temp_xpath:
        print("内容:" + (temp_xpath[0].text or "").replace("\n", ""))
    if ping_xpath2:
        print("好笑:" + (ping_xpath2[0].text or ""))
    if pingshus2_xpath:
        print("评论数:" + (pingshus2_xpath[0].text or ""))
    if img_xpath2:
        # The "http:" scheme is prepended to the src value before printing;
        # skip the line when the <img> carries no src attribute.
        src = img_xpath2[0].attrib.get("src")
        if src:
            print("图片:http:" + src)

(仅供学习参考)

python爬虫糗事百科

原文:https://www.cnblogs.com/Outsider07/p/14537708.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!