
Scraping duanziwang.com (a jokes site) with XPath

from lxml import etree
import time
import json
import urllib.request
item_list = []  # list that collects the scraped items


# Build the request object for one page
def handler_request(url, page):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    get_url = url + str(page)
    request = urllib.request.Request(url=get_url, headers=headers)
    return request
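
# Example (hypothetical page number): handler_request("http://duanziwang.com/page/", 2)
# returns a urllib.request.Request for http://duanziwang.com/page/2 that carries the
# browser User-Agent above, so the request looks like an ordinary browser visit.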


# Parse the fetched HTML
def parse_content(content):
    # Build an element tree from the HTML string
    tree = etree.HTML(content)
    article_list = tree.xpath('//main[@class="col-md-8 main-content"]/article')
    # Walk the list of <article> elements
    for article in article_list:
        # Extract the title
        title = article.xpath('.//div[@class="post-head"]/h1/a/text()')[0]
        # Extract the body paragraphs
        text = article.xpath('.//div[@class="post-content"]/p/text()')
        text = "\n".join(text)  # join the paragraphs into one string
        item = {
            "标题": title,  # "title"
            "内容": text,   # "content"
        }
        item_list.append(item)


def main():
    start_page = int(input("Enter the start page: "))
    end_page = int(input("Enter the end page: "))
    url = "http://duanziwang.com/page/"
    for page in range(start_page, end_page + 1):
        request = handler_request(url, page)
        try:
            content = urllib.request.urlopen(request).read().decode()
            parse_content(content)
        except Exception:
            print("Failed to scrape page %d" % page)
    string = json.dumps(item_list, ensure_ascii=False)
    with open("duanzi.txt", "w", encoding="utf-8") as f:
        f.write(string)


if __name__ == "__main__":
    main()
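
To sanity-check the XPath expressions without hitting the live site, the snippet below feeds parse_content a hand-written HTML fragment. The fragment is hypothetical; it merely assumes duanziwang.com pages match the main/article/post-head/post-content structure that the XPath above targets.

# Hypothetical fragment mimicking the assumed page structure
sample_html = """
<main class="col-md-8 main-content">
  <article>
    <div class="post-head"><h1><a href="#">Sample title</a></h1></div>
    <div class="post-content"><p>First line.</p><p>Second line.</p></div>
  </article>
</main>
"""
parse_content(sample_html)
# prints the single parsed item (assuming item_list started empty):
# [{'标题': 'Sample title', '内容': 'First line.\nSecond line.'}]
print(item_list)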

 

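Because json.dumps is called with ensure_ascii=False, duanzi.txt holds one UTF-8 JSON array with the Chinese text written verbatim. A minimal sketch for loading it back, using the same file name as above:

import json

with open("duanzi.txt", encoding="utf-8") as f:
    items = json.load(f)
print("loaded %d items" % len(items))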

Original: https://www.cnblogs.com/nxrs/p/11365422.html
