import requests
from lxml import etree
from threading import Thread, Condition
from queue import Queue, Empty


class MyThread(Thread):
    """Worker thread: pulls (chapter_index, href) pairs from the queue,
    fetches each chapter page, and writes chapters to the shared output
    file strictly in chapter order."""

    def __init__(self, q):
        Thread.__init__(self)
        self.q = q  # shared Queue of (chapter_index, href) tuples

    def run(self):
        global index
        while True:
            # get_nowait() + Empty avoids the empty()/get() race between
            # workers (q.empty() followed by q.get() can block forever if
            # another thread drains the queue in between).
            try:
                data = self.q.get_nowait()
            except Empty:
                break
            url = root + ''.join(data[1])
            response = requests.get(url, headers=headers)
            page = etree.HTML(response.content)
            chapter = ''.join(page.xpath("//h1/text()"))
            print("爬取 -> %s" % chapter, index)
            content = '\n'.join(page.xpath("//div[@id='content']/text()"))
            # Normalize the full-width indent used by the site into tabs.
            content = content.replace("\xa0\xa0\xa0\xa0", "\t")
            # Block (without burning CPU) until this chapter is the next
            # one to be saved; the Condition also serializes access to the
            # shared file handle `f` and the global chapter counter `index`.
            with cond:
                while data[0] > index + 1:
                    cond.wait()
                if data[0] == index + 1:
                    # Exactly the next chapter — append it to the file.
                    print("保存 -> %s" % chapter, index)
                    f.write('\n' + chapter + '\n')
                    f.write(content)
                    index += 1
                # Wake the other workers so they can re-check their turn.
                cond.notify_all()


if __name__ == '__main__':
    root = "http://www.booktxt.net/8_8455/"
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    index = -1      # index of the last chapter already written to the file
    cond = Condition()  # replaces the original busy-wait spin loop
    response = requests.get(root, headers=headers)
    page = etree.HTML(response.content)
    title = ''.join(page.xpath("//h1/text()"))  # novel title
    print(title)
    with open("%s.txt" % title, 'w', encoding='utf8') as f:
        f.write(title)  # write the novel title first
        hrefs = page.xpath("//div[@id='list']/dl/dt[2]/following-sibling::dd/a/@href")
        q = Queue()
        for i, href in enumerate(hrefs):
            q.put((i, href))
        ts = []
        for _ in range(5):  # 5 concurrent download workers
            t = MyThread(q)
            t.start()
            ts.append(t)
        for t in ts:
            t.join()
# 转载自: https://www.cnblogs.com/twoice/p/11405677.html
# 原文: https://www.cnblogs.com/fqqwz/p/11656074.html