首页 > 其他 > 详细

链家网 + gevent

时间:2020-04-28 18:23:52      阅读:55      评论:0      收藏:0      [点我收藏+]

import gevent
from gevent import monkey
monkey.patch_all()
from gevent.queue import Queue
import time
import os
import requests
import re

# Wall-clock reference point for timing the script.
start = time.perf_counter()
# Queue of listing-page URLs waiting to be fetched.
work = Queue()

# Template for the paginated listing URL; `{}` is filled with the page number.
url = 'https://sh.fang.lianjia.com/loupan/pg{}/'
# Lazily yields the URLs for pages 1 through 100.
url_list = (url.format(i) for i in range(1, 101))
# A plain loop (not a throwaway list comprehension) because put_nowait is
# called purely for its side effect.
for page_url in url_list:
    work.put_nowait(page_url)

# Scraped records; a set, so identical records are stored only once.
info_set = set()

# Compiled once instead of being re-specified for every page.
# Each match yields a 5-tuple: the title attribute, two consecutive <span>
# texts following the "resb..." div, the "number" span text, and the "desc"
# span text after its leading &nbsp;.
_LISTING_RE = re.compile(
    r'<a href="/loup.*?itle="(.*?)"'
    r'.*?<div class="resb.*?<span>(.*?)</span>'
    r'.*?<span>(.*?)</span>'
    r'.*?<span class="number">(.*?)</span>'
    r'.*?<span class="desc">&nbsp;(.*?)</span>',
    re.S,
)


def spider():
    """Fetch queued pages and collect every matched listing.

    Repeatedly takes a URL from the module-level ``work`` queue, downloads it,
    and adds each 5-tuple matched by ``_LISTING_RE`` to the module-level
    ``info_set``. Returns ``None`` once the queue is empty.

    A failed or timed-out request is reported and skipped, so one bad page
    does not stop this worker from processing the remaining URLs.
    """
    while not work.empty():
        url = work.get_nowait()
        try:
            # Bounded wait so a stalled connection cannot hold the worker
            # indefinitely.
            res = requests.get(url, timeout=5).text
        except requests.RequestException as exc:
            print(f'request failed for {url}: {exc}')
            continue
        info_set.update(_LISTING_RE.findall(res))

# Start 200 greenlets that each run spider(), then wait for them with a
# 6-second timeout on the join.
tasks = [gevent.spawn(spider) for _ in range(200)]
gevent.joinall(tasks, timeout=6)


# Print every collected record and append it to the output file.
# The file is opened once for the whole run rather than once per record.
# NOTE(review): the '.cvs' extension looks like a typo for '.csv', and
# writelines() puts no separator between title/addr/price, so the three
# fields run together on one line. Both are left unchanged here because the
# path and file contents are runtime behaviour; confirm before changing.
with open('./lianjie.cvs', 'a', encoding='utf-8') as f:
    for i, n in enumerate(info_set):
        # n is one regex match: (title, area part 1, area part 2,
        # price number, price description).
        title = f'标题:  {n[0]}'
        addr = f'地区:  {n[1]}{n[2]}'
        price = f'价格:  {n[3]}{n[4]}'
        print(f"""
    {i}
    {title}
    {addr}
    {price}
    """)

        f.writelines([title, addr, price, '\n'])
        print('写入完成')

# Total elapsed time since `start` was taken.
print(time.perf_counter() - start)

链家网 + gevent

原文:https://www.cnblogs.com/kai-/p/12795638.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!