import gevent
from gevent import monkey
monkey.patch_all()
from gevent.queue import Queue
import time
import os
import requests
import re
# Wall-clock reference point; the elapsed time is printed at the end of the script.
start = time.perf_counter()

# Queue of page URLs that the spider() workers pull from.
work = Queue()

# Listing pages pg1 … pg100 under sh.fang.lianjia.com/loupan.
url = 'https://sh.fang.lianjia.com/loupan/pg{}/'
url_list = (url.format(i) for i in range(1, 101))
# Plain loop rather than a throwaway list comprehension: the puts are pure side effects.
for page_url in url_list:
    work.put_nowait(page_url)

# Result tuples produced by the workers; a set, so identical tuples are stored once.
info_set = set()
# Compiled once at import time instead of being rebuilt for every page.
# Capture groups, in order: the value of the anchor's "…itle" attribute, the
# next two plain <span> texts following a <div class="resb…">, the text of
# <span class="number"> and the text of <span class="desc"> (after its
# leading space). re.S lets ".*?" run across line breaks in the HTML.
LISTING_PATTERN = re.compile(
    '<a href="/loup.*?itle="(.*?)"'
    '.*?<div class="resb.*?<span>(.*?)</span>'
    '.*?<span>(.*?)</span>'
    '.*?<span class="number">(.*?)</span>'
    '.*?<span class="desc"> (.*?)</span>',
    re.S,
)


def spider():
    """Pull page URLs from ``work`` until it is empty and scrape each one.

    Each URL is fetched with ``requests.get``; every 5-tuple that
    ``LISTING_PATTERN`` finds in the response text is added to the
    module-level ``info_set``. The function returns ``None``.

    NOTE(review): the request has no timeout and the HTTP status is not
    checked; an exception raised by ``requests.get`` propagates out of
    this function.
    """
    while not work.empty():
        url = work.get_nowait()
        res = requests.get(url).text
        # findall returns a list of 5-tuples; update() adds them all at once.
        info_set.update(LISTING_PATTERN.findall(res))
# Start 200 spider() greenlets that all pull from the shared work queue.
tasks = [gevent.spawn(spider) for _ in range(200)]

# Wait at most 6 seconds for the workers.
# NOTE(review): when the timeout expires the script carries on with whatever
# is in info_set at that moment — confirm that partial output is intended.
gevent.joinall(tasks, timeout=6)

# Print each record and collect its output line, so the file is opened once
# instead of once per record.
records = []
for index, info in enumerate(info_set):
    title = f'标题: {info[0]}'
    addr = f'地区: {info[1]}{info[2]}'
    price = f'价格: {info[3]}{info[4]}'
    print(f"\n{index}\n{title}\n{addr}\n{price}\n")
    # NOTE(review): the three fields are written back-to-back with no
    # separator, followed by a newline — confirm this is the wanted layout.
    records.append(f'{title}{addr}{price}\n')

# Open only when there is something to write, so an empty result still
# creates no file.
if records:
    with open('./lianjie.cvs', 'a', encoding='utf-8') as f:
        f.writelines(records)

print('写入完成')
print(time.perf_counter() - start)
# Original article: https://www.cnblogs.com/kai-/p/12795638.html