昨天浪了一天,只看了一点点python的多线程,今天改了改爬信件的脚本。
由于写入文件需要加锁防止写入混乱,所以运行耗时并未减少,但是相当于将脚本放到后台执行,可以在爬取信件的同时运行其他脚本。
# -*- coding: utf-8 -*-
"""
Crawl the letter list of www.beijing.gov.cn (hudong/hdjl) into data.txt.

The total record count is fetched first, then one thread per page downloads
its page and appends the rows to a semicolon-separated text file.

Created on Mon Jan 26 9:35:00 2020

@author: 星辰°
"""
import json
import requests
import time
import threading
from threading import Thread

# File that receives the header line and one line per letter.
OUTPUT_PATH = "data.txt"

# Keys read from every entry of the response's "mailList", in column order.
# The header line of the output file is built from the same tuple.
FIELDS = (
    "letter_type",
    "original_id",
    "catalog_id",
    "letter_title",
    "create_date",
    "org_id",
    "keywords",
    "letter_status",
    "ask_same_num",
    "reply_num",
    "support_num",
    "supervise_num",
    "isReply",
)


def getContent(begin, length):
    """POST one paging query to the mail-list endpoint and return the body.

    Args:
        begin: value sent as "PageCond/begin" (offset of the first record).
        length: value sent as "PageCond/length" (number of records asked for).

    Returns:
        The response body as text (``res.text``); callers parse it as JSON.

    Raises:
        Whatever ``requests.post`` raises on a network failure.
    """
    url = 'http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.mailList.biz.ext'
    headers = {
        "Host": "www.beijing.gov.cn",
        "Connection": "keep-alive",
        # NOTE(review): copied from a captured browser request; requests is
        # expected to recompute Content-Length from the real body - confirm.
        "Content-Length": "155",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Origin": "http://www.beijing.gov.cn",
        "X-Requested-With": "XMLHttpRequest",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Content-Type": "text/json",
        "Referer": "http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cookie": "HDJLJSID=88D9174C76DD6101B765BC08EBCC0042; __jsluid_h=5bf544c63ba671436f7a72dbea4f2107; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216fb37e42d93f-05d6c08984c92b-3c604504-2073600-16fb37e42db11e%22%7D; X-LB=1.1.44.637df82f; route=a2cec3cb28b0d59d32db7b39f74f56a5; _va_ref=%5B%22%22%2C%22%22%2C1579688489%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D_YgZhWCf_Bktk-Qmd0FW46ZrmtOPvEAUxLo3-rLKpBuJ4lAGvTZ1-MNDKfbQzyA3%26wd%3D%26eqid%3Dbf36854b000546f7000000025e21a840%22%5D; _va_ses=*; _va_id=953201576463cf85.1579264069.4.1579691692.1579688489."
    }
    # Although it is written like JSON, this literal is a plain Python dict
    # (its repr uses single quotes), so it must be serialized before sending.
    data = {
        "PageCond/begin": begin,
        "PageCond/length": length,
        "PageCond/isCount": "true",
        "keywords": "",
        "orgids": "",
        "startDate": "",
        "endDate": "",
        "letterType": "",
        "letterStatue": ""
    }
    # The request imitates the page's Ajax call: the body has to be standard
    # JSON text, otherwise the server answers <Response [500]>. The headers,
    # on the other hand, must stay a dict and must not be serialized.
    res = requests.post(url, data=json.dumps(data), headers=headers)
    return res.text


# Guards appends to OUTPUT_PATH so rows from different threads never interleave.
mu = threading.Lock()


def _format_mail(mail):
    """Return one output line (ending in CRLF) for a single mailList entry."""
    parts = [json.dumps(mail[name], ensure_ascii=False) for name in FIELDS]
    # json.dumps wraps strings in double quotes; the output format has none.
    return (";".join(parts) + "\r\n").replace('"', '')


def writeFile(begin, length):
    """Download one page of letters and append it to the output file.

    The download and the formatting run outside the lock so that pages are
    fetched concurrently; only the file append is serialized. The lock is
    taken with ``with`` so it is released even when the write fails, and a
    failed download never touches the lock at all.

    Args:
        begin: offset of the first record of the page.
        length: number of records in the page.
    """
    start = time.time()
    mailList = json.loads(getContent(begin, length))["mailList"]
    rows = "".join(_format_mail(mail) for mail in mailList)

    with mu:
        # newline="" writes "\r\n" verbatim; the default text-mode newline
        # translation would turn it into "\r\r\n" on Windows.
        with open(OUTPUT_PATH, "a", encoding="utf-8", newline="") as f:
            f.write(rows)

    # Elapsed time covers download, formatting, waiting for the lock and the write.
    spend_time = time.time() - start
    print(f"begin:{begin},length:{length},spend time:{spend_time}s")


def main():
    """Write the header line, then start one worker thread per page."""
    # A one-record query is enough to learn the total number of letters.
    count = json.loads(getContent(0, 1))["PageCond"]["count"]
    print(f"count:{count}")

    # Mode "w" truncates any previous run's output before the workers append.
    with open(OUTPUT_PATH, "w", encoding="utf-8", newline="") as f:
        f.write(";".join(FIELDS) + "\r\n")

    begin = 0
    length = 1000
    workers = []
    while begin < count:
        # The last page may be shorter than a full page.
        if begin + length > count:
            length = count - begin
        worker = Thread(target=writeFile, args=(begin, length))
        worker.start()
        workers.append(worker)
        begin += length

    # Wait for every page so the script ends only when the file is complete.
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    main()
原文:https://www.cnblogs.com/dream0-0/p/12239876.html