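# Mafengwo (马蜂窝) data collection. The hard part is the `_sn` request
# signature: a fixed salt string is appended to the serialized request
# parameters, the result is MD5-hashed, and a slice of the hex digest
# (characters 2-11) is sent as `_sn`.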
import hashlib
import pymongo
import pandas
import requests
import time
from pyquery import PyQuery as pq
from retry import retry
class ScenicSpot:
    """Scrape Mafengwo scenic-spot (POI) listings and their detail pages.

    ``get_page`` posts signed requests to the site's AJAX router to list POIs,
    ``get_point_info`` fetches one detail page, parses it, and stores the
    result in MongoDB and in ``self.all_list``; ``run`` drives the crawl and
    exports the collected rows to an Excel file.
    """

    # Fixed string appended to the serialized parameters before hashing.
    _SN_SALT = "c9d6618dbc657b41a66eb0af952906f1"
    # Value sent as ``iMddid`` (presumably the destination id -- TODO confirm).
    _MDD_ID = "10794"
    # Value sent as ``sAct`` with every listing request.
    _LIST_ACTION = "KMdd_StructWebAjax|GetPoisByTag"
    # Prefix prepended to the relative ``href`` of each listed POI.
    _SITE_ROOT = "http://www.mafengwo.cn"
    # Seconds to wait for a response before giving up; without a timeout a
    # stalled connection would block the crawl indefinitely.
    _REQUEST_TIMEOUT = 30

    def __init__(self):
        """Set up the endpoint, request headers, MongoDB collection and proxy.

        NOTE(review): the session cookie and the proxy credentials below are
        hard-coded in source; they should be moved to configuration or
        environment variables and rotated.
        """
        self.scenic_url = "http://www.mafengwo.cn/ajax/router.php"
        # Headers for the listing (AJAX) requests.
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
        }
        # Despite the attribute name, this is the ``ScenicSpot`` collection of
        # the ``ScenicSpot`` database on the default MongoDB client.
        self.client = pymongo.MongoClient().ScenicSpot.ScenicSpot
        # Headers for the POI detail-page requests.
        self.pio_headers = {
            'Host': 'www.mafengwo.cn',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
            'Cookie': 'PHPSESSID=o98g37f4squ0aq4ubcr07d84f2; mfw_uuid=5fe69be4-4047-f9a5-fa34-c5d367eb316b; oad_n=a%3A3%3A%7Bs%3A3%3A%22oid%22%3Bi%3A1029%3Bs%3A2%3A%22dm%22%3Bs%3A15%3A%22www.mafengwo.cn%22%3Bs%3A2%3A%22ft%22%3Bs%3A19%3A%222020-12-26+10%3A11%3A48%22%3B%7D; __jsluid_h=8b69bee30f0e6459c08df76385484c05; __omc_chl=; __omc_r=; __mfwc=direct; __mfwa=1608948709480.47993.1.1608948709480.1608948709480; __mfwb=0418cf79c433.1.direct; __mfwlv=1608948709; __mfwvn=1; __mfwlt=1608948709; Hm_lvt_8288b2ed37e5bc9b4c9f7008798d2de0=1608948710; Hm_lpvt_8288b2ed37e5bc9b4c9f7008798d2de0=1608948710; uva=s%3A78%3A%22a%3A3%3A%7Bs%3A2%3A%22lt%22%3Bi%3A1608948710%3Bs%3A10%3A%22last_refer%22%3Bs%3A6%3A%22direct%22%3Bs%3A5%3A%22rhost%22%3Bs%3A0%3A%22%22%3B%7D%22%3B; __mfwurd=a%3A3%3A%7Bs%3A6%3A%22f_time%22%3Bi%3A1608948710%3Bs%3A9%3A%22f_rdomain%22%3Bs%3A0%3A%22%22%3Bs%3A6%3A%22f_host%22%3Bs%3A3%3A%22www%22%3B%7D; __mfwuuid=5fe69be4-4047-f9a5-fa34-c5d367eb316b; bottom_ad_status=1; UM_distinctid=1769cd106714ce-0be3d1620af6d5-3e604809-1fa400-1769cd106729c0; CNZZDATA30065558=cnzz_eid%3D1359122677-1608945050-%26ntime%3D1608945050; __jsl_clearance=1608948735.479|0|LP8kMR7h6lJOyF4aqU9yvnUg4Ek%3D'
        }
        # Every parsed POI dict, in crawl order; exported by ``run``.
        self.all_list = []
        tunnel = "tps198.kdlapi.com:15818"
        username = "t10886694756492"
        password = "bjgfg7jn"
        self.proxies = {
            "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
            "https": "https://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
        }

    def par(self, t):
        """Return the ``_sn`` signature for an already-encoded payload.

        Args:
            t: The payload as ``bytes`` (serialized parameters plus salt).

        Returns:
            Characters 2..11 (ten hex digits) of the payload's MD5 hex digest.
        """
        hl = hashlib.md5()
        hl.update(t)
        return hl.hexdigest()[2:12]

    def _build_form(self, page, ts):
        """Build the signed form data for one listing request.

        Args:
            page: Page number, as a string.
            ts: Timestamp value; its ``str()`` form is both signed and sent.

        Returns:
            The form fields, including the ``_sn`` signature.
        """
        # Serialise the timestamp once so the signed text and the transmitted
        # ``_ts`` field can never diverge.
        ts_text = str(ts)
        qdata = (
            '{"_ts":"' + ts_text
            + '","iMddid":"' + self._MDD_ID
            + '","iPage":"' + str(page)
            + '","iTagId":"0","sAct":"' + self._LIST_ACTION + '"}'
            + self._SN_SALT
        )
        return {
            'sAct': self._LIST_ACTION,
            'iMddid': self._MDD_ID,
            '_ts': ts_text,
            'iPage': page,
            'iTagId': '0',
            '_sn': self.par(qdata.encode('utf-8')),
        }

    def get_page(self, start_page=18, stop_page=21):
        """Crawl listing pages and scrape every POI linked from them.

        Args:
            start_page: First listing page to request.
            stop_page: Page at which to stop (exclusive). The defaults
                reproduce the originally hard-coded pages 18-20.
        """
        for page_no in range(start_page, stop_page):
            print('第' + str(page_no) + '页')
            form = self._build_form(str(page_no), time.time() * 1000)
            response = requests.post(
                url=self.scenic_url,
                headers=self.headers,
                data=form,
                proxies=self.proxies,
                timeout=self._REQUEST_TIMEOUT,
            )
            # The JSON response carries the listing as an HTML fragment.
            list_html = response.json()['data']['list']
            for li in pq(list_html)('li').items():
                anchor = li('a')
                href = anchor.attr("href")
                if not href:
                    # No link to follow; concatenating None would raise.
                    continue
                self.get_point_info(self._SITE_ROOT + href, anchor.attr("title"))
                # NOTE(review): the original indentation was lost; the pause is
                # assumed to apply after each POI rather than once per page.
                time.sleep(2)

    def get_point_info(self, url, title):
        """Fetch one POI detail page, parse it, and store the result.

        The original note on this method says the hard-coded cookie is
        problematic.

        Args:
            url: Absolute URL of the POI detail page.
            title: POI name taken from the listing.

        Raises:
            RuntimeError: If the site answers with HTTP status 521.
        """
        poi_dict = {}
        poi_dict['景区名称'] = title
        self.pio_headers['Referer'] = url
        print(url)
        poi_resp = requests.get(
            url, headers=self.pio_headers, timeout=self._REQUEST_TIMEOUT
        )
        if poi_resp.status_code == 521:
            # A bare ``raise`` here had no active exception to re-raise.
            raise RuntimeError(
                'HTTP 521 from ' + url
                + ': request was rejected (the hard-coded cookie may have expired)'
            )
        poi_doc = pq(poi_resp.content)
        poi_dict['景区介绍'] = poi_doc('.summary').text()
        place_station = poi_doc('.mod.mod-location p').text()
        for dl in poi_doc('.mod.mod-detail dl').items():
            dt = dl('dt').text()
            # Only the ticket and opening-hours entries are kept.
            for field in ('门票', '开放时间'):
                if field in dt:
                    poi_dict[field] = dl('dd').text()
                    break
        poi_dict['景点位置'] = place_station
        # Insert a copy: insert_one adds an ``_id`` ObjectId to the dict it is
        # given, which would otherwise leak into the exported rows.
        self.client.insert_one(dict(poi_dict))
        print(poi_dict)
        self.all_list.append(poi_dict)

    def run(self):
        """Crawl the default pages and export all collected POIs to Excel."""
        self.get_page()
        pandas.DataFrame(self.all_list).to_excel('旅游景点.xlsx', index=False)
# Script entry point: run the full crawl and export when executed directly.
if __name__ == '__main__':
    ScenicSpot().run()
# Original article: https://www.cnblogs.com/lqn404/p/14231287.html