#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/12/24 17:01
# @Site :
# @File : shell.py
# @Software: PyCharm
"""Scrape new-development listings from cq.fang.ke.com.

``shell_room_page`` walks the paginated listing index and collects the
``data-project-name`` code of every entry; ``shell_room_detail`` then fetches
the detail page of each code and prints the extracted fields as JSON.
"""
import json

import urllib3
import requests
from pyquery import PyQuery

# Every request below is sent with verify=False (per the original comment, so
# traffic can be captured through Fiddler); silence the resulting warning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

headers = {
    "Referer": "https://ag.fang.ke.com/loupan",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
}

# Seconds to wait for a response. Without a timeout a stalled connection
# would block the whole crawl indefinitely.
REQUEST_TIMEOUT = 10


def _fetch_document(url):
    """Download *url* and return its body parsed as a PyQuery document."""
    response = requests.get(
        url, headers=headers, verify=False, timeout=REQUEST_TIMEOUT
    )
    return PyQuery(response.text)


def _first_content_text(spans, default=u'未知'):
    """Return the text of the first element of *spans* whose class is "content".

    *default* is returned when no element matches, including when *spans* is
    empty, so callers always get a value for the current listing.
    """
    for span in spans.items():
        if span.attr('class') == "content":
            return span.text()
    return default


def _parse_prices(price_list):
    """Split the texts of the ``.price span`` elements into three prices.

    Args:
        price_list: Texts of the price spans, in document order.

    Returns:
        A ``(ref_ave_price, ref_total_price, ref_unit_price)`` tuple.

    Raises:
        ValueError: If *price_list* has fewer entries than the layout needs.
    """
    try:
        if price_list[0] == u'价格待定':
            # Not on sale yet and no reference price published.
            pending = u'未开盘,价格待定'
            return pending, pending, pending
        ref_ave_price = price_list[1] + price_list[2]
        if price_list[3] == u'参考单价':
            # The page lists no total price.
            return ref_ave_price, u'暂无总价', price_list[4] + price_list[5]
        # All three quotes are present. Index 5 is skipped exactly as in the
        # original code; presumably it is a label span -- TODO confirm.
        return (
            ref_ave_price,
            price_list[3] + price_list[4],
            price_list[6] + price_list[7],
        )
    except IndexError:
        raise ValueError("unexpected price layout: %r" % (price_list,))


def _parse_detail(detail_doc, room_code):
    """Build the result dict for one project from its parsed detail page."""
    price_list = [span.text() for span in detail_doc('.price span').items()]
    ref_ave_price, ref_total_price, ref_unit_price = _parse_prices(price_list)

    # Latest opening date and project address.
    opendate = _first_content_text(detail_doc('.open-date span'))
    addres = _first_content_text(detail_doc('.info-item span'))

    # Tags: every <li class="item">, each followed by '/'.
    pro_tag = ''.join(
        tag.text() + '/'
        for tag in detail_doc(".top-info ul li").items()
        if tag.attr('class') == "item"
    )

    # Floor plans: every span under .content, each followed by '/'.
    style = ''.join(
        span.text() + '/' for span in detail_doc('.content span').items()
    )

    # Sale status and property type: only reported when both spans exist.
    type_list = [span.text() for span in detail_doc('.tags-wrap span').items()]
    if len(type_list) >= 2:
        sell_house_type = type_list[0] + '/' + type_list[1]
    else:
        sell_house_type = None

    return {
        "pro_name": detail_doc('.title-wrap div h2').text(),
        "room_code": room_code,
        "ref_ave_price": ref_ave_price,
        "ref_total_price": ref_total_price,
        "ref_unit_price": ref_unit_price,
        "new_open_date": opendate,
        "pro_addr": addres,
        "pro_tag": pro_tag,
        "style_room": style,
        "sell_house_type": sell_house_type,
    }


def shell_room_page(pgmax):
    """Collect project codes from listing pages 1 through *pgmax*.

    Args:
        pgmax: Number of the last listing page to fetch; pages start at 1.

    Returns:
        A list holding the ``data-project-name`` attribute of every listing
        item that has one, in the order the pages were fetched.
    """
    room_page_list = []
    for page in range(1, pgmax + 1):
        url = 'https://cq.fang.ke.com/loupan/pg' + str(page)
        print("正在获取的链接:%s" % url)
        page_doc = _fetch_document(url)
        print("正在获取%s页房源......" % page)

        # Skip <li> elements that carry no project code.
        page_codes = [
            code
            for code in (
                item.attr('data-project-name')
                for item in page_doc('.resblock-list-wrapper li ').items()
            )
            if code is not None
        ]
        room_page_list.extend(page_codes)
        print("当前是第%s页,本页有%s套资源,当前共获取%s套资源!"
              % (page, len(page_codes), len(room_page_list)))
    print(room_page_list)
    return room_page_list


def shell_room_detail(list):  # noqa: A002 - name kept for keyword callers
    """Fetch and print the detail page of every project code in *list*.

    A project whose page cannot be fetched or parsed is reported on stdout
    and skipped, so one bad page does not abort the crawl.

    Args:
        list: Iterable of project codes as returned by ``shell_room_page``.

    Returns:
        A list with the detail dict of every successfully parsed project.
    """
    details = []
    for index, room_code in enumerate(list, 1):
        try:
            url = 'https://cq.fang.ke.com/loupan/p_' + room_code
            print(url)
            detail_dic = _parse_detail(_fetch_document(url), room_code)
            print("正在获取第%s套信息......" % index)
            print("第%s套信息:" % index)
            print(json.dumps(detail_dic, ensure_ascii=False))
        except Exception as e:
            # Per-item boundary: report the failure and move on.
            print(e)
            continue
        details.append(detail_dic)
    return details


if __name__ == '__main__':
    room_codes = shell_room_page(100)
    shell_room_detail(room_codes)

# Original article: https://www.cnblogs.com/East-fence/p/12112402.html