首页 > 其他 > 详细

练手之爬取贝壳楼盘信息

时间:2019-12-28 19:18:52      阅读:149      评论:0      收藏:0      [点我收藏+]
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/12/24 17:01
# @Site    : 
# @File    : shell.py
# @Software: PyCharm

import json
import urllib3
import requests
from  pyquery import PyQuery

# Silence urllib3's InsecureRequestWarning (the original note says this is
# for capturing traffic through Fiddler).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Request headers passed to the HTTP requests made by this script.
headers = {
    "Referer": "https://ag.fang.ke.com/loupan",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/78.0.3904.108 Safari/537.36"
    ),
}

def shell_room_page(pgmax):
    """Collect the project codes listed on result pages 1..pgmax.

    Each page ``https://cq.fang.ke.com/loupan/pg<N>`` is downloaded and the
    ``data-project-name`` attribute of every element matched by
    ``.resblock-list-wrapper li`` is collected. Elements that do not carry
    the attribute are skipped. Progress is printed for every page and the
    complete list is printed once at the end.

    Args:
        pgmax: Number of the last page to fetch; pages are numbered from 1.

    Returns:
        A list with the collected ``data-project-name`` values, in the order
        they were encountered.
    """
    room_page_list = []
    for i in range(1, pgmax + 1):
        url = 'https://cq.fang.ke.com/loupan/pg' + str(i)
        print("正在获取的链接:%s" % url)
        # NOTE(review): verify=False disables TLS certificate verification;
        # kept as in the original, which pairs it with disable_warnings.
        response = requests.get(url, headers=headers, verify=False)
        print("正在获取%s页房源......" % i)
        page_doc = PyQuery(response.text)

        # Keep only the list items that actually carry a project code.
        page_codes = [
            code
            for code in (
                item.attr('data-project-name')
                for item in page_doc('.resblock-list-wrapper li ').items()
            )
            if code is not None
        ]
        room_page_list.extend(page_codes)
        print("当前是第%s页,本页有%s套资源,当前共获取%s套资源!"
              % (i, len(page_codes), len(room_page_list)))
    print(room_page_list)
    return room_page_list

def _split_prices(price_list):
    """Turn the texts of the ``.price span`` elements into three price strings.

    Returns:
        A ``(average, total, unit)`` tuple of display strings.

    Raises:
        IndexError: If ``price_list`` has fewer entries than the layout
            selected by its contents requires.
    """
    if price_list[0] == u'价格待定':  # not yet on sale, no reference price
        pending = u'未开盘,价格待定'
        return pending, pending, pending
    average = price_list[1] + price_list[2]
    if price_list[3] == u'参考单价':  # page shows no total price
        return average, u'暂无总价', price_list[4] + price_list[5]
    # All three quotes are present.
    return (average,
            price_list[3] + price_list[4],
            price_list[6] + price_list[7])


def _first_content_text(spans):
    """Return the text of the first span whose class is exactly ``content``.

    Falls back to ``u'未知'`` when no such span exists, including when
    ``spans`` matched nothing at all.
    """
    for span in spans.items():
        if span.attr('class') == "content":
            return span.text()
    return u'未知'


def shell_room_detail(list):
    """Fetch each listing's detail page and print its record as JSON.

    For every code in ``list`` the page
    ``https://cq.fang.ke.com/loupan/p_<code>`` is downloaded, the prices,
    opening date, address, tags, floor-plan texts, sale status/type and
    project name are extracted, and the resulting dict is printed with
    ``json.dumps``. Nothing is returned.

    Any exception raised while processing one listing is printed and that
    listing is skipped, so a single bad page does not stop the crawl.

    Args:
        list: Sequence of listing codes, as returned by ``shell_room_page``.
            The name shadows the builtin but is kept because it is part of
            the function's call interface.
    """
    for j, room_code in enumerate(list):
        try:
            url = 'https://cq.fang.ke.com/loupan/p_' + room_code
            print(url)
            # NOTE(review): verify=False disables TLS certificate
            # verification; kept as in the original.
            response = requests.get(url, headers=headers, verify=False)
            detail_doc = PyQuery(response.text)

            # Prices
            price_list = [pri.text()
                          for pri in detail_doc('.price span').items()]
            ref_ave_price, ref_total_price, ref_unit_price = \
                _split_prices(price_list)

            # Latest opening date and project address. Both get a fresh
            # default per listing, so a page without matching spans can
            # neither raise NameError nor reuse the previous listing's value.
            opendate = _first_content_text(detail_doc('.open-date span'))
            addres = _first_content_text(detail_doc('.info-item span'))

            # Tags: every text is followed by '/', including the last one.
            pro_tag = "".join(
                tag.text() + '/'
                for tag in detail_doc(".top-info ul li").items()
                if tag.attr('class') == "item"
            )

            # Floor-plan texts, joined the same way.
            style = "".join(
                style_room.text() + '/'
                for style_room in detail_doc('.content span').items()
            )

            # Sale status and property type: the first two tag texts, or
            # None when fewer than two are present.
            type_list = [span.text()
                         for span in detail_doc('.tags-wrap span').items()]
            if len(type_list) >= 2:
                sell_house_type = type_list[0] + '/' + type_list[1]
            else:
                sell_house_type = None

            pro_name = detail_doc('.title-wrap div h2').text()

            detail_dic = {
                "pro_name": pro_name,
                "room_code": room_code,
                "ref_ave_price": ref_ave_price,
                "ref_total_price": ref_total_price,
                "ref_unit_price": ref_unit_price,
                "new_open_date": opendate,
                "pro_addr": addres,
                "pro_tag": pro_tag,
                "style_room": style,
                "sell_house_type": sell_house_type,
            }
            print("正在获取第%s套信息......" % (j + 1))
            print("第%s套信息:" % (j + 1))
            # No ``encoding`` argument: Python 3's json.dumps rejects it and
            # Python 2 already defaults to UTF-8.
            print(json.dumps(detail_dic, ensure_ascii=False))
        except Exception as e:  # best effort: report the error, move on
            print(e)

if __name__ == '__main__':
    # Crawl the first 100 result pages, then print every listing's details.
    room_codes = shell_room_page(100)
    shell_room_detail(room_codes)

练手之爬取贝壳楼盘信息

原文:https://www.cnblogs.com/East-fence/p/12112402.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!