lxml is a third-party library for parsing and processing XML documents (an HTML file can be treated as a special kind of XML file).
XPath is a syntax for extracting target elements or attributes from an XML document based on its document structure; here, lxml is the underlying tool that evaluates it.
from lxml import etree

html_tree = etree.parse("./test.html")
print(html_tree)

Output:
<lxml.etree._ElementTree object at 0x0000028A81E566C8>

In XPath syntax, "/" selects children of the current node and "//" selects descendants of the current node; a path that begins with "/" is searched from the document root.
The xpath() function takes one string argument, an XPath path used to locate the target nodes; its return value is a list containing the nodes it matched.
[Note] In XPath, counting starts at 1; there is no index 0 and no negative indices.
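The post never shows test.html itself. As a self-contained sketch, the same kind of tree can be built from an inline string with etree.HTML(); the markup below is a hypothetical stand-in chosen to match the paths queried in the examples that follow, not the author's actual file:

from lxml import etree

# Hypothetical stand-in for test.html (the original file is not shown in the post)
sample = """
<html><body>
  <div><div><a href="http://www.example.com">a link</a></div></div>
  <ol>
    <li id="first">item 1</li>
    <li class="dudu">item 2</li>
    <li class="tanshui taohua">item 3</li>
    <li class="hehe" id="hh">item 4</li>
  </ol>
  <ol><li class="nene">second list, item 1</li></ol>
</body></html>
"""
html_tree = etree.HTML(sample)  # parse from a string instead of a file
print(html_tree.xpath("/html/body/ol[1]/li[1]/text()"))  # ['item 1']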
1 ret = html_tree.xpath("/html/body/ol/li[1]") 2 ret = html_tree.xpath("/html/body/div/div[1]/a") # 里面用xpath路径来定位目标节点
1 ret = html_tree.xpath("/html/body/div/div[1]/a/text()") # 提取标签的内容 2 3 ret = html_tree.xpath("/html/body/div/div[1]/a/@href") # 提取href属性,【注意】xpath语法中所有的节点属性要在前面加上“@ ”符号
Positioning by hierarchy

ret = html_tree.xpath("/html/body//li/text()")  # get all the li elements on the page
Positioning by attribute

ret = html_tree.xpath("/html/body//li[@id]/text()")                      # find all li that have an id attribute
ret = html_tree.xpath("/html/body//li[@class='dudu']/text()")            # find all li whose class attribute is dudu
ret = html_tree.xpath("/html/body//li[@class='tanshui taohua']/text()")  # the attribute value must be written out in full
ret = html_tree.xpath("/html/body//li[contains(@class,'he')]/text()")    # contains: find all li whose class value contains "he"
ret = html_tree.xpath("/html/body//li[starts-with(@class,'h')]/text()")  # starts-with: find all li whose class value starts with "h"
ret = html_tree.xpath("/html/body//li[@class and @id]/text()")   # and: find all li that have both an id attribute and a class attribute
ret = html_tree.xpath("//li[@class='nene' or @id='hh']/text()")  # or: find all li whose id is "hh" or whose class is "nene"
print(ret)
ol = html_tree.xpath("//ol[2]")[0]
print(ol)  # find the second ol

Extract the li from the ol found above; "." refers to the current node and ".." to its parent.

ret = ol.xpath("//li/text()")    # absolute path: no matter which object calls xpath(), the search starts from the document root
ret = ol.xpath("..//li/text()")  # relative path: the search starts from the object that calls xpath() (here its parent, via "..")

print(ret)
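With the hypothetical sample tree sketched earlier, the difference shows up clearly; ".//" is the usual way to restrict a search to the context node:

ol = html_tree.xpath("//ol[2]")[0]
print(ol.xpath("//li/text()"))   # absolute: every li in the whole document
print(ol.xpath(".//li/text()"))  # relative to ol: only the li inside this second ol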
A complete example: crawl Lianjia (lianjia.com) second-hand housing listings and store the results.

from urllib import request
from time import sleep
from lxml import etree
import re
import json
import csv
import redis

# 1. [Fetching the data]
def request_from(url, page, city):
    page_url = url % (city, page)
    req = request.Request(url=page_url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'})
    return req

def get_pages(url, start, end, city):
    # Create a request object for each page
    for page in range(start, end + 1):
        req = request_from(url=url, page=page, city=city)
        # Send the request
        res = request.urlopen(req)
        sleep(1)
        html = res.read().decode("utf-8")
        yield html

# 2. [Parsing the data]
def analyze_data(pages):
    for page in pages:
        # Turn the page into an element tree with etree
        page_tree = etree.HTML(page)
        house_list = page_tree.xpath("//ul[@class='sellListContent']/li")
        # Iterate over each li (one li per house listing)
        for house in house_list:
            # Extract the content into an item dict that gathers one house's fields
            item = {}
            item["title"] = house.xpath(".//div[@class='title']//a/text()")[0]
            item["houseInfo"] = "".join(house.xpath(".//div[@class='houseInfo']//text()"))
            item["positionInfo"] = "".join(house.xpath(".//div[@class='positionInfo']//text()"))
            item["unitPrice"] = re.findall(pattern=r'[0-9]+', string=house.xpath(".//div[@class='unitPrice']//text()")[0])[0]
            item["totalPrice"] = house.xpath(".//div[@class='totalPrice']//text()")[0]
            item["picUrl"] = house.xpath(".//img[@class='lj-lazy']/@data-original")[0]
            yield item

# 3. [Storing the data]
def write_to_json(houses):
    # Gather all the house data into one dict for the JSON output
    hd = {}
    # A list holding each house's info
    hl = []
    for house in houses:
        hl.append(house)
    hd["house"] = hl
    with open("house.json", 'w', encoding='utf-8') as fp:
        fp.write(json.dumps(hd, ensure_ascii=False))

def write_to_redis(houses):
    # Create the redis connection
    rds = redis.StrictRedis(host="www.fanjianbo.com", port=6379, db=6)
    for house in houses:
        # lpush only accepts bytes/str/numbers, so serialize each dict first
        rds.lpush("ershoufang", json.dumps(house, ensure_ascii=False))

def write_to_csv(houses):
    # Open a csv file (newline="" keeps csv from inserting blank rows on Windows)
    fp = open("ershoufang.csv", "w", newline="", encoding="utf-8")
    # Create a writer object
    writer = csv.writer(fp)
    # Write the header row
    writer.writerow(["title", "houseInfo", "positionInfo", "unitPrice", "totalPrice", "picUrl"])
    for house in houses:
        # Each row of the two-dimensional csv table is a list
        values = []
        for k, v in house.items():
            values.append(v)
        writer.writerow(values)
    fp.close()

if __name__ == '__main__':
    url = "https://%s.lianjia.com/ershoufang/pg%d/"
    city = input("Enter the city abbreviation: ")
    start = int(input("Enter the start page: "))
    end = int(input("Enter the end page: "))
    pages = get_pages(url=url, city=city, start=start, end=end)
    houses = analyze_data(pages)
    # Save to csv
    write_to_csv(houses)
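If the Redis branch is used instead, each house is stored as a JSON string (see write_to_redis above), so reading the data back means decoding each list entry. A minimal sketch, reusing the same host and db assumptions as the script:

import json
import redis

rds = redis.StrictRedis(host="www.fanjianbo.com", port=6379, db=6, decode_responses=True)
# lrange returns the stored JSON strings; json.loads turns each one back into a dict
houses = [json.loads(item) for item in rds.lrange("ershoufang", 0, -1)]
print(len(houses))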
Source: https://www.cnblogs.com/TMMM/p/10802113.html