#coding:utf-8
import requests
from lxml import etree
# Step 1: download the target page.

# Proxy map handed to requests.
# NOTE(review): requests selects a proxy by URL scheme, so this "http"-only
# entry is not applied to the https:// URL below. Add an "https" entry (and a
# scheme on the proxy address) if the request is meant to go through it.
proxy = {
    'http': '120.11.0.63:9000',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
}
url = 'https://movie.douban.com/cinema/nowplaying/jining/'

# Seconds to wait on the server; without a timeout requests.get can block
# forever on an unresponsive host.
REQUEST_TIMEOUT = 10


def fetch_page(page_url=url, timeout=REQUEST_TIMEOUT):
    """Download *page_url* and return its body as decoded text.

    Args:
        page_url: Address to fetch; defaults to the module-level ``url``.
        timeout: Seconds to wait before giving up.

    Returns:
        The response body as ``str``.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx status (raised by ``raise_for_status``).
    """
    response = requests.get(
        page_url,
        headers=headers,
        proxies=proxy,
        timeout=timeout,
    )
    # Fail loudly on an error page instead of parsing it as a movie listing.
    response.raise_for_status()
    # response.text is the decoded str; response.content would be the raw,
    # unprocessed bytes exactly as fetched.
    return response.text


def build_movie(attributes, posters):
    """Assemble one movie record from an ``<li>``'s attributes.

    Args:
        attributes: Mapping of attribute name to value (anything with a
            dict-style ``get``).
        posters: Iterable of poster image URLs found inside the ``<li>``.

    Returns:
        A dict with the keys ``title``, ``wish``, ``region``, ``direction``,
        ``actors`` and ``posts``. A missing attribute becomes ``''`` rather
        than raising ``IndexError``.
    """
    return {
        'title': attributes.get('data-title', ''),
        'wish': attributes.get('data-wish', ''),
        'region': attributes.get('data-region', ''),
        # Key name kept as 'direction' (it holds the director) so the
        # emitted records stay unchanged for anything consuming them.
        'direction': attributes.get('data-director', ''),
        'actors': attributes.get('data-actors', ''),
        'posts': list(posters),
    }


def parse_movies(text):
    """Step 2: extract the movie records from the downloaded HTML.

    Only the first ``<ul class="lists">`` in the document is read; each of
    its direct ``<li>`` children becomes one record.

    Args:
        text: HTML source of the page.

    Returns:
        A list of movie dicts (see ``build_movie``); empty when the page has
        no such list.
    """
    # Nothing to parse; skip the parser entirely.
    if not text or not text.strip():
        return []
    html = etree.HTML(text)
    # Defensive: treat a missing parse tree the same as a page with no data.
    if html is None:
        return []
    lists = html.xpath("//ul[@class='lists']")
    if not lists:
        return []
    return [
        build_movie(li.attrib, li.xpath(".//img/@src"))
        for li in lists[0].xpath("./li")
    ]


if __name__ == "__main__":
    movies = parse_movies(fetch_page())
    print(movies)
# Original article: https://www.cnblogs.com/wangtanzhi/p/12380753.html