首页 > 其他 > 详细

电影天堂爬虫

时间:2020-03-01 15:54:51      阅读:45      评论:0      收藏:0      [点我收藏+]

0x00电影天堂爬虫代码

#coding:utf-8
from lxml import etree
import requests
# Site root; relative hrefs scraped from listing tables are appended to this.
BASE_DOMAIN = "https://www.dy2018.com/"

# Sample listing-page URL. NOTE(review): unused below — spider() builds its own
# URLs from base_url; kept for manual testing.
url = "https://www.dy2018.com/html/gndy/dyzz/index_2.html"

proxy = {
    'http':'117.69.150.100:9000'# HTTP proxy for listing requests — TODO confirm still reachable
}
# Browser-like User-Agent so the site does not reject the scraper outright.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
    
}
def get_detail_urls(url):
    """Fetch one listing page and return absolute URLs of its movie detail pages.

    Args:
        url: URL of a listing page (e.g. .../gndy/dyzz/index_2.html).

    Returns:
        list[str]: absolute detail-page URLs, one per movie row on the page.
    """
    response = requests.get(url, headers=headers, proxies=proxy)
    # requests guesses the encoding when decoding .text; this site serves GBK,
    # so if titles come back garbled use: response.content.decode('gbk')
    text = response.text
    html = etree.HTML(text)
    # Each movie entry on a listing page is an <a href> inside <table class="tbspan">.
    hrefs = html.xpath("//table[@class='tbspan']//a/@href")
    # BUG FIX: the original returned a lazy map() object, which is exhausted
    # after one pass and has no len(); build a concrete list instead. Also
    # avoid shadowing the `url` parameter as the original lambda did.
    return [BASE_DOMAIN + href for href in hrefs]

#获取内容页数据
def parse_detail_page(url):
    """Fetch a movie detail page and extract its metadata into a dict.

    Args:
        url: absolute URL of a movie detail page.

    Returns:
        dict: keys among 'title', 'cover', 'screenshot', 'year', 'country',
        'category', 'douban_rate', 'duration', 'director', 'actors',
        'profile', 'download_url'. Fields absent from the page are omitted.
    """
    response = requests.get(url, headers=headers)
    movie = {}
    # The site serves GBK-encoded pages; requests' encoding guess is often
    # wrong here, so decode explicitly.
    text = response.content.decode('gbk')
    html = etree.HTML(text)

    # BUG FIX: the original stored a list of raw lxml Elements under 'title';
    # extract the text content instead.
    title_texts = [t.strip() for t in html.xpath("//div[@class='title_all']//text()") if t.strip()]
    movie['title'] = title_texts[0] if title_texts else ''

    zoomE = html.xpath("//div[@id='Zoom']")[0]
    imgs = zoomE.xpath(".//img/@src")
    # BUG FIX: guard the image indexing — pages with fewer than two images
    # raised IndexError in the original.
    if imgs:
        movie['cover'] = imgs[0]
    if len(imgs) > 1:
        movie['screenshot'] = imgs[1]

    def parse_info(info, rule):
        # Strip the field label (e.g. "◎年  代") and surrounding whitespace.
        return info.replace(rule, "").strip()

    infos = zoomE.xpath(".//text()")
    for index, info in enumerate(infos):
        if info.startswith("◎年  代"):
            movie['year'] = parse_info(info, "◎年  代")
        elif info.startswith("◎产  地"):
            movie['country'] = parse_info(info, "◎产  地")
        elif info.startswith("◎类  别"):
            movie['category'] = parse_info(info, "◎类  别")
        elif info.startswith("◎豆瓣评分"):
            movie['douban_rate'] = parse_info(info, "◎豆瓣评分")
        elif info.startswith("◎片  长"):
            movie['duration'] = parse_info(info, "◎片  长")
        elif info.startswith("◎导  演"):
            movie['director'] = parse_info(info, "◎导  演")
        elif info.startswith("◎主  演"):
            # Actors continue on following text nodes until the next "◎" field.
            actors = [parse_info(info, "◎主  演")]
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith("◎"):
                    break
                if actor:
                    actors.append(actor)
            # BUG FIX: assign once after collecting — the original assigned
            # inside the loop on every appended actor.
            movie['actors'] = actors
        elif info.startswith("◎简  介"):
            # BUG FIX: the original overwrote movie['profile'] with each text
            # node (keeping only the last one) and never stopped at the next
            # "◎" field; accumulate the paragraphs and break instead.
            profile_parts = []
            for x in range(index + 1, len(infos)):
                part = infos[x].strip()
                if part.startswith("◎"):
                    break
                if part:
                    profile_parts.append(part)
            movie['profile'] = "\n".join(profile_parts)
    # BUG FIX: guard against pages with no download link (IndexError before).
    down_urls = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")
    if down_urls:
        movie['download_url'] = down_urls[0]
    return movie



           
    
#获取列表数据:

def spider():
    """Crawl listing pages 2–8, parse every movie's detail page, and print each.

    Returns:
        list[dict]: all parsed movie dicts, in crawl order. (The original
        collected this list but discarded it; returning it is backward
        compatible since callers ignored the previous None.)
    """
    base_url = "https://www.dy2018.com/html/gndy/dyzz/index_{}.html"
    movies = []
    # Outer loop: listing pages 2 through 8 (range end is exclusive).
    for page in range(2, 9):
        listing_url = base_url.format(page)
        # Inner loop: every movie detail URL found on this listing page.
        for detail_url in get_detail_urls(listing_url):
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            print(movie)
    return movies
          



# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    spider()

运行效果

技术分享图片

电影天堂爬虫

原文:https://www.cnblogs.com/wangtanzhi/p/12390116.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!