首页 > 编程语言 > 详细

初始python 之 爬虫:爬取某电影网站信息

时间:2019-07-20 22:28:44      阅读:128      评论:0      收藏:0      [点我收藏+]

注:此代码仅用于个人爱好学习使用,不涉及任何商业行为!

 话不多说,直接上代码:

#!/user/bin env python
# author:Simple-Sir
# time:2019/7/20 20:36
# 获取电影天堂详细信息
import requests
from lxml import etree

# 伪装浏览器
HEADERS ={
    User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36,
}
# 定义全局变量
BASE_DOMAIN = https://www.dytt8.net
# 获取首页网页信息并解析
def getUrlText(url,coding):
    respons = requests.get(url,headers=HEADERS)  # 获取网页信息
    # enc = respons.encoding
    # urlText = respons.content.decode(‘gbk‘)
    if(coding==c):
        urlText = respons.content.decode(gbk)
        html = etree.HTML(urlText)  # 使用lxml解析网页
    else:
        urlText = respons.text
        html = etree.HTML(urlText)  # 使用lxml解析网页
    return html

# 获取电影详情页的href,text解析
def getHref(url):
    html = getUrlText(url,t)
    aHref = html.xpath(//table[@class="tbspan"]//a/@href)
    htmlAll = map(lambda url:BASE_DOMAIN+url,aHref) # 给每个href补充BASE_DOMAIN
    return htmlAll

# 使用content解析电影详情页,并获取详细信息数据
def getPage(url):
    html = getUrlText(url,c)
    moveInfo = {}  # 定义电影信息字典
    mName = html.xpath(//div[@class="title_all"]//font[@color="#07519a"]/text())[0]
    moveInfo[电影名字] = mName
    mDiv = html.xpath(//div[@id="Zoom"])[0]
    mImgSrc = mDiv.xpath(.//img/@src)
    moveInfo[海报地址] = mImgSrc[0]  # 获取海报src地址
    if len(mImgSrc) >= 2:
        moveInfo[电影截图地址] = mImgSrc[1]  # 获取电影截图src地址
    mContnent = mDiv.xpath(.//text())
    def pares_info(info,rule):
        ‘‘‘
        :param info: 字符串
        :param rule: 替换字串
        :return:  指定字符串替换为空,并剔除左右空格
        ‘‘‘
        return info.replace(rule,‘‘).strip()
    for index,t in enumerate(mContnent):
        if t.startswith(◎译  名):
            name = pares_info(t,◎译  名)
            moveInfo[译名]=name
        elif t.startswith(◎片  名):
            name = pares_info(t,◎片  名)
            moveInfo[片名]=name
        elif t.startswith(◎年  代):
            name = pares_info(t,◎年  代)
            moveInfo[年代]=name
        elif t.startswith(◎产  地):
            name = pares_info(t,◎产  地)
            moveInfo[产地]=name
        elif t.startswith(◎类  别):
            name = pares_info(t,◎类  别)
            moveInfo[类别]=name
        elif t.startswith(◎语  言):
            name = pares_info(t,◎语  言)
            moveInfo[语言]=name
        elif t.startswith(◎字  幕):
            name = pares_info(t,◎字  幕)
            moveInfo[字幕]=name
        elif t.startswith(◎上映日期):
            name = pares_info(t,◎上映日期)
            moveInfo[上映日期]=name
        elif t.startswith(◎IMDb评分):
            name = pares_info(t,◎IMDb评分)
            moveInfo[IMDb评分]=name
        elif t.startswith(◎豆瓣评分):
            name = pares_info(t,◎豆瓣评分)
            moveInfo[豆瓣评分]=name
        elif t.startswith(◎文件格式):
            name = pares_info(t,◎文件格式)
            moveInfo[文件格式]=name
        elif t.startswith(◎视频尺寸):
            name = pares_info(t,◎视频尺寸)
            moveInfo[视频尺寸]=name
        elif t.startswith(◎文件大小):
            name = pares_info(t,◎文件大小)
            moveInfo[文件大小]=name
        elif t.startswith(◎片  长):
            name = pares_info(t,◎片  长)
            moveInfo[片长]=name
        elif t.startswith(◎导  演):
            name = pares_info(t,◎导  演)
            moveInfo[导演]=name
        elif t.startswith(◎编  剧):
            name = pares_info(t, ◎编  剧)
            writers = [name]
            for i in range(index + 1, len(mContnent)):
                writer = mContnent[i].strip()
                if writer.startswith():
                    break
                writers.append(writer)
            moveInfo[编剧] = writers
        elif t.startswith(◎主  演):
            name = pares_info(t, ◎主  演)
            actors = [name]
            for i in range(index+1,len(mContnent)):
                actor = mContnent[i].strip()
                if actor.startswith():
                    break
                actors.append(actor)
            moveInfo[主演] = actors
        elif t.startswith(◎标  签):
            name = pares_info(t,◎标  签)
            moveInfo[标签]=name
        elif t.startswith(◎简  介):
            name = pares_info(t,◎简  介)
            profiles = []
            for i in range(index + 1, len(mContnent)):
                profile = mContnent[i].strip()
                if profile.startswith(◎获奖情况) or 【下载地址】 in profile:
                    break
                profiles.append(profile)
            moveInfo[简介]=profiles
        elif t.startswith(◎获奖情况):
            name = pares_info(t,◎获奖情况)
            awards = []
            for i in range(index + 1, len(mContnent)):
                award = mContnent[i].strip()
                if 【下载地址】 in award:
                    break
                awards.append(award)
            moveInfo[获奖情况]=awards
    downUrl = html.xpath(//td[@bgcolor="#fdfddf"]/a/@href)[0]
    moveInfo[下载地址] = downUrl
    return moveInfo

# 获取前n页所有电影的详情页href
def spider():
    base_url = https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html
    moves = []
    m = int(input(请输入您要获取的开始页:))
    n = int(input(请输入您要获取的结束页:))
    print(即将写入第{}页到第{}页的电影信息,请稍后....format(m, n))
    for i in range(m,n+1):
        print(******* 第{}页电影 正在写入 ********.format(i))
        url = base_url.format(i)
        moveHref = getHref(url)
        for index,mhref in enumerate(moveHref):
            print(---- 第{}部电影 正在写入----.format(index+1))
            move = getPage(mhref)
            moves.append(move)
    # 将电影信息写入本地本件
    for i in moves:
        with open(电影天堂电影信息.txt, a+, encoding=utf-8) as f:
            f.write(\n********* {} ***************\n.format(i[电影名字]))
        for info in i:
            with open(电影天堂电影信息.txt,a+,encoding=utf-8) as f:
                f.write({}:{}\n.format(info,i[info]))
    print(写入完成!)

if __name__ == __main__:
    spider()

执行情况:

技术分享图片

 技术分享图片

 

结果文件:

技术分享图片

 

初始python 之 爬虫:爬取某电影网站信息

原文:https://www.cnblogs.com/simple-li/p/11219525.html

(0)
(0)
   
举报
评论 一句话评论(0
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!