首页 > 其他 > 详细

爬虫大作业

时间:2018-04-22 21:03:24      阅读:134      评论:0      收藏:0      [点我收藏+]
import requests
import re
from bs4 import BeautifulSoup
import jieba.analyse
from PIL import Image, ImageSequence
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator

# Get the total number of pages
def getnum(url):
    """Return the total page count advertised on the listing page at *url*.

    The page is fetched, decoded as gb2312 and parsed; the text of the first
    ``.page-next`` element is searched for the pattern "共<digits>页".

    Args:
        url: Address of the listing page to fetch.

    Returns:
        The captured digits as a string (callers convert with ``int``).

    Raises:
        IndexError: If the page has no ``.page-next`` element.
        AttributeError: If that element's text does not match the pattern.
    """
    res = requests.get(url)
    # The response is decoded explicitly as gb2312 before reading res.text.
    res.encoding = "gb2312"
    soup = BeautifulSoup(res.text, "html.parser")
    Info = soup.select(".page-next")[0].extract().text
    # Raw string so the regex escape is not treated as a string escape.
    TotalNum = re.search(r"共(\d+)页.*", Info).group(1)
    return TotalNum

# Collect every detail-page link on a single listing page
def getpageurl(url):
    """Visit each detail page linked from the listing page at *url*.

    For every ``.list-page ul li`` item that contains at least one ``<a>``,
    the first anchor's ``href`` is appended to ``http://www.ckck.tv/``, the
    resulting address is printed, and ``getinfromation`` is called with it.

    Args:
        url: Address of the listing page to fetch.

    Raises:
        KeyError: If the first anchor of an item has no ``href`` attribute.
    """
    res = requests.get(url)
    res.encoding = "gb2312"
    soup = BeautifulSoup(res.text, "html.parser")
    for item in soup.select(".list-page ul li"):
        anchors = item.select("a")
        # Items without a link are skipped.
        if not anchors:
            continue
        info = anchors[0].attrs["href"]
        pageurl = "http://www.ckck.tv/" + info
        print(pageurl)
        getinfromation(pageurl)
# Extract the information shown on a detail page
def getinfromation(url):
    """Scrape one movie detail page and append its fields to ``gzccNews.txt``.

    The title comes from the first ``.content .movie ul h1`` element, the cast
    from the second ``.content .movie ul li`` item, and the year and region
    from the fifth item. Each value is printed, then the four values are
    written back to back followed by a newline.

    Args:
        url: Address of the detail page to fetch.

    Raises:
        IndexError: If the expected ``h1`` or ``li`` elements are missing.
        AttributeError: If an item's text does not match its pattern.
    """
    res = requests.get(url)
    res.encoding = "gb2312"
    soup = BeautifulSoup(res.text, "html.parser")

    a = soup.select(".content .movie ul h1")[0].text
    print("电影:", a)

    # Both the cast and the year/region come from the same <li> list.
    items = soup.select(".content .movie ul li")
    b = items[1].text
    name = re.search("【主 演】:(.*)", b).group(1)
    print("主演:", name)

    c = items[4].text
    date = re.search("【年 代】:(.*)  【地 区】:", c).group(1)
    print("年代:", date)
    diqu = re.search("【地 区】:(.*)", c).group(1)
    print("地区:", diqu)

    # Append the extracted fields to the output file; the context manager
    # closes the file even if a write fails.
    with open("gzccNews.txt", "a", encoding="utf-8") as f:
        f.write(a)
        f.write(name)
        f.write(date)
        f.write(diqu)
        f.write("\n")

# Generate the word cloud
def getpicture():
    """Build a word cloud from ``gzccNews.txt`` and save it as ``output.png``.

    The file's text is ranked with ``jieba.analyse.textrank`` (top 50 keywords
    with weights), the keyword/weight mapping is printed, and a word cloud is
    rendered using ``input.jpg`` both as the mask and as the colour source.
    The recoloured cloud is displayed with matplotlib and then written to
    ``output.png``.
    """
    # Read the whole file in one call. Calling f.read() inside a loop over
    # the file object loses the first line, which the iterator has already
    # consumed.
    with open("gzccNews.txt", "r", encoding="utf-8") as f:
        lyric = f.read()

    result = jieba.analyse.textrank(lyric, topK=50, withWeight=True)
    # Each result entry is indexed as (keyword, weight).
    keywords = {entry[0]: entry[1] for entry in result}
    print(keywords)

    image = Image.open("input.jpg")
    graph = np.array(image)
    wc = WordCloud(
        font_path="./fonts/simhei.ttf",
        background_color="White",
        max_words=50,
        mask=graph,
    )
    wc.generate_from_frequencies(keywords)
    image_color = ImageColorGenerator(graph)
    # Only the recoloured cloud is drawn; an uncoloured draw before it would
    # be painted over immediately.
    plt.imshow(wc.recolor(color_func=image_color))
    plt.axis("off")
    plt.show()
    wc.to_file("output.png")


# Entry point: crawl the index page, then the numbered listing pages, and
# finally render the word cloud from everything that was collected.
url = "http://www.ckck.tv/xj/Index.html"
a = getnum(url)
getpageurl(url)
# NOTE(review): range(2, int(a)) stops at page int(a) - 1. If the site numbers
# its listing pages 2..int(a), the last page is never crawled. Confirm the
# site's numbering before changing the bound.
for i in range(2, int(a)):
    page = "http://www.ckck.tv/xj/List_4_{}.html".format(i)
    getpageurl(page)
getpicture()

首先定义获取总页数、获取单个页面所有链接、获取页面信息、生成词云等函数。过程中,在获取所有页面的全部链接时出现了一点问题,原因归结于标签没有找对。这次爬取的是一个电影网站,先将网站里面的电影名、主演、年代、地区抓取下来并写入文件,然后进行词云生成。

技术分享图片

技术分享图片

 

技术分享图片

爬虫大作业

原文:https://www.cnblogs.com/qq974975766/p/8909008.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!