首页 > 编程语言 > 详细

python爬虫系列:小试牛刀

时间:2017-03-05 21:02:11      阅读:404      评论:0      收藏:0      [点我收藏+]

这篇博客主要是写两个爬虫,一个抓取静态网站的文字和图片,一个抓取动态网站的电影及相关消息。

 

1.每日一文(http://voice.meiriyiwen.com/)

技术分享
#coding=utf-8
#爬取每日一文前10页内容
from lxml import etree
import requests
import urllib2,urllib
import sys
import os
import time

# Template for the "past articles" listing pages; %d is the 1-based page number.
# (Quotes were stripped from the original paste; restored here.)
tmpt_url = "http://voice.meiriyiwen.com/voice/past?page=%d"
# Pre-build the URLs for listing pages 1 through 10.
urllist = [tmpt_url % i for i in range(1, 11)]

def get_url():
    """Fetch each listing page in ``urllist`` and hand the response to get_info().

    Sends a browser-like header set, sleeps 0.5 s between requests to be
    polite to the server, and prints each response's HTTP status code.
    """
    # Headers are loop-invariant: build the dict once, not per request.
    headers = {
        "Host": "voice.meiriyiwen.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/54.0.2840.71 Safari/537.36"),
        "Accept": ("text/html,application/xhtml+xml,application/xml;"
                   "q=0.9,image/webp,*/*;q=0.8"),
        "Accept-Encoding": "gzip, deflate, sdch, br",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
    }
    for url in urllist:
        try:
            time.sleep(0.5)  # throttle: at most two requests per second
            response = requests.get(url, headers=headers)
            print(response.status_code)
            get_info(response)
        except requests.RequestException as e:
            # The original caught urllib2.URLError, which requests never
            # raises; catch requests' own exception hierarchy instead and
            # continue with the next page.
            print(e)

def get_info(response):
    """Parse one listing page and print '<n> | <title> | <author>' per entry.

    Increments the module-level ``count`` (initialised in the __main__
    guard) so numbering continues across pages.
    """
    global count
    tree = etree.HTML(response.content)
    # Each listing page wraps its entries in elements with class "img_list".
    for node in tree.xpath('//*[@class="img_list"]'):
        titles = node.xpath('//*[@class="list_author"]/a/text()')
        authors = node.xpath('//*[@class="author_name"]/text()')
        for title, author in zip(titles, authors):
            count += 1
            # \xa0 is &nbsp; left over from the HTML; strip it before printing.
            print(count, "|", title.replace(u"\xa0", "").strip(),
                  "|", author.replace(u"\xa0", "").strip())

if __name__ == "__main__":
    # Running article number, shared with get_info() via ``global count``.
    count = 0
    get_url()
View Code

运行结果:

技术分享

 

2.豆瓣电影(https://movie.douban.com)

技术分享
# coding=utf-8
import json
import os
import sys
import time
import urllib
import urllib2
import pymongo
import requests
import re
from lxml import etree

# reload(sys)
# sys.setdefaultencoding(‘utf-8‘)
# Douban's JSON search endpoint; %s is the category tag to browse.
# (Quotes were stripped from the original paste; restored here.)
tmpt_url = ("https://movie.douban.com/j/search_subjects"
            "?type=movie&tag=%s&sort=recommend&page_limit=500&page_start=0")
# Categories to crawl: popular, latest, classics, top-rated, hidden gems,
# Chinese, Western, Korean, Japanese, action, comedy, romance, sci-fi,
# suspense, horror, art-house.
tags = (u"热门", u"最新", u"经典", u"豆瓣高分", u"冷门佳片", u"华语", u"欧美", u"韩国",
        u"日本", u"动作", u"喜剧", u"爱情", u"科幻", u"悬疑", u"恐怖", u"文艺")
urllist = [tmpt_url % i for i in tags]
#print urllist


def get_url():
    """Fetch the recommendation JSON for every tag in ``urllist``.

    Prints the movie category (parsed back out of the URL) and passes each
    response to get_info(). Sleeps 0.5 s between requests.
    """
    # Headers are loop-invariant: build the dict once, not per request.
    headers = {
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/54.0.2840.71 Safari/537.36"),
        "Accept": ("text/html,application/xhtml+xml,application/xml;"
                   "q=0.9,image/webp,*/*;q=0.8"),
        "Accept-Encoding": "gzip, deflate, sdch, br",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
    }
    for url in urllist:
        try:
            time.sleep(0.5)  # throttle between requests
            response = requests.get(url, headers=headers)
            # Pull the tag back out of the query string for display.
            tag = re.findall(r"tag=(.*?)&", url)
            print(u"电影类型:", tag)
            get_info(response)
        except Exception as e:
            # Broad catch kept from the original: log the failure and
            # continue with the next tag instead of aborting the crawl.
            print(e)


def get_info(response):
    """Print link, title and rating for every movie in one JSON response.

    ``response.content`` is Douban's JSON payload whose "subjects" key holds
    one dict per movie. Increments the module-level ``count`` initialised in
    the __main__ guard.
    """
    global count
    # json.loads decodes UTF-8 bytes directly; the old ``encoding=`` keyword
    # argument was deprecated and removed in Python 3.9.
    movies = json.loads(response.content)["subjects"]
    for item in movies:
        count += 1
        print(count, u"电影链接:", item["url"], u"电影名:", item["title"],
              u"评分:", item["rate"])


if __name__ == "__main__":
    # Running movie number, shared with get_info() via ``global count``.
    count = 0
    get_url()
View Code

运行结果:

技术分享

 

python爬虫系列:小试牛刀

原文:http://www.cnblogs.com/Ryana/p/6147771.html

(0)
(0)
   
举报
评论 一句话评论(0
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!