首页 > 编程语言 > 详细

python爬虫系列:小试牛刀

时间:2017-03-05 21:02:11      阅读:404      评论:0      收藏:0      [点我收藏+]

这篇博客主要是写两个爬虫,一个抓取静态网站的文字和图片,一个抓取动态网站的电影及相关消息。

 

1.每日一文(http://voice.meiriyiwen.com/)

技术分享
#coding=utf-8
#爬取每日一文前10页内容
from lxml import etree
import requests
import urllib2,urllib
import sys
import os
import time

# Template for the "past articles" listing pages; %d is the 1-based page number.
# (Quotes were stripped from the original paste; restored here.)
tmpt_url = "http://voice.meiriyiwen.com/voice/past?page=%d"
# Pre-build the URLs for listing pages 1 through 10.
urllist = [tmpt_url % i for i in range(1, 11)]

def get_url():
    """Fetch each listing page in ``urllist`` and hand the response to get_info().

    Sends a browser-like header set, sleeps 0.5 s between requests to be
    polite to the server, and prints each response's HTTP status code.
    """
    # Headers are loop-invariant: build the dict once, not per request.
    headers = {
        "Host": "voice.meiriyiwen.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/54.0.2840.71 Safari/537.36"),
        "Accept": ("text/html,application/xhtml+xml,application/xml;"
                   "q=0.9,image/webp,*/*;q=0.8"),
        "Accept-Encoding": "gzip, deflate, sdch, br",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
    }
    for url in urllist:
        try:
            time.sleep(0.5)  # throttle: at most two requests per second
            response = requests.get(url, headers=headers)
            print(response.status_code)
            get_info(response)
        except requests.RequestException as e:
            # The original caught urllib2.URLError, which requests never
            # raises; catch requests' own exception hierarchy instead and
            # continue with the next page.
            print(e)

def get_info(response):
    """Parse one listing page and print '<n> | <title> | <author>' per entry.

    Increments the module-level ``count`` (initialised in the __main__
    guard) so numbering continues across pages.
    """
    global count
    tree = etree.HTML(response.content)
    # Each listing page wraps its entries in elements with class "img_list".
    for node in tree.xpath('//*[@class="img_list"]'):
        titles = node.xpath('//*[@class="list_author"]/a/text()')
        authors = node.xpath('//*[@class="author_name"]/text()')
        for title, author in zip(titles, authors):
            count += 1
            # \xa0 is &nbsp; left over from the HTML; strip it before printing.
            print(count, "|", title.replace(u"\xa0", "").strip(),
                  "|", author.replace(u"\xa0", "").strip())

if __name__ == "__main__":
    # Running article number, shared with get_info() via ``global count``.
    count = 0
    get_url()
View Code

运行结果:

技术分享

 

2.豆瓣电影(https://movie.douban.com)

技术分享
# coding=utf-8
import json
import os
import sys
import time
import urllib
import urllib2
import pymongo
import requests
import re
from lxml import etree

# reload(sys)
# sys.setdefaultencoding(‘utf-8‘)
# Douban's JSON search endpoint; %s is the category tag to browse.
# (Quotes were stripped from the original paste; restored here.)
tmpt_url = ("https://movie.douban.com/j/search_subjects"
            "?type=movie&tag=%s&sort=recommend&page_limit=500&page_start=0")
# Categories to crawl: popular, latest, classics, top-rated, hidden gems,
# Chinese, Western, Korean, Japanese, action, comedy, romance, sci-fi,
# suspense, horror, art-house.
tags = (u"热门", u"最新", u"经典", u"豆瓣高分", u"冷门佳片", u"华语", u"欧美", u"韩国",
        u"日本", u"动作", u"喜剧", u"爱情", u"科幻", u"悬疑", u"恐怖", u"文艺")
urllist = [tmpt_url % i for i in tags]
#print urllist


def get_url():
    """Fetch the recommendation JSON for every tag in ``urllist``.

    Prints the movie category (parsed back out of the URL) and passes each
    response to get_info(). Sleeps 0.5 s between requests.
    """
    # Headers are loop-invariant: build the dict once, not per request.
    headers = {
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/54.0.2840.71 Safari/537.36"),
        "Accept": ("text/html,application/xhtml+xml,application/xml;"
                   "q=0.9,image/webp,*/*;q=0.8"),
        "Accept-Encoding": "gzip, deflate, sdch, br",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
    }
    for url in urllist:
        try:
            time.sleep(0.5)  # throttle between requests
            response = requests.get(url, headers=headers)
            # Pull the tag back out of the query string for display.
            tag = re.findall(r"tag=(.*?)&", url)
            print(u"电影类型:", tag)
            get_info(response)
        except Exception as e:
            # Broad catch kept from the original: log the failure and
            # continue with the next tag instead of aborting the crawl.
            print(e)


def get_info(response):
    """Print link, title and rating for every movie in one JSON response.

    ``response.content`` is Douban's JSON payload whose "subjects" key holds
    one dict per movie. Increments the module-level ``count`` initialised in
    the __main__ guard.
    """
    global count
    # json.loads decodes UTF-8 bytes directly; the old ``encoding=`` keyword
    # argument was deprecated and removed in Python 3.9.
    movies = json.loads(response.content)["subjects"]
    for item in movies:
        count += 1
        print(count, u"电影链接:", item["url"], u"电影名:", item["title"],
              u"评分:", item["rate"])


if __name__ == "__main__":
    # Running movie number, shared with get_info() via ``global count``.
    count = 0
    get_url()
View Code

运行结果:

技术分享

 

python爬虫系列:小试牛刀

原文:http://www.cnblogs.com/Ryana/p/6147771.html

(0)
(0)
   
举报
评论 一句话评论(0
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!