I was bored by myself, so I wrote a little crawler to scrape some, let's say, unmentionable pictures....
The code is pretty short, so for now I'm just pasting it here as a backup.
#! /usr/bin/python
import chardet
import urllib3
import uuid
import os
import logging
import time
import sys
import re
import threading
from bs4 import BeautifulSoup
"""
http://www.qiubaichengren.com/1.html
"""
# NOTE: both custom exceptions inherit from BaseException rather than Exception,
# so the blanket `except Exception` in get() will not swallow them.
class PageNotFoundException(BaseException):
    """
        Raised when the requested page returns HTTP 404.
    """
    pass
class ResponseStatusException(BaseException):
    """
        Raised for any other non-200 response status.
    """
    pass
class QiuBaiChengRenSpider:
    http_pool_manager = urllib3.PoolManager()
    img_save_dir = 'D:/QiuBaiChengRen/'
    logger = logging.getLogger('QiuBaiChengRenSpider')
    def __init__(self):
        self.init_log()
        # Create the save directory if it does not exist yet.
        if not os.path.isdir(self.img_save_dir):
            os.makedirs(self.img_save_dir)
    def init_log(self):
        stream_handler = logging.StreamHandler(sys.stdout)
        self.logger.addHandler(stream_handler)
        self.logger.setLevel(logging.DEBUG)
    def get(self, url):
        try:
            http_response = self.http_pool_manager.request('GET', url)
            if http_response.status == 404:
                raise PageNotFoundException('404')
            if http_response.status != 200:
                raise ResponseStatusException(http_response.status)
            return http_response.data
        except Exception:
            self.logger.info('An exception occurred while fetching the page')
            return b''
    def extract_img(self, html_doc):
        bs = BeautifulSoup(html_doc, 'lxml')
        imgs = bs.select('div.mala-text img')
        return imgs
    def save_img(self, img_tag):
        img_link = img_tag['src'].strip()
        # File name: <alt text>___<random hex><original extension>; alt may be absent, so default to ''
        save_name = self.img_save_dir + img_tag.get('alt', '') + '___' + uuid.uuid4().hex + os.path.splitext(img_link)[1]
        save_name = re.compile('[\\s+,\",\']').sub('', save_name)    # strip characters that are not legal in a file name
        self.logger.info('Save img: %s %s' % (save_name, img_link))
        img_byte = self.get(img_link)
        if not img_byte:
            return
        with open(save_name, 'wb') as img_file:
            img_file.write(img_byte)
    def list_visitor(self, seed):
        threads = []
        i = 1
        while True:
            try:
                url = seed % {'page': i}
                self.logger.info('Begin process:%s' % url)
                html_doc = self.get(url)
                if not html_doc:
                    # Could not fetch this page; skip it rather than retrying it forever.
                    i += 1
                    continue
                imgs = self.extract_img(html_doc)
                for img in imgs:
                    # self.logger.info('Saving img:%s %s' % (img['alt'], img['src']))
                    # One download thread per image; args must be an iterable of positional arguments.
                    t1 = threading.Thread(target=self.save_img, args=(img,))
                    t1.start()
                    threads.append(t1)
                i += 1
            except PageNotFoundException:
                self.logger.info('404')
                break
            except BaseException:
                break
        for t1 in threads:
            t1.join()
if __name__ == '__main__':
    spider = QiuBaiChengRenSpider()
    spider.list_visitor('http://www.qiubaichengren.com/%(page)d.html')
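For anyone skimming rather than running it, here is a tiny standalone sketch (not part of the crawler itself; the HTML snippet below is made up) of the two things the script actually does: expand the %-style seed template into page URLs, and pull image tags out of a page with the same CSS selector that extract_img() uses:

from bs4 import BeautifulSoup

# Page URLs come from an old-style %-formatting template.
seed = 'http://www.qiubaichengren.com/%(page)d.html'
print(seed % {'page': 1})   # http://www.qiubaichengren.com/1.html
print(seed % {'page': 2})   # http://www.qiubaichengren.com/2.html

# Image tags are picked out with the same selector as extract_img();
# the HTML here is a made-up stand-in for a real list page.
html = '<div class="mala-text"><img src="http://example.com/a.jpg" alt="demo"></div>'
for img in BeautifulSoup(html, 'lxml').select('div.mala-text img'):
    print(img['src'], img['alt'])   # http://example.com/a.jpg demo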
Original post: http://www.cnblogs.com/cc11001100/p/7624927.html