Below is a simple web crawler program.
#!/usr/bin/env python
# A simple single-domain Web crawler (Python 2).

from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
#from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO
from HTMLParser import HTMLParser

'''The next three lines set the default encoding to utf8. Without this,
Python falls back to ASCII when decoding, so any Unicode content in the
downloaded pages would raise an error. We import sys and then reload it
because the default import of sys removes setdefaultencoding, so reload()
is needed to bring the function back.'''
import sys
reload(sys)
sys.setdefaultencoding('utf8')


class RetrieveURL(HTMLParser):
    # A new class derived from HTMLParser.
    def __init__(self):
        HTMLParser.__init__(self)
        # The only reason to override __init__ is to give every
        # instance an anchorlist attribute.
        self.anchorlist = []

    def handle_starttag(self, tag, attrs):
        # Override handle_starttag so that whenever an <A> tag is seen,
        # the hyperlink in its href attribute is recorded in anchorlist.
        if tag == 'a' or tag == 'A':
            for t in attrs:
                if t[0] == 'href' or t[0] == 'HREF':
                    self.anchorlist.append(t[1])


class Retriever(object):                # download Web pages
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)   # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':    # no file, use default (what kind of situation could this be? e.g. https://www.baidu.com/file1)
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)            # local directory
        if sep != '/':                  # os-independent path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):             # create archive dir if necessary
            if exists(ldir): unlink(ldir)
            print 'ldir is ', ldir
            makedirs(ldir)
        return path

    def download(self):                 # download Web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval

    '''def parseAndGetLinks(self):      # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist'''

    def parseAndGetLinks(self):         # parse HTML, save links
        self.parser = RetrieveURL()
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist


class Crawler(object):                  # manage entire crawling process
    count = 0                           # static downloaded page counter

    def __init__(self, url):
        self.q = [url]
        self.seen = []
        self.dom = urlparse(url)[1]

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0] == '*':            # error situation, do not parse
            print retval, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()    # get and process links
        for eachLink in links:
            if eachLink[:4] != 'http' and find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)
            print '* ', eachLink,

            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue

            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self):                       # process links in queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)


def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
        if not url: return
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()
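If the listing is saved as, say, crawl.py (the filename is my choice, not from the article), it can be started as `python crawl.py http://www.example.com/`, or run with no arguments so that main() prompts for a starting URL.

The program above is Python 2 code (urllib.urlretrieve, urlparse, print statements, raw_input). As a hedged illustration only, here is a minimal sketch of the same link-collecting idea on Python 3, where the parser class lives in html.parser and reports tag and attribute names already lower-cased; the class name LinkCollector and the example URL are mine and are not part of the original program.

# Minimal Python 3 sketch of the RetrieveURL idea (an assumption, not part
# of the original Python 2 program).
from html.parser import HTMLParser
from urllib.parse import urljoin
from urllib.request import urlopen

class LinkCollector(HTMLParser):          # hypothetical name, for illustration
    def __init__(self):
        super().__init__()
        self.anchorlist = []              # same role as in RetrieveURL above

    def handle_starttag(self, tag, attrs):
        if tag == 'a':                    # html.parser lower-cases tag names
            for name, value in attrs:
                if name == 'href' and value:
                    self.anchorlist.append(value)

if __name__ == '__main__':
    base = 'http://www.example.com/'      # placeholder URL, not from the article
    html = urlopen(base).read().decode('utf-8', 'replace')
    parser = LinkCollector()
    parser.feed(html)
    # Resolve relative links against the page URL, as the crawler does.
    for link in parser.anchorlist:
        print(urljoin(base, link))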
Original article: http://www.cnblogs.com/kramer/p/3766090.html