Below is a simple web crawler program.
#!/usr/bin/env python
# A simple single-domain Web crawler (Python 2).

from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
#from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO
from HTMLParser import HTMLParser

'''The next three lines set the default encoding to utf8. Without this,
Python falls back to ASCII when decoding, so any Unicode content in the
downloaded pages would raise an error. We import sys and then reload it
because the default import of sys removes setdefaultencoding, so reload()
is needed to bring the function back.'''
import sys
reload(sys)
sys.setdefaultencoding('utf8')


class RetrieveURL(HTMLParser):
    # A new class derived from HTMLParser.
    def __init__(self):
        HTMLParser.__init__(self)
        # The only reason to override __init__ is to give every
        # instance an anchorlist attribute.
        self.anchorlist = []

    def handle_starttag(self, tag, attrs):
        # Override handle_starttag so that whenever an <A> tag is seen,
        # the hyperlink in its href attribute is recorded in anchorlist.
        if tag == 'a' or tag == 'A':
            for t in attrs:
                if t[0] == 'href' or t[0] == 'HREF':
                    self.anchorlist.append(t[1])


class Retriever(object):                # download Web pages
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)   # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':    # no file, use default (what kind of situation could this be? e.g. https://www.baidu.com/file1)
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)            # local directory
        if sep != '/':                  # os-independent path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):             # create archive dir if necessary
            if exists(ldir): unlink(ldir)
            print 'ldir is ', ldir
            makedirs(ldir)
        return path

    def download(self):                 # download Web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval

    '''def parseAndGetLinks(self):      # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist'''

    def parseAndGetLinks(self):         # parse HTML, save links
        self.parser = RetrieveURL()
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist


class Crawler(object):                  # manage entire crawling process
    count = 0                           # static downloaded page counter

    def __init__(self, url):
        self.q = [url]
        self.seen = []
        self.dom = urlparse(url)[1]

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0] == '*':            # error situation, do not parse
            print retval, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()    # get and process links
        for eachLink in links:
            if eachLink[:4] != 'http' and find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)
            print '* ', eachLink,

            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue

            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self):                       # process links in queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)


def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
        if not url: return
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()
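If the listing is saved as, say, crawl.py (the filename is my choice, not from the article), it can be started as `python crawl.py http://www.example.com/`, or run with no arguments so that main() prompts for a starting URL.

The program above is Python 2 code (urllib.urlretrieve, urlparse, print statements, raw_input). As a hedged illustration only, here is a minimal sketch of the same link-collecting idea on Python 3, where the parser class lives in html.parser and reports tag and attribute names already lower-cased; the class name LinkCollector and the example URL are mine and are not part of the original program.

# Minimal Python 3 sketch of the RetrieveURL idea (an assumption, not part
# of the original Python 2 program).
from html.parser import HTMLParser
from urllib.parse import urljoin
from urllib.request import urlopen

class LinkCollector(HTMLParser):          # hypothetical name, for illustration
    def __init__(self):
        super().__init__()
        self.anchorlist = []              # same role as in RetrieveURL above

    def handle_starttag(self, tag, attrs):
        if tag == 'a':                    # html.parser lower-cases tag names
            for name, value in attrs:
                if name == 'href' and value:
                    self.anchorlist.append(value)

if __name__ == '__main__':
    base = 'http://www.example.com/'      # placeholder URL, not from the article
    html = urlopen(base).read().decode('utf-8', 'replace')
    parser = LinkCollector()
    parser.feed(html)
    # Resolve relative links against the page URL, as the crawler does.
    for link in parser.anchorlist:
        print(urljoin(base, link))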
Original article: http://www.cnblogs.com/kramer/p/3766090.html