Below is a simple web crawler program (Python 2). Starting from a seed URL, it downloads each page to a local path derived from the URL, extracts every <a href> link, and keeps following links that stay within the starting domain. A few notes and a Python 3 comparison follow the listing.
#!/usr/bin/env python

from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
#from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter  # only used by the commented-out htmllib version below
from cStringIO import StringIO                       # ditto
from HTMLParser import HTMLParser

'''The next three lines set the default encoding to utf8. Without them,
Python falls back to the ascii codec and raises an error as soon as it
meets unicode content. sys is imported and then reloaded because the
default import removes the setdefaultencoding function, so reload() is
needed to bring it back.'''
import sys
reload(sys)
sys.setdefaultencoding('utf8')

class RetrieveURL(HTMLParser):      # a new class derived from HTMLParser
    def __init__(self):
        HTMLParser.__init__(self)
        self.anchorlist = []        # the only reason to override __init__: give each instance an anchorlist

    def handle_starttag(self, tag, attrs):
        # override handle_starttag so that every <A> tag's href link is recorded in anchorlist
        if tag == 'a' or tag == 'A':
            for t in attrs:
                if t[0] == 'href' or t[0] == 'HREF':
                    self.anchorlist.append(t[1])

class Retriever(object):            # download Web pages
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)   # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':    # no file, use default (what kind of situation could this be? e.g. https://www.baidu.com/file1)
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)        # local directory
        if sep != '/':              # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):         # create archive dir if nec.
            if exists(ldir): unlink(ldir)
            print 'ldir is ', ldir
            makedirs(ldir)
        return path

    def download(self):             # download Web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval

    '''def parseAndGetLinks(self):  # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist'''

    def parseAndGetLinks(self):
        self.parser = RetrieveURL()
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist

class Crawler(object):              # manage entire crawling process
    count = 0                       # static downloaded page counter

    def __init__(self, url):
        self.q = [url]
        self.seen = []
        self.dom = urlparse(url)[1]

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0] == '*':        # error situation, do not parse
            print retval, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()    # get and process links
        for eachLink in links:
            if eachLink[:4] != 'http' and find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)
            print '* ', eachLink,

            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue

            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self):                   # process links in queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)

def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url: return
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()
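To try it, save the listing as a script and pass a starting URL on the command line (or type one at the prompt). A minimal usage sketch, assuming the file is named crawl.py and run under Python 2; the URL is a placeholder:

    $ python crawl.py http://www.example.com/index.html

Each fetched page is saved under a directory named after its host (here www.example.com/index.html), and every discovered link prints one status line: new and added to Q, already in Q, not in domain, or discarded as a mailto link.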
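The path derivation in filename() is worth tracing by hand, since it silently decides where pages land on disk. The URLs below are made-up examples, not from the original post:

    # Hand-traced from filename() above; the URLs are hypothetical.
    # 'http://www.example.com/docs/intro.html'
    #     path = netloc + path = 'www.example.com/docs/intro.html'
    #     splitext() finds '.html', so the path is kept as-is.
    # 'http://www.example.com/docs/'
    #     splitext() finds no extension and the path ends in '/',
    #     so deffile is appended: 'www.example.com/docs/index.htm'
    # 'http://www.example.com'  (bare domain)
    #     splitext('www.example.com') treats '.com' as an extension, so no
    #     default file is appended; dirname() then returns '' and the later
    #     makedirs('') call raises OSError. Start from a full page URL.

The second case also appears to answer the author's inline question: an extensionless path like /file1 takes the else branch and becomes file1/index.htm, but the '.com' of a bare hostname looks like an extension and defeats the check entirely.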
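The listing is Python 2 throughout: urllib/urlparse rather than urllib.request/urllib.parse, print statements, raw_input, and the reload(sys) encoding hack. As a point of comparison only (not part of the original post), here is a rough Python 3 sketch of the RetrieveURL idea using the html.parser module:

    # Python 3 sketch of the same link-collecting parser; not from the
    # original post. html.parser lower-cases tag and attribute names,
    # so the 'A'/'HREF' variants no longer need special-casing.
    from html.parser import HTMLParser

    class RetrieveURL(HTMLParser):
        def __init__(self):
            HTMLParser.__init__(self)
            self.anchorlist = []    # collected href values

        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                for name, value in attrs:
                    if name == 'href':
                        self.anchorlist.append(value)

    parser = RetrieveURL()
    parser.feed('<a HREF="http://www.example.com/">example</a>')
    print(parser.anchorlist)        # ['http://www.example.com/']

The setdefaultencoding trick is unnecessary (and no longer possible) on Python 3, where strings are unicode by default.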
Original source: http://www.cnblogs.com/kramer/p/3766090.html