Below is a simple web crawler program (Python 2). Starting from a seed URL, it downloads each page to a local path derived from the URL, extracts every <a href> link, and keeps following links that stay within the starting domain. A few notes and a Python 3 comparison follow the listing.
#!/usr/bin/env python

from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
#from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter  # only used by the commented-out htmllib version below
from cStringIO import StringIO                       # ditto
from HTMLParser import HTMLParser

'''The next three lines set the default encoding to utf8. Without them,
Python falls back to the ascii codec and raises an error as soon as it
meets unicode content. sys is imported and then reloaded because the
default import removes the setdefaultencoding function, so reload() is
needed to bring it back.'''
import sys
reload(sys)
sys.setdefaultencoding('utf8')

class RetrieveURL(HTMLParser):      # a new class derived from HTMLParser
    def __init__(self):
        HTMLParser.__init__(self)
        self.anchorlist = []        # the only reason to override __init__: give each instance an anchorlist

    def handle_starttag(self, tag, attrs):
        # override handle_starttag so that every <A> tag's href link is recorded in anchorlist
        if tag == 'a' or tag == 'A':
            for t in attrs:
                if t[0] == 'href' or t[0] == 'HREF':
                    self.anchorlist.append(t[1])

class Retriever(object):            # download Web pages
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)   # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':    # no file, use default (what kind of situation could this be? e.g. https://www.baidu.com/file1)
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)        # local directory
        if sep != '/':              # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):         # create archive dir if nec.
            if exists(ldir): unlink(ldir)
            print 'ldir is ', ldir
            makedirs(ldir)
        return path

    def download(self):             # download Web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval

    '''def parseAndGetLinks(self):  # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist'''

    def parseAndGetLinks(self):
        self.parser = RetrieveURL()
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist

class Crawler(object):              # manage entire crawling process
    count = 0                       # static downloaded page counter

    def __init__(self, url):
        self.q = [url]
        self.seen = []
        self.dom = urlparse(url)[1]

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0] == '*':        # error situation, do not parse
            print retval, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()    # get and process links
        for eachLink in links:
            if eachLink[:4] != 'http' and find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)
            print '* ', eachLink,

            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue

            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self):                   # process links in queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)

def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url: return
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()
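To try it, save the listing as a script and pass a starting URL on the command line (or type one at the prompt). A minimal usage sketch, assuming the file is named crawl.py and run under Python 2; the URL is a placeholder:

    $ python crawl.py http://www.example.com/index.html

Each fetched page is saved under a directory named after its host (here www.example.com/index.html), and every discovered link prints one status line: new and added to Q, already in Q, not in domain, or discarded as a mailto link.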
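The path derivation in filename() is worth tracing by hand, since it silently decides where pages land on disk. The URLs below are made-up examples, not from the original post:

    # Hand-traced from filename() above; the URLs are hypothetical.
    # 'http://www.example.com/docs/intro.html'
    #     path = netloc + path = 'www.example.com/docs/intro.html'
    #     splitext() finds '.html', so the path is kept as-is.
    # 'http://www.example.com/docs/'
    #     splitext() finds no extension and the path ends in '/',
    #     so deffile is appended: 'www.example.com/docs/index.htm'
    # 'http://www.example.com'  (bare domain)
    #     splitext('www.example.com') treats '.com' as an extension, so no
    #     default file is appended; dirname() then returns '' and the later
    #     makedirs('') call raises OSError. Start from a full page URL.

The second case also appears to answer the author's inline question: an extensionless path like /file1 takes the else branch and becomes file1/index.htm, but the '.com' of a bare hostname looks like an extension and defeats the check entirely.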
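The listing is Python 2 throughout: urllib/urlparse rather than urllib.request/urllib.parse, print statements, raw_input, and the reload(sys) encoding hack. As a point of comparison only (not part of the original post), here is a rough Python 3 sketch of the RetrieveURL idea using the html.parser module:

    # Python 3 sketch of the same link-collecting parser; not from the
    # original post. html.parser lower-cases tag and attribute names,
    # so the 'A'/'HREF' variants no longer need special-casing.
    from html.parser import HTMLParser

    class RetrieveURL(HTMLParser):
        def __init__(self):
            HTMLParser.__init__(self)
            self.anchorlist = []    # collected href values

        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                for name, value in attrs:
                    if name == 'href':
                        self.anchorlist.append(value)

    parser = RetrieveURL()
    parser.feed('<a HREF="http://www.example.com/">example</a>')
    print(parser.anchorlist)        # ['http://www.example.com/']

The setdefaultencoding trick is unnecessary (and no longer possible) on Python 3, where strings are unicode by default.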
Original source: http://www.cnblogs.com/kramer/p/3766090.html