标签:style blog http io color ar os 使用 for
web应用也遵循客户服务器架构
浏览器就是一个基本的web客户端，它实现两个基本功能，一个是从web服务器下载文件，另一个是渲染文件
与浏览器具有类似功能、可实现简单web客户端的模块是urllib以及urllib2（可以打开需要登录的网页）等模块
另外还有一些更复杂的web客户端，它不仅下载web文件，还执行其它复杂的任务，一个典型的例子就是爬虫
python实现爬虫也有一些框架模块:如Scrapy
#!/usr/bin/env python3
"""A simple single-domain web crawler.

Starting from one URL, download each page to a local directory tree
that mirrors the URL path, parse out its anchor links, and queue the
links that stay inside the starting URL's domain.

Modernized from the Python 2 original: ``htmllib``/``formatter``/
``cStringIO`` no longer exist in Python 3, so anchors are collected
with ``html.parser.HTMLParser``; ``urllib``/``urlparse`` are now
``urllib.request``/``urllib.parse``.
"""

from sys import argv
from os import makedirs, unlink, sep
from os.path import isdir, exists, dirname, splitext
from html.parser import HTMLParser
from urllib.request import urlretrieve
from urllib.parse import urlparse, urljoin


class AnchorParser(HTMLParser):
    """Collect the href target of every <a> tag into ``anchorlist``.

    Replacement for the old ``htmllib.HTMLParser`` + formatter stack,
    whose parser exposed the same ``anchorlist`` attribute.
    """

    def __init__(self):
        super().__init__()
        self.anchorlist = []  # href values in document order

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value is not None:
                    self.anchorlist.append(value)


class Retriever(object):
    """Download one web page and map its URL to a local file."""

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        """Map *url* to a local file path, creating directories as needed.

        URLs with no file extension are treated as directories and get
        *deffile* appended, so every page maps to a regular file.
        """
        parsedurl = urlparse(url, 'http', 0)   # parse path (default scheme 'http')
        path = parsedurl[1] + parsedurl[2]     # netloc + path
        ext = splitext(path)
        if ext[1] == '':                       # no extension: directory-style URL
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)                   # local directory
        if sep != '/':                         # OS-independent path separator
            # BUG FIX: the original replaced ',' instead of '/'.
            ldir = ldir.replace('/', sep)
        if not isdir(ldir):                    # create archive dir if necessary
            if exists(ldir):
                unlink(ldir)                   # a plain file is in the way
            makedirs(ldir)
        return path

    def download(self):
        """Download the page to ``self.file``.

        Returns urlretrieve()'s ``(filename, headers)`` tuple, or on
        failure a one-element tuple whose string starts with '*** ERROR'.
        """
        try:
            # BUG FIX: the original called urllib.urlretrieve, but only
            # the bare name urlretrieve was ever imported.
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval

    def parseAndGetLinks(self):
        """Parse the downloaded HTML file and return its anchor links."""
        parser = AnchorParser()
        with open(self.file, encoding='utf-8', errors='replace') as f:
            parser.feed(f.read())
        parser.close()
        # BUG FIX: the original returned self.parse.anchorlist (typo).
        return parser.anchorlist


class Crawler(object):
    """Manage the entire crawling process."""

    count = 0  # class-level counter of successfully downloaded pages

    def __init__(self, url):
        self.q = [url]                 # queue of URLs still to fetch
        self.seen = []                 # URLs already processed
        self.dom = urlparse(url)[1]    # crawl is restricted to this domain

    def getPage(self, url):
        """Download one page, record it, and queue its in-domain links."""
        r = Retriever(url)
        retval = r.download()
        # BUG FIX: the original compared the whole message to '*',
        # which is never true; check the error marker's first char.
        if retval[0][0] == '*':        # error situation, do not parse
            print(retval, '... skipping parse')
            return
        Crawler.count += 1
        print('\n(', Crawler.count, ')')
        print('URL:', url)
        print('FILE:', retval[0])
        self.seen.append(url)

        links = r.parseAndGetLinks()   # get and process links
        for eachLink in links:
            # Make relative links absolute against the current page.
            if eachLink[:4] != 'http' and '://' not in eachLink:
                eachLink = urljoin(url, eachLink)
            print('* ', eachLink, end=' ')

            if 'mailto:' in eachLink.lower():
                print('... discarded, mailto link')
                continue

            if eachLink not in self.seen:
                if self.dom not in eachLink:
                    print('... discarded, not in domain')
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print('... new, added to Q')
                    else:
                        print('... discarded, already in Q')
            else:
                print('... discarded, already processed')

    def go(self):
        """Process links in the queue until it is empty."""
        while self.q:
            url = self.q.pop()
            self.getPage(url)


def main():
    """Entry point: crawl from argv[1] or an interactively entered URL."""
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''

    if not url:
        return
    robot = Crawler(url)
    robot.go()


if __name__ == '__main__':
    main()
标签:style blog http io color ar os 使用 for
原文地址:http://www.cnblogs.com/wybert/p/4077230.html