# -*- coding: utf-8 -*-
# python: 2.x
__author__ = 'Administrator'
import urllib2

# Example: two ways to do HTTP basic authentication with urllib2
LOGIN = 'wesc'
PASSWD = "you'llNeverGuess"
URL = 'http://localhost'

def h1(url):
    # handler approach: install an HTTPBasicAuthHandler globally
    from urlparse import urlparse as up
    hdlr = urllib2.HTTPBasicAuthHandler()
    hdlr.add_password('Archives', up(url)[1], LOGIN, PASSWD)
    opener = urllib2.build_opener(hdlr)
    urllib2.install_opener(opener)
    return url

def req(url):
    # request approach: attach the Authorization header by hand
    from base64 import encodestring
    req1 = urllib2.Request(url)
    b64str = encodestring('%s:%s' % (LOGIN, PASSWD))[:-1]   # strip trailing '\n'
    req1.add_header('Authorization', 'Basic %s' % b64str)
    return req1

for f_name in ('h1', 'req'):
    print '*** using %s:' % f_name.upper()
    url = eval(f_name)(URL)   # call h1() or req() by name
    f = urllib2.urlopen(url)
    print f.readline()
    f.close()
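A caveat worth noting (my addition, not from the original post): the first argument to add_password() in h1() is the authentication realm ('Archives' here), and a basic-auth handler only answers challenges whose WWW-Authenticate realm matches it. If the realm is not known in advance, urllib2 also provides HTTPPasswordMgrWithDefaultRealm, which matches any realm. A minimal sketch, reusing the LOGIN/PASSWD constants above:

def h2(url):
    # sketch only: basic auth without hard-coding the realm
    mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    mgr.add_password(None, url, LOGIN, PASSWD)   # None = match any realm
    opener = urllib2.build_opener(urllib2.HTTPBasicAuthHandler(mgr))
    urllib2.install_opener(opener)
    return url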
# -*- coding: utf-8 -*-
# python: 2.x
__author__ = 'Administrator'
# Example: a simple Python web crawler
import os
import sys
from cStringIO import StringIO
from formatter import AbstractFormatter, DumbWriter
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
class Retriever(object):                # downloads and saves one page
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)   # parse URL
        path = parsedurl[1] + parsedurl[2]      # host + path
        ext = os.path.splitext(path)
        if ext[1] == '':                        # no file extension: use default file
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = os.path.dirname(path)            # local directory
        if os.sep != '/':                       # os-independent path separator
            ldir = ldir.replace('/', os.sep)
        if not os.path.isdir(ldir):             # create archive dir if necessary
            if os.path.exists(ldir):
                os.unlink(ldir)
            os.makedirs(ldir)
        return path

    def download(self):                         # download this page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval

    def parseAndGetLinks(self):                 # parse HTML, return links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
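For reference (an illustrative note, not part of the original post), filename() maps remote URLs onto local paths roughly as follows:

#   http://www.example.com/home/index.html  ->  www.example.com/home/index.html
#   http://www.example.com/home/            ->  www.example.com/home/index.htm  (default file appended)
#   http://www.example.com                  ->  www.example.com/index.htm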
class Crawler(object):                  # manages the entire crawl
    count = 0                           # static downloaded-page counter

    def __init__(self, url):
        self.q = [url]                  # queue of links still to fetch
        self.seen = []                  # links already processed
        self.dom = urlparse(url)[1]     # restrict the crawl to this domain

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0].startswith('*'):   # error situation, do not parse
            print retval, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL: %s' % url
        print 'FILE: %s' % retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()    # get and process links
        for eachLink in links:
            if eachLink[:4] != 'http' and eachLink.find('//') == -1:
                eachLink = urljoin(url, eachLink)   # make relative links absolute
            print '* %s' % eachLink
            if eachLink.lower().find('mailto:') != -1:
                print '... discarded, mailto link'
                continue
            if eachLink not in self.seen:
                if eachLink.find(self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'
    def go(self):                       # process links in the queue
        while self.q:
            url = self.q.pop()          # LIFO: depth-first traversal
            self.getPage(url)
def main():
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        try:
            url = raw_input('url: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()
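To run the crawler, pass a starting URL on the command line (or type one at the url: prompt); pages are saved under a directory named after the host, and only links within the starting domain are followed. One observation (mine, not from the post): go() pops from the tail of self.q, so the crawl is depth-first. A breadth-first variant is a one-line change, sketched here as a hypothetical replacement for Crawler.go():

    def go(self):                       # hypothetical breadth-first variant
        while self.q:
            url = self.q.pop(0)         # FIFO instead of LIFO
            self.getPage(url)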
Source: http://www.cnblogs.com/mhxy13867806343/p/3969967.html