# -*- coding: utf-8 -*-
# python: 2.x
__author__ = 'Administrator'
import urllib2

# Example: two ways to do HTTP basic authentication with urllib2
LOGIN = 'wesc'
PASSWD = "you'llNeverGuess"
URL = 'http://localhost'

def h1(url):
    # handler approach: install an HTTPBasicAuthHandler globally
    from urlparse import urlparse as up
    hdlr = urllib2.HTTPBasicAuthHandler()
    hdlr.add_password('Archives', up(url)[1], LOGIN, PASSWD)
    opener = urllib2.build_opener(hdlr)
    urllib2.install_opener(opener)
    return url

def req(url):
    # request approach: attach the Authorization header by hand
    from base64 import encodestring
    req1 = urllib2.Request(url)
    b64str = encodestring('%s:%s' % (LOGIN, PASSWD))[:-1]   # strip trailing '\n'
    req1.add_header('Authorization', 'Basic %s' % b64str)
    return req1

for f_name in ('h1', 'req'):
    print '*** using %s:' % f_name.upper()
    url = eval(f_name)(URL)   # call h1() or req() by name
    f = urllib2.urlopen(url)
    print f.readline()
    f.close()
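A caveat worth noting (my addition, not from the original post): the first argument to add_password() in h1() is the authentication realm ('Archives' here), and a basic-auth handler only answers challenges whose WWW-Authenticate realm matches it. If the realm is not known in advance, urllib2 also provides HTTPPasswordMgrWithDefaultRealm, which matches any realm. A minimal sketch, reusing the LOGIN/PASSWD constants above:

def h2(url):
    # sketch only: basic auth without hard-coding the realm
    mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    mgr.add_password(None, url, LOGIN, PASSWD)   # None = match any realm
    opener = urllib2.build_opener(urllib2.HTTPBasicAuthHandler(mgr))
    urllib2.install_opener(opener)
    return url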
# -*- coding: utf-8 -*-
# python: 2.x
__author__ = 'Administrator'
# Example: a simple Python web crawler
import os
import sys
from cStringIO import StringIO
from formatter import AbstractFormatter, DumbWriter
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
class Retriever(object):                # downloads and saves one page
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)   # parse URL
        path = parsedurl[1] + parsedurl[2]      # host + path
        ext = os.path.splitext(path)
        if ext[1] == '':                        # no file extension: use default file
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = os.path.dirname(path)            # local directory
        if os.sep != '/':                       # os-independent path separator
            ldir = ldir.replace('/', os.sep)
        if not os.path.isdir(ldir):             # create archive dir if necessary
            if os.path.exists(ldir):
                os.unlink(ldir)
            os.makedirs(ldir)
        return path

    def download(self):                         # download this page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval

    def parseAndGetLinks(self):                 # parse HTML, return links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
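For reference (an illustrative note, not part of the original post), filename() maps remote URLs onto local paths roughly as follows:

#   http://www.example.com/home/index.html  ->  www.example.com/home/index.html
#   http://www.example.com/home/            ->  www.example.com/home/index.htm  (default file appended)
#   http://www.example.com                  ->  www.example.com/index.htm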
class Crawler(object):                  # manages the entire crawl
    count = 0                           # static downloaded-page counter

    def __init__(self, url):
        self.q = [url]                  # queue of links still to fetch
        self.seen = []                  # links already processed
        self.dom = urlparse(url)[1]     # restrict the crawl to this domain

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0].startswith('*'):   # error situation, do not parse
            print retval, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL: %s' % url
        print 'FILE: %s' % retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()    # get and process links
        for eachLink in links:
            if eachLink[:4] != 'http' and eachLink.find('//') == -1:
                eachLink = urljoin(url, eachLink)   # make relative links absolute
            print '* %s' % eachLink
            if eachLink.lower().find('mailto:') != -1:
                print '... discarded, mailto link'
                continue
            if eachLink not in self.seen:
                if eachLink.find(self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'
    def go(self):                       # process links in the queue
        while self.q:
            url = self.q.pop()          # LIFO: depth-first traversal
            self.getPage(url)
def main():
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        try:
            url = raw_input('url: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()
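To run the crawler, pass a starting URL on the command line (or type one at the url: prompt); pages are saved under a directory named after the host, and only links within the starting domain are followed. One observation (mine, not from the post): go() pops from the tail of self.q, so the crawl is depth-first. A breadth-first variant is a one-line change, sketched here as a hypothetical replacement for Crawler.go():

    def go(self):                       # hypothetical breadth-first variant
        while self.q:
            url = self.q.pop(0)         # FIFO instead of LIFO
            self.getPage(url)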
Source: http://www.cnblogs.com/mhxy13867806343/p/3969967.html