
Python web crawler



# -*- coding: utf-8 -*-
# python: 2.x
__author__ = 'Administrator'

import urllib2

# Example 1: HTTP basic authentication with urllib2,
# once via an auth handler and once via an explicit Authorization header
LOGIN = 'wesc'
PASSWD = "you'llNeverGuess"
URL = 'http://localhost'

def h1(url):
    # install an opener whose handler answers basic-auth challenges for this host
    from urlparse import urlparse as up
    hdlr = urllib2.HTTPBasicAuthHandler()
    hdlr.add_password('Archives', up(url)[1], LOGIN, PASSWD)
    opener = urllib2.build_opener(hdlr)
    urllib2.install_opener(opener)
    return url

def req(url):
    # build a Request carrying a pre-computed Authorization header
    from base64 import encodestring as s
    req1 = urllib2.Request(url)
    b64str = s('%s:%s' % (LOGIN, PASSWD))[:-1]
    req1.add_header('Authorization', 'Basic %s' % b64str)
    return req1

for desc, func in (('handler', h1), ('request', req)):
    print '*** Using %s:' % desc.upper()
    url = func(URL)
    f = urllib2.urlopen(url)
    print f.readline()
    f.close()
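If the realm name ('Archives' above) is not known in advance, urllib2 also provides HTTPPasswordMgrWithDefaultRealm, which matches any realm for a given URI. Below is a minimal sketch reusing the LOGIN, PASSWD, and URL constants above; it assumes http://localhost is actually serving a basic-auth-protected page.

# sketch: basic auth without hard-coding the realm name
mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
mgr.add_password(None, URL, LOGIN, PASSWD)   # None means "any realm"
opener2 = urllib2.build_opener(urllib2.HTTPBasicAuthHandler(mgr))
print opener2.open(URL).readline()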

# -*- coding: utf-8 -*-
# python: 2.x
__author__ = 'Administrator'

# Example 2: a simple web crawler
# (downloads pages and follows links that stay within the start domain)

import sys, os, string
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import AbstractFormatter, DumbWriter
from cStringIO import StringIO

 

class Retriever(object):                                 # download pages from the web
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        # map the URL to a local file path, creating directories as needed
        parsedurl = urlparse(url, 'http:', 0)            # parse path
        path = parsedurl[1] + parsedurl[2]               # netloc + path
        ext = os.path.splitext(path)
        if ext[1] == '':                                 # no file extension: use default name
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = os.path.dirname(path)                     # local directory
        if os.sep != '/':                                # OS-independent path separator
            ldir = string.replace(ldir, '/', os.sep)
        if not os.path.isdir(ldir):                      # create archive dir if necessary
            if os.path.exists(ldir): os.unlink(ldir)
            os.makedirs(ldir)
        return path

    def download(self):                                  # download this page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval

    def parseAndGetLinks(self):                          # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
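The parsing idiom above relies on htmllib.HTMLParser collecting every href it encounters in its anchorlist attribute; the formatter chain exists only because the parser requires one. A quick stand-alone sketch of the same idiom, using a made-up HTML string:

# sketch: extract anchors from an HTML string with htmllib
html = '<html><body><a href="/a.htm">A</a> <a href="http://example.com/b">B</a></body></html>'
p = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
p.feed(html)
p.close()
print p.anchorlist                                       # ['/a.htm', 'http://example.com/b']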

 

class Crawler(object):                                   # manage entire crawling process
    count = 0                                            # static downloaded page counter

    def __init__(self, url):
        self.q = [url]                                   # queue of links still to visit
        self.seen = []                                   # links already processed
        self.dom = urlparse(url)[1]                      # domain to stay within

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0].startswith('*'):                    # error situation, do not parse
            print retval, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL: %s' % url
        print 'FILE: %s' % retval[0]
        self.seen.append(url)

 

        links = r.parseAndGetLinks()                     # get and process links
        for eachLink in links:
            # resolve relative links against the current page's URL
            if eachLink[:4] != 'http' and \
               string.find(eachLink, '//') == -1:
                eachLink = urljoin(url, eachLink)
            print '* %s' % (eachLink,)

            if string.find(string.lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue
            if eachLink not in self.seen:
                if string.find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self):                                        # process links in queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)
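Relative links found by getPage() are turned into absolute ones with urljoin(); a quick sketch with placeholder values:

# sketch: how urljoin() resolves a relative link against the page it came from
print urljoin('http://www.example.com/docs/index.htm', 'faq.htm')
# -> http://www.example.com/docs/faq.htm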

def main():
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        try:
            url = raw_input('url: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()
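The Crawler class can also be driven directly, without going through main(); a minimal sketch with a placeholder URL:

# sketch: drive the crawler programmatically instead of via sys.argv
bot = Crawler('http://www.example.com/')
bot.go()
print 'pages downloaded:', Crawler.count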




Original article: http://www.cnblogs.com/mhxy13867806343/p/3969967.html
