码迷,mamicode.com
首页 > 编程语言 > 详细

提取网址的python练习

时间:2015-07-23 19:07:45      阅读:123      评论:0      收藏:0      [点我收藏+]

标签:

import cookielib
import sys
import urllib
import urllib2
from HTMLParser import HTMLParser
from urlparse import urljoin

# Python 2 only: make UTF-8 the process-wide default for implicit
# str <-> unicode conversions. sys.setdefaultencoding is removed from the
# sys module at interpreter start-up, so the module has to be reloaded
# first to get the function back.
reload(sys)
sys.setdefaultencoding('utf8')

class WebParser(HTMLParser):
    """HTML parser that collects the targets of ``<a href="...">`` tags.

    Links starting with ``http`` are stored unchanged. Links starting with
    ``/`` are resolved against ``path``. Every other href (fragments,
    ``mailto:``, document-relative paths, ...) is ignored.
    """

    def __init__(self, links, path):
        """Create a parser.

        links -- set-like object (must support ``add``) that receives the
                 URLs found; it is mutated in place.
        path  -- URL of the page being parsed, used as the base when
                 resolving links that start with ``/``.
        """
        HTMLParser.__init__(self)
        self.links = links
        self.path = path

    def handle_starttag(self, tag, attrs):
        """Record the href of each anchor start tag seen by the parser."""
        if tag != 'a':
            return
        for key, val in attrs:
            # A valueless attribute (``<a href>``) is reported with
            # val = None; skip it instead of crashing on .startswith.
            if key != 'href' or not val:
                continue
            if val.startswith('http'):
                self.links.add(val)
            elif val.startswith('/'):
                # Resolve against the page URL rather than concatenating:
                # the base may itself contain a path, which must be
                # replaced, not extended.
                self.links.add(urljoin(self.path, val))

class Crawl(object):
    """Minimal link crawler that starts from a single page.

    Requests go through ``self.opener``, which is built with a cookie
    processor backed by ``self.cookie``.
    """

    def __init__(self, path='http://www.baidu.com'):
        """Set up the cookie-aware opener.

        path -- URL of the page the crawl starts from.
        """
        self.path = path
        self.cookie = cookielib.CookieJar()
        handler = urllib2.HTTPCookieProcessor(self.cookie)
        self.opener = urllib2.build_opener(handler)
        # Filled in by open(); showResponse() needs open() to run first.
        self.response = None

    def open(self, path):
        """Request ``path`` and keep the response object on the instance."""
        self.response = self.opener.open(path)

    def showCookie(self):
        """Print the name and value of every cookie in the jar."""
        for item in self.cookie:
            print('Name = ' + item.name)
            print('value = ' + item.value)

    def showResponse(self):
        """Print the body of the most recently opened response."""
        print(self.response.read())

    def getAllUrl(self, links, path):
        """Fetch ``path`` and add every link found on it to ``links``.

        links -- set that is updated in place with the discovered URLs.
        path  -- URL of the page to download and parse.

        Best effort: any error while fetching or parsing is printed and
        swallowed so that one bad page does not abort the whole crawl.
        """
        try:
            self.open(path)
            res = self.response.read()
            parser = WebParser(links, path)
            parser.feed(res)
            parser.close()
        except Exception as e:
            print(e)

    def crawl(self, max_pages=200):
        """Crawl outward from ``self.path`` and return the URLs found.

        max_pages -- maximum number of discovered pages to fetch, not
                     counting the start page itself.

        Returns a set holding every visited link plus the links that were
        discovered but not yet visited when the crawl stopped. Links are
        visited in arbitrary (set) order.
        """
        src_links = set()      # discovered, waiting to be visited
        result_links = set()   # already visited
        self.getAllUrl(src_links, self.path)
        n = max_pages
        while src_links and n > 0:
            link = src_links.pop()
            if link in result_links:
                # A later page linked back to something already visited;
                # skip it so it is not downloaded again.
                continue
            result_links.add(link)
            self.getAllUrl(src_links, link)
            n -= 1
            print(n)  # progress: remaining page budget

        return result_links | src_links

# Script driver: run one crawl from the default start page and print each
# collected URL on its own line.
c = Crawl()
rlt = c.crawl()
for link in rlt:
    print(link)

 

提取网址的python练习

标签:

原文地址:http://www.cnblogs.com/hushpa/p/4671144.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!