# A simple bounded crawler: starting from www.baidu.com, it collects the href
# targets of <a> tags and follows links for at most 200 pages, carrying cookies
# across requests. (Python 2: urllib2, cookielib, HTMLParser.)
import urllib2, cookielib
from HTMLParser import HTMLParser
import sys

# Force the process default encoding to UTF-8 so Chinese pages decode without
# errors (a common Python 2 workaround).
reload(sys)
sys.setdefaultencoding('utf8')


class WebParser(HTMLParser):
    """Collects the href targets of <a> tags into a shared set."""

    def __init__(self, links, path):
        HTMLParser.__init__(self)
        self.links = links  # shared set of discovered URLs
        self.path = path    # base URL used to absolutize site-relative links

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for key, val in attrs:
                if key == 'href':
                    if val.startswith('http'):
                        self.links.add(val)              # already absolute
                    elif val.startswith('/'):
                        self.links.add(self.path + val)  # make site-relative link absolute


class Crawl:
    def __init__(self):
        self.path = 'http://www.baidu.com'
        # Build a cookie-aware opener so cookies persist across requests.
        self.cookie = cookielib.CookieJar()
        handler = urllib2.HTTPCookieProcessor(self.cookie)
        self.opener = urllib2.build_opener(handler)

    def open(self, path):
        self.response = self.opener.open(path)

    def showCookie(self):
        for item in self.cookie:
            print 'Name = ' + item.name
            print 'value = ' + item.value

    def showResponse(self):
        print self.response.read()

    def getAllUrl(self, links, path):
        """Fetch one page and add every link found on it to `links`."""
        try:
            self.open(path)
            res = self.response.read()
            parser = WebParser(links, path)
            parser.feed(res)
            parser.close()
        except Exception, e:
            print e

    def crawl(self):
        src_links = set()     # frontier: URLs still to visit
        result_links = set()  # URLs already visited
        self.getAllUrl(src_links, self.path)
        n = 200               # visit at most 200 pages
        while len(src_links) != 0 and n > 0:
            link = src_links.pop()
            if link in result_links:
                continue      # skip pages already crawled (the original `pass` re-crawled them)
            result_links.add(link)
            self.getAllUrl(src_links, link)
            n -= 1
            print n
        return result_links | src_links


c = Crawl()
rlt = c.crawl()
for link in rlt:
    print link
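The modules above (urllib2, cookielib, HTMLParser) no longer exist under those names in Python 3. Below is a minimal sketch, not part of the original post, showing how the same cookie-aware link-collection pattern maps onto the renamed standard-library modules (urllib.request, http.cookiejar, html.parser); the seed URL is kept from the original, and the crawl loop is omitted for brevity.

# Python 3 sketch of the same pattern (assumption: single-page fetch, no crawl loop).
import urllib.request
import http.cookiejar
from html.parser import HTMLParser

class LinkParser(HTMLParser):
    def __init__(self, links, base):
        super().__init__()
        self.links = links  # shared set collecting discovered URLs
        self.base = base    # prefix for site-relative hrefs

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for key, val in attrs:
                if key == 'href' and val:
                    if val.startswith('http'):
                        self.links.add(val)
                    elif val.startswith('/'):
                        self.links.add(self.base + val)

# Cookie-aware opener, same idea as the Python 2 version.
cookie_jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))

links = set()
html = opener.open('http://www.baidu.com').read().decode('utf-8', 'ignore')
parser = LinkParser(links, 'http://www.baidu.com')
parser.feed(html)
parser.close()
for link in links:
    print(link)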
Original article: http://www.cnblogs.com/hushpa/p/4671144.html