码迷,mamicode.com
首页 > Web开发 > 详细

爬虫3 html解析器 html_parser.py

时间:2016-11-30 02:59:05      阅读:293      评论:0      收藏:0      [点我收藏+]

标签:return   bsp   add   style   author   node   blog   parse   one   

#coding:utf8
import urlparse
from bs4 import BeautifulSoup
import re

# Module author tag; must be a string literal (a bare name raises NameError on import).
__author__ = 'wang'


class HtmlParser(object):
    """Parse a downloaded HTML page into follow-up links and article data.

    ``parse`` is the entry point; it builds a BeautifulSoup tree and delegates
    to the two private helpers below.
    """

    # Article links are matched by href against "/view/<digits>.htm".
    # Compiled once at class level instead of on every call.
    _ARTICLE_LINK_RE = re.compile(r"/view/\d+\.htm")

    def parse(self, page_url, html_cont):
        """Return ``(new_urls, new_data)`` for one page.

        :param page_url: URL the page was downloaded from; used to resolve
            relative links and stored in the returned data.
        :param html_cont: raw HTML content of the page.
        :returns: ``None`` when either argument is ``None``; otherwise a
            tuple of (set of absolute URLs, dict with ``url``, ``title`` and
            ``summary`` keys).
        :raises AttributeError: if the title or summary node is not present
            in the page (see ``_get_new_data``).
        """
        if page_url is None or html_cont is None:
            return None

        soup = BeautifulSoup(html_cont, "html.parser", from_encoding="utf-8")
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        """Collect the absolute URLs of all article links found in ``soup``.

        Every ``<a>`` whose ``href`` matches ``_ARTICLE_LINK_RE`` is joined
        against ``page_url``; duplicates collapse because a set is returned.
        """
        links = soup.find_all("a", href=self._ARTICLE_LINK_RE)
        return {urlparse.urljoin(page_url, link["href"]) for link in links}

    def _get_new_data(self, page_url, soup):
        """Extract the page's URL, title text and summary text into a dict.

        The title is the ``<h1>`` inside ``<dd class="lemmaWgt-lemmaTitle-title">``
        and the summary is ``<div class="lemma-summary">``.

        :raises AttributeError: if either node is missing, because
            ``soup.find`` returns ``None`` in that case.
        """
        title_node = soup.find("dd", class_="lemmaWgt-lemmaTitle-title").find("h1")
        summary_node = soup.find("div", class_="lemma-summary")

        return {
            "url": page_url,
            "title": title_node.get_text(),
            "summary": summary_node.get_text(),
        }

 

爬虫3 html解析器 html_parser.py

标签:return   bsp   add   style   author   node   blog   parse   one   

原文地址:http://www.cnblogs.com/php-linux/p/6115804.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!