码迷,mamicode.com
首页 > 编程语言 > 详细

Python爬虫实战(二):爬百度贴吧

时间:2015-11-27 19:42:32      阅读:239      评论:0      收藏:0      [点我收藏+]

标签:

代码:

# _*_ coding:utf-8 _*_
import urllib
import urllib2
import re
class Tool:
    """Strip/normalize HTML markup from scraped Tieba post content.

    Each compiled pattern targets one family of tags; replace() applies
    them in order and returns plain text.

    NOTE(review): the original scrape lost every quote character; the
    string literals below are restored as raw-string regexes.
    """
    # <img> tags and 7-space runs (the trailing empty alternative is
    # harmless: substituting an empty match with "" is a no-op).
    removingImg = re.compile(r'<img.*?>| {7}|')
    # Anchor tags -- keep the link text, drop the markup.
    removingAddr = re.compile(r'<a.*?>|</a>')
    # Block-level tags that should become line breaks.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cells become tabs.
    replaceTD = re.compile(r'<td>')
    # Paragraph openers become line breaks.
    replacePara = re.compile(r'<p.*?>')
    # <br> (doubled or single) becomes one line break.
    replaceBR = re.compile(r'<br><br>|<br>')
    # Catch-all: any leftover tag is removed outright.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return *x* with HTML markup converted to plain text.

        Order matters: the specific substitutions must run before the
        removeExtraTag sweep, which would otherwise delete the tags
        that are meant to become \\n or \\t.
        """
        x = re.sub(self.removingImg, "", x)
        x = re.sub(self.removingAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replacePara, "\n", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        return x.strip()
        
class BDTB:
    """Minimal crawler for a single Baidu Tieba thread (Python 2, urllib2).

    NOTE(review): the scrape stripped every quote character from this
    block; the string literals are restored below. `print` is written in
    single-argument function form and `except ... as e` is used so the
    code stays valid Python 2.6+ while also parsing under Python 3.
    """

    def __init__(self, baseUrl, seeLZ):
        # baseUrl: thread URL, e.g. http://tieba.baidu.com/p/<id>
        # seeLZ: 1 = only the original poster's posts, 0 = all replies
        self.baseURL = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)
        self.tool = Tool()

    def getPage(self, pageNum):
        """Fetch page *pageNum* of the thread.

        Returns the decoded HTML text, or None when the request fails.
        """
        try:
            url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
            request = urllib2.Request(url)
            response = urllib2.urlopen(request)
            # Tieba serves UTF-8; decode so the regexes below see text.
            pageCode = response.read().decode('utf-8')
            return pageCode
        except urllib2.URLError as e:
            if hasattr(e, "reason"):
                print(u"连接百度贴吧失败,错误原因" + str(e.reason))
            # Errors without a .reason are swallowed too; either way the
            # caller sees None.
            return None

    def getTitle(self):
        """Return the thread title scraped from page 1, or None."""
        page = self.getPage(1)
        if page is None:
            # Network failure already reported by getPage; original code
            # would have crashed on re.search(pattern, None) here.
            return None
        pattern = re.compile(r'<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
        result = re.search(pattern, page)
        if result:
            return result.group(1).strip()
        else:
            return None

    def getPageNum(self):
        """Return the thread's reply-page count (as a string), or None."""
        page = self.getPage(1)
        if page is None:
            return None
        pattern = re.compile(r'<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
        result = re.search(pattern, page)
        if result:
            return result.group(1).strip()
        else:
            return None

    def getContent(self, page):
        """Print every post body found in *page*, one numbered floor each."""
        pattern = re.compile(r'<div id="post_content_.*?>(.*?)</div>', re.S)
        items = re.findall(pattern, page)
        floor = 1
        for item in items:
            print(u"%d楼-----------------------------------------\n" % floor)
            # Strip the HTML markup before displaying the post text.
            print(self.tool.replace(item))
            floor += 1

# Demo driver: crawl one hard-coded thread, original poster only.
# (Quote characters around the URL were lost in the scrape; restored.)
if __name__ == "__main__":
    baseURL = 'http://tieba.baidu.com/p/3138733512'
    bdtb = BDTB(baseURL, 1)          # seeLZ=1: original poster only
    page = bdtb.getPage(1)
    bdtb.getTitle()                  # results are fetched but not printed
    bdtb.getPageNum()
    bdtb.getContent(page)            # prints every floor of page 1

 

Python爬虫实战(二):爬百度贴吧

标签:

原文地址:http://www.cnblogs.com/AndyJee/p/5001283.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!