【爬虫】BeautifulSoup之爬取百度贴吧的帖子

时间：2016-08-19 18:51:24 阅读：166 评论：0 收藏：0 [点我收藏+]

标签：

在网上看到爬取百度贴吧帖子的例子，于是仿照着用 BeautifulSoup 实现了一个，直接上代码吧。（以下代码基于 Python 2，依赖 urllib2 和 bs4。）

#coding:gbk
import urllib2
from bs4 import BeautifulSoup
import re
import os  

class TiebatoTxt(object):
    """Download a Baidu Tieba thread and save its posts to a text file.

    Written for Python 2: it relies on ``urllib2`` and encodes post text to
    GBK byte strings before writing them.

    Instance state (all set in ``__init__``):
      url          -- thread URL without a query string
      seeLZ        -- query-string fragment of the form "?see_lz=<flag>"
      floor        -- number given to the next post that gets written
      File         -- the open output file, or None while no file is open
      defaultTitle -- file name (without extension) used when the page
                      yields no usable title
    """

    def __init__(self, url, seeLZ):
        """Remember the thread URL and the "original poster only" flag.

        url   -- base URL of the thread
        seeLZ -- value appended to "?see_lz="; the script entry point passes
                 the user's answer, 1 (original poster only) or 0 (everyone)
        """
        self.url = url
        self.seeLZ = '?see_lz=' + str(seeLZ)
        self.floor = 1
        self.File = None
        self.defaultTitle = "百度贴吧"

    def get_body(self, pageNum):
        """Fetch page ``pageNum`` of the thread and parse it.

        Returns a BeautifulSoup object, or None (after printing a message)
        when either the request or the parsing fails.
        """
        url = self.url + self.seeLZ + '&pn=' + str(pageNum)
        req = urllib2.Request(url)
        try:
            html = urllib2.urlopen(req)
        except (urllib2.HTTPError, urllib2.URLError):
            print(u"获取帖子链接错误")
            return None
        try:
            return BeautifulSoup(html, "html.parser")
        except AttributeError:
            print(u"获得BeautifulSoup对象错误")
            return None
        finally:
            # BeautifulSoup reads the response while building the tree, so
            # the connection is no longer needed once we get here.
            html.close()

    def find_title(self, page):
        """Return the text of the page's <title>, or None if unavailable."""
        head = page.find("head")
        title_tag = head.find("title") if head is not None else None
        if title_tag is None:
            return None
        name = title_tag.get_text()
        return name if name else None

    def get_pagenum(self, page):
        """Return the thread's page count as shown on ``page``, or None.

        The count is the text of the second <span> inside the first
        <li class="l_reply_num"> that has at least two spans; it is returned
        GBK-encoded, exactly as read (not converted to int).
        """
        pageinfoList = page.findAll("li", {"class": "l_reply_num"})
        # findAll returns an empty list, never None, when nothing matches.
        if not pageinfoList:
            print("pageinfoList is none")
            return None
        for info in pageinfoList:
            spans = info.findAll("span")
            if len(spans) > 1:
                return spans[1].get_text().encode("gbk")
        return None

    def get_content(self, page):
        """Extract the text of every post on ``page``.

        Returns a list that alternates a separator line carrying the floor
        number with the post text encoded as GBK (characters GBK cannot
        represent are dropped). ``self.floor`` advances by one per post.
        """
        posts = page.findAll("div", {"id": re.compile("post_content_.*?")})
        contents = []
        for item in posts:
            floorLine = "\n\n" + str(self.floor) + u"------------------------------------------------------\n\n"
            contents.append(floorLine)
            self.floor += 1
            con = item.getText("\n", strip=True).encode("gbk", "ignore")
            # getText("\n") puts the text of each <a> on a line of its own;
            # merge those linked words back into the surrounding text.
            for link in item.findAll("a"):
                word = link.getText(strip=True).encode("gbk", "ignore")
                if word:
                    con = con.replace("\n%s\n" % word, word)
            contents.append(con)
        return contents

    def setFileTitle(self, title):
        """Open the output file "<title>.txt" in the current directory.

        Path separators and the other characters that Windows rejects in file
        names are removed from ``title``. When ``title`` is None, or nothing
        is left after cleaning, ``self.defaultTitle`` is used instead. A file
        that is still open from an earlier call is closed first.
        """
        name = self.defaultTitle
        if title is not None:
            cleaned = re.sub(r'[\\/:*?"<>|]', '', title).strip()
            if cleaned:
                name = cleaned
        if self.File is not None:
            self.File.close()
        self.File = open(os.path.join(os.getcwd(), name + ".txt"), "w+")

    def writetotxt(self, contents):
        """Write every item of ``contents`` to the open output file."""
        for item in contents:
            self.File.write(item)

    def start(self):
        """Download the whole thread and write it to a text file.

        Prints progress and error messages; always returns None. The output
        file is only created once the page count has been read, and it is
        closed again before returning.
        """
        indexPage = self.get_body(1)
        pageNum = None
        if indexPage is not None:
            pageNum = self.get_pagenum(indexPage)
        if pageNum is None:
            print("URL已失效，请重试")
            return
        try:
            self.setFileTitle(self.find_title(indexPage))
            print("该帖子共有" + str(pageNum) + "页")
            for i in range(1, int(pageNum) + 1):
                print("正在写入第" + str(i) + "页数据")
                # Page 1 is already in hand; do not download it twice.
                page = indexPage if i == 1 else self.get_body(i)
                if page is None:
                    # get_body has already reported the failure.
                    continue
                self.writetotxt(self.get_content(page))
        except IOError as e:
            print("写入异常，原因" + str(e))
        finally:
            if self.File is not None:
                self.File.close()
                self.File = None
            print("写入任务完成")


#270051025
if __name__ == '__main__':
    # Ask for the thread id and whether to keep only the original poster's
    # posts, then download the thread into a text file.
    print(u"请输入帖子代号")
    baseURL = 'http://tieba.baidu.com/p/' + raw_input(u'http://tieba.baidu.com/p/').strip()
    seeLZ = raw_input(u"是否只获取楼主发言，是输入1，否输入0：")
    t = TiebatoTxt(baseURL, seeLZ)
    t.start()

标签：

原文地址：http://www.cnblogs.com/zoro-robin/p/5788595.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行