标签:
在网上看到爬百度贴吧的例子,仿照写了一个用BeautifulSoup实现的,直接上代码吧
#coding:gbk import urllib2 from bs4 import BeautifulSoup import re import os class TiebatoTxt: def __init__(self, url, seeLZ): #传入url self.url = url #是否只看楼主 self.seeLZ = ‘?see_lz=‘+str(seeLZ) self.floor = 1 self.File = None self.defaultTitle = "百度贴吧" #获得每一页的BeautifulSoup对象 def get_body(self, pageNum): url = self.url + self.seeLZ + ‘&pn=‘ + str(pageNum) req = urllib2.Request(url) try : html = urllib2.urlopen(req) except (urllib2.HTTPError, urllib2.URLError) as e: print u"获取帖子链接错误" return None try: bsObj = BeautifulSoup(html, "html.parser") except AttributeError as e: print u"获得BeautifulSoup对象错误" return None return bsObj #获得帖子标题 def find_title(self, page): name = page.find("head").find("title").get_text() if name: return name else: return None #获取帖子共有多少页 def get_pagenum(self, page): pageinfoList= page.findAll("li", {"class":"l_reply_num"}) if pageinfoList is not None: for info in pageinfoList: span = info.findAll("span") if span is not None: return span[1].get_text().encode("gbk") else: print "pageinfoList is none" #获得每一楼层的内容 def get_content(self, page): div = page.findAll("div", {"id":re.compile("post_content_.*?")}) contents = [] for item in div: floorLine = "\n\n" + str(self.floor) + u"------------------------------------------------------\n\n" contents.append(floorLine) con = item.getText("\n", strip=True).encode("gbk", "ignore")#忽略一些特殊字符 self.floor = self.floor + 1 txturl = None txturl = item.findAll("a") #有些词带链接,去掉链接 if txturl: for i in txturl: word = i.getText(strip=True).encode("gbk", "ignore") con = con.replace(("\n%s\n"%word), word) contents.append(con) return contents #print item.get_text(strip=True) def setFileTitle(self,title): #如果标题不是为None,即成功获取到标题 if title is not None: title = title.replace(‘/‘, ‘‘) self.File = open(os.path.join(os.getcwd(), (title + ".txt")),"w+") else: self.File = open(os.path.join(os.getcwd(), (self.defaultTitle + ".txt")),"w+") def writetotxt(self,contents): #向文件写入每一楼的信息 for item in contents: self.File.write(item) def start(self): indexPage = self.get_body(1) pageNum = self.get_pagenum(indexPage) title = self.find_title(indexPage) self.setFileTitle(title) if pageNum == None: print "URL已失效,请重试" return try: print "该帖子共有" + str(pageNum) + "页" for i in range(1,int(pageNum)+1): print "正在写入第" + str(i) + "页数据" page = self.get_body(i) contents = self.get_content(page) self.writetotxt(contents) #出现写入异常 except IOError,e: print "写入异常,原因" + e.message finally: print "写入任务完成" #270051025 if __name__ == ‘__main__‘: print u"请输入帖子代号" baseURL = ‘http://tieba.baidu.com/p/‘ + str(raw_input(u‘http://tieba.baidu.com/p/‘)) seeLZ = raw_input(u"是否只获取楼主发言,是输入1,否输入0:") t = TiebatoTxt(baseURL, seeLZ) b = t.start()
标签:
原文地址:http://www.cnblogs.com/zoro-robin/p/5788595.html