标签:
代码:
# -*- coding: utf-8 -*-
"""Fetch a Baidu Tieba thread and print its posts as plain text.

Runs on both Python 2 and Python 3.
"""
import contextlib
import re
import urllib  # kept from the original script; not used directly here

try:  # Python 2
    from urllib2 import Request, URLError, urlopen
except ImportError:  # Python 3 moved these into urllib.request / urllib.error
    from urllib.error import URLError
    from urllib.request import Request, urlopen


class Tool:
    """Convert a fragment of post HTML into plain text."""

    # Images and runs of seven spaces are dropped. The trailing empty
    # alternative only ever matches the empty string, which is a no-op when
    # the replacement is "" -- it is kept to leave the pattern untouched.
    removingImg = re.compile(r'<img.*?>| {7}|')
    # Anchor tags are dropped; the link text between them is kept.
    removingAddr = re.compile(r'<a.*?>|</a>')
    # Block-level boundaries become line breaks.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cells become tab stops.
    replaceTD = re.compile(r'<td>')
    # Paragraph openers become line breaks.
    replacePara = re.compile(r'<p.*?>')
    # One or two consecutive <br> collapse to a single line break.
    replaceBR = re.compile(r'<br><br>|<br>')
    # Whatever tag is still left at the end is removed.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return *x* with HTML tags stripped or turned into whitespace.

        The substitutions run in a fixed order (specific tags first, the
        catch-all tag remover last) and the result is stripped of leading
        and trailing whitespace.
        """
        substitutions = (
            (self.removingImg, ""),
            (self.removingAddr, ""),
            (self.replaceLine, "\n"),
            (self.replaceTD, "\t"),
            (self.replacePara, "\n"),
            (self.replaceBR, "\n"),
            (self.removeExtraTag, ""),
        )
        for pattern, replacement in substitutions:
            x = pattern.sub(replacement, x)
        return x.strip()


class BDTB:
    """Scraper for a single Baidu Tieba thread."""

    # Compiled once at class creation instead of on every method call.
    _titlePattern = re.compile(
        r'<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
    _pageNumPattern = re.compile(
        r'<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
    _contentPattern = re.compile(
        r'<div id="post_content_.*?>(.*?)</div>', re.S)

    def __init__(self, baseUrl, seeLZ):
        """Remember the thread URL and the ``see_lz`` query value.

        baseUrl -- thread URL without a query string.
        seeLZ   -- value appended as ``?see_lz=<seeLZ>``.
        """
        self.baseURL = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)
        self.tool = Tool()

    def getPage(self, pageNum):
        """Download page *pageNum* of the thread.

        Returns the page decoded as UTF-8, or None when the request fails
        with a URLError (the reason, if any, is printed).
        """
        url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
        try:
            # closing() releases the connection on both Python 2 and 3;
            # the original never closed the response.
            with contextlib.closing(urlopen(Request(url))) as response:
                return response.read().decode('utf-8')
        except URLError as e:
            if hasattr(e, "reason"):
                print(u"连接百度贴吧失败,错误原因 %s" % (e.reason,))
            return None

    def _searchFirstGroup(self, pattern, page):
        """Return the stripped first group of *pattern* in *page*, or None.

        When *page* is None, page 1 is downloaded; a failed download yields
        None instead of a TypeError from the regex engine.
        """
        if page is None:
            page = self.getPage(1)
        if page is None:
            return None
        result = pattern.search(page)
        if result is None:
            return None
        return result.group(1).strip()

    def getTitle(self, page=None):
        """Return the thread title, or None if unavailable.

        page -- already downloaded HTML of page 1; fetched when omitted.
        """
        return self._searchFirstGroup(self._titlePattern, page)

    def getPageNum(self, page=None):
        """Return the page count as the text found in the page, or None.

        page -- already downloaded HTML of page 1; fetched when omitted.
        """
        return self._searchFirstGroup(self._pageNumPattern, page)

    def getContent(self, page):
        """Print every post found in *page*, numbered from floor 1.

        Does nothing when *page* is None or empty (e.g. a failed download).
        """
        if not page:
            return
        items = self._contentPattern.findall(page)
        for floor, item in enumerate(items, 1):
            print(u"%d %s" % (
                floor, u"楼-----------------------------------------\n"))
            print(self.tool.replace(item))


def main():
    """Fetch the first page of the sample thread and print its posts."""
    baseURL = 'http://tieba.baidu.com/p/3138733512'
    bdtb = BDTB(baseURL, 1)
    page = bdtb.getPage(1)
    # Reuse the downloaded page so the thread is requested only once.
    bdtb.getTitle(page)
    bdtb.getPageNum(page)
    bdtb.getContent(page)


if __name__ == "__main__":
    main()
标签:
原文地址:http://www.cnblogs.com/AndyJee/p/5001283.html