# coding=utf-8
"""Scrape the "hot" pages of qiushibaike.com and print every post found.

The module runs on both Python 2 (``urllib2``) and Python 3
(``urllib.request``).  Nothing is fetched on import; the scraper only
starts when the file is executed as a script.
"""
import re
import urllib
from contextlib import closing

try:
    import urllib2
except ImportError:  # Python 3: urllib2 was merged into urllib.request
    import urllib.request as urllib2


class QiuShi:
    """Download, parse and print posts from qiushibaike.com's hot list."""

    _BASE_URL = "http://www.qiushibaike.com/hot/page/"
    # Pretend to be a browser when requesting the page.
    _USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    # Group 1 is the div's ``title`` attribute, group 2 is the div's inner
    # text.  re.S lets ``.`` span newlines because a post covers many lines.
    _ITEM_PATTERN = re.compile(
        '<div.*?class="content".*?title="(.*?)">(.*?)</div>', re.S)

    def __init__(self):
        # First page that Start() will scrape.
        self.page = 1

    def _FetchHtml(self, page):
        """Return the decoded (unicode) HTML of hot-list page ``page``."""
        url = self._BASE_URL + str(page)
        headers = {'user-Agent': self._USER_AGENT}
        req = urllib2.Request(url, headers=headers)
        # closing() guarantees the connection is released on both Python 2
        # (where the response is not a context manager) and Python 3.
        with closing(urllib2.urlopen(req)) as response:
            html = response.read()
        # decode() turns the raw bytes into a unicode string.
        return html.decode("utf-8")

    def _ParseContents(self, unicodeHtml):
        """Extract ``[title, text]`` pairs, with newlines removed, from HTML."""
        return [
            [title.replace("\n", ""), text.replace("\n", "")]
            for title, text in self._ITEM_PATTERN.findall(unicodeHtml)
        ]

    def GetQiuShis(self, page):
        """Fetch one hot-list page and return its posts.

        Args:
            page: Page number, as a string or an integer.

        Returns:
            A list of ``[title, text]`` lists, one per matched content div.
            According to the original author, the title attribute holds the
            post's time.

        Raises:
            Whatever ``urlopen`` raises on network/HTTP failure, and
            ``UnicodeDecodeError`` if the response is not valid UTF-8.
        """
        return self._ParseContents(self._FetchHtml(page))

    def ShowQiuShi(self, contents):
        """Print every post in ``contents`` with a running 1-based number.

        Args:
            contents: Iterable of ``[title, text]`` pairs as returned by
                GetQiuShis().
        """
        for count, content in enumerate(contents, 1):
            # A single pre-formatted argument keeps the output identical
            # whether ``print`` is a statement (Py2) or a function (Py3).
            print(u"第%d条糗事 %s \n" % (count, content[0]))
            print(u"%s \n" % content[1])

    def Start(self, page_limit=5):
        """Scrape and print pages ``self.page`` up to ``page_limit - 1``.

        Args:
            page_limit: Exclusive upper bound of the page range.  The
                default of 5 scrapes pages 1 through 4.
        """
        for page in range(self.page, page_limit):
            print(u"第%d页:\n" % page)
            contents = self.GetQiuShis(str(page))
            self.ShowQiuShi(contents)


if __name__ == "__main__":
    qiuShi = QiuShi()
    qiuShi.Start()
# Original article: http://blog.csdn.net/sunnyyoona/article/details/42060375