# Tags:
#!/usr/bin/python
#encoding=utf-8
import urllib2
import urllib
import re
import thread
import time
class Spider(object):
def __init__(self):
self.page = 1
self.pages = []
self.enable = False
def Start(self):
self.enable = True
page = self.page
thread.start_new_thread(self.LoadPage,())
while self.enable:
if self.pages:
nowpage = self.pages[0]
del self.pages[0]
self.ShowPage(nowpage,page)
page += 1
def LoadPage(self):
while self.enable:
if len(self.pages)<2:
try:
myPage = self.GetPage(str(self.page))
self.pages.append(myPage)
self.page += 1
except:
print "无法链接糗事百科"
else:
time.sleep(1)
def ShowPage(self,nowpage,page):
for item in nowpage:
print "the %d page"%(page)
print item[0],item[1]
myInput = raw_input()
if myInput == "q" :
self.enable = False
break
def GetPage(self,page):
url = "http://m.qiushibaike.com/hot/page/" + page
user_agent = ‘Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)‘
headers = {"User-Agent":user_agent}
req = urllib2.Request(url,headers = headers)
res = urllib2.urlopen(req)
myPage = res.read()
unicodePage = myPage.decode("utf-8")
# 找出所有class="content"的div标记
#re.S是任意匹配模式,也就是.可以匹配换行符
myItems = re.findall
(‘<div.*?class="content".*?title="(.*?)">(.*?)</div>‘,unicodePage,re.S)
items = []
for i in myItems:
items.append([i[0],i[1]])
return items
#----------- 程序的入口处 -----------
print u"""
---------------------------------------
程序:糗百爬虫
语言:Python 2.7
操作:输入q
退出阅读糗事百科
功能:按下回车依次浏览今日的糗百热点
---------------------------------------
"""
print u‘请按下回车浏览今日的糗百内容:‘
raw_input(‘ ‘)
my_spider = Spider()
my_spider.Start()
# Tags:
# Original source: http://my.oschina.net/sunxichao/blog/372951