标签:
# -*- coding: cp936 -*-
__author__ = "christian chen"
import urllib2
import re
import threading
import time
class Tool:
    """Holder for the regular expressions used to scrape a listing page.

    Both patterns are compiled once, with ``re.S`` so that ``.`` also matches
    newlines, and the same compiled objects are returned on every call.
    """

    # Single group: the text between the opening <title ...> tag and the
    # next "</".
    _TITLE_PATTERN = re.compile(r'<title.*?>(.*?)</', re.S)

    # Three groups per match, in document order:
    #   1. the text that follows the <img .../> inside the <a> of a
    #      <div class="author...">,
    #   2. the body of the following <div ... class="content...">,
    #   3. the text of the following element whose class starts with "number".
    _CONTENT_PATTERN = re.compile(
        r'<div class="author.*?>.*?<a.*?<img.*?/>(.*?)</a>.*?</div>'
        r'.*?<div.*?class="content.*?>(.*?)</div>'
        r'.*?class="number.*?>(.*?)</.*?',
        re.S)

    def pTitle(self):
        """Return the compiled pattern whose only group is the page title."""
        return self._TITLE_PATTERN

    def pContent(self):
        """Return the compiled pattern that captures one post per match.

        Each match yields three groups; see the comment on
        ``_CONTENT_PATTERN`` for what each group captures.
        """
        return self._CONTENT_PATTERN
class CSBK(threading.Thread):
    """Worker thread that saves the "hot" pages of qiushibaike.com as text.

    Pages 1 through ``max_page`` are fetched one after another. Every page is
    written to its own ``<page title> - Page_<n>.txt`` file, opened with a
    relative path (i.e. in the current working directory).
    """

    # File-name stem used when a page has no usable title.
    _DEFAULT_TITLE = 'untitled'

    # Characters that must not end up in a file name: the ones Windows
    # rejects (\ / : * ? " < > |) plus line breaks and tabs.
    _BAD_FILENAME_CHARS = re.compile(u'[\\\\/:*?"<>|\\r\\n\\t]')

    # Separator line written in front of every post.
    _CUT_LINE = '-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.\n'

    def __init__(self, max_page):
        """Prepare a crawl of pages 1 through *max_page*.

        Args:
            max_page: Number of the last page to fetch; it is converted
                with ``int()``, so a numeric string is accepted.

        Raises:
            ValueError: If *max_page* is a string that is not an integer.
        """
        threading.Thread.__init__(self, name='christian_thread')
        self.baseUrl = "http://www.qiushibaike.com/hot/page/"
        # Stored one past the last page so run() can use it directly as the
        # exclusive upper bound of range().
        self.maxPage = int(max_page) + 1
        self.tool = Tool()

    def getPageContent(self, pageNum):
        """Download one listing page.

        Args:
            pageNum: Page number appended to ``self.baseUrl``.

        Returns:
            The response body decoded as UTF-8 and re-encoded as GBK
            (characters that cannot be converted are dropped), or ``None``
            when the request fails with ``urllib2.URLError``.
        """
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        url = self.baseUrl + str(pageNum)
        try:
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request)
            try:
                raw = response.read()
            finally:
                # Release the connection even if reading fails.
                response.close()
        except urllib2.URLError as e:
            # Fall back to the exception itself when it carries no
            # ``reason`` so a failure is never silent. The two spaces keep
            # the output identical to the former ``print a, b`` statement.
            print("error:  %s" % (getattr(e, "reason", e),))
            return None
        # NOTE(review): GBK presumably matches the cp936 source/console
        # encoding this script was written for -- confirm before changing.
        content = raw.decode('utf-8', 'ignore')
        return content.encode('gbk', 'ignore')

    def getPageDetail(self, c):
        """Extract every post found in the page text *c*.

        Args:
            c: Page text as returned by ``getPageContent``.

        Returns:
            A list with one dict per regex match. The keys are '发布人'
            (first captured group), '内容' (second group) and 'id' (third
            group); all values have surrounding whitespace stripped.
        """
        result = []
        for author, body, number in self.tool.pContent().findall(c):
            result.append({
                '发布人': author.strip(),
                'id': number.strip(),
                '内容': body.strip(),
            })
        return result

    def getTitle(self, c):
        """Return the stripped ``<title>`` text of page *c*.

        A page without a title yields ``'untitled'`` instead of raising
        ``IndexError``.
        """
        found = self.tool.pTitle().search(c)
        if found is None:
            return self._DEFAULT_TITLE
        return found.group(1).strip()

    @classmethod
    def _safeFileName(cls, title):
        """Turn a page title into a string that is safe as a file name.

        Forbidden characters are replaced by ``_`` and surrounding
        whitespace is removed; an empty result becomes ``'untitled'``.
        Byte strings are treated as GBK and returned as GBK.
        """
        is_bytes = isinstance(title, bytes)
        # Work on decoded text: the second byte of a GBK character may equal
        # '\\' or '|', and a byte-wise replacement would corrupt it.
        text = title.decode('gbk', 'ignore') if is_bytes else title
        text = cls._BAD_FILENAME_CHARS.sub(u'_', text.strip()).strip()
        if not text:
            return cls._DEFAULT_TITLE
        return text.encode('gbk', 'ignore') if is_bytes else text

    def run(self):
        """Fetch every page in turn and write its posts to a text file.

        Stops at the first page that cannot be downloaded. For each page
        the posts are written as ``key : value`` lines, each post preceded
        by a separator line.
        """
        print("---- " + time.ctime() + " ----\n")
        for page in range(1, self.maxPage):
            c = self.getPageContent(page)
            if c is None:
                print("URL已失效,请重试")
                return
            print("---- 正在抓取第" + str(page) + "页 ---- ")
            fileName = (self._safeFileName(self.getTitle(c))
                        + ' - Page_' + str(page) + '.txt')
            result = self.getPageDetail(c)
            # ``with`` closes the file even when a write raises.
            with open(fileName, 'w') as f:
                for item in result:
                    f.write(self._CUT_LINE)
                    for K, V in item.items():
                        f.write(str(K) + ' : ' + str(V) + '\n')
            print("---- 第" + str(page) + "页抓取完毕 ----\n")
        print("---- " + time.ctime() + " ----")
# Script entry point: ask how many pages to fetch, then crawl them on a
# background CSBK thread.
maxPage = raw_input("输入想抓取的糗事百科的最大页数: \n")
try:
    csbk = CSBK(maxPage)
except ValueError:
    # CSBK converts the answer with int(); report a non-numeric answer
    # instead of ending with a traceback.
    print("输入无效,请输入一个整数")
else:
    csbk.start()
这里是Freestyletime@foxmail.com,欢迎交流。
本人原创作品,转载请标明出处。
标签:
原文地址:http://my.oschina.net/freestyletime/blog/510724