
Python Web Crawler - Scraping Jokes from Qiushibaike (Latest Version)

2015-09-24


Code


# -*- coding: cp936 -*-
__author__ = "christian chen"
import urllib2
import re
import threading
import time

class Tool:
    # Pattern for the page <title>.
    def pTitle(self):
        return re.compile(r'<title.*?>(.*?)</', re.S)

    # Pattern for one post: the three groups capture the author name,
    # the post body, and the "number" (vote count) field.
    def pContent(self):
        return re.compile(r'<div class="author.*?>.*?<a.*?<img.*?/>(.*?)</a>.*?<div.*?class="content.*?>(.*?)</div>.*?class="number.*?>(.*?)</.*?', re.S)

class CSBK(threading.Thread):
    def __init__(self, max_page):
        threading.Thread.__init__(self, name='christian_thread')
        self.baseUrl = "http://www.qiushibaike.com/hot/page/"
        self.maxPage = int(max_page) + 1
        self.tool = Tool()

    def getPageContent(self, pageNum):
        # Identify as a browser so the request looks like ordinary traffic.
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        url = self.baseUrl + str(pageNum)
        try:
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request)
            # The page is served as UTF-8; re-encode to GBK so it can be
            # written out on a Chinese Windows locale.
            content = response.read().decode('utf-8', 'ignore')
            content = content.encode('gbk', 'ignore')
            return content
        except urllib2.URLError, e:
            if hasattr(e, "reason"):
                print u"error: ", e.reason
            return None
            
    def getPageDetail(self, c):
        # Every match is an (author, content, count) tuple.
        items = re.findall(self.tool.pContent(), c)
        result = []
        for item in items:
            p = {}
            p['author'] = item[0].strip()
            p['id'] = item[2].strip()       # the "number" field (vote count)
            p['content'] = item[1].strip()
            result.append(p)
        return result

    def getTitle(self, c):
        result = re.findall(self.tool.pTitle(), c)
        return result[0].strip()

    def run(self):
        print "---- " + time.ctime() + " ----\n"
        for page in range(1, self.maxPage):
            c = self.getPageContent(page)
            if c is None:
                print "The URL is no longer valid, please retry"
                return

            print "---- Crawling page " + str(page) + " ----"
            title = self.getTitle(c)
            result = self.getPageDetail(c)
            # One text file per page, named after the page title.
            f = open(title + ' - Page_' + str(page) + '.txt', 'w')
            cutLine = '-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.\n'
            for item in result:
                f.write(cutLine)
                for k, v in item.items():
                    f.write(k + ' : ' + v + '\n')
            f.close()
            print "---- Page " + str(page) + " done ----\n"
        print "---- " + time.ctime() + " ----"

maxPage = raw_input("Enter the maximum number of qiushibaike pages to crawl: \n")
csbk = CSBK(maxPage)
csbk.start()
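
The three capture groups in pContent are easiest to see on a small test string. The fragment below is invented for illustration (the real qiushibaike markup may differ), but it has the same shape the pattern expects:

import re

# Same pattern as Tool.pContent().
PATTERN = re.compile(r'<div class="author.*?>.*?<a.*?<img.*?/>(.*?)</a>.*?'
                     r'<div.*?class="content.*?>(.*?)</div>.*?'
                     r'class="number.*?>(.*?)</.*?', re.S)

# A made-up fragment shaped like one post.
sample = ('<div class="author"><a href="#"><img src="a.jpg"/>Alice</a></div>'
          '<div class="content">A short joke.</div>'
          '<span class="number">123</span>')

print re.findall(PATTERN, sample)
# [('Alice', 'A short joke.', '123')]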



Sample output

[Screenshots of the generated text files]
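
The listing above targets Python 2 (urllib2, raw_input, and print statements). For reference, here is a minimal sketch of the same fetch-and-parse core on Python 3's standard library, assuming the page structure is unchanged (the site may have changed or removed these pages since 2015):

import re
import urllib.error
import urllib.request

BASE_URL = "http://www.qiushibaike.com/hot/page/"
PATTERN = re.compile(r'<div class="author.*?>.*?<a.*?<img.*?/>(.*?)</a>.*?'
                     r'<div.*?class="content.*?>(.*?)</div>.*?'
                     r'class="number.*?>(.*?)</.*?', re.S)

def fetch_page(page_num):
    """Download one listing page; return the HTML as str, or None on error."""
    req = urllib.request.Request(
        BASE_URL + str(page_num),
        headers={'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'})
    try:
        with urllib.request.urlopen(req) as resp:
            return resp.read().decode('utf-8', 'ignore')
    except urllib.error.URLError as e:
        print("error:", getattr(e, 'reason', e))
        return None

def parse_page(html):
    """Return a list of (author, content, count) tuples for one page."""
    return [tuple(field.strip() for field in m) for m in PATTERN.findall(html)]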

This is Freestyletime@foxmail.com; feel free to get in touch.

This is my original work; please credit the source when reposting.




Original post: http://my.oschina.net/freestyletime/blog/510724
