
Python Web Crawler - Scraping Jokes from Qiushibaike (Latest Version)

2015-09-24


Code


# -*- coding: cp936 -*-
__author__ = "christian chen"
import urllib2
import re
import threading
import time

class Tool:
    # Pattern for the page <title>.
    def pTitle(self):
        return re.compile(r'<title.*?>(.*?)</', re.S)

    # Pattern for one post: the three groups capture the author name,
    # the post body, and the "number" (vote count) field.
    def pContent(self):
        return re.compile(r'<div class="author.*?>.*?<a.*?<img.*?/>(.*?)</a>.*?<div.*?class="content.*?>(.*?)</div>.*?class="number.*?>(.*?)</.*?', re.S)

class CSBK(threading.Thread):
    def __init__(self, max_page):
        threading.Thread.__init__(self, name='christian_thread')
        self.baseUrl = "http://www.qiushibaike.com/hot/page/"
        self.maxPage = int(max_page) + 1
        self.tool = Tool()

    def getPageContent(self, pageNum):
        # Identify as a browser so the request looks like ordinary traffic.
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        url = self.baseUrl + str(pageNum)
        try:
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request)
            # The page is served as UTF-8; re-encode to GBK so it can be
            # written out on a Chinese Windows locale.
            content = response.read().decode('utf-8', 'ignore')
            content = content.encode('gbk', 'ignore')
            return content
        except urllib2.URLError, e:
            if hasattr(e, "reason"):
                print u"error: ", e.reason
            return None
            
    def getPageDetail(self, c):
        # Every match is an (author, content, count) tuple.
        items = re.findall(self.tool.pContent(), c)
        result = []
        for item in items:
            p = {}
            p['author'] = item[0].strip()
            p['id'] = item[2].strip()       # the "number" field (vote count)
            p['content'] = item[1].strip()
            result.append(p)
        return result

    def getTitle(self, c):
        result = re.findall(self.tool.pTitle(), c)
        return result[0].strip()

    def run(self):
        print "---- " + time.ctime() + " ----\n"
        for page in range(1, self.maxPage):
            c = self.getPageContent(page)
            if c is None:
                print "The URL is no longer valid, please retry"
                return

            print "---- Crawling page " + str(page) + " ----"
            title = self.getTitle(c)
            result = self.getPageDetail(c)
            # One text file per page, named after the page title.
            f = open(title + ' - Page_' + str(page) + '.txt', 'w')
            cutLine = '-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.\n'
            for item in result:
                f.write(cutLine)
                for k, v in item.items():
                    f.write(k + ' : ' + v + '\n')
            f.close()
            print "---- Page " + str(page) + " done ----\n"
        print "---- " + time.ctime() + " ----"

maxPage = raw_input("Enter the maximum number of qiushibaike pages to crawl: \n")
csbk = CSBK(maxPage)
csbk.start()
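
The three capture groups in pContent are easiest to see on a small test string. The fragment below is invented for illustration (the real qiushibaike markup may differ), but it has the same shape the pattern expects:

import re

# Same pattern as Tool.pContent().
PATTERN = re.compile(r'<div class="author.*?>.*?<a.*?<img.*?/>(.*?)</a>.*?'
                     r'<div.*?class="content.*?>(.*?)</div>.*?'
                     r'class="number.*?>(.*?)</.*?', re.S)

# A made-up fragment shaped like one post.
sample = ('<div class="author"><a href="#"><img src="a.jpg"/>Alice</a></div>'
          '<div class="content">A short joke.</div>'
          '<span class="number">123</span>')

print re.findall(PATTERN, sample)
# [('Alice', 'A short joke.', '123')]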



Sample output

[Screenshots of the generated text files]
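
The listing above targets Python 2 (urllib2, raw_input, and print statements). For reference, here is a minimal sketch of the same fetch-and-parse core on Python 3's standard library, assuming the page structure is unchanged (the site may have changed or removed these pages since 2015):

import re
import urllib.error
import urllib.request

BASE_URL = "http://www.qiushibaike.com/hot/page/"
PATTERN = re.compile(r'<div class="author.*?>.*?<a.*?<img.*?/>(.*?)</a>.*?'
                     r'<div.*?class="content.*?>(.*?)</div>.*?'
                     r'class="number.*?>(.*?)</.*?', re.S)

def fetch_page(page_num):
    """Download one listing page; return the HTML as str, or None on error."""
    req = urllib.request.Request(
        BASE_URL + str(page_num),
        headers={'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'})
    try:
        with urllib.request.urlopen(req) as resp:
            return resp.read().decode('utf-8', 'ignore')
    except urllib.error.URLError as e:
        print("error:", getattr(e, 'reason', e))
        return None

def parse_page(html):
    """Return a list of (author, content, count) tuples for one page."""
    return [tuple(field.strip() for field in m) for m in PATTERN.findall(html)]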

This is Freestyletime@foxmail.com; feel free to get in touch.

This is my original work; please credit the source when reposting.




Original post: http://my.oschina.net/freestyletime/blog/510724
