码迷,mamicode.com
首页 > 编程语言 > 详细

python爬虫-糗百阅读器

时间:2016-03-18 13:31:34      阅读:238      评论:0      收藏:0      [点我收藏+]

标签:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib
import urllib2
import re

class Turtle(object):
    """Interactive console reader for the "hot" pages of qiushibaike.com.

    Pages are downloaded one at a time, parsed with a regular expression
    into ``[author, text, likes]`` entries, and shown one entry per press
    of Enter.  Entering ``q`` or ``Q`` stops the reader.

    NOTE: this class targets Python 2; it relies on the module-level
    ``urllib2`` import and on the ``raw_input`` builtin.
    """

    # Captures, in order: author name, story body, the text of the HTML
    # comment that follows the body (unused), and the like counter.
    # Compiled once here instead of on every page parse.
    _ITEM_PATTERN = re.compile(
        '<div.*?author.*?<h2>(.*?)</h2>.*?' +
        '<div.*?content">(.*?)<!--(.*?)-->.*?</div>' +
        '.*?<div.*?class="stats.*?class="number">(.*?)</i>', re.S)

    def __init__(self):
        # Index of the next page to download (the site's pages start at 1).
        self.pageIndex = 1
        # Queue of parsed pages that have not been read yet; each page is
        # a list of [author, text, likes] entries.
        self.stories = []
        # Set to False when the user quits or nothing more can be loaded.
        self.enable = True
        # Browser-like User-Agent header sent with every request.
        self.header = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

    def getPage(self, pageIndex):
        """Download one page of hot stories.

        Args:
            pageIndex: number appended to the hot-page URL.

        Returns:
            The page body decoded as UTF-8, or None if ``urllib2.URLError``
            was raised (its code and/or reason are printed first).
        """
        url = 'http://www.qiushibaike.com/hot/page/' + str(pageIndex)
        try:
            request = urllib2.Request(url, headers=self.header)
            response = urllib2.urlopen(request)
            try:
                return response.read().decode('utf-8')
            finally:
                # Always release the connection, even if read/decode fails.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, 'code'):
                print(u'错误码: %s' % (e.code,))
            if hasattr(e, 'reason'):
                print(u'错误原因: %s' % (e.reason,))
            return None

    def getPageItem(self, pageIndex):
        """Download a page and extract its stories.

        Args:
            pageIndex: number of the page to fetch via ``getPage``.

        Returns:
            A list of ``[author, text, likes]`` lists (possibly empty when
            nothing matches), or None if the page could not be loaded.
            ``<br/>`` tags in the story text are replaced by newlines and
            all three fields are stripped of surrounding whitespace.
        """
        pageContent = self.getPage(pageIndex)
        if not pageContent:
            print(u'页面加载失败。。。')
            return None
        pageStories = []
        for author, body, _comment, likes in self._ITEM_PATTERN.findall(pageContent):
            text = body.replace('<br/>', '\n')
            pageStories.append([author.strip(), text.strip(), likes.strip()])
        return pageStories

    def loadPage(self):
        """Prefetch the next page when fewer than two pages are queued.

        A failed download leaves ``pageIndex`` untouched so the same page
        is retried on the next call, and nothing is queued.  A page that
        parses to zero stories is skipped (``pageIndex`` still advances)
        so it can never be queued as an empty page.
        """
        if len(self.stories) >= 2:
            return
        print('==============剩余未读小于两页,预加载下一页==============')
        pageStories = self.getPageItem(self.pageIndex)
        if pageStories is None:
            # Download failed: never queue None, it would crash the reader.
            return
        self.pageIndex += 1
        if pageStories:
            self.stories.append(pageStories)

    def getOneStory(self):
        """Show the stories of the first queued page, one per Enter press.

        Before each story the user is prompted via ``raw_input``; entering
        ``q`` or ``Q`` clears ``self.enable`` and returns immediately.
        ``loadPage`` is called before each story is printed so the queue
        is topped up while the user reads.
        """
        if not self.stories:
            return
        for story in self.stories[0]:
            isQ = raw_input()
            if isQ == 'q' or isQ == 'Q':
                self.enable = False
                return
            self.loadPage()
            print(story[1])
            print('-----%s,liked by %s' % (story[0], story[2]))

    def start(self):
        """Run the reader until the user quits or no page can be loaded."""
        print('start to read page 1')
        self.loadPage()
        while self.enable:
            if not self.stories:
                # Queue ran dry (e.g. earlier downloads failed): try once
                # more, and stop instead of spinning if that fails too.
                self.loadPage()
                if not self.stories:
                    print('no stories available, exiting')
                    self.enable = False
                    break
            self.getOneStory()
            if not self.enable:
                # User quit mid-page; the page was not actually finished.
                break
            del self.stories[0]
            print('===========该页已读完,读取下一页===========')




# Script entry: build the reader and run its interactive loop.
turtle = Turtle()
turtle.start()

  

python爬虫-糗百阅读器

标签:

原文地址:http://www.cnblogs.com/sixstones/p/5291642.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!