码迷,mamicode.com
首页 > 编程语言 > 详细

糗事百科python爬虫

时间:2017-08-24 22:43:23      阅读:196      评论:0      收藏:0      [点我收藏+]

标签:style   index   www.   4.0   exce   htm   tor   self   user   

# -*- coding: utf-8 -*-
#coding=utf-8

import urllib
import urllib2
import re
import thread
import time

class QSBK:
    """Interactive console reader for the "hot" pages of qiushibaike.com.

    Pages are downloaded one at a time, reduced to ``[author, content]``
    pairs with a regular expression, and buffered in ``self.store``. The
    user presses Enter to print the next story and types ``Q`` to quit.

    Attributes:
        pageIndex: number of the next page to download (starts at 1).
        user_agent: value sent as the ``User-Agent`` request header.
        header: dict of HTTP headers attached to every request.
        store: buffered pages, each a list of ``[author, content]`` pairs.
        enable: True while the reader loop should keep running.
    """

    # Compiled once for the class instead of on every getPageItem() call.
    # Group 1 is the author name inside <h2>, group 2 the story text inside
    # the first <span> after the "content" element. re.S lets "." cross
    # line breaks, which the page markup relies on.
    _STORY_PATTERN = re.compile(
        r'<div class="author.*?<a.*?<img.*?</a>.*?<a.*?<h2>(.*?)</h2>'
        r'.*?class="content.*?<span>\s*(.*?)\s*</span>',
        re.S)

    def __init__(self):
        self.pageIndex = 1
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.header = {'User-Agent': self.user_agent}
        self.store = []
        self.enable = False

    def getPage(self, pageIndex):
        """Download one listing page.

        Args:
            pageIndex: page number appended to the listing URL.

        Returns:
            The page body decoded as UTF-8, or None when the request fails
            with ``urllib2.URLError`` (a message is printed in that case).
        """
        url = 'http://www.qiushibaike.com/hot/page/' + str(pageIndex)
        try:
            request = urllib2.Request(url, headers=self.header)
            response = urllib2.urlopen(request)
            try:
                return response.read().decode('utf-8')
            finally:
                # Release the connection even if read()/decode() raises.
                response.close()
        except urllib2.URLError as e:
            # ``reason`` may be an exception object rather than a string, so
            # convert it explicitly before concatenating.
            print('链接网络失败' + str(getattr(e, 'reason', e)))
            return None

    def getPageItem(self, pageIndex):
        """Fetch a page and extract its stories.

        Args:
            pageIndex: page number passed through to getPage().

        Returns:
            A list of ``[author, content]`` lists (possibly empty when the
            pattern matches nothing), or None when the page could not be
            downloaded.
        """
        page = self.getPage(pageIndex)
        if page is None:
            print("页面获得失败")
            return None
        return [[author, content]
                for author, content in self._STORY_PATTERN.findall(page)]

    def loadPage(self):
        """Buffer the next page when running and fewer than two are stored.

        A page that failed to download or that yielded no stories is not
        buffered and does not advance ``pageIndex``; start() relies on that
        to detect that nothing more can be shown.
        """
        if not self.enable or len(self.store) >= 2:
            return
        pageStories = self.getPageItem(self.pageIndex)
        if pageStories:
            self.store.append(pageStories)
            self.pageIndex += 1

    def getOneStory(self, pageStories):
        """Print the stories of one page, one per line of user input.

        Before each story a line is read from stdin; ``Q`` clears
        ``self.enable`` and returns immediately without printing.

        Args:
            pageStories: list of ``[author, content]`` pairs to show.
        """
        for story in pageStories:
            command = raw_input()
            # Top up the buffer while the user is reading.
            self.loadPage()
            if command == 'Q':
                self.enable = False
                return
            print(u'%s %s' % (story[0], story[1]))

    def start(self):
        """Run the interactive loop until the user quits or pages run out."""
        print(u"正在读取糗事百科的数据,按Q退出")
        self.enable = True
        while self.enable:
            if not self.store:
                self.loadPage()
            if not self.store:
                # Nothing could be loaded: stop instead of spinning forever
                # on an empty buffer.
                self.enable = False
                break
            self.getOneStory(self.store.pop(0))



# Script entry point: build the reader and hand control to its
# interactive loop (blocks until the user quits).
spider = QSBK()
spider.start()

 

糗事百科python爬虫

标签:style   index   www.   4.0   exce   htm   tor   self   user   

原文地址:http://www.cnblogs.com/norm/p/7425193.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!