码迷,mamicode.com
首页 > 编程语言 > 详细

python 爬小说

时间:2018-02-10 11:25:39      阅读:202      评论:0      收藏:0      [点我收藏+]

标签:apt   decode   list   text   ==   files   get   otto   gen   

#coding=utf-8
import datetime
import time
import sys
import os 

import urllib2
import urllib

# Base URL of the novel site's chapter-index page. The value below is the
# author's placeholder text ("novel site URL"); replace it with the real
# address before running. Chapter links are appended to it directly.
sx = '小说站网址'

# File-system encoding of the host. Not used anywhere in the visible script;
# NOTE(review): this name shadows the builtin `type` -- kept only so that the
# module's existing names are unchanged.
type = sys.getfilesystemencoding()

# Browser-like User-Agent sent with every request.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}


# Output file; every chapter is appended to it as UTF-8 bytes.
fo = open("note.txt", "wb")

def getHtml(url):
    """Download *url* and return its body re-encoded as UTF-8 bytes.

    The response body is decoded as GBK and encoded back to UTF-8. The
    length of the resulting byte string is printed as a progress indicator.

    Returns:
        The UTF-8 encoded page, or None when the request fails with
        ``urllib2.URLError`` (the error's ``code`` and/or ``reason`` are
        printed in that case). Callers must check for None.
    """
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            data = response.read()
        finally:
            # Release the connection even if reading fails part-way.
            response.close()
        data = data.decode('gbk')
        data = data.encode('utf-8')
        print(len(data))
        return data
    except urllib2.URLError as e:
        # HTTPError carries `code`; plain URLError carries `reason`.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return None

def dealIndex(url):
    """Walk the chapter list on the index page at *url* and save each chapter.

    The chapter list is taken to be the text between the markers
    ``ChapterList_HengFu_1`` and ``ChapterList_HengFu_2``. Every ``href=``
    found in that region is combined with the module-level base URL ``sx``
    and passed, together with the link text, to ``dealContext``.

    Raises:
        ValueError: if either marker is missing from the page.
    """
    data = getHtml(url)
    if data is None:
        # getHtml has already printed the network error.
        print("failed to fetch index page: " + url)
        return

    bgnpos = data.index('ChapterList_HengFu_1') + 10
    endpos = data.index('ChapterList_HengFu_2') - 10
    print(bgnpos)
    print(endpos)

    achfx = data[bgnpos:endpos]
    # Offsets below are relative to `achfx`, so scanning starts at 0 and the
    # slice end is the only upper bound needed.
    pos = 0

    while True:
        newpos = achfx.find('href=', pos)
        if newpos == -1:
            break

        # Assumes links look like href="<13-character url>">title</a>;
        # the fixed offsets skip `href="` (6 chars) and `">` after the URL.
        indexurl = achfx[newpos+6:newpos+19]

        titlepos = achfx.find('</a>', newpos+20)
        if titlepos == -1:
            # Truncated anchor: without this guard `pos` would be reset to
            # the start of the slice and the loop would never terminate.
            break
        # NOTE(review): the end bound `titlepos+1` also includes the '<' of
        # '</a>' in the title -- kept as in the original; confirm intent.
        titlename = achfx[newpos+21:titlepos+1]
        pos = titlepos + len('</a>')

        dealContext(sx + indexurl, titlename)


def dealContext(url, title):
    """Fetch one chapter page and append its text to the output file ``fo``.

    The chapter body is taken from 15 characters after the first
    ``name="content"`` marker up to the ``</div>`` located near the
    ``yuedu_bottom`` marker. ``&nbsp;`` entities and ``<br />`` tags are
    replaced, then ``title``, two spaces and the text are written to ``fo``.

    If the page cannot be fetched or the markers are not found, a message is
    printed and nothing is written for this chapter.
    """
    print(url)
    print(title)

    data = getHtml(url)
    if data is None:
        # getHtml has already printed the network error.
        print("skipping chapter, fetch failed: " + url)
        return

    marker = data.find('name="content"', 10)
    if marker == -1:
        print("skipping chapter, content marker not found: " + url)
        return
    bgnpos = marker + 15

    endpos = data.find('yuedu_bottom', bgnpos)
    if endpos != -1:
        # Step back a little so the </div> that closes the content block,
        # just before the bottom marker, is the one that is found.
        endpos = data.find('</div>', endpos - 50)
    if endpos == -1:
        print("skipping chapter, end marker not found: " + url)
        return

    sContent = data[bgnpos:endpos]
    # NOTE(review): the replacement text was lost when the listing was
    # published; a single space matches the spacing that was left behind.
    sContent = sContent.replace('&nbsp;', ' ')
    sContent = sContent.replace('<br />', ' ')

    sContent = title + "  " + sContent
    fo.write(sContent)

# Crawl the whole index; always close the output file, even if a page
# fails to parse or the crawl is interrupted, so written chapters are flushed.
try:
    dealIndex(sx)
finally:
    fo.close()

 

python 爬小说

标签:apt   decode   list   text   ==   files   get   otto   gen   

原文地址:https://www.cnblogs.com/yylingyao/p/8438130.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!