码迷,mamicode.com
首页 > 编程语言 > 详细

python爬虫——豆瓣图书top250信息

时间:2015-11-19 16:25:20      阅读:265      评论:0      收藏:0      [点我收藏+]

标签:

# -*- coding: utf-8 -*-
import requests
import re
import sys
# Python 2 only: force the implicit str <-> unicode codec to UTF-8 so the
# Chinese text handled below can be formatted and written without raising
# UnicodeEncodeError. Python 3 has neither a builtin reload() nor
# sys.setdefaultencoding(), and its default encoding is already UTF-8, so the
# hack is skipped there instead of crashing the module on import.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - restores setdefaultencoding, which site.py removes
    sys.setdefaultencoding("utf-8")

class Spider(object):
    """Scrape book entries from Douban "Top 250" list pages into a text file.

    One call to :meth:`start` downloads a page, cuts it into per-book HTML
    fragments, extracts title / author line / rating count / score from each
    fragment with regular expressions, and appends them to ``bookList.txt``.
    """

    def __init__(self):
        # u"" keeps the literal a text string on Python 2 as well as Python 3.
        print(u"开始爬取豆瓣图书top250的内容。。。。。。")

    def getSourceCode(self, url, timeout=10):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.
                requests waits indefinitely when no timeout is given, so a
                finite default keeps a stalled connection from hanging the
                whole crawl.

        Returns:
            The decoded body of the HTTP response.

        Raises:
            requests.exceptions.RequestException: On connection errors or
                when the timeout expires.
        """
        response = requests.get(url, timeout=timeout)
        return response.text

    def getEveryBookContent(self, sourceCode):
        """Split a page's HTML into one fragment per book.

        Each book on the page sits in its own ``<table width="100%">``
        element; the inner HTML of every such table is returned.

        Args:
            sourceCode: HTML text of a list page.

        Returns:
            A list of HTML fragments (empty if no table matches).
        """
        # re.S lets "." span the newlines inside each table.
        return re.findall(r'<table width="100%">(.*?)</table>', sourceCode, re.S)

    @staticmethod
    def _firstGroup(pattern, text):
        """Return group 1 of the first match of ``pattern``, or "" if none."""
        match = re.search(pattern, text, re.S)
        return match.group(1) if match else ""

    def getBookInfo(self, eachBookContent):
        """Extract the fields of one book from its HTML fragment.

        Args:
            eachBookContent: One fragment as returned by
                :meth:`getEveryBookContent`.

        Returns:
            A dict with the keys ``title``, ``author``, ``discussNum`` and
            ``score``. A field whose markup is absent from the fragment is
            returned as an empty string instead of aborting the crawl with
            an ``AttributeError``.
        """
        # The title link may contain layout whitespace, <br/> and <span> tags
        # (used for subtitles); strip all of them from the anchor text.
        rawTitle = self._firstGroup(r'<a href=.*?>(.*?)</a>', eachBookContent)
        # The rating count is wrapped in parentheses padded with whitespace.
        rawDiscussNum = self._firstGroup(
            r'<span class="pl">\((.*?)\)</span>', eachBookContent
        )
        return {
            "title": re.sub(r'( |\n|<br/>|</?span.*?>)', "", rawTitle),
            "author": self._firstGroup(
                r'<p class="pl">(.*?)</p>', eachBookContent
            ),
            "discussNum": re.sub(r'( |\n|<br/>)', "", rawDiscussNum),
            "score": self._firstGroup(
                r'<span class="rating_nums">(.*?)</span>', eachBookContent
            ),
        }

    def saveBookInfo(self, bookList):
        """Append every book in ``bookList`` to ``bookList.txt``.

        Args:
            bookList: Iterable of dicts as produced by :meth:`getBookInfo`.
        """
        # io.open accepts ``encoding`` on both Python 2 and Python 3; writing
        # UTF-8 explicitly keeps the Chinese labels intact regardless of the
        # platform's locale encoding.
        import io

        # ``with`` guarantees the file is closed even if a record is malformed.
        with io.open("bookList.txt", "a", encoding="utf-8") as f:
            for each in bookList:
                f.write(u"书  名:\t {}\n".format(each["title"]))
                f.write(u"作  者:\t {}\n".format(each["author"]))
                f.write(u"评论数:\t {}\n".format(each["discussNum"]))
                f.write(u"评  分:\t {}\n\n".format(each["score"]))

    def start(self, url):
        """Fetch one list page, parse every book on it and save the results.

        Args:
            url: Address of the list page to process.
        """
        sourceCode = self.getSourceCode(url)
        bookList = [
            self.getBookInfo(each)
            for each in self.getEveryBookContent(sourceCode)
        ]
        self.saveBookInfo(bookList)


if __name__ == "__main__":
    douban = Spider()
    # Walk the list pages through the ``start`` query offset:
    # 0, 25, 50, ..., 225 (ten pages in total).
    for offset in range(0, 250, 25):
        douban.start("http://book.douban.com/top250?start={}".format(offset))

 

python爬虫——豆瓣图书top250信息

标签:

原文地址:http://www.cnblogs.com/ponpon7/p/4977856.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!