标签:
# -*- coding: utf-8 -*-
"""Scrape the Douban "Books Top 250" list pages and append the results to a file.

Each list page is downloaded, the per-book ``<table width="100%">`` blocks are
cut out with regular expressions, and four fields per book (title,
author/publisher line, rating count and score) are appended to ``bookList.txt``.
"""
import io
import re
import sys

# Patterns are compiled once at import time instead of on every call.
# Raw strings keep the backslash escapes (``\(``, ``\n``) unambiguous.
_BOOK_BLOCK_RE = re.compile(r'<table width="100%">(.*?)</table>', re.S)
_TITLE_RE = re.compile(r'<a href=.*?>(.*?)</a>', re.S)
_AUTHOR_RE = re.compile(r'<p class="pl">(.*?)</p>', re.S)
_DISCUSS_RE = re.compile(r'<span class="pl">\((.*?)\)</span>', re.S)
_SCORE_RE = re.compile(r'<span class="rating_nums">(.*?)</span>', re.S)
# Markup/whitespace stripped from the extracted title and rating-count text.
_TITLE_NOISE_RE = re.compile(r'( |\n|<br/>|</?span.*?>)')
_DISCUSS_NOISE_RE = re.compile(r'( |\n|<br/>)')

BASE_URL = 'http://book.douban.com/top250?start={}'
PAGE_SIZE = 25          # the ``start`` offset advances by this much per page
TOTAL_BOOKS = 250       # offsets 0, 25, ..., 225 are requested
OUTPUT_FILE = "bookList.txt"
REQUEST_TIMEOUT = 10    # seconds; without a timeout a stalled server hangs the crawl


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or '' if none."""
    match = pattern.search(text)
    return match.group(1) if match else ''


class Spider(object):
    """Downloads list pages, parses the book entries and saves them to disk."""

    def __init__(self):
        print(u'开始爬取豆瓣图书top250的内容。。。。。。')

    def getSourceCode(self, url):
        """Fetch *url* and return the response body as text.

        Raises whatever ``requests.get`` raises (including a timeout after
        ``REQUEST_TIMEOUT`` seconds).
        """
        # Imported here rather than at module level so the pure parsing
        # methods can be imported and used without the HTTP dependency.
        import requests

        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        return response.text

    def getEveryBookContent(self, sourceCode):
        """Return the inner HTML of every ``<table width="100%">`` block.

        Each returned string is the chunk that holds one book's title,
        author/publisher line, score and rating count.
        """
        return _BOOK_BLOCK_RE.findall(sourceCode)

    def getBookInfo(self, eachBookContent):
        """Extract the fields of one book from its HTML chunk.

        Returns a dict with the keys ``title``, ``author``, ``discussNum`` and
        ``score``. A field whose markup is not found is set to ``''`` so that
        a single malformed entry does not abort the whole crawl.
        """
        title = _first_group(_TITLE_RE, eachBookContent)
        discuss = _first_group(_DISCUSS_RE, eachBookContent)
        return {
            # Spaces, newlines and <br/>/<span> tags are removed from the title.
            'title': _TITLE_NOISE_RE.sub('', title),
            'author': _first_group(_AUTHOR_RE, eachBookContent),
            'discussNum': _DISCUSS_NOISE_RE.sub('', discuss),
            'score': _first_group(_SCORE_RE, eachBookContent),
        }

    def saveBookInfo(self, bookList):
        """Append every book dict in *bookList* to ``bookList.txt`` as UTF-8 text."""
        # io.open with an explicit encoding replaces the old
        # sys.setdefaultencoding hack; ``with`` closes the file even if
        # formatting one of the entries raises.
        with io.open(OUTPUT_FILE, "a", encoding="utf-8") as f:
            for each in bookList:
                f.write(u'书 名:\t {}\n'.format(each['title']))
                f.write(u'作 者:\t {}\n'.format(each['author']))
                f.write(u'评论数:\t {}\n'.format(each['discussNum']))
                f.write(u'评 分:\t {}\n\n'.format(each['score']))

    def start(self, url):
        """Download one list page, parse all of its books and append them to disk."""
        sourceCode = self.getSourceCode(url)
        everyBookContent = self.getEveryBookContent(sourceCode)
        if not everyBookContent:
            # Make an unexpected page (error page, changed markup) visible
            # instead of silently writing nothing.
            sys.stderr.write('warning: no book entries found at {}\n'.format(url))
        bookList = [self.getBookInfo(each) for each in everyBookContent]
        self.saveBookInfo(bookList)


def main():
    """Crawl every page of the list, one ``start`` offset at a time."""
    douban = Spider()
    for offset in range(0, TOTAL_BOOKS, PAGE_SIZE):
        douban.start(BASE_URL.format(offset))


if __name__ == '__main__':
    main()
标签:
原文地址:http://www.cnblogs.com/ponpon7/p/4977856.html