码迷,mamicode.com
首页 > 其他好文 > 详细

爬虫爬当当网书籍信息

时间:2018-05-02 13:18:55      阅读:306      评论:0      收藏:0      [点我收藏+]

标签:title   imp   sha   split   format   find   书籍   .text   bs4   

拖了好久的一个爬虫

先上代码 文字慢慢补

 

# -*- coding: utf-8 -*-

import urllib2
import xlwt
from bs4 import BeautifulSoup
# NOTE(review): this name is rebound by `import json` further down, so the
# datashape import looks like an unused IDE auto-import -- confirm before removing.
from datashape import json
import re
# Must stay after the datashape import so `json` refers to the stdlib module.
import json
import requests


def getJsonText(url, timeout=1):
    """Fetch ``url`` with HTTP GET and return the response body as text.

    The body is decoded with the encoding that ``requests`` detects from the
    content itself (``apparent_encoding``) rather than the one declared in the
    response headers.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response text, or an empty string when the request fails
        (connection error, timeout, or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request failures are treated as "no data"; anything else
        # (KeyboardInterrupt, programming errors) is allowed to propagate.
        print("获取失败")
        return ""


def getgood(url):
    """Return the review statistics for one product page.

    The product page at ``url`` is downloaded, the identifiers needed by the
    comment endpoint are extracted from it with regular expressions, and the
    ``summary`` object of the comment-list JSON response is read.

    Args:
        url: Address of the product detail page.

    Returns:
        A dict with the keys ``all_comment_num`` (total reviews),
        ``good_comment_num`` (positive), ``middle_comment_num`` (neutral),
        ``bad_comment_num`` (negative) and ``good_rate`` (positive rate),
        holding the values exactly as they appear in the JSON response.

    Raises:
        ValueError: If no ``productId`` is found in the page, or the comment
            endpoint returns an empty or non-JSON body.
        KeyError: If the JSON response lacks ``data.list.summary`` or one of
            the expected summary fields.
    """
    html = urllib2.urlopen(url).read()

    def _field(name, value_pattern):
        # Capture the quoted value directly instead of eval()-ing scraped
        # text, so nothing taken from the page is ever executed.
        found = re.search(r'"%s":"(%s)"' % (name, value_pattern), html)
        return found.group(1) if found else None

    productId = _field("productId", r"\d+")
    if productId is None:
        raise ValueError("productId not found in page: %s" % url)
    # The original read all three identifiers from the single productId match.
    # Each one is now looked up under its own key, falling back to productId
    # (the old behaviour) when the page does not carry it.
    # NOTE(review): assumes the page embeds "categoryPath":"<digits and dots>"
    # and "mainProductId":"<digits>" next to productId -- confirm on a live page.
    categoryPath = _field("categoryPath", r"[\d.]+") or productId
    mainProductId = _field("mainProductId", r"\d+") or productId

    # Build the URL of the Ajax endpoint that serves the comment list.
    json_url = (
        "http://product.dangdang.com/index.php?r=comment%2Flist"
        "&productId={productId}&categoryPath={categoryPath}"
        "&mainProductId={mainProductId}&mediumId=0&pageIndex=1&sortType=1"
        "&filterType=1&isSystem=1&tagId=0&tagFilterCount=0"
    ).format(productId=productId, categoryPath=categoryPath,
             mainProductId=mainProductId)

    # getJsonText() returns '' on failure; report that explicitly instead of
    # letting json.loads fail with an unrelated-looking decode error.
    json_text = getJsonText(json_url)
    if not json_text:
        raise ValueError("empty comment response from: %s" % json_url)
    summary = json.loads(json_text)["data"]["list"]["summary"]

    return {
        "all_comment_num": summary["total_comment_num"],           # total reviews
        "good_comment_num": summary["total_crazy_count"],          # positive reviews
        "middle_comment_num": summary["total_indifferent_count"],  # neutral reviews
        "bad_comment_num": summary["total_detest_count"],          # negative reviews
        "good_rate": summary["goodRate"],                          # positive rate
    }

def main(pages=25, per_page=20, output="test.xls"):
    """Scrape the Dangdang 24-hour bestseller list into an Excel workbook.

    For every listing page the book entries are read from the HTML, each
    book's own page is queried through ``getgood`` for its review statistics,
    and one row per book is printed and written to the sheet.

    Args:
        pages: Number of listing pages to fetch, starting at page 1.
        per_page: Maximum number of books taken from each page; also the
            multiplier used to turn (page, position) into the running rank.
        output: Path of the ``.xls`` file to write.

    The workbook is saved once when the loop ends. It is also saved if an
    error interrupts the scrape, so rows collected so far are not lost; the
    error is then re-raised.
    """
    wb = xlwt.Workbook()
    sheet1 = wb.add_sheet("Sheet")
    headers = (u"序号", u"书名", u"价格", u"折扣", u"评论数",
               u"好评", u"中评", u"差评", u"好评率")
    for col, header in enumerate(headers):
        sheet1.write(0, col, header)

    list_url = ("http://bang.dangdang.com/books/bestsellers/"
                "01.00.00.00.00.00-24hours-0-0-1-%d")
    try:
        for page in range(pages):
            listing = urllib2.urlopen(list_url % (page + 1)).read()
            soup = BeautifulSoup(listing, "lxml")

            # Four parallel lists, one element per book on the page.
            bookname = soup.find_all("div", attrs={"class": "name"})[:per_page]
            bookstar = soup.find_all("div", attrs={"class": "star"})[:per_page]
            bookprice = soup.find_all("div", attrs={"class": "price"})[:per_page]
            bookoff = soup.find_all("span", attrs={"class": "price_s"})[:per_page]

            # zip() stops at the shortest list, so a page with fewer entries
            # than expected is processed as far as it goes instead of raising
            # IndexError.
            entries = zip(bookname, bookstar, bookprice, bookoff)
            for i, (name, star, price, off) in enumerate(entries):
                rank = page * per_page + i + 1
                link = name.find("a")
                stats = getgood(link["href"])

                row = (
                    rank,                         # running number
                    link["title"],                # book title
                    # presumably drops a leading currency sign -- confirm
                    price.find("span").text[1:],  # price
                    # presumably drops a trailing discount-unit character -- confirm
                    off.text[:-1],                # discount
                    # presumably drops a 3-character suffix after the count -- confirm
                    star.find("a").text[:-3],     # review count shown in the listing
                    stats["good_comment_num"],    # positive reviews
                    stats["middle_comment_num"],  # neutral reviews
                    stats["bad_comment_num"],     # negative reviews
                    stats["good_rate"],           # positive rate
                )

                # Format every value before joining so numeric JSON values
                # cannot break the concatenation; the trailing space matches
                # the original output.
                print(u" ".join(u"%s" % (value,) for value in row) + u" ")

                for col, value in enumerate(row):
                    sheet1.write(rank, col, value)
    finally:
        # One save instead of rewriting the whole file after every row.
        wb.save(output)

# Run the scraper only when this file is executed as a script, so that
# importing the module does not start network requests.
if __name__ == "__main__":
    main()

 

爬虫爬当当网书籍信息

标签:title   imp   sha   split   format   find   书籍   .text   bs4   

原文地址:https://www.cnblogs.com/general10/p/8979389.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!