爬虫爬当当网书籍信息

时间：2018-05-02 13:18:55 阅读：306 评论：0 收藏：0 [点我收藏+]

标签：title imp sha split format find 书籍 .text bs4

拖了好久的一个爬虫

先上代码，文字说明以后慢慢补。

 1 # -*- coding: utf-8 -*
 2 
 3 import urllib2
 4 import xlwt
 5 from bs4 import BeautifulSoup
 6 from datashape import json
 7 import re
 8 import json
 9 import requests
10 
11 
def getJsonText(url, timeout=1):
    """Download ``url`` with ``requests`` and return the body as text.

    Args:
        url: Address to request.
        timeout: Seconds handed to ``requests.get``. Defaults to 1, the
            value that used to be hard-coded here.

    Returns:
        The response text, decoded with the encoding detected from the
        body, or an empty string when the request fails.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        # Trust the encoding detected from the payload over the HTTP header.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request/HTTP failures are swallowed; a bare ``except`` would
        # also hide KeyboardInterrupt and genuine programming errors.
        print('获取失败')
        return ''
21 
22 
def _extract_quoted_field(html, key, value_pattern, default=None):
    """Return the value of the first ``"key":"value"`` pair found in ``html``.

    ``value_pattern`` is the regular expression the value must match;
    ``default`` is returned when no such pair exists.
    """
    found = re.search(r'"%s":"(%s)"' % (re.escape(key), value_pattern), html)
    return found.group(1) if found else default


def getgood(url):
    """Collect the comment statistics for the product page at ``url``.

    The product page is downloaded, the identifiers needed by the comment
    endpoint are pulled out of it with regular expressions, and the JSON
    returned by that endpoint is reduced to a small dict.

    Args:
        url: Address of a product detail page.

    Returns:
        A dict with the keys ``all_comment_num``, ``good_comment_num``,
        ``middle_comment_num``, ``bad_comment_num`` and ``good_rate``,
        copied from the ``summary`` object of the JSON response.

    Raises:
        ValueError: if the page contains no ``"productId":"<digits>"`` pair
            or the comment JSON could not be downloaded.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Always release the connection, even if reading fails.
        response.close()

    # Capture the values directly with regex groups instead of eval()-ing
    # text that comes from a remote page.
    productId = _extract_quoted_field(html, 'productId', r'\d+')
    if productId is None:
        raise ValueError('productId not found in page: %s' % url)
    # The previous code reused the productId match for both of these fields.
    # Each is now looked up under its own key, falling back to productId
    # (the old behaviour) when the page has no such pair.
    # NOTE(review): assumes categoryPath is made of digits and dots - confirm
    # against a live page.
    categoryPath = _extract_quoted_field(
        html, 'categoryPath', r'[\d.]+', default=productId)
    mainProductId = _extract_quoted_field(
        html, 'mainProductId', r'\d+', default=productId)

    # Build the URL of the Ajax comment endpoint.
    json_url = 'http://product.dangdang.com/index.php?r=comment%2Flist&productId={productId}&categoryPath={categoryPath}&mainProductId={mainProductId}&mediumId=0&pageIndex=1&sortType=1&filterType=1&isSystem=1&tagId=0&tagFilterCount=0'.format(
        productId=productId, categoryPath=categoryPath, mainProductId=mainProductId)

    # getJsonText returns '' on failure; report that explicitly instead of
    # letting json.loads fail with an unrelated-looking message.
    json_text = getJsonText(json_url)
    if not json_text:
        raise ValueError(
            'could not download comment summary for product %s' % productId)
    summary = json.loads(json_text)['data']['list']['summary']

    return {
        'all_comment_num': summary['total_comment_num'],            # total comments
        'good_comment_num': summary['total_crazy_count'],           # positive comments
        'middle_comment_num': summary['total_indifferent_count'],   # neutral comments
        'bad_comment_num': summary['total_detest_count'],           # negative comments
        'good_rate': summary['goodRate'],                           # positive rate
    }
44 
def main():
    """Scrape the 24-hour bestseller list and write it to ``test.xls``.

    Walks 25 listing pages, reads up to 20 books from each, fetches the
    comment statistics of every book through ``getgood``, prints one line
    per book and stores the same values in an xlwt sheet. The workbook is
    saved once when the loop ends, including when it ends with an error,
    so rows collected so far are kept.
    """
    books_per_page = 20
    headers = [u'序号', u'书名', u'价格', u'折扣', u'评论数',
               u'好评', u'中评', u'差评', u'好评率']

    wb = xlwt.Workbook()
    sheet1 = wb.add_sheet("Sheet")
    for col, title in enumerate(headers):
        sheet1.write(0, col, title)

    try:
        for page in range(25):
            url = 'http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-24hours-0-0-1-%d' % (page + 1)
            response = urllib2.urlopen(url)
            try:
                listing_html = response.read()
            finally:
                response.close()
            soup = BeautifulSoup(listing_html, 'lxml')

            bookname = soup.find_all('div', attrs={'class': 'name'})
            bookstar = soup.find_all('div', attrs={'class': 'star'})
            bookprice = soup.find_all('div', attrs={'class': 'price'})
            bookoff = soup.find_all('span', attrs={'class': 'price_s'})

            # zip() stops at the shortest list, so a page with fewer than
            # 20 entries no longer raises IndexError.
            entries = zip(bookname[:books_per_page], bookstar, bookprice, bookoff)
            for i, (name, star, price, off) in enumerate(entries):
                rank = page * books_per_page + i + 1
                link = name.find('a')
                comments = getgood(link['href'])
                row = [
                    rank,                             # sequence number
                    link['title'],                    # book title
                    # NOTE(review): the slices below presumably strip a
                    # currency sign / unit suffix - confirm on a live page.
                    price.find('span').text[1:],      # price
                    off.text[:-1],                    # discount
                    star.find('a').text[:-3],         # comment count
                    comments['good_comment_num'],     # positive comments
                    comments['middle_comment_num'],   # neutral comments
                    comments['bad_comment_num'],      # negative comments
                    comments['good_rate'],            # positive rate
                ]

                # Format every value explicitly so a numeric JSON value
                # cannot break the output with a str + int TypeError.
                print(u" ".join(u"{0}".format(value) for value in row) + u" ")

                for col, value in enumerate(row):
                    sheet1.write(rank, col, value)
    finally:
        # One save instead of rewriting the whole file after every row;
        # ``finally`` keeps the partial result if a request fails midway.
        wb.save('test.xls')


if __name__ == '__main__':
    main()

爬虫爬当当网书籍信息

标签：title imp sha split format find 书籍 .text bs4

原文地址：https://www.cnblogs.com/general10/p/8979389.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行