Python_爬虫_豆瓣（存数据库、图片）

时间：2018-03-22 10:50:55 阅读：201 评论：0 收藏：0 [点我收藏+]

豆瓣3.21

# coding: utf-8
# Scrape Douban book information and cover images, and store them in a database.

import json
import os
from urllib import request

import pymysql
from lxml import etree

# from bs4 import BeautifulSoup
# from my_pymysql import pymysql

# Listing page to scrape (Douban books tagged "小说" / fiction).
url = "https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4"
# Browser-like headers sent with the listing-page request.
headers = {
    'Host': 'book.douban.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
}

# Directory (relative to the working directory) where cover images are saved.
IMAGE_DIR = 'douban'
# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 30
# Parameterized insert: the driver escapes the values, so scraped text that
# contains quotes can neither break the statement nor inject SQL.
INSERT_SQL = "insert into douban(title,author,chubanshe,pingfen) values (%s,%s,%s,%s)"


def first_or(values, default=''):
    """Return the first element of *values*, or *default* when it is empty."""
    return values[0] if values else default


def split_info(info):
    """Split a raw info line into ``(author, publisher)``.

    The line is split on '/'. The first field is taken as the author with
    newlines and spaces removed; the second field is taken as the publisher
    with surrounding whitespace stripped. A line with no '/' yields an empty
    publisher instead of raising ``IndexError``.
    """
    parts = info.split('/')
    author = parts[0].replace('\n', '').replace(' ', '')
    publisher = parts[1].strip() if len(parts) > 1 else ''
    return author, publisher


def image_path(title):
    """Return the file path under ``IMAGE_DIR`` where the cover of *title* is saved.

    Path separators in the title are replaced with '_' so that the title
    cannot point into a different (possibly non-existent) directory.
    """
    safe_title = title.replace('/', '_').replace('\\', '_')
    return IMAGE_DIR + '/' + safe_title + '.jpg'


def fetch_book_items():
    """Download the listing page and return the ``<li>`` elements of the book list."""
    req = request.Request(url=url, headers=headers, method="GET")
    with request.urlopen(req, timeout=REQUEST_TIMEOUT) as resp:
        content = resp.read().decode("utf-8")
    tree = etree.HTML(content)
    # One <li> per book inside the element with id="subject_list".
    return tree.xpath(r'//*[@id="subject_list"]/ul/li')


def save_cover(img_url, title):
    """Download *img_url* and write the bytes to the image path for *title*."""
    with request.urlopen(img_url, timeout=REQUEST_TIMEOUT) as resp:
        data = resp.read()
    with open(image_path(title), 'wb') as f:
        f.write(data)


def main():
    """Scrape the listing page, save each cover image and insert each book row.

    A single database connection is used for the whole run and is always
    closed, even when scraping fails part-way. Each row is committed
    individually so that rows stored before a failure are kept.
    """
    items = fetch_book_items()
    os.makedirs(IMAGE_DIR, exist_ok=True)

    db = pymysql.connect(host='localhost', port=3306, user="root", password='root', db='douban', charset='utf8')
    try:
        with db.cursor() as cur:
            for li in items:
                # Title. XPath expressions are relative to the current <li>.
                title = first_or(li.xpath(r'div[2]/h2/a/@title')).replace(" ", '')
                if not title:
                    # Without a title there is nothing meaningful to store.
                    continue
                print(title)

                # Author and publisher come from the first text node of the info line.
                author, chubanshe = split_info(first_or(li.xpath(r'div[2]/div[1]/text()')))
                print(author)
                print(chubanshe)

                # Rating (left empty when the entry has none).
                pingfen = first_or(li.xpath(r'div[2]/div[2]/span[2]/text()'))
                print(pingfen)

                # Cover image.
                img_net_addr = first_or(li.xpath(r'div[1]/a/img/@src'))
                if img_net_addr:
                    print(img_net_addr)
                    save_cover(img_net_addr, title)

                # Database row.
                cur.execute(INSERT_SQL, (title, author, chubanshe, pingfen))
                db.commit()
    finally:
        db.close()


if __name__ == "__main__":
    main()

采集豆瓣书信息和图片，写进数据库

Python_爬虫_豆瓣（存数据库、图片）

标签：技术作者爬虫 eth img 相对 word one body

原文地址：https://www.cnblogs.com/hellangels333/p/8621368.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行