import requests
from bs4 import BeautifulSoup
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
}
f = open('./test.html', 'r', encoding='utf-8')
soup = BeautifulSoup(f, 'lxml')
soup.div                            # locate by tag name: the first div in the document
soup.find('div', class_='song')     # attribute lookup: the div whose class is "song";
                                    # class is a Python keyword, so bs4 spells it class_ (id is just id='xxx')
soup.find_all('a', id='feng')       # every a tag whose id is "feng"
soup.select('#feng')                # CSS id selector: the a tag with id "feng"
soup.select('.song')                # CSS class selector: tags whose class is "song"
# hierarchical selectors
soup.select('.tang > ul > li > a')  # > crosses exactly one level
soup.select('.tang a')              # a space crosses any number of levels
soup.p.string                       # text directly inside the p tag only
soup.div.text                       # all text inside the div, descendants included
soup.a['href']                      # value of the a tag's href attribute
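# The cheat sheet above assumes a local test.html. A minimal in-memory
# stand-in so the lookups can be run as-is (the class/id names song, tang,
# feng are assumptions matching the selectors used here, not the real file):
demo_html = '''
<html><body>
  <div class="song"><p>direct text</p><a id="feng" href="http://example.com">feng</a></div>
  <div class="tang"><ul><li><a href="#">nested a</a></li></ul></div>
</body></html>
'''
demo_soup = BeautifulSoup(demo_html, 'lxml')
print(demo_soup.find('div', class_='song').text)  # all text under the div
print(demo_soup.select('.tang > ul > li > a'))    # hierarchical selector
print(demo_soup.a['href'])                        # -> http://example.com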
# Use bs4 to scrape the full text of "Romance of the Three Kingdoms":
# http://www.shicimingju.com/book/sanguoyanyi.html
# From the index page, parse out each chapter title and its detail-page url.
url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
page_text = requests.get(url, headers=headers).text  # source of the index page
f = open('./sanguoyanyi.txt', 'w', encoding='utf-8')
# parse out (chapter title, detail-page url)
soup = BeautifulSoup(page_text, 'lxml')
a_list = soup.select('.book-mulu > ul > li > a')
for a in a_list:
    title = a.string
    url_detail = 'http://www.shicimingju.com' + a['href']
    # fetch the detail page and parse the chapter body out of it
    page_text_detail = requests.get(url_detail, headers=headers).text
    soup = BeautifulSoup(page_text_detail, 'lxml')
    content = soup.find('div', class_='chapter_content').text
    f.write(title + ':' + content + '\n')
    print(title, 'saved')
f.close()
from lxml import etree
import os

# etree.parse expects well-formed XML; pass an HTMLParser for ordinary HTML
tree = etree.parse('./test.html', etree.HTMLParser())
tree.xpath('/html/head')                    # absolute path from the root; returns a list of Elements
tree.xpath('//head')                        # every head tag anywhere in the document
tree.xpath('//div[@class="song"]')          # div tags whose class is "song"
tree.xpath('//li[1]')                       # the first li under each parent (xpath indexing starts at 1)
tree.xpath('//a[@id="feng"]/text()')        # text directly inside the a tag with id "feng"
tree.xpath('//div[@class="song"]//text()')  # all text anywhere under that div
tree.xpath('//a[@id="feng"]/@href')         # value of that a tag's href attribute
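# Same idea for xpath: a minimal in-memory document (the song/feng names are
# assumptions) so the expressions above can be tried without a local test.html:
demo_tree = etree.HTML('''
<html><head><title>t</title></head><body>
  <div class="song">song text <a id="feng" href="http://example.com">feng</a></div>
  <ul><li>one</li><li>two</li></ul>
</body></html>''')
print(demo_tree.xpath('//li[1]/text()'))                # ['one'] -- indexing starts at 1
print(demo_tree.xpath('//a[@id="feng"]/@href'))         # ['http://example.com']
print(demo_tree.xpath('//div[@class="song"]//text()'))  # every text node under the div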
# Scrape the images and their names, and save them locally.
# Page 1:       http://pic.netbian.com/4kmeinv/
# Other pages:  http://pic.netbian.com/4kmeinv/index_2.html
dir_name = 'imgLibs'
if not os.path.exists(dir_name):
    os.mkdir(dir_name)
url = 'http://pic.netbian.com/4kmeinv/index_%d.html'
for page in range(1, 6):
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeinv/'
    else:
        new_url = url % page  # url of every page after the first
    response = requests.get(new_url, headers=headers)
    response.encoding = 'gbk'  # the site is gbk-encoded; avoids mojibake in image names
    page_text = response.text
    tree = etree.HTML(page_text)  # parse image addresses and names out of the page source
    li_list = tree.xpath('//div[@class="slist"]/ul/li')  # global parse: one li per image
    for li in li_list:
        # local parse: ./ makes the xpath relative to the element it is called on
        img_src = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        img_name = li.xpath('./a/img/@alt')[0] + '.jpg'
        img_data = requests.get(img_src, headers=headers).content
        file_path = dir_name + '/' + img_name
        with open(file_path, 'wb') as f:
            f.write(img_data)
        print(img_name, 'saved')
# How to make an xpath expression more general (one expression covering several page layouts)
url = 'https://www.aqistudy.cn/historydata/'
page_text = requests.get(url, headers=headers).text
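# The usual trick is xpath's | (union) operator: join the expressions for the
# different layouts so a single call matches all of them. A sketch for this
# page, assuming the hot-city and all-city lists both sit under div.bottom
# (the class name and nesting are assumptions about the page structure):
tree = etree.HTML(page_text)
all_cities = tree.xpath(
    '//div[@class="bottom"]/ul/li/a/text() | '
    '//div[@class="bottom"]/ul/div[2]/li/a/text()'
)
print(len(all_cities), all_cities[:5])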