标签:params 页码 复数 page 链接 with dex off ext
# Download the image for every page number of the chinaz tag listing.
# NOTE(review): relies on `requests`, `re`, `os` and a `headers` dict defined
# earlier in the file — confirm they are in scope before running.
if not os.path.exists('./imgLib'):
    os.mkdir('./imgLib')
# Generic URL template; page 1 uses a different path on the site (see below).
url_model = 'http://sc.chinaz.com/tag_tupian/OuMeiMeiNv_%d.html'
for page in range(1, 5):
    print('正在下载第%d页的数据......' % page)
    if page == 1:
        # The first page has no page number in its URL.
        url = 'http://sc.chinaz.com/tag_tupian/OuMeiMeiNv.htm'
    else:
        url = url_model % page
    page_text = requests.get(url=url, headers=headers).text
    # Extract the image addresses with a regex.
    # re.S lets '.' match newlines so the pattern can span multiple lines of
    # raw HTML — without it the match stops at the first line break.
    re_ex = '<a target="_blank".*?<img src2="(.*?)" alt.*?</a>'
    img_src = re.findall(re_ex, page_text, re.S)
    for src in img_src:
        # Use the last URL path segment as the file name.
        img_name = src.split('/')[-1]
        img_path = './imgLib/' + img_name
        img_data = requests.get(url=src, headers=headers).content
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
        print(img_name, '下载成功!')
# BeautifulSoup basics: load a local HTML file and demonstrate the core
# selection APIs. The bare expressions below are notebook-style demos — in a
# plain script they evaluate and discard their results.
from bs4 import BeautifulSoup
# `with` closes the file handle even if parsing raises (the original leaked it).
with open('./test.html', 'r', encoding='utf-8') as fp:
    soup = BeautifulSoup(fp, 'lxml')
# print(soup)  # the full page source loaded into the object
soup.title                            # first <title> tag
soup.div                              # first <div> tag
soup.find('div', class_='song')       # first div with class "song"
soup.find_all('div', class_='song')   # every div with class "song"
soup.select('.song')                  # CSS class selector
soup.select('#feng')                  # CSS id selector
soup.select('.tang > ul > li > a ')   # direct-child chain
soup.select('.tang a')                # descendant at any depth
soup.title.string                     # direct text of the tag only
soup.title.text                       # all nested text
soup.find('div', class_='song').text
soup.find('a', id="feng")['href']     # attribute access via subscription
# Scrape the chapter index of "Romance of the Three Kingdoms" and write every
# chapter title + body to ./sanguo.txt.
url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
page_text = requests.get(url=url, headers=headers).text
# Parse the index page; this soup only covers the table-of-contents page.
soup = BeautifulSoup(page_text, 'lxml')
a_list = soup.select('.book-mulu > ul > li > a')
# `with` guarantees the output file is closed even if a request/parse fails
# (the original used a manual open/close pair).
with open('./sanguo.txt', 'w', encoding='utf-8') as fp:
    for a in a_list:
        title = a.string
        detail_url = 'http://www.shicimingju.com' + a['href']
        detail_page_text = requests.get(url=detail_url, headers=headers).text
        # Parse the chapter body from the detail page.
        detail_soup = BeautifulSoup(detail_page_text, 'lxml')
        div_tag = detail_soup.find('div', class_='chapter_content')
        content = div_tag.text
        fp.write(title + ':' + content + '\n')
        print(title, '已经下载成功!!!')
# Scrape joke titles/authors and bodies from qiushibaike text pages 1-3.
# NOTE(review): relies on `requests`, `etree` (lxml) and `headers` defined
# earlier in the file.
url_model = 'https://www.qiushibaike.com/text/page/%d/'
for page in range(1, 4):
    url = url_model % page
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    # Each child div wraps one joke; this xpath is a global (whole-document)
    # parse.
    div_list = tree.xpath('//div[@class="col1 old-style-col1"]/div')
    for div in div_list:
        # div is an lxml Element for one joke block. Starting a path with
        # './' scopes the xpath to this local subtree — it can only reach
        # tags inside this div, not the whole document.
        author = div.xpath('./div[1]/a[2]/h2/text()')[0]
        content = div.xpath('./a[1]/div/span//text()')
        content = ''.join(content)
        print(author, content)
# Download wallpaper thumbnails from pic.netbian.com via xpath parsing.
import os
from urllib import request
dirName = 'imgLibs'
if not os.path.exists(dirName):
    os.mkdir(dirName)
url = 'http://pic.netbian.com/4kmeinv/'
response = requests.get(url, headers=headers)
# The site serves GBK-encoded pages; override the encoding so the Chinese
# image names decode correctly.
response.encoding = 'gbk'
page_text = response.text
tree = etree.HTML(page_text)
# Parse image name + image link out of every list item.
li_list = tree.xpath('//*[@id="main"]/div[3]/ul/li')
for li in li_list:
    # Local parse: paths start with './' so they stay inside this <li>.
    img_name = li.xpath('./a/img/@alt')[0] + '.jpg'
    img_src = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
    img_path = os.path.join(dirName, img_name)  # local storage path
    request.urlretrieve(img_src, img_path)
    print(img_name, '下载成功!!!')
# Huawei Vmall offline-shop API: POST for the list of shops in one city,
# then GET each shop's detail by id and print its address and service hours.
main_url = 'https://openapi.vmall.com/mcp/offlineshop/getShopList'
data = {"portal": 2, "lang": "zh-CN", "country": "CN", "brand": 1,
        "province": "河北", "city": "邯郸", "pageNo": 1, "pageSize": 40}
# json=data sends the payload as a JSON request body (not form-encoded).
main_json_data = requests.post(url=main_url, headers=headers, json=data).json()
for dic in main_json_data['shopInfos']:
    id_ = dic['id']
    url = 'https://openapi.vmall.com/mcp/offlineshop/getShopById'
    params = {
        'portal': '2',
        'version': '10',
        'country': 'CN',
        'shopId': id_,
        'lang': 'zh-CN',
    }
    json_data = requests.get(url=url, headers=headers, params=params).json()
    address = json_data['shopInfo']['address']
    time_ = json_data['shopInfo']['serviceTime']
    print(address, time_)
# Scrape city names from aqistudy's history-data page.
url = 'https://www.aqistudy.cn/historydata/'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
# '|' unions two xpath expressions: the hot-city block (ul/div[2]/li) and the
# full city list (ul/li) are matched in a single query.
all_cities = tree.xpath('//div[@class="bottom"]/ul/div[2]/li/a/text() | //div[@class="bottom"]/ul/li/a/text()')
all_cities  # notebook-style display of the result; no effect in a script
标签:params 页码 复数 page 链接 with dex off ext
原文地址:https://www.cnblogs.com/freedom0923/p/13155959.html