标签:ref 出图 limit 基础上 run resume 过程 验证 字典
# Crawl the Sogou homepage and persist its HTML source to disk.
import requests

# 1. Specify the URL.
url = 'https://www.sogou.com/'
# 2. Send a GET request; the return value is a Response object.
response = requests.get(url=url)
# 3. Extract the response body as text (str).
page_text = response.text
# 4. Persist the page source to a local file.
with open('sogou.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
# A simple page collector: the URL's query parameter is supplied dynamically.
url = 'https://www.sogou.com/web'
# Build the query parameters from user input.
wd = input('enter a key:')
params = {
    'query': wd
}
# The parameter dict must be passed via the `params` keyword of requests.get.
response = requests.get(url=url, params=params)
page_text = response.text
fileName = wd + '.html'
with open(fileName, 'w', encoding='utf-8') as fp:
    fp.write(page_text)
# Fixing mojibake; an alternative per-string fix is:
#   img_name = img_name.encode('iso-8859-1').decode('gbk')
url = 'https://www.sogou.com/web'
# Build the query parameters from user input.
wd = input('enter a key:')
params = {
    'query': wd
}
# The parameter dict must be passed via the `params` keyword of requests.get.
response = requests.get(url=url, params=params)
response.encoding = 'utf-8'  # override the detected response encoding
page_text = response.text
fileName = wd + '.html'
with open(fileName, 'w', encoding='utf-8') as fp:
    fp.write(page_text)
# Defeat User-Agent based anti-crawling by spoofing a browser UA.
url = 'https://www.sogou.com/web'
# Build the query parameters from user input.
wd = input('enter a key:')
params = {
    'query': wd
}
headers = {
    # UA spoofing: pretend to be a desktop Chrome browser.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
# Pass both the query params and the spoofed headers to requests.get.
response = requests.get(url=url, params=params, headers=headers)
response.encoding = 'utf-8'  # override the detected response encoding
page_text = response.text
fileName = wd + '.html'
with open(fileName, 'w', encoding='utf-8') as fp:
    fp.write(page_text)
# Target page:
#   https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=
# Analysis: when the scrollbar reaches the bottom, the page refreshes
# partially (an AJAX request) -- the movie data is dynamically loaded and is
# fetched by a separate request to the JSON endpoint below.
url = 'https://movie.douban.com/j/chart/top_list'
start = input('您想从第几部电影开始获取:')
limit = input('您想获取多少电影数据:')
params = {
    'type': '13',
    'interval_id': '100:90',
    'action': '',
    'start': start,
    'limit': limit,
}
response = requests.get(url=url, params=params, headers=headers)
page_text = response.json()  # .json() returns the deserialized object (a list of dicts)
for dic in page_text:
    print(dic['title'] + ':' + dic['score'])
# KFC store lookup: http://www.kfc.com.cn/kfccda/storelist/index.aspx
# The store list is loaded dynamically via a POST request.
url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
for page in range(1, 5):
    data = {
        'cname': '',
        'pid': '',
        'keyword': '西安',
        'pageIndex': str(page),
        'pageSize': '10',
    }
    # POST form fields go in `data` (not `params`).
    response = requests.post(url=url, headers=headers, data=data)
    print(response.json())
需求
需求分析
如何检测页面中是否存在动态加载的数据?
requests作用:模拟浏览器发起请求
urllib:requests的前身
requests模块的编码流程:
参数动态化:
UA检测(反爬机制):
动态加载的数据
如果我们要对一个陌生的网站进行指定数据的爬取?
数据解析
数据解析的作用:
数据解析的方式:
正则
糗图爬取1-3页的图片,使用爬虫将前3页对应的页面源码进行爬取
通用的URL模板(不可变)
# Crawl all thumbnail images from pages 1-3 of qiushibaike.
import re, os
import requests
from urllib import request

dirName = './imgLibs'
if not os.path.exists(dirName):
    os.mkdir(dirName)
# Generic URL template (the page number is the only variable part).
url = 'https://www.qiushibaike.com/pic/page/%d'
for page in range(1, 4):
    new_url = format(url % page)  # format() is redundant here; `url % page` suffices
    page_text = requests.get(new_url, headers=headers).text
    # Focused crawl: pull every image src out of the page source with a regex.
    ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
    img_src_list = re.findall(ex, page_text, re.S)
    for src in img_src_list:
        src = 'https:' + src  # the scraped src is protocol-relative
        img_name = src.split('/')[-1]
        img_path = dirName + '/' + img_name  # ./imgLibs/xxxx.jpg
        request.urlretrieve(src, filename=img_path)
        print(img_name, '下载成功!!!')
requests 和 request 比较
# Image download with requests.
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
url = 'https://pic.qiushibaike.com/pictures/12223/122231866/medium/IZ3H2HQN8W52V135.jpg'
# BUG FIX: the original passed `headers` positionally, which binds it to the
# `params` argument of requests.get; it must be passed as a keyword argument.
img_data = requests.get(url, headers=headers).content  # bytes
with open('./img.jpg', 'wb') as fp:
    fp.write(img_data)
# Image download with urllib.request (no custom headers possible this way).
from urllib import request

url = 'https://pic.qiushibaike.com/pictures/12223/122231866/medium/IZ3H2HQN8W52V135.jpg'
request.urlretrieve(url, filename='./qiutu.jpg')
bs4
bs4解析的原理
环境安装:
beautifulSoup的实例化:
# Instantiate BeautifulSoup from a local HTML file.
from bs4 import BeautifulSoup

# FIX: the original left the file handle open; `with` guarantees it is closed
# (BeautifulSoup reads the whole handle inside the constructor).
with open('./test.html', 'r', encoding='utf-8') as fp:
    soup = BeautifulSoup(fp, 'lxml')
soup.div  # attribute access returns the first matching <div> tag
定位标签的操作:
取属性:
文本:
xpath
xpath解析的实现原理
环境安装:
etree对象的实例化:
etree.parse('test.html') # parse 解析
etree.HTML(page_text)
from lxml import etree

tree = etree.parse('./test.html')  # returns an ElementTree-like object
# Locating method 1: absolute path from the document root.
tree.xpath('/html/head/title')  # e.g. [<Element title at 0x...>]
# Locating method 2: // searches from anywhere; / searches from the root.
tree.xpath('//title')
# Locating method 3: // inside a path crosses multiple levels.
tree.xpath('/html/body//p')  # every <p> anywhere under <body>
tree.xpath('//div[@class="song"]')  # locate by attribute
tree.xpath('//li[7]')  # locate by index -- xpath indices start at 1
# /text() takes only the node's direct text.
tree.xpath('//a[@id="feng"]/text()')[0]  # xpath always returns a list, even for one hit
# //text() takes all text under the node.
tree.xpath('//div[@class="song"]//text()')
# @attr takes an attribute value.
tree.xpath('//a[@id="feng"]/@href')
定位:
取文本:
tree.xpath('//div[@class="song"]//text()')
tree.xpath('//a[@id="feng"]/text()')[0] 返回的是列表,一个元素也以列表形式返回
取属性:tree.xpath('//a[@id="feng"]/@href')
pyquery:自学
数据解析的通用原理
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
# How to crawl a binary image: use .content (bytes), not .text.
url = 'https://pic.qiushibaike.com/system/pictures/12223/122231866/medium/IZ3H2HQN8W52V135.jpg'
img_data = requests.get(url, headers=headers).content  # bytes
with open('./img.jpg', 'wb') as fp:
    fp.write(img_data)
import re
import os
import requests
# FIX: `request.urlretrieve` is used below but was never imported in this snippet.
from urllib import request

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
# Crawl all images from pages 1-3 of qiushibaike.
# 1. Use a generic crawler to fetch the source of the first 3 pages.
# Generic URL template (the page number is the only variable part).
dirName = './imgLibs'
if not os.path.exists(dirName):
    os.mkdir(dirName)
url = 'https://www.qiushibaike.com/pic/page/%d/'
for page in range(1, 4):
    new_url = format(url % page)
    page_text = requests.get(new_url, headers=headers).text  # source of this page
    # 2. Focused crawl on top of the generic crawl: parse the image URLs
    #    out of each page's source.
    ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
    img_src_list = re.findall(ex, page_text, re.S)
    for src in img_src_list:
        src = 'https:' + src  # the scraped src is protocol-relative
        img_name = src.split('/')[-1]
        img_path = dirName + '/' + img_name  # ./imgLibs/xxxx.jpg
        request.urlretrieve(src, filename=img_path)
        print(img_name, '下载成功!!!')
# Crawl the whole "Romance of the Three Kingdoms" (chapter title + content)
# from http://www.shicimingju.com/book/sanguoyanyi.html
from bs4 import BeautifulSoup  # FIX: used below but not imported in this snippet

fp = open('sanguo.txt', 'w', encoding='utf-8')
main_url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
page_text = requests.get(main_url, headers=headers).text
# Parse chapter titles and the detail-page URL of each chapter.
soup = BeautifulSoup(page_text, 'lxml')
a_list = soup.select('.book-mulu > ul > li > a')  # a list of <a> tags
for a in a_list:
    title = a.string
    detail_url = 'http://www.shicimingju.com' + a['href']
    detail_page_text = requests.get(detail_url, headers=headers).text
    # Parse the chapter content out of the detail page.
    soup = BeautifulSoup(detail_page_text, 'lxml')
    content = soup.find('div', class_='chapter_content').text
    fp.write(title + ':' + content + '\n')
    print(title, '下载成功!')
fp.close()
# http://pic.netbian.com/4kmeinv/ -- handling garbled Chinese file names.
from lxml import etree  # FIX: used below but not imported in this snippet

dirName = './meinvLibs'
if not os.path.exists(dirName):
    os.mkdir(dirName)
url = 'http://pic.netbian.com/4kmeinv/index_%d.html'
for page in range(1, 11):
    # Page 1 has no index suffix on this site.
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeinv/'
    else:
        new_url = format(url % page)
    page_text = requests.get(new_url, headers=headers).text
    tree = etree.HTML(page_text)
    a_list = tree.xpath('//div[@class="slist"]/ul/li/a')
    for a in a_list:
        img_src = 'http://pic.netbian.com' + a.xpath('./img/@src')[0]
        img_name = a.xpath('./b/text()')[0]
        # The site serves GBK text mis-decoded as latin-1; round-trip to fix it.
        img_name = img_name.encode('iso-8859-1').decode('gbk')
        img_data = requests.get(img_src, headers=headers).content
        imgPath = dirName + '/' + img_name + '.jpg'
        with open(imgPath, 'wb') as fp:
            fp.write(img_data)
        print(img_name, '下载成功!!!')
# https://www.aqistudy.cn/historydata/ -- scrape every city name.
page_text = requests.get('https://www.aqistudy.cn/historydata/', headers=headers).text
tree = etree.HTML(page_text)
# hot_cities = tree.xpath('//div[@class="bottom"]/ul/li/a/text()')
# all_cities = tree.xpath('//div[@class="bottom"]/ul/div[2]/li/a/text()')
# The | operator unions both expressions, making one generic xpath.
cities = tree.xpath('//div[@class="bottom"]/ul/div[2]/li/a/text() | //div[@class="bottom"]/ul/li/a/text()')
cities
"""
从http://www.shicimingju.com/book 爬取所有书籍保存到本地
"""
import requests,os
from urllib import request
from lxml import etree
dirName = ‘./books‘
if not os.path.exists(dirName):
os.mkdir(dirName)
headers={
‘User-Agent‘: ‘Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36‘
}
url = ‘http://www.shicimingju.com/book‘
page_text = requests.get(url,headers=headers).text
# print(page_text)
tree = etree.HTML(page_text)
a_list = tree.xpath(‘//div[@class="bookmark-list"]//a‘)
for a in a_list:
bookname = a.xpath(‘./text()‘)[0]
book_path = "http://www.shicimingju.com" + a.xpath(‘./@href‘)[0]
#print(bookname,book_path) # 不取第一个元素的话返回的是列表 [‘三国演义‘] [‘/book/sanguoyanyi.html‘]
book_page = requests.get(book_path,headers=headers).text
tree = etree.HTML(book_page)
book_a_list = tree.xpath(‘//div[@class="book-mulu"]//a‘)
path = dirName + ‘/‘ + bookname
with open(path,‘w‘,encoding=‘utf-8‘) as f:
for a in book_a_list:
title = a.xpath(‘./text()‘)[0]
detail_path = ‘http://www.shicimingju.com‘+a.xpath(‘./@href‘)[0]
detail_page = requests.get(detail_path,headers=headers).text
content = etree.HTML(detail_page).xpath(‘//div[@class="chapter_content"]//text()‘)
content = ‘‘.join(content)
f.write(title+‘:‘ + content + ‘\n‘)
print(title,"下载成功")
"""
从这里爬取免费的简历模板
第一页:‘http://sc.chinaz.com/jianli/free.html‘
第其它页:f‘http://sc.chinaz.com/jianli/free_{i}.html‘
"""
import requests,os
from urllib import request
from lxml import etree
dirName = ‘./resume_template‘
if not os.path.exists(dirName):
os.mkdir(dirName)
headers={
‘User-Agent‘: ‘Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36‘,
‘Connection‘: ‘close‘
}
count = 0
for i in range(1,11):
if i==1:
url = ‘http://sc.chinaz.com/jianli/free.html‘
else:
url = f‘http://sc.chinaz.com/jianli/free_{i}.html‘
page_text = requests.get(url,headers=headers).text
tree = etree.HTML(page_text)
a_list = tree.xpath(‘//a[@class="title_wl"]‘)
for a in a_list:
title = a.xpath(‘./text()‘)[0]
title = title.encode(‘iso-8859-1‘).decode(‘utf8‘)
detail_path = a.xpath(‘./@href‘)[0]
# print(title,detail_path)
page_detail = requests.get(detail_path,headers=headers).text
tree = etree.HTML(page_detail)
download_url = tree.xpath(‘//ul[@class="clearfix"]/li[1]/a/@href‘)[0]
# print(f"{title}={download_url}")
if ‘src‘ in download_url:
print(f‘\033[1;35;47m{title}:下载链接无效,跳过\033[0m‘)
continue
path = dirName + ‘/‘ + title + ‘.rar‘
ret = request.urlretrieve(download_url,path)
print (ret)
print(title,‘下载成功‘)
count += 1
print(count)
原因:
解决:
导入使用
# Thread pool (multiprocessing.dummy.Pool is a thread-based Pool clone).
from multiprocessing.dummy import Pool
import time

start = time.time()
urls = [
    'www.1.com',
    'www.1.com',
    'www.1.com',
]

def get_request(url):
    """Simulate a 2-second download of `url`."""
    print('正在下载', url)
    time.sleep(2)  # BUG FIX: bare `sleep` was never imported; only `time` is
    print('下载结束', url)

pool = Pool(3)
# map blocks until every url has been processed by the pool.
pool.map(get_request, urls)
print('总耗时', time.time() - start)
协程
import asyncio
import time

# Callback invoked when the task finishes; it receives the task object.
def callback(task):
    print('i am callback and', task.result())

start = time.time()

async def test():
    # BUG FIX: 'ayncio' was a typo, and asyncio.sleep must be awaited,
    # otherwise it only creates a coroutine object and never sleeps.
    await asyncio.sleep(2)
    print('i am test()')
    return 'xxxx'

c = test()  # c is a coroutine object
# Wrap the coroutine into a Task object.
task = asyncio.ensure_future(c)
# BUG FIX: add_done_callback requires the callback as its argument.
task.add_done_callback(callback)
# Create the event loop object.
loop = asyncio.get_event_loop()
# Start the event loop and run until the task completes.
loop.run_until_complete(task)
_____________________________________________
_____________________________________________
# Multi-task asyncio demo.
import asyncio
import time
import requests

start = time.time()  # FIX: `start` was never defined in this snippet

# Inside a coroutine ("special function") no blocking, non-async-aware
# module calls may appear -- hence asyncio.sleep, not time.sleep.
async def get_request(url):
    await asyncio.sleep(2)
    print('下载成功', url)

urls = [
    'www.1.com',
    'www.2.com',
]
tasks = []
for url in urls:
    c = get_request(url)
    task = asyncio.ensure_future(c)
    tasks.append(task)
loop = asyncio.get_event_loop()
# Note: suspending a list of tasks must be done manually via asyncio.wait.
loop.run_until_complete(asyncio.wait(tasks))
print(time.time() - start)
任务对象
事件循环
aiohttp:支持异步网络请求的模块
标签:ref 出图 limit 基础上 run resume 过程 验证 字典
原文地址:https://www.cnblogs.com/he-qing-qing/p/12708243.html