Python is a cross-platform, object-oriented, dynamically typed programming language. Python is free software: its source code and the reference interpreter, CPython, are released under a GPL-compatible free-software license (the PSF License). As new versions and language features keep arriving, Python is increasingly used for standalone, large-scale projects.
Fetching a web page quickly: use urllib's most basic fetch functionality to save the contents of the Baidu homepage to a local file.
>>> import urllib.request
>>>
>>> res = urllib.request.urlopen("https://www.baidu.com")
>>> html = res.read()              # read the body once; a second read() would return b""
>>> print(html.decode("utf-8"))
>>> f = open("./test.html", "wb")  # save a copy locally
>>> f.write(html)
>>> f.close()
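As a small refinement (not in the original), the same fetch can be written with with-blocks, so the response and the file are closed automatically even if an exception occurs:
import urllib.request

# a minimal sketch: the with-blocks handle cleanup of the socket and the file
with urllib.request.urlopen("https://www.baidu.com") as res:
    html = res.read()
with open("./test.html", "wb") as f:
    f.write(html)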
Sending a POST request: the example above fetched Baidu with a plain GET; the following sends a POST with urllib. Note that Baidu does not expose a /post endpoint, so the example below targets httpbin.org/post, a test service that echoes the request back.
>>> import urllib.parse
>>> import urllib.request
>>>
>>> data = bytes(urllib.parse.urlencode({"hello": "lyshark"}), encoding="utf-8")
>>> print(data)
b'hello=lyshark'
>>> # post to httpbin.org/post, which echoes the submitted form back
>>> response = urllib.request.urlopen('http://httpbin.org/post', data=data)
>>> print(response.read())
Setting a timeout: give the request a timeout instead of letting the program wait for a result indefinitely.
import urllib.request
response = urllib.request.urlopen('http://www.baidu.com', timeout=1)
print(response.read())
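When the timeout expires, urlopen does not return; it raises urllib.error.URLError (with socket.timeout as its reason), or in some code paths socket.timeout directly. A minimal sketch of catching both (the 0.01-second timeout is chosen here only to make the failure easy to trigger):
import socket
import urllib.error
import urllib.request

try:
    response = urllib.request.urlopen('http://www.baidu.com', timeout=0.01)
    print(response.read())
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):   # connection phase timed out
        print("request timed out")
    else:
        raise
except socket.timeout:                         # read phase timed out
    print("read timed out")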
Getting site status: status, getheaders(), and getheader("server") return the status code and header information.
>>> import urllib.request
>>>
>>> res=urllib.request.urlopen("https://www.python.org")
>>> print(type(res))
<class 'http.client.HTTPResponse'>
>>>
>>> res.status
>>> res.getheaders()
>>> res.getheader("server")
Disguising the request: add header fields to the request to customize how it presents itself to the site and avoid being blocked.
from urllib import request, parse

url = 'http://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'mkdirs.org'
}
form_data = {           # renamed from "dict" to avoid shadowing the builtin
    'name': 'LyShark'
}
data = bytes(parse.urlencode(form_data), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
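For comparison, the same POST expressed with the requests library (introduced later in this article) is much more compact; a minimal sketch:
import requests

# requests builds the urlencoded body and Content-Type header from the dict itself
headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
response = requests.post('http://www.baidu.com', data={'name': 'LyShark'}, headers=headers)
print(response.text)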
Simple URL page concatenation:
def Get_Url(target, start, ends):
    urls = []
    for i in range(start, ends):     # range() excludes ends, so 1..10 yields pages 1-9
        url = target + "/" + str(i)
        urls.append(url)
    return urls

if __name__ == "__main__":
    url = Get_Url("https://www.mzitu.com/214261", 1, 10)
    print(url)
Using the requests library:
import re
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

if __name__ == "__main__":
    ret = requests.get(url="https://www.mzitu.com/214261", headers=head, timeout=1)
    all_pic_link = re.findall('<img src="(.*?)"', ret.text, re.S)   # extract every img src attribute
    print(all_pic_link)
A simple image crawler:
import re
import urllib.request

def open_url(url):
    ret = urllib.request.Request(url)
    ret.add_header('user-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36')
    page = urllib.request.urlopen(ret)
    html = page.read().decode("utf-8")
    return html

def get_img(html):
    ret = re.findall(r'<img src="([^"]+\.jpg)"', html)     # every .jpg URL on the page
    for each in ret:
        filename = each.split("/")[-1]
        print("Full path:", each)
        print("File name:", filename)
        urllib.request.urlretrieve(each, filename, None)   # download into the current directory

if __name__ == '__main__':
    url = open_url("https://www.mzitu.com/210402")
    get_img(url)
Crawling the daily CVE vulnerability list:
import re
import requests
from bs4 import BeautifulSoup

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

def Get_CVE(url):
    new_cve = []
    ret = requests.get(url=url, headers=head, timeout=3)
    bs = BeautifulSoup(ret.text, 'html.parser')
    for i in bs.find_all('a'):        # collect the href of every <a> tag
        href = i.get('href')
        new_cve.append(href)
    return new_cve

def Get_Number(links):                # renamed from "list" to avoid shadowing the builtin
    new = []
    for i in links:
        temp = re.findall("[0-9]{1,}-.*", str(i))
        if temp:                      # findall returns a list; index the match out of it
            new.append("CVE-{}".format(temp[0]))
    return new

if __name__ == "__main__":
    url = "https://cassandra.cerias.purdue.edu/CVE_changes/today.html"
    cve = Get_CVE(url)
    number = Get_Number(cve)
    for i in number:
        print("Today's vulnerability:", i)
A simple crawl of xicidaili proxy addresses: here we use plain regex matching, which is a rather clumsy approach.
import re
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
ret = requests.get(url="https://www.xicidaili.com/nn/1", headers=head, timeout=3)
data = re.findall('<td>.*</td>', ret.text)
sum = 0
for i in range(0, 20):
    # each proxy row contributes five consecutive <td> fields
    IP = data[sum].replace("<td>", "").replace("</td>", "")
    Port = data[sum+1].replace("<td>", "").replace("</td>", "")
    Type = data[sum+2].replace("<td>", "").replace("</td>", "")
    times = data[sum+3].replace("<td>", "").replace("</td>", "")
    year = data[sum+4].replace("<td>", "").replace("</td>", "")
    print("IP: {} Port: {} Type: {} Lifetime: {} Time: {}".format(IP, Port, Type, times, year))
    sum = sum + 5
BeautifulSoup locating techniques: using the bs4 library requires installing three dependency packages: pip install requests bs4 lxml
from bs4 import BeautifulSoup
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
ret = requests.get(url="https://lyshark.cnblogs.com", headers=head, timeout=3)
ret.encoding = "utf-8"  # change this if the output comes out garbled
bs = BeautifulSoup(ret.text, "lxml")
# find all link tags inside the head node, take the 0th one, and extract its href member
print(bs.select('head link')[0]['href'])
# find all a tags whose class is c_b_p_desc_readmore, and extract the href field
print(bs.find_all('a', class_='c_b_p_desc_readmore')[0]['href'])
# find all a tags with id blog_nav_admin and class menu, and extract the href field
print(bs.find_all('a', id='blog_nav_admin', class_='menu')[0]['href'])
print(bs.find_all('a', id='blog_nav_admin', class_='menu')[0].attrs['href'])
# inside the div whose id is page_begin_html, extract the link tags
print(bs.select('div[id="page_begin_html"] link')[0]['href'])
print(bs.select('ul[id="navList"] .menu')[0]['href'])
# under body, match the div with id=page_begin_html and its first link element
print(bs.select('body > div[id="page_begin_html"] > link')[0])
# extract the text content inside a given tag
print(bs.select('title')[0].get_text())
print(bs.select('a[href="https://www.cnblogs.com/LyShark/archive/2019/12/04.html"]'))
# locate tags nested several divs deep under body
print(bs.select('div[id="header"] div[id="blogTitle"] a[id="lnkBlogLogo"]'))
print(bs.select('body div[id="header"] div[class="blogStats"] span[id="stats_post_count"]'))
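As a side note, bs4 also offers select_one(), which returns the first match directly (or None on a miss) instead of indexing [0] and risking an IndexError; a self-contained sketch on toy markup:
from bs4 import BeautifulSoup

html = '<head><link href="https://example.com/style.css"></head>'  # made-up markup for illustration
bs = BeautifulSoup(html, 'lxml')
link = bs.select_one('head link')   # first match, or None when nothing matches
if link is not None:
    print(link['href'])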
A simple application of the stripped_strings method: extract all the strings under the house-name tag.
from bs4 import BeautifulSoup
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
ret = requests.get(url="https://gz.centanet.com/ershoufang/", headers=head, timeout=3)
text = str(ret.content.decode('utf-8'))
# html5lib only needs to be installed (pip install html5lib), not imported
bs = BeautifulSoup(text, "html5lib")
ret = bs.select('div[class="section"] div[class="house-item clearfix"] p[class="house-name"]')
for i in ret:
    # house = i.get_text()            # extracts all the text, formatting included
    house = list(i.stripped_strings)  # extracts the strings and returns them as a list
    print(house)
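The difference between the two calls is easiest to see on a toy fragment (the markup below is made up for illustration):
from bs4 import BeautifulSoup

html = '<p class="house-name">  <span>Garden Estates</span>  3 rooms  </p>'
bs = BeautifulSoup(html, 'html.parser')
p = bs.select_one('p[class="house-name"]')
print(repr(p.get_text()))          # one string, surrounding whitespace preserved
print(list(p.stripped_strings))    # ['Garden Estates', '3 rooms']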
Crawling the China Weather Network (weather.com.cn):
from bs4 import BeautifulSoup
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
ret = requests.get(url="http://www.weather.com.cn/textFC/shandong.shtml", headers=head, timeout=3)
text = str(ret.content.decode('utf-8'))
bs = BeautifulSoup(text, "html5lib")
conMidtab = bs.find_all('div', class_='conMidtab')[0]  # locate the first conMidtab block
tr = conMidtab.find_all('tr')[2:]  # find tr tags inside conMidtab, keeping from the 3rd onward
for i in tr:
    td = i.find_all('td')   # all td tags in the current row
    city_td = td[0]         # the first td holds the city name
    # stripped_strings yields every non-tag descendant string, dropping the empty ones
    city = list(city_td.stripped_strings)[0]
    temp = td[-5]           # the td holding the temperature
    temperature = list(temp.stripped_strings)[0]
    print('City: {} Temperature: {}'.format(city, temperature))
Crawling xicidaili proxies with the bs4 library: with the library doing the parsing, this is sorted out in no time.
import requests
from bs4 import BeautifulSoup

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
ret = requests.get(url="https://www.xicidaili.com/wt/", headers=head, timeout=3)
bs = BeautifulSoup(ret.text, "lxml")
ret = bs.select('table[id="ip_list"] tr[class="odd"]')
ip = []
for i in ret:
    house = list(i.stripped_strings)
    ip.append(house)
with open("save.log", 'a+', encoding='utf-8') as fp:   # open once instead of per iteration
    for i in range(0, 50):
        proxy_url = "http://{}:{}".format(ip[i][0], ip[i][1])
        print(proxy_url, file=fp)
        print("Proxy address (saved) {}".format(proxy_url))
Using a proxy IP address with requests:
from time import sleep
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
proxy = {"http": "http://127.0.0.1:9999"}
# without a password: "http": "http://ip:port"
# with a password:    "https": "https://username:password@ip:port"
with open("save.log", "r", encoding="utf-8") as file:
    for i in file.readlines():
        data = i.strip()           # drop the trailing newline
        proxy.update(http=data)    # point the proxy dict at the current line
        try:
            ret = requests.get(url="https://www.cnblogs.com/LyShark/", headers=head, timeout=3, proxies=proxy)
            if ret.status_code == 200:
                print("Proxy: {} request completed".format(proxy["http"]))
            else:
                print("Proxy: {} returned status {}".format(proxy["http"], ret.status_code))
        except requests.exceptions.RequestException:
            # an unreachable proxy raises an exception instead of returning a status code
            print("Proxy: {} offline, failed".format(proxy["http"]))
        sleep(1)
Downloading a file through a proxy with requests:
import requests

head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
proxy = {"http": "http://117.69.200.46:9999"}
url = "https://nmap.org/dist/nmap-7.80-win32.zip"
ret = requests.get(url=url, headers=head, stream=True, proxies=proxy)
with open("nmap.zip", "wb") as fp:                    # with-block closes the file automatically
    for chunk in ret.iter_content(chunk_size=4096):   # stream the body in 4 KB chunks
        if chunk:
            print("Chunk saved, length: {}".format(len(chunk)))
            fp.write(chunk)
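Building on the same streaming loop, the Content-Length header (when the server sends one) allows a rough progress display; a minimal sketch, reusing the nmap.org URL from above:
import requests

url = "https://nmap.org/dist/nmap-7.80-win32.zip"
ret = requests.get(url=url, stream=True, timeout=10)
total = int(ret.headers.get("Content-Length", 0))   # 0 when the header is absent
done = 0
with open("nmap.zip", "wb") as fp:
    for chunk in ret.iter_content(chunk_size=4096):
        if chunk:
            fp.write(chunk)
            done += len(chunk)
            if total:   # only report progress when the total size is known
                print("progress: {:.1f}%".format(done * 100 / total))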
Original article: https://www.cnblogs.com/LyShark/p/12163252.html