python实训第五天

时间：2019-06-17 20:33:33 阅读：131 评论：0 收藏：0 [点我收藏+]

1.requests之post请求

2.requests高级用法

3.selenium模块

# post 请求访问 github
# 请求 url：https://github.com/login
# 请求方式：post
# 请求头：
# 上一次请求的来源
# cookies
# user_agent
# 请求体：（只有post请求才有）
# commit: Sign in
# utf8: ?
# authenticity_token: .........
# login: .....
# password: ......
# webauthn-support: supported


# Step 1: fetch the login page and extract the CSRF token (authenticity_token).
import requests
import re

headers = {'User-Agent': '......'}
response = requests.get(url='......', headers=headers)
# The token lives in a hidden <input>; re.S lets '.' match across newlines.
# [0] raises IndexError when the page does not contain the token.
authenticity_token = re.findall(
    '<input type="hidden" name="authenticity_token" value="(.*?)" />',
    response.text,
    re.S,
)[0]
print(authenticity_token)

# NOTE(review): the original passed an undefined name `login_cookies` to the
# POST below. The cookies returned with the login page are presumably what was
# intended -- confirm.
login_cookies = response.cookies.get_dict()

# Step 2: build the request headers for the login POST.
headers2 = {
    'Referer': '.....',
    # The HTTP header name is spelled with a hyphen, not an underscore.
    'User-Agent': '......',
}

# Form fields sent as the POST body.
form_data = {
    "commit": "Sign in",
    "utf8": "?",
    "authenticity_token": authenticity_token,
    "login": "..",
    "password": "....",
    "webauthn-support": "supported",
}

# Step 3: submit the login form and save the returned page for inspection.
response2 = requests.post(url='...', data=form_data, headers=headers2, cookies=login_cookies)
print(response2.status_code)
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(response2.text)
2.

# Tour of the most commonly used attributes of a requests Response object.
import requests

response = requests.get('https://www.baidu.com')
print(response.status_code)  # HTTP status code of the response
print(response.url)  # URL of the response
print(response.encoding)  # encoding used to decode response.text
response.encoding = 'utf_8'  # override the encoding before reading .text
print(response.text)  # body decoded to str
print(response.content)  # body as raw bytes
print(response.headers)  # response headers (not the request headers)
print(response.history)  # list of redirect responses that led to this one
print(response.cookies)  # cookie jar of the response
print(response.cookies.get_dict())  # cookies as a plain dict
print(response.cookies.items())  # cookies as a list of (name, value) pairs
print(response.encoding)
print(response.elapsed)  # time elapsed between sending the request and the response arriving

# stream=True with iter_content(): download a large file chunk by chunk.

import requests

url = 'https://vd3.bdstatic.com/mda-ic4pfhh3ex32svqi/hd/mda-ic4pfhh3ex32svqi.mp4?auth_key=1557973824-0-0-bfb2e69bb5198ff65e18065d91b2b8c8&bcevod_channel=searchbox_feed&pd=wisenatural&abtest=all.mp4'
# stream=True defers downloading the body until it is iterated.
response = requests.get(url, stream=True)

# The original printed response.content here. Accessing .content reads the
# entire body into memory, which defeats the purpose of stream=True, so that
# line was removed.
with open('love_for_GD.mp4', 'wb') as f:
    # iter_content() defaults to chunk_size=1 (one byte per write); use a
    # larger chunk so the file is written in reasonably sized pieces.
    for content in response.iter_content(chunk_size=8192):
        f.write(content)

# Certificate verification (most sites are served over https)
import requests
# For an SSL request the certificate is checked first; if it is not valid an
# error is raised and the program terminates.
response = requests.get('https://www.xiaohuar.com')
print(response.status_code)

# Improvement 1: suppress the error, but a warning is still reported
import requests
response = requests.get('https://www.xiaohuar.com', verify=False)
# Certificate is not verified; a warning is reported; returns 200
print(response.status_code)

# Improvement 2: suppress the error and also silence the warning
import requests
import urllib3
urllib3.disable_warnings()  # turn the warning off
response = requests.get('https://www.xiaohuar.com', verify=False)
print(response.status_code)

# Improvement 3: supply a certificate
# Many sites use https but can be accessed without a client certificate; in
# most cases sending one is optional (e.g. Zhihu, Baidu).
# Where it is a hard requirement it must be sent, e.g. for targeted users who
# only gain access to a particular site after obtaining the certificate.
import requests
import urllib3
# urllib3.disable_warnings()  # turn the warning off
response = requests.get(
    'https://www.xiaohuar.com',
    # verify=False,
    cert=('/path/server.crt', '/path/key'))
print(response.status_code)

# Timeout settings
# Two forms: float or tuple
# timeout=0.1  # a single value is applied to both the connect and the read timeout
# timeout=(0.1, 0.2)  # 0.1 is the connect timeout, 0.2 is the read timeout

import requests

# NOTE(review): 0.0001 s is so small that this call is presumably meant to
# demonstrate the timeout exception rather than to succeed -- confirm.
response = requests.get('https://www.baidu.com',
                        timeout=0.0001)
# Authentication settings
#
# When logging in to some sites a dialog pops up (similar to an alert) asking
# for a user name and password; the html page cannot be reached until the
# authorization succeeds.
#
# The requests module offers several kinds of authentication, including basic
# authentication.
#
# The principle: the user name and password are used to obtain the user's
# credentials and identify the user, who is then authorized via a token.
# Basic authentication:
#     HTTP Basic Auth is the authentication scheme introduced by HTTP/1.0. The
#     client authenticates for each realm by supplying a user name and
#     password; when authentication fails the server answers the request
#     with 401.
import os

import requests
# Tested against the GitHub API
url = 'https://api.github.com/user'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

# Test 1: no credentials, fails with 401
response = requests.get(url, headers=HEADERS)
print(response.status_code)  # 401
print(response.text)


# Credentials come from the environment instead of being hard-coded: the
# original embedded a real-looking account name and password in the source.
GITHUB_USER = os.environ.get('GITHUB_USER', '')
GITHUB_PASSWORD = os.environ.get('GITHUB_PASSWORD', '')

# Test 2: authenticate with HTTPBasicAuth from requests.auth; on success the
# user information is returned
from requests.auth import HTTPBasicAuth
response = requests.get(url, headers=HEADERS, auth=HTTPBasicAuth(GITHUB_USER, GITHUB_PASSWORD))
print(response.text)


# Test 3: a (user, password) tuple passed as auth= is treated as
# HTTPBasicAuth; on success the user information is returned
response = requests.get(url, headers=HEADERS, auth=(GITHUB_USER, GITHUB_PASSWORD))
print(response.text)

3.

# 一、什么是selenium
# 最初是一个自动化测试工具，可以使用它帮助我们驱动浏览器自动执行某些定义好的操作，
# 例如在页面执行js代码，跳过登录界面。

# 二、为什么要使用selenium？
# 1.优点：
# 使用requests模块需要分析大量的通信流程，
# 而使用selenium可以轻松跳过登录验证。
# 2.缺点：
# 浏览器会加载css、js、图片、视频等数据，爬虫效率要比requests模块低。

# 三、如何使用selenium？
# 1.下载selenium模块
# 2.下载浏览器驱动

# The first run of selenium may raise an error that requires turning the
# firewall off.
from selenium import webdriver
import time

chrome = webdriver.Chrome()
try:
    chrome.get('https://www.baidu.com/')
    # Keep the browser window open for a while so the page can be inspected.
    time.sleep(100)
finally:
    # The original never closed the browser; quit() ends the session even if
    # the page load fails.
    chrome.quit()

# from selenium import webdriver  # 用来驱动浏览器的
# from selenium.webdriver import ActionChains  # 破解滑动验证码的时候用的 可以拖动图片
# from selenium.webdriver.common.by import By  # 按照什么方式查找，By.ID,By.CSS_SELECTOR
# from selenium.webdriver.common.keys import Keys  # 键盘按键操作
# from selenium.webdriver.support import expected_conditions as EC  # 和下面WebDriverWait一起用的
# from selenium.webdriver.support.wait import WebDriverWait  # 等待页面加载某些元素
# import time
# try:
#     driver = webdriver.Chrome()
#     driver.get(‘https://www.baidu.com‘)
#     wait = WebDriverWait(driver, 10)
#     input_tag = wait.until(EC.presence_of_element_located((By.ID, ‘kw‘)))
#     input_tag.send_keys(‘美女‘)
#     input_tag.send_keys(Keys.ENTER)
#     time.sleep(5)
# finally:
#     driver.close()
#     from selenium import webdriver  # 用来驱动浏览器的
#     from selenium.webdriver import ActionChains  # 破解滑动验证码的时候用的 可以拖动图片
#     from selenium.webdriver.common.by import By  # 按照什么方式查找，By.ID,By.CSS_SELECTOR
#     from selenium.webdriver.common.keys import Keys  # 键盘按键操作
#     from selenium.webdriver.support import expected_conditions as EC  # 和下面WebDriverWait一起用的
#     from selenium.webdriver.support.wait import WebDriverWait  # 等待页面加载某些元素
#     import time
#     driver = webdriver.Chrome()  # 打开谷歌驱动浏览器
#     driver.get(‘https://www.baidu.com/‘)  # 向百度发送一个get请求
#     driver.implicitly_wait(5)  # 隐式等待：查找元素时若元素尚未加载，最多等待5秒；对之后所有的元素查找都生效，通常在get之前设置
#     # selenium自带的解析功能
#     try:
#         # ===============所有方法===================
#         # element是查找一个标签
#         # elements是查找所有标签
#         # 1、find_element_by_id 通过id去找
#         # 2、find_element_by_link_text  通过链接文本去找
#         # 3、find_element_by_partial_link_text
#         # 4、find_element_by_tag_name
#         # 5、find_element_by_class_name
#         # 6、find_element_by_name
#         # 7、find_element_by_css_selector
#         # 8、find_element_by_xpath
#         # 1、find_element_by_id  根据id元素查找
#         # input_tag = driver.find_element_by_id(‘kw‘)  # 查找到kw元素的标签       
#         # input_tag.send_keys(‘kermit大宝贝‘)  # 输入kermit大宝贝
#         # input_tag.send_keys(Keys.ENTER)  # 按回车按钮
#         # 2、find_element_by_link_text  根据精确文本匹配内容
#         # login_button = driver.find_element_by_link_text(‘登录‘)  # 找到登陆文本标签
#         # login_button = driver.find_element_by_link_text(‘  登录‘)  # 因为是精确查找所以找不到‘  登陆‘
#         # login_button.click()  # 点击登陆按钮
#         # 3、find_element_by_partial_link_text  # 根据文本局部匹配去查找标签
#         # login = driver.find_element_by_partial_link_text(‘登‘)  # 局部匹配有登字的标签
#         # login.click()  # 点击事件
#         # 4、find_element_by_tag_name  # 根据标签名查找
#         # a = driver.find_element_by_tag_name(‘a‘)
#         # print(a)
#         # 5、find_element_by_class_name  根据类元素查找
#         # login_tag = driver.find_element_by_class_name(‘tang-pass-footerBarULogin‘)  # 根据类元素查找登陆按钮
#         # login_tag.click()  # 点击登陆事件
#         # 6、find_element_by_name  根据name属性去查找
#         # username = driver.find_element_by_name(‘userName‘)       
#         # password = driver.find_element_by_name(‘password‘)
#         # username.send_keys(‘15622792660‘)
#         # password.send_keys(‘k46709394‘)
#         # login_button = driver.find_element_by_id(‘TANGRAM__PSP_10__submit‘)
#         # login_button.click()
#         # 7、find_element_by_css_selector  根据属性选择器查找
#         search = driver.find_element_by_css_selector(‘.s_ipt‘)
#         search.send_keys(‘帅哥‘)  # 往百度输入框添加帅哥
#         search.send_keys(Keys.ENTER)  # 点击回车
#         # 8、find_element_by_xpath  # 根据xpath查找
#         # 等待5秒
#         time.sleep(5)
#     finally:
#         driver.close()
#1、selenium只是模拟浏览器的行为，而浏览器解析页面是需要时间的（执行css，js），一些元素可能需要过一段时间才能加载出来，为了保证能查找到元素，必须等待

#2、等待的方式分两种：
# 隐式等待：在browser.get（‘xxx‘）前就设置，针对所有元素有效
# 显式等待：在browser.get（‘xxx‘）之后设置，只针对某个元素有效
......
......
爬取西刺代理：

'''
爬取西刺免费代理：
    1.访问西刺免费代理页面
    2.通过re模块解析并提取所有代理
    3.通过ip测试网站对爬取的代理进行测试
    4.若test_ip函数返回None（请求异常或状态码非200）代表代理作废，否则代理有效
    5.利用有效的代理爬取目标网站

<tr class="odd">
      <td class="country"><img src="//fs.xicidaili.com/images/flag/cn.png" alt="Cn"></td>
      <td>112.85.131.99</td>
      <td>9999</td>
      <td>
        <a href="/2019-05-09/jiangsu">江苏南通</a>
      </td>
      <td class="country">高匿</td>
      <td>HTTPS</td>
      <td class="country">
        <div title="0.144秒" class="bar">
          <div class="bar_inner fast" style="width:88%">

          </div>
        </div>
      </td>
      <td class="country">
        <div title="0.028秒" class="bar">
          <div class="bar_inner fast" style="width:97%">

          </div>
        </div>
      </td>

      <td>6天</td>
      <td>19-05-16 11:20</td>
    </tr>
re:
    <tr class="odd">.*?<td>(.*?)</td>.*?<td>(.*?)</td>

'''
import requests
import re
import time

# Browser-like User-Agent header passed to the requests.get calls below.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}


def get_index(url):
    """Fetch ``url`` with the shared HEADERS and return the response.

    Sleeps for one second before every request.

    Args:
        url: address of the page to fetch.

    Returns:
        The object returned by ``requests.get``.
    """
    # Pause before each request so consecutive pages are not fetched back to back.
    time.sleep(1)
    return requests.get(url, headers=HEADERS)


def parse_index(text):
    """Yield every proxy found in a listing page as an ``ip:port`` string.

    Args:
        text: HTML source of a proxy listing page.

    Yields:
        str: for each ``<tr class="odd">`` row, the contents of its first two
        attribute-less ``<td>`` cells joined with ``:``.
    """
    # re.S lets '.' match newlines, since each table row spans several lines.
    ip_list = re.findall(r'<tr class="odd">.*?<td>(.*?)</td>.*?<td>(.*?)</td>', text, re.S)
    for ip_port in ip_list:
        # Each match is an (ip, port) tuple.
        yield ':'.join(ip_port)

def test_ip(ip):
    """Check whether a proxy can reach the IP test site.

    Args:
        ip: proxy address as an ``ip:port`` string.

    Returns:
        ``ip`` when the request through the proxy answers with status 200;
        ``None`` when the status differs or the request raises (the exception
        is printed, not propagated).
    """
    print('测试ip: %s' % ip)
    try:
        proxies = {
            'https': ip
        }

        # Site used to test the proxy.
        ip_url = 'https://www.ipip.net/'

        # Request the test site through the proxy; a 200 response means the
        # proxy under test works.
        response = requests.get(ip_url, headers=HEADERS, proxies=proxies, timeout=1)

        if response.status_code == 200:
            return ip

    # Deliberately best-effort: any failure marks the proxy as unusable
    # instead of aborting the crawl.
    except Exception as e:
        print(e)

    # Made explicit: the original fell off the end of the function here.
    return None

def spider_nba(good_ip):
    """Fetch the NBA China home page through a proxy and print the result.

    Args:
        good_ip: proxy address as an ``ip:port`` string, used for https
            requests.
    """
    url = 'https://china.nba.com/'

    proxies = {
        'https': good_ip
    }

    # NOTE(review): no timeout is passed here, unlike test_ip -- confirm
    # whether that is intended.
    response = requests.get(url, headers=HEADERS, proxies=proxies)
    print(response.status_code)
    print(response.text)


if __name__ == '__main__':
    # Listing pages are addressed by page number.
    base_url = 'https://www.xicidaili.com/nn/{}'

    # Walk pages 1 through 3676.
    for line in range(1, 3677):
        ip_url = base_url.format(line)

        response = get_index(ip_url)

        ip_list = parse_index(response.text)
        for ip in ip_list:
            # print(ip)
            good_ip = test_ip(ip)

            if good_ip:
                # Working proxy: use it for the actual crawl.
                spider_nba(good_ip)



# 官网链接: http://docs.python-requests.org/en/master/user/advanced/#proxies

# Proxy settings: the request is sent to the proxy, which forwards it on our
# behalf (getting an IP banned is a common occurrence).
import requests
proxies = {
    # Proxy with credentials: user name and password go before the '@'.
    # Kept commented out because a dict literal cannot hold two 'http' keys;
    # in the original the entry below silently replaced this one.
    # 'http': 'http://tank:123@localhost:9527',
    'http': 'http://localhost:9527',
    'https': 'https://localhost:9527',
}
response = requests.get('https://www.12306.cn',
                        proxies=proxies)
print(response.status_code)


# SOCKS proxies are supported as well; install with: pip install requests[socks]
import requests
# 'user', 'pass', 'host' and 'port' are placeholders to be filled in.
proxies = {
    'http': 'socks5://user:pass@host:port',
    'https': 'socks5://user:pass@host:port'
}
response = requests.get('https://www.12306.cn',
                        proxies=proxies)

print(response.status_code)

python实训第五天

标签：link driver fas 状态码终端多网站隐式 gecko art

原文地址：https://www.cnblogs.com/7777qqq/p/11041731.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行