标签:标记 apple 返回 strong version tom color 属性 auto
# coding=utf-8
"""Log in to GitHub through its web form and print the signed-in user name.

Steps: GET the login page, read the ``authenticity_token`` form field, POST
the login form to ``/session``, then look the user name up in the returned
HTML.

The account is read from the ``GITHUB_LOGIN`` and ``GITHUB_PASSWORD``
environment variables so that no credentials live in the source file.
"""
# Version: python3.6.0
# Tools: PyCharm 2017.3.2
# author = "wlx"
__date__ = '2018/6/14 10:37'

import os

import requests
from bs4 import BeautifulSoup

LOGIN_PAGE_URL = 'https://github.com/login'
SESSION_URL = 'https://github.com/session'
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'
)
REQUEST_TIMEOUT = 10  # seconds; without it a stalled server blocks forever


def fetch_authenticity_token(session):
    """Return the ``authenticity_token`` value from the login page.

    Args:
        session: ``requests.Session`` used for the GET; the cookies it
            receives are kept on the session for the later POST.

    Raises:
        requests.HTTPError: if the login page answers with an error status.
        RuntimeError: if the page contains no ``authenticity_token`` input.
    """
    page = session.get(LOGIN_PAGE_URL, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')
    field = soup.find(name='input', attrs={'name': 'authenticity_token'})
    token = field.get('value') if field is not None else None
    if not token:
        raise RuntimeError('authenticity_token input not found on the login page')
    return token


def sign_in(session, login, password):
    """Post the login form and return the displayed user name, or ``None``.

    Args:
        session: ``requests.Session`` shared with the token request.
        login: account e-mail or user name.
        password: account password.

    Returns:
        The text of the ``<strong class="css-truncate-target">`` element of
        the response page, or ``None`` when that element is absent.

    Raises:
        requests.HTTPError: if the POST answers with an error status.
    """
    response = session.post(
        SESSION_URL,
        data={
            'commit': 'Sign in',
            # The published listing shows '?' here, which is a mis-encoded
            # check mark; written as an escape to stay encoding-independent.
            'utf8': '\u2713',
            'authenticity_token': fetch_authenticity_token(session),
            'login': login,
            'password': password,
        },
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    name_tag = soup.find(name='strong', attrs={'class': 'css-truncate-target'})
    return name_tag.string if name_tag is not None else None


def main():
    """Read the credentials from the environment, sign in, print the name."""
    login = os.environ.get('GITHUB_LOGIN')
    password = os.environ.get('GITHUB_PASSWORD')
    if not login or not password:
        raise SystemExit('Set GITHUB_LOGIN and GITHUB_PASSWORD before running.')

    with requests.Session() as session:
        # Send the browser User-Agent on every request, not only on the POST.
        session.headers['User-Agent'] = USER_AGENT
        name = sign_in(session, login, password)

    if name is None:
        raise SystemExit('User name element not found in the response page.')
    print('name:', name)


if __name__ == '__main__':
    main()
1. 简易爬虫:用 requests 和 BeautifulSoup 爬取汽车之家
# coding=utf-8
"""Scrape the autohome.com.cn news list.

For every ``<li>`` that has an ``<h3>`` inside the
``auto-channel-lazyload-article`` container, print the title, the summary
paragraph and the link target, then download the item's image into the
current directory.
"""
# Version: python3.6.0
# Tools: PyCharm 2017.3.2
# author = "wlx"
__date__ = '2018/6/12 21:10'

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

NEWS_URL = 'https://www.autohome.com.cn/news/'
REQUEST_TIMEOUT = 10  # seconds


def image_file_name(src):
    """Return the local file name to use for the image URL ``src``.

    The part after the last ``__`` is used; when ``src`` contains no ``__``
    (or nothing follows it) the last path segment is used instead, so the
    function never raises ``IndexError``.
    """
    _, separator, tail = src.rpartition('__')
    if separator and tail:
        return tail
    return src.rsplit('/', 1)[-1]


def download_image(src):
    """Download the image at ``src`` and write it next to the script.

    ``src`` only gives the image address, so a second request is needed to
    get the bytes. It is resolved against the news page URL, which handles
    protocol-relative (``//host/...``), relative and absolute addresses alike.
    """
    file_name = image_file_name(src)
    if not file_name:
        return
    image = requests.get(url=urljoin(NEWS_URL, src), timeout=REQUEST_TIMEOUT)
    image.raise_for_status()
    with open(file_name, 'wb') as f:
        f.write(image.content)


def main():
    """Fetch the news page, print every article and save its image."""
    ret = requests.get(url=NEWS_URL, timeout=REQUEST_TIMEOUT)
    ret.raise_for_status()
    # ret.content is the raw bytes; apparent_encoding is the encoding detected
    # from them. Use it so that ret.text is decoded correctly.
    ret.encoding = ret.apparent_encoding

    # 'lxml' can be used as the parser as well.
    soup = BeautifulSoup(ret.text, 'html.parser')

    # find() returns the first match. An id can be passed as id=...; a class
    # cannot be passed as class=... because ``class`` is a Python keyword, so
    # use class_='name' or attrs={'id': 'id1', 'class': 'wei'}.
    container = soup.find(name='div', id='auto-channel-lazyload-article')
    if container is None:
        raise SystemExit('Article container not found on the news page.')

    # find_all() returns a list of every match; the list itself has no
    # find(), only the tag objects in it do.
    for item in container.find_all(name='li'):
        title = item.find(name='h3')
        if not title:
            continue
        print(title.text)

        summary = item.find(name='p')
        if summary is not None:
            print(summary.text)

        # The first positional argument of find() is ``name``.
        link = item.find('a')
        if link is not None:
            # .attrs holds all attributes; .get() reads a single one.
            print(link.get('href'))

        img = item.find(name='img')
        src = img.get('src') if img is not None else None
        if src:
            download_image(src)


if __name__ == '__main__':
    main()
2. 抽屉登录
# coding=utf-8
"""Log in to dig.chouti.com and up-vote one link.

Flow described by the original author: the first visit to the home page
returns a not-yet-authorized cookie; the login POST carries that cookie
together with the phone number and password, after which the site authorizes
it. Later requests therefore reuse the cookie from the FIRST visit, and the
cookies returned by the login response are not used.

The account is read from the ``CHOUTI_PHONE`` and ``CHOUTI_PASSWORD``
environment variables so that no credentials live in the source file.
"""
# Version: python3.6.0
# Tools: PyCharm 2017.3.2
# author = "wlx"
__date__ = '2018/6/13 10:59'

import os

import requests

HOME_URL = 'https://dig.chouti.com/'
LOGIN_URL = 'https://dig.chouti.com/login'
VOTE_URL = 'https://dig.chouti.com/link/vote'
# The site filters crawlers, so every request carries a browser User-Agent.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'
    ),
}
REQUEST_TIMEOUT = 10  # seconds


def login_and_vote(phone, password, link_id=20217671):
    """Log in with ``phone``/``password`` and vote for ``link_id``.

    Args:
        phone: phone number used as the login name.
        password: account password.
        link_id: id sent as the ``linksId`` query parameter of the vote.

    Returns:
        The body text of the vote response.

    Raises:
        requests.HTTPError: if any of the three requests answers with an
            error status.
    """
    # 1. GET the home page to obtain the initial cookie.
    first_visit = requests.get(url=HOME_URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    first_visit.raise_for_status()
    first_cookies = first_visit.cookies.get_dict()

    # 2. POST the login form while carrying the first-visit cookie so that it
    #    becomes authorized.
    login_response = requests.post(
        url=LOGIN_URL,
        data={'phone': phone, 'password': password, 'oneMonth': '1'},
        headers=HEADERS,
        cookies=first_cookies,
        timeout=REQUEST_TIMEOUT,
    )
    login_response.raise_for_status()

    # 3. Vote. The keyword is ``cookies`` (plural), and the dict passed is the
    #    first-visit one; a Session is deliberately not used, since it would
    #    merge the login-response cookies into the jar.
    vote_response = requests.post(
        url=VOTE_URL,
        params={'linksId': link_id},
        headers=HEADERS,
        cookies=first_cookies,
        timeout=REQUEST_TIMEOUT,
    )
    vote_response.raise_for_status()
    return vote_response.text


def main():
    """Read the credentials from the environment and print the vote reply."""
    phone = os.environ.get('CHOUTI_PHONE')
    password = os.environ.get('CHOUTI_PASSWORD')
    if not phone or not password:
        raise SystemExit('Set CHOUTI_PHONE and CHOUTI_PASSWORD before running.')
    print(login_and_vote(phone, password))


if __name__ == '__main__':
    main()
3. requests模块
"""Notes on the ``requests`` module: request parts and keyword arguments."""
import requests

# params: parameters passed in the URL.
#
# Request headers: headers={...}
# Request body:
#   * data=...  the body is converted internally to ``name=alex&age=18``.
#   * json=...  the body is sent as the string '{"name":"alex","age":18}';
#               json=obj is the same as data=json.dumps(obj).
# Rule of thumb: a request shown as "Form Data" needs a data-type body, a
# request shown as "payload" needs a json-type body.

TEST_URL = 'http://127.0.0.1:8000/test/'
SITE_URL = 'http://www.oldboyedu.com'


def main():
    """Issue the example requests from the notes."""
    requests.request(method='get', url=TEST_URL)
    requests.request(method='post', url=TEST_URL)

    requests.get(url=TEST_URL)   # same as requests.request(method='get', url=...)
    requests.post(url=TEST_URL)  # same as requests.request(method='post', url=...)

    # ``params`` is appended to the URL: the calls below address
    # http://www.oldboyedu.com?nid=1&name=x. The keyword for cookies is
    # ``cookies`` (plural); ``cookie=`` raises TypeError.
    requests.get(
        url=SITE_URL,
        params={"nid": 1, 'name': 'x'},
        headers={},
        cookies={},
    )
    # Both data= and json= can carry the request body; see the notes above.
    requests.post(
        url=SITE_URL,
        params={"nid": 1, 'name': 'x'},
        data={"name": "alex", "age": 18},
        headers={},
        cookies={},
    )


if __name__ == '__main__':
    main()

# Keyword arguments of the requests module:
#   method:
#   url:
#   params:
#   data:
#   json:
#   headers:
#   cookies:
#   proxies: for when the IP gets banned. Do not hard-code one proxy; keep
#       several and pick one at random per request. Some proxies also need
#       authentication:
#           auth = HTTPProxyAuth('username', 'mypassword')
#           r = requests.get("http://www.google.com", proxies=proxyDict, auth=auth)
#   files: upload files
#   auth: basic authentication
#   timeout: how long to wait for the response
#   allow_redirects: True
#   stream: download a large file piece by piece
#           ret = requests.get('http://127.0.0.2:8000/test/', stream=True)
#           for i in ret.iter_content():
#               print(i)
#
#           from contextlib import closing
#           with closing(requests.get('http://httpbin.org/get', stream=True)) as r:
#               # handle the response here
#               for i in r.iter_content():
#                   print(i)
#   cert: certificate
#   verify: verification
# Reference: https://www.cnblogs.com/wupeiqi/articles/6283017.html

# Quoted from the requests source for reference:
'''
def request(method, url, **kwargs):
    """Constructs and sends a :class:`Request <Request>`.

    :param method: method for the new :class:`Request` object.
    :param url: URL for the new :class:`Request` object.
    :param params: (optional) Dictionary or bytes to be sent in the query string for the :class:`Request`.
    :param data: (optional) Dictionary, bytes, or file-like object to send in the body of the :class:`Request`.
    :param json: (optional) json data to send in the body of the :class:`Request`.
    :param headers: (optional) Dictionary of HTTP Headers to send with the :class:`Request`.
    :param cookies: (optional) Dict or CookieJar object to send with the :class:`Request`.
    :param files: (optional) Dictionary of ``'name': file-like-objects`` (or ``{'name': file-tuple}``) for multipart encoding upload.
        ``file-tuple`` can be a 2-tuple ``('filename', fileobj)``, 3-tuple ``('filename', fileobj, 'content_type')``
        or a 4-tuple ``('filename', fileobj, 'content_type', custom_headers)``, where ``'content-type'`` is a string
        defining the content type of the given file and ``custom_headers`` a dict-like object containing additional headers
        to add for the file.
    :param auth: (optional) Auth tuple to enable Basic/Digest/Custom HTTP Auth.
    :param timeout: (optional) How long to wait for the server to send data
        before giving up, as a float, or a :ref:`(connect timeout, read
        timeout) <timeouts>` tuple.
    :type timeout: float or tuple
    :param allow_redirects: (optional) Boolean. Set to True if POST/PUT/DELETE redirect following is allowed.
    :type allow_redirects: bool
    :param proxies: (optional) Dictionary mapping protocol to the URL of the proxy.
    :param verify: (optional) whether the SSL cert will be verified. A CA_BUNDLE path can also be provided. Defaults to ``True``.
    :param stream: (optional) if ``False``, the response content will be immediately downloaded.
    :param cert: (optional) if String, path to ssl client cert file (.pem). If Tuple, ('cert', 'key') pair.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response

    Usage::

      >>> import requests
      >>> req = requests.request('GET', 'http://httpbin.org/get')
      <Response [200]>
'''
# Reference: https://www.cnblogs.com/wupeiqi/articles/6283017.html
标签:标记 apple 返回 strong version tom color 属性 auto
原文地址:https://www.cnblogs.com/wlx97e6/p/9270790.html