标签:ada adapter status webkit ids close div prettify name
import requests
import requests.adapters
from bs4 import BeautifulSoup
from lxml import etree
from pyquery import PyQuery as pq
def get_url_txt(url, headers, encoding, data=None, retries=5, timeout=None):
    """Fetch *url* with an HTTP GET and return the response body as text.

    This is a best-effort helper: any exception raised while fetching is
    printed and swallowed, and an empty string is returned instead.

    Args:
        url: Address to request.
        headers: Mapping of HTTP request headers.
        encoding: Name of the codec used to decode the body (it is assigned
            to ``response.encoding`` before ``response.text`` is read).
        data: Optional payload passed to ``session.get(..., data=...)``;
            requests sends it as the request body, not as a query string.
        retries: Maximum number of retries handed to the transport adapter.
        timeout: Seconds to wait for the server, or ``None`` to wait
            indefinitely (the previous behaviour). Passing a number is
            recommended so a stalled server cannot hang the caller.

    Returns:
        The decoded body when the status code is 200, otherwise ``''``.
    """
    ret = ''
    try:
        # The context managers release the pooled connection even when the
        # request raises; the old explicit close() calls were skipped then.
        with requests.Session() as session:
            # Assigning requests.adapters.DEFAULT_RETRIES has no effect on
            # adapters created later (the default is bound when HTTPAdapter
            # is defined), so configure the retry count on an adapter.
            adapter = requests.adapters.HTTPAdapter(max_retries=retries)
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            # data=None is the default, so no separate branch is needed.
            with session.get(url, headers=headers, data=data,
                             timeout=timeout) as response:
                if response.status_code == 200:
                    response.encoding = encoding
                    ret = response.text
    except Exception as e:
        # Deliberately broad: callers rely on this helper never raising.
        print(e)
    return ret
def get_url_byte(url, headers, data_dict=None, retries=5, timeout=None):
    """Fetch *url* with an HTTP GET and return the raw response body.

    This is a best-effort helper: any exception raised while fetching is
    printed and swallowed, and empty bytes are returned instead.

    Args:
        url: Address to request.
        headers: Mapping of HTTP request headers.
        data_dict: Optional payload passed to ``session.get(..., data=...)``;
            requests sends it as the request body, not as a query string.
        retries: Maximum number of retries handed to the transport adapter.
        timeout: Seconds to wait for the server, or ``None`` to wait
            indefinitely (the previous behaviour). Passing a number is
            recommended so a stalled server cannot hang the caller.

    Returns:
        ``response.content`` when the status code is 200, otherwise ``b''``.
    """
    ret = b''
    try:
        # The context managers release the pooled connection even when the
        # request raises; the old explicit close() calls were skipped then.
        with requests.Session() as session:
            # Assigning requests.adapters.DEFAULT_RETRIES has no effect on
            # adapters created later (the default is bound when HTTPAdapter
            # is defined), so configure the retry count on an adapter.
            adapter = requests.adapters.HTTPAdapter(max_retries=retries)
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            # data=None is the default, so no separate branch is needed.
            with session.get(url, headers=headers, data=data_dict,
                             timeout=timeout) as response:
                if response.status_code == 200:
                    ret = response.content
    except Exception as e:
        # Deliberately broad: callers rely on this helper never raising.
        print(e)
    return ret
if __name__ == '__main__':
    # Demo: download one page and query it with three different libraries
    # (BeautifulSoup CSS selectors, lxml XPath, PyQuery selectors).
    url1 = 'https://www.baidu.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
    encoding = 'utf8'
    text = get_url_txt(url=url1, headers=headers, encoding=encoding)
    # get_url_txt returns '' on any failure; stop here instead of handing an
    # empty document to the parsers below (the lxml step cannot produce a
    # usable tree from it).
    if not text:
        raise SystemExit('failed to download ' + url1)
    # print(text)

    # Element targeted by the examples below:
    #   <a href="http://news.baidu.com" name="tj_trnews" class="mnav">新闻</a>
    #   selector    #u1 > a:nth-child(2)
    #   xpath       //*[@id="u1"]/a[2]
    #   full xpath  /html/body/div[1]/div[1]/div/div[3]/a[2]

    # bs4: .get_text() reads the text, .get('attribute') reads an attribute.
    soup = BeautifulSoup(text, 'lxml')
    rets = soup.select('#u1 > a:nth-child(2)')
    # for ret in rets:
    #     print(ret.get_text(), ret.get('href'))

    # XPath: append //text() for the text, //@attribute for an attribute.
    tree = etree.HTML(text)
    rets1 = tree.xpath('//*[@id="u1"]/a[2]//text()')
    rets2 = tree.xpath('//*[@id="u1"]/a[2]//@href')
    # print(rets1, rets2)

    # PyQuery: select by tag 'name', by id '#id', or by class '.class'.
    doc = pq(text)
    tags = doc('a')
    # print(len(tags), tags)
    # for i in tags:
    #     print(pq(i).text(), pq(i).attr('href'))
    ids = doc('#u1')
    # print(len(ids), ids)
    classes = doc('.mnav')
    # print(len(classes), classes)
    # for i in classes:
    #     print(pq(i).text(), pq(i).attr('href'))
Python.requests.bs4.xpath.pyquery
标签:ada adapter status webkit ids close div prettify name
原文地址:https://www.cnblogs.com/dailycode/p/12466910.html