标签:index 版本 page 5.0 each https text lse on()
要求:已安装 requests-html(`pip install requests-html`),且 Python 版本不低于 3.6。
# -*- coding: utf-8 -*-
"""Print the table of contents of a wiki page using requests-html."""

from urllib.parse import urljoin

from requests_html import HTMLSession


def get_web_page_elements(url, headers=None, xpath_expression=''):
    """Fetch *url* and return the elements matched by an XPath expression.

    Args:
        url: Address of the page to download.
        headers: Optional mapping of extra HTTP request headers. ``None``
            (the default) sends no extra headers.
        xpath_expression: XPath query evaluated against the downloaded page.

    Returns:
        The result of ``response.html.xpath(xpath_expression)``.
    """
    # The session is used as a context manager so its connections are
    # released once the page has been fetched and queried.
    with HTMLSession() as session:
        response = session.get(url, headers=headers)
        return response.html.xpath(xpath_expression)


def main():
    """Fetch the wiki page and print each index entry's title and link."""
    url = 'https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000'

    # Request headers.
    # The cookie value must not repeat the "Cookie: " header name; the name
    # is already supplied by the dictionary key below.
    cookie = (
        'atsp=1548864427226_1548863599220; '
        'Hm_lvt_2efddd14a5f2b304677462d06fb4f964=1548863599; '
        'Hm_lpvt_2efddd14a5f2b304677462d06fb4f964=1548863599'
    )
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36'
    )
    headers = {
        'Referer': url,
        'Cookie': cookie,
        'User-Agent': user_agent,
    }

    # Fetch the table-of-contents entries.
    index_xpath_expression = "//a[@class='x-wiki-index-item']"
    index_data = get_web_page_elements(
        url, headers=headers, xpath_expression=index_xpath_expression
    )
    for each_index in index_data:
        # Element.url is the address of the page the element was parsed
        # from, so it would be identical for every entry. Resolve the
        # anchor's own href against the page URL instead.
        link = urljoin(url, each_index.attrs.get('href', ''))
        print(each_index.text + '\t\t' + link)


if __name__ == '__main__':
    main()
标签:index 版本 page 5.0 each https text lse on()
原文地址:https://www.cnblogs.com/mcgill0217/p/10340310.html