标签:col lse == user XML ice put tail name
# Scrape the free resume templates from the Chinaz material site
# (sc.chinaz.com/jianli) and download one randomly chosen mirror archive
# for every template listed on the requested listing pages.
import requests
from lxml import etree
import random

# "Connection: close" tells requests to drop the pooled connection right
# after each response, so the connection pool is released immediately
# instead of staying occupied for the whole crawl.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Connection": "close",
}

# The first listing page has no page number in its URL; all later pages
# follow the free_%d.html pattern.
url_page_one = 'http://sc.chinaz.com/jianli/free.html'
url_demo = 'http://sc.chinaz.com/jianli/free_%d.html'

start_page = int(input('enter a start page num:'))
end_page = int(input('enter a end page num:'))

for i in range(start_page, end_page + 1):
    url = url_page_one if i == 1 else url_demo % i

    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'  # site serves UTF-8; avoids mojibake in template names
    tree = etree.HTML(response.text)

    # Each <div> under #container is one template card holding the
    # detail-page link and the display name.
    for div in tree.xpath('//div[@id="container"]/div'):
        hrefs = div.xpath('./p/a/@href')
        texts = div.xpath('./p/a/text()')
        if not hrefs or not texts:
            # Skip malformed cards instead of crashing with IndexError.
            continue
        detail_url = hrefs[0]
        name = texts[0]

        # The detail page lists several mirror download links.
        detail_response = requests.get(url=detail_url, headers=headers)
        detail_response.encoding = 'utf-8'  # keep decoding consistent with the listing page
        detail_tree = etree.HTML(detail_response.text)
        li_list = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li')
        if not li_list:
            # No download mirrors found on this detail page; move on.
            continue

        # Pick a random mirror (each <li> wraps one download link) to
        # spread the load across the site's mirrors.
        download_url = random.choice(li_list).xpath('./a/@href')[0]

        # .content yields the raw archive bytes (binary stream download).
        data = requests.get(url=download_url, headers=headers).content
        name = name + ".rar"
        with open(name, "wb") as fp:
            fp.write(data)
        print(name, "下载成功")
标签:col lse == user XML ice put tail name
原文地址:https://www.cnblogs.com/kenD/p/11111669.html