# Tags: agent title lis __name__ session arc ice import requests
# Imports
import os
import random
import re
import time

import requests
from bs4 import BeautifulSoup
from requests import ConnectionError

from config import *
# Build the request headers used for mzitu pages
def res_headers():
    """Return request headers with a random User-Agent and Referer.

    Both values are picked with ``random.choice`` from ``USER_AGENT_LIST``
    and ``REFERER_LIST`` (presumably provided by the ``config`` star-import
    at the top of the file -- verify).

    Returns:
        dict: A mapping with the keys ``'User-Agent'`` and ``'Referer'``.
    """
    return {
        'User-Agent': random.choice(USER_AGENT_LIST),
        'Referer': random.choice(REFERER_LIST),
    }
# Build headers that carry only a User-Agent
def get_header():
    """Return request headers containing a single random User-Agent.

    The value is picked with ``random.choice`` from ``USER_AGENT_LIST``
    (presumably provided by the ``config`` star-import -- verify).

    Returns:
        dict: A mapping with the single key ``'User-Agent'``.
    """
    return {'User-Agent': random.choice(USER_AGENT_LIST)}
# Fetch a proxy list, then return a proxy that passes check_ip
def get_proxy_list(max_retries=5):
    """Scrape a proxy listing page and return a proxy accepted by ``check_ip``.

    A listing page numbered between 1 and 300 (inclusive) is chosen at
    random, every ``ip</td> <td>port`` pair found in its HTML is turned
    into an ``'ip:port'`` string, and the resulting list is handed to
    ``check_ip``.  The attempt is repeated with a new random page when the
    request fails, the status is not 200, no proxies are found, or
    ``check_ip`` returns nothing.

    Args:
        max_retries: Maximum number of listing pages to try before giving
            up.  Retries are bounded here instead of recursing without
            limit.

    Returns:
        dict or None: Whatever ``check_ip`` returned for a working proxy,
        or ``None`` when no attempt succeeded.
    """
    base_url = 'https://www.xicidaili.com/wt/'
    # Raw string so the backslashes reach the regex engine untouched.
    pattern = r'(\d+\.\d+\.\d+\.\d+)</td>\s*<td>(\d+)'

    for _ in range(max_retries):
        actual_url = base_url + str(random.randint(1, 300))
        try:
            res = requests.get(url=actual_url, headers=get_header(), timeout=8)
        except requests.RequestException:
            # Covers connection errors as well as timeouts; try another page.
            continue
        if res.status_code != 200:
            continue

        ip_list = [ip + ':' + port for ip, port in re.findall(pattern, res.text)]
        if not ip_list:
            continue

        proxy = check_ip(ip_list)
        if proxy:
            # Hand the result back to the caller instead of discarding it.
            return proxy
    return None
# Check whether a proxy from the list is usable
def check_ip(ip_list):
    """Pick one random proxy from *ip_list* and test it with a request.

    Args:
        ip_list: Sequence of ``'ip:port'`` strings.

    Returns:
        dict or None: ``{'http': 'http://ip:port'}`` when the test request
        answers with status 200; ``None`` when *ip_list* is empty, the
        request fails, or the status is not 200.
    """
    if not ip_list:
        # random.choice would raise IndexError on an empty sequence.
        return None

    test_url = 'https://www.mzitu.com/'
    # NOTE(review): only the 'http' scheme is mapped while the test URL is
    # https, so requests may not route this request through the proxy at
    # all -- confirm whether an 'https' entry is intended.
    proxy_ip_dic = {'http': 'http://' + random.choice(ip_list)}
    header = get_header()
    try:
        res = requests.get(test_url, headers=header, proxies=proxy_ip_dic, timeout=8)
    except requests.RequestException:
        # Report failure to the caller; the previous recursive refetch threw
        # its result away and could recurse without bound.
        return None
    if res.status_code == 200:
        return proxy_ip_dic
    return None
# Request a page from the site
def get_page(url, timeout=10):
    """Fetch *url* with randomised headers and return the body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before requests raises a
            timeout error, so a stalled connection cannot hang the crawl.

    Returns:
        str: The decoded response body (``Response.text``).

    Raises:
        requests.RequestException: Propagated unchanged when the request
            fails or times out.
    """
    headers = res_headers()
    # The context manager closes the session and its pooled connections even
    # when the request raises.
    with requests.session() as s:
        # NOTE(review): kept from the original; confirm that requests
        # actually honours a ``keep_alive`` attribute on the session.
        s.keep_alive = False
        res = s.get(url, headers=headers, timeout=timeout)
        return res.text
# Collect the detail-page URL of every entry on the archive page
def get_all_girls(url):
    """Return the ``href`` of every link inside the ``archives`` element.

    Args:
        url: Address of the archive page to scan.

    Returns:
        list: The ``href`` values of all ``<a>`` tags found under the first
        element with class ``archives``, in document order.  Empty when the
        page has no such element.
    """
    html = get_page(url)
    soup = BeautifulSoup(html, 'html.parser')

    archives = soup.find(class_='archives')
    if archives is None:
        # Unexpected layout (for example an error page): nothing to collect.
        return []

    # href=True skips anchors that carry no href attribute.
    return [anchor['href'] for anchor in archives.find_all('a', href=True)]
# Collect every picture URL of one gallery and download them
def get_girl_all_page(url):
    """Gather the image URL of each page of a gallery and download them.

    The gallery's first page supplies the title (element with class
    ``main-title``) and the page count (the ``<span>`` text of the
    second-to-last ``<a>`` inside the element with class ``pagenavi``).
    Each numbered page ``<url>/<n>`` is then fetched, the ``src`` of its
    first ``<img>`` is collected, and the list is passed to
    ``download_Pic`` together with the title.

    Args:
        url: Address of the gallery's first page.

    Returns:
        None.  The gallery is skipped with a printed message when the page
        does not have the expected structure.
    """
    soup = BeautifulSoup(get_page(url), 'html.parser')

    pagenavi = soup.find(class_='pagenavi')
    title_tag = soup.find(class_='main-title')
    if pagenavi is None or title_tag is None:
        print('skip %s : unexpected page layout' % url)
        return

    try:
        # Second-to-last link; presumably the last one is a "next" link --
        # verify against the live markup.
        max_page = int(pagenavi.find_all('a')[-2].find('span').string)
    except (IndexError, AttributeError, TypeError, ValueError):
        print('skip %s : cannot read page count' % url)
        return

    # .string is None when the tag has more than one child; fall back to
    # the concatenated text in that case.
    title = title_tag.string or title_tag.get_text()

    base = url.rstrip('/')  # avoid a double slash when url ends with '/'
    pic_url_list = []
    for page_no in range(1, max_page + 1):
        page_soup = BeautifulSoup(get_page('%s/%s' % (base, page_no)), 'html.parser')
        img = page_soup.find('img')
        pic_url = img.get('src') if img is not None else None
        if pic_url:
            pic_url_list.append(pic_url)
        time.sleep(0.1)

    download_Pic(title, pic_url_list)
# Download the pictures into a folder named after the title
def download_Pic(title, pic_url_list):
    """Download every URL in *pic_url_list* into the directory *title*.

    Files are written as ``<title>/<n>.jpg`` where ``n`` counts from 1 in
    list order.  One set of headers (``res_headers``) and one proxy
    (``get_proxy_list``) are obtained up front and reused for every image.

    Args:
        title: Name of the target directory; created when missing.
        pic_url_list: Iterable of image URLs.

    Returns:
        None.  A failed download is reported with ``print`` and skipped;
        no file is created for it.
    """
    # exist_ok lets an interrupted crawl be re-run without FileExistsError.
    os.makedirs(title, exist_ok=True)
    headers = res_headers()
    proxy = get_proxy_list()

    for j, item in enumerate(pic_url_list, start=1):
        filename = '%s/%s.jpg' % (title, str(j))
        print('downloading....%s : NO.%s' % (title, str(j)))
        try:
            res = requests.get(item, headers=headers, proxies=proxy, timeout=30)
            # Do not store an HTTP error page under a .jpg name.
            res.raise_for_status()
        except requests.RequestException as exc:
            print('download failed....%s : NO.%s (%s)' % (title, str(j), exc))
        else:
            # Open the file only once the bytes are in hand, so a failed
            # request cannot leave an empty file behind.
            with open(filename, 'wb') as f:
                f.write(res.content)
        # NOTE(review): the collapsed original does not show whether this
        # pause ran per image or once per gallery; it is kept per image,
        # next to the counter increment it followed -- confirm.
        time.sleep(10)
# Main program
if __name__ == '__main__':
    # Read every gallery link from the archive page, then process the
    # galleries one by one.
    url = "https://www.mzitu.com/all"
    pic_list = get_all_girls(url)
    for girl_url in pic_list:
        get_girl_all_page(girl_url)
# Tags: agent title lis __name__ session arc ice import requests
# Original article: https://www.cnblogs.com/lijifei/p/12048437.html