Tags: connection referer json html top proc multithreading create check
Below are the pre-defined anti-anti-crawling helper functions. Each function corresponds to one or more techniques, and they call one another so the whole scraping workflow can be assembled and debugged piece by piece.
# 1. Random request headers
def 随机头部():
    from fake_useragent import UserAgent
    my_headers = {
        'User-Agent': UserAgent().random,  # note the key is 'User-Agent', not 'User_Agent'
        'referer': 'https://music.163.com/discover/toplist',  # browser -> open the page -> F12 -> Network -> Headers -> copy the referer value and paste it here
    }
    return my_headers


# 2. Cookies (keep the login state, method 1)
def cookies():
    cookie_str = ''  # browser -> F12 -> Network -> Headers -> copy the cookie string and paste it here
    cookies = cookie_str.split(';')  # split on ';'
    cookies_dic = {}  # dict to hold the key/value pairs
    for cookie in cookies:  # walk the list
        cookies_dic[cookie.split('=', 1)[0]] = cookie.split('=', 1)[1]  # split on the first '=': index 0 is the key, index 1 the value
    return cookies_dic  # returns the cookie dict


# 3. Keep a session (keep the login state, method 2)
def 会话框():
    import requests
    session = requests.session()  # initialise the session
    session.get(url='https://music.163.com/#/discover', headers=随机头部())  # session.get(current url) so later requests stay in the same session; browser -> F12 -> Network -> XHR -> copy the Request URL and paste it here
    cookies = session.cookies  # the session now holds the cookies with the basic user info
    return cookies


# 4. Build a proxy IP pool
def ip池(base_url):
    import requests
    import lxml.html
    import time
    for url in base_url:  # walk the list of page urls
        response = requests.get(url=url, headers=随机头部(), cookies=会话框()).content.decode()  # fetch and decode the body (default 'utf-8')
        time.sleep(2)
        selector = lxml.html.fromstring(response)  # parse the page source
        data_list = selector.xpath('//*[@id="list"]/table/tbody/tr')  # extract rows with XPath; div[position()<=3] would grab the first 3 divs
        for i in data_list:
            if i.xpath('td[1]') and i.xpath('td[4]'):  # skip empty rows: both cells must exist before continuing
                ip_data = i.xpath('td[1]/text()')[0]           # IP
                ip_type = i.xpath('td[4]/text()')[0].lower()   # proxy type, lower-cased
                ip_port = i.xpath('td[2]/text()')[0]           # port
                proxies = {
                    ip_type: ip_type + '://' + str(ip_data) + ':' + str(ip_port)
                }
                check_proxy_ip(proxies)  # test whether this proxy ip/port works


# 5. Check that a proxy is usable
def check_proxy_ip(proxies):
    import requests
    url = 'http://www.baidu.com/'  # test target: Baidu
    print('checking: ' + list(proxies.values())[0])  # the key may be 'http' or 'https', so read the value generically
    try:  # catch request errors
        response = requests.get(url, headers=随机头部(), proxies=proxies, timeout=2)  # proxies: the proxy to test; timeout: time limit
    except:  # on failure
        return print('unusable')
    else:  # on success
        print('usable')
        保存txt(proxies)
    return None


# 6. Use proxy ips (read a hard-coded list)
def read_id():
    proxies = [
        {'http': 'http://47.98.251.15:8118'},
        {'http': 'http://122.51.231.113:8080'},
        {'http': 'http://112.111.77.75:9999'},
        {'http': 'http://101.231.104.82:80'},
    ]
    return proxies


# 7. Information extractor
def 提取器(base_url):
    import requests
    import random
    import lxml.html
    response = requests.post(url=base_url, headers=随机头部(), proxies=random.choice(read_id()), cookies=会话框(), timeout=5)
    print(response.text)
    # selector = lxml.html.fromstring(response.text)  # parse the page source
    # print(selector)
    # s = selector.xpath('//*[@id="g-topbar"]/div[1]/div/ul/li[2]/span/a/text()')
    # print(s)
    # json解析(response)  # for xhr/json endpoints
    return response


# 8. JSON parsing
def json解析(response):
    jobs_data = response.json()  # parse the json body
    jobs = jobs_data['content']['data']['page']['result']  # drill down into the json; result is a list
    return 获取数据(jobs)


# 9. Walk the data
def 获取数据(jobs):
    # available fields: experience, education, job description, city, company name, publish time, position name, salary
    for job in jobs:
        job_city = job['city']                        # city
        job_companyFullName = job['companyFullName']  # company name
        job_createTime = job['createTime']            # publish time
        job_positionName = job['positionName']        # position
        job_salary = job['salary']                    # monthly salary
        positionId = job['positionId']                # position id
        job_item = [job_city, job_companyFullName, job_createTime, job_positionName, job_salary, positionId]  # pack the fields into a list
        保存csv(job_item)  # hand off for saving
    return None
# 10. Multithreading
def 线程(house_url):
    from multiprocessing.dummy import Pool as pl  # thread-pool library
    import time
    pool = pl(4)  # start a pool of 4 worker threads (several tasks share the same cpu)
    time.sleep(2)
    pool.map(ip池, house_url)  # run the spider; map() hands one element of house_url to each call of ip池
    pool.close()  # stop accepting new tasks
    pool.join()   # wait for the workers to finish
    return None


# 11. Save to txt
def 保存txt(data_list):
    doc = open('ip_data.txt', 'a')  # open in append mode
    print('saving ' + str(data_list))  # progress hint
    print(data_list, file=doc)  # print straight into the file
    doc.close()  # close and release the handle
# 12. Save to csv
def 保存csv(data_list):
    import csv
    with open('lwc137_data.csv', 'a', encoding='utf-8-sig', newline='') as csvfile:  # open('file.csv', append mode, encoding, newline='' to avoid blank rows)
        writer = csv.writer(csvfile)  # wrap the file handle
        writer.writerow(data_list)  # write one row from the list
    return print('saving ' + str(data_list))


# 13. Save images
def image_saver(url, apartment):
    import requests
    img = requests.get(url, headers=随机头部())
    with open(r'D:\99977\python课程\学习数据\爬虫图片\{}.jpg'.format(apartment), 'wb') as f:  # raw string so the backslashes stay literal
        f.write(img.content)
    return None


# Entry point
if __name__ == '__main__':
    # 1. big-data job listings (Lagou-style JSON API)
    # url = 'https://m.lagou.com/search.json?'
    # city = '全国'
    # positionName = '大数据'
    # for pageNo in range(1, 10, 1):
    #     base_url = url + 'city=' + city + '&positionName=' + positionName + '&pageNo=' + str(pageNo) + '&pageSize=15'
    #     提取器(base_url)

    # 2. collect usable ips from kuaidaili and save them to the txt file
    # ip池(['https://www.kuaidaili.com/free/inha/' + str(x) + '/' for x in range(1, 20, 1)])  # build 19 page urls -> feed 线程 / ip池

    # 3. read the usable ips
    # read_id()

    提取器('https://music.163.com/#/discover/toplist')
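A side note on function 2: cookies() is defined above but never called anywhere in the listing; the login state is always carried through 会话框() instead. Below is a minimal sketch (not part of the original listing) of wiring method 1 into a plain request. It assumes a real cookie string copied from the browser has been pasted into cookie_str inside cookies(), and it reuses the toplist URL already used above.

import requests

# Hand the dict returned by cookies() and the random headers from 随机头部()
# to a one-off GET request instead of going through a session object.
resp = requests.get('https://music.163.com/discover/toplist',
                    headers=随机头部(),
                    cookies=cookies(),
                    timeout=5)
print(resp.status_code)  # 200 only means the request succeeded; whether the login state held depends on the pasted cookie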
Original post: https://www.cnblogs.com/Agent9527/p/13191798.html