标签:要求 tle dal yield http pos sea code url
# -*- coding: utf-8 -*-
"""Scrapy spider that collects job postings from 51job.com search results."""
import re

import scrapy

from zhaopin_project.items import LagouItem

# Captures the ``title`` attribute of the summary paragraph on a posting page.
# Non-greedy so the match stops at the first closing quote instead of running
# on to the last `">` found on the same line.
_MSG_TITLE_RE = re.compile(r'<p class="msg ltype" title="(.*?)">')

# Separator between the fields of that summary: a pipe with optional padding.
# NOTE(review): the raw page may pad the pipe with non-breaking spaces or
# literal ``&nbsp;`` entities rather than plain spaces, so all three are
# accepted here -- confirm against a live response.
_FIELD_SEP_RE = re.compile(r'(?:\s|&nbsp;)*\|(?:\s|&nbsp;)*')


class QianchengwuyouSpider(scrapy.Spider):
    """Crawl 51job search result pages and emit one item per job posting.

    Flow: ``parse`` schedules every search result page, ``parse_detail``
    pulls the posting links out of a result page, and ``parse_list`` scrapes
    a single posting page.  The last two names read as if they were swapped;
    they are kept unchanged so existing references keep working.
    """

    name = 'qianchengwuyou'
    allowed_domains = ['51job.com']
    start_urls = ['http://51job.com/']

    # Search result URL; ``{}`` receives the 1-based page number.
    list_url_template = (
        'https://search.51job.com/list/'
        '010000,000000,0000,32,9,99,%2B,2,{}.html'
    )
    # Highest result page requested (inclusive).
    last_page = 1619

    def parse(self, response):
        """Schedule a request for every search result page.

        The content of ``response`` is not used; the start URL only kicks
        off the crawl.
        """
        for page in range(1, self.last_page + 1):
            yield scrapy.Request(
                self.list_url_template.format(page),
                callback=self.parse_detail,
            )

    def parse_detail(self, response):
        """Request every job posting linked from a search result page."""
        hrefs = response.xpath('//div[@class="el"]/p/span/a/@href').extract()
        for href in hrefs:
            # urljoin leaves absolute links untouched and resolves relative
            # ones, which scrapy.Request would otherwise reject.
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_list,
            )

    def parse_list(self, response):
        """Scrape one job posting page into a ``LagouItem``.

        Yields a single item, or nothing (after logging a warning) when the
        summary line is missing or has fewer than three fields.
        """
        match = _MSG_TITLE_RE.search(response.text)
        if match is None:
            self.logger.warning('No summary line found on %s', response.url)
            return

        fields = _FIELD_SEP_RE.split(match.group(1))
        if len(fields) < 3:
            self.logger.warning(
                'Unexpected summary line %r on %s', match.group(1), response.url
            )
            return

        # A four-field summary is treated as carrying no education entry;
        # otherwise the third field is the education requirement.
        education = '学历不限' if len(fields) == 4 else fields[2]

        description_parts = response.xpath(
            '//div[@class="tBorderTop_box"]/div/p/text()'
        ).extract()

        item = LagouItem()
        # Job title
        item['title'] = response.xpath(
            '//div[@class="cn"]/h1/text()'
        ).extract_first()
        # Monthly salary
        item['salary'] = response.xpath(
            '//div[@class="cn"]/strong/text()'
        ).extract_first()
        # Location
        item['position'] = fields[0]
        # Experience requirement
        item['jingyan'] = fields[1]
        # Education requirement
        item['xueli'] = education
        # Publication time (last field of the summary line)
        item['shijian'] = fields[-1]
        # Source site
        item['fabu'] = '前程无忧'
        # Job description
        item['job_bt'] = ''.join(description_parts)
        yield item
标签:要求 tle dal yield http pos sea code url
原文地址:https://www.cnblogs.com/lxh777/p/9581019.html