Scraping Lagou job listings with Selenium
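The spider below opens the Lagou search-result page for "云计算" (cloud computing), waits for the pager to render, collects the detail-page links from each list page, opens every detail page in a new browser tab, and uses lxml XPath queries to extract the position name, salary, city, required experience, education, company name, and job description. It then clicks through to the next list page until the "next" button is disabled.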
from selenium import webdriver
from lxml import etree
import re
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


class LagouSpider(object):
    driver_path = r"D:\driver\chromedriver.exe"

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
        self.url = 'https://www.lagou.com/jobs/list_%E4%BA%91%E8%AE%A1%E7%AE%97?labelWords=&fromSearch=true&suginput='
        self.positions = []

    def run(self):
        self.driver.get(self.url)
        while True:
            source = self.driver.page_source
            # Wait until the pager is rendered before parsing the list page
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='pager_container']/span[last()]"))
            )
            self.parse_list_page(source)
            try:
                next_btn = self.driver.find_element_by_xpath("//div[@class='pager_container']/span[last()]")
                # Stop when the "next page" button is disabled, i.e. the last page is reached
                if "pager_next_disabled" in next_btn.get_attribute("class"):
                    break
                else:
                    next_btn.click()
            except Exception:
                print(source)
            time.sleep(1)

    def parse_list_page(self, source):
        # Extract the detail-page links from the job list and visit them one by one
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(1)

    def request_detail_page(self, url):
        print()
        print(url)
        print()
        # Open the detail page in a new tab and switch to it
        self.driver.execute_script("window.open('%s')" % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        WebDriverWait(self.driver, timeout=10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']/span[@class='name']"))
        )
        source = self.driver.page_source
        self.parse_detail_page(source)
        # Close the detail tab and switch back to the list page
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        html = etree.HTML(source)
        position_name = html.xpath("//span[@class='name']/text()")[0]
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        salary = job_request_spans[0].xpath(".//text()")[0].strip()
        city = job_request_spans[1].xpath(".//text()")[0].strip()
        city = re.sub(r"[\s/]", "", city)
        work_years = job_request_spans[2].xpath(".//text()")[0].strip()
        work_years = re.sub(r"[\s/]", "", work_years)
        education = job_request_spans[3].xpath(".//text()")[0].strip()
        education = re.sub(r"[\s/]", "", education)
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        company_name = html.xpath("//h2[@class='f1']/text()")[0].strip()
        position = {
            'name': position_name,
            'company_name': company_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'education': education,
            'desc': desc
        }
        self.positions.append(position)
        print(position)


if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()
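Note that the script uses the Selenium 3 API: passing executable_path directly to webdriver.Chrome and calling find_element_by_xpath, both of which were deprecated and later removed in Selenium 4. If you run the code against a current Selenium release, the minimal sketch below shows the equivalent calls; the driver path, URL, and XPath are copied from the script above and the Lagou page structure may have changed since the post was written, so they are only illustrative.

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4 style: wrap the chromedriver path in a Service object
service = Service(executable_path=r"D:\driver\chromedriver.exe")
driver = webdriver.Chrome(service=service)

driver.get('https://www.lagou.com/jobs/list_%E4%BA%91%E8%AE%A1%E7%AE%97?labelWords=&fromSearch=true&suginput=')

# Selenium 4 style: find_element(By.XPATH, ...) replaces find_element_by_xpath(...)
next_btn = driver.find_element(By.XPATH, "//div[@class='pager_container']/span[last()]")

Everything else in the spider (WebDriverWait, expected_conditions, window-handle switching) works the same way under Selenium 4.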
Original post: https://www.cnblogs.com/kingle-study/p/9953842.html