码迷,mamicode.com
首页 > 其他好文 > 详细

selenium 爬boss

时间:2019-07-16 08:32:16      阅读:84      评论:0      收藏:0      [点我收藏+]

标签:des   信息   python   ons   strip   selenium   ini   position   url   

# 有问题

from selenium import webdriver
import time
from lxml import etree



class LagouSpider(object):
    """Selenium crawler for Python job postings on BOSS Zhipin (zhipin.com).

    Despite its name the spider targets zhipin.com; the class name is kept
    so existing callers keep working.

    Workflow: ``run`` loads the search-result page, ``parse_list_page``
    builds one record per posting and ``detail_page`` opens each posting in
    a temporary browser window to fetch its description. Finished records
    are collected in ``self.positions``.
    """

    # Location of the chromedriver binary handed to Selenium.
    driver_path = r"G:\Crawler and Data\chromedriver.exe"
    # Scheme and host prepended to the relative links found on the list page.
    base_url = "https://www.zhipin.com"

    def __init__(self):
        """Start a Chrome session and initialise the result containers."""
        # NOTE(review): ``executable_path`` is the Selenium 3 spelling; newer
        # Selenium releases expect a Service object instead - confirm against
        # the installed version.
        self.driver = webdriver.Chrome(executable_path=self.driver_path)
        self.url = "https://www.zhipin.com/job_detail/?query=python&city=101010100&industry=&position="
        self.positions = []        # one finished dict per posting
        self.position_dict = {}    # record of the posting being processed
        self.detail_url_list = []  # absolute detail-page URLs seen so far

    @staticmethod
    def _first_text(nodes, default=""):
        """Return the first XPath result stripped of whitespace.

        Args:
            nodes: List returned by an XPath query that yields strings.
            default: Value returned when ``nodes`` is empty.
        """
        return nodes[0].strip() if nodes else default

    def run(self):
        """Load the search-result page and crawl every posting on it."""
        self.driver.get(self.url)
        # page_source holds the rendered HTML, including each posting's link.
        source = self.driver.page_source
        self.parse_list_page(source)

    def parse_list_page(self, source):
        """Extract every posting from the list page and crawl its detail page.

        Args:
            source: HTML text of the search-result page.

        Postings without a detail link are skipped. Each finished record
        (title, salary, company, desc) is appended to ``self.positions``.
        """
        if not source:
            return
        tree = etree.HTML(source)
        if tree is None:
            return

        li_list = tree.xpath(
            "//div[@class='job-box']/div[@class='job-list']/ul/li"
        )
        for li in li_list:
            hrefs = li.xpath('.//div[@class="info-primary"]/h3/a/@href')
            if not hrefs:
                # Without a link there is no detail page to visit.
                continue
            detail_url = self.base_url + hrefs[0]
            print(detail_url)
            self.detail_url_list.append(detail_url)

            # Start a fresh dict per posting so records do not overwrite
            # each other through a shared object.
            self.position_dict = {
                "title": self._first_text(li.xpath(
                    './/div[@class="info-primary"]/h3/a'
                    '/div[@class="job-title"]/text()'
                )),
                "salary": self._first_text(li.xpath(
                    './/div[@class="info-primary"]/h3/a'
                    '/span[@class="red"]/text()'
                )),
                "company": self._first_text(li.xpath(
                    './/div[@class="info-company"]//h3/a/text()'
                )),
            }

            self.detail_page(detail_url)
            self.positions.append(self.position_dict)

    def detail_page(self, url):
        """Fetch one posting's description into ``self.position_dict["desc"]``.

        Args:
            url: Absolute URL of the posting's detail page.

        The page is loaded in a temporary window that is always closed
        again, after which focus returns to the list window.
        """
        list_window = self.driver.current_window_handle
        # Open a blank window and navigate with get(): unlike
        # window.open(url), get() returns only after the page has loaded,
        # and the URL never has to be spliced into JavaScript source.
        self.driver.execute_script("window.open('about:blank');")
        self.driver.switch_to.window(self.driver.window_handles[-1])
        try:
            self.driver.get(url)
            tree = etree.HTML(self.driver.page_source)
            desc = []
            if tree is not None:
                desc = tree.xpath(
                    "//div[@id='main']/div[3]/div/div[2]/div[2]/div[1]/div"
                )
            # string() concatenates the text of the node and all descendants.
            desc_text = desc[0].xpath("string()").strip() if desc else ""
            self.position_dict["desc"] = desc_text
            print(self.position_dict)
            time.sleep(2)  # pause between detail requests
        finally:
            self.driver.close()  # close the detail window
            self.driver.switch_to.window(list_window)  # back to the list window


# Script entry point: crawl once, then shut the browser down.
if __name__ == "__main__":
    spider = LagouSpider()
    try:
        spider.run()
    finally:
        # Always stop the browser and chromedriver, even if crawling fails.
        spider.driver.quit()

 

selenium 爬boss

标签:des   信息   python   ons   strip   selenium   ini   position   url   

原文地址:https://www.cnblogs.com/kenD/p/11192654.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!