码迷,mamicode.com
首页 > 其他好文 > 详细

基于selenium爬取京东

时间:2019-11-17 01:57:00      阅读:69      评论:0      收藏:0      [点我收藏+]

标签:pat   text   页面   imp   form   led   cat   脚本   --   

爬取iphone

注意:对当前网页做任意操作后,browser对象都会发生变化

import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
#

if __name__ == "__main__":
    # Crawl JD.com search results for "iphone", sorted by sales, and print
    # one dict (price / name / comment / shop) per product on every page.
    #
    # Imported locally so this script block is self-contained. The
    # find_element(By.<kind>, value) form works on Selenium 3 and 4, while
    # the find_element_by_* helpers were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    browser = webdriver.Chrome()
    try:
        browser.get("https://www.jd.com")
        # Type the search term into the element with id "key" and submit.
        _input = browser.find_element(By.ID, "key")
        _input.send_keys("iphone")
        _input.send_keys(Keys.ENTER)
        time.sleep(5)
        # Sort by sales: click the second link of the "f-sort" bar.
        sales = browser.find_element(By.XPATH, '//div[@class="f-sort"]/a[2]')
        sales.click()

        has_next = True
        while has_next:
            # Fixed wait for the page to render, then read the current page
            # number from the "skip to page" input at the bottom.
            time.sleep(5)
            cur_page = browser.find_element(
                By.XPATH,
                '//div[@id="J_bottomPage"]/span[@class="p-skip"]/input',
            ).get_attribute("value")
            print("-------------------------   当前页码 {}  -------------------------".format(cur_page))

            # Per the original author's note, products are loaded as the
            # page scrolls down, so scroll to the "next page" button's y
            # position to trigger loading of the whole list.
            next_page = browser.find_element(By.CLASS_NAME, "pn-next")
            y = next_page.location["y"]
            browser.execute_script("window.scrollTo(0, {})".format(y))
            time.sleep(3)

            # Collect every product card on the current page.
            p_list = browser.find_elements(By.CLASS_NAME, "gl-item")
            for p in p_list:
                production = {}
                # The sku is embedded in the price and comment element
                # selectors used below.
                sku = p.get_attribute("data-sku")
                production["price"] = p.find_element(
                    By.CSS_SELECTOR, "strong.J_{}".format(sku)
                ).text
                production["name"] = p.find_element(
                    By.CSS_SELECTOR, "div.p-name>a>em"
                ).text
                production["comment"] = p.find_element(
                    By.ID, "J_comment_{}".format(sku)
                ).text
                production["shop"] = p.find_element(
                    By.CSS_SELECTOR, "div.p-shop>span>a"
                ).get_attribute("title")
                print(production)

            # Look the button up again after scrolling, rather than reusing
            # the earlier reference, before deciding whether to continue.
            cur_next_page = browser.find_element(By.CLASS_NAME, "pn-next")
            # A "disabled" class on the button marks the last page.
            if "disabled" in cur_next_page.get_attribute("class"):
                has_next = False
            else:
                cur_next_page.click()
    finally:
        # Always close the browser, even if a lookup above raised.
        browser.quit()

 优化

import time
import sys
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
#

if __name__ == "__main__":
    # Crawl JD.com search results for a keyword, sorted by sales, and print
    # one dict (price / name / comment / shop) per product on every page.
    # The keyword is the first command-line argument, defaulting to "iphone".
    #
    # Imported locally so this script block is self-contained. The
    # find_element(By.<kind>, value) form works on Selenium 3 and 4, while
    # the find_element_by_* helpers were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    keyword = sys.argv[1] if len(sys.argv) > 1 else "iphone"

    browser = webdriver.Chrome()
    try:
        browser.get("https://www.jd.com")
        # Type the keyword into the element with id "key" and submit.
        _input = browser.find_element(By.ID, "key")
        _input.send_keys(keyword)
        _input.send_keys(Keys.ENTER)
        time.sleep(5)
        # Sort by sales: click the second link of the "f-sort" bar.
        sales = browser.find_element(By.XPATH, '//div[@class="f-sort"]/a[2]')
        sales.click()

        has_next = True
        while has_next:
            # Fixed wait for the page to render, then read the current page
            # number from the "skip to page" input at the bottom.
            time.sleep(5)
            cur_page = browser.find_element(
                By.XPATH,
                '//div[@id="J_bottomPage"]/span[@class="p-skip"]/input',
            ).get_attribute("value")
            print("-------------------------   当前页码 {}  -------------------------".format(cur_page))

            # Per the original author's note, products are loaded as the
            # page scrolls down, so scroll to the "next page" button's y
            # position to trigger loading of the whole list.
            next_page = browser.find_element(By.CLASS_NAME, "pn-next")
            y = next_page.location["y"]
            browser.execute_script("window.scrollTo(0, {})".format(y))
            time.sleep(3)

            # Collect every product card on the current page.
            p_list = browser.find_elements(By.CLASS_NAME, "gl-item")
            for p in p_list:
                production = {}
                # The sku is embedded in the price and comment element
                # selectors used below.
                sku = p.get_attribute("data-sku")
                production["price"] = p.find_element(
                    By.CSS_SELECTOR, "strong.J_{}".format(sku)
                ).text
                production["name"] = p.find_element(
                    By.CSS_SELECTOR, "div.p-name>a>em"
                ).text
                production["comment"] = p.find_element(
                    By.ID, "J_comment_{}".format(sku)
                ).text
                production["shop"] = p.find_element(
                    By.CSS_SELECTOR, "div.p-shop>span>a"
                ).get_attribute("title")
                print(production)

            # Look the button up again after scrolling, rather than reusing
            # the earlier reference, before deciding whether to continue.
            cur_next_page = browser.find_element(By.CLASS_NAME, "pn-next")
            # A "disabled" class on the button marks the last page.
            if "disabled" in cur_next_page.get_attribute("class"):
                has_next = False
            else:
                cur_next_page.click()
    finally:
        # Always close the browser, even if a lookup above raised.
        browser.quit()

通过sys.argv使脚本接收命令行参数,例如:

python jd.py mac

补充

sys.argv[0] 是脚本的名称

sys.argv[1] 是传给脚本的第一个命令行参数

基于selenium爬取京东

标签:pat   text   页面   imp   form   led   cat   脚本   --   

原文地址:https://www.cnblogs.com/wt7018/p/11874823.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!