基于selenium爬取京东

时间：2019-11-17 01:57:00 阅读：69 评论：0 收藏：0 [点我收藏+]

爬取iphone

注意：当对当前网页做任意操作时，browser 对象会发生变化。

import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
#

if __name__ == '__main__':
    # Scrape JD.com search results for "iphone": open the home page, run the
    # search, sort by sales, then walk every result page and print one dict
    # (price, name, comment, shop) per product.

    # Imported locally so this script block is self-contained. `By` provides
    # the locator strategies for find_element()/find_elements(), which are
    # used instead of the find_element_by_* helpers that newer Selenium
    # releases no longer provide.
    from selenium.webdriver.common.by import By

    browser = webdriver.Chrome()
    try:
        browser.get('https://www.jd.com')
        # Search for "iphone" through the search box (element id "key").
        _input = browser.find_element(By.ID, 'key')
        _input.send_keys('iphone')
        _input.send_keys(Keys.ENTER)
        time.sleep(5)
        # Sort by sales volume.
        sales = browser.find_element(By.XPATH, '//div[@class="f-sort"]/a[2]')
        sales.click()

        has_next = True
        while has_next:
            # Give the page time to render, then read the current page number
            # from the "skip to page" input at the bottom.
            time.sleep(5)
            cur_page = browser.find_element(
                By.XPATH,
                '//div[@id="J_bottomPage"]/span[@class="p-skip"]/input',
            ).get_attribute('value')
            print('-------------------------   当前页码 {}  -------------------------'.format(cur_page))

            # Load all data: items are loaded as the page is scrolled down, so
            # scroll to the vertical position of the "next page" button.
            next_page = browser.find_element(By.CLASS_NAME, 'pn-next')
            y = next_page.location['y']
            browser.execute_script('window.scrollTo(0, {})'.format(y))
            time.sleep(3)

            # Collect every product entry on the current page.
            p_list = browser.find_elements(By.CLASS_NAME, 'gl-item')
            for p in p_list:
                production = {}
                # The SKU is part of the class/id of the price and comment nodes.
                sku = p.get_attribute('data-sku')
                production['price'] = p.find_element(By.CSS_SELECTOR, 'strong.J_{}'.format(sku)).text
                production['name'] = p.find_element(By.CSS_SELECTOR, 'div.p-name>a>em').text
                production['comment'] = p.find_element(By.ID, 'J_comment_{}'.format(sku)).text
                production['shop'] = p.find_element(By.CSS_SELECTOR, 'div.p-shop>span>a').get_attribute('title')
                print(production)

            # Next page: look the button up again rather than reusing the
            # reference obtained before scrolling.
            cur_next_page = browser.find_element(By.CLASS_NAME, 'pn-next')
            # A "disabled" class on the button marks the last page.
            if 'disabled' in cur_next_page.get_attribute('class'):
                has_next = False
            else:
                cur_next_page.click()
    finally:
        # Always shut the browser down, even if a lookup or click fails.
        browser.quit()

优化

import time
import sys
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
#

if __name__ == '__main__':
    # Scrape JD.com search results for a keyword: open the home page, run the
    # search, sort by sales, then walk every result page and print one dict
    # (price, name, comment, shop) per product.
    # The keyword defaults to "iphone" and can be overridden by the first
    # command-line argument.

    # Imported locally so this script block is self-contained. `By` provides
    # the locator strategies for find_element()/find_elements(), which are
    # used instead of the find_element_by_* helpers that newer Selenium
    # releases no longer provide.
    from selenium.webdriver.common.by import By

    keyword = 'iphone'
    if len(sys.argv) > 1:
        keyword = sys.argv[1]

    browser = webdriver.Chrome()
    try:
        browser.get('https://www.jd.com')
        # Type the keyword into the search box (element id "key") and submit.
        _input = browser.find_element(By.ID, 'key')
        _input.send_keys(keyword)
        _input.send_keys(Keys.ENTER)
        time.sleep(5)
        # Sort by sales volume.
        sales = browser.find_element(By.XPATH, '//div[@class="f-sort"]/a[2]')
        sales.click()

        has_next = True
        while has_next:
            # Give the page time to render, then read the current page number
            # from the "skip to page" input at the bottom.
            time.sleep(5)
            cur_page = browser.find_element(
                By.XPATH,
                '//div[@id="J_bottomPage"]/span[@class="p-skip"]/input',
            ).get_attribute('value')
            print('-------------------------   当前页码 {}  -------------------------'.format(cur_page))

            # Load all data: items are loaded as the page is scrolled down, so
            # scroll to the vertical position of the "next page" button.
            next_page = browser.find_element(By.CLASS_NAME, 'pn-next')
            y = next_page.location['y']
            browser.execute_script('window.scrollTo(0, {})'.format(y))
            time.sleep(3)

            # Collect every product entry on the current page.
            p_list = browser.find_elements(By.CLASS_NAME, 'gl-item')
            for p in p_list:
                production = {}
                # The SKU is part of the class/id of the price and comment nodes.
                sku = p.get_attribute('data-sku')
                production['price'] = p.find_element(By.CSS_SELECTOR, 'strong.J_{}'.format(sku)).text
                production['name'] = p.find_element(By.CSS_SELECTOR, 'div.p-name>a>em').text
                production['comment'] = p.find_element(By.ID, 'J_comment_{}'.format(sku)).text
                production['shop'] = p.find_element(By.CSS_SELECTOR, 'div.p-shop>span>a').get_attribute('title')
                print(production)

            # Next page: look the button up again rather than reusing the
            # reference obtained before scrolling.
            cur_next_page = browser.find_element(By.CLASS_NAME, 'pn-next')
            # A "disabled" class on the button marks the last page.
            if 'disabled' in cur_next_page.get_attribute('class'):
                has_next = False
            else:
                cur_next_page.click()
    finally:
        # Always shut the browser down, even if a lookup or click fails.
        browser.quit()

通过 sys.argv 从命令行传入搜索关键字，使用方式如下：

python jd.py mac

补充

sys.argv[0] 是脚本的名称

sys.argv[1] 是传给脚本的第一个命令行参数（例如上面命令中的 mac）

基于selenium爬取京东

标签：pat text 页面 imp form led cat 脚本 --

原文地址：https://www.cnblogs.com/wt7018/p/11874823.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行