标签:imp condition 加载完成 完成 attr print img browser oba
"""Scrape Taobao search results for KEYWORD with Selenium and pyquery.

Opens the search page in Chrome, optionally jumps to a given result page
through the pager form, waits for the product grid to appear and prints one
dict per product found on the page.
"""
import re  # NOTE(review): not used in the visible script; kept in case other code relies on it
from urllib.parse import quote

from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

KEYWORD = '小米手机'
MAX_PAGE = 3
# Upper bound on attempts per page so a page that never loads cannot
# trigger endless retries.
MAX_RETRIES = 3

# CSS selectors used on the search-result page.
ITEM_SELECTOR = '#mainsrp-itemlist .m-itemlist .grid.g-clearfix .item'
PAGER_INPUT_SELECTOR = '#mainsrp-pager > div > div > div > div.form > input'
PAGER_SUBMIT_SELECTOR = (
    '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit'
)
PAGER_ACTIVE_SELECTOR = (
    '#mainsrp-pager > div > div > div > ul > li.item.active > span'
)

# Browser driver and the explicit wait (10 second timeout) shared by all
# functions below.
browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)


def get_products():
    """Parse the page currently loaded in the browser and print each product.

    For every element matching ITEM_SELECTOR a dict with the keys
    ``image``, ``price``, ``deal``, ``title``, ``shop`` and ``location`` is
    built from the item's sub-elements and printed.
    """
    # Parse the current page source once.
    content = pq(browser.page_source)
    for item in content(ITEM_SELECTOR).items():
        product = {
            'image': item.find('.pic .img').attr('data-src'),
            'price': item.find('.price').text().strip(),
            'deal': item.find('.deal-cnt').text(),
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print('--------{}----------\n'.format(product))


def _load_page(page):
    """Navigate to result page *page* and block until its items are present.

    Raises:
        TimeoutException: if any of the waited-for elements does not show up
            within the timeout configured on ``wait``.
    """
    # Percent-encode the keyword so non-ASCII or reserved characters cannot
    # corrupt the query string.
    url = 'https://s.taobao.com/search?q={}'.format(quote(KEYWORD))
    print(url)
    browser.get(url)

    # Anything other than the first page is reached through the pager form.
    if page > 1:
        page_input = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, PAGER_INPUT_SELECTOR)
        ))
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, PAGER_SUBMIT_SELECTOR)
        ))
        page_input.clear()
        page_input.send_keys(str(page))
        submit.click()

    # Wait until the highlighted pager entry shows the requested page number.
    wait.until(EC.text_to_be_present_in_element(
        (By.CSS_SELECTOR, PAGER_ACTIVE_SELECTOR), str(page)
    ))
    # Wait until product items are present in the DOM.
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, ITEM_SELECTOR)
    ))


def index_page(page, retries=MAX_RETRIES):
    """Fetch one index (search result) page and print its products.

    The page load is retried when a wait times out. The original code
    retried by calling ``index_page()`` without the page argument, which
    raised ``TypeError``; the retry is now an explicit, bounded loop.

    :param page: 1-based number of the result page to fetch.
    :param retries: maximum number of load attempts before giving up.
    :return: None
    """
    print(10 * '-', '正在抓取第{}页'.format(page), 10 * '-')
    for attempt in range(1, retries + 1):
        try:
            _load_page(page)
        except TimeoutException:
            print('第{}页加载超时,第{}/{}次尝试'.format(page, attempt, retries))
        else:
            get_products()
            return
    print('第{}页抓取失败,已跳过'.format(page))


def main():
    """Fetch every result page from 1 to MAX_PAGE inclusive."""
    for page in range(1, MAX_PAGE + 1):
        index_page(page)


if __name__ == '__main__':
    try:
        main()
    finally:
        # Always shut the browser down, even if scraping failed part-way.
        browser.quit()
标签:imp condition 加载完成 完成 attr print img browser oba
原文地址:https://www.cnblogs.com/ray-mmss/p/9385927.html