标签:ati title open ted final 美食 enc port select
"""Scrape Taobao search-result listings for a keyword using Selenium.

The script opens taobao.com in a PhantomJS browser, submits ``KEYWORD`` in the
search box, prints every product found on the first result page, then walks
the remaining result pages through the pager's "jump to page" form.
"""
import re

from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from config import KEYWORD, SERVICE_ARGS

# Upper bound on attempts for one page action. The previous implementation
# retried by recursing without limit, which ended in a RecursionError.
_MAX_ATTEMPTS = 5

# CSS selector matching a single product card in the result list.
_ITEM_SELECTOR = '#mainsrp-itemlist .items .item'

driver = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# Alternative driver for debugging with a visible browser: webdriver.Chrome()
wait = WebDriverWait(driver, 10)
# The explicit window size is required: per the original author's note, without
# it the page content cannot be scraped and the waits fail with a timeout.
driver.set_window_size(1400, 900)


def _with_retries(action):
    """Call ``action`` until it succeeds or ``_MAX_ATTEMPTS`` timeouts occur.

    Prints ``TimeOut`` after every failed attempt. Returns whatever ``action``
    returns; re-raises the last ``TimeoutException`` once attempts run out.
    """
    for attempt in range(1, _MAX_ATTEMPTS + 1):
        try:
            return action()
        except TimeoutException:
            print('TimeOut')
            if attempt == _MAX_ATTEMPTS:
                raise


def search():
    """Search Taobao for ``KEYWORD`` and print the first page of products.

    Returns:
        The text of the pager's "total" element, which contains the number of
        result pages.

    Raises:
        TimeoutException: if the page still times out after ``_MAX_ATTEMPTS``
            attempts.
    """

    def attempt():
        print('正在搜索')
        driver.get('http://www.taobao.com')
        search_box = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
        )
        submit = wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')
            )
        )
        search_box.send_keys(KEYWORD)
        submit.click()
        total = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')
            )
        )
        get_products()
        return total.text

    return _with_retries(attempt)


def next_page(page_number):
    """Jump to result page ``page_number`` and print its products.

    Args:
        page_number: 1-based page index to type into the pager form.

    Raises:
        TimeoutException: if the page still times out after ``_MAX_ATTEMPTS``
            attempts.
    """

    def attempt():
        print('正在翻页', page_number)
        page_box = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > div.form > input')
            )
        )
        submit = wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')
            )
        )
        page_box.clear()
        page_box.send_keys(str(page_number))
        submit.click()
        # Wait until the pager highlights the requested page number, so the
        # product list parsed below belongs to the new page.
        wait.until(
            EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                str(page_number),
            )
        )
        get_products()

    _with_retries(attempt)


def get_products():
    """Parse the current page source and print one dict per product card.

    Raises:
        TimeoutException: if no product card appears within the wait timeout.
    """
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, _ITEM_SELECTOR)))
    doc = pq(driver.page_source)
    for item in doc(_ITEM_SELECTOR).items():
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drops the last three characters of the deal-count text
            # (presumably a fixed unit suffix -- confirm against the live page).
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)


def main():
    """Run the search, walk every result page, and always shut the driver down."""
    try:
        total_text = search()
        match = re.search(r'(\d+)', total_text)
        if match is None:
            raise ValueError(
                'could not find a page count in pager text: %r' % (total_text,)
            )
        total_pages = int(match.group(1))
        # Page 1 was already printed by search().
        for page_number in range(2, total_pages + 1):
            next_page(page_number)
    except Exception as e:
        # Top-level boundary: report the failure instead of a traceback.
        print(e)
    finally:
        # quit() ends the whole browser session; close() would only close the
        # current window and leave the PhantomJS process running.
        driver.quit()


if __name__ == '__main__':
    main()
config 文件(需保存为 config.py,与上面的脚本放在同一目录):
# Command-line switches for the PhantomJS driver: image loading off, disk
# cache on.
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']

# Search keyword for the scraper.
KEYWORD = '美食'
标签:ati title open ted final 美食 enc port select
原文地址:http://www.cnblogs.com/114811yayi/p/7226206.html