码迷,mamicode.com
首页 > 编程语言 > 详细

python+selenium爬取关键字搜索google图片

时间:2019-07-05 19:27:39      阅读:152      评论:0      收藏:0      [点我收藏+]

标签:--   roc   meta   结果   div   arch   key   open   nta   

  1 # -*- coding: utf-8 -*-
  2 
  3 import json
  4 import os
  5 import time
  6 from multiprocessing import Pool
  7 import multiprocessing
  8 import requests
  9 from selenium import webdriver
 10 
 11 
 12 def get_image_links(keyword, num_requested = 1000):
 13     """get image links with selenium
 14     """
 15     number_of_scrolls = int(num_requested/400) + 1 
 16     img_urls = set()#设置为集合,自动去除重复链接
 17     chrome_options = webdriver.ChromeOptions()
 18     # chrome_options.add_argument(‘--headless‘)#设置无头浏览器
 19     # chrome_options.add_argument(‘user-agent="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"‘)
 20     # chrome_options.add_argument("lang=en_US")#设置语言
 21     # prefs = {"profile.managed_default_content_settings.images":2}
 22     # chrome_options.add_experimental_option("prefs",prefs)#配置不加载图片
 23     driver = webdriver.Chrome(chrome_options=chrome_options)
 24     driver.maximize_window()
 25     search_query = keyword
 26     url = "https://www.google.com/search?q="+search_query+"&source=lnms&tbm=isch"
 27     driver.get(url)
 28     for _ in range(number_of_scrolls):
 29         for i in range(5):
 30             # multiple scrolls needed to show all 400 images
 31             driver.execute_script("window.scrollBy(0, 100000)")
 32             time.sleep(1)
 33         time.sleep(5)#等待页面刷新,否则有可能元素不可见
 34         try:
 35             # driver.find_element_by_xpath("//input[@value=‘Show more results‘]").click()#浏览器的中英文版本不同
 36             driver.find_element_by_xpath("//input[@value=‘显示更多结果‘]").click()
 37         except Exception as e:
 38             print("reach the end of page ")
 39             break
 40 
 41     # with open(‘page.html‘,‘w‘) as f:
 42     #     f.write(driver.page_source)
 43     imgs = driver.find_elements_by_xpath(//div[contains(@class,"rg_meta")])#模糊定位
 44     for i,img in enumerate(imgs):
 45         img_url = json.loads(img.get_attribute(innerHTML))["ou"]
 46         img_urls.add(img_url)
 47     driver.quit()
 48     print("finish getting all image urls!")
 49 
 50     return img_urls
 51 
 52 def download(urls,download_dir):
 53     ‘‘‘download images
 54     ‘‘‘
 55     print("start downloading images!")
 56     for url in urls:
 57         filename=os.path.join(download_dir,os.path.basename(url))
 58         try:
 59             r = requests.get(url, stream=True, timeout=60)
 60             r.raise_for_status()
 61             with open(filename, wb) as f:
 62                 f.write(r.content)  
 63         except Exception:
 64             continue
 65     print("finish downloading images!")
 66 
 67 keywords = [girl,boy]
 68 download_dir = ./images/
 69 download_dirs = []
 70 for keyword in keywords:
 71     path = os.path.join(download_dir,keyword)
 72     download_dirs.append(path)
 73     if not os.path.exists(path):
 74         os.makedirs(path)
 75 
 76 # for keyword in main_keywords: 
 77 #     image_urls = get_image_links(keyword)
 78 #     download(image_urls,download_dir)
 79 
 80 
 81 ###################################
 82 # get image links/MultiProcess
 83 ################################### 
 84 img_urls=[]
 85 multiprocessing.freeze_support()
 86 p = Pool(4) # default number of process is the number of cores of your CPU, change it by yourself
 87 for keyword in keywords:
 88     img_urls.append(p.apply_async(get_image_links, (keyword,)))
 89 #img_urls:[<multiprocessing.pool.ApplyResult object at 0x7f536925fcc0>, <multiprocessing.pool.ApplyResult object at 0x7f536925fd68>]
 90 for i,urls in enumerate(img_urls):
 91     img_urls[i]=urls.get()
 92 p.close()
 93 p.join()
 94 
 95 
 96 # # ###################################
 97 # # # download images/MultiProcess
 98 # # ###################################
 99 p = Pool(4) # default number of process is the number of cores of your CPU, change it by yourself
100 for i,urls in enumerate(img_urls):
101     p.apply_async(download, [urls,download_dirs[i]])
102 p.close()
103 p.join()

 

python+selenium爬取关键字搜索google图片

标签:--   roc   meta   结果   div   arch   key   open   nta   

原文地址:https://www.cnblogs.com/buyizhiyou/p/11140128.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!