Python + Selenium：按关键字搜索并爬取 Google 图片

时间：2019-07-05 19:27:39 阅读：152 评论：0 收藏：0 [点我收藏+]

  1 # -*- coding: utf-8 -*-
  2 
  3 import json
  4 import os
  5 import time
  6 from multiprocessing import Pool
  7 import multiprocessing
  8 import requests
  9 from selenium import webdriver
 10 
 11 
 12 def get_image_links(keyword, num_requested = 1000):
 13     """get image links with selenium
 14     """
 15     number_of_scrolls = int(num_requested/400) + 1 
 16     img_urls = set()#设置为集合，自动去除重复链接
 17     chrome_options = webdriver.ChromeOptions()
 18     # chrome_options.add_argument(‘--headless‘)#设置无头浏览器
 19     # chrome_options.add_argument(‘user-agent="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"‘)
 20     # chrome_options.add_argument("lang=en_US")#设置语言
 21     # prefs = {"profile.managed_default_content_settings.images":2}
 22     # chrome_options.add_experimental_option("prefs",prefs)#配置不加载图片
 23     driver = webdriver.Chrome(chrome_options=chrome_options)
 24     driver.maximize_window()
 25     search_query = keyword
 26     url = "https://www.google.com/search?q="+search_query+"&source=lnms&tbm=isch"
 27     driver.get(url)
 28     for _ in range(number_of_scrolls):
 29         for i in range(5):
 30             # multiple scrolls needed to show all 400 images
 31             driver.execute_script("window.scrollBy(0, 100000)")
 32             time.sleep(1)
 33         time.sleep(5)#等待页面刷新，否则有可能元素不可见
 34         try:
 35             # driver.find_element_by_xpath("//input[@value=‘Show more results‘]").click()＃浏览器的中英文版本不同
 36             driver.find_element_by_xpath("//input[@value=‘显示更多结果‘]").click()
 37         except Exception as e:
 38             print("reach the end of page ")
 39             break
 40 
 41     # with open(‘page.html‘,‘w‘) as f:
 42     #     f.write(driver.page_source)
 43     imgs = driver.find_elements_by_xpath(‘//div[contains(@class,"rg_meta")]‘)#模糊定位
 44     for i,img in enumerate(imgs):
 45         img_url = json.loads(img.get_attribute(‘innerHTML‘))["ou"]
 46         img_urls.add(img_url)
 47     driver.quit()
 48     print("finish getting all image urls!")
 49 
 50     return img_urls
 51 
 52 def download(urls,download_dir):
 53     ‘‘‘download images
 54     ‘‘‘
 55     print("start downloading images!")
 56     for url in urls:
 57         filename=os.path.join(download_dir,os.path.basename(url))
 58         try:
 59             r = requests.get(url, stream=True, timeout=60)
 60             r.raise_for_status()
 61             with open(filename, ‘wb‘) as f:
 62                 f.write(r.content)  
 63         except Exception:
 64             continue
 65     print("finish downloading images!")
 66 
 67 keywords = [‘girl‘,‘boy‘]
 68 download_dir = ‘./images/‘
 69 download_dirs = []
 70 for keyword in keywords:
 71     path = os.path.join(download_dir,keyword)
 72     download_dirs.append(path)
 73     if not os.path.exists(path):
 74         os.makedirs(path)
 75 
 76 # for keyword in main_keywords: 
 77 #     image_urls = get_image_links(keyword)
 78 #     download(image_urls,download_dir)
 79 
 80 
 81 ###################################
 82 # get image links/MultiProcess
 83 ################################### 
 84 img_urls=[]
 85 multiprocessing.freeze_support()
 86 p = Pool(4) # default number of process is the number of cores of your CPU, change it by yourself
 87 for keyword in keywords:
 88     img_urls.append(p.apply_async(get_image_links, (keyword,)))
 89 #img_urls:[<multiprocessing.pool.ApplyResult object at 0x7f536925fcc0>, <multiprocessing.pool.ApplyResult object at 0x7f536925fd68>]
 90 for i,urls in enumerate(img_urls):
 91     img_urls[i]=urls.get()
 92 p.close()
 93 p.join()
 94 
 95 
 96 # # ###################################
 97 # # # download images/MultiProcess
 98 # # ###################################
 99 p = Pool(4) # default number of process is the number of cores of your CPU, change it by yourself
100 for i,urls in enumerate(img_urls):
101     p.apply_async(download, [urls,download_dirs[i]])
102 p.close()
103 p.join()

标签：-- roc meta 结果 div arch key open nta

原文地址：https://www.cnblogs.com/buyizhiyou/p/11140128.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行