# In the project's root directory create a list.txt (it can be edited with Notepad), save it as UTF-8, and put one keyword to crawl per line, for example (use English keywords when crawling images from Google):
# 手机  (mobile phone)
# 蓝天  (blue sky)
Personally, I think Google and Baidu image search give the best results.
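If you are not sure whether list.txt really ended up as UTF-8, a minimal sanity check like the sketch below (only the file name list.txt comes from this post, the rest is standard library) prints the keywords that will be crawled, or raises a UnicodeDecodeError if the encoding is wrong:

# -*- coding:utf-8 -*-
# Read list.txt as UTF-8 and collect one keyword per non-empty line.
with open("list.txt", "r", encoding="utf-8") as f:
    keywords = [line.strip() for line in f if line.strip()]

print(keywords)  # e.g. ['手机', '蓝天']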
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import os
import re
import json
import socket
import time
import urllib.request
import urllib.parse
import urllib.error
from collections import defaultdict

import imagehash
from PIL import Image
from PIL import ImageFile

# Allow PIL to open partially downloaded images instead of raising an error.
ImageFile.LOAD_TRUNCATED_IMAGES = True

timeout = 5
socket.setdefaulttimeout(timeout)


class Crawler:
    """Download images from Baidu image search, 60 results per page."""

    __amount = 0
    __start_amount = 0
    __counter = 0
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}

    def __init__(self, t=0.1):
        # Delay between requests, in seconds.
        self.time_sleep = t

    def __save_image(self, rsp_data, word):
        # One folder per keyword; continue numbering from files already there.
        if not os.path.exists("./" + word):
            os.mkdir("./" + word)

        self.__counter = len(os.listdir('./' + word)) + 1
        for image_info in rsp_data['imgs']:
            try:
                time.sleep(self.time_sleep)
                suffix = self.__get_suffix(image_info['objURL'])
                urllib.request.urlretrieve(image_info['objURL'],
                                           './' + word + '/' + str(self.__counter) + suffix)
            except urllib.error.HTTPError as urllib_err:
                print(urllib_err)
                continue
            except Exception as err:
                time.sleep(1)
                print(err)
                print("unknown error, image not saved")
                continue
            else:
                print("Picture +1, already have " + str(self.__counter) + " pictures")
                self.__counter += 1
        return

    @staticmethod
    def __get_suffix(name):
        # Use the URL's extension if it looks sane, otherwise fall back to .jpeg.
        m = re.search(r'\.[^\.]*$', name)
        if m and len(m.group(0)) <= 5:
            return m.group(0)
        return '.jpeg'

    @staticmethod
    def __get_prefix(name):
        return name[:name.find('.')]

    def __get_images(self, word='sky'):
        search = urllib.parse.quote(word)
        pn = self.__start_amount
        while pn < self.__amount:
            url = ('http://image.baidu.com/search/avatarjson?tn=resultjsonavatarnew&ie=utf-8&word=' + search
                   + '&cg=girl&pn=' + str(pn)
                   + '&rn=60&itg=0&z=0&fr=&width=&height=&lm=-1&ic=0&s=0&st=-1&gsm=1e0000001e')
            try:
                time.sleep(self.time_sleep)
                req = urllib.request.Request(url=url, headers=self.headers)
                page = urllib.request.urlopen(req)
                rsp = page.read().decode('unicode_escape')
                print("Already tried")
            except UnicodeDecodeError as e:
                print(e)
                print('-----UnicodeDecodeError url:', url)
            except urllib.error.URLError as e:
                print(e)
                print("-----urlError url:", url)
            except socket.timeout as e:
                print(e)
                print("-----socket timeout:", url)
            # On a network error pn is not advanced, so the same page is retried.
            else:
                try:
                    rsp_data = json.loads(rsp, strict=False)
                    self.__save_image(rsp_data, word)
                    print("next page")
                    pn += 60
                except json.decoder.JSONDecodeError as e:
                    print(e)
                    pn += 60
                    print("json decode problem")
                finally:
                    page.close()
        print("finished")
        return

    def start(self, word, spider_page_num=10, start_page=1):
        # Each result page holds 60 images; crawl spider_page_num pages starting at start_page.
        self.__start_amount = (start_page - 1) * 60
        self.__amount = spider_page_num * 60 + self.__start_amount
        self.__get_images(word)


def removeDup(path):
    """Remove near-duplicate images inside path, using perceptual (average) hashes."""
    if not os.path.isdir(path):
        print(path + " is not a directory!")
        exit()

    hash_paths = []
    for file in os.listdir(path):
        file_path = os.path.join(path, file)
        ext = file_path.split('.')[-1]
        if ext.lower() in ('jpg', 'jpeg'):
            try:
                hash_value = imagehash.average_hash(Image.open(file_path))
                hash_paths.append((str(hash_value), file_path))
            except IOError:
                continue

    # Group files by hash; every file after the first in a group is a duplicate.
    dd = defaultdict(list)
    for k, v in hash_paths:
        dd[k].append(v)

    num_removed = 0
    for list_paths in dd.values():
        for index, image_path in enumerate(list_paths):
            if index > 0:
                os.remove(image_path)
                print("Remove: " + image_path)
                num_removed += 1

    print("Removed {} images.".format(num_removed))


if __name__ == '__main__':
    with open("list.txt", 'r', encoding='utf-8') as keyword_list:
        lines = keyword_list.readlines()

    for keyword in lines:
        keyword = keyword.strip()
        if not keyword:
            continue
        print(keyword)
        crawler = Crawler(0.05)
        crawler.start(keyword)
        removeDup(keyword)
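For a quick test without list.txt, the Crawler class and removeDup above can also be driven directly; the keyword and page counts below are only example values:

# Assuming the Crawler class and removeDup() defined above are in scope.
# Crawl two result pages (about 120 images) for one keyword, then drop near-duplicates.
crawler = Crawler(0.05)  # 0.05 s pause between requests
crawler.start('蓝天', spider_page_num=2, start_page=1)
removeDup('蓝天')  # the folder name equals the keyword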
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Crawl images with icrawler's GoogleImageCrawler.
# keyword = '动漫美女'
# Images are stored under the directory the script is run from, one folder per keyword.
import os

from icrawler.builtin import GoogleImageCrawler

root = r"./"
for test in os.listdir(root):
    if os.path.isfile(os.path.join(root, test)) and test == "list.txt":
        filename = os.path.join(root, test)
        with open(filename, encoding='utf-8') as f:
            while 1:
                keyword = f.readline().strip()
                if not keyword:
                    break
                print("---filename---", test)
                google_crawler = GoogleImageCrawler(
                    feeder_threads=1,
                    parser_threads=2,
                    downloader_threads=16,
                    storage={'root_dir': keyword})
                google_crawler.crawl(keyword=keyword, max_num=1000, file_idx_offset=0)
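Recent icrawler releases also let GoogleImageCrawler.crawl take a filters dictionary (keys such as size, color, license, date); whether your installed version supports it is an assumption, so treat the following as a sketch and drop the filters argument if it is rejected:

from icrawler.builtin import GoogleImageCrawler

# Only large images from a given date range; the keyword and folder name are example values.
google_crawler = GoogleImageCrawler(storage={'root_dir': 'sky'})
filters = dict(size='large', date=((2019, 1, 1), (2019, 12, 31)))
google_crawler.crawl(keyword='sky', filters=filters, max_num=200, file_idx_offset=0)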
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Crawl images with icrawler's BingImageCrawler.
import os

from icrawler.builtin import BingImageCrawler

root = ""  # fill in the directory that contains list.txt; images are saved one folder per keyword
for test in os.listdir(root):
    if os.path.isfile(os.path.join(root, test)) and test == "list.txt":
        with open(os.path.join(root, test), encoding='utf-8') as f:
            while 1:
                keyword = f.readline().strip()
                if not keyword:
                    break
                print("---filename---", test)
                bing_crawler = BingImageCrawler(
                    feeder_threads=1,
                    parser_threads=2,
                    downloader_threads=16,
                    storage={'root_dir': keyword})
                bing_crawler.crawl(keyword=keyword, max_num=500, file_idx_offset=0)  # crawl at most 500 images per keyword
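icrawler also ships a BaiduImageCrawler with the same constructor and crawl interface, so the same list.txt loop works for Baidu too (which, as noted at the top, tends to give good results). A minimal sketch assuming icrawler is installed and list.txt sits in the working directory:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Same list.txt driven loop as above, but using icrawler's built-in Baidu crawler.
from icrawler.builtin import BaiduImageCrawler

with open("list.txt", encoding='utf-8') as f:
    keywords = [line.strip() for line in f if line.strip()]

for keyword in keywords:
    baidu_crawler = BaiduImageCrawler(
        feeder_threads=1,
        parser_threads=2,
        downloader_threads=16,
        storage={'root_dir': keyword})
    baidu_crawler.crawl(keyword=keyword, max_num=500, file_idx_offset=0)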
Original article: https://www.cnblogs.com/ajie-linda/p/11590643.html