Note: openpyxl was tried first for saving the results to a new Excel file, but the writes never made it to disk, so the script switched to xlsxwriter.
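For context, the xlsxwriter pattern the script switches to is just a workbook, a worksheet, and row/column writes. A minimal sketch (the file name and values here are placeholders, not taken from the post):

import xlsxwriter

# create a brand-new workbook; xlsxwriter always writes a fresh file
wb = xlsxwriter.Workbook('demo.xlsx')          # placeholder file name
ws = wb.add_worksheet()

ws.write(0, 0, 'header')                       # row 0, column 0
for row, value in enumerate(['a', 'b', 'c'], start=1):
    ws.write(row, 0, value)                    # one value per row in the first column

wb.close()                                     # the file is only flushed to disk on close()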
import os
import requests
import re
from openpyxl import load_workbook
import xlsxwriter
from multiprocessing.dummy import Pool as ThreadPool

def spider(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
    html = requests.get(url, headers=headers, timeout=None)  # headers must be passed as a keyword argument
    pic_url = re.findall('class="product-image">.*?<img src="(.*?)" height="', html.text, re.S)
    sku = re.findall(r'q=(\d+)', url, re.S)  # pull the sku out of the search url
    if pic_url != []:
        print('Downloading image for sku ' + sku[0] + ', image url: ' + pic_url[0])
        pic = requests.get(pic_url[0])
        img_path = cwd + '\\images\\' + sku[0] + '.jpg'
        # print(img_path)
        with open(img_path, 'wb') as file:
            file.write(pic.content)
    else:
        if sku != []:
            print('No product found for sku ' + sku[0])
            No_images.append(sku[0])

# openpyxl was tried first for creating a new Excel file, but the writes failed,
# so xlsxwriter is used to save the data to Excel instead
def save_excel(sku):
    print(sku)
    wb1 = xlsxwriter.Workbook(cwd + '\\' + 'No_images.xlsx')
    ws1 = wb1.add_worksheet()
    ws1.write(0, 0, 'No_images_sku')
    for i in range(1, len(sku) + 1):
        ws1.write(i, 0, sku[i - 1])
    wb1.close()
    print('Saved the skus that have no image!')

if __name__ == '__main__':
    cwd = os.getcwd()
    path = cwd + '\\' + '最近12个月没有销量产品(201711).xlsx'
    wb = load_workbook(path)
    ws = wb.worksheets[0]
    pool = ThreadPool(50)  # number of worker threads to start
    urls = []
    No_images = []
    if not os.path.exists(cwd + '\\images'):  # make sure the images output folder exists
        os.makedirs(cwd + '\\images')
    for i in range(1, ws.max_row + 1):  # read the skus out of the Excel sheet row by row
        sku = ws.cell(i, 2).value
        if sku is not None:
            print('Crawling the image for sku no. ' + str(i))
            url = 'http://www.fulchic.com/catalogsearch/result/?q=' + str(sku)
            urls.append(url)
    pool.map(spider, urls)  # run the spider function over the whole url list on the thread pool
    pool.close()
    pool.join()
    # print(No_images)
    save_excel(No_images)
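The concurrency in the script comes from multiprocessing.dummy, which exposes the multiprocessing Pool API but runs the workers as threads inside one process. A minimal sketch of the same map pattern (the worker function and urls below are placeholders):

from multiprocessing.dummy import Pool as ThreadPool

def worker(url):
    # placeholder for the real spider(url); just returns something about its input
    return len(url)

urls = ['http://example.com/a', 'http://example.com/b']

pool = ThreadPool(4)              # 4 worker threads
results = pool.map(worker, urls)  # blocks until every url has been processed
pool.close()
pool.join()
print(results)

Because the workers are threads in the same process, they share module-level state, which is why spider can append to the No_images list defined in the main block.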
Original post: https://www.cnblogs.com/chunfang/p/12829640.html