Note: openpyxl was tried first for saving the results to a new Excel file, but the writes never made it to disk, so the script switched to xlsxwriter.
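For context, the xlsxwriter pattern the script switches to is just a workbook, a worksheet, and row/column writes. A minimal sketch (the file name and values here are placeholders, not taken from the post):

import xlsxwriter

# create a brand-new workbook; xlsxwriter always writes a fresh file
wb = xlsxwriter.Workbook('demo.xlsx')          # placeholder file name
ws = wb.add_worksheet()

ws.write(0, 0, 'header')                       # row 0, column 0
for row, value in enumerate(['a', 'b', 'c'], start=1):
    ws.write(row, 0, value)                    # one value per row in the first column

wb.close()                                     # the file is only flushed to disk on close()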
import os
import requests
import re
from openpyxl import load_workbook
import xlsxwriter
from multiprocessing.dummy import Pool as ThreadPool

def spider(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
    html = requests.get(url, headers=headers, timeout=None)  # headers must be passed as a keyword argument
    pic_url = re.findall('class="product-image">.*?<img src="(.*?)" height="', html.text, re.S)
    sku = re.findall(r'q=(\d+)', url, re.S)  # pull the sku out of the search url
    if pic_url != []:
        print('Downloading image for sku ' + sku[0] + ', image url: ' + pic_url[0])
        pic = requests.get(pic_url[0])
        img_path = cwd + '\\images\\' + sku[0] + '.jpg'
        # print(img_path)
        with open(img_path, 'wb') as file:
            file.write(pic.content)
    else:
        if sku != []:
            print('No product found for sku ' + sku[0])
            No_images.append(sku[0])

# openpyxl was tried first for creating a new Excel file, but the writes failed,
# so xlsxwriter is used to save the data to Excel instead
def save_excel(sku):
    print(sku)
    wb1 = xlsxwriter.Workbook(cwd + '\\' + 'No_images.xlsx')
    ws1 = wb1.add_worksheet()
    ws1.write(0, 0, 'No_images_sku')
    for i in range(1, len(sku) + 1):
        ws1.write(i, 0, sku[i - 1])
    wb1.close()
    print('Saved the skus that have no image!')

if __name__ == '__main__':
    cwd = os.getcwd()
    path = cwd + '\\' + '最近12个月没有销量产品(201711).xlsx'
    wb = load_workbook(path)
    ws = wb.worksheets[0]
    pool = ThreadPool(50)  # number of worker threads to start
    urls = []
    No_images = []
    if not os.path.exists(cwd + '\\images'):  # make sure the images output folder exists
        os.makedirs(cwd + '\\images')
    for i in range(1, ws.max_row + 1):  # read the skus out of the Excel sheet row by row
        sku = ws.cell(i, 2).value
        if sku is not None:
            print('Crawling the image for sku no. ' + str(i))
            url = 'http://www.fulchic.com/catalogsearch/result/?q=' + str(sku)
            urls.append(url)
    pool.map(spider, urls)  # run the spider function over the whole url list on the thread pool
    pool.close()
    pool.join()
    # print(No_images)
    save_excel(No_images)
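The concurrency in the script comes from multiprocessing.dummy, which exposes the multiprocessing Pool API but runs the workers as threads inside one process. A minimal sketch of the same map pattern (the worker function and urls below are placeholders):

from multiprocessing.dummy import Pool as ThreadPool

def worker(url):
    # placeholder for the real spider(url); just returns something about its input
    return len(url)

urls = ['http://example.com/a', 'http://example.com/b']

pool = ThreadPool(4)              # 4 worker threads
results = pool.map(worker, urls)  # blocks until every url has been processed
pool.close()
pool.join()
print(results)

Because the workers are threads in the same process, they share module-level state, which is why spider can append to the No_images list defined in the main block.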
Original post: https://www.cnblogs.com/chunfang/p/12829640.html