python 百度图片爬虫

时间：2018-05-22 22:27:19 阅读：377 评论：0 收藏：0 [点我收藏+]

标签：百度 exist requests download down .net rm -rf format os.path

# -*- coding:utf-8 -*-
#https://blog.csdn.net/qq_32166627/article/details/60882964
import requests
import os
import pinyin

def getManyPages(keyword, pages):
    """Query the Baidu image-search JSON endpoint and collect each page's results.

    Args:
        keyword: Search term; sent as both the ``queryWord`` and ``word``
            query parameters.
        pages: Number of result pages to request. Each request asks for 30
            results (``rn``). Zero or a negative number performs no request.

    Returns:
        A list with one entry per requested page. Each entry is the value
        stored under ``data`` in that page's JSON response, or an empty list
        when the key is missing or empty, so callers can always iterate it.

    Raises:
        requests.RequestException: If a request fails or times out.
        ValueError: If a response body cannot be decoded as JSON.
    """
    url = 'https://image.baidu.com/search/acjson'
    results = []
    # ``pn`` takes the values 30, 60, ..., 30 * pages, exactly as before.
    # NOTE(review): presumably ``pn`` is a result offset, in which case the
    # first 30 hits (pn=0) are never requested -- confirm before changing.
    for offset in range(30, 30 * pages + 30, 30):
        params = {
            'tn': 'resultjson_com',
            'ipn': 'rj',
            'ct': 201326592,
            'is': '',
            'fp': 'result',
            'queryWord': keyword,
            'cl': 2,
            'lm': -1,
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': -1,
            'z': '',
            'ic': 0,
            'word': keyword,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': 0,
            'istype': 2,
            'qc': '',
            'nc': 1,
            'fr': '',
            'pn': offset,
            'rn': 30,
            'gsm': '1e',
            '1488942260214': '',
        }
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, params=params, timeout=10)
        results.append(response.json().get('data') or [])

    return results


def getImg(dataList, localPath, keyword):
    """Download the thumbnail of every search result into ``localPath``.

    Files are written as ``<localPath>/<keyword>_<n>.jpg`` where ``n`` counts
    the images saved so far, starting at 0.

    Args:
        dataList: Iterable of result pages, each an iterable of dicts (the
            shape produced by ``getManyPages``). A page that is ``None`` or
            empty is skipped.
        localPath: Output directory; created (including missing parents) if
            it does not exist yet.
        keyword: Prefix used for the saved file names and progress messages.

    Returns:
        The number of images written.

    Raises:
        requests.RequestException: If a download fails or times out.
    """
    if not os.path.exists(localPath):  # create the output folder
        os.makedirs(localPath)

    saved = 0
    for page in dataList:
        for item in page or []:
            thumb_url = item.get('thumbURL')
            if thumb_url is None:
                print('image not exist')
                continue

            print("down " + keyword + str(saved) + " image " + thumb_url)
            response = requests.get(thumb_url, timeout=10)
            target = localPath + "/" + keyword + '_%d.jpg' % saved
            # ``with`` closes the file handle even if the write fails.
            with open(target, 'wb') as image_file:
                image_file.write(response.content)
            saved += 1

    return saved

# if __name__ == ‘__main__‘:

#     with open("stars_list_clean.txt",‘r‘) as face_file:
#       stars_list = face_file.readlines()
#       index = 0
#       for line in stars_list:
#           line = line.replace(‘\r‘,‘‘).replace(‘\n‘,‘‘).replace(‘\t‘,‘‘)
#           keyword_english = pinyin.get(line, format="strip")
#           keyword = line
#           index += 1
#           if index > 0:
#             break

#     # print(keyword)
#     # keyword1 = ‘胡因梦‘
#     # if keyword == keyword1:
#     #     print("yes")
#     # else:
#     #     print("no")
#     #keyword = ‘胡因梦‘
#     #keyword = keyword.replace(‘\X‘,‘‘)
#     dataList = getManyPages(keyword,2)  # 参数1:关键字，参数2:要下载的页数
#     getImg(dataList,‘./hanxue‘, keyword_english) # 参数2:指定保存的路径

    # keyword = ‘韩雪‘
    # dataList = getManyPages(keyword,10)  # 参数1:关键字，参数2:要下载的页数
    # getImg(dataList,‘./hanxue‘) # 参数2:指定保存的路径

def _clean_name(line):
    """Remove carriage returns, newlines and tabs from one line of the name list."""
    return line.replace('\r', '').replace('\n', '').replace('\t', '')


def main():
    """Download Baidu thumbnails for every name listed in ``stars_list_clean.txt``.

    Steps:
      1. Read the names (one per line) and convert each to toneless pinyin.
      2. Write the pinyin names, one per line, to ``stars_list_en.txt``.
      3. For each name, create ``./stars_srcimg/<index>_<pinyin>`` and fill it
         with the thumbnails from the first ``pages`` result pages.

    A name whose directory already exists is skipped, so an interrupted run
    can be restarted without downloading finished names again.
    """
    # NOTE(review): the files are opened with the platform default encoding;
    # confirm it matches the encoding of the name list.
    with open("stars_list_clean.txt", 'r') as face_file:
        names = [_clean_name(line) for line in face_file.readlines()]
    english_names = [pinyin.get(name, format="strip") for name in names]

    # The input is read first, so a missing name list no longer leaves a
    # truncated output file behind.
    with open("stars_list_en.txt", 'w') as fp:
        for english in english_names:
            fp.write('%s\n' % english)

    out_dir = "./stars_srcimg/"
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    pages = 2  # number of result pages to download per name
    maxnum = pages * 30
    print(maxnum)

    for face_ID_index, keyword in enumerate(names):
        keyword_english = english_names[face_ID_index]
        print(keyword)
        print(keyword_english)
        facesavepath = out_dir + str(face_ID_index) + "_" + keyword_english
        print(facesavepath)
        if os.path.exists(facesavepath):
            print(keyword + "  exist")
            continue
        os.mkdir(facesavepath)

        dataList = getManyPages(keyword, pages)  # arg 1: keyword, arg 2: pages to download
        getImg(dataList, facesavepath, keyword_english)  # arg 2: directory to save into


if __name__ == '__main__':
    main()

python 百度图片爬虫

标签：百度 exist requests download down .net rm -rf format os.path

原文地址：https://www.cnblogs.com/adong7639/p/9074012.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行