码迷,mamicode.com
首页 > 编程语言 > 详细

python爬虫之爬取百度图片

时间:2017-04-02 17:53:04      阅读:359      评论:0      收藏:0      [点我收藏+]

标签:erro   aam   正则匹配   爬虫   bae   编码   ogr   rcv   mat   

#
#author:wuhao
#
#爬取指定页码的图片,如果需要爬取某一类的所有图片,整体框架不变,但需要另作分析
#
import urllib.request
import urllib.parse
import re
import os
#添加header

header=\
{
     User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36,
     Accept: text/plain, */*; q=0.01,
     Accept-Language: zh-CN,zh;q=0.8,
     X-Requested-With: XMLHttpRequest,
     Connection: keep-alive,
    "Cookie":"__cfduid=d9625fabd71ff1ba94d67ec814e05e1571473486956; BDUSS=4tcGMxYzd-dm94ZzhzaTNqenptWVlpWFFGeUh5OGR1ZTRFMnhwNnRJekdEV0ZZSVFBQUFBJCQAAAAAAAAAAAEAAACevjl7vvTYvMq~2LwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMaAOVjGgDlYY2; BAIDUID=D6E9D0B92F3F35B1E4A7AD3CABCBAED3:FG=1; BIDUPSID=FF63886073870CA5ABC6EA2663979765; PSTM=1482469381; indexPageSugList=%5B%22king%20of%20the%20Kill%E5%A4%B4%E5%83%8F%22%2C%22%E5%88%9B%E6%84%8F%E6%91%84%E5%BD%B1%22%2C%22%E9%AB%98%E6%B8%85%E6%91%84%E5%BD%B1%22%2C%22test%22%2C%22s%20logo%22%5D; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDRCVFR[X_XKQks0S63]=mk3SLVN4HKm; firstShowTip=1; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; userFrom=null",
    "referer":"https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1490169358952_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E7%BE%8E%E5%A5%B3"
    }

 

#url是分析后得来的,指向的是Json数据
url=
"https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&word={word}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&cg=girl&pn={pageNum}&rn=30&gsm=1e00000000001e&1490169411926="

 


#keyword=input("请输入搜索关键字:")
keyword=‘美女‘
#转换编码格式
keyword=urllib.parse.quote(keyword,"utf-8")
n=0
j=0
error=0


#获取前3000张图片
while(n<30*100):
n+=30
    #url链接
    url1=url.format(word=keyword,pageNum=str(n))

    #获取请求
    rep=urllib.request.Request(url1,headers=header)
    #打开网页
    rep=urllib.request.urlopen(rep)
    #读取网页数据
    try:
        html=rep.read().decode("utf-8")
    except:
        print("something wrong!")
        error=1
        print("-------------now page ="+str(n))
    if(error==1): continue
    #正则匹配
    p=re.compile("thumbURL.*?\.jpg")
    #获取正则匹配结果,返回的是一个list
    s=p.findall(html)
    #写入文件夹
    if os.path.isdir("f:\\myproject\\MyCrawlPic\\美女")!=True:
        os.makedirs(r"f:\\myproject\\MyCrawlPic\\美女")
    with open("testPic1.txt","w") as f:
        for i in s:
            i=i.replace("thumbURL\":\"","")
            print(i)
            f.write(i)
            f.write("\n")
            urllib.request.urlretrieve(i,"f:\\myproject\\MyCrawlPic\\大胸\\pic{num}.jpg".format(num=j))
            j+=1

  

  








python爬虫之爬取百度图片

标签:erro   aam   正则匹配   爬虫   bae   编码   ogr   rcv   mat   

原文地址:http://www.cnblogs.com/one-lightyear/p/python_crawl_image_baidu.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!