一个爬取百度图库程序

时间：2017-04-24 14:08:07 阅读：193 评论：0 收藏：0 [点我收藏+]

学习 Python 有一段时间了，这几天想写一个爬取百度图片的小爬虫。
代码
from selenium import webdriver
import urllib,re
import time
import urllib2
import sys
import os
import socket
import threading
# Default timeout (in seconds) for sockets created from here on; the network
# calls in this script do not pass a timeout of their own.
socket.setdefaulttimeout(15.0)
def mkdir(name):
    """Ensure the directory ``name`` exists, creating missing parents.

    Calling this for a directory that already exists is a no-op.

    Args:
        name: Path of the directory to create.

    Raises:
        OSError: If the directory cannot be created, or if ``name`` exists
            but is not a directory.
    """
    try:
        os.makedirs(name)
    except OSError:
        # Creating first and checking afterwards avoids the race between a
        # separate existence check and the creation call. An error is only
        # real if the path still is not a directory.
        if not os.path.isdir(name):
            raise
def get_html(name, papg):
    """Load a Baidu image-search results page through selenium's PhantomJS driver.

    Args:
        name: Search keyword; it is URL-quoted before being placed in the
            ``word`` query parameter.
        papg: Value for the ``pn`` query parameter (the result offset).

    Returns:
        The page source reported by the driver, or None if any step fails.
    """
    driver = None
    try:
        name = urllib.quote(name)
        driver = webdriver.PhantomJS()
        driver.get('https://image.baidu.com/search/index?tn=baiduimage&word={}&pn={}'.format(name, papg))
        return driver.page_source
    except Exception as exc:
        # Best effort: callers treat None as "no page available", so report
        # the failure and carry on instead of propagating it.
        sys.stderr.write('get_html failed for pn=%s: %r\n' % (papg, exc))
        return None
    finally:
        # Always shut the browser down, even when loading the page failed;
        # otherwise the driver process is left behind.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # A failing shutdown must not mask the page result.
                pass
def req(html):
    """Extract the image URLs stored in ``data-objurl`` attributes of a page.

    Args:
        html: Page source text, or None when no page could be fetched.

    Returns:
        A list (possibly empty) of the http/https URLs found in
        ``data-objurl="..."`` attributes, or None when ``html`` is None or
        is not text that can be searched.
    """
    if html is None:
        return None
    # Accept https targets as well as plain http ones.
    pattern = r'data-objurl="(https?://.*?)"'
    try:
        return re.findall(pattern, html)
    except TypeError:
        # Raised by re.findall when html is not a string.
        return None
# Serialises "pick the next file name and create the file" across the
# download threads started by three(), so two threads cannot pick the same name.
_SAVE_LOCK = threading.Lock()


def Loadown(req):
    """Download every URL in ``req`` into the directory named by the global ``name``.

    Each image is fetched once and stored under a numeric file name equal
    to the number of entries already present in the target directory.
    A URL that fails is reported on stderr and skipped, so one bad link
    does not stop the remaining downloads.

    Args:
        req: Iterable of image URLs.
    """
    for url in req:
        try:
            response = urllib2.urlopen(url)
            try:
                ok = response.getcode() == 200
                data = response.read() if ok else None
            finally:
                # Release the connection whether or not the read succeeded.
                response.close()
            if not ok:
                continue
            print(url)
            with _SAVE_LOCK:
                target = os.path.join(name, '%s' % len(os.listdir(name)))
                with open(target, 'wb') as out:
                    out.write(data)
        except Exception as exc:
            # Best effort per URL: report the failure and move on.
            sys.stderr.write('skipping %s: %r\n' % (url, exc))
def three(req):
    """Start a download thread for ``req`` and throttle the caller.

    A new thread running ``Loadown(req)`` is started immediately. The call
    then blocks while more than 20 threads are alive, so the caller cannot
    pile up an unbounded number of downloads.

    Args:
        req: Iterable of image URLs handed to ``Loadown``.
    """
    threading.Thread(target=Loadown, args=(req,)).start()
    # Sleep between checks instead of spinning, so the wait does not burn a
    # CPU core while the download threads are busy.
    while threading.active_count() > 20:
        time.sleep(0.1)

def ForImg(papg):
    """Fetch one results page and hand its image URLs to the downloader.

    The page for the global search keyword ``name`` at offset ``papg`` is
    fetched with ``get_html``, scanned with ``req``, and the resulting URL
    list is passed to ``three``. Nothing is started when ``req`` yields None.

    Args:
        papg: Result offset forwarded to ``get_html``.
    """
    found = req(get_html(name, papg))
    if found is None:
        return
    three(found)
#data-thumburl="https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=2766886107,1571085905&fm=23&gp=0.jpg"
if __name__ == '__main__':
    # Interactive entry point: ask for a keyword and a page count, then fetch
    # the result pages one after another (each page advances the offset by 20).
    print('----------------------------------------------------------------------------')
    # `name` must stay a module-level global: Loadown() and ForImg() read it.
    # sys.stdin.encoding is None when input is piped, so fall back to UTF-8.
    name = raw_input('请输入搜索的目标:').decode(sys.stdin.encoding or 'utf-8')
    name = name.encode('utf-8')
    mkdir(name)
    s = raw_input('请输入需要几页数据')
    # Keep asking until a number is entered; previously a non-numeric answer
    # only printed the hint and then crashed in range().
    while not s.isdigit():
        print('请输入数字')
        s = raw_input('请输入需要几页数据')
    s = int(s)
    papg = 0
    for i in range(0, s):
        ForImg(papg)
        papg += 20

一个爬取百度图库程序

标签：jpg 异常 com arc getc turn 线程 pre 正则

原文地址：http://www.cnblogs.com/duang-cheng/p/6756435.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行