标签:jpg 异常 com arc getc turn 线程 pre 正则
学习 Python 有一段时间了，这几天想写一个爬取百度图片的小爬虫。
代码
from selenium import webdriver
import urllib,re
import time
import urllib2
import sys
import os
import socket
import threading
socket.setdefaulttimeout(15.0)
def mkdir(name):
    """Create the directory ``name`` unless that path already exists.

    Args:
        name: Path of the directory that downloaded files will be stored in.

    Raises:
        OSError: If the directory could not be created and the path still
            does not exist afterwards.
    """
    try:
        os.mkdir(name)
    except OSError:
        # An already-present path is not an error (this also covers another
        # thread/process creating it first); anything else is a real failure.
        if not os.path.exists(name):
            raise
def get_html(name, papg):
    """Load a Baidu image-search result page and return its page source.

    The page is opened with selenium's PhantomJS driver so that the source
    returned is what the browser holds after loading the URL.

    Args:
        name: Search keyword; it is URL-quoted before being put in the URL.
        papg: Value substituted for the ``pn`` query parameter.

    Returns:
        The page source, or None if any step failed.
    """
    try:
        query = urllib.quote(name)
        driver = webdriver.PhantomJS()
        try:
            driver.get('https://image.baidu.com/search/index?tn=baiduimage&word={}&pn={}'.format(query, papg))
            return driver.page_source
        finally:
            # Always shut the browser down, even when loading fails;
            # otherwise every failed page leaves a PhantomJS process behind.
            driver.quit()
    except Exception:
        # Best-effort fetch: callers treat None as "no page available".
        return None
def req(html):
    """Extract the image URLs held in ``data-objurl`` attributes of a page.

    Args:
        html: Page source as a string.

    Returns:
        A list of URL strings in the order they appear (possibly empty), or
        None when ``html`` is not searchable text (for example None).
    """
    # Accept https as well as http so images served over TLS are not dropped.
    pattern = r'data-objurl="(https?://.*?)"'
    try:
        return re.findall(pattern, html)
    except TypeError:
        # re.findall raises TypeError for non-string input such as None.
        return None
# Serialises "pick the next file name and create the file" across the
# download threads, so two threads cannot choose the same name.
_SAVE_LOCK = threading.Lock()


def Loadown(req):
    """Download every URL in ``req`` into the directory named by global ``name``.

    Each image is fetched once and written to ``<name>/<k>``, where ``k`` is
    the number of entries already in that directory. A URL that cannot be
    fetched or saved is reported on stderr and skipped; the remaining URLs
    are still processed.

    Args:
        req: Iterable of image URLs.
    """
    for url in req:
        try:
            response = urllib2.urlopen(url)
            try:
                if response.getcode() != 200:
                    continue
                data = response.read()
            finally:
                # Release the connection whether or not the read succeeded.
                response.close()
            print(url)
            with _SAVE_LOCK:
                # The directory listing only grows once the file exists, so
                # the name must be chosen and the file created under the lock.
                target = name + '/%s' % len(os.listdir(name))
                with open(target, 'wb') as out:
                    out.write(data)
        except Exception as exc:
            # Deliberately best-effort: one bad URL must not stop the batch.
            sys.stderr.write('skipped %r: %r\n' % (url, exc))
def three(req):
    """Start a download thread for ``req`` and throttle the caller.

    A new thread running ``Loadown(req)`` is started immediately. The call
    then blocks while more than 20 threads are alive.

    Args:
        req: Iterable of image URLs handed to ``Loadown``.
    """
    threading.Thread(target=Loadown, args=(req,)).start()
    # Poll with a short sleep instead of spinning, so the waiting thread does
    # not burn a CPU core while the downloads are in progress.
    while threading.activeCount() > 20:
        time.sleep(0.1)
def ForImg(papg):
    """Fetch one result page for the global ``name`` and download its images.

    Args:
        papg: Value passed to ``get_html`` for the ``pn`` query parameter.
    """
    html = get_html(name, papg)
    if html is None:
        # The page could not be loaded; there is nothing to parse.
        return
    res = req(html)
    # Skip both None (unparsable input) and an empty list, so no thread is
    # started when there is nothing to download.
    if res:
        three(res)
#data-thumburl="https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=2766886107,1571085905&fm=23&gp=0.jpg"
if __name__ == '__main__':
    # Script entry point: ask for a keyword and a page count, then crawl.
    print('----------------------------------------------------------------------------')
    # sys.stdin.encoding is None when input is piped, so fall back to UTF-8.
    name = raw_input('请输入搜索的目标:').decode(sys.stdin.encoding or 'utf-8')
    # ``name`` stays a module global: ForImg and Loadown read it directly.
    name = name.encode('utf-8')
    mkdir(name)
    s = raw_input('请输入需要几页数据').strip()
    # Keep asking until a number is entered; continuing with a non-numeric
    # string would crash in range() below.
    while not s.isdigit():
        print('请输入数字')
        s = raw_input('请输入需要几页数据').strip()
    s = int(s)
    papg = 0
    for i in range(0, s):
        ForImg(papg)
        # The ``pn`` offset is advanced by 20 for each requested page.
        papg += 20
标签:jpg 异常 com arc getc turn 线程 pre 正则
原文地址:http://www.cnblogs.com/duang-cheng/p/6756435.html