DeepLearnToolbox is a toolbox that makes it easy to understand how a CNN works; it can be downloaded from GitHub. To understand the workings of convolutional neural networks, I have annotated the CNN part of its source code. The formulas used in the computations can be derived as in my previous blog post.
Note: the code does not learn any parameters for the subsampling layers; the subsampling weight w is fixed at 0.25 and the bias b at 0. A convolution layer computes each output map as the sum of the convolutions over all feature maps of the previous layer, adds a bias, and then applies the sigmoid. A subsampling layer simply sums each 2*2 block of the corresponding feature map of the previous layer and takes the average; no bias is added and no sigmoid is applied. A minimal sketch of both operations follows.
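The sketch below (toy sizes and random data, not part of the toolbox) shows a convolution map computed from two input maps, and its 2*2 mean-pooled version:

    x1 = rand(28, 28);  x2 = rand(28, 28);                  % two input feature maps (toy data)
    k1 = rand(5, 5);    k2 = rand(5, 5);                    % one 5*5 kernel per input map
    b  = 0;                                                 % bias, initialised to 0 in cnnsetup.m
    z  = convn(x1, k1, 'valid') + convn(x2, k2, 'valid');   % sum of the convolutions -> 24*24
    a  = 1 ./ (1 + exp(-(z + b)));                          % add the bias, apply the sigmoid (sigm in util)

    p  = convn(a, ones(2) / 4, 'valid');                    % average over every 2*2 window
    p  = p(1:2:end, 1:2:end);                               % keep one value per 2*2 block -> 12*12; no bias, no sigmoid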
In addition, a few fields of net deserve explanation:
net.fv: the feature matrix of the last hidden layer, arranged in fully connected form
net.o: the final output; each column is the result for one sample
net.od: the delta (residual) of the output layer
net.fvd: the delta of the last hidden layer (in fully connected form)
test_example_CNN.m
%function test_example_CNN
addpath D:\DeepLearning\DeepLearnToolbox-master\data
addpath D:\DeepLearning\DeepLearnToolbox-master\CNN
addpath D:\DeepLearning\DeepLearnToolbox-master\util
load mnist_uint8;

train_x = double(reshape(train_x',28,28,60000))/255;   % training set reshaped into 60000 images of 28*28 (28*28*60000), pixel values normalised to [0,1]
test_x = double(reshape(test_x',28,28,10000))/255;     % test set, 28*28*10000
train_y = double(train_y');                            % 10*60000, each column is one one-hot label (softmax-style coding)
test_y = double(test_y');

%% ex1 Train a 6c-2s-12c-2s Convolutional neural network
%will run 1 epoch in about 200 second and get around 11% error.
%With 100 epochs you'll get around 1.2% error
rand('state',0)

cnn.layers = {                                             %%% set the number of feature maps, kernel size, etc. of each layer
    struct('type', 'i')                                    %input layer
    struct('type', 'c', 'outputmaps', 6, 'kernelsize', 5)  %convolution layer
    struct('type', 's', 'scale', 2)                        %sub sampling layer
    struct('type', 'c', 'outputmaps', 12, 'kernelsize', 5) %convolution layer
    struct('type', 's', 'scale', 2)                        %subsampling layer
};

opts.alpha = 0.01;      % learning rate of the gradient descent
opts.batchsize = 50;    % stochastic gradient descent: each update uses a batch of 50 samples
opts.numepochs = 50;    % number of epochs

cnn = cnnsetup(cnn, train_x, train_y);        % initialise the parameters (weights and biases) of every layer
cnn = cnntrain(cnn, train_x, train_y, opts);  % training loop: forward pass, backpropagation and parameter updates
[er, bad] = cnntest(cnn, test_x, test_y);

%plot mean squared error
figure; plot(cnn.rL);
% assert(er<0.12, 'Too big error');
cnnsetup.m
function net = cnnsetup(net, x, y)
    % assert(~isOctave() || compare_versions(OCTAVE_VERSION, '3.8.0', '>='), ['Octave 3.8.0 or greater is required for CNNs as there is a bug in convolution in previous versions. See http://savannah.gnu.org/bugs/?39314. Your version is ' myOctaveVersion]);
    inputmaps = 1;                        % number of input feature maps (initially 1, the input image itself)
    mapsize = size(squeeze(x(:, :, 1)));  % image size; squeeze is optional here, gives [28 28]

    for l = 1 : numel(net.layers)   %  layer
        if strcmp(net.layers{l}.type, 's')
            mapsize = mapsize / net.layers{l}.scale;   %% each subsampling map has half the width and height of the previous convolution layer's map
            assert(all(floor(mapsize)==mapsize), ['Layer ' num2str(l) ' size must be integer. Actual: ' num2str(mapsize)]);
            for j = 1 : inputmaps
                net.layers{l}.b{j} = 0;   % bias initialised to 0; this code fixes the subsampling weight at 1/4 and the bias at 0, so the subsampling stage has no trainable parameters
            end
        end
        if strcmp(net.layers{l}.type, 'c')
            mapsize = mapsize - net.layers{l}.kernelsize + 1;                  % size of the current layer's feature maps
            fan_out = net.layers{l}.outputmaps * net.layers{l}.kernelsize ^ 2; % fan_out and fan_in are only used to initialise the kernels; not obvious why (see the note after this file)
            for j = 1 : net.layers{l}.outputmaps  %  output map: number of feature maps in the current layer
                fan_in = inputmaps * net.layers{l}.kernelsize ^ 2;
                for i = 1 : inputmaps  %  input map: weights are shared, so there are inputmaps*outputmaps kernels, each 5*5
                    net.layers{l}.k{i}{j} = (rand(net.layers{l}.kernelsize) - 0.5) * 2 * sqrt(6 / (fan_in + fan_out));  %% initialise the kernel of each feature map: -0.5 then *2 maps rand to [-1,1], finally scaled to [-sqrt(6/(fan_in+fan_out)), +sqrt(6/(fan_in+fan_out))] -- why??
                end
                net.layers{l}.b{j} = 0;   % bias of each feature map, initialised to 0
            end
            inputmaps = net.layers{l}.outputmaps;  % update the number of input maps for the next layer
        end
    end

    % 'onum' is the number of labels, that's why it is calculated using size(y, 1). If you have 20 labels so the output of the network will be 20 neurons.
    % 'fvnum' is the number of output neurons at the last layer, the layer just before the output layer.
    % 'ffb' is the biases of the output neurons.
    % 'ffW' is the weights between the last layer and the output neurons. Note that the last layer is fully connected to the output layer, that's why the size of the weights is (onum * fvnum)
    fvnum = prod(mapsize) * inputmaps;   % number of nodes in S4, i.e. the feature dimension used for the fully connected layer: 12*4*4 = 192
    onum = size(y, 1);                   % number of output classes, here 10
    net.ffb = zeros(onum, 1);            % biases of the output (softmax-style) regression layer
    net.ffW = (rand(onum, fvnum) - 0.5) * 2 * sqrt(6 / (onum + fvnum));  %% weights of the fully connected output layer, 10*192
end
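As a side note on the "why??" above: the range (rand(...) - 0.5) * 2 * sqrt(6 / (fan_in + fan_out)) is the Xavier/Glorot uniform initialisation, which keeps the scale of the activations roughly constant across layers. The map sizes that cnnsetup works out for the 6c-2s-12c-2s network can also be checked by hand (a quick sketch, not part of the toolbox):

    mapsize = [28 28];
    mapsize = mapsize - 5 + 1;      % convolution, kernelsize 5 -> 24*24
    mapsize = mapsize / 2;          % subsampling, scale 2      -> 12*12
    mapsize = mapsize - 5 + 1;      % convolution, kernelsize 5 ->  8*8
    mapsize = mapsize / 2;          % subsampling, scale 2      ->  4*4
    fvnum   = prod(mapsize) * 12    % 12 maps of 4*4 -> 192 fully connected features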
cnntrain.m
function net = cnntrain(net, x, y, opts)
    m = size(x, 3);                     %% total number of images: 60000
    numbatches = m / opts.batchsize;    % number of batches per epoch: 1200, each of 50 samples
    if rem(numbatches, 1) ~= 0
        error('numbatches not integer');
    end
    net.rL = [];
    for i = 1 : opts.numepochs
        disp(['epoch ' num2str(i) '/' num2str(opts.numepochs)]);
        tic;
        kk = randperm(m);               %% random permutation of 1..m, no repetitions
        for l = 1 : numbatches          %% 1200 iterations per epoch, 50 samples per update
            batch_x = x(:, :, kk((l - 1) * opts.batchsize + 1 : l * opts.batchsize));  % training data of the 50 samples
            batch_y = y(:,    kk((l - 1) * opts.batchsize + 1 : l * opts.batchsize));  % labels of the 50 samples

            net = cnnff(net, batch_x);       % forward pass
            net = cnnbp(net, batch_y);       % backpropagation to compute the gradients
            opts.i = i;
            opts.l = l;
            net = cnnapplygrads(net, opts);  %% gradient-descent update of the parameters
            if isempty(net.rL)
                net.rL(1) = net.L;
            end
            net.rL(end + 1) = 0.99 * net.rL(end) + 0.01 * net.L;  % net.L is the cost of the current batch; why it is smoothed like this was not obvious at first (see the note below)
        end
        toc;
    end
end
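The net.rL update questioned in the comment is simply an exponentially weighted moving average of the per-batch loss net.L (weight 0.99 on the history, 0.01 on the new value), which makes the plotted training curve much smoother than the raw batch losses. A toy illustration with made-up loss values:

    L  = [0.9 0.7 0.8 0.5 0.6];                  % raw per-batch losses (made up)
    rL = L(1);
    for t = 2 : numel(L)
        rL(t) = 0.99 * rL(t-1) + 0.01 * L(t);    % same update as in cnntrain.m
    end
    plot(rL);                                    % smoothed curve, like figure; plot(cnn.rL)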
cnnff.m
function net = cnnff(net, x)
    n = numel(net.layers);      %% number of layers
    net.layers{1}.a{1} = x;     % activations of the first layer are the input images
    inputmaps = 1;

    for l = 2 : n   %  for each layer
        if strcmp(net.layers{l}.type, 'c')   % convolution layer
            %  !!below can probably be handled by insane matrix operations
            for j = 1 : net.layers{l}.outputmaps   %  for each output map, i.e. each feature map of this layer
                %  create temp output map
                z = zeros(size(net.layers{l - 1}.a{1}) - [net.layers{l}.kernelsize - 1 net.layers{l}.kernelsize - 1 0]);  % size of this layer's feature maps; the last dimension is the number of sample images; initialised to 0
                for i = 1 : inputmaps   %  for each input feature map
                    %  convolve with corresponding kernel and add to temp output map
                    z = z + convn(net.layers{l - 1}.a{i}, net.layers{l}.k{i}{j}, 'valid');  % convolution: k{i}{j} is a 5*5 double; a{i} is an input feature map, e.g. 28*28*50 where 50 is the number of images
                end  %% should the kernel k be rotated by 180 degrees here?
                %  add bias, pass through nonlinearity
                net.layers{l}.a{j} = sigm(z + net.layers{l}.b{j});  %% add the bias and apply the sigmoid
            end
            %  set number of input maps to this layers number of outputmaps
            inputmaps = net.layers{l}.outputmaps;   %% update the number of input feature maps for the next layer
        elseif strcmp(net.layers{l}.type, 's')      %%% downsampling: sum each 2*2 block and multiply by 1/4; no bias and no sigmoid
            %  downsample
            for j = 1 : inputmaps
                z = convn(net.layers{l - 1}.a{j}, ones(net.layers{l}.scale) / (net.layers{l}.scale ^ 2), 'valid');  %  !! replace with variable  %% convolve first, then pick every scale-th row and column
                net.layers{l}.a{j} = z(1 : net.layers{l}.scale : end, 1 : net.layers{l}.scale : end, :);  % the result has half the rows and columns of the previous convolution layer; here a = z
            end
        end
    end

    %  concatenate all end layer feature maps into vector
    net.fv = [];   %% features of the last hidden layer, flattened into fully connected form
    for j = 1 : numel(net.layers{n}.a)   % number of feature maps of the last hidden layer
        sa = size(net.layers{n}.a{j});
        net.fv = [net.fv; reshape(net.layers{n}.a{j}, sa(1) * sa(2), sa(3))];  %% gives a 192*50 matrix; each column is the feature vector of one sample image, 192 = 4*4*12 for 12 feature maps
    end
    %  feedforward into output perceptrons
    net.o = sigm(net.ffW * net.fv + repmat(net.ffb, 1, size(net.fv, 2)));  %% 10*50 matrix, one label column per sample; using the sigmoid makes this k independent binary classifiers (the classes are not mutually exclusive); it could also be replaced by softmax regression
    % net.o = softmax(net.ffW * net.fv + repmat(net.ffb, 1, size(net.fv, 2)));
end
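cnnff only uses 'valid' convolutions, while cnnbp below also uses 'full'. The difference, asked about in the comments, is simply how far the kernel may overhang the image border, which is easiest to see from the output sizes on toy data:

    A = rand(28, 28);  K = rand(5, 5);   % toy image and kernel
    size(convn(A, K, 'valid'))           % 24 24 : kernel kept entirely inside A (28-5+1)
    size(convn(A, K, 'full'))            % 32 32 : kernel allowed to slide over the border (28+5-1)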
cnnbp.m
function net = cnnbp(net, y)
    n = numel(net.layers);   % number of layers

    %   error
    net.e = net.o - y;   % 10*50, one column per sample image
    %  loss function
    net.L = 1/2* sum(net.e(:) .^ 2) / size(net.e, 2);   %% cost function: plain squared error, no regularisation term on the parameters (the Bayesian view) is added

    %%  backprop deltas
    net.od = net.e .* (net.o .* (1 - net.o));   %  output delta: delta of the output layer, 10*50
    net.fvd = (net.ffW' * net.od);              %  feature vector delta: delta of the last hidden layer, 192*50; if that layer is a subsampling layer then a = z, the derivative of z is 1, and this is already the delta; if it is a convolution layer it must still be multiplied by f'(z)
    if strcmp(net.layers{n}.type, 'c')          %  only conv layers has sigm function
        net.fvd = net.fvd .* (net.fv .* (1 - net.fv));   %% if the last hidden layer is a convolution layer, this directly gives its delta
    end

    %  reshape feature vector deltas into output map style
    sa = size(net.layers{n}.a{1});   %% layer n (the last hidden layer) has 12 maps a, each 4*4*50, where 50 is the number of sample images
    fvnum = sa(1) * sa(2);
    for j = 1 : numel(net.layers{n}.a)   %% for every feature map of the last hidden layer, turn its delta d{j} back into 4*4*50 form (50 sample images) so that the deltas of the earlier layers can be computed
        net.layers{n}.d{j} = reshape(net.fvd(((j - 1) * fvnum + 1) : j * fvnum, :), sa(1), sa(2), sa(3));
    end

    for l = (n - 1) : -1 : 1   % effectively stops at 2: layer 1 is the input layer and needs no delta
        if strcmp(net.layers{l}.type, 'c')        % convolution layer
            for j = 1 : numel(net.layers{l}.a)    % loop over the feature maps of this layer; each d{j} is 8*8*50; the next layer is a subsampling layer, so its delta is expanded back to 8*8 (each value copied into a 2*2 block) and weighted by 1/4, exactly as the bp formula requires
                net.layers{l}.d{j} = net.layers{l}.a{j} .* (1 - net.layers{l}.a{j}) .* (expand(net.layers{l + 1}.d{j}, [net.layers{l + 1}.scale net.layers{l + 1}.scale 1]) / net.layers{l + 1}.scale ^ 2);
            end
        elseif strcmp(net.layers{l}.type, 's')    % subsampling layer
            for i = 1 : numel(net.layers{l}.a)    % loop over the feature maps of this layer; each a is 12*12*50, where 50 is the number of sample images
                z = zeros(size(net.layers{l}.a{1}));     % same size as the current feature map
                for j = 1 : numel(net.layers{l + 1}.a)   % formula from the "Notes on Convolutional Neural Networks" pdf: this subsampling map is connected to every feature map of the next layer, so the bp formula sums over all of them
                    z = z + convn(net.layers{l + 1}.d{j}, rot180(net.layers{l + 1}.k{i}{j}), 'full');   %%% a small worked example makes this clear; all contributions are added because this node is connected to every feature map of the next layer
                end   %% what do 'full' and 'valid' mean for the convolution? (see the note after cnnff.m above)
                net.layers{l}.d{i} = z;   %% subsampling layer, so a = z, f(z) = z, the derivative is 1, and the delta is just the weighted sum of the deltas of the connected nodes in the next layer
            end
        end
    end

    %%  calc gradients
    %% the summation in the derivative with respect to k_ij was not obvious at first
    for l = 2 : n
        if strcmp(net.layers{l}.type, 'c')
            for j = 1 : numel(net.layers{l}.a)
                for i = 1 : numel(net.layers{l - 1}.a)
                    net.layers{l}.dk{i}{j} = convn(flipall(net.layers{l - 1}.a{i}), net.layers{l}.d{j}, 'valid') / size(net.layers{l}.d{j}, 3);   % see the derivation in the paper; rotating k by 180 degrees first and then rotating the whole result gives the same thing
                end
                net.layers{l}.db{j} = sum(net.layers{l}.d{j}(:)) / size(net.layers{l}.d{j}, 3);   %% derivative with respect to the bias b
            end
        end
    end
    net.dffW = net.od * (net.fv)' / size(net.od, 2);   % gradient of the output-layer weights
    net.dffb = mean(net.od, 2);                        %% gradient of the output-layer bias b

    function X = rot180(X)
        X = flipdim(flipdim(X, 1), 2);   % flipdim(X, 1) flips the rows, flipdim(X, 2) flips the columns
    end
end
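For the convolution-layer deltas above, expand (from the toolbox's util directory) copies every delta value of the 2*2 mean-pooling layer into a 2*2 block before the 1/scale^2 = 1/4 weighting is applied. For a single 2-D slice its effect is the same as kron, so the step can be sketched as:

    d  = [1 2; 3 4];                 % delta of a 2*2 subsampling map (toy values)
    du = kron(d, ones(2)) / 4;       % each value copied into a 2*2 block and weighted by 1/4,
                                     % i.e. the 4*4 delta that reaches the convolution layer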
cnnapplygrads.m
function net = cnnapplygrads(net, opts)
    for l = 2 : numel(net.layers)
        if strcmp(net.layers{l}.type, 'c')
            for j = 1 : numel(net.layers{l}.a)
                for ii = 1 : numel(net.layers{l - 1}.a)
                    net.layers{l}.k{ii}{j} = net.layers{l}.k{ii}{j} - 1/(opts.i+opts.alpha + opts.l) * net.layers{l}.dk{ii}{j};   %% gradient-descent update of the kernels
                end
                net.layers{l}.b{j} = net.layers{l}.b{j} - 1/(opts.i+opts.alpha + opts.l) * net.layers{l}.db{j};
            end
        end
    end

    net.ffW = net.ffW - 1/(opts.i+opts.alpha + opts.l) * net.dffW;
    net.ffb = net.ffb - 1/(opts.i+opts.alpha + opts.l) * net.dffb;
end
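Note that, as written here, the step size is not the fixed opts.alpha from test_example_CNN.m but 1/(opts.i + opts.alpha + opts.l), i.e. it shrinks as the epoch counter i and the batch counter l grow. A few sample values, assuming alpha = 0.01 and 1200 batches per epoch:

    opts.alpha = 0.01;
    1 / (1  + opts.alpha + 1)        % epoch 1,  batch 1    -> about 0.50
    1 / (1  + opts.alpha + 1200)     % epoch 1,  batch 1200 -> about 8.3e-4
    1 / (50 + opts.alpha + 1200)     % epoch 50, batch 1200 -> about 8.0e-4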
cnntest.m
function [er, bad] = cnntest(net, x, y)
    %  feedforward
    net = cnnff(net, x);
    [~, h] = max(net.o);
    [~, a] = max(y);
    bad = find(h ~= a);

    er = numel(bad) / size(y, 2);
end
Setting opts.numepochs (the number of epochs) to 1 and 10 gave error rates of 11.13% and 2.73% respectively.
Replacing the final classification layer with softmax regression gave error rates of 16.01% and 5.10%. Of course, adjusting the number of epochs and the learning rate here may well give better results!
Original article: http://blog.csdn.net/lu597203933/article/details/46576017