CvSVM::predict函数解析:无论是Mat接口还是CvMat接口,最终都是通过指针的形式调用的,也就是说最终都是调用下面的函数实现的。
// Predicts the response for one sample given as a raw float array.
//
//   row_sample  - pointer to the feature values of the sample (must not be null)
//   row_len     - number of features in row_sample; must equal get_var_count()
//   returnDFVal - for a 2-class classifier, return the raw decision-function
//                 value instead of the class label
//
// Returns:
//   * EPS_SVR / NU_SVR : the decision-function value
//   * ONE_CLASS        : 1 if the decision-function value is > 0, otherwise 0
//   * C_SVC / NU_SVC   : the label of the class that wins the pairwise vote, or
//                        the decision-function value when returnDFVal is set and
//                        there are exactly two classes
// Raises CV_StsBadArg through CV_Error for any other svm_type.
float CvSVM::predict( const float* row_sample, int row_len, bool returnDFVal ) const
{
    // A kernel must have been created and a sample must be supplied.
    assert( kernel );
    assert( row_sample );

    // The sample length has to match the feature dimension of the model.
    const int var_count = get_var_count();
    assert( row_len == var_count );
    (void)row_len; // row_len is only read by the assert above; the cast keeps it "used" when asserts are compiled out

    // Number of classes: the label count when labels exist, 1 for ONE_CLASS, 0 otherwise.
    int class_count = 0;
    if( class_labels )
        class_count = class_labels->cols;
    else if( params.svm_type == ONE_CLASS )
        class_count = 1;

    // Scratch memory: sv_total kernel responses, followed by the space used for the votes.
    cv::AutoBuffer<float> _buffer(sv_total + (class_count+1)*2);
    float* buffer = _buffer;
    float result = 0;

    const int svm_type = params.svm_type;
    const bool uses_single_function = svm_type == EPS_SVR ||
                                      svm_type == NU_SVR ||
                                      svm_type == ONE_CLASS;
    const bool uses_pairwise_voting = svm_type == C_SVC ||
                                      svm_type == NU_SVC;

    if( uses_single_function )
    {
        // Regression and one-class models evaluate a single decision function.
        CvSVMDecisionFunc* df = (CvSVMDecisionFunc*)decision_func;
        const int sv_count = df->sv_count;
        double sum = -df->rho;

        // buffer[n] receives the kernel response between the sample and support vector n.
        kernel->calc( sv_count, var_count, (const float**)sv, row_sample,
                      buffer );

        for( int n = 0; n < sv_count; n++ )
            sum += buffer[n]*df->alpha[n];

        if( svm_type == ONE_CLASS )
            result = (float)(sum > 0);
        else
            result = (float)sum;
    }
    else if( uses_pairwise_voting )
    {
        // Classification: one decision function per class pair (first, second), stored consecutively.
        CvSVMDecisionFunc* df = (CvSVMDecisionFunc*)decision_func;

        // The vote counters live in the same buffer, right after the kernel responses.
        int* vote = (int*)(buffer + sv_total);
        memset( vote, 0, class_count*sizeof(vote[0]));

        // Kernel responses between the sample and every support vector of the model.
        kernel->calc( sv_total, var_count, (const float**)sv, row_sample,
                      buffer );

        // Declared outside the loops: the value of the last evaluated function is
        // what returnDFVal reports (with two classes the loops run exactly once).
        double sum = 0.;
        for( int first = 0; first < class_count; first++ )
        {
            for( int second = first+1; second < class_count; second++, df++ )
            {
                sum = -df->rho;
                const int sv_count = df->sv_count;
                // sv_index maps the n-th coefficient of this function to its support vector.
                for( int n = 0; n < sv_count; n++ )
                    sum += df->alpha[n]*buffer[df->sv_index[n]];

                // A positive value votes for the first class of the pair, otherwise for the second.
                if( sum > 0 )
                    vote[first]++;
                else
                    vote[second]++;
            }
        }

        // Pick the class with the most votes; ties keep the lower class index.
        int winner = 0;
        for( int c = 1; c < class_count; c++ )
        {
            if( vote[c] > vote[winner] )
                winner = c;
        }

        // The raw decision value is only reported for two-class problems.
        if( returnDFVal && class_count == 2 )
            result = (float)sum;
        else
            result = (float)(class_labels->data.i[winner]);
    }
    else
        CV_Error( CV_StsBadArg, "INTERNAL ERROR: Unknown SVM type, "
                                "the SVM structure is probably corrupted" );

    return result;
}
kernel->calc( sv_total, var_count, (const float**)sv, row_sample, buffer );
计算测试样本与每个支持向量之间的核函数值(可理解为一种距离度量),结果存储到buffer中。下面以线性核函数为例。
从上面的分析可以看出:如果不更改rho或者alpha的符号,计算的是-rho+w*x的结果,大于0表示负样本,小于0表示正样本;因此在HOGDescriptor中需要将alpha更改符号,rho直接保存即可,这样给出的结果是rho-w*x,大于0表示正样本。
w=alpha*sv:alpha中存放每一个支持向量的权重,sv表示每一个支持向量,w为加权求和后的支持向量。
void CvSVMKernel::calc( int vcount, int var_count, const float** vecs,
const float* another, Qfloat* results )
{
const Qfloat max_val = (Qfloat)(FLT_MAX*1e-3);
int j;
// 对于核函数为Liner时调用线性核函数进行计算
(this->*calc_func)( vcount, var_count, vecs, another, results );
// 检查是否越界
for( j = 0; j < vcount; j++ )
{
if( results[j] > max_val )
results[j] = max_val;
}
}
线性核函数又调用non_rbf_base核函数进行计算
// Linear kernel: the plain dot product, obtained from calc_non_rbf_base with
// a scale of 1 and an offset of 0.
void CvSVMKernel::calc_linear( int vcount, int var_count, const float** vecs,
                               const float* another, Qfloat* results )
{
    const double scale = 1;
    const double offset = 0;
    calc_non_rbf_base( vcount, var_count, vecs, another, results, scale, offset );
}
vcount表示支持向量的个数,也就是位于正负样本分界面上的正负样本总数
var_count表示样本的特征维度
vecs表示支持向量,也就是位于正负样本分界面上的正负样本
another表示测试样本
results表示测试样本与每一个支持向量的核函数计算结果(对每一维累加得到)
// Shared core of the dot-product based kernels.
// For every vector vecs[v] (v in [0, vcount)) computes
//     results[v] = dot(vecs[v], another) * alpha + beta
// over var_count features. alpha and beta are a scale and an offset applied to
// the dot product (calc_linear passes 1 and 0); they are not per-vector weights.
void CvSVMKernel::calc_non_rbf_base( int vcount, int var_count, const float** vecs,
                                     const float* another, Qfloat* results,
                                     double alpha, double beta )
{
    for( int v = 0; v < vcount; v++ )
    {
        const float* sample = vecs[v];
        double dot = 0;
        int d = 0;

        // Main part: four features per iteration.
        while( d <= var_count - 4 )
        {
            dot += sample[d]*another[d] + sample[d+1]*another[d+1] +
                   sample[d+2]*another[d+2] + sample[d+3]*another[d+3];
            d += 4;
        }

        // Tail: the features left over after the blocks of four.
        while( d < var_count )
        {
            dot += sample[d]*another[d];
            d++;
        }

        results[v] = (Qfloat)(dot*alpha + beta);
    }
}
// RBF kernel. For every vector vecs[v] computes
//     results[v] = exp( -params->gamma * ||vecs[v] - another||^2 )
// The scaled squared distances are written to results first; a single cvExp
// call over the whole row then exponentiates them in place.
void CvSVMKernel::calc_rbf( int vcount, int var_count, const float** vecs,
                            const float* another, Qfloat* results )
{
    // Matrix header over `results`, used for the in-place exponential at the end.
    CvMat R = cvMat( 1, vcount, QFLOAT_TYPE, results );
    double gamma = -params->gamma;

    // One iteration per vector (vcount vectors, var_count features each).
    for( int v = 0; v < vcount; v++ )
    {
        const float* sample = vecs[v];
        double sq_dist = 0;
        int d = 0;

        // Four features per iteration, accumulated as two pairs.
        while( d <= var_count - 4 )
        {
            double d0 = sample[d] - another[d];
            double d1 = sample[d+1] - another[d+1];
            sq_dist += d0*d0 + d1*d1;

            double d2 = sample[d+2] - another[d+2];
            double d3 = sample[d+3] - another[d+3];
            sq_dist += d2*d2 + d3*d3;

            d += 4;
        }

        // Remaining features.
        while( d < var_count )
        {
            double diff = sample[d] - another[d];
            sq_dist += diff*diff;
            d++;
        }

        results[v] = (Qfloat)(sq_dist*gamma);
    }

    if( vcount > 0 )
        cvExp( &R, &R );
}
// CvParamGrid describes the search range of one parameter for the
// cross-validated automatic training (train_auto) through three values:
// min_val, max_val and step.
//
// check() validates a grid and returns true when all of the following hold:
//   * min_val is not greater than max_val,
//   * min_val is not below DBL_EPSILON,
//   * step is not below 1 + FLT_EPSILON (i.e. it is strictly greater than 1).
// Otherwise the matching CV_StsBadArg error is raised through CV_ERROR and
// ok keeps its initial value, false.
bool CvParamGrid::check() const
{
    bool ok = false;
    // Declared ahead of __BEGIN__ so that no error path jumps across its initialization.
    const double smallest_step = 1. + FLT_EPSILON;

    CV_FUNCNAME( "CvParamGrid::check" );
    __BEGIN__;

    if( max_val < min_val )
        CV_ERROR( CV_StsBadArg, "Lower bound of the grid must be less then the upper one" );

    if( DBL_EPSILON > min_val )
        CV_ERROR( CV_StsBadArg, "Lower bound of the grid must be positive" );

    if( step < smallest_step )
        CV_ERROR( CV_StsBadArg, "Grid step must greater then 1" );

    ok = true;

    __END__;

    return ok;
}
如果不确定参数的取值范围,可以通过下面的函数获得默认的参数网格;当然也可以通过一次次实验不断调整,最后得到一个大概的范围后,再采用train_auto函数进行训练。
// Returns the built-in search grid for one SVM parameter.
//
//   param_id - one of CvSVM::C, CvSVM::GAMMA, CvSVM::P, CvSVM::NU,
//              CvSVM::COEF, CvSVM::DEGREE
//
// For any other id an error (CV_StsBadArg) is reported through cvError and the
// default-constructed grid is returned.
CvParamGrid CvSVM::get_default_grid( int param_id )
{
    // One row per supported parameter, in the order in which the ids are tested.
    struct GridDefaults
    {
        int id;
        double min_val;
        double max_val;
        double step;
    };
    static const GridDefaults defaults[] =
    {
        { CvSVM::C,      0.1,  500, 5  },
        { CvSVM::GAMMA,  1e-5, 0.6, 15 },
        { CvSVM::P,      0.01, 100, 7  },
        { CvSVM::NU,     0.01, 0.2, 3  },
        { CvSVM::COEF,   0.1,  300, 14 },
        { CvSVM::DEGREE, 0.01, 4,   7  }
    };
    const int defaults_count = (int)(sizeof(defaults)/sizeof(defaults[0]));

    CvParamGrid grid;
    for( int n = 0; n < defaults_count; n++ )
    {
        if( param_id == defaults[n].id )
        {
            grid.min_val = defaults[n].min_val;
            grid.max_val = defaults[n].max_val;
            grid.step = defaults[n].step;
            return grid;
        }
    }

    // Unknown parameter id.
    cvError( CV_StsBadArg, "CvSVM::get_default_grid", "Invalid type of parameter "
        "(use one of CvSVM::C, CvSVM::GAMMA et al.)", __FILE__, __LINE__ );
    return grid;
}
bool CvSVM::train_auto( const CvMat* _train_data, const CvMat* _responses,
const CvMat* _var_idx, const CvMat* _sample_idx, CvSVMParams _params, int k_fold,
CvParamGrid C_grid, CvParamGrid gamma_grid, CvParamGrid p_grid,
CvParamGrid nu_grid, CvParamGrid coef_grid, CvParamGrid degree_grid,
bool balanced)
{
bool ok = false;
CvMat* responses = 0;
CvMat* responses_local = 0;
CvMemStorage* temp_storage = 0;
const float** samples = 0;
const float** samples_local = 0;
CV_FUNCNAME( "CvSVM::train_auto" );
__BEGIN__;
int svm_type, sample_count, var_count, sample_size;
int block_size = 1 << 16;
double* alpha;
RNG* rng = &theRNG();
// all steps are logarithmic and must be > 1
// 步长都必须大于1,由于等于1的时候会造成死循环。,默认step=10
double degree_step = 10, g_step = 10, coef_step = 10, C_step = 10, nu_step = 10, p_step = 10;
double gamma = 0, curr_c = 0, degree = 0, coef = 0, p = 0, nu = 0;
double best_degree = 0, best_gamma = 0, best_coef = 0, best_C = 0, best_nu = 0, best_p = 0;
float min_error = FLT_MAX, error;
// SVMType为ONE_CLASS的时候不能进行自己主动训练
if( _params.svm_type == CvSVM::ONE_CLASS )
{
if(!train( _train_data, _responses, _var_idx, _sample_idx, _params ))
EXIT;
return true;
}
clear();
// 选择的交叉验证层数必须大于等于2。等于2的时候就是使用一个训练另外一个測试。训练两次,測试两次
// 等于10表示训练10次。測试10次,选择一组測试其余9组训练
if( k_fold < 2 )
CV_ERROR( CV_StsBadArg, "Parameter <k_fold> must be > 1" );
CV_CALL(set_params( _params ));
svm_type = _params.svm_type;
// All the parameters except, possibly, <coef0> are positive.
// <coef0> is nonnegative
// 检查各个參数的值是否满足要求,这是第一步检查(步长不能小于1),后面依据SVM类型与Kernel类型还会进行二次检查
if( C_grid.step <= 1 )
{
C_grid.min_val = C_grid.max_val = params.C;
C_grid.step = 10;
}
else
CV_CALL(C_grid.check());
if( gamma_grid.step <= 1 )
{
gamma_grid.min_val = gamma_grid.max_val = params.gamma;
gamma_grid.step = 10;
}
else
CV_CALL(gamma_grid.check());
if( p_grid.step <= 1 )
{
p_grid.min_val = p_grid.max_val = params.p;
p_grid.step = 10;
}
else
CV_CALL(p_grid.check());
if( nu_grid.step <= 1 )
{
nu_grid.min_val = nu_grid.max_val = params.nu;
nu_grid.step = 10;
}
else
CV_CALL(nu_grid.check());
if( coef_grid.step <= 1 )
{
coef_grid.min_val = coef_grid.max_val = params.coef0;
coef_grid.step = 10;
}
else
CV_CALL(coef_grid.check());
if( degree_grid.step <= 1 )
{
degree_grid.min_val = degree_grid.max_val = params.degree;
degree_grid.step = 10;
}
else
CV_CALL(degree_grid.check());
// these parameters are not used:
// 二次检查參数的值。依据核函数类型与SVM类型优化參数
if( params.kernel_type != CvSVM::POLY )
degree_grid.min_val = degree_grid.max_val = params.degree;
if( params.kernel_type == CvSVM::LINEAR )
gamma_grid.min_val = gamma_grid.max_val = params.gamma;
if( params.kernel_type != CvSVM::POLY && params.kernel_type != CvSVM::SIGMOID )
coef_grid.min_val = coef_grid.max_val = params.coef0;
if( svm_type == CvSVM::NU_SVC || svm_type == CvSVM::ONE_CLASS )
C_grid.min_val = C_grid.max_val = params.C;
if( svm_type == CvSVM::C_SVC || svm_type == CvSVM::EPS_SVR )
nu_grid.min_val = nu_grid.max_val = params.nu;
if( svm_type != CvSVM::EPS_SVR )
p_grid.min_val = p_grid.max_val = params.p;
CV_ASSERT( g_step > 1 && degree_step > 1 && coef_step > 1);
CV_ASSERT( p_step > 1 && C_step > 1 && nu_step > 1 );
/* Prepare training data and related parameters */
// 实现数据的转存,放到指针中
CV_CALL(cvPrepareTrainData( "CvSVM::train_auto", _train_data, CV_ROW_SAMPLE,
svm_type != CvSVM::ONE_CLASS ? _responses : 0,
svm_type == CvSVM::C_SVC ||
svm_type == CvSVM::NU_SVC ? CV_VAR_CATEGORICAL :
CV_VAR_ORDERED, _var_idx, _sample_idx,
false, &samples, &sample_count, &var_count, &var_all,
&responses, &class_labels, &var_idx ));
sample_size = var_count*sizeof(samples[0][0]);
// make the storage block size large enough to fit all
// the temporary vectors and output support vectors.
block_size = MAX( block_size, sample_count*(int)sizeof(CvSVMKernelRow));
block_size = MAX( block_size, sample_count*2*(int)sizeof(double) + 1024 );
block_size = MAX( block_size, sample_size*2 + 1024 );
CV_CALL( storage = cvCreateMemStorage(block_size + sizeof(CvMemBlock) + sizeof(CvSeqBlock)));
CV_CALL(temp_storage = cvCreateChildMemStorage(storage));
CV_CALL(alpha = (double*)cvMemStorageAlloc(temp_storage, sample_count*sizeof(double)));
create_kernel();
create_solver();
{
const int testset_size = sample_count/k_fold; // 每组測试集合大小
const int trainset_size = sample_count - testset_size; // 每组训练集合大小
const int last_testset_size = sample_count - testset_size*(k_fold-1); // 最后一组測试集合大小。实际为testset_size
const int last_trainset_size = sample_count - last_testset_size; // 最后一组训练集合大小。实际trainset_size
const bool is_regression = (svm_type == EPS_SVR) || (svm_type == NU_SVR);
size_t resp_elem_size = CV_ELEM_SIZE(responses->type);
size_t size = 2*last_trainset_size*sizeof(samples[0]);
samples_local = (const float**) cvAlloc( size );
memset( samples_local, 0, size );
responses_local = cvCreateMat( 1, trainset_size, CV_MAT_TYPE(responses->type) );
cvZero( responses_local );
// randomly permute samples and responses
// 随机变更样本和标签的顺序为了获取分组
for(int i = 0; i < sample_count; i++ )
{
int i1 = (*rng)(sample_count);
int i2 = (*rng)(sample_count);
const float* temp;
float t;
int y;
CV_SWAP( samples[i1], samples[i2], temp );
if( is_regression )
CV_SWAP( responses->data.fl[i1], responses->data.fl[i2], t );
else
CV_SWAP( responses->data.i[i1], responses->data.i[i2], y );
}
// 假设是分类问题。而且是2分类,而且须要均衡化分组
if (!is_regression && class_labels->cols==2 && balanced)
{
// count class samples
// responses中存放0和1。class_label中存放为0和1就对了,因此注意负样本的标签为0,正样本的标签为1!!
int num_0=0,num_1=0;
for (int i=0; i<sample_count; ++i)
{
if (responses->data.i[i]==class_labels->data.i[0])
++num_0;
else
++num_1;
}
// 哪个类别是较大的
int label_smallest_class;
int label_biggest_class;
if (num_0 < num_1)
{
label_biggest_class = class_labels->data.i[1];
label_smallest_class = class_labels->data.i[0];
}
else
{
label_biggest_class = class_labels->data.i[0];
label_smallest_class = class_labels->data.i[1];
int y;
CV_SWAP(num_0,num_1,y);
}
const double class_ratio = (double) num_0/sample_count;
// calculate class ratio of each fold
indexedratio *ratios=0;
ratios = (indexedratio*) cvAlloc(k_fold*sizeof(*ratios));
for (int k=0, i_begin=0; k<k_fold; ++k, i_begin+=testset_size)
{
int count0=0;
int count1=0;
int i_end = i_begin + (k<k_fold-1 ? testset_size : last_testset_size);
for (int i=i_begin; i<i_end; ++i)
{
if (responses->data.i[i]==label_smallest_class)
++count0;
else
++count1;
}
ratios[k].ind = k;
ratios[k].count_smallest = count0;
ratios[k].count_biggest = count1;
ratios[k].eval();
}
// initial distance
qsort(ratios, k_fold, sizeof(ratios[0]), icvCmpIndexedratio);
double old_dist = 0.0;
for (int k=0; k<k_fold; ++k)
old_dist += abs(ratios[k].val-class_ratio);
double new_dist = 1.0;
// iterate to make the folds more balanced
while (new_dist > 0.0)
{
if (ratios[0].count_biggest==0 || ratios[k_fold-1].count_smallest==0)
break; // we are not able to swap samples anymore
// what if we swap the samples, calculate the new distance
ratios[0].count_smallest++;
ratios[0].count_biggest--;
ratios[0].eval();
ratios[k_fold-1].count_smallest--;
ratios[k_fold-1].count_biggest++;
ratios[k_fold-1].eval();
qsort(ratios, k_fold, sizeof(ratios[0]), icvCmpIndexedratio);
new_dist = 0.0;
for (int k=0; k<k_fold; ++k)
new_dist += abs(ratios[k].val-class_ratio);
if (new_dist < old_dist)
{
// swapping really improves, so swap the samples
// index of the biggest_class sample from the minimum ratio fold
int i1 = ratios[0].ind * testset_size;
for ( ; i1<sample_count; ++i1)
{
if (responses->data.i[i1]==label_biggest_class)
break;
}
// index of the smallest_class sample from the maximum ratio fold
int i2 = ratios[k_fold-1].ind * testset_size;
for ( ; i2<sample_count; ++i2)
{
if (responses->data.i[i2]==label_smallest_class)
break;
}
// swap
const float* temp;
int y;
CV_SWAP( samples[i1], samples[i2], temp );
CV_SWAP( responses->data.i[i1], responses->data.i[i2], y );
old_dist = new_dist;
}
else
break; // does not improve, so break the loop
}
cvFree(&ratios);
}
// 遍历每一个參数进行測试,将最小错误率的检測结果保存下来
// 在自己主动话训练时是依照例如以下方式进行的。当然针对不同的核函数与不同的SVM类型进行了优化
while(cur_val=min_val; cur_val<=max_val;
cur_val*=step)
int* cls_lbls = class_labels ? class_labels->data.i : 0;
curr_c = C_grid.min_val;
do
{
params.C = curr_c;
gamma = gamma_grid.min_val;
do
{
params.gamma = gamma;
p = p_grid.min_val;
do
{
params.p = p;
nu = nu_grid.min_val;
do
{
params.nu = nu;
coef = coef_grid.min_val;
do
{
params.coef0 = coef;
degree = degree_grid.min_val;
do
{
params.degree = degree;
float** test_samples_ptr = (float**)samples;
uchar* true_resp = responses->data.ptr;
int test_size = testset_size;
int train_size = trainset_size;
// 依据每轮训练中最小错误率保存结果
// 一组測试。其余组用于训练。依据总的错误率作为最佳结果
error = 0;
for(int k = 0; k < k_fold; k++ )
{
// 分段拷贝训练样本与训练样本标签
memcpy( samples_local, samples, sizeof(samples[0])*test_size*k );
memcpy( samples_local + test_size*k, test_samples_ptr + test_size,
sizeof(samples[0])*(sample_count - testset_size*(k+1)) );
memcpy( responses_local->data.ptr, responses->data.ptr, resp_elem_size*test_size*k );
memcpy( responses_local->data.ptr + resp_elem_size*test_size*k,
true_resp + resp_elem_size*test_size,
resp_elem_size*(sample_count - testset_size*(k+1)) );
if( k == k_fold - 1 )
{
test_size = last_testset_size;
train_size = last_trainset_size;
responses_local->cols = last_trainset_size;
}
// Train SVM on <train_size> samples
// 分组測试
if( !do_train( svm_type, train_size, var_count,
(const float**)samples_local, responses_local, temp_storage, alpha ) )
EXIT;
// Compute test set error on <test_size> samples
// 统计错误样本个数
for(int i = 0; i < test_size; i++, true_resp += resp_elem_size, test_samples_ptr++ )
{
float resp = predict( *test_samples_ptr, var_count );
error += is_regression ? powf( resp - *(float*)true_resp, 2 )
: ((int)resp != cls_lbls[*(int*)true_resp]);
}
}
// 保存具有最小错误样本个数的训练參数
if( min_error > error )
{
min_error = error;
best_degree = degree;
best_gamma = gamma;
best_coef = coef;
best_C = curr_c;
best_nu = nu;
best_p = p;
}
degree *= degree_grid.step;
}
while( degree < degree_grid.max_val );
coef *= coef_grid.step;
}
while( coef < coef_grid.max_val );
nu *= nu_grid.step;
}
while( nu < nu_grid.max_val );
p *= p_grid.step;
}
while( p < p_grid.max_val );
gamma *= gamma_grid.step;
}
while( gamma < gamma_grid.max_val );
curr_c *= C_grid.step;
}
while( curr_c < C_grid.max_val );
}
// 计算错误率
min_error /= (float) sample_count;
params.C = best_C;
params.nu = best_nu;
params.p = best_p;
params.gamma = best_gamma;
params.degree = best_degree;
params.coef0 = best_coef;
// 依据最佳的參数做一次最后的训练,因此分组測试的目的是为了寻找最佳參数
CV_CALL(ok = do_train( svm_type, sample_count, var_count, samples, responses, temp_storage, alpha ));
__END__;
delete solver;
solver = 0;
cvReleaseMemStorage( &temp_storage );
cvReleaseMat( &responses );
cvReleaseMat( &responses_local );
cvFree( &samples );
cvFree( &samples_local );
if( cvGetErrStatus() < 0 || !ok )
clear();
return ok;
}
总结:
1) predict函数首先计算测试样本与每一个支持向量sv[j]之间的核函数值(距离度量),然后分别乘以对应支持向量的权重alpha[j]并求和,得到一个数,再用该值减去rho。如果结果大于0表示为负样本,否则为正样本。
2) 对于二分类问题,使用balanced=true的train_auto时,负样本标签必须为0(正样本标签为1),这样才可以通过统计正负样本的比例而得到一个较好的平衡结果。