标签:style blog http ar os 使用 sp for strong
本文的起源来自最近一个让我非常不爽的事。
我最近在改一个开源RNN工具包currennt(http://sourceforge.net/projects/currennt/),想用它实现RNNLM功能。
currennt使用了大量的面向对象的编程技巧,可以使用GPU,向量运算使用了thrust库(https://code.google.com/p/thrust/)。
RNNLM(http://rnnlm.org/)也有相应开源实现,非常算法风格的代码,向量运算就是自己使用数组实现的。
结果……大出我的意料,在不使用GPU的情况下,currennt慢成狗!我不断地修改,直到最后几乎完全在currennt里重写了一个RNNLM……速度才终于一致了。这花费了我大量时间,最关键的是我根本没打算花这些时间,算是计划外开销。
所以这里干脆对常用的几种向量运算做个评测,下回遇到至少心里有数。
参与评测的向量实现包括:
评测指标包括:
测试环境:
VS2010
python 2.7.6
Intel Xeon CPU E5649@2.53GHz x24
thrust v1.5
C++ array
创建全0向量:0.000s,几乎不占用时间
int vector_size=100000000; float* vector=(float*)calloc(vector_size,sizeof(float));
创建+填充向量:0.140s
int vector_size=100000000; float* vector=(float*)calloc(vector_size,sizeof(float)); for (int i=0;i<vector_size;++i){ vector[i]=0.01; }
向量点乘:0.390s
float sum=0; for(int i=0;i<vector_size;++i){ sum+=vector1[i]*vector2[i]; }
向量相乘:0.265s
// Element-wise product: writes vector1[i]*vector2[i] into vector3[i] for
// every index below vector_size.
// NOTE(review): vector1, vector2, vector3 and vector_size are not declared in
// this snippet; presumably they are float buffers of vector_size elements
// allocated as in the array snippets earlier in the article -- confirm.
for (int i = 0; i < vector_size; ++i) {
    vector3[i] = vector1[i] * vector2[i];
}
矩阵乘向量:0.344s
int matrix1_colnum=50000; int matrix1_rownum=2000; int matrix1_size=matrix1_colnum*matrix1_rownum; float* vector1=(float*)calloc(matrix1_size,sizeof(float)); for (int i=0;i<matrix1_size;++i){ vector1[i]=0.01; } float* vector2=(float*)calloc(matrix1_colnum,sizeof(float)); for (int i=0;i<matrix1_colnum;++i){ vector2[i]=0.02; } start_t=clock(); float* vector3=(float*)calloc(matrix1_rownum,sizeof(float)); for(int row=0;row<matrix1_rownum;++row){ for(int col=0;col<matrix1_colnum;++col){ vector3[row]+=vector1[row*matrix1_colnum+col]*vector2[col]; } } end_t=clock();
矩阵乘矩阵:0.749s
(耗费时间与matrix1_rownum*matrix1_colnum*matrix2_colnum成正比)
int matrix1_rownum=200; int matrix1_colnum=5000; int matrix1_size=matrix1_colnum*matrix1_rownum; float* vector1=(float*)calloc(matrix1_size,sizeof(float)); for (int i=0;i<matrix1_size;++i){ vector1[i]=0.01; } int matrix2_rownum=5000; int matrix2_colnum=200; int matrix2_size=matrix2_rownum*matrix2_colnum; float* vector2=(float*)calloc(matrix2_size,sizeof(float)); for (int i=0;i<matrix2_size;++i){ vector2[i]=0.02; } int matrix3_size=matrix1_rownum*matrix2_colnum; float* vector3=(float*)calloc(matrix3_size,sizeof(float)); start_t=clock(); for(int row1=0;row1<matrix1_rownum;++row1){ for(int col2=0;col2<matrix2_colnum;++col2){ for(int col1=0;col1<matrix1_colnum;++col1){ vector3[row1*matrix2_colnum+col2]+=vector1[row1*matrix1_colnum+col1]*vector2[col1*matrix2_colnum+col2]; } } } end_t=clock();
C++ STL vector
创建全0向量:0.140s
int vect_size=100000000;
vector<float> vector(vect_size);
创建+填充向量:0.140s
int vect_size=100000000; vector<float> vector(vect_size,0.01);
向量点乘:0.375s
int vect_size=100000000; vector<float> vector1(vect_size,0.01); vector<float> vector2(vect_size,0.02); start_t=clock(); float sum=0; for(int i=0;i<vect_size;++i){ sum+=vector1[i]*vector2[i]; } end_t=clock();
向量相乘:0.250s
int vect_size=100000000; vector<float> vector1(vect_size,0.01); vector<float> vector2(vect_size,0.02); vector<float> vector3(vect_size); start_t=clock(); for(int i=0;i<vect_size;++i){ vector3[i]=vector1[i]*vector2[i]; } end_t=clock();
矩阵乘向量:0.390s
int matrix1_colnum=50000; int matrix1_rownum=2000; int matrix1_size=matrix1_colnum*matrix1_rownum; vector<float> vector1(matrix1_size,0.01); vector<float> vector2(matrix1_colnum,0.02); vector<float> vector3(matrix1_rownum); start_t=clock(); for(int row=0;row<matrix1_rownum;++row){ for(int col=0;col<matrix1_colnum;++col){ vector3[row]+=vector1[row*matrix1_colnum+col]*vector2[col]; } } end_t=clock();
矩阵乘矩阵:0.827s
int matrix1_rownum=200; int matrix1_colnum=5000; int matrix1_size=matrix1_colnum*matrix1_rownum; vector<float> vector1(matrix1_size,0.01); int matrix2_rownum=5000; int matrix2_colnum=200; int matrix2_size=matrix2_rownum*matrix2_colnum; vector<float> vector2(matrix2_size,0.02); int matrix3_size=matrix1_rownum*matrix2_colnum; vector<float> vector3(matrix3_size); start_t=clock(); for(int row1=0;row1<matrix1_rownum;++row1){ for(int col2=0;col2<matrix2_colnum;++col2){ for(int col1=0;col1<matrix1_colnum;++col1){ vector3[row1*matrix2_colnum+col2]+=vector1[row1*matrix1_colnum+col1]*vector2[col1*matrix2_colnum+col2]; } } } end_t=clock();
C++ thrust(CPU)
创建全0向量:0.140s
int vect_size=100000000; thrust::host_vector<float> vector1(vect_size);
创建+填充向量:0.140s
int vect_size=100000000; thrust::host_vector<float> vector1(vect_size,0.01);
填充向量:0.078s
thrust::fill(vector1.begin(),vector1.end(),0.01);
向量点乘:0.359s
// Dot product of two host vectors with Thrust, done in two passes:
// an element-wise multiply into a scratch vector, then a sum reduction.
int vect_size=100000000;
thrust::host_vector<float> vector1(vect_size,(float)0.1);
thrust::host_vector<float> vector2(vect_size,(float)0.2);
// Scratch buffer for the products; its initial contents are overwritten by transform.
thrust::host_vector<float> vector3(vect_size,(float)0.2);
start_t=clock();
// Pass 1: vector3[i] = vector1[i] * vector2[i].
thrust::transform(vector1.begin(),vector1.end(),vector2.begin(),vector3.begin(),thrust::multiplies<float>());
// Pass 2: add up the products. The reduction operator has to be plus: with
// multiplies and an initial value of 0 the result would always be 0.
float sum=thrust::reduce(vector3.begin(),vector3.end(),(float)0,thrust::plus<float>());
end_t=clock();
向量相乘:0.187s
int vect_size=100000000; thrust::host_vector<float> vector1(vect_size,(float)0.1); thrust::host_vector<float> vector2(vect_size,(float)0.2); thrust::host_vector<float> vector3(vect_size); start_t=clock(); thrust::transform(vector1.begin(),vector1.end(),vector2.begin(),vector3.begin(),thrust::multiplies<float>()); end_t=clock();
矩阵乘向量:0.110s
struct matrixXvect_func { thrust::host_vector<float>* matrix; thrust::host_vector<float>* vector; int matrix_rownum; int matrix_colnum; __host__ __device__ float operator()(const int& idx) const{ float t=0; for(int col=0;col<matrix_colnum;++col){ t+=(*matrix)[idx*matrix_colnum+col]* (*vector)[col]; } return t; } }; int matrix1_colnum=50000; int matrix1_size=matrix1_colnum*matrix1_rownum; thrust::host_vector<float> vector1(matrix1_size,(float)0.1); thrust::host_vector<float> vector2(matrix1_colnum,(float)0.2); thrust::host_vector<float> vector3(matrix1_rownum); start_t=clock(); matrixXvect_func fn; fn.matrix=&vector1; fn.vector=&vector2; fn.matrix_rownum=matrix1_rownum; fn.matrix_colnum=matrix1_colnum; thrust::transform( thrust::counting_iterator<int>(0), thrust::counting_iterator<int>(0) + matrix1_rownum, vector3.begin(), fn ); end_t=clock();
矩阵乘矩阵:0.655s
// Functor that produces one entry of a matrix-times-matrix product.
// Both operands are stored flat; matrix1 is addressed as
// [row*matrix1_colnum+col] and matrix2 as [row*matrix2_colnum+col].
struct matrixXmatrix_func {
    thrust::host_vector<float>* matrix1; // left operand (not owned)
    thrust::host_vector<float>* matrix2; // right operand (not owned)
    int matrix1_rownum; // assigned by the driver; not read by operator()
    int matrix1_colnum; // length of the inner (summed) dimension
    int matrix2_rownum; // assigned by the driver; not read by operator()
    int matrix2_colnum; // number of columns of the result
    // idx is a flat index into the result: row = idx/matrix2_colnum,
    // column = idx%matrix2_colnum. Returns the dot product of that row of
    // matrix1 with that column of matrix2.
    __host__ __device__ float operator()(const int& idx) const{
        int rownum=idx/matrix2_colnum;
        int colnum=idx%matrix2_colnum;
        float t=0;
        for(int col=0;col<matrix1_colnum;++col){
            t+=(*matrix1)[rownum*matrix1_colnum+col]* (*matrix2)[col*matrix2_colnum+colnum];
        }
        return t;
    }
};
// Operands: a 200x5000 matrix filled with 0.1 and a 5000x200 matrix filled with 0.2.
int matrix1_rownum=200;
int matrix1_colnum=5000;
int matrix1_size=matrix1_colnum*matrix1_rownum;
thrust::host_vector<float> vector1(matrix1_size,(float)0.1);
int matrix2_rownum=5000;
int matrix2_colnum=200;
int matrix2_size=matrix2_rownum*matrix2_colnum;
thrust::host_vector<float> vector2(matrix2_size,(float)0.2);
// Result buffer with matrix1_rownum*matrix2_colnum entries.
int matrix3_size=matrix1_rownum*matrix2_colnum;
thrust::host_vector<float> vector3(matrix3_size);
// Timed region: functor setup plus the transform call.
start_t=clock();
matrixXmatrix_func fn;
fn.matrix1=&vector1;
fn.matrix2=&vector2;
fn.matrix1_rownum=matrix1_rownum;
fn.matrix1_colnum=matrix1_colnum;
fn.matrix2_rownum=matrix2_rownum;
fn.matrix2_colnum=matrix2_colnum;
// Apply fn to the flat indices 0..matrix3_size-1; vector3[idx] receives the
// result entry for that index.
thrust::transform( thrust::counting_iterator<int>(0), thrust::counting_iterator<int>(0) + matrix3_size, vector3.begin(), fn );
end_t=clock();
标签:style blog http ar os 使用 sp for strong
原文地址:http://www.cnblogs.com/plwang1990/p/4147379.html