For now, here is the complete code (sparse matrix-vector multiplication with dense MAT, CSR, ELL and COO formats, on both CPU and GPU); the remaining gaps will be filled in later.
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <math.h>
#include <time.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define ROW (256+79)
#define COL (256+83)
#define EPSILON 0.1             // fraction of non-zero elements in the coefficient matrix
#define NON_ZEROS (ROW * COL)   // upper bound on non-zeros (size of the temporaries used when converting MAT to ELL)
#define THREAD_SIZE 256
#define SEED 1                  // (unsigned int)time(NULL)
#define MAX(x,y) ((x>y)?x:y)
#define CEIL(x,y) (int)(( x - 1 ) / y + 1)
#define INT false               // numeric type to use

#if INT
typedef int format;
#else
typedef float format;
#endif

// Matrix storage formats
typedef struct                  // Matrix: plain dense (row-major) format
{
    int row;                    // number of rows
    int col;                    // number of columns
    int count;                  // number of non-zero elements (not needed for the computation, only for conversions)
    format *data;               // element values
}
MAT;

typedef struct                  // Compressed Sparse Row format
{
    int row;                    // number of rows
    int col;                    // number of columns
    format *data;               // non-zero element values
    int *index;                 // column index of each element
    int *ptr;                   // row pointers
}
CSR;

typedef struct                  // ELL format
{
    int row;                    // number of rows
    int col;                    // number of columns
    format *data;               // non-zero element values
    int *index;                 // column index of each element
}
ELL;

typedef struct                  // Coordinate format
{
    int row;                    // number of rows
    int col;                    // number of columns
    int count;                  // number of elements
    int *row_index;             // row indices
    int *col_index;             // column indices
    format *data;               // element values
}
COO;

// Utility functions
void checkNULL(const void *input)
{
    if (input == NULL)
    {
        printf("\n\tfind a NULL!");
        exit(1);
    }
    return;
}

void checkCudaError(cudaError input)
{
    if (input != cudaSuccess)
    {
        printf("\n\tfind a cudaError!");
        exit(1);
    }
    return;
}

int checkResult(const format * in1, const format * in2, const int length)
// Returns 0 when the two vectors are equal, otherwise "index of the first differing element plus one"
// (so that a mismatch at element 0 is not reported as 0); subtract 1 before using the index.
{
    int i;
    for (i = 0; i < length; i++)
    {
        if (in1[i] != in2[i])
            return i + 1;
    }
    return 0;
}

// Matrix printing
void printMAT(MAT *in)          // print a MAT matrix
{
    int i;
    printf("\n\t");
    for (i = 0; i < in->row * in->col; i++)
    {
#if INT
        printf("%d ", in->data[i]);
#else
        printf("%.2f ", in->data[i]);
#endif
        if ((i + 1) % in->col == 0)
            printf("\n\t");
    }
    printf("\n");
    return;
}

void printCSR(const CSR *in)    // print a CSR matrix
{
    int i;
    printf("\n\t%d,%d", in->row, in->col);
    printf("\n\t");
    for (i = 0; i < in->ptr[in->row]; i++)
#if INT
        printf("%d ", in->data[i]);
#else
        printf("%.2f ", in->data[i]);
#endif
    printf("\n\t");
    for (i = 0; i < in->ptr[in->row]; i++)
        printf("%d ", in->index[i]);
    printf("\n\t");
    for (i = 0; i < in->row + 1; i++)
        printf("%d ", in->ptr[i]);
    printf("\n\n");
    return;
}

void printELL(const ELL *in)    // print an ELL matrix
{
    int i;
    printf("\n\t%d,%d", in->row, in->col);
    printf("\n\t");
    for (i = 0; i < in->row * in->col; i++)
    {
#if INT
        printf("%d ", in->data[i]);
#else
        printf("%.2f ", in->data[i]);
#endif
        if ((i + 1) % in->col == 0)
            printf("\n\t");
    }
    printf("\n\t");
    for (i = 0; i < in->row * in->col; i++)
    {
        printf("%d ", in->index[i]);
        if ((i + 1) % in->col == 0)
            printf("\n\t");
    }
    printf("\n\n");
    return;
}

void printCOO(const COO *in)    // print a COO matrix
{
    int i;
    printf("\n\t%d,%d,%d", in->row, in->col, in->count);
    printf("\n\t");
    for (i = 0; i < in->count; i++)
    {
#if INT
        printf("%d ", in->data[i]);
#else
        printf("%.2f ", in->data[i]);
#endif
    }
    printf("\n\t");
    for (i = 0; i < in->count; i++)
    {
        printf("%d ", in->row_index[i]);
        printf("%d ", in->col_index[i]);
    }
    printf("\n\n");
    return;
}

// Matrix initialization and cleanup (still has problems)
MAT *initializeMAT(const int row, const int col, const bool isCPU)   // initialize a MAT
{
    MAT *temp;
    if (isCPU)
    {
        checkNULL(temp = (MAT *)malloc(sizeof(MAT)));
        temp->row = row;
        temp->col = col;
        temp->count = 0;
        checkNULL(temp->data = (format *)malloc(sizeof(format) * row * col));
    }
    else
    {
        checkCudaError(cudaMalloc((void **)&temp, sizeof(MAT)));
        temp->row = row;
        temp->col = col;
        temp->count = 0;
        checkCudaError(cudaMalloc((void **)&temp->data, sizeof(format) * row * col));
    }
    return temp;
}

int countMAT(MAT *in)           // count the non-zero elements of a MAT matrix
{
    if (in == NULL)
        return -1;

    int i, out;
    for (i = out = 0; i < in->row * in->col; i++)
        out += (in->data[i] != 0);
    return out;
}

int freeMAT(MAT *in, const bool isCPU)   // free a MAT
{
    checkNULL(in);
    if (isCPU)
    {
        free(in->data);
        free(in);
    }
    else
    {
        cudaFree(in->data);
        cudaFree(in);
    }
    return 0;
}

CSR * initializeCSR(const int row, const int col, const int count, const bool isCPU)   // initialize a CSR
{
    CSR *temp;
    if (isCPU)
    {
        checkNULL(temp = (CSR *)malloc(sizeof(CSR)));
        temp->row = row;
        temp->col = col;
        checkNULL(temp->data = (format *)malloc(sizeof(format) * count));
        checkNULL(temp->index = (int *)malloc(sizeof(int) * count));
        checkNULL(temp->ptr = (int *)malloc(sizeof(int) * (row + 1)));
    }
    else
    {
        checkCudaError(cudaMalloc((void **)&temp, sizeof(CSR)));
        //temp->row = row;
        //temp->col = col;
        checkCudaError(cudaMalloc((void **)&temp->data, sizeof(format) * count));
        checkCudaError(cudaMalloc((void **)&temp->index, sizeof(int) * count));
        checkCudaError(cudaMalloc((void **)&temp->ptr, sizeof(int) * (row + 1)));
    }
    return temp;
}

int freeCSR(CSR *in, const bool isCPU)   // free a CSR
{
    checkNULL(in);
    if (isCPU)
    {
        free(in->data);
        free(in->index);
        free(in->ptr);
        free(in);
    }
    else
    {
        cudaFree(in->data);
        cudaFree(in->index);
        cudaFree(in->ptr);
        cudaFree(in);
    }
    return 0;
}

ELL * initializeELL(const int row, const int col, const bool isCPU)   // initialize an ELL
{
    ELL *temp;
    if (isCPU)
    {
        checkNULL(temp = (ELL *)malloc(sizeof(ELL)));
        temp->row = row;
        temp->col = col;
        checkNULL(temp->data = (format *)malloc(sizeof(format) * row * col));
        checkNULL(temp->index = (int *)malloc(sizeof(int) * row * col));
    }
    else
    {
        checkCudaError(cudaMalloc((void **)&temp, sizeof(ELL)));
        //temp->row = row;
        //temp->col = col;
        checkCudaError(cudaMalloc((void **)&temp->data, sizeof(format) * row * col));
        checkCudaError(cudaMalloc((void **)&temp->index, sizeof(int) * row * col));
    }
    return temp;
}

int freeELL(ELL *in, const bool isCPU)   // free an ELL
{
    checkNULL(in);
    if (isCPU)
    {
        free(in->data);
        free(in->index);
        free(in);
    }
    else
    {
        cudaFree(in->data);
        cudaFree(in->index);
        cudaFree(in);
    }
    return 0;
}

COO * initializeCOO(const int row, const int col, const int count, const bool isCPU)   // initialize a COO
{
    COO * temp;
    if (isCPU)
    {
        checkNULL(temp = (COO *)malloc(sizeof(COO)));
        temp->row = row;
        temp->col = col;
        temp->count = count;
        checkNULL(temp->data = (format *)malloc(sizeof(format) * count));
        checkNULL(temp->row_index = (int *)malloc(sizeof(int) * count));
        checkNULL(temp->col_index = (int *)malloc(sizeof(int) * count));
    }
    else
    {
        checkCudaError(cudaMalloc((void **)&temp, sizeof(COO)));
        //temp->row = row;
        //temp->col = col;
        //temp->count = count;
        checkCudaError(cudaMalloc((void **)&temp->data, sizeof(format) * count));
        checkCudaError(cudaMalloc((void **)&temp->row_index, sizeof(int) * count));
        checkCudaError(cudaMalloc((void **)&temp->col_index, sizeof(int) * count));
    }
    return temp;
}

int freeCOO(COO *in, const bool isCPU)   // free a COO
{
    checkNULL(in);
    if (isCPU)
    {
        free(in->data);
        free(in->row_index);
        free(in->col_index);
        free(in);
    }
    else
    {
        cudaFree(in->data);
        cudaFree(in->row_index);
        cudaFree(in->col_index);
        cudaFree(in);
    }
    return 0;
}

// Conversions between matrix formats
int MATToCSR(const MAT *in, CSR *out)   // convert MAT to CSR
{
    if (in == NULL || out == NULL)
        return 1;

    int i = 0, j = 0, k = 0;
    out->row = in->row;                  // copy the dimensions
    out->col = in->col;
    out->ptr[k++] = 0;
    for (; i < in->row * in->col; i++)
    {
        if (in->data[i] != 0)            // found a non-zero element
        {
            if (j == in->count)          // more non-zeros than out->data can hold
                return 1;
            out->data[j] = in->data[i];  // store the value
            out->index[j] = i % in->col; // store its column index
            j++;
        }
        if ((i + 1) % in->col == 0)      // reached the end of a row, write the row pointer
            out->ptr[k++] = j;
    }
    return 0;
}

int CSRToMAT(const CSR *in, MAT *out)   // convert CSR to MAT
{
    if (in == NULL || out == NULL)
        return 1;

    int i, j;
    out->row = in->row;
    out->col = in->col;
    for (i = 0; i < in->row; i++)
    {
        for (j = 0; j < in->col; j++)
            out->data[i*in->col + j] = 0;
        for (j = in->ptr[i]; j < in->ptr[i + 1]; j++)
            out->data[i*in->col + in->index[j]] = in->data[j];
    }
    return 0;
}

int MATToELL(const MAT *in, ELL *out)   // convert MAT to ELL
{
    if (in == NULL || out == NULL)
        return 1;

    int i, j, k;
    format temp_data[NON_ZEROS];
    int temp_index[NON_ZEROS];
    for (i = j = k = 0; i < in->row*in->col; i++)   // find the longest row while pushing each row's elements to the left
    {
        temp_data[i] = 0;
        temp_index[i] = 0;
        if (in->data[i] != 0)                       // found a non-zero element
        {
            temp_data[i / in->col*in->col + j] = in->data[i];    // store the value
            temp_index[i / in->col*in->col + j] = i % in->col;   // record its column index
            j++;                                    // j = number of non-zeros found so far in the current row
        }
        if ((i + 1) % in->col == 0)                 // reached the end of a row
        {
            k = MAX(j, k);                          // update the maximum number of non-zeros per row
            j = 0;                                  // reset j before starting the next row
        }
    }
    out->row = k;                                   // maximum number of non-zeros in any row of the original matrix
    out->col = in->row;                             // the new matrix is shaped like a transpose of the original
    for (i = 0; i < in->row*in->col; i++)           // repack the temporaries using the new row length
    {
        temp_data[i - i / in->col*(in->col - k)] = temp_data[i];    // shift data and index forward
        temp_index[i - i / in->col*(in->col - k)] = temp_index[i];
        if (i >= out->row * k)
        {
            temp_data[i] = 0;
            temp_index[i] = 0;
        }
    }
    for (i = 0; i < out->row*out->col; i++)         // transpose
    {
        out->data[i] = temp_data[i % out->col * k + i / out->col];
        out->index[i] = temp_index[i % out->col * k + i / out->col];
    }
    for (; i < in->row*in->col; i++)                // pad with zeros
    {
        out->data[i] = 0;
        out->index[i] = 0;
    }
    return 0;
}

int ELLToMAT(const ELL *in, MAT *out)   // convert ELL to MAT (not implemented yet)
{
    if (in == NULL || out == NULL)
        return 1;
    return 0;
}

int MATToCOO(const MAT *in, COO *out)   // convert MAT to COO
{
    if (in == NULL || out == NULL)
        return 1;

    int i, j;
    out->row = in->row;
    out->col = in->col;
    for (i = j = 0; i < in->row * in->col; i++)
    {
        if (in->data[i] != 0)
        {
            out->data[j] = in->data[i];
            out->row_index[j] = i / in->col;
            out->col_index[j] = i % in->col;
            j++;
        }
    }
    out->count = j;
    return 0;
}

int COOToMAT(const COO *in, MAT *out)   // convert COO to MAT
{
    if (in == NULL || out == NULL)
        return 1;

    int i;
    out->row = in->row;
    out->col = in->col;
    for (i = 0; i < in->row * in->col; i++)
        out->data[i] = 0;
    for (i = 0; i < in->count; i++)
        out->data[in->row_index[i] * in->col + in->col_index[i]] = in->data[i];
    return 0;
}

int CSRToELL(const CSR *in, ELL *out)   // convert CSR to ELL (not implemented yet)
{
    if (in == NULL || out == NULL)
        return 1;
    return 0;
}

int ELLToCSR(const ELL *in, CSR *out)   // convert ELL to CSR (not implemented yet)
{
    if (in == NULL || out == NULL)
        return 1;
    return 0;
}

int CSRToCOO(const CSR *in, COO *out)   // convert CSR to COO (not implemented yet)
{
    if (in == NULL || out == NULL)
        return 1;
    return 0;
}

int COOToCSR(const COO *in, CSR *out)   // convert COO to CSR (not implemented yet)
{
    if (in == NULL || out == NULL)
        return 1;
    return 0;
}

int ELLToCOO(const ELL *in, COO *out)   // convert ELL to COO (not implemented yet)
{
    if (in == NULL || out == NULL)
        return 1;
    return 0;
}

int COOToELL(const COO *in, ELL *out)   // convert COO to ELL (not implemented yet)
{
    if (in == NULL || out == NULL)
        return 1;

    return 0;
}

// Multiplication of a matrix in each format with a vector
int dotMATCPU(const MAT *a, const MAT *x, MAT *y)   // CPU MAT multiplication
{
    if (a == NULL || x == NULL || y == NULL || a->col != x->row)
        return 1;

    int i;
    format sum;
    y->row = a->row;
    y->col = x->col;
    for (i = 0, sum = 0; i < a->row * a->col; i++)
    {
        sum += a->data[i] * x->data[i % a->col];
        if ((i + 1) % a->col == 0)
        {
            y->data[i / a->col] = sum;
            sum = 0;
        }
    }
    return 0;
}

int dotCSRCPU(const CSR *a, const MAT *x, MAT *y)   // CPU CSR multiplication
{
    if (a == NULL || x == NULL || y == NULL || a->col != x->row)
        return 1;

    int i, j;
    format sum;
    y->row = a->row;
    y->col = x->col;
    for (i = 0; i < a->row; i++)
    {
        for (sum = 0, j = a->ptr[i]; j < a->ptr[i + 1]; j++)
            sum += a->data[j] * x->data[a->index[j]];
        y->data[i] = sum;
    }
    return 0;
}

int dotELLCPU(const ELL *a, const MAT *x, MAT *y)   // CPU ELL multiplication
{
    if (a == NULL || x == NULL || y == NULL || a->col != x->row)
        return 1;

    int i, j;
    format sum;
    y->row = a->row;
    y->col = x->col;
    for (i = 0; i < a->row; i++)
    {
        for (sum = 0, j = 0; j < a->col; j++)
            sum += a->data[i + j*a->row] * x->data[a->index[i + j*a->row]];
        y->data[i] = sum;
    }
    return 0;
}

int dotCOOCPU(const COO *a, const MAT *x, MAT *y)   // CPU COO multiplication
{
    if (a == NULL || x == NULL || y == NULL || a->col != x->row)
        return 1;

    int i;
    y->row = a->row;
    y->col = x->col;
    for (i = 0; i < a->row; i++)
        y->data[i] = 0;
    for (i = 0; i < a->count; i++)
        y->data[a->row_index[i]] += a->data[i] * x->data[a->col_index[i]];
    return 0;
}

__global__ void dotMATGPU(const MAT *a, const MAT *x, MAT *y)   // GPU MAT multiplication
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id == 0)
    {
        y->row = a->row;
        y->col = x->col;
    }
    if (id < a->row)
    {
        int i;
        format sum = 0;
        for (i = 0; i < a->col; i++)
            sum += a->data[id * a->col + i] * x->data[i];
        y->data[id] = sum;
    }
    return;
}
__global__ void dotCSRGPU(const CSR *a, const MAT *x, MAT *y)   // GPU CSR multiplication
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id == 0)
    {
        y->row = a->row;
        y->col = x->col;
    }
    if (id < a->row)
    {
        int j;
        format sum;
        for (sum = 0, j = a->ptr[id]; j < a->ptr[id + 1]; j++)
            sum += a->data[j] * x->data[a->index[j]];
        y->data[id] = sum;
    }
    return;
}

__global__ void dotELLGPU(const ELL *a, const MAT *x, MAT *y)   // GPU ELL multiplication
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id == 0)
    {
        y->row = a->row;
        y->col = x->col;
    }
    if (id < a->row)
    {
        int j;
        format sum;
        for (sum = 0, j = 0; j < a->col; j++)
            sum += a->data[id + j*a->row] * x->data[a->index[id + j*a->row]];
        y->data[id] = sum;
    }
    return;
}

__global__ void dotCOOGPU(const COO *a, const MAT *x, MAT *y)   // GPU COO multiplication
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id == 0)
    {
        y->row = a->row;
        y->col = x->col;
    }
    if (id < a->count)
        atomicAdd(&(y->data[a->row_index[id]]), a->data[id] * x->data[a->col_index[id]]);
    return;
}

int test()   // test routine for matrix printing and the CPU computations
{
    int i;
    MAT a, c, x, y;
    CSR b;
    ELL d;
    COO e;

    a.row = 4, a.col = 5, a.count = 8;   // 4 x 5 test matrix with 8 non-zero elements
    // the pointer members of these stack structs must be backed by real buffers before use
    a.data = (format *)malloc(sizeof(format) * a.row * a.col);
    c.data = (format *)malloc(sizeof(format) * a.row * a.col);
    x.data = (format *)malloc(sizeof(format) * a.col);
    y.data = (format *)malloc(sizeof(format) * a.row);
    b.data = (format *)malloc(sizeof(format) * a.count);
    b.index = (int *)malloc(sizeof(int) * a.count);
    b.ptr = (int *)malloc(sizeof(int) * (a.row + 1));
    d.data = (format *)malloc(sizeof(format) * a.row * a.col);
    d.index = (int *)malloc(sizeof(int) * a.row * a.col);
    e.data = (format *)malloc(sizeof(format) * a.count);
    e.row_index = (int *)malloc(sizeof(int) * a.count);
    e.col_index = (int *)malloc(sizeof(int) * a.count);

    for (i = 0; i < a.row * a.col; i++)
        a.data[i] = 0;
    a.data[0] = 3, a.data[2] = 1, a.data[4] = 5, a.data[11] = 2;
    a.data[12] = 4, a.data[13] = 1, a.data[15] = 1, a.data[18] = 1;

    x.row = a.col, x.col = 1;
    for (i = 0; i < x.row; i++)
        x.data[i] = i;

    printMAT(&a);            // the MAT matrix

    MATToCSR(&a, &b);        // MAT to CSR
    printCSR(&b);

    CSRToMAT(&b, &c);        // CSR back to MAT
    printMAT(&c);

    MATToELL(&a, &d);        // MAT to ELL
    printELL(&d);

    MATToCOO(&a, &e);        // MAT to COO
    printCOO(&e);

    COOToMAT(&e, &c);        // COO back to MAT
    printMAT(&c);

    dotMATCPU(&a, &x, &y);   // MAT multiplication
    printMAT(&y);

    dotCSRCPU(&b, &x, &y);   // CSR multiplication
    printMAT(&y);

    dotELLCPU(&d, &x, &y);   // ELL multiplication
    printMAT(&y);

    dotCOOCPU(&e, &x, &y);   // COO multiplication
    printMAT(&y);

    getchar();
    return 0;
}

int main()
{
    int i;
    MAT *h_aMAT, *d_aMAT, *h_xMAT, *d_xMAT, *h_yMAT_1, *h_yMAT_2, *d_yMAT;
    CSR *h_aCSR, *d_aCSR;
    ELL *h_aELL, *d_aELL;
    COO *h_aCOO, *d_aCOO;

    clock_t timeCPU;
    cudaEvent_t startGPU, stopGPU;
    float timeGPU;
    cudaEventCreate(&startGPU);
    cudaEventCreate(&stopGPU);

    printf("\n\tStart! Matrix dimension:\t%d × %d", ROW, COL);

    h_aMAT = initializeMAT(ROW, COL, 1);
    h_xMAT = initializeMAT(COL, 1, 1);
    h_yMAT_1 = initializeMAT(ROW, 1, 1);
    h_yMAT_2 = initializeMAT(ROW, 1, 1);

    // Initialization
    timeCPU = clock();
    srand(SEED);
#if INT
    for (i = 0; i < COL * ROW; i++)
        h_aMAT->data[i] = ((float)rand() < RAND_MAX * EPSILON) ? (rand() - RAND_MAX / 2) : 0;
    h_aMAT->count = countMAT(h_aMAT);
    for (i = 0; i < COL; i++)
        h_xMAT->data[i] = (rand() - RAND_MAX / 2);
    //h_xMAT->count = countMAT(h_xMAT);
#else
    for (i = 0; i < COL * ROW; i++)
        h_aMAT->data[i] = ((float)rand() < RAND_MAX * EPSILON) ? ((float)rand() / RAND_MAX) : 0;
    h_aMAT->count = countMAT(h_aMAT);
    for (i = 0; i < COL; i++)
        h_xMAT->data[i] = ((float)rand() / RAND_MAX);
    //h_xMAT->count = countMAT(h_xMAT);
#endif
    printf("\n\tInitialized! Time:\t\t%8.3f ms\n", (float)(clock() - timeCPU));

    // CPU computation
    // MAT format
    timeCPU = clock();
    dotMATCPU(h_aMAT, h_xMAT, h_yMAT_1);
    timeCPU = clock() - timeCPU;
    printf("\n\tdotMATCPU time:\t%8.3f ms\n", (float)timeCPU);

    // CSR format
    h_aCSR = initializeCSR(ROW, COL, h_aMAT->count, 1);
    MATToCSR(h_aMAT, h_aCSR);
    timeCPU = clock();
    dotCSRCPU(h_aCSR, h_xMAT, h_yMAT_2);
    timeCPU = clock() - timeCPU;
    printf("\n\tdotCSRCPU time:\t%8.3f ms\n", (float)timeCPU);
    if (i = checkResult(h_yMAT_1->data, h_yMAT_2->data, ROW))
        printf("\n\tError at i = %d\n\tcpu_out1[i] = %10f, h_yMAT_2[i] = %10f\n",
            i - 1, (float)h_yMAT_1->data[i - 1], (float)h_yMAT_2->data[i - 1]);
    else
        printf("\n\tdotCSRCPU correct!\n");

    // ELL format
    h_aELL = initializeELL(ROW, COL, 1);
    MATToELL(h_aMAT, h_aELL);
    timeCPU = clock();
    dotELLCPU(h_aELL, h_xMAT, h_yMAT_2);
    timeCPU = clock() - timeCPU;
    printf("\n\tdotELLCPU time:\t%8.3f ms\n", (float)timeCPU);
    if (i = checkResult(h_yMAT_1->data, h_yMAT_2->data, ROW))
        printf("\n\tError at i = %d\n\tcpu_out1[i] = %10f, h_yMAT_2[i] = %10f\n",
            i - 1, (float)h_yMAT_1->data[i - 1], (float)h_yMAT_2->data[i - 1]);
    else
        printf("\n\tdotELLCPU correct!\n");

    // COO format
    h_aCOO = initializeCOO(ROW, COL, h_aMAT->count, 1);
    MATToCOO(h_aMAT, h_aCOO);
    timeCPU = clock();
    dotCOOCPU(h_aCOO, h_xMAT, h_yMAT_2);
    timeCPU = clock() - timeCPU;
    printf("\n\tdotCOOCPU time:\t%8.3f ms\n", (float)timeCPU);
    if (i = checkResult(h_yMAT_1->data, h_yMAT_2->data, ROW))
        printf("\n\tError at i = %d\n\tcpu_out1[i] = %10f, h_yMAT_2[i] = %10f\n",
            i - 1, (float)h_yMAT_1->data[i - 1], (float)h_yMAT_2->data[i - 1]);
    else
        printf("\n\tdotCOOCPU correct!\n");

    // GPU computation
    d_aMAT = initializeMAT(ROW, COL, 0);
    d_xMAT = initializeMAT(ROW, COL, 0);
    d_yMAT = initializeMAT(ROW, COL, 0);
    d_aCSR = initializeCSR(ROW, COL, h_aMAT->count, 0);
    d_aELL = initializeELL(ROW, COL, 0);
    d_aCOO = initializeCOO(ROW, COL, h_aMAT->count, 0);

    cudaMemcpy(d_aMAT, h_aMAT, sizeof(MAT), cudaMemcpyHostToDevice);
    cudaMemcpy(d_xMAT, h_xMAT, sizeof(format) * COL, cudaMemcpyHostToDevice);
    cudaMemcpy(d_aCSR, h_aCSR, sizeof(CSR), cudaMemcpyHostToDevice);
    cudaMemcpy(d_aELL, h_aELL, sizeof(ELL), cudaMemcpyHostToDevice);
    cudaMemcpy(d_aCOO, h_aCOO, sizeof(COO), cudaMemcpyHostToDevice);

    // MAT format
    cudaMemset(d_yMAT, 0, sizeof(MAT));
    cudaEventRecord(startGPU, 0);
    dotMATGPU << <CEIL(ROW, THREAD_SIZE), THREAD_SIZE >> > (d_aMAT, d_xMAT, d_yMAT);
    cudaMemcpy(h_yMAT_2, d_yMAT, sizeof(format) * ROW, cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    cudaEventRecord(stopGPU, 0);
    cudaEventSynchronize(stopGPU);
    cudaEventElapsedTime(&timeGPU, startGPU, stopGPU);
    printf("\n\tdotMATGPU time:\t%8.3f ms\n", timeGPU);
    if (i = checkResult(h_yMAT_1->data, h_yMAT_2->data, ROW))
        printf("\n\tError at i = %d\n\tcpu_out[i] = %10f, gpu_out[i] = %10f\n",
            i - 1, (float)h_yMAT_1->data[i - 1], (float)h_yMAT_2->data[i - 1]);
    else
        printf("\n\tdotMATGPU correct!\n");

    // CSR format
    cudaMemset(d_yMAT, 0, sizeof(MAT));
    cudaEventRecord(startGPU, 0);
    dotCSRGPU << <CEIL(ROW, THREAD_SIZE), THREAD_SIZE >> > (d_aCSR, d_xMAT, d_yMAT);
    cudaMemcpy(h_yMAT_2, d_yMAT, sizeof(format) * ROW, cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    cudaEventRecord(stopGPU, 0);
    cudaEventSynchronize(stopGPU);
    cudaEventElapsedTime(&timeGPU, startGPU, stopGPU);
    printf("\n\tdotCSRGPU time:\t%8.3f ms\n", timeGPU);
    if (i = checkResult(h_yMAT_1->data, h_yMAT_2->data, ROW))
        printf("\n\tError at i = %d\n\tcpu_out[i] = %10f, gpu_out[i] = %10f\n",
            i - 1, (float)h_yMAT_1->data[i - 1], (float)h_yMAT_2->data[i - 1]);
    else
        printf("\n\tdotCSRGPU correct!\n");

    // ELL format
    cudaMemset(d_yMAT, 0, sizeof(MAT));
    cudaEventRecord(startGPU, 0);
    dotELLGPU << <CEIL(ROW, THREAD_SIZE), THREAD_SIZE >> > (d_aELL, d_xMAT, d_yMAT);
    cudaMemcpy(h_yMAT_2, d_yMAT, sizeof(format) * ROW, cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    cudaEventRecord(stopGPU, 0);
    cudaEventSynchronize(stopGPU);
    cudaEventElapsedTime(&timeGPU, startGPU, stopGPU);
    printf("\n\tdotELLGPU time:\t%8.3f ms\n", timeGPU);
    if (i = checkResult(h_yMAT_1->data, h_yMAT_2->data, ROW))
        printf("\n\tError at i = %d\n\tcpu_out[i] = %10f, gpu_out[i] = %10f\n",
            i - 1, (float)h_yMAT_1->data[i - 1], (float)h_yMAT_2->data[i - 1]);
    else
        printf("\n\tdotELLGPU correct!\n");

    // COO format
    cudaMemset(d_yMAT, 0, sizeof(MAT));
    cudaEventRecord(startGPU, 0);
    dotCOOGPU << <CEIL(h_aMAT->count, THREAD_SIZE), THREAD_SIZE >> > (d_aCOO, d_xMAT, d_yMAT);
    cudaMemcpy(h_yMAT_2, d_yMAT, sizeof(format) * ROW, cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    cudaEventRecord(stopGPU, 0);
    cudaEventSynchronize(stopGPU);
    cudaEventElapsedTime(&timeGPU, startGPU, stopGPU);
    printf("\n\tdotCOOGPU time:\t%8.3f ms\n", timeGPU);
    if (i = checkResult(h_yMAT_1->data, h_yMAT_2->data, ROW))
        printf("\n\tError at i = %d\n\tcpu_out[i] = %10f, gpu_out[i] = %10f\n",
            i - 1, (float)h_yMAT_1->data[i - 1], (float)h_yMAT_2->data[i - 1]);
    else
        printf("\n\tdotCOOGPU correct!\n");

    freeMAT(h_aMAT, 1);
    freeMAT(h_xMAT, 1);
    freeMAT(h_yMAT_1, 1);
    freeMAT(h_yMAT_2, 1);
    freeCSR(h_aCSR, 1);
    freeELL(h_aELL, 1);
    freeCOO(h_aCOO, 1);
    freeMAT(d_aMAT, 0);
    freeMAT(d_xMAT, 0);
    freeMAT(d_yMAT, 0);
    freeCSR(d_aCSR, 0);
    freeELL(d_aELL, 0);
    freeCOO(d_aCOO, 0);
    getchar();
    return 0;
}
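A note on the part flagged above as problematic: the device branches of the initialize functions call cudaMalloc on &temp->data after temp itself was allocated with cudaMalloc, and main later copies whole structs with cudaMemcpy(d_aMAT, h_aMAT, sizeof(MAT), ...), which overwrites the device struct's data pointer with a host pointer. Both operations touch a struct that lives in device memory from host code, so the kernels end up chasing invalid pointers. A common workaround is to fill a host-side copy of the struct whose pointer members already refer to device buffers, and only then copy that struct to the device. The sketch below is a minimal illustration of that pattern for the MAT case only; initializeMATDevice and h_mirror are illustrative names, not part of the listing above.

// A minimal sketch (not the listing's approach): build a device-resident MAT by
// keeping a host mirror of the struct whose data member points at device memory,
// then copying the mirror to the device in one cudaMemcpy.
MAT *initializeMATDevice(const MAT *h_in)            // hypothetical helper name
{
    MAT h_mirror;                                    // host copy of the struct
    h_mirror.row = h_in->row;
    h_mirror.col = h_in->col;
    h_mirror.count = h_in->count;
    checkCudaError(cudaMalloc((void **)&h_mirror.data,
        sizeof(format) * h_in->row * h_in->col));    // data member now points at device memory
    checkCudaError(cudaMemcpy(h_mirror.data, h_in->data,
        sizeof(format) * h_in->row * h_in->col, cudaMemcpyHostToDevice));

    MAT *d_out;                                      // device copy of the struct itself
    checkCudaError(cudaMalloc((void **)&d_out, sizeof(MAT)));
    checkCudaError(cudaMemcpy(d_out, &h_mirror, sizeof(MAT), cudaMemcpyHostToDevice));
    return d_out;                                    // can be passed directly to dotMATGPU
}

Freeing such an object needs the reverse step: copy the struct back to the host first so that the device data pointer can be handed to cudaFree. The same idea would apply to the CSR, ELL, and COO structs, and to retrieving the result: what should be copied back is the data buffer into h_yMAT_2->data, not sizeof(format) * ROW bytes over the struct pointer itself.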
Original post: http://www.cnblogs.com/cuancuancuanhao/p/7687855.html