标签:高精度 variable generated cloc rate sum str 长度 rest
● OpenACC 的原子操作,用到了 C++ 的一个高精度计时器
● 代码,直接的原子操作
#include <iostream>
#include <cstdlib>
#include <chrono>

// When ATOMIC is defined, the shared increment below is guarded by an
// OpenACC atomic update; comment this out to build without the guard.
#define ATOMIC

using namespace std;
using namespace std::chrono;

// Increments a single shared counter `count` times inside an OpenACC
// parallel loop and prints the resulting value together with the
// wall-clock time measured around the loop.
int main()
{
    high_resolution_clock::time_point t1 = high_resolution_clock::now(); // high-resolution timer

    const int count = 1073741824;
    int sum = 0;

    // copy, not copyout: copyout only transfers device -> host at region
    // exit, so the device copy of sum would start uninitialised and the
    // increments would accumulate onto garbage. copy also transfers the
    // host's initial 0 to the device at region entry.
#pragma acc parallel loop copy(sum)
    for (int i = 0; i < count; i++)
    {
#ifdef ATOMIC
#pragma acc atomic update
#endif
        sum++;
    }

    high_resolution_clock::time_point t2 = high_resolution_clock::now();
    duration<double> time = duration_cast<duration<double>>(t2 - t1);

    // Report the accumulated value rather than the loop bound so that a
    // wrong result (lost or stray increments) is visible in the output.
    cout << "\nCount = " << sum << ", duraion = " << time.count() << " s" << endl;
    return 0;
}
● 输出结果,不知道为什么,win10中的 pgCC 不能用
D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgCC -acc -o acc_win10.exe main.cpp -Minfo pgCC-Warning-C++ compilation is not supported: main.cpp
● 输出结果,WSL 中
// 不使用 OpenACC cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ pgc++ -std=c++11 -o acc.exe main.cpp -Minfo cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ ./acc.exe Count = 1073741824, duraion = 0.483907 s // 使用宏 ATOMIC,即使用原子操作 cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ pgc++ -acc -std=c++11 -o acc.exe main.cpp -Minfo main: 15, Generating copyout(sum) Accelerator kernel generated Generating Tesla code 18, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ ./acc.exe Count = 1073741824, duraion = 0.248377 s // 不用宏 ATOMIC,即不用原子操作 cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ pgc++ -acc -std=c++11 -o acc.exe main.cpp -Minfo main: 15, Generating copyout(sum) Accelerator kernel generated Generating Tesla code 18, #pragma acc loop seq 23, Accelerator restriction: induction variable live-out from loop: sum // 编译器提示:sum 在循环结束后仍被使用(live-out),循环无法并行,被改为串行执行(loop seq) cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ ./acc.exe Count = 1073741824, duraion = 0.247399 s
● 优化一下,使用分段计数
#include <iostream>
#include <cstdlib>
#include <chrono>

using namespace std;
using namespace std::chrono;

// Counts to `count` in segments: the range is split into count / length
// segments, each segment is summed with an inner reduction loop, and every
// segment's partial sum is added to the shared total with one atomic
// update. Prints the total and the wall-clock time measured around the loop.
int main()
{
    high_resolution_clock::time_point t1 = high_resolution_clock::now();

    const int count = 1073741824, length = count / 32; // length of each segment
    int sum = 0;

    // copy, not copyout: copyout only transfers device -> host at region
    // exit, so the device copy of sum would start uninitialised and the
    // partial sums would accumulate onto garbage. copy also transfers the
    // host's initial 0 to the device at region entry.
#pragma acc parallel loop copy(sum)
    for (int start = 0; start < count; start += length) // start is the first index of each segment
    {
        // Clamp the last segment so it never runs past count.
        const int end = (start + length < count) ? start + length : count;
        int subSum = 0;
#pragma acc loop worker reduction(+:subSum)
        for (int j = start; j < end; j++) // one increment per index in [start, end)
            subSum++;

        // Fold this segment's reduced result into the shared total.
#pragma acc atomic update
        sum += subSum;
    }

    high_resolution_clock::time_point t2 = high_resolution_clock::now();
    duration<double> time = duration_cast<duration<double>>(t2 - t1);

    cout << "\nCount = " << sum << ", duraion = " << time.count() << " s" << endl;
    return 0;
}
● 输出结果,好像好一点点
cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ pgc++ -acc -std=c++11 -o acc.exe main.cpp -Minfo main: 15, Generating copyout(sum) Accelerator kernel generated Generating Tesla code 18, #pragma acc loop gang /* blockIdx.x */ 23, #pragma acc loop seq /* threadIdx.y */ Generating reduction(+:subSum) 23, Loop is parallelizable cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ ./acc.exe Count = 1073741824, duraion = 0.246488 s
标签:高精度 variable generated cloc rate sum str 长度 rest
原文地址:https://www.cnblogs.com/cuancuancuanhao/p/9458900.html