Are there any tips for improving CUDA performance in this case — such as how variables are declared (global vs. local), how parameters are passed, or how memory is copied? I also wonder what makes the runtime differ between sum_gpu_SLOW and sum_gpu_FAST.
<Results / Profiling>
$ nvprof ./a.out
-GPU-Ready------------------------------------------
-GPU-FAST-------------------------------------------
runtime : 461ms
-GPU-SLOW-------------------------------------------
runtime : 2421ms
----------------------------------------------------
==29732== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 83.75% 2.42130s 1 2.42130s 2.42130s 2.42130s sum_gpu_SLOW(int[2][10000000]&, int&, int)
15.96% 461.49ms 1 461.49ms 461.49ms 461.49ms sum_gpu_FAST(int[2][10000000]&, int&, int)
Here is the complete example code.
#include <chrono>
#include <cstdlib>
#include <iostream>
constexpr int N = 10000000;
// Sums data[i][0] * 10 + data[i][1] over i in [0, n), using a single thread.
// Fast variant: accumulates into the thread-local variable `s` (typically held
// in a register) and writes the global-memory result `sum` exactly once, at
// the end. Per the nvprof output this kernel took ~461 ms; the inline comment
// below previously carried the SLOW kernel's time by mistake.
__global__
void sum_gpu_FAST(int (&data)[N][2], int& sum, int n) { // runtime : 461.49ms (per nvprof)
int s = 0;
for (int i = 0; i < n; i++)
s += data[i][0] * 10 + data[i][1];
sum = s;
}
// Same reduction as sum_gpu_FAST, but it accumulates directly through `sum`,
// a reference into device (global) memory. Every loop iteration is therefore
// a read-modify-write of global memory — and since `sum` could alias `data`,
// the compiler presumably cannot cache it in a register. Per the nvprof
// output this kernel took ~2.42 s (~5x slower); the inline comment below
// previously carried the FAST kernel's time by mistake.
__global__
void sum_gpu_SLOW(int (&data)[N][2], int& sum, int n) { // runtime : 2.42130s (per nvprof)
sum = 0;
for (int i = 0; i < n; i++) {
sum += data[i][0] * 10 + data[i][1];
}
}
// Abort with a readable message when a CUDA runtime call fails. Kernel
// launches report configuration errors via cudaGetLastError() and execution
// errors at the next synchronizing call, so both are checked below.
static void cudaCheck(cudaError_t err, const char* what)
{
    if (err != cudaSuccess) {
        std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Benchmarks the two single-thread kernels on identical input and prints
// each runtime plus the computed sum (expected: N * (1*10 + 3) = 130000000).
int main()
{
    // Prepare and set host data: N pairs of (1, 3), so each term is 1*10 + 3 = 13.
    int (*v)[2] = new int[N][2];
    for (int i = 0; i < N; i++)
        v[i][0] = 1, v[i][1] = 3;

    std::cout << "-GPU-Ready------------------------------------------" << std::endl;
    int *dev_sum = nullptr;
    int (*dev_v)[N][2] = nullptr;
    cudaCheck(cudaMalloc((void **)&dev_v, sizeof(int[N][2])), "cudaMalloc(dev_v)");
    cudaCheck(cudaMalloc((void **)&dev_sum, sizeof(int)), "cudaMalloc(dev_sum)");
    cudaCheck(cudaMemcpy(dev_v, v, sizeof(int[N][2]), cudaMemcpyHostToDevice),
              "cudaMemcpy(host -> device)");

    std::cout << "-GPU-FAST-------------------------------------------" << std::endl;
    {
        // steady_clock is monotonic — the right clock for interval timing
        // (system_clock can jump if the wall clock is adjusted).
        auto start = std::chrono::steady_clock::now();
        sum_gpu_FAST<<<1, 1>>> (*dev_v, *dev_sum, N);
        cudaCheck(cudaGetLastError(), "sum_gpu_FAST launch");
        cudaCheck(cudaDeviceSynchronize(), "sum_gpu_FAST"); // wait until end of kernel
        auto end = std::chrono::steady_clock::now();
        // duration_cast is required: the raw count()'s unit is
        // implementation-defined, so dividing by 1000000 is not portable.
        std::cout << "runtime : "
                  << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << "ms" << std::endl;
        // Copy the result back so correctness is actually observable.
        int sum = 0;
        cudaCheck(cudaMemcpy(&sum, dev_sum, sizeof(int), cudaMemcpyDeviceToHost),
                  "cudaMemcpy(device -> host)");
        std::cout << "sum : " << sum << std::endl;
    }

    std::cout << "-GPU-SLOW-------------------------------------------" << std::endl;
    {
        auto start = std::chrono::steady_clock::now();
        sum_gpu_SLOW<<<1, 1>>> (*dev_v, *dev_sum, N);
        cudaCheck(cudaGetLastError(), "sum_gpu_SLOW launch");
        cudaCheck(cudaDeviceSynchronize(), "sum_gpu_SLOW"); // wait until end of kernel
        auto end = std::chrono::steady_clock::now();
        std::cout << "runtime : "
                  << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << "ms" << std::endl;
        int sum = 0;
        cudaCheck(cudaMemcpy(&sum, dev_sum, sizeof(int), cudaMemcpyDeviceToHost),
                  "cudaMemcpy(device -> host)");
        std::cout << "sum : " << sum << std::endl;
    }

    std::cout << "----------------------------------------------------" << std::endl;
    // Release device and host memory (the originals were leaked).
    cudaCheck(cudaFree(dev_sum), "cudaFree(dev_sum)");
    cudaCheck(cudaFree(dev_v), "cudaFree(dev_v)");
    delete[] v;
    return 0;
}