question about the performance of CUDA

Are there any tips for improving CUDA performance in this case, such as how variables are declared (global vs. local), how parameters are passed, or how memory is copied? I also wonder what makes the runtime differ between sum_gpu_SLOW and sum_gpu_FAST.

<Results / Profiling>

$ nvprof ./a.out 
-GPU-Ready------------------------------------------
-GPU-FAST-------------------------------------------
runtime : 461ms
-GPU-SLOW-------------------------------------------
runtime : 2421ms
----------------------------------------------------


==29732== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   83.75%  2.42130s         1  2.42130s  2.42130s  2.42130s  sum_gpu_SLOW(int[2][10000000]&, int&, int)
                   15.96%  461.49ms         1  461.49ms  461.49ms  461.49ms  sum_gpu_FAST(int[2][10000000]&, int&, int)

Here you can see the whole example code.

#include <iostream>
#include <chrono>

constexpr int N = 10000000;

__global__
// Single-thread demo kernel (launched <<<1, 1>>>): sums data[i][0]*10 + data[i][1]
// over i in [0, n). Accumulates into the local variable `s` (which the compiler
// can keep in a register) and writes the global-memory result `sum` exactly once
// at the end — this is what makes it fast relative to sum_gpu_SLOW.
// NOTE: the per-kernel runtime comments in the original post were swapped;
// per the nvprof output above, this kernel took 461.49ms.
void sum_gpu_FAST(int (&data)[N][2], int& sum, int n) {  // runtime : 461.49ms
    int s = 0;
    for (int i = 0; i < n; i++) 
        s += data[i][0] * 10 + data[i][1];
    sum = s;
}

__global__
// Same computation as sum_gpu_FAST, but it accumulates directly through the
// reference `sum`, which is bound to device global memory (dev_sum). Every loop
// iteration therefore performs a global-memory read-modify-write of `sum`;
// the compiler cannot keep the running total in a register, presumably because
// `sum` could alias `data` (no __restrict__) and the write must stay visible.
// That per-iteration global traffic is what makes this ~5x slower.
// NOTE: the per-kernel runtime comments in the original post were swapped;
// per the nvprof output above, this kernel took 2.42130s.
void sum_gpu_SLOW(int (&data)[N][2], int& sum, int n) {  // runtime : 2.42130s
    sum = 0;
    for (int i = 0; i < n; i++) {
        sum += data[i][0] * 10 + data[i][1];
    }
}

int main()
{
    // Prepare host data: N pairs {1, 3}, so every term is 1*10 + 3 = 13.
    int (*v)[2] = new int[N][2];
    for (int i = 0; i < N; i++)
        v[i][0] = 1, v[i][1] = 3;

    std::cout << "-GPU-Ready------------------------------------------" << std::endl;

    int *dev_sum       = nullptr;
    int (*dev_v)[N][2] = nullptr;

    // Every CUDA call returns a cudaError_t; an ignored sticky error makes
    // later calls fail mysteriously, so report each failure as it happens.
    auto check = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess)
            std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
    };

    check(cudaMalloc((void **)&dev_v,   sizeof(int[N][2])),                        "cudaMalloc(dev_v)");
    check(cudaMalloc((void **)&dev_sum, sizeof(int)),                              "cudaMalloc(dev_sum)");
    check(cudaMemcpy(dev_v, v, sizeof(int[N][2]), cudaMemcpyHostToDevice),         "cudaMemcpy(H2D)");

    std::cout << "-GPU-FAST-------------------------------------------" << std::endl;
    {
        // steady_clock is monotonic and the right choice for interval timing;
        // system_clock's tick period is implementation-defined, so dividing
        // raw count() by 1000000 is not portable — use duration_cast instead.
        auto start = std::chrono::steady_clock::now();
        sum_gpu_FAST<<<1, 1>>> (*dev_v, *dev_sum, N);
        check(cudaGetLastError(),        "sum_gpu_FAST launch");  // launch-config errors
        check(cudaDeviceSynchronize(),   "sum_gpu_FAST sync");    // wait until end of kernel
        auto end   = std::chrono::steady_clock::now();

        // print output
        std::cout << "runtime : "
                  << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << "ms" << std::endl;
    }

    std::cout << "-GPU-SLOW-------------------------------------------" << std::endl;
    {
        auto start = std::chrono::steady_clock::now();
        sum_gpu_SLOW<<<1, 1>>> (*dev_v, *dev_sum, N);
        check(cudaGetLastError(),        "sum_gpu_SLOW launch");
        check(cudaDeviceSynchronize(),   "sum_gpu_SLOW sync");    // wait until end of kernel
        auto end   = std::chrono::steady_clock::now();

        // print output
        std::cout << "runtime : "
                  << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
                  << "ms" << std::endl;
    }

    std::cout << "----------------------------------------------------" << std::endl;

    // Release device allocations (the original leaked both) and host data.
    check(cudaFree(dev_v),   "cudaFree(dev_v)");
    check(cudaFree(dev_sum), "cudaFree(dev_sum)");
    delete[] v;
    return 0;
}

https://stackoverflow.com/questions/59385108/the-performance-of-cuda-depending-on-declaring-variable