First, my initial observation is the opposite of yours (you: [&]-capture is slower, me: [=]-capture is slower)
Second, when I apply what I consider to be basic benchmarking practice, I observe no difference between the two cases:
$ cat t2186.cu
#include <cuda_runtime.h>
#include <iostream>
// Kernel: doubles every element of a row-major width x height int array
// in place. One thread per element on a 2D grid; threads that fall outside
// the array bounds exit immediately.
__global__ void traverseArray(int *d_array, int width, int height) {
    // Built-in index variables are unsigned, matching the original `auto` deduction.
    const unsigned col = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned row = blockDim.y * blockIdx.y + threadIdx.y;
    if (col >= (unsigned)width || row >= (unsigned)height)
        return;
    d_array[row * width + col] *= 2;
}
// Functor that measures device-side elapsed time of work issued by a callable
// (typically a kernel launch) using cudaEvent-based timing on the default stream.
struct CudaKernelExecutionTime {
    // Invokes f(args...) bracketed by start/stop events and returns the
    // elapsed time in milliseconds.
    // NOTE(review): CUDA API return codes are not checked here.
    template<typename F, typename... Args>
    float operator()(F &&f, Args &&... args) {
        cudaEvent_t begin_evt, end_evt;
        cudaEventCreate(&begin_evt);
        cudaEventCreate(&end_evt);
        // Mark the point just before the work is issued (nullptr = default stream).
        cudaEventRecord(begin_evt, nullptr);
        f(std::forward<Args>(args)...);
        // Mark completion, then block until the stop event has actually
        // occurred so the elapsed-time query is valid.
        cudaEventRecord(end_evt, nullptr);
        cudaEventSynchronize(end_evt);
        float elapsed_ms;
        cudaEventElapsedTime(&elapsed_ms, begin_evt, end_evt);
        // Release the event resources before returning.
        cudaEventDestroy(begin_evt);
        cudaEventDestroy(end_evt);
        return elapsed_ms;
    }
};
// Host driver: doubles a small width x height int array on the GPU and times
// the kernel launch with CudaKernelExecutionTime.
// Build flags select the experiment being compared:
//   -DUSE_EQ     : lambda captures launch parameters by value ([=]) instead of by reference ([&])
//   -DUSE_WARMUP : run the kernel once, untimed, before the measured launch
int main() {
int width = 10;
int height = 10;
int size = width * height;
int *h_array = new int[size];
int *d_array;
// NOTE(review): CUDA API return codes are not checked anywhere in this demo.
cudaMalloc((void **) &d_array, size * sizeof(int));
// Initialize array on host: h_array[i] = i
for (int i = 0; i < size; i++) {
h_array[i] = i;
}
// Copy array from host to device
cudaMemcpy(d_array, h_array, size * sizeof(int), cudaMemcpyHostToDevice);
// Launch config: 8x8 threads per block, grid rounded up (ceil-div) to cover the array
dim3 blockSize(8, 8);
dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y);
// Wrap the launch in a lambda; the capture mode is the variable under test.
#ifdef USE_EQ
auto launch_kernel = [=]() {
#else
auto launch_kernel = [&]() {
#endif
traverseArray<<<gridSize, blockSize>>>(d_array, width, height);
};
CudaKernelExecutionTime timer;
#ifdef USE_WARMUP
// Untimed warm-up launch; the sync ensures it has finished before timing starts.
launch_kernel();
cudaDeviceSynchronize();
#endif
auto time = timer(launch_kernel);
// Print measured time in milliseconds
std::cout << "Kernel execution time: " << time << "ms"
<< std::endl;
// Copy array back from device to host (blocking, so it also waits for the kernel)
cudaMemcpy(h_array, d_array, size * sizeof(int), cudaMemcpyDeviceToHost);
// Print the first two elements. Each run doubles the array in place, so with
// USE_WARMUP the kernel has run twice and element 1 prints 4 instead of 2.
for (int i = 0; i < 2; i++) {
printf("Array element %d: %d\n", i, h_array[i]);
}
// Free device memory
cudaFree(d_array);
delete[] h_array;
return 0;
}
$ nvcc -o t2186 t2186.cu
$ ./t2186
Kernel execution time: 0.034304ms
Array element 0: 0
Array element 1: 2
$ nvcc -o t2186 t2186.cu -DUSE_EQ
$ ./t2186
Kernel execution time: 0.038016ms
Array element 0: 0
Array element 1: 2
$ nvcc -o t2186 t2186.cu -DUSE_WARMUP
$ ./t2186
Kernel execution time: 0.011808ms
Array element 0: 0
Array element 1: 4
$ nvcc -o t2186 t2186.cu -DUSE_WARMUP -DUSE_EQ
$ ./t2186
Kernel execution time: 0.011776ms
Array element 0: 0
Array element 1: 4
$
CUDA 11.4, Tesla V100
I also note that when I run the profiler, the kernel execution time on my system is ~3.6us, regardless of which capture/launch variant is used, for your original test case. You're timing an incredibly short kernel. Whatever overhead or overhead variance there may be here will likely become insignificant for a longer-duration kernel.
Thanks sincerely for your detailed explanation and helpful code.
As a CUDA beginner, I want to know how to use "the profiler" to measure a kernel function's speed, as you mentioned "when I run the profiler, the kernel execution time on my system is ~3.6us".
Since my only knowledge about profiling is using Event APIs, and I want to learn a more professional way.
The profiler you use should be chosen based on the GPU you are running on. Eventually, as older GPUs become obsolete, all future profiling is expected to be done using Nsight Systems, and this blog can help you get started. If you are running on a Pascal or older GPU, you should instead use nvprof - a legacy profiler at this point. The blog gives an example of nvprof usage also.
While not offering all the benefits of the latest Nsight Compute, the earlier version 2019.5.0, available here, is the last Pascal 6.X supporting Linux version.
I consider the profiler to be the best tool for measuring the running time of a kernel. And there are known situations where cudaEvent based timing can be tricky to use for the purpose of timing an individual kernel.
You can find many forum postings where people discuss the tradeoffs of various different CUDA timing methodologies. Reasonably good timing can be done if properly applied, using either a host based timing method, cudaEvent based timing method, or the profilers. For most of my own use cases, I am only interested in comparisons, so as long as you properly apply and stick with one methodology, you can usually do good comparison work.
For my own uses, discussions of whether or not a CUDA kernel actually took 10 microseconds as reported by host based timing, or 6 microseconds as reported by cudaEvent based timing, or 5 microseconds as reported by the profiler, don't lead me to useful insight.
I need to elaborate more on my original question, in which I want to compare whether 2D array indexing would seriously impact performance. But I failed to allocate a 2D array using cudaMalloc, as described in CUDA_2d_cudaMalloc.
That is incorrect. CUDA is based on C++ and therefore makes exactly the same array addressing available that is found in C++11.
If by d_array[i][j] you mean the kind of data structure that is not actually a true 2D array but instead an array of pointers to 1D arrays, you can create that in CUDA as well, but I would not recommend it for performance reasons. Here is a worked example.
#include <cstdio>
#include <cstdlib>
#include <complex>
#include "cuComplex.h"
#define N (2)
#define M (3)
typedef std::complex<float> T;
// Kernel: prints the N x M device "matrix" (an array of row pointers) one
// element at a time. Intended for a single-thread launch (<<<1,1>>>);
// every thread that executes it would print the full matrix.
__global__ void print_device_matrix (cuComplex** mat)
{
    printf ("matrix on device:\n");
    for (int row = 0; row < N; row++) {
        const cuComplex *r = mat[row];
        for (int col = 0; col < M; col++) {
            cuComplex v = r[col];
            printf ("(%f, %f) ", cuCrealf (v), cuCimagf (v));
        }
        printf ("\n");
    }
}
// Demonstrates building a device-side "2D array" as an array of N row
// pointers, each pointing to a device row of M elements. This is NOT a
// contiguous 2D array; each row is a separate cudaMalloc allocation.
// NOTE(review): CUDA API return codes are not checked in this example.
int main (void)
{
/* allocate host "matrix": N host row pointers, each to an M-element row */
T **mat = (T**)malloc (N * sizeof (mat[0]));
for (int i = 0; i < N; i++) {
mat[i] = (T *)malloc (M * sizeof (mat[0][0]));
}
/* fill in host "matrix": element (i,j) = (i+1) + (j+1)i */
for (int i = 0; i < N; i++) {
for (int j = 0; j < M; j++) {
mat[i][j] = T (float(i)+1, float(j)+1);
}
}
/* print host "matrix" */
printf ("matrix on host:\n");
for (int i = 0; i < N; i++) {
for (int j = 0; j < M; j++) {
printf ("(%f, %f) ", real(mat[i][j]), imag(mat[i][j]));
}
printf ("\n");
}
/* allocate device "matrix": tmp holds the device row pointers on the host
   while they are being created, then they are copied into matD in one shot */
T **tmp = (T**)malloc (N * sizeof (tmp[0]));
for (int i = 0; i < N; i++) {
cudaMalloc ((void **)&tmp[i], M * sizeof (tmp[0][0]));
}
cuComplex **matD = 0;
cudaMalloc ((void **)&matD, N * sizeof (matD[0]));
/* copy "matrix" from host to device: first the pointer array, then each row */
cudaMemcpy (matD, tmp, N * sizeof (matD[0]), cudaMemcpyHostToDevice);
for (int i = 0; i < N; i++) {
cudaMemcpy (tmp[i], mat[i], M * sizeof (matD[0][0]), cudaMemcpyHostToDevice);
}
free (tmp);
/* print device "matrix" (single-thread kernel; its printf output is flushed
   by the blocking cudaMemcpy below, which synchronizes with the kernel) */
print_device_matrix<<<1,1>>> (matD);
/* free host "matrix" */
for (int i = 0; i < N; i++) {
free (mat[i]);
}
free (mat);
/* free device "matrix": the row pointers only exist on the device now, so
   copy them back into a temporary host array before cudaFree'ing each row */
tmp = (T**)malloc (N * sizeof (tmp[0]));
cudaMemcpy (tmp, matD, N * sizeof (matD[0]), cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++) {
cudaFree (tmp[i]);
}
free (tmp);
cudaFree (matD);
return EXIT_SUCCESS;
}