Hello everyone, I’m new to CUDA and I’m puzzled by the execution times displayed by the NVIDIA Visual Profiler:
they lead me to suspect that the kernels are running in parallel even though the Visual Profiler does not show it.
The code is the following. I am developing on Linux, Ubuntu 20.04.
#include <stdio.h>
#include <cuda.h>
#include "helper_cuda.h"
__global__ void do_work(double *data, int N, int idx) {
int i = blockIdx.x * blockDim.x + blockDim.x * idx + threadIdx.x;
if (i < N) {
for (int j = 0; j < 20000; j++) {
data[i] = cos(data[i]);
data[i] = sqrt(fabs(data[i]));
}
}
}
int main()
{
int nblocks = 30;
int blocksize = 1024;
double *data;
checkCudaErrors(cudaMalloc( (void**)&data, nblocks*blocksize*sizeof(double) ));
float time;
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
checkCudaErrors(cudaEventRecord(start, 0));
dim3 dimBlock( blocksize, 1, 1 );
dim3 dimGrid( 1, 1, 1 );
for (int i = 0; i < nblocks; i++)
do_work<<<dimGrid,dimBlock>>>(data, nblocks*blocksize, i);
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
printf("Serialised time: %g ms\n", time);
cudaStream_t streams[nblocks];
for (int i = 0; i < nblocks; i++)
checkCudaErrors(cudaStreamCreate(&streams[i]));
checkCudaErrors(cudaEventRecord(start, 0));
checkCudaErrors(cudaEventSynchronize(start));
for (int i = 0; i < nblocks; i++)
do_work<<<dimGrid,dimBlock,0,streams[i]>>>(data, nblocks*blocksize, i);
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
printf("Multi-stream parallel time: %g ms\n", time);
for (int i = 0; i < nblocks; i++)
checkCudaErrors(cudaStreamDestroy(streams[i]));
checkCudaErrors(cudaFree( data ));
return EXIT_SUCCESS;
}
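For completeness, the program is built with nvcc; something along these lines should reproduce it (assuming the source file is named kernel_overlap.cu):

nvcc -O2 -o kernel_overlap kernel_overlap.cu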
The output follows:
(base) manuel@manuel-DT:~/eclipse-workspace/kernel_ocerlap$ ./kernel_overlap
Serialised time: 3856.09 ms
Multi-stream parallel time: 259.779 ms
As can be seen, in the first part 30 kernels are launched in the same stream and the time is measured:
Serialised time: 3856 ms
However, when 30 streams are created and one kernel is executed in each, the total "parallel" time is 259.77 ms.
Since this is much shorter than the serialised run, the kernels must be executing simultaneously.
But what nvprof shows is that the kernels, even though they run on different streams, are not executing in parallel.
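For reference, here is a minimal sketch of how one could check whether the device supports concurrent kernel execution at all (assuming device 0; concurrentKernels and asyncEngineCount are standard cudaDeviceProp fields):

#include <stdio.h>
#include <cuda_runtime.h>

int main()
{
    // Query device 0 and report whether it can execute kernels concurrently
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("Device: %s\n", prop.name);
    printf("Concurrent kernels supported: %s\n", prop.concurrentKernels ? "yes" : "no");
    printf("Async copy engines: %d\n", prop.asyncEngineCount);
    return 0;
}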
The command used for the analysis:
nvprof --analysis-metrics -o nbody-analysis4.nvprof ./kernel_overlap
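For comparison, my understanding is that a plain timeline trace can also be collected without the metric analysis, e.g. (the output file name is just an example):

nvprof -o kernel-timeline.nvprof ./kernel_overlap
nvprof --print-gpu-trace ./kernel_overlap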
Please, can anyone give me any indication of how to make nvprof show the real execution times?