Nvidia Visual Profiler Not accurate in timing

Hello everyone, I’m new to CUDA and I’ve run into a problem with the execution times displayed by the Nvidia Visual Profiler: I suspect that the kernels are running in parallel even though the Visual Profiler is not showing it.

The code is the following. I am developing on Linux (Ubuntu 20.04).

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#include "helper_cuda.h"

/**
 * Busy-work kernel used to give each launch a measurable run time.
 *
 * Each thread repeatedly overwrites one element of `data` with
 * sqrt(fabs(cos(x))), 20000 times in a row.
 *
 * The element handled by a thread is
 *     blockIdx.x * blockDim.x + blockDim.x * idx + threadIdx.x
 * so `idx` shifts the whole launch by `idx` blocks' worth of elements.
 * This lets the host issue many one-block launches that each touch a
 * different slice of the array.
 *
 * @param data  device pointer to the array to update in place
 * @param N     number of elements in `data`; threads whose index is >= N do nothing
 * @param idx   slice offset, in units of blockDim.x elements
 */
__global__ void do_work(double *data, int N, int idx) {
    int i = blockIdx.x * blockDim.x + blockDim.x * idx + threadIdx.x;
    if (i < N) {
        // The read-modify-write goes through data[i] on every iteration on
        // purpose: the loop exists only to burn time for the benchmark.
        for (int j = 0; j < 20000; j++) {
            data[i] = cos(data[i]);
            data[i] = sqrt(fabs(data[i]));
        }
    }
}

/**
 * Times the same 30 one-block launches of do_work twice with CUDA events:
 * first issued back-to-back into the default stream, then issued one per
 * stream across 30 separate streams. Prints both elapsed times.
 *
 * Returns EXIT_SUCCESS; any CUDA failure is reported by checkCudaErrors.
 */
int main()
{
    // const so that the streams[] array below has a constant size
    // (a runtime-sized array is a compiler extension, not standard C++).
    const int nblocks = 30;
    const int blocksize = 1024;
    const int n = nblocks * blocksize;
    const size_t bytes = (size_t)n * sizeof(double);

    double *data = NULL;
    checkCudaErrors(cudaMalloc((void **)&data, bytes));
    // cudaMalloc does not initialise memory and the kernel reads data[i]
    // before writing it, so give the buffer a defined starting value.
    checkCudaErrors(cudaMemset(data, 0, bytes));

    float time;
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    // One block of `blocksize` threads per launch; the third kernel argument
    // selects which slice of the array that launch works on.
    dim3 dimBlock(blocksize, 1, 1);
    dim3 dimGrid(1, 1, 1);

    // --- Pass 1: every launch goes into the default stream. ---
    checkCudaErrors(cudaEventRecord(start, 0));
    for (int i = 0; i < nblocks; i++) {
        do_work<<<dimGrid, dimBlock>>>(data, n, i);
        // A kernel launch returns no status; launch errors must be polled.
        checkCudaErrors(cudaGetLastError());
    }
    checkCudaErrors(cudaEventRecord(stop, 0));
    checkCudaErrors(cudaEventSynchronize(stop));
    checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
    printf("Serialised time:  %g ms\n", time);

    // --- Pass 2: one launch per stream. ---
    cudaStream_t streams[nblocks];
    for (int i = 0; i < nblocks; i++)
        checkCudaErrors(cudaStreamCreate(&streams[i]));

    checkCudaErrors(cudaEventRecord(start, 0));
    checkCudaErrors(cudaEventSynchronize(start));
    for (int i = 0; i < nblocks; i++) {
        do_work<<<dimGrid, dimBlock, 0, streams[i]>>>(data, n, i);
        checkCudaErrors(cudaGetLastError());
    }
    // NOTE(review): `stop` is recorded in stream 0, not in the worker streams.
    // The measurement covers all launches only if stream 0 is the legacy
    // default stream, which waits for blocking streams such as those made by
    // cudaStreamCreate. Confirm the build does not use
    // --default-stream per-thread.
    checkCudaErrors(cudaEventRecord(stop, 0));
    checkCudaErrors(cudaEventSynchronize(stop));
    checkCudaErrors(cudaEventElapsedTime(&time, start, stop));
    printf("Multi-stream parallel time:  %g ms\n", time);

    // Release everything that was created above.
    for (int i = 0; i < nblocks; i++)
        checkCudaErrors(cudaStreamDestroy(streams[i]));
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));

    checkCudaErrors(cudaFree(data));
    return EXIT_SUCCESS;
}

The output follows:

(base) manuel@manuel-DT:~/eclipse-workspace/kernel_ocerlap$ ./kernel_overlap
Serialised time: 3856.09 ms
Multi-stream parallel time: 259.779 ms

As can be seen, in the first part 30 kernels are launched in the same stream, and the measured serialized time is 3856 ms.
However, when 30 streams are created and one kernel is executed in each of them, the total “parallel” time is 259.77 ms.

Therefore, since the multi-stream run is much shorter than the serialized one, it must have kernels executing simultaneously.

But what nvprof shows is that the kernels, even though they are running on different streams, are not running in parallel.

Command used for the analysis:
nvprof --analysis-metrics -o nbody-analysis4.nvprof ./kernel_overlap

Could anyone please give me an indication of how to make nvprof show the real execution time?