Why does a CUDA kernel take longer than the same function running only on the CPU?

Hi, I am new to CUDA programming.
I have created a simple CUDA application:

// Number of array elements to allocate and process (a single element in this experiment).
#define N 1
  
  // Each thread increments the single element of `a` selected by its global
  // thread index. There is no bounds check, so `a` must hold at least one int
  // per launched thread.
  __global__ void kernel(int *a)
  {
    const int element = blockIdx.x * blockDim.x + threadIdx.x;
    a[element] += 1;
  }
  
  // Times a single-thread launch of `kernel` with CUDA events and prints the
  // elapsed time in microseconds. Setup and teardown are elided (". . . . .").
  int main()
  {
    // Allocate N elements (currently only 1) in unified memory using
    // cudaMallocManaged(...); presumably the events are created here as well.
     . . . . .
     . . . . .
     // Bracket the launch with two events recorded on stream 0.
     cudaEventRecord(start,0);
     kernel<<<1,1>>>(dev_a);
     cudaEventRecord(stop, 0);
     // The launch is asynchronous: wait for the stop event before reading the time.
     cudaEventSynchronize(stop);
     // cudaEventElapsedTime reports milliseconds; the stray ')' that followed
     // this call in the original made the statement a syntax error.
     cudaEventElapsedTime(&elapsedTime, start, stop);
     // Scale milliseconds by 1000 to print microseconds.
     printf("Time taken by kernel %3.5f MicroSec",elapsedTime * 1000);
     . . . . . 
     . . . . . 
  }

After running it, the kernel took 30.7 microseconds for only 1 element.

Why does it take so much time? If I run the same sample in plain C, without CUDA,
it shows 0 microseconds.

Please let me know…

Hi,

  1. How did you measure this “0”? Because a simple example that does nothing but start and stop gives me already a runtime of 1-3 microsecs.

  2. CUDA is good for parallel processing, not for a single instruction like in your example. Try the example below; with it, the execution time for CUDA should be much better than for the CPU part.

#include <iostream>

using namespace std;

// Threads per block in the kernel<<<BLOCKS,N>>> launch configuration.
#define N 512
// Number of blocks in that launch configuration; the arrays hold BLOCKS*N ints in total.
#define BLOCKS 512

// Each thread increments the single element of `a` selected by its global
// thread index. There is no bounds check, so `a` must hold at least one int
// per launched thread.
__global__ void kernel( int *a )
{
	const int element = blockIdx.x * blockDim.x + threadIdx.x;
	a[element] += 1;
}

// CPU counterpart of the kernel: increments each of the first BLOCKS*N ints
// of `a` in a sequential loop.
void onHost( int *a )
{
	int *const end = a + BLOCKS*N;
	for( int *cursor = a; cursor != end; ++cursor )
		*cursor += 1;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk( cudaError_t status, const char* what )
{
	if( status == cudaSuccess )
		return true;
	cerr << what << " failed: " << cudaGetErrorString( status ) << endl;
	return false;
}

// Times one pass over BLOCKS*N ints with CUDA events and prints the elapsed
// time in microseconds. As written it measures the CPU loop (onHost); swap the
// commented-out kernel launch in for the onHost call to measure the GPU version.
// Returns 0 on success and 1 if a CUDA call fails (device resources are then
// left for the runtime to reclaim at process exit).
int main( int argc, const char* argv[] )
{
	cudaEvent_t start, stop;
	float elapsedMs = 0.0f;

	int* dA = 0;
	// Static storage: zero-initialised, so ++a[i] no longer reads indeterminate
	// values, and the 1 MiB buffer stays off the stack.
	static int a[N*BLOCKS];

	if( !cudaOk( cudaMalloc( &dA, BLOCKS*N*sizeof(int) ), "cudaMalloc" ) )
		return 1;
	// Give the device buffer defined contents in case the kernel line is enabled.
	if( !cudaOk( cudaMemset( dA, 0, BLOCKS*N*sizeof(int) ), "cudaMemset" ) )
		return 1;

	if( !cudaOk( cudaEventCreate( &start ), "cudaEventCreate(start)" ) )
		return 1;
	if( !cudaOk( cudaEventCreate( &stop ), "cudaEventCreate(stop)" ) )
		return 1;

	if( !cudaOk( cudaEventRecord( start, 0 ), "cudaEventRecord(start)" ) )
		return 1;
	// When enabling the kernel line, also check cudaGetLastError() after the launch.
//	kernel<<<BLOCKS,N>>> ( dA );
	onHost( a );

	if( !cudaOk( cudaEventRecord( stop, 0 ), "cudaEventRecord(stop)" ) )
		return 1;
	// Wait until the stop event has actually been reached before reading the time.
	if( !cudaOk( cudaEventSynchronize( stop ), "cudaEventSynchronize" ) )
		return 1;

	// cudaEventElapsedTime reports milliseconds.
	if( !cudaOk( cudaEventElapsedTime( &elapsedMs, start, stop ), "cudaEventElapsedTime" ) )
		return 1;

	// Cleanup failures are reported but do not change the result.
	cudaOk( cudaEventDestroy( start ), "cudaEventDestroy(start)" );
	cudaOk( cudaEventDestroy( stop ), "cudaEventDestroy(stop)" );
	cudaOk( cudaFree( dA ), "cudaFree" );

	// Milliseconds -> microseconds.
	cout << "Time: " << elapsedMs*1000 << endl;
	return 0;
}

In this example, the runtime is probably dominated by memory performance, i.e. by the quality of the cache.

  3. Managed memory synchronizes host and device memory. This might also affect performance.