I found something odd when I tried to measure the maximum possible performance of my CUDA program.
This code:
// Synthetic busy-work kernel: every thread runs the same fixed-length loop,
// adding the parity (0 or 1) of the loop counter to a local sum.
// The sum is never written anywhere, so the kernel has no observable output.
__global__ void MyKernel()
{
    const int kIterations = 1024 * 10;  // loop trip count per thread
    int paritySum = 0;
    int step = 0;
    while (step < kIterations) {
        paritySum += step % 2;
        ++step;
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Times a single launch of MyKernel with CUDA events and prints the elapsed
// time in milliseconds. Returns 0 on success and 1 if any CUDA runtime call
// or the kernel launch itself reports an error.
int main()
{
    const int blockSize = 1024;  // threads per block
    const int gridSize = 1024;   // blocks in the grid
    float elapsedMs = 0.0f;
    cudaEvent_t start, stop;

    if (!cudaOk(cudaEventCreate(&start), "cudaEventCreate(start)")) {
        return 1;
    }
    if (!cudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)")) {
        cudaEventDestroy(start);
        return 1;
    }

    // Untimed warm-up launch, so that one-time start-up work that may be
    // performed on the first launch is not attributed to the timed run.
    MyKernel<<<gridSize, blockSize>>>();
    bool ok = cudaOk(cudaGetLastError(), "warm-up launch")
           && cudaOk(cudaDeviceSynchronize(), "warm-up execution");

    ok = ok && cudaOk(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    if (ok) {
        MyKernel<<<gridSize, blockSize>>>();
        // A launch does not return a status; a rejected launch configuration
        // is only visible through cudaGetLastError(). Without this check a
        // kernel that never ran would still produce a (meaningless) time.
        ok = cudaOk(cudaGetLastError(), "MyKernel launch");
    }
    ok = ok && cudaOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)")
            // Block the host until the stop event (and the kernel before it) completes.
            && cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)")
            && cudaOk(cudaEventElapsedTime(&elapsedMs, start, stop), "cudaEventElapsedTime");

    if (ok) {
        printf("Kernel elapsed time: %3.3f ms \n", elapsedMs);
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ok ? 0 : 1;
}
gives me “Kernel elapsed time: 3780.342 ms”.
But when I add any parameter to the kernel function, the execution time is halved.
This kernel gives me “Kernel elapsed time: 1866.104 ms”:
// Variant of the busy-work kernel that accepts one argument.
// `par` is not read anywhere in the body; each thread only runs a
// fixed-length loop that adds the parity (0 or 1) of the loop counter to a
// local sum, and that sum is never written anywhere.
__global__ void MyKernel(int par)
{
    const int kIterations = 1024 * 10;  // loop trip count per thread
    int paritySum = 0;
    int step = 0;
    while (step < kIterations) {
        paritySum += step % 2;
        ++step;
    }
}
Please explain why this happens, and suggest a better way to measure performance.