Different timing results for the same kernel function: problem comparing two kernel functions

Hello,

I have a program that compares two kernel functions. It runs each kernel and prints how long it took, using CUDA (cutil) timers. Both Kernel A and Kernel B are run several times in the same order (A-B-A-B…). The average results of these runs for a specific problem size are:

Kernel A: 3.3 ms

Kernel B: 331.5 ms

If I comment out Kernel B and test only Kernel A, the result is the following:

Kernel A: 4.9 ms

If I change the problem size, the same thing keeps happening, just with different numbers.

Why is there a difference in the timing results? What am I doing wrong? If I run the comparison from separate files, I keep getting the second result for Kernel A.

This is the code for Kernel A, which is a version of reduce6 (the SDK reduction kernel) modified to reduce several vectors at the same time.

[codebox]template <unsigned int blockSize> __global__ void reduction6(int *entrada, int totalObjects, int arraySize) {

	unsigned int classindex = blockIdx.y;   // which vector (class) this block reduces
	unsigned int insIndex = blockIdx.x * blockSize * 2 + threadIdx.x;
	unsigned int realindex = classindex * arraySize + blockIdx.x * blockSize * 2 + threadIdx.x;
	unsigned int tid = threadIdx.x;
	unsigned int gridSize = blockSize * 2 * gridDim.x;

	extern __shared__ int sdata[];

	sdata[tid] = 0;

	// grid-stride loop: each thread accumulates two elements per iteration when possible
	while (insIndex < totalObjects) {
		if (insIndex + blockSize < totalObjects) {
			sdata[tid] += entrada[realindex] + entrada[realindex + blockSize];
		} else {
			sdata[tid] += entrada[realindex];
		}
		insIndex += gridSize;
		realindex += gridSize;
	}
	__syncthreads();

	// do reduction in shared mem (blockSize is a compile-time constant, so dead branches are removed)
	if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
	if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
	if (blockSize >= 128) { if (tid <  64) { sdata[tid] += sdata[tid +  64]; } __syncthreads(); }

#ifndef __DEVICE_EMULATION__
	if (tid < 32)
#endif
	{
		// last warp runs in lockstep on the device, so no __syncthreads() is needed here
		if (blockSize >= 64) { sdata[tid] += sdata[tid + 32]; EMUSYNC; }
		if (blockSize >= 32) { sdata[tid] += sdata[tid + 16]; EMUSYNC; }
		if (blockSize >= 16) { sdata[tid] += sdata[tid +  8]; EMUSYNC; }
		if (blockSize >=  8) { sdata[tid] += sdata[tid +  4]; EMUSYNC; }
		if (blockSize >=  4) { sdata[tid] += sdata[tid +  2]; EMUSYNC; }
		if (blockSize >=  2) { sdata[tid] += sdata[tid +  1]; EMUSYNC; }
	}

	// write this block's partial sum back into the input array
	if (tid == 0) {
		entrada[classindex * arraySize + blockIdx.x] = sdata[0];
	}
}[/codebox]
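
For clarity, this is the plain CPU computation I expect the whole multi-pass reduction to reproduce for each class. The helper below is hypothetical, written just for this post, and is not part of my program:

[codebox]// Hypothetical CPU reference, only to document what reduction6 should compute:
// after the last pass, entrada[c * arraySize] on the device should equal out[c].
void hostReduce(const int *entrada, int arraySize, int numClass, int *out) {
	for (int c = 0; c < numClass; c++) {
		int sum = 0;
		for (int i = 0; i < arraySize; i++) {
			sum += entrada[c * arraySize + i];
		}
		out[c] = sum;
	}
}[/codebox]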

This is the code for Kernel B

[codebox]__global__ void reductionFor(int *entrada, int totalObjects) {

	unsigned int classindex = threadIdx.x;   // one thread per class
	unsigned int res = 0;

	// serial sum over this class's row
	for (int i = 0; i < totalObjects; i++) {
		res += entrada[classindex * totalObjects + i];
	}
	entrada[classindex * totalObjects] = res;
}[/codebox]
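
As I understand it, Kernel B is just the naive baseline: one block of numClass threads, where each thread serially sums its own row and writes the result over the first element of that row. A tiny illustrative call (made-up values, not from my program) would be:

[codebox]// Illustrative only: 2 classes with 4 ints each, laid out row by row.
// Thread 0 sums d[0..3] into d[0]; thread 1 sums d[4..7] into d[4].
int h[8] = {1, 1, 1, 1, 2, 2, 2, 2};
int *d;
cudaMalloc((void**) &d, sizeof(h));
cudaMemcpy(d, h, sizeof(h), cudaMemcpyHostToDevice);
reductionFor<<<1, 2>>>(d, 4);
cudaThreadSynchronize();   // afterwards d[0] == 4 and d[4] == 8
cudaFree(d);[/codebox]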

This is the body of the program

[codebox]
#include <time.h>
#include <cutil_inline.h>

#define THREADS_PER_BLOCK 512

#ifdef __DEVICE_EMULATION__
#define EMUSYNC __syncthreads()
#else
#define EMUSYNC
#endif

template <unsigned int blockSize> __global__ void reduction6(int *entrada, int totalObjects, int arraySize);

__global__ void reductionFor(int *entrada, int totalObjects);

void testCudaFor(int *h_idata, int mem_size, int arraySize, int numClass) {
	int *d_idata2;
	cutilSafeCall(cudaMalloc((void**) &d_idata2, mem_size));

	// copy host memory to device
	cutilSafeCall(cudaMemcpy(d_idata2, h_idata, mem_size, cudaMemcpyHostToDevice));

	unsigned int kerneltimer2;
	cutCreateTimer(&kerneltimer2);
	cutStartTimer(kerneltimer2);

	dim3 threads(numClass, 1, 1);
	dim3 grid(1, 1, 1);
	reductionFor<<<grid, threads>>>(d_idata2, arraySize);
	cudaThreadSynchronize();
	cutilCheckMsg("Kernel execution failed");

	cutStopTimer(kerneltimer2);
	fprintf(stdout, "Kernel For: %f\n", cutGetTimerValue(kerneltimer2));
	cutilCheckError(cutDeleteTimer(kerneltimer2));

	cutilSafeCall(cudaFree(d_idata2));
}

void testCudaTree(int *h_idata, int mem_size, int arraySize, int numClass, int n) {
	unsigned int shared_mem_size = sizeof(int) * THREADS_PER_BLOCK;

	// allocate device memory
	int *d_idata;
	cutilSafeCall(cudaMalloc((void**) &d_idata, mem_size));

	// copy host memory to device
	cutilSafeCall(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

	int blocks = ceil((double) arraySize / ((double) THREADS_PER_BLOCK * 2));
	printf("block %d threads %d shared %d\n", blocks, THREADS_PER_BLOCK, shared_mem_size);

	unsigned int numThreads = arraySize;
	int numBlocks = ceil(blocks / (double) n);
	int *h_odata = (int *) malloc(mem_size);   // currently unused

	unsigned int kerneltimer;
	cutCreateTimer(&kerneltimer);
	cutStartTimer(kerneltimer);

	// each pass reduces numThreads elements per class down to numBlocks partial sums
	while (numBlocks > 0) {
		// setup execution parameters
		dim3 grid(numBlocks, numClass, 1);
		dim3 threads(THREADS_PER_BLOCK, 1, 1);
		reduction6<THREADS_PER_BLOCK><<< grid, threads, shared_mem_size >>>(d_idata, numThreads, arraySize);
		cudaThreadSynchronize();
		numThreads = numBlocks;
		numBlocks = (numBlocks == 1 ? 0 : ceil((double) numThreads / (double) (THREADS_PER_BLOCK * n)));
	}
	cutilCheckMsg("Kernel execution failed");

	cutStopTimer(kerneltimer);
	float time = cutGetTimerValue(kerneltimer);
	fprintf(stdout, "Kernel: %f\n", time);
	cutilCheckError(cutDeleteTimer(kerneltimer));

	cutilSafeCall(cudaFree(d_idata));
	free(h_odata);
}

void runTest(int argc, char** argv) {
	cudaSetDevice(cutGetMaxGflopsDeviceId());

	unsigned int arraySize = atoi(argv[1]);
	unsigned int numClass = atoi(argv[2]);
	unsigned int n = atoi(argv[3]);
	unsigned int mem_size = sizeof(int) * arraySize * numClass;

	// allocate host memory
	int *h_idata = (int *) malloc(mem_size);

	// initialize the memory: every element is 1, except the very last one
	for (unsigned int i = 0; i < numClass; i++) {
		for (unsigned int j = 0; j < arraySize; j++) {
			h_idata[i * arraySize + j] = 1;
		}
	}
	h_idata[numClass * arraySize - 1] += 3;

	testCudaTree(h_idata, mem_size, arraySize, numClass, n);
	//testCudaFor(h_idata, mem_size, arraySize, numClass);

	// cleanup memory
	free(h_idata);
	cudaThreadExit();
}

int main(int argc, char** argv) {
	printf("************** Reduction 1 elem ************\n");
	for (int i = 0; i < atoi(argv[4]); i++) {
		runTest(argc, argv);
	}
}
[/codebox]
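
In case the host-side cutil timer itself is part of the problem, I was also considering timing on the GPU with events and an untimed warm-up launch, roughly like this. This is a minimal sketch, not what my program currently does; it assumes the same d_idata buffer and launch configuration as above, and since reduction6 writes in place, the input would have to be copied to the device again before the timed launch:

[codebox]// Sketch: event-based GPU timing with a warm-up launch (not in my program).
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

// untimed warm-up launch absorbs one-time startup costs
reduction6<THREADS_PER_BLOCK><<< grid, threads, shared_mem_size >>>(d_idata, numThreads, arraySize);
cudaThreadSynchronize();

// re-copy the input to d_idata here, since the warm-up launch modified it

cudaEventRecord(start, 0);
reduction6<THREADS_PER_BLOCK><<< grid, threads, shared_mem_size >>>(d_idata, numThreads, arraySize);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);   // elapsed time in milliseconds
printf("Kernel: %f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);[/codebox]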

I’m testing this code on a MacBook Pro with a GeForce 9600M GT.

I’d appreciate any advice you can give me. Thanks.

I’ve also run into the same problem. How do you explain it?