Hello,
I have a program that compares two kernel functions. It runs each kernel and prints how long the kernel took using CUDA timers. Both Kernel A and Kernel B are run several times in the same order (A-B-A-B…). The average results of these runs for a specific problem size are:
Kernel A: 3.3 ms
Kernel B: 331.5 ms
If I comment out Kernel B and test only Kernel A, the results are the following:
Kernel A: 4.9 ms
If I change the problem size, this keeps happening, just with different numbers.
Why is there a difference in the timing results? What am I doing wrong? If I run the comparisons from separate files, I keep getting the second result for Kernel A.
This is the code for Kernel A, which is a version of reduce6 that reduces several vectors at the same time.
[codebox]// Multi-vector tree reduction (reduce6-style). Each grid row (blockIdx.y)
// handles one vector ("class") of arraySize ints; every block writes its
// partial sum back into the front of that vector's row, so the kernel can be
// re-launched on the shrinking result until one value per class remains.
// Launch: grid(numBlocks, numClass), blockDim.x == blockSize (template
// parameter, power of two <= 512), dynamic shared mem = blockSize*sizeof(int).
// NOTE: the forum markup stripped the template brackets and the __global__
// underscores from the original paste; they are restored here.
template <unsigned int blockSize>
__global__ void reduction6(int * entrada, int totalObjects, int arraySize) {
    unsigned int classindex = blockIdx.y;                      // which vector
    unsigned int insIndex = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int realindex = classindex * arraySize + insIndex;
    unsigned int tid = threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    extern __shared__ int sdata[];
    sdata[tid] = 0;
    // Grid-stride accumulation: each thread adds two elements per step when
    // both are in range, one at the tail.
    while (insIndex < totalObjects) {
        if (insIndex + blockSize < totalObjects) {
            sdata[tid] += entrada[realindex] + entrada[realindex + blockSize];
        } else {
            sdata[tid] += entrada[realindex];
        }
        insIndex  += gridSize;
        realindex += gridSize;
    }
    __syncthreads();
    // Tree reduction in shared memory. blockSize is a compile-time constant,
    // so the dead comparisons are removed by the compiler.
    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
    if (blockSize >= 128) { if (tid <  64) { sdata[tid] += sdata[tid +  64]; } __syncthreads(); }
#ifndef __DEVICE_EMULATION__
    // In emulation all threads must reach the EMUSYNC barriers, so the
    // tid guard is compiled out there.
    if (tid < 32)
#endif
    {
        // volatile prevents the compiler from caching sdata values in
        // registers, which would break the warp-synchronous unrolled steps.
        volatile int *smem = sdata;
        if (blockSize >= 64) { smem[tid] += smem[tid + 32]; EMUSYNC; }
        if (blockSize >= 32) { smem[tid] += smem[tid + 16]; EMUSYNC; }
        if (blockSize >= 16) { smem[tid] += smem[tid +  8]; EMUSYNC; }
        if (blockSize >=  8) { smem[tid] += smem[tid +  4]; EMUSYNC; }
        if (blockSize >=  4) { smem[tid] += smem[tid +  2]; EMUSYNC; }
        if (blockSize >=  2) { smem[tid] += smem[tid +  1]; EMUSYNC; }
    }
    if (tid == 0) {
        // Publish this block's partial sum at the front of the class's row.
        entrada[classindex * arraySize + blockIdx.x] = sdata[0];
    }
}[/codebox]
This is the code for Kernel B
[codebox]// Naive baseline (Kernel B): one thread per vector ("class"). Thread t
// serially sums row t of entrada (totalObjects ints) and overwrites the
// row's first element with the sum. Launched with a single block, so only
// numClass threads do all the work, and adjacent threads read addresses
// totalObjects apart — uncoalesced — which is why it is so much slower.
// NOTE: the forum markup stripped the __global__ underscores; restored here.
__global__ void reductionFor(int * entrada, int totalObjects) {
    unsigned int classindex = threadIdx.x;
    unsigned int res = 0;
    for (int i = 0; i < totalObjects; i++) {
        res += entrada[classindex * totalObjects + i];
    }
    entrada[classindex * totalObjects] = res;
}[/codebox]
This is the body of the program
[codebox]
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cutil_inline.h>
#define THREADS_PER_BLOCK 512
// In device emulation every thread must hit the barrier inside the unrolled
// warp phase of the reduction; on real hardware EMUSYNC compiles away.
// (The pasted version defined EMUSYNC twice with no #else/#endif — the forum
// markup also ate the underscores in __DEVICE_EMULATION__.)
#ifdef __DEVICE_EMULATION__
#define EMUSYNC __syncthreads()
#else
#define EMUSYNC
#endif
template <unsigned int blockSize>
__global__ void reduction6(int * entrada, int totalObjects, int arraySize);
__global__ void reductionFor(int * entrada, int totalObjects);
// Times the naive one-thread-per-vector reduction (Kernel B).
// h_idata: host input (numClass rows of arraySize ints), mem_size bytes.
void testCudaFor(int * h_idata, int mem_size, int arraySize, int numClass) {
    int * d_idata2;
    cutilSafeCall(cudaMalloc((void**) &d_idata2, mem_size));
    // copy host memory to device
    cutilSafeCall(cudaMemcpy(d_idata2, h_idata, mem_size,
            cudaMemcpyHostToDevice));
    unsigned int kerneltimer2;
    cutilCheckError(cutCreateTimer(&kerneltimer2));
    cutilCheckError(cutStartTimer(kerneltimer2));
    dim3 threads(numClass, 1, 1);   // one thread per vector
    dim3 grid(1, 1, 1);             // a single block does everything
    reductionFor<<<grid, threads>>>(d_idata2, arraySize);
    cudaThreadSynchronize();        // launches are async; wait before timing
    cutilCheckMsg("Kernel execution failed");
    // Stop the timer BEFORE reading it — the original printed the value of a
    // still-running timer, so the fprintf call itself was being measured.
    cutilCheckError(cutStopTimer(kerneltimer2));
    fprintf(stdout, "Kernel For: %f\n", cutGetTimerValue(kerneltimer2));
    cutilCheckError(cutDeleteTimer(kerneltimer2));
    cutilSafeCall(cudaFree(d_idata2));
}
// Times the tree reduction (Kernel A): reduction6 is launched repeatedly,
// each pass shrinking the per-class data until one value per class remains.
// h_idata: host input (numClass rows of arraySize ints), mem_size bytes.
// n: extra divisor applied to the block count per pass.
void testCudaTree(int * h_idata, int mem_size, int arraySize, int numClass, int n) {
    unsigned int shared_mem_size = sizeof(int) * THREADS_PER_BLOCK;
    // allocate device memory
    int * d_idata;
    cutilSafeCall(cudaMalloc((void**) &d_idata, mem_size));
    // copy host memory to device
    cutilSafeCall(
            cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
    int blocks = ceil((double) arraySize / ((double) THREADS_PER_BLOCK * 2));
    printf("block %d threads %d shared %d\n", blocks, THREADS_PER_BLOCK,
            shared_mem_size);
    unsigned int numThreads = arraySize;
    int numBlocks = ceil(blocks / (double) n);
    unsigned int kerneltimer;
    cutilCheckError(cutCreateTimer(&kerneltimer));
    cutilCheckError(cutStartTimer(kerneltimer));
    while (numBlocks > 0) {
        // setup execution parameters: one grid row per class
        dim3 grid(numBlocks, numClass, 1);
        dim3 threads(THREADS_PER_BLOCK, 1, 1);
        reduction6<THREADS_PER_BLOCK><<< grid, threads, shared_mem_size >>>(d_idata, numThreads, arraySize);
        cudaThreadSynchronize();
        // The next pass reduces the numBlocks partial sums just produced.
        numThreads = numBlocks;
        numBlocks = (numBlocks == 1 ? 0 : ceil((double) numThreads
                / (double) (THREADS_PER_BLOCK * n)));
    }
    cutilCheckMsg("Kernel execution failed");
    // Stop the timer before reading it, and delete it afterwards — the
    // original read a still-running timer and leaked it on every call
    // (and this function runs once per benchmark iteration).
    cutilCheckError(cutStopTimer(kerneltimer));
    float time = cutGetTimerValue(kerneltimer);
    fprintf(stdout, "Kernel: %f\n", time);
    cutilCheckError(cutDeleteTimer(kerneltimer));
    cutilSafeCall(cudaFree(d_idata));
}
// One benchmark iteration: builds an all-ones input (numClass vectors of
// arraySize ints each), runs the tree reduction, and releases everything.
// argv: [1]=arraySize, [2]=numClass, [3]=n (block-count divisor per pass).
void runTest(int argc, char** argv) {
    if (argc < 4) {   // original dereferenced argv[1..3] unchecked
        fprintf(stderr, "usage: %s arraySize numClass n iterations\n", argv[0]);
        exit(EXIT_FAILURE);
    }
    cudaSetDevice(cutGetMaxGflopsDeviceId());
    unsigned int arraySize = atoi(argv[1]);
    unsigned int numClass = atoi(argv[2]);
    unsigned int n = atoi(argv[3]);
    unsigned int mem_size = sizeof(int) * arraySize * numClass;
    // allocate host memory
    int * h_idata = (int *) malloc(mem_size);
    if (h_idata == NULL) {
        fprintf(stderr, "host malloc of %u bytes failed\n", mem_size);
        exit(EXIT_FAILURE);
    }
    // initialize the memory: all ones, so each class sums to arraySize...
    for (unsigned int i = 0; i < numClass; i++) {
        for (unsigned int j = 0; j < arraySize; j++) {
            h_idata[i * arraySize + j] = 1;
        }
    }
    // ...except the last class, whose final element makes its sum arraySize+3
    // (a marker to check the result against).
    h_idata[numClass * arraySize - 1] += 3;
    testCudaTree(h_idata, mem_size, arraySize, numClass, n);
    //testCudaFor(h_idata, mem_size, arraySize, numClass);
    // cleanup memory
    free(h_idata);
    cudaThreadExit();
}
// Repeats the benchmark argv[4] times so first-call effects (CUDA context
// creation on the very first API call, warm caches) average out.
int main(int argc, char** argv) {
    if (argc < 5) {   // original read argv[4] unchecked
        fprintf(stderr, "usage: %s arraySize numClass n iterations\n", argv[0]);
        return EXIT_FAILURE;
    }
    printf("************** Reduction 1 elem ************\n");
    int iterations = atoi(argv[4]);   // hoisted: don't re-parse every pass
    for (int i = 0; i < iterations; i++) {
        runTest(argc, argv);
    }
    return EXIT_SUCCESS;
}
[/codebox]
I’m testing this code in a MacBook Pro in a GeForce 9600M GT.
I’d appreciate any advice you could give me. Thanks.