Hi there,
I am here in the office with an intern at my company who is doing some CUDA studies. As a starter tutorial we’ve implemented the parallel reduction.
Just today we’ve applied the complete loop unrolling (as in kernel 6 of the nVidia SDK’s reduction sample).
On our nVidia 8500GT we should have a theoretical memory bandwidth of 12 GB/s. And the nVidia reduction code peaks at 8 point something Gigabytes in their kernel #7 (two thirds of the theoretical peak).
But our reduction code barely reaches 1.8 GB/sec - not even close to nVidia’s benchmark!
Would anyone spot our mistake? Why do we see a slowdown factor of 4-5 in a kernel that nearly exactly looks like nVidia’s reduction code? We’ve run this through the visual profiler also. Profiler output is attached (renamed from csv to txt) - we’re puzzled by the number of uncoalesced memory writes, we would expect the number to be identical with the grid x size, but it’s not.
We’re calling the code with N_orig of 16777216 elements (a power of two)
It’s probably something stupid that we’ve overlooked.
Help!?! ;-)
Christian
__global__ void
testKernel(float* g_idata, unsigned int N)
{
// shared memory
// the size is determined by the host application
extern __shared__ float sdata[];
SDATA(threadIdx.x) = g_idata[2*blockDim.x*blockIdx.x + threadIdx.x] + g_idata[2*blockDim.x*blockIdx.x + threadIdx.x + N/2];
__syncthreads();
N = N/2;
if (threadIdx.x < 256)
SDATA(threadIdx.x) += SDATA(threadIdx.x+256);
__syncthreads();
if (threadIdx.x < 128)
SDATA(threadIdx.x) += SDATA(threadIdx.x+128);
__syncthreads();
if (threadIdx.x < 64)
SDATA(threadIdx.x) += SDATA(threadIdx.x+64);
__syncthreads();
if (threadIdx.x < 32)
{
SDATA(threadIdx.x) += SDATA(threadIdx.x+32);
SDATA(threadIdx.x) += SDATA(threadIdx.x+16);
SDATA(threadIdx.x) += SDATA(threadIdx.x+ 8);
SDATA(threadIdx.x) += SDATA(threadIdx.x+ 4);
SDATA(threadIdx.x) += SDATA(threadIdx.x+ 2);
SDATA(threadIdx.x) += SDATA(threadIdx.x+ 1);
}
if(threadIdx.x == 0) {
// printf("gtid:%d.%d block_max=%f\n", blockIdx.x, threadIdx.x, SDATA(0));
g_idata[blockIdx.x] = SDATA(0);
}
}
Here is the critical part of the host side code.
cutilCheckError(cutCreateTimer(&timer_kernel));
cutilCheckError(cutStartTimer(timer_kernel));
do{
// setup kernel execution parameters
num_threads = ceil(N/2.0);
num_blocks = ceil(1.0*num_threads/max_threads_per_block);
threads_per_block = ceil(1.0*num_threads/num_blocks);
grid = dim3(num_blocks, 1, 1);
threads = dim3(threads_per_block, 1, 1);
printf("N=%d, num_threads=%d, num_blocks=%d, threads_per_block=%d, gridDim=%d, blockDim=%d\n",
N, num_threads, num_blocks, threads_per_block, grid.x, threads.x);
// execute the kernel
testKernel<<<grid, threads, sizeof(float)*threads_per_block>>>(d_idata, 2*threads_per_block);
// check if kernel execution generated and error
cutilCheckMsg("Kernel execution failed");
N = num_blocks;
} while(N != 1);
cudaThreadSynchronize();
cutilCheckError(cutStopTimer(timer_kernel));
float bw_kernel = N_orig*sizeof(float)/time_kernel*1000;
printf("Parallel CUDA found sum to be %f in %f ms (BW=%f MB/s)\n", cuda_max, time_kernel, bw_kernel/1024/1024);
Christian
profiler_output.txt (952 Bytes)