I was testing the sum reduction algorithm mentioned in NVIDIAReduction (page 35), and was confused by the fact that debug and release configurations produced different results. I set blockPerGrid=1000 and threadPerBlock=512, and expected kEff_block to be 1000*512=512000. Debug config gives me the correct result, while release leads to a strange 56000. External Image Could anyone help me with this? Thanks a lot in advance.
Device:
//++++++++++++++++++++++++++++++++++++
// SHARED
//++++++++++++++++++++++++++++++++++++
__shared__ double kEff_shared[threadPerBlock];
//----------------------------------------------------------
//----------------------------------------------------------
__global__ void Test(double *kEff_device)
{
int globalThreadID;
int threadID;
//get in-block id
threadID=threadIdx.x;
//initialize shared memory
kEff_shared[threadID]=1.0;
__syncthreads();
if (threadPerBlock >= 512) { if (threadID < 256) { kEff_shared[threadID] += kEff_shared[threadID + 256]; } __syncthreads(); }
if (threadPerBlock >= 256) { if (threadID < 128) { kEff_shared[threadID] += kEff_shared[threadID + 128]; } __syncthreads(); }
if (threadPerBlock >= 128) { if (threadID < 64) { kEff_shared[threadID] += kEff_shared[threadID + 64]; } __syncthreads(); }
if (threadID < 32) {ReduceWarp(kEff_shared, threadID, threadPerBlock);}
__syncthreads();
if(threadID==0)
{
kEff_device[blockIdx.x]=kEff_shared[0];
}
}
//----------------------------------------------------------
//----------------------------------------------------------
__device__ void ReduceWarp(double data_shared[], int threadID, int blockSize)
{
if (blockSize >= 64) data_shared[threadID] += data_shared[threadID + 32];
if (blockSize >= 32) data_shared[threadID] += data_shared[threadID + 16];
if (blockSize >= 16) data_shared[threadID] += data_shared[threadID + 8];
if(blockSize >= 8) data_shared[threadID] += data_shared[threadID + 4];
if (blockSize >= 4) data_shared[threadID] += data_shared[threadID + 2];
if (blockSize >= 2) data_shared[threadID] += data_shared[threadID + 1];
}
Host:
HandleDeviceError(cudaMemcpy(kEff_host, kEff_device, blockPerGrid*sizeof(double), cudaMemcpyDeviceToHost));
for(int i=0;i<blockPerGrid;i++)
{
kEff_block+=kEff_host[i];
}