reduce6 kernel from CUDA SDK not working correctly

Hello to all,

I am trying to develop an ad hoc reduction kernel for my application by copying the reduce6 function from the CUDA SDK reduction example (reduction_kernel.cu).

It works only if the number of threads per block is less than or equal to 8.

In emulation mode it also works with more than 8 threads per block.

Could someone explain why?

Thank you in advance

Francesco

[codebox]/*

Parallel reduction kernels

*/

#ifndef REDUCE_KERNEL_H

#define REDUCE_KERNEL_H

#include <stdio.h>

#include "sharedmem.cuh"

#ifdef __DEVICE_EMULATION__

#define EMUSYNC __syncthreads()

#else

#define EMUSYNC

#endif

template <class T, unsigned int blockSize>

__global__ void

reduce(T *g_idata, T *g_odata, unsigned int n)

{

SharedMemory<T> smem;

T *sdata = smem.getPointer();

// perform first level of reduction,

// reading from global memory, writing to shared memory

unsigned int tid = threadIdx.x;

unsigned int i = blockIdx.x*(blockSize*2) + threadIdx.x;

unsigned int gridSize = blockSize*2*gridDim.x;

sdata[tid] = 0;

// we reduce multiple elements per thread. The number is determined by the

// number of active thread blocks (via gridSize).  More blocks will result

// in a larger gridSize and therefore fewer elements per thread

while (i < n)

{

    sdata[tid] += g_idata[i] + g_idata[i+blockSize];  

    i += gridSize;

} 

__syncthreads();

// do reduction in shared mem

if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }

if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }

if (blockSize >= 128) { if (tid <  64) { sdata[tid] += sdata[tid +  64]; } __syncthreads(); }

#ifndef __DEVICE_EMULATION__

if (tid < 32)

#endif

{

    if (blockSize >=  64) { sdata[tid] += sdata[tid + 32]; EMUSYNC; }

    if (blockSize >=  32) { sdata[tid] += sdata[tid + 16]; EMUSYNC; }

    if (blockSize >=  16) { sdata[tid] += sdata[tid +  8]; EMUSYNC; }

    if (blockSize >=   8) { sdata[tid] += sdata[tid +  4]; EMUSYNC; }

    if (blockSize >=   4) { sdata[tid] += sdata[tid +  2]; EMUSYNC; }

    if (blockSize >=   2) { sdata[tid] += sdata[tid +  1]; EMUSYNC; }

}

// write result for this block to global mem

if (tid == 0) g_odata[blockIdx.x] = sdata[0];

}

#endif // #ifndef REDUCE_KERNEL_H[/codebox]

[codebox]#include "reduction_kernel.cu"

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

// includes, project

#include <cutil.h>

#define TYPE float

const unsigned int N=1024;

const unsigned DATA_SIZE=N*sizeof(int);

int main(int argc, char** argv)

{

int* h_vIn;

int* h_vOut;

int* d_vIn;

int* d_vOut;

int cpures;

CUT_DEVICE_INIT(argc, argv);

h_vIn = (int*) malloc(DATA_SIZE);

h_vOut = (int*) malloc(DATA_SIZE);

for ( int i=0; i!=N; i++ )

{

  h_vIn[i]=i+1;

  h_vOut[i]=0;

}

CUDA_SAFE_CALL(cudaMalloc((void**)&d_vIn, DATA_SIZE));

CUDA_SAFE_CALL(cudaMalloc((void**)&d_vOut, DATA_SIZE));

CUDA_SAFE_CALL(cudaMemcpy(d_vIn, h_vIn, DATA_SIZE, cudaMemcpyHostToDevice));

CUDA_SAFE_CALL(cudaMemcpy(d_vOut, h_vOut, DATA_SIZE, cudaMemcpyHostToDevice));

const unsigned int threadsPerBlock=8;

dim3 dimBlock(threadsPerBlock,1);

dim3 dimGrid(N/2/threadsPerBlock,1);

CUDA_SAFE_CALL( cudaThreadSynchronize() );

reduce<int, threadsPerBlock><<<N/2/threadsPerBlock,threadsPerBlock>>>(d_vIn, d_vOut, N);

CUDA_SAFE_CALL( cudaThreadSynchronize() );

CUT_CHECK_ERROR("kernel execution failed.\n");

CUDA_SAFE_CALL( cudaMemcpy(h_vOut, d_vOut, DATA_SIZE, cudaMemcpyDeviceToHost));

//CUDA_SAFE_CALL( cudaThreadSynchronize() );

cpures=0;

for (int i=0; i<N; i++)

  cpures+=h_vIn[i];

printf("computed cpures: %d\n", cpures);

int gpures=0;

for (int i=0; i!=dimGrid.x; i++)

{

  //printf("vOut[%d]: %d\n", i, h_vOut[i]);

  gpures+=h_vOut[i];

}

printf("gpures: %d\n", gpures);

}[/codebox]

I wrote him a private message reporting that I had run into this problem too.

I have just solved my problem (by dumb luck) as follows:

I had omitted an important detail in my reduction code: I declared my shared memory size as

int smemSize = threads * sizeof(float);

In the SDK the shared memory size is calculated as:

int smemSize = (threads <= 32) ? 2 * threads * sizeof(float) : threads * sizeof(float);

This is done because, in the warp-level part of the SDK example, EVERY thread of the warp MUST access an existing shared memory location, even if the access produces no useful result.

Let me explain…

Say you have to add 32 elements, so 16 threads are created.

Each thread performs a first addition while reading from global memory, so now you have 16 threads and 16 partial sums left to reduce.

NOW you are in the warp-level zone and you enter this if-case:

if (blockSize >= 16) { sdata[tid] += sdata[tid + 8];}

Unrolling it for EVERY thread ID of the warp…

{ sdata[0] += sdata[8];}

{ sdata[1] += sdata[9];}

{ sdata[2] += sdata[10];}

{ sdata[3] += sdata[11];}

{ sdata[4] += sdata[12];}

{ sdata[5] += sdata[13];}

{ sdata[6] += sdata[14];}

{ sdata[7] += sdata[15];}

{ sdata[8] += sdata[16];}

{ sdata[9] += sdata[17];}

{ sdata[10] += sdata[18];}

{ sdata[11] += sdata[19];}

{ sdata[12] += sdata[20];}

{ sdata[13] += sdata[21];}

{ sdata[14] += sdata[22];}

{ sdata[15] += sdata[23];}

The last eight lines (the reads of sdata[16] through sdata[23]) produce a shared memory read access violation if you declared

smemSize = threads * sizeof(float)

instead of

int smemSize = 2 * threads * sizeof(float);

Summarizing:

if the shared memory size is not doubled when the number of threads is <= WARPSIZE (32), the kernel crashes!
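
To make this concrete against the code in the first post, here is a minimal sketch of a corrected host-side launch. It assumes the SDK's SharedMemory<T> helper, which maps onto extern __shared__ memory, so the byte count has to be supplied as the third launch-configuration argument (smemSize is just my name for that variable):

[codebox]// Sketch only: corrected launch for the parameters used in the first post.
// SharedMemory<T> uses extern __shared__, so the dynamic shared memory
// size must be passed explicitly in the launch configuration.
const unsigned int threadsPerBlock = 8;
const unsigned int blocksPerGrid = N / 2 / threadsPerBlock;

// Double the allocation for blocks of <= 32 threads, so that the warp-level
// reads (e.g. sdata[tid + 4] when blockSize == 8) stay inside the array.
int smemSize = (threadsPerBlock <= 32) ? 2 * threadsPerBlock * sizeof(int)
                                       : threadsPerBlock * sizeof(int);

reduce<int, threadsPerBlock><<<blocksPerGrid, threadsPerBlock, smemSize>>>(d_vIn, d_vOut, N);[/codebox]

Note that the launch in the first post passes no third argument at all, so zero bytes of dynamic shared memory are allocated; supplying the doubled smemSize addresses that too.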
