Hello, I’ve been following this presentation made by Nvidia https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf And I’m having some precision issues.
My kernel looks like this:
global void SummationReductionKernel(float* input, float* output, unsigned int count)
device void WarpReduce(volatile float* data, unsigned int tid)
{
if (MAX_CUDA_THREADS_PER_BLOCK >= 64) data[tid] += data[tid + 32];
if (MAX_CUDA_THREADS_PER_BLOCK >= 32) data[tid] += data[tid + 16];
if (MAX_CUDA_THREADS_PER_BLOCK >= 16) data[tid] += data[tid + 8];
if (MAX_CUDA_THREADS_PER_BLOCK >= 8) data[tid] += data[tid + 4];
if (MAX_CUDA_THREADS_PER_BLOCK >= 4) data[tid] += data[tid + 2];
if (MAX_CUDA_THREADS_PER_BLOCK >= 2) data[tid] += data[tid + 1];
}
#define MAX_CUDA_THREADS_PER_BLOCK 512
global void SummationReductionKernel(float* input, float* output, unsigned int count)
{
extern shared float sdata[MAX_CUDA_THREADS_PER_BLOCK];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * (MAX_CUDA_THREADS_PER_BLOCK * 2) + tid;
unsigned int gridSize = MAX_CUDA_THREADS_PER_BLOCK * 2 * gridDim.x;
sdata[tid] = 0;
while (i < count)
{
sdata[tid] += input[i] + input[i + MAX_CUDA_THREADS_PER_BLOCK];
i += gridSize;
}
syncthreads();
if (MAX_CUDA_THREADS_PER_BLOCK >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } syncthreads(); }
if (MAX_CUDA_THREADS_PER_BLOCK >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } syncthreads(); }
if (MAX_CUDA_THREADS_PER_BLOCK >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } syncthreads(); }
if (tid < 32) { WarpReduce(sdata, tid); }
if (tid == 0)
{
output[blockIdx.x] = sdata[0];
}
}
(I’ve pretty much just switched from ints to floats and removed the template)
After calling the kernel I copy the output back to host and add the individual parts together. This works for small arrays (< 10k elements), but for larger ones the results vary between host and device implementations. This wouldn’t be an issue if this was a floating point precision issue and I was getting tiny differences, but I’m getting differences in the 1000’s. Any ideas about how to solve this would be greatly appreciated :heart: