Reduction random errors Reduction kernel turns weird values

I’m using a reduction kernel from the SDK examples (kernel 4) and it returns weird values, such as nan, inf -inf and just seeming random large numbers. My reduction takes 2^18 values of 1 and adds them up. It’s using 2ksizeof(float) of shared memory to do the reduction, I tried using as much memory as the number of threadssizeof(float) but that made the problem even worse! It takes -i xxx as an argument for the number of threads to execute and prints the result of the first reduction to the terminal. With 128 threads I get 1024 256s which adds up to 262144 (2^18), but with 256 threads I get something like this:
512.000000
-35628476216761795474279008580415782912.000000
512.000000
-inf
512.000000
512.000000
512.000000
512.000000
512.000000
nan
512.000000
512.000000
512.000000
When it should just be 512 512s. And cudaGetLastError returns ‘no error’… Any ideas anyone?
Note: it requires cutil_inline.h to compile (for the error checking).
I’m using Ubuntu 10.04 64 bit, GNOME 2.30.2, kernel 2.6.32-28, x86_64, nvidia 8800GTS using the CUDA driver 3.2_linux_64_260.19.26
I’ve also tried running this on Ubuntu 10.10 with a 480GTX and I get the same results.
reduction_simple2.cu (3.02 KB)

#include <stdio.h>	/* ANSI C libraries */

#include <stdlib.h>

#include <string.h>

#include <stdarg.h>

#include <unistd.h>

#include <math.h>

#include <time.h>

#include "cutil_inline.h" /* Error checking */

#include <cuda.h>

#define NX 512 		/* define phase grating image size */

#define NY 512

float *h_idata;

float *h_odata;

float *d_idata;

float *d_odata;

// Simple utility function to check for CUDA runtime errors

void checkCUDAError(const char* msg);

// Reduction Kernel

__global__ void reduction(float *g_idata, float *g_odata) {

	extern __shared__ float sdata[];

	// each thread loads one element from global to shared mem

	unsigned int tid = threadIdx.x;

	unsigned int i = blockIdx.x*blockDim.x*2 + threadIdx.x;

	sdata[tid] = g_idata[i] + g_idata[i+blockDim.x];

	__syncthreads();

	for (unsigned int s=blockDim.x/2; s>0; s>>=1) {

		sdata[tid] += sdata[tid + s];

		__syncthreads();

	}

	// write result for this block to global mem

	if (tid == 0){

		g_odata[blockIdx.x] = sdata[0];

	}

}

int main(int argc, char *argv[]) {

	int c, threads;

	while (( c = getopt(argc, argv, "n:i:o")) != -1) {

		switch ( c ) {

		case 'i' : {

			sscanf(optarg, "%i", &threads);

			break;

		}

		}

	}

	/* host memory */

	h_idata = 	( float* ) 	malloc( NX*NY*sizeof(float) );

	h_odata = 	( float* ) 	malloc( NX*NY*sizeof(float) );

	/* device memory */

	cutilSafeCallNoSync( cudaMalloc( (void **) &d_idata, NX*NY*sizeof(float) ));

	cutilSafeCallNoSync( cudaMalloc( (void **) &d_odata, NX*NY*sizeof(float) ));

	// make data

	int i;

	for(i=0;i<NX*NY;i++){

		h_idata[i] = 1;

	}

	for(i=0;i<NX*NY;i++){

		h_odata[i] = 0.0;

	}

	// Copy input from host memory to device memory

	//int temp = number * stride;

	cutilSafeCallNoSync( cudaMemcpy( d_idata, h_idata, NX*NY*sizeof(float), cudaMemcpyHostToDevice));

	//don't copy the out-data to the device, we'll write it as we go along

	cutilSafeCallNoSync( cudaMemcpy( d_odata, h_odata, NX*NY*sizeof(float), cudaMemcpyHostToDevice));

	cudaThreadSynchronize();

	// kernal dim

	dim3 dimBlock( threads );

	dim3 dimGrid( NX*NY/(2*threads)  );

	// exec reduction kernal

	reduction<<<dimGrid, dimBlock, 2049*sizeof(float)>>>(d_idata, d_odata);

	cudaThreadSynchronize();

	checkCUDAError("kernel invocation");

	// retrive result

	cutilSafeCallNoSync(cudaMemcpy( h_odata, d_odata, NX*NY*sizeof(float), cudaMemcpyDeviceToHost ));

	checkCUDAError("result retrive");

	//output data more than 256 ? then run again : CPU reduce

	// sum = 0 to start with

	float sum = 0.0;

	// output intermediate data

	for(i=0;i<(NX*NY)/(2*threads);i++){

		printf("%f\n", h_odata[i]);

		sum += h_odata[i];

	}

	printf("sum %.2f\n", sum);

	// clear gpu memory

	cudaFree(d_idata);

	cudaFree(d_odata);

	printf("result?: %s\n",cudaGetErrorString(cudaGetLastError()));

	// return success

	return (0);

}

// Check for error function

void checkCUDAError(const char *msg)

{

	cudaError_t err = cudaGetLastError();

	if( cudaSuccess != err)

	{

		fprintf(stderr, "Cuda error: %s: %s.\n", msg,

				cudaGetErrorString( err) );

		exit(EXIT_FAILURE);

	}

}

you have race condition

for (int s=blockDim.x/2; s>0; s>>=1) {

        sdata[tid] += sdata[tid + s];

        __syncthreads();

    }

correct version is

for (int s=blockDim.x/2; s>0; s>>=1) {

        if ( tid < s ){

            sdata[tid] += sdata[tid + s];

        }

        __syncthreads();

    }

suppose threads=256, and tid=255 executes first,

then you have wrong result except that sdata[255+128] = 0