I’m using a reduction kernel from the SDK examples (kernel 4) and it returns weird values, such as nan, inf -inf and just seeming random large numbers. My reduction takes 2^18 values of 1 and adds them up. It’s using 2ksizeof(float) of shared memory to do the reduction, I tried using as much memory as the number of threadssizeof(float) but that made the problem even worse! It takes -i xxx as an argument for the number of threads to execute and prints the result of the first reduction to the terminal. With 128 threads I get 1024 256s which adds up to 262144 (2^18), but with 256 threads I get something like this:
512.000000
-35628476216761795474279008580415782912.000000
512.000000
-inf
512.000000
512.000000
512.000000
512.000000
512.000000
nan
512.000000
512.000000
512.000000
When it should just be 512 512s. And cudaGetLastError returns ‘no error’… Any ideas anyone?
Note: it requires cutil_inline.h to compile (for the error checking).
I’m using Ubuntu 10.04 64 bit, GNOME 2.30.2, kernel 2.6.32-28, x86_64, nvidia 8800GTS using the CUDA driver 3.2_linux_64_260.19.26
I’ve also tried running this on Ubuntu 10.10 with a 480GTX and I get the same results.
reduction_simple2.cu (3.02 KB)
#include <stdio.h> /* ANSI C libraries */
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <unistd.h>
#include <math.h>
#include <time.h>
#include "cutil_inline.h" /* Error checking */
#include <cuda.h>
#define NX 512 /* define phase grating image size */
#define NY 512
float *h_idata;
float *h_odata;
float *d_idata;
float *d_odata;
// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char* msg);
// Reduction Kernel
__global__ void reduction(float *g_idata, float *g_odata) {
extern __shared__ float sdata[];
// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x*2 + threadIdx.x;
sdata[tid] = g_idata[i] + g_idata[i+blockDim.x];
__syncthreads();
for (unsigned int s=blockDim.x/2; s>0; s>>=1) {
sdata[tid] += sdata[tid + s];
__syncthreads();
}
// write result for this block to global mem
if (tid == 0){
g_odata[blockIdx.x] = sdata[0];
}
}
int main(int argc, char *argv[]) {
int c, threads;
while (( c = getopt(argc, argv, "n:i:o")) != -1) {
switch ( c ) {
case 'i' : {
sscanf(optarg, "%i", &threads);
break;
}
}
}
/* host memory */
h_idata = ( float* ) malloc( NX*NY*sizeof(float) );
h_odata = ( float* ) malloc( NX*NY*sizeof(float) );
/* device memory */
cutilSafeCallNoSync( cudaMalloc( (void **) &d_idata, NX*NY*sizeof(float) ));
cutilSafeCallNoSync( cudaMalloc( (void **) &d_odata, NX*NY*sizeof(float) ));
// make data
int i;
for(i=0;i<NX*NY;i++){
h_idata[i] = 1;
}
for(i=0;i<NX*NY;i++){
h_odata[i] = 0.0;
}
// Copy input from host memory to device memory
//int temp = number * stride;
cutilSafeCallNoSync( cudaMemcpy( d_idata, h_idata, NX*NY*sizeof(float), cudaMemcpyHostToDevice));
//don't copy the out-data to the device, we'll write it as we go along
cutilSafeCallNoSync( cudaMemcpy( d_odata, h_odata, NX*NY*sizeof(float), cudaMemcpyHostToDevice));
cudaThreadSynchronize();
// kernal dim
dim3 dimBlock( threads );
dim3 dimGrid( NX*NY/(2*threads) );
// exec reduction kernal
reduction<<<dimGrid, dimBlock, 2049*sizeof(float)>>>(d_idata, d_odata);
cudaThreadSynchronize();
checkCUDAError("kernel invocation");
// retrive result
cutilSafeCallNoSync(cudaMemcpy( h_odata, d_odata, NX*NY*sizeof(float), cudaMemcpyDeviceToHost ));
checkCUDAError("result retrive");
//output data more than 256 ? then run again : CPU reduce
// sum = 0 to start with
float sum = 0.0;
// output intermediate data
for(i=0;i<(NX*NY)/(2*threads);i++){
printf("%f\n", h_odata[i]);
sum += h_odata[i];
}
printf("sum %.2f\n", sum);
// clear gpu memory
cudaFree(d_idata);
cudaFree(d_odata);
printf("result?: %s\n",cudaGetErrorString(cudaGetLastError()));
// return success
return (0);
}
// Check for error function
void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg,
cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}
you have race condition
for (int s=blockDim.x/2; s>0; s>>=1) {
sdata[tid] += sdata[tid + s];
__syncthreads();
}
correct version is
for (int s=blockDim.x/2; s>0; s>>=1) {
if ( tid < s ){
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
suppose threads=256, and tid=255 executes first,
then you have wrong result except that sdata[255+128] = 0