Problem:Different results on EMULATION and RELEASE Problem on release and debug mode

Hello. I run my cuda program in EmuDebug or EmuRelease mode and I have the expected results, but if I run my cuda program in Release mode (in debug mode i dont no why the program crashes–cuda kernel error but i cant see it) I have wrong results and they are very extranges (not a numbers, negatives, etc…). Here is the code in host and device:

template.cu:

int
main( int argc, char** argv)
{

...
    ...

CUT_DEVICE_INIT();
unsigned int timer = 0;
   CUT_SAFE_CALL( cutCreateTimer( &timer));
   CUT_SAFE_CALL( cutStartTimer( timer));

long* cuda_list;
long* cuda_nearest;
long* cuda_box;
double* cuda_series;

double* fracfnn;
double* cuda_fracfnn;
double* aveps;
double* vareps;
double* cuda_aveps;
double* cuda_vareps;

check_alloc(fracfnn = (double*)malloc(((maxdim-mindim)+1)*sizeof(double)));
check_alloc(aveps = (double*)malloc(((maxdim-mindim)+1)*sizeof(double)));
check_alloc(vareps = (double*)malloc(((maxdim-mindim)+1)*sizeof(double)));

int sizeResults= ((maxdim-mindim)+1)*sizeof(double);
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_fracfnn,sizeResults));
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_aveps,sizeResults));
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_vareps,sizeResults));

int size= length*sizeof(long);
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_list,size));
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_nearest,size));
size=BOX*BOX*sizeof(long);
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_box,size));
size=length*sizeof(double);
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_series, size));

//take series vector to cuda device
CUDA_SAFE_CALL(cudaMemcpy(cuda_series, series, size, cudaMemcpyHostToDevice));

dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(GRID_SIZE, GRID_SIZE);
kernel<<<dimGrid,dimBlock>>>(BOX,mindim,maxdim,delay,length,theiler,rt,varianz,cuda_list,cuda_nearest,cuda_box,cuda_series,verbosity,inter,file,stdo,cuda_fracfnn,cuda_aveps,cuda_vareps);
// check if kernel execution generated and error
   CUT_CHECK_ERROR("Kernel execution failed");

CUDA_SAFE_CALL(cudaMemcpy(fracfnn, cuda_fracfnn, sizeResults, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(aveps,cuda_aveps, sizeResults, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(vareps, cuda_vareps, sizeResults, cudaMemcpyDeviceToHost));

int k;
for(k=0;k<=(maxdim-mindim);k++)
{
	fprintf(stderr,"Start for dimension=%u\n",k+mindim);
	if(stdout)
	{
		fprintf(stdout,"%u %e %e %e\n",k+mindim,fracfnn[k],aveps[k],vareps[k]);
		fflush(stdout);
	}
	else
	{
		fprintf(file,"%u %e %e %e\n",k+mindim,fracfnn[k],aveps[k],vareps[k]);
		fflush(file);
	}
	
}

CUT_SAFE_CALL( cutStopTimer( timer));
printf( "\n\nProcessing time: %f (ms)\n", cutGetTimerValue( timer));
CUT_SAFE_CALL( cutDeleteTimer( timer));

CUDA_SAFE_CALL(cudaFree(cuda_list));
CUDA_SAFE_CALL(cudaFree(cuda_box));
CUDA_SAFE_CALL(cudaFree(cuda_nearest));
CUDA_SAFE_CALL(cudaFree(cuda_series));
CUDA_SAFE_CALL(cudaFree(cuda_fracfnn));
CUDA_SAFE_CALL(cudaFree(cuda_aveps));
CUDA_SAFE_CALL(cudaFree(cuda_vareps));

if (!stdo)
	fclose(file);

if (infile != NULL)
	free(infile);
if (outfile != NULL)
	free(outfile);
free(series);
getchar();
exit(0);
    CUT_EXIT(argc, argv);

}


template_kernel.cu:



//GLOBAL VARIABLES
device unsigned long toolarge;
device long alldone;
device unsigned long donesofar;
device double aveps;
device double vareps;

device int find_nearest(long n,unsigned int dim,double eps,
unsigned long *local_toolarge,double *local_aveps,double *local_vareps,
long *box, double *series, long list, long b,unsigned int del,
double varz,double rt, unsigned long theil,int id);
device void make_box(double ser,long list, long box,unsigned long l,
unsigned int bs,unsigned int begin_col,unsigned int end_col,
unsigned int dim,unsigned int del,double eps);
device void localdim(int tambox ,int ident, int numerohilos,int
begin_col, int
end_col);
device void evaluate(int begin,int end,unsigned long dim,double epsilon,
long *local_alldone,unsigned long *local_donesofar,
unsigned long *local_toolarge,double *local_aveps,double *local_vareps,
long *nearest, long *box, double *series, long *list, long b,unsigned int del, double varz,double rt, unsigned long theil,int id);

////////////////////////////////////////////////////////////////////////////////

(IMPLEMENTED DEVICE FUNTIONS)

////////////////////////////////////////////////////////////////////////////////

global void
kernel(long b, unsigned int mind, unsigned int maxd,
unsigned int del, unsigned long len, unsigned long theil, double rt,
double varz, long cuda_list,long cuda_nearest,long cuda_box, double cuda_series, unsigned int ver,double inter,FILE* file,int stdo,double*cuda_fracfnn,double *cuda_aveps,
double *cuda_vareps)
{
cuda_file=file;

// Block index
int bx = blockIdx.x;
int by = blockIdx.y;

// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;

//unique id for each thread
int id = by*blockDim.x*blockDim.y*GRID_SIZE + bx *blockDim.x*blockDim.y+ty*blockDim.x+tx;

double epsilon;
long i;
unsigned int dim;
double eps0 = 1.0e-5;
int firstdata,begin,end,begin_col,end_col; 
double local_aveps = 0.0,local_vareps = 0.0;
unsigned long local_toolarge = 0,local_donesofar = 0;
long local_alldone = 0;
localdim(b,id, NHILOS ,&begin_col,&end_col);
begin_col -= 1;
end_col -= 1;

for(dim = mind;dim <= maxd; dim++){
	epsilon = eps0;
	toolarge = 0;
	alldone = 0;
	donesofar = 0;
	aveps = 0.0;
	vareps = 0.0;
	firstdata = (dim-1)*del;
  	        localdim((len-firstdata),id,NHILOS,&begin,&end);
	begin = (begin - 1) + firstdata;
            end = (end - 1) + firstdata;

	if (id == (NHILOS-1))
    		end = end - 1;

	for (i=begin;i<=end;i++)
  			cuda_nearest[i] = 0;

	while (!alldone && (epsilon < 2.*varz/rt)) 
	{
		local_alldone = 1;

make_box(cuda_series,cuda_list,cuda_box,len-1,b,begin_col,end_col,dim,del,epsilon);

		if (id == (NHILOS-1)) //ver tema ultimo hilo
		{
			alldone = 1;
		}
		__syncthreads();

		evaluate(begin, end, dim, epsilon,&local_alldone,&local_donesofar,&local_toolarge,&local_aveps,&local_vareps,cuda_nearest,cuda_box,cuda_series,cuda_list,b,del,varz,rt,theil,id);
		
		alldone &= (long) local_alldone;
		donesofar += (unsigned long) local_donesofar;	
		__syncthreads();
		
		epsilon *=sqrt(2.0);
		if (!donesofar)
			eps0=epsilon;
		local_alldone=0;
		local_donesofar=0;
	}
	toolarge += (unsigned long) local_toolarge;
 	aveps += local_aveps;
 	vareps += local_vareps;
	if (id==(NHILOS-1)) //ver tema ultimo hilo
	{
		aveps *= (1./(double)donesofar);
		vareps *= (1./(double)donesofar);
		
		cuda_fracfnn[dim-mind]=(double)toolarge/(double)donesofar;
		cuda_aveps[dim-mind]=aveps;
		cuda_vareps[dim-mind]=vareps;
		
	}
	__syncthreads();
	local_toolarge = 0;
	local_aveps = 0.0;
	local_vareps = 0.0;
}

}


So this is the code. As you can see, the problem is with arrays and the pointers. I dont know where is the problem… in the MemCopyDeviceToHost??In the pointers??
In the syncronization?? I dont know. Please if anybody can help me, i will be very pleased, because i’m finishing this FCP (final career proyect) and i have this big problems that i cant understand. Thanks!

PD: it’s any problem if i dont use the CUT_EXIT call in the main program instead exit(0)??

I recommend running Valgrind;

  • Run it on the emulation mode to check in-kernel memory accesses
  • Run it on the device mode to see more information where and when it crashes

Bad memory accesses may crash either device mode OR emulation mode, but not necessarily both.

I’m running my program under Windows XP, so Valgrind can’t help me.

In emulation mode the memory access it’s ok, and in device mode no. So I need an answer to these problem

Are you sure?

You cannot use double on the current generation of GPUs, they will be demoted to float.