Problem:Different results on EMULATION and RELEASE Problem on release and debug mode

rapdios · June 5, 2008, 11:15am

Hello. I run my cuda program in EmuDebug or EmuRelease mode and I have the expected results, but if I run my cuda program in Release mode (in debug mode i dont no why the program crashes–cuda kernel error but i cant see it) I have wrong results and they are very extranges (not a numbers, negatives, etc…). Here is the code in host and device:

template.cu:

int
main( int argc, char** argv)
{

...
    ...

CUT_DEVICE_INIT();
unsigned int timer = 0;
   CUT_SAFE_CALL( cutCreateTimer( &timer));
   CUT_SAFE_CALL( cutStartTimer( timer));

long* cuda_list;
long* cuda_nearest;
long* cuda_box;
double* cuda_series;

double* fracfnn;
double* cuda_fracfnn;
double* aveps;
double* vareps;
double* cuda_aveps;
double* cuda_vareps;

check_alloc(fracfnn = (double*)malloc(((maxdim-mindim)+1)*sizeof(double)));
check_alloc(aveps = (double*)malloc(((maxdim-mindim)+1)*sizeof(double)));
check_alloc(vareps = (double*)malloc(((maxdim-mindim)+1)*sizeof(double)));

int sizeResults= ((maxdim-mindim)+1)*sizeof(double);
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_fracfnn,sizeResults));
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_aveps,sizeResults));
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_vareps,sizeResults));

int size= length*sizeof(long);
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_list,size));
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_nearest,size));
size=BOX*BOX*sizeof(long);
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_box,size));
size=length*sizeof(double);
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_series, size));

//take series vector to cuda device
CUDA_SAFE_CALL(cudaMemcpy(cuda_series, series, size, cudaMemcpyHostToDevice));

dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(GRID_SIZE, GRID_SIZE);
kernel<<<dimGrid,dimBlock>>>(BOX,mindim,maxdim,delay,length,theiler,rt,varianz,cuda_list,cuda_nearest,cuda_box,cuda_series,verbosity,inter,file,stdo,cuda_fracfnn,cuda_aveps,cuda_vareps);
// check if kernel execution generated and error
   CUT_CHECK_ERROR("Kernel execution failed");

CUDA_SAFE_CALL(cudaMemcpy(fracfnn, cuda_fracfnn, sizeResults, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(aveps,cuda_aveps, sizeResults, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(vareps, cuda_vareps, sizeResults, cudaMemcpyDeviceToHost));

int k;
for(k=0;k<=(maxdim-mindim);k++)
{
	fprintf(stderr,"Start for dimension=%u\n",k+mindim);
	if(stdout)
	{
		fprintf(stdout,"%u %e %e %e\n",k+mindim,fracfnn[k],aveps[k],vareps[k]);
		fflush(stdout);
	}
	else
	{
		fprintf(file,"%u %e %e %e\n",k+mindim,fracfnn[k],aveps[k],vareps[k]);
		fflush(file);
	}
	
}

CUT_SAFE_CALL( cutStopTimer( timer));
printf( "\n\nProcessing time: %f (ms)\n", cutGetTimerValue( timer));
CUT_SAFE_CALL( cutDeleteTimer( timer));

CUDA_SAFE_CALL(cudaFree(cuda_list));
CUDA_SAFE_CALL(cudaFree(cuda_box));
CUDA_SAFE_CALL(cudaFree(cuda_nearest));
CUDA_SAFE_CALL(cudaFree(cuda_series));
CUDA_SAFE_CALL(cudaFree(cuda_fracfnn));
CUDA_SAFE_CALL(cudaFree(cuda_aveps));
CUDA_SAFE_CALL(cudaFree(cuda_vareps));

if (!stdo)
	fclose(file);

if (infile != NULL)
	free(infile);
if (outfile != NULL)
	free(outfile);
free(series);
getchar();
exit(0);
    CUT_EXIT(argc, argv);

}

template_kernel.cu:

…
…
//GLOBAL VARIABLES
device unsigned long toolarge;
device long alldone;
device unsigned long donesofar;
device double aveps;
device double vareps;

device int find_nearest(long n,unsigned int dim,double eps,
unsigned long *local_toolarge,double *local_aveps,double *local_vareps,
long *box, double *series, long list, long b,unsigned int del,
double varz,double rt, unsigned long theil,int id);
device void make_box(double ser,long list, long box,unsigned long l,
unsigned int bs,unsigned int begin_col,unsigned int end_col,
unsigned int dim,unsigned int del,double eps);
device void localdim(int tambox ,int ident, int numerohilos,int begin_col, int end_col);
device void evaluate(int begin,int end,unsigned long dim,double epsilon,
long *local_alldone,unsigned long *local_donesofar,
unsigned long *local_toolarge,double *local_aveps,double *local_vareps,
long *nearest, long *box, double *series, long *list, long b,unsigned int del, double varz,double rt, unsigned long theil,int id);

////////////////////////////////////////////////////////////////////////////////

(IMPLEMENTED DEVICE FUNTIONS)

////////////////////////////////////////////////////////////////////////////////

global void
kernel(long b, unsigned int mind, unsigned int maxd,
unsigned int del, unsigned long len, unsigned long theil, double rt,
double varz, long cuda_list,long cuda_nearest,long cuda_box, double cuda_series, unsigned int ver,double inter,FILE* file,int stdo,double*cuda_fracfnn,double *cuda_aveps,
double *cuda_vareps)
{
cuda_file=file;

// Block index
int bx = blockIdx.x;
int by = blockIdx.y;

// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;

//unique id for each thread
int id = by*blockDim.x*blockDim.y*GRID_SIZE + bx *blockDim.x*blockDim.y+ty*blockDim.x+tx;

double epsilon;
long i;
unsigned int dim;
double eps0 = 1.0e-5;
int firstdata,begin,end,begin_col,end_col; 
double local_aveps = 0.0,local_vareps = 0.0;
unsigned long local_toolarge = 0,local_donesofar = 0;
long local_alldone = 0;
localdim(b,id, NHILOS ,&begin_col,&end_col);
begin_col -= 1;
end_col -= 1;

for(dim = mind;dim <= maxd; dim++){
	epsilon = eps0;
	toolarge = 0;
	alldone = 0;
	donesofar = 0;
	aveps = 0.0;
	vareps = 0.0;
	firstdata = (dim-1)*del;
  	        localdim((len-firstdata),id,NHILOS,&begin,&end);
	begin = (begin - 1) + firstdata;
            end = (end - 1) + firstdata;

	if (id == (NHILOS-1))
    		end = end - 1;

	for (i=begin;i<=end;i++)
  			cuda_nearest[i] = 0;

	while (!alldone && (epsilon < 2.*varz/rt)) 
	{
		local_alldone = 1;

make_box(cuda_series,cuda_list,cuda_box,len-1,b,begin_col,end_col,dim,del,epsilon);

		if (id == (NHILOS-1)) //ver tema ultimo hilo
		{
			alldone = 1;
		}
		__syncthreads();

		evaluate(begin, end, dim, epsilon,&local_alldone,&local_donesofar,&local_toolarge,&local_aveps,&local_vareps,cuda_nearest,cuda_box,cuda_series,cuda_list,b,del,varz,rt,theil,id);
		
		alldone &= (long) local_alldone;
		donesofar += (unsigned long) local_donesofar;	
		__syncthreads();
		
		epsilon *=sqrt(2.0);
		if (!donesofar)
			eps0=epsilon;
		local_alldone=0;
		local_donesofar=0;
	}
	toolarge += (unsigned long) local_toolarge;
 	aveps += local_aveps;
 	vareps += local_vareps;
	if (id==(NHILOS-1)) //ver tema ultimo hilo
	{
		aveps *= (1./(double)donesofar);
		vareps *= (1./(double)donesofar);
		
		cuda_fracfnn[dim-mind]=(double)toolarge/(double)donesofar;
		cuda_aveps[dim-mind]=aveps;
		cuda_vareps[dim-mind]=vareps;
		
	}
	__syncthreads();
	local_toolarge = 0;
	local_aveps = 0.0;
	local_vareps = 0.0;
}

}

So this is the code. As you can see, the problem is with arrays and the pointers. I dont know where is the problem… in the MemCopyDeviceToHost??In the pointers??
In the syncronization?? I dont know. Please if anybody can help me, i will be very pleased, because i’m finishing this FCP (final career proyect) and i have this big problems that i cant understand. Thanks!

PD: it’s any problem if i dont use the CUT_EXIT call in the main program instead exit(0)??

kristleifur · June 5, 2008, 12:34pm

I recommend running Valgrind;

Run it on the emulation mode to check in-kernel memory accesses
Run it on the device mode to see more information where and when it crashes

Bad memory accesses may crash either device mode OR emulation mode, but not necessarily both.

rapdios · June 5, 2008, 1:51pm

I’m running my program under Windows XP, so Valgrind can’t help me.

In emulation mode the memory access it’s ok, and in device mode no. So I need an answer to these problem

kristleifur · June 5, 2008, 3:31pm

Are you sure?

mfatica · June 5, 2008, 3:45pm

You cannot use double on the current generation of GPUs, they will be demoted to float.

Topic		Replies	Views
Different results on device and Emulation mode CUDA Programming and Performance	5	3548	February 5, 2009
CUDA Release vs. Debug CUDA Programming and Performance	10	5176	May 13, 2010
WHAT IT'S WRONG IN THIS KERNEL CODE? CUDA Programming and Performance	4	1952	June 17, 2008
CUDA Toolkit 3.0 update GPU HW debugging tools to replace device emulation CUDA Programming and Performance	44	29438	April 29, 2010
SDK Parallel Reduction Help, different result between EmuDebug and Release... CUDA Programming and Performance	5	5764	March 1, 2009
Help with strange error CUDA Programming and Performance	8	2095	February 25, 2010
CUDA 2.1 Beta Problem/Bugs (Linux) CUDA Programming and Performance	5	1647	January 6, 2009
CUDA code doesn't run on Release mode CUDA Programming and Performance	3	4928	November 21, 2008
Correct output with emulation mode, wrong with GPU/Execution CUDA Programming and Performance	6	3324	March 25, 2010
Weird result difference between release and debug even with -fmad=false CUDA Programming and Performance cuda	8	645	June 30, 2022

Problem:Different results on EMULATION and RELEASE Problem on release and debug mode

Related topics