Hello. I run my cuda program in EmuDebug or EmuRelease mode and I have the expected results, but if I run my cuda program in Release mode (in debug mode i dont no why the program crashes–cuda kernel error but i cant see it) I have wrong results and they are very extranges (not a numbers, negatives, etc…). Here is the code in host and device:
template.cu:
int
main( int argc, char** argv)
{
...
...
CUT_DEVICE_INIT();
unsigned int timer = 0;
CUT_SAFE_CALL( cutCreateTimer( &timer));
CUT_SAFE_CALL( cutStartTimer( timer));
long* cuda_list;
long* cuda_nearest;
long* cuda_box;
double* cuda_series;
double* fracfnn;
double* cuda_fracfnn;
double* aveps;
double* vareps;
double* cuda_aveps;
double* cuda_vareps;
check_alloc(fracfnn = (double*)malloc(((maxdim-mindim)+1)*sizeof(double)));
check_alloc(aveps = (double*)malloc(((maxdim-mindim)+1)*sizeof(double)));
check_alloc(vareps = (double*)malloc(((maxdim-mindim)+1)*sizeof(double)));
int sizeResults= ((maxdim-mindim)+1)*sizeof(double);
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_fracfnn,sizeResults));
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_aveps,sizeResults));
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_vareps,sizeResults));
int size= length*sizeof(long);
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_list,size));
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_nearest,size));
size=BOX*BOX*sizeof(long);
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_box,size));
size=length*sizeof(double);
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_series, size));
//take series vector to cuda device
CUDA_SAFE_CALL(cudaMemcpy(cuda_series, series, size, cudaMemcpyHostToDevice));
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(GRID_SIZE, GRID_SIZE);
kernel<<<dimGrid,dimBlock>>>(BOX,mindim,maxdim,delay,length,theiler,rt,varianz,cuda_list,cuda_nearest,cuda_box,cuda_series,verbosity,inter,file,stdo,cuda_fracfnn,cuda_aveps,cuda_vareps);
// check if kernel execution generated and error
CUT_CHECK_ERROR("Kernel execution failed");
CUDA_SAFE_CALL(cudaMemcpy(fracfnn, cuda_fracfnn, sizeResults, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(aveps,cuda_aveps, sizeResults, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(vareps, cuda_vareps, sizeResults, cudaMemcpyDeviceToHost));
int k;
for(k=0;k<=(maxdim-mindim);k++)
{
fprintf(stderr,"Start for dimension=%u\n",k+mindim);
if(stdout)
{
fprintf(stdout,"%u %e %e %e\n",k+mindim,fracfnn[k],aveps[k],vareps[k]);
fflush(stdout);
}
else
{
fprintf(file,"%u %e %e %e\n",k+mindim,fracfnn[k],aveps[k],vareps[k]);
fflush(file);
}
}
CUT_SAFE_CALL( cutStopTimer( timer));
printf( "\n\nProcessing time: %f (ms)\n", cutGetTimerValue( timer));
CUT_SAFE_CALL( cutDeleteTimer( timer));
CUDA_SAFE_CALL(cudaFree(cuda_list));
CUDA_SAFE_CALL(cudaFree(cuda_box));
CUDA_SAFE_CALL(cudaFree(cuda_nearest));
CUDA_SAFE_CALL(cudaFree(cuda_series));
CUDA_SAFE_CALL(cudaFree(cuda_fracfnn));
CUDA_SAFE_CALL(cudaFree(cuda_aveps));
CUDA_SAFE_CALL(cudaFree(cuda_vareps));
if (!stdo)
fclose(file);
if (infile != NULL)
free(infile);
if (outfile != NULL)
free(outfile);
free(series);
getchar();
exit(0);
CUT_EXIT(argc, argv);
}
template_kernel.cu:
…
…
//GLOBAL VARIABLES
device unsigned long toolarge;
device long alldone;
device unsigned long donesofar;
device double aveps;
device double vareps;
device int find_nearest(long n,unsigned int dim,double eps,
unsigned long *local_toolarge,double *local_aveps,double *local_vareps,
long *box, double *series, long list, long b,unsigned int del,
double varz,double rt, unsigned long theil,int id);
device void make_box(double ser,long list, long box,unsigned long l,
unsigned int bs,unsigned int begin_col,unsigned int end_col,
unsigned int dim,unsigned int del,double eps);
device void localdim(int tambox ,int ident, int numerohilos,int begin_col, int end_col);
device void evaluate(int begin,int end,unsigned long dim,double epsilon,
long *local_alldone,unsigned long *local_donesofar,
unsigned long *local_toolarge,double *local_aveps,double *local_vareps,
long *nearest, long *box, double *series, long *list, long b,unsigned int del, double varz,double rt, unsigned long theil,int id);
////////////////////////////////////////////////////////////////////////////////
(IMPLEMENTED DEVICE FUNTIONS)
////////////////////////////////////////////////////////////////////////////////
global void
kernel(long b, unsigned int mind, unsigned int maxd,
unsigned int del, unsigned long len, unsigned long theil, double rt,
double varz, long cuda_list,long cuda_nearest,long cuda_box, double cuda_series, unsigned int ver,double inter,FILE* file,int stdo,double*cuda_fracfnn,double *cuda_aveps,
double *cuda_vareps)
{
cuda_file=file;
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
//unique id for each thread
int id = by*blockDim.x*blockDim.y*GRID_SIZE + bx *blockDim.x*blockDim.y+ty*blockDim.x+tx;
double epsilon;
long i;
unsigned int dim;
double eps0 = 1.0e-5;
int firstdata,begin,end,begin_col,end_col;
double local_aveps = 0.0,local_vareps = 0.0;
unsigned long local_toolarge = 0,local_donesofar = 0;
long local_alldone = 0;
localdim(b,id, NHILOS ,&begin_col,&end_col);
begin_col -= 1;
end_col -= 1;
for(dim = mind;dim <= maxd; dim++){
epsilon = eps0;
toolarge = 0;
alldone = 0;
donesofar = 0;
aveps = 0.0;
vareps = 0.0;
firstdata = (dim-1)*del;
localdim((len-firstdata),id,NHILOS,&begin,&end);
begin = (begin - 1) + firstdata;
end = (end - 1) + firstdata;
if (id == (NHILOS-1))
end = end - 1;
for (i=begin;i<=end;i++)
cuda_nearest[i] = 0;
while (!alldone && (epsilon < 2.*varz/rt))
{
local_alldone = 1;
make_box(cuda_series,cuda_list,cuda_box,len-1,b,begin_col,end_col,dim,del,epsilon);
if (id == (NHILOS-1)) //ver tema ultimo hilo
{
alldone = 1;
}
__syncthreads();
evaluate(begin, end, dim, epsilon,&local_alldone,&local_donesofar,&local_toolarge,&local_aveps,&local_vareps,cuda_nearest,cuda_box,cuda_series,cuda_list,b,del,varz,rt,theil,id);
alldone &= (long) local_alldone;
donesofar += (unsigned long) local_donesofar;
__syncthreads();
epsilon *=sqrt(2.0);
if (!donesofar)
eps0=epsilon;
local_alldone=0;
local_donesofar=0;
}
toolarge += (unsigned long) local_toolarge;
aveps += local_aveps;
vareps += local_vareps;
if (id==(NHILOS-1)) //ver tema ultimo hilo
{
aveps *= (1./(double)donesofar);
vareps *= (1./(double)donesofar);
cuda_fracfnn[dim-mind]=(double)toolarge/(double)donesofar;
cuda_aveps[dim-mind]=aveps;
cuda_vareps[dim-mind]=vareps;
}
__syncthreads();
local_toolarge = 0;
local_aveps = 0.0;
local_vareps = 0.0;
}
}
So this is the code. As you can see, the problem is with arrays and the pointers. I dont know where is the problem… in the MemCopyDeviceToHost??In the pointers??
In the syncronization?? I dont know. Please if anybody can help me, i will be very pleased, because i’m finishing this FCP (final career proyect) and i have this big problems that i cant understand. Thanks!
PD: it’s any problem if i dont use the CUT_EXIT call in the main program instead exit(0)??