WHAT IS WRONG IN THIS KERNEL CODE?

Here is my kernel code. When I launch it, different problems appear (the computer locks up, CUDA errors such as too many resources, bad configuration, launch time exceeded, etc.). If you can see any fatal error, please tell me. Thanks!

#include <stdio.h>
#include "definidos.h"
#include <math.h>

__device__ unsigned int toolarge;
__device__ int alldone;
__device__ unsigned int donesofar;
__device__ float aveps;
__device__ float vareps;

__device__ int find_nearest(int n, unsigned int dim, float eps,
    unsigned int *local_toolarge, float *local_aveps, float *local_vareps,
    int *box, float *series, int *list, int b, unsigned int del,
    float varz, float rt, unsigned int theil, int id);

__device__ void make_box(float *ser, int *list, int *box, unsigned int l,
    unsigned int bs, unsigned int begin_col, unsigned int end_col,
    unsigned int dim, unsigned int del, float eps);

__device__ void localdim(int tambox, int ident, int numerohilos,
    int *begin_col, int *end_col);

__device__ void evaluate(int begin, int end, unsigned int dim, float epsilon,
    int *local_alldone, unsigned int *local_donesofar,
    unsigned int *local_toolarge, float *local_aveps, float *local_vareps,
    int *nearest, int *box, float *series, int *list, int b, unsigned int del,
    float varz, float rt, unsigned int theil, int id);

////////////////////////////////////////////////////////////////////////////////
/* make_box(): This function generates the box matrix. Parallel version for SM architectures.
 *
 * -Arguments:
 *  float *ser: Time series array.
 *  int *box: Box matrix.
 *  int *list: List array.
 *  unsigned int l: Number of points.
 *  unsigned int bs: Box size.
 *  unsigned int begin_col: First column (from rank assigned).
 *  unsigned int end_col: Last column (from rank assigned).
 *  unsigned int dim: Embedding dimension.
 *  unsigned int del: Delay of vectors.
 *  float eps: Value of epsilon.
 * -Return value:
 *  None.
 */
__device__
void make_box(float *ser, int *list, int *box, unsigned int l,
              unsigned int bs, unsigned int begin_col, unsigned int end_col,
              unsigned int dim, unsigned int del, float eps)
{
    int i, x, y;
    int ib = bs - 1;
    int d = (dim - 1) * del;

    for (x = begin_col; x <= end_col; x++)
        for (y = 0; y < bs; y++)
            box[y*bs + x] = -1;

    for (i = d; i < l; i++)
    {
        x = (int)(ser[i-d]/eps) & ib;
        if ((x >= begin_col) && (x <= end_col))
        {
            y = (int)(ser[i]/eps) & ib;
            list[i] = box[x*bs + y];
            box[x*bs + y] = i;
        }
    }
}

/* localdim(): This function assigns a range of points/columns to the current thread.
 *
 * -Arguments:
 *  int tambox: Number of points/columns.
 *  int ident: Thread identifier.
 *  int numerohilos: Number of threads.
 *  int *begin_col: First point/column for the current thread.
 *  int *end_col: Last point/column for the current thread.
 * -Return value:
 *  None.
 */
__device__
void localdim(int tambox, int ident, int numerohilos, int *begin_col, int *end_col)
{
    int quotient, rem;

    quotient = tambox / numerohilos;
    rem = tambox % numerohilos;

    if (rem == 0)
    {
        *begin_col = ident*quotient;
        *end_col = *begin_col + quotient;
    }
    else
    {
        if (ident < rem)
        {
            *begin_col = ident*(quotient+1);
            *end_col = *begin_col + quotient + 1;
        }
        else
        {
            *begin_col = rem*(quotient+1) + (ident-rem)*quotient;
            *end_col = *begin_col + quotient;
        }
    }
    *begin_col = *begin_col + 1;
}

/* evaluate(): This function resolves the part of the neighbour searching task assigned to the current thread.
 *
 * -Arguments:
 *  int begin: First point (from rank assigned).
 *  int end: Last point (from rank assigned).
 *  unsigned int dim: Embedding dimension.
 *  float epsilon: Value of epsilon.
 *  int *local_alldone: alldone local variable.
 *  unsigned int *local_donesofar: donesofar local variable.
 *  unsigned int *local_toolarge: toolarge local variable.
 *  float *local_aveps: aveps local variable.
 *  float *local_vareps: vareps local variable.
 * -Return value:
 *  None.
 */
__device__
void evaluate(int begin, int end, unsigned int dim, float epsilon,
    int *local_alldone, unsigned int *local_donesofar,
    unsigned int *local_toolarge, float *local_aveps, float *local_vareps,
    int *nearest, int *box, float *series, int *list, int b, unsigned int del,
    float varz, float rt, unsigned int theil, int id)
{
    int i;

    for (i = begin; i <= end; i++)
        if (!nearest[i])
        {
            nearest[i] = find_nearest(i, dim, epsilon,
                local_toolarge, local_aveps, local_vareps,
                box, series, list, b, del, varz, rt, theil, id);
            *local_alldone &= nearest[i];
            *local_donesofar += (unsigned int)nearest[i];
        }
}

/* find_nearest(): This function implements the neighbour searching task for the point n.
 *
 * -Arguments:
 *  int n: Point.
 *  unsigned int dim: Embedding dimension.
 *  float eps: Value of epsilon.
 *  unsigned int *local_toolarge: toolarge local variable.
 *  float *local_aveps: aveps local variable.
 *  float *local_vareps: vareps local variable.
 * -Return value:
 *  0: No nearest neighbour found (with distance < epsilon).
 *  1: Nearest neighbour found (with distance < epsilon).
 */
__device__
int find_nearest(int n, unsigned int dim, float eps,
    unsigned int *local_toolarge, float *local_aveps, float *local_vareps,
    int *box, float *series, int *list, int b, unsigned int del,
    float varz, float rt, unsigned int theil, int id)
{
    int x, y, x1, x2, y1, i, i1;
    int element, which = -1;
    float dx, maxdx, mindx = 1.1, factor;
    int ibox = b - 1;

    x = (int)(series[n-(dim-1)*del]/eps) & ibox;
    y = (int)(series[n]/eps) & ibox;
    for (x1 = x-1; x1 <= x+1; x1++)
    {
        x2 = x1 & ibox;
        for (y1 = y-1; y1 <= y+1; y1++)
        {
            element = box[x2*b + (y1 & ibox)];
            while (element != -1)
            {
                if (labs(element-n) > theil)
                {
                    maxdx = fabs(series[n]-series[element]);
                    for (i = 1; i < dim; i++)
                    {
                        i1 = i*del;
                        dx = fabs(series[n-i1]-series[element-i1]);
                        if (dx > maxdx)
                            maxdx = dx;
                    }
                    if ((maxdx < mindx) && (maxdx > 0.0))
                    {
                        which = element;
                        mindx = maxdx;
                    }
                }
                element = list[element];
            }
        }
    }

    if ((which != -1) && (mindx <= eps) && (mindx <= varz/rt))
    {
        *local_aveps += mindx;
        *local_vareps += mindx*mindx;
        factor = fabs(series[n+1]-series[which+1])/mindx;
        if (factor > rt)
            *local_toolarge += 1;
        return 1;
    }
    return 0;
}

__global__ void
kernel(int b, unsigned int mind, unsigned int maxd,
    unsigned int del, unsigned int len, unsigned int theil, float rt,
    float varz, int *cuda_list, int *cuda_nearest, int *cuda_box, float *cuda_series,
    unsigned int ver, float inter, int stdo, float *cuda_fracfnn, float *cuda_aveps,
    float *cuda_vareps)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;
    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int id = by*blockDim.x*blockDim.y*GRID_SIZE + bx*blockDim.x*blockDim.y + ty*blockDim.x + tx;

    float epsilon;
    int i;
    unsigned int dim;
    float eps0 = 1.0e-5;
    int firstdata, begin, end, begin_col, end_col;
    float local_aveps = 0.0, local_vareps = 0.0;
    unsigned int local_toolarge = 0, local_donesofar = 0;
    int local_alldone = 0;

    localdim(b, id, NHILOS, &begin_col, &end_col);
    begin_col -= 1;
    end_col -= 1;

    for (dim = mind; dim <= maxd; dim++) {
        epsilon = eps0;
        toolarge = 0;
        alldone = 0;
        donesofar = 0;
        aveps = 0.0;
        vareps = 0.0;

        firstdata = (dim-1)*del;
        localdim((len-firstdata), id, NHILOS, &begin, &end);
        begin = (begin - 1) + firstdata;
        end = (end - 1) + firstdata;
        if (id == (NHILOS-1))
            end = end - 1;
        __syncthreads();

        for (i = begin; i <= end; i++)
            cuda_nearest[i] = 0;

        while (!alldone && (epsilon < 2.*varz/rt))
        {
            local_alldone = 1;
            make_box(cuda_series, cuda_list, cuda_box, len-1, b, begin_col, end_col, dim, del, epsilon);
            if (id == (NHILOS-1))
            {
                alldone = 1;
            }
            __syncthreads();

            evaluate(begin, end, dim, epsilon, &local_alldone, &local_donesofar,
                     &local_toolarge, &local_aveps, &local_vareps,
                     cuda_nearest, cuda_box, cuda_series, cuda_list,
                     b, del, varz, rt, theil, id);

            alldone &= (int) local_alldone;
            donesofar += (unsigned int) local_donesofar;
            __syncthreads();

            epsilon *= sqrt(2.0);
            if (!donesofar)
                eps0 = epsilon;
            local_alldone = 0;
            local_donesofar = 0;
        }

        if (donesofar == 0)
        {
            //the thread should exit. I don't know how to do it
        }

        toolarge += (unsigned int) local_toolarge;
        aveps += local_aveps;
        vareps += local_vareps;

        if (id == NHILOS-1)
        {
            aveps *= (1./(float)donesofar);
            vareps *= (1./(float)donesofar);

            cuda_fracfnn[dim-mind] = (float)toolarge/(float)donesofar;
            cuda_aveps[dim-mind] = aveps;
            cuda_vareps[dim-mind] = vareps;
        }
        __syncthreads();

        local_toolarge = 0;
        local_aveps = 0.0;
        local_vareps = 0.0;
    }
}

The host code that launches this kernel is needed to detect the kind of errors you have mentioned. If you debug it, I doubt the kernel code is launching at all.

Can you reduce the number of threads/blocks for your kernel? Could you reduce the amount of work done in one thread? That is how you avoid the 5-second runtime limit.

Also, your code might not be using most of the GPGPU's power: too much branching and too many loops with different parameters.

Finally, you would help everyone if you extracted the smallest piece of code that reproduces your problem.
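
To give an idea, here is a standalone sketch of the kind of minimal test case that is easy for others to run (the dummy kernel and buffer here are made up for illustration, they are not your code): launch one small block and check the launch result explicitly afterwards.

#include <stdio.h>

// Stand-in kernel: the point is only the tiny launch configuration
// and the explicit error check afterwards, not the computation.
__global__ void dummy(int *out)
{
    out[threadIdx.x] = threadIdx.x;
}

int main(void)
{
    int *d_out;
    cudaMalloc((void **)&d_out, 32 * sizeof(int));

    dummy<<<1, 32>>>(d_out);                     // one block, 32 threads

    cudaError_t err = cudaThreadSynchronize();   // wait for the kernel and collect any error
    if (err != cudaSuccess)
        printf("Kernel failed: %s\n", cudaGetErrorString(err));

    cudaFree(d_out);
    return 0;
}

If something this small runs cleanly on your machine, grow it step by step (your real launch configuration, then pieces of the real kernel) until the error appears.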

Here is the host code that launches the kernel. One question: could the problem be with the device variables that are shared by all the launched threads? And if so, is there some mechanism to implement a mutex or something like that, which protects access to these device variables?

Here is the host code:

CUT_DEVICE_INIT();   // initialize the device

unsigned int timer = 0;
CUT_SAFE_CALL(cutCreateTimer(&timer));
CUT_SAFE_CALL(cutStartTimer(timer));

int* cuda_list;
int* cuda_nearest;
int* cuda_box;
float* cuda_series;

float* fracfnn;
float* aveps;
float* vareps;
float* cuda_fracfnn;
float* cuda_aveps;
float* cuda_vareps;

fracfnn = (float*)malloc(((maxdim-mindim)+1)*sizeof(float));
aveps = (float*)malloc(((maxdim-mindim)+1)*sizeof(float));
vareps = (float*)malloc(((maxdim-mindim)+1)*sizeof(float));

int sizeResults = ((maxdim-mindim)+1)*sizeof(float);
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_fracfnn, sizeResults));
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_aveps, sizeResults));
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_vareps, sizeResults));

int size = length*sizeof(int);
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_list, size));
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_nearest, size));

size = BOX*BOX*sizeof(int);
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_box, size));

size = length*sizeof(float);
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_series, size));

CUDA_SAFE_CALL(cudaMemcpy(cuda_series, series, size, cudaMemcpyHostToDevice));

dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(GRID_SIZE, GRID_SIZE);

kernel<<<dimGrid,dimBlock>>>(BOX, mindim, maxdim, delay, length, theiler, rt, varianz,
    cuda_list, cuda_nearest, cuda_box, cuda_series, verbosity, inter, stdo,
    cuda_fracfnn, cuda_aveps, cuda_vareps);

// check if kernel execution generated an error
CUT_CHECK_ERROR("Kernel execution failed");

CUDA_SAFE_CALL(cudaMemcpy(fracfnn, cuda_fracfnn, sizeResults, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(aveps, cuda_aveps, sizeResults, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(vareps, cuda_vareps, sizeResults, cudaMemcpyDeviceToHost));

int k;
for (k = 0; k <= (maxdim-mindim); k++)
{
    fprintf(stderr, "Start for dimension=%u\n", k+mindim);
    if (stdo)
    {
        fprintf(stdout, "%u %e %e %e\n", k+mindim, fracfnn[k], aveps[k], vareps[k]);
        fflush(stdout);
    }
    else
    {
        fprintf(file, "%u %e %e %e\n", k+mindim, fracfnn[k], aveps[k], vareps[k]);
        fflush(file);
    }
}

CUT_SAFE_CALL(cutStopTimer(timer));
printf("\n\nProcessing time: %f (ms)\n", cutGetTimerValue(timer));
CUT_SAFE_CALL(cutDeleteTimer(timer));

CUDA_SAFE_CALL(cudaFree(cuda_list));
CUDA_SAFE_CALL(cudaFree(cuda_box));
CUDA_SAFE_CALL(cudaFree(cuda_nearest));
CUDA_SAFE_CALL(cudaFree(cuda_series));
CUDA_SAFE_CALL(cudaFree(cuda_fracfnn));
CUDA_SAFE_CALL(cudaFree(cuda_aveps));
CUDA_SAFE_CALL(cudaFree(cuda_vareps));

Thanks

To implement a mutex you need atomic operations (check out the documentation). The only problem with device variables is that they are slow. If several threads write or read them at the same time the result is unpredictable (that is not a problem, that is expected behavior).
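
If it helps, here is a minimal sketch of what atomic updates to those __device__ globals could look like. This assumes a card with compute capability 1.1 or higher (needed for atomics on global memory), and it only covers the integer counters; atomicAdd on float needs newer hardware, so the float sums are not shown.

__device__ unsigned int toolarge;
__device__ unsigned int donesofar;
__device__ int alldone;

// Each thread folds its private counters into the globals with atomics,
// so concurrent updates from different threads are not lost. This mirrors
// the "toolarge += local_toolarge", "donesofar += local_donesofar" and
// "alldone &= local_alldone" lines of the posted kernel.
__device__ void accumulate(unsigned int local_toolarge,
                           unsigned int local_donesofar,
                           int local_alldone)
{
    atomicAdd(&toolarge,  local_toolarge);
    atomicAdd(&donesofar, local_donesofar);
    atomicAnd(&alldone,   local_alldone);
}

Atomics make the updates safe but they serialize, so it is usually faster to reduce inside each block first (in shared memory) and let one thread per block do the atomic update. A real mutex can be built with atomicCAS, but it is easy to deadlock inside a warp, so per-variable atomics are normally the safer choice.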