WHAT IS WRONG IN THIS KERNEL CODE?

Here is my kernel code. When I launch it, different problems appear (the computer locks up, CUDA errors such as too many resources, bad configuration, launch time exceeded, etc.). If you can see any fatal error, please tell me. Thanks!

#include <stdio.h>
#include "definidos.h"
#include <math.h>

__device__ unsigned int toolarge;
__device__ int alldone;
__device__ unsigned int donesofar;
__device__ float aveps;
__device__ float vareps;

__device__ int find_nearest(int n, unsigned int dim, float eps,
    unsigned int *local_toolarge, float *local_aveps, float *local_vareps,
    int *box, float *series, int *list, int b, unsigned int del,
    float varz, float rt, unsigned int theil, int id);

__device__ void make_box(float *ser, int *list, int *box, unsigned int l,
    unsigned int bs, unsigned int begin_col, unsigned int end_col,
    unsigned int dim, unsigned int del, float eps);

__device__ void localdim(int tambox, int ident, int numerohilos,
    int *begin_col, int *end_col);

__device__ void evaluate(int begin, int end, unsigned int dim, float epsilon,
    int *local_alldone, unsigned int *local_donesofar,
    unsigned int *local_toolarge, float *local_aveps, float *local_vareps,
    int *nearest, int *box, float *series, int *list, int b, unsigned int del,
    float varz, float rt, unsigned int theil, int id);

////////////////////////////////////////////////////////////////////////////////
/* make_box(): This function generates the box matrix. Parallel version for SM architectures.
 *
 * -Arguments:
 *  float *ser: Time series array.
 *  int *box: Box matrix.
 *  int *list: List array.
 *  unsigned int l: Number of points.
 *  unsigned int bs: Box size.
 *  unsigned int begin_col: First column (from rank assigned).
 *  unsigned int end_col: Last column (from rank assigned).
 *  unsigned int dim: Embedding dimension.
 *  unsigned int del: Delay of vectors.
 *  float eps: Value of epsilon.
 * -Return value:
 *  None.
 */
__device__
void make_box(float *ser, int *list, int *box, unsigned int l,
              unsigned int bs, unsigned int begin_col, unsigned int end_col,
              unsigned int dim, unsigned int del, float eps)
{
    int i, x, y;
    int ib = bs - 1;
    int d = (dim - 1) * del;

    for (x = begin_col; x <= end_col; x++)
        for (y = 0; y < bs; y++)
            box[y*bs + x] = -1;

    for (i = d; i < l; i++)
    {
        x = (int)(ser[i-d]/eps) & ib;
        if ((x >= begin_col) && (x <= end_col))
        {
            y = (int)(ser[i]/eps) & ib;
            list[i] = box[x*bs + y];
            box[x*bs + y] = i;
        }
    }
}

/* localdim(): This function assigns a range of points/columns to the current thread.
 *
 * -Arguments:
 *  int tambox: Number of points/columns.
 *  int ident: Thread identifier.
 *  int numerohilos: Number of threads.
 *  int *begin_col: First point/column for the current thread.
 *  int *end_col: Last point/column for the current thread.
 * -Return value:
 *  None.
 */
__device__
void localdim(int tambox, int ident, int numerohilos, int *begin_col, int *end_col)
{
    int quotient, rem;

    quotient = tambox / numerohilos;
    rem = tambox % numerohilos;

    if (rem == 0)
    {
        *begin_col = ident*quotient;
        *end_col = *begin_col + quotient;
    }
    else
    {
        if (ident < rem)
        {
            *begin_col = ident*(quotient+1);
            *end_col = *begin_col + quotient + 1;
        }
        else
        {
            *begin_col = rem*(quotient+1) + (ident-rem)*quotient;
            *end_col = *begin_col + quotient;
        }
    }
    *begin_col = *begin_col + 1;
}

/* evaluate(): This function resolves the part of the neighbour searching task assigned to the current thread.
 *
 * -Arguments:
 *  int begin: First point (from rank assigned).
 *  int end: Last point (from rank assigned).
 *  unsigned int dim: Embedding dimension.
 *  float epsilon: Value of epsilon.
 *  int *local_alldone: alldone local variable.
 *  unsigned int *local_donesofar: donesofar local variable.
 *  unsigned int *local_toolarge: toolarge local variable.
 *  float *local_aveps: aveps local variable.
 *  float *local_vareps: vareps local variable.
 * -Return value:
 *  None.
 */
__device__
void evaluate(int begin, int end, unsigned int dim, float epsilon,
    int *local_alldone, unsigned int *local_donesofar,
    unsigned int *local_toolarge, float *local_aveps, float *local_vareps,
    int *nearest, int *box, float *series, int *list, int b, unsigned int del,
    float varz, float rt, unsigned int theil, int id)
{
    int i;

    for (i = begin; i <= end; i++)
        if (!nearest[i])
        {
            nearest[i] = find_nearest(i, dim, epsilon,
                local_toolarge, local_aveps, local_vareps,
                box, series, list, b, del, varz, rt, theil, id);
            *local_alldone &= nearest[i];
            *local_donesofar += (unsigned int)nearest[i];
        }
}

/* find_nearest(): This function implements the neighbour searching task for the point n.
 *
 * -Arguments:
 *  int n: Point.
 *  unsigned int dim: Embedding dimension.
 *  float eps: Value of epsilon.
 *  unsigned int *local_toolarge: toolarge local variable.
 *  float *local_aveps: aveps local variable.
 *  float *local_vareps: vareps local variable.
 * -Return value:
 *  0: No nearest neighbour found (with distance < epsilon).
 *  1: Nearest neighbour found (with distance < epsilon).
 */
__device__
int find_nearest(int n, unsigned int dim, float eps,
    unsigned int *local_toolarge, float *local_aveps, float *local_vareps,
    int *box, float *series, int *list, int b, unsigned int del,
    float varz, float rt, unsigned int theil, int id)
{
    int x, y, x1, x2, y1, i, i1;
    int element, which = -1;
    float dx, maxdx, mindx = 1.1, factor;
    int ibox = b - 1;

    x = (int)(series[n-(dim-1)*del]/eps) & ibox;
    y = (int)(series[n]/eps) & ibox;
    for (x1 = x-1; x1 <= x+1; x1++)
    {
        x2 = x1 & ibox;
        for (y1 = y-1; y1 <= y+1; y1++)
        {
            element = box[x2*b + (y1 & ibox)];
            while (element != -1)
            {
                if (labs(element-n) > theil)
                {
                    maxdx = fabs(series[n]-series[element]);
                    for (i = 1; i < dim; i++)
                    {
                        i1 = i*del;
                        dx = fabs(series[n-i1]-series[element-i1]);
                        if (dx > maxdx)
                            maxdx = dx;
                    }
                    if ((maxdx < mindx) && (maxdx > 0.0))
                    {
                        which = element;
                        mindx = maxdx;
                    }
                }
                element = list[element];
            }
        }
    }

    if ((which != -1) && (mindx <= eps) && (mindx <= varz/rt))
    {
        *local_aveps += mindx;
        *local_vareps += mindx*mindx;
        factor = fabs(series[n+1]-series[which+1])/mindx;
        if (factor > rt)
            *local_toolarge += 1;
        return 1;
    }
    return 0;
}

__global__ void
kernel(int b, unsigned int mind, unsigned int maxd,
    unsigned int del, unsigned int len, unsigned int theil, float rt,
    float varz, int *cuda_list, int *cuda_nearest, int *cuda_box, float *cuda_series,
    unsigned int ver, float inter, int stdo, float *cuda_fracfnn, float *cuda_aveps,
    float *cuda_vareps)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;
    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int id = by*blockDim.x*blockDim.y*GRID_SIZE + bx*blockDim.x*blockDim.y + ty*blockDim.x + tx;

    float epsilon;
    int i;
    unsigned int dim;
    float eps0 = 1.0e-5;
    int firstdata, begin, end, begin_col, end_col;
    float local_aveps = 0.0, local_vareps = 0.0;
    unsigned int local_toolarge = 0, local_donesofar = 0;
    int local_alldone = 0;

    localdim(b, id, NHILOS, &begin_col, &end_col);
    begin_col -= 1;
    end_col -= 1;

    for (dim = mind; dim <= maxd; dim++) {
        epsilon = eps0;
        toolarge = 0;
        alldone = 0;
        donesofar = 0;
        aveps = 0.0;
        vareps = 0.0;

        firstdata = (dim-1)*del;
        localdim((len-firstdata), id, NHILOS, &begin, &end);
        begin = (begin - 1) + firstdata;
        end = (end - 1) + firstdata;
        if (id == (NHILOS-1))
            end = end - 1;
        __syncthreads();

        for (i = begin; i <= end; i++)
            cuda_nearest[i] = 0;

        while (!alldone && (epsilon < 2.*varz/rt))
        {
            local_alldone = 1;
            make_box(cuda_series, cuda_list, cuda_box, len-1, b, begin_col, end_col, dim, del, epsilon);
            if (id == (NHILOS-1))
            {
                alldone = 1;
            }
            __syncthreads();

            evaluate(begin, end, dim, epsilon, &local_alldone, &local_donesofar,
                     &local_toolarge, &local_aveps, &local_vareps,
                     cuda_nearest, cuda_box, cuda_series, cuda_list,
                     b, del, varz, rt, theil, id);

            alldone &= (int) local_alldone;
            donesofar += (unsigned int) local_donesofar;
            __syncthreads();

            epsilon *= sqrt(2.0);
            if (!donesofar)
                eps0 = epsilon;
            local_alldone = 0;
            local_donesofar = 0;
        }

        if (donesofar == 0)
        {
            //the thread should exit. I don't know how to do it
        }

        toolarge += (unsigned int) local_toolarge;
        aveps += local_aveps;
        vareps += local_vareps;

        if (id == NHILOS-1)
        {
            aveps *= (1./(float)donesofar);
            vareps *= (1./(float)donesofar);

            cuda_fracfnn[dim-mind] = (float)toolarge/(float)donesofar;
            cuda_aveps[dim-mind] = aveps;
            cuda_vareps[dim-mind] = vareps;
        }
        __syncthreads();

        local_toolarge = 0;
        local_aveps = 0.0;
        local_vareps = 0.0;
    }
}

The host code that launches this kernel is needed to detect the kind of errors you have mentioned. If you debug it, I doubt the kernel code is launching at all.

Can you reduce the number of threads/blocks for your kernel? Could you reduce the amount of work done in one thread? That is how you avoid the 5-second runtime limit.

Also, your code might not be using most of the GPGPU's power: too much branching and too many loops with different parameters.

Finally, you would help everyone if you extracted the smallest piece of code that reproduces your problem.
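
To give an idea, here is a standalone sketch of the kind of minimal test case that is easy for others to run (the dummy kernel and buffer here are made up for illustration, they are not your code): launch one small block and check the launch result explicitly afterwards.

#include <stdio.h>

// Stand-in kernel: the point is only the tiny launch configuration
// and the explicit error check afterwards, not the computation.
__global__ void dummy(int *out)
{
    out[threadIdx.x] = threadIdx.x;
}

int main(void)
{
    int *d_out;
    cudaMalloc((void **)&d_out, 32 * sizeof(int));

    dummy<<<1, 32>>>(d_out);                     // one block, 32 threads

    cudaError_t err = cudaThreadSynchronize();   // wait for the kernel and collect any error
    if (err != cudaSuccess)
        printf("Kernel failed: %s\n", cudaGetErrorString(err));

    cudaFree(d_out);
    return 0;
}

If something this small runs cleanly on your machine, grow it step by step (your real launch configuration, then pieces of the real kernel) until the error appears.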

Here is the host code that launches the kernel. One question: could the problem be with the device variables that are shared by all the launched threads? And if so, is there some mechanism to implement a mutex or something like that, which protects access to these device variables?

Here is the host code:

CUT_DEVICE_INIT();   // initialize the device

unsigned int timer = 0;
CUT_SAFE_CALL(cutCreateTimer(&timer));
CUT_SAFE_CALL(cutStartTimer(timer));

int* cuda_list;
int* cuda_nearest;
int* cuda_box;
float* cuda_series;

float* fracfnn;
float* aveps;
float* vareps;
float* cuda_fracfnn;
float* cuda_aveps;
float* cuda_vareps;

fracfnn = (float*)malloc(((maxdim-mindim)+1)*sizeof(float));
aveps = (float*)malloc(((maxdim-mindim)+1)*sizeof(float));
vareps = (float*)malloc(((maxdim-mindim)+1)*sizeof(float));

int sizeResults = ((maxdim-mindim)+1)*sizeof(float);
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_fracfnn, sizeResults));
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_aveps, sizeResults));
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_vareps, sizeResults));

int size = length*sizeof(int);
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_list, size));
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_nearest, size));

size = BOX*BOX*sizeof(int);
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_box, size));

size = length*sizeof(float);
CUDA_SAFE_CALL(cudaMalloc((void**) &cuda_series, size));

CUDA_SAFE_CALL(cudaMemcpy(cuda_series, series, size, cudaMemcpyHostToDevice));

dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(GRID_SIZE, GRID_SIZE);

kernel<<<dimGrid,dimBlock>>>(BOX, mindim, maxdim, delay, length, theiler, rt, varianz,
    cuda_list, cuda_nearest, cuda_box, cuda_series, verbosity, inter, stdo,
    cuda_fracfnn, cuda_aveps, cuda_vareps);

// check if kernel execution generated an error
CUT_CHECK_ERROR("Kernel execution failed");

CUDA_SAFE_CALL(cudaMemcpy(fracfnn, cuda_fracfnn, sizeResults, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(aveps, cuda_aveps, sizeResults, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(vareps, cuda_vareps, sizeResults, cudaMemcpyDeviceToHost));

int k;
for (k = 0; k <= (maxdim-mindim); k++)
{
    fprintf(stderr, "Start for dimension=%u\n", k+mindim);
    if (stdo)
    {
        fprintf(stdout, "%u %e %e %e\n", k+mindim, fracfnn[k], aveps[k], vareps[k]);
        fflush(stdout);
    }
    else
    {
        fprintf(file, "%u %e %e %e\n", k+mindim, fracfnn[k], aveps[k], vareps[k]);
        fflush(file);
    }
}

CUT_SAFE_CALL(cutStopTimer(timer));
printf("\n\nProcessing time: %f (ms)\n", cutGetTimerValue(timer));
CUT_SAFE_CALL(cutDeleteTimer(timer));

CUDA_SAFE_CALL(cudaFree(cuda_list));
CUDA_SAFE_CALL(cudaFree(cuda_box));
CUDA_SAFE_CALL(cudaFree(cuda_nearest));
CUDA_SAFE_CALL(cudaFree(cuda_series));
CUDA_SAFE_CALL(cudaFree(cuda_fracfnn));
CUDA_SAFE_CALL(cudaFree(cuda_aveps));
CUDA_SAFE_CALL(cudaFree(cuda_vareps));

Thanks

To implement a mutex you need atomic operations (check out the documentation). The only problem with device variables is that they are slow. If several threads write or read them at the same time the result is unpredictable (that is not a problem, that is expected behavior).
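
If it helps, here is a minimal sketch of what atomic updates to those __device__ globals could look like. This assumes a card with compute capability 1.1 or higher (needed for atomics on global memory), and it only covers the integer counters; atomicAdd on float needs newer hardware, so the float sums are not shown.

__device__ unsigned int toolarge;
__device__ unsigned int donesofar;
__device__ int alldone;

// Each thread folds its private counters into the globals with atomics,
// so concurrent updates from different threads are not lost. This mirrors
// the "toolarge += local_toolarge", "donesofar += local_donesofar" and
// "alldone &= local_alldone" lines of the posted kernel.
__device__ void accumulate(unsigned int local_toolarge,
                           unsigned int local_donesofar,
                           int local_alldone)
{
    atomicAdd(&toolarge,  local_toolarge);
    atomicAdd(&donesofar, local_donesofar);
    atomicAnd(&alldone,   local_alldone);
}

Atomics make the updates safe but they serialize, so it is usually faster to reduce inside each block first (in shared memory) and let one thread per block do the atomic update. A real mutex can be built with atomicCAS, but it is easy to deadlock inside a warp, so per-variable atomics are normally the safer choice.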