Weird Kernel behaveiur loosing variable values

EDIT: after lowering the Block_size from 512 threads to 128 threads, now the variables values do not get changed, does someone know why this happens?

hello,

i’ve been like 2 days tryin to figure out what is happenning in one of my kernels, at some point of the algorithm i just loose the values of my variables.

its the triangle triangle intersection algorithm from mollër HERE

i redefined all his functions to “device”. here is the code

#include <math.h>

/* some macros */

__device__ inline void CROSS(float dest[3], float v1[3], float v2[3]){

			  dest[0]=v1[1]*v2[2]-v1[2]*v2[1];

			  dest[1]=v1[2]*v2[0]-v1[0]*v2[2];

			  dest[2]=v1[0]*v2[1]-v1[1]*v2[0];

}

__device__ inline float DOT(float v1[3],float v2[3]){

	return  (v1[0]*v2[0]+v1[1]*v2[1]+v1[2]*v2[2]);

}

__device__ inline void SUB(float dest[3],float v1[3],float v2[3]){

			dest[0]=v1[0]-v2[0];

			dest[1]=v1[1]-v2[1];

			dest[2]=v1[2]-v2[2];

}

/* sort so that a<=b */

__device__ inline void SORT(float a[2])  {

	 if(a[0]>a[1]){

	   float c;

	   c=a[0];

	   a[0]=a[1];

	   a[1]=c;

	 }

}

/* this edge to edge test is based on Franlin Antonio's gem:

   "Faster Line Segment Intersection", in Graphics Gems III,

   pp. 199-202 */

__device__ inline int EDGE_EDGE_TEST(float V0[3],float U0[3],float U1[3], short i0, short i1, float Ax, float Ay){

	float Bx, By, Cx, Cy, e, d, f;

	Bx=U0[i0]-U1[i0];

	By=U0[i1]-U1[i1];

	Cx=V0[i0]-U0[i0];

	Cy=V0[i1]-U0[i1];

	f=Ay*Bx-Ax*By;

	d=By*Cx-Bx*Cy;

	if((f>0 && d>=0 && d<=f) || (f<0 && d<=0 && d>=f))

	{

		e=Ax*Cy-Ay*Cx;

		if(f>0)

		{

		  if(e>=0 && e<=f) return 1;

		}

		else

		{

		  if(e<=0 && e>=f) return 1;

		}

	}

	return 0;

}

__device__ inline int EDGE_AGAINST_TRI_EDGES(float V0[3],float V1[3],float U0[3],float U1[3],float U2[3], short i0, short i1) {

  float Ax,Ay;

  Ax=V1[i0]-V0[i0];

  Ay=V1[i1]-V0[i1];

  if( EDGE_EDGE_TEST(V0,U0,U1, i0, i1, Ax, Ay) == 1 ){ return 1;}

  if( EDGE_EDGE_TEST(V0,U1,U2, i0, i1, Ax, Ay) == 1 ){ return 1;}

  if( EDGE_EDGE_TEST(V0,U2,U0, i0, i1, Ax, Ay) == 1 ){ return 1;}

  return 0;

}

__device__ inline int POINT_IN_TRI(float V0[3], float U0[3], float U1[3], float U2[3], short i0, short i1){

  float a,b,c,d0,d1,d2;

  a=U1[i1]-U0[i1];

  b=-(U1[i0]-U0[i0]);

  c=-a*U0[i0]-b*U0[i1];

  d0=a*V0[i0]+b*V0[i1]+c;

a=U2[i1]-U1[i1];

  b=-(U2[i0]-U1[i0]);

  c=-a*U1[i0]-b*U1[i1];

  d1=a*V0[i0]+b*V0[i1]+c;

a=U0[i1]-U2[i1];

  b=-(U0[i0]-U2[i0]);

  c=-a*U2[i0]-b*U2[i1];

  d2=a*V0[i0]+b*V0[i1]+c;

  if(d0*d1>0.0f)

  {

	if(d0*d2>0.0f) return 1;

  }

  return 0;

}

__device__ int coplanar_tri_tri(float N[3],float V0[3],float V1[3],float V2[3],

					 float U0[3],float U1[3],float U2[3]){

	float A[3];

	short i0,i1;

	/* first project onto an axis-aligned plane, that maximizes the area */

	/* of the triangles, compute indices: i0,i1. */

	A[0]=fabs(N[0]);

	A[1]=fabs(N[1]);

	A[2]=fabs(N[2]);

	if(A[0]>A[1]){

		if(A[0]>A[2]){

			i0=1;	  /* A[0] is greatest */

			i1=2;

		}

		else{

			i0=0;	  /* A[2] is greatest */

			i1=1;

		}

	}

	else{  /* A[0]<=A[1] */

		if(A[2]>A[1]){

			i0=0;	  /* A[2] is greatest */

			i1=1;

		}

		else{

			i0=0;	  /* A[1] is greatest */

			i1=2;

		}

	}

	/* test all edges of triangle 1 against the edges of triangle 2 */

	if( EDGE_AGAINST_TRI_EDGES(V0,V1,U0,U1,U2, i0, i1) == 1 ){ return 1;}

	if( EDGE_AGAINST_TRI_EDGES(V1,V2,U0,U1,U2, i0, i1) == 1 ){ return 1;}

	if( EDGE_AGAINST_TRI_EDGES(V2,V0,U0,U1,U2, i0, i1) == 1 ){ return 1;}

	/* finally, test if tri1 is totally contained in tri2 or vice versa */

	if( POINT_IN_TRI(V0,U0,U1,U2, i0, i1) == 1 ){ return 1;}

	if( POINT_IN_TRI(U0,V0,V1,V2, i0, i1) == 1 ){ return 1;}

	return 0;

}

__device__ inline int  NEWCOMPUTE_INTERVALS(float N1[3], float V0[3], float V1[3], float V2[3], float U0[3], float U1[3], float U2[3],

									float VV0,float VV1,float VV2,float D0,float D1,float D2,float D0D1,float D0D2,float A[1],float B[1],float C[1],float X0[1],float X1[1] ){

		if(D0D1>0.0f){

				A[0]=VV2; B[0]=(VV0-VV2)*D2; C[0]=(VV1-VV2)*D2; X0[0]=D2-D0; X1[0]=D2-D1;

		}

		else{

			if(D0D2>0.0f){

				A[0]=VV1; B[0]=(VV0-VV1)*D1; C[0]=(VV2-VV1)*D1; X0[0]=D1-D0; X1[0]=D1-D2;

			}

			else {

				if(D1*D2>0.0f || D0!=0.0f){

						A[0]=VV0; B[0]=(VV1-VV0)*D0; C[0]=(VV2-VV0)*D0; X0[0]=D0-D1; X1[0]=D0-D2;

				}

				else {

					if(D1!=0.0f){

							A[0]=VV1; B[0]=(VV0-VV1)*D1; C[0]=(VV2-VV1)*D1; X0[0]=D1-D0; X1[0]=D1-D2;

					}

					else{

						if(D2!=0.0f){

								A[0]=VV2; B[0]=(VV0-VV2)*D2; C[0]=(VV1-VV2)*D2; X0[0]=D2-D0; X1[0]=D2-D1;

						}

						else{

								return coplanar_tri_tri(N1,V0,V1,V2,U0,U1,U2);

						}

					}

				}

			}

		}

		return 0;

}

__device__ inline int  NoDivTriTriIsect(float V0[3],float V1[3],float V2[3],

								 float U0[3],float U1[3],float U2[3]){

	int USE_EPSILON_TEST = 1;

	float EPSILON = 0.00001f;

	float E1[3],E2[3];

	float N1[3],N2[3],d1,d2;

	float du0,du1,du2,dv0,dv1,dv2;

	float D[3];

	float isect1[2], isect2[2];

	float du0du1,du0du2,dv0dv1,dv0dv2;

	int index;

	float vp0,vp1,vp2;

	float up0,up1,up2;

	float bb,cc,max;

		/* compute interval for triangle 1 */

	float a,b,c,x0,x1;

	/* compute interval for triangle 2 */

	float d,e,f,y0,y1;

	/* compute plane equation of triangle(V0,V1,V2) */

	SUB(E1,V1,V0);

	SUB(E2,V2,V0);

	CROSS(N1,E1,E2);

	//if( N1[0] == 0.0f && N1[1] == 0.0f && N1[2] == 1.0f ){ return 1;}

	d1=-DOT(N1,V0);

	/* plane equation 1: N1.X+d1=0 */

	/* put U0,U1,U2 into plane equation 1 to compute signed distances to the plane*/

	du0=DOT(N1,U0)+d1;

	du1=DOT(N1,U1)+d1;

	du2=DOT(N1,U2)+d1;

	/* coplanarity robustness check */

	//if( du0 == 1.0f && du1 == -1.0f && du2 == 0.0f){ return 1; }

//-------------------------------------------------------------------------------------------------------HERE-------------------------------------------------------------------------------------

	if( USE_EPSILON_TEST == 1){						   

		if(fabs(du0)<EPSILON){

			 du0=0.0f;

		}

		if(fabs(du1)<EPSILON){

			 du1=0.0f;

		}

		if(fabs(du2)<EPSILON){

			 du2=0.0f;

		}

	}

	if( du0 == 1.0f && du1 == -1.0f && du2 == 0.0f){ return 1; }

	du0du1=du0*du1;

	du0du2=du0*du2;

	if(du0du1>0.0f && du0du2>0.0f) /* same sign on all of them + not equal 0 ? */

	return 0;					/* no intersection occurs */

	/* compute plane of triangle (U0,U1,U2) */

	SUB(E1,U1,U0);

	SUB(E2,U2,U0);

	CROSS(N2,E1,E2);

	d2=-DOT(N2,U0);

	/* plane equation 2: N2.X+d2=0 */

	/* put V0,V1,V2 into plane equation 2 */

	dv0=DOT(N2,V0)+d2;

	dv1=DOT(N2,V1)+d2;

	dv2=DOT(N2,V2)+d2;

	if( USE_EPSILON_TEST == 1 ){

		if(fabs(dv0)<EPSILON){

			dv0=0.0f;

		}

		if(fabs(dv1)<EPSILON){

			dv1=0.0f;

		}

		if(fabs(dv2)<EPSILON){

			dv2=0.0f;

		}

	}

	dv0dv1=dv0*dv1;

	dv0dv2=dv0*dv2;

	if(dv0dv1>0.0f && dv0dv2>0.0f) /* same sign on all of them + not equal 0 ? */

	return 0;					/* no intersection occurs */

	/* compute direction of intersection line */

	CROSS(D,N1,N2);

	/* compute and index to the largest component of D */

	max=(float)fabs(D[0]);

	index=0;

	bb=(float)fabs(D[1]);

	cc=(float)fabs(D[2]);

	if(bb>max) max=bb,index=1;

	if(cc>max) max=cc,index=2;

	/* this is the simplified projection onto L*/

	vp0=V0[index];

	vp1=V1[index];

	vp2=V2[index];

	up0=U0[index];

	up1=U1[index];

	up2=U2[index];

	if( NEWCOMPUTE_INTERVALS(N1, V0, V1, V2, U0, U1, U2, vp0,vp1,vp2,dv0,dv1,dv2,dv0dv1,dv0dv2,&a,&b,&c,&x0,&x1)  ){ return 1;}

	if( NEWCOMPUTE_INTERVALS(N1, V0, V1, V2, U0, U1, U2, up0,up1,up2,du0,du1,du2,du0du1,du0du2,&d,&e,&f,&y0,&y1)  ){ return 1;}

	float xx,yy,xxyy,tmp;

	xx=x0*x1;

	yy=y0*y1;

	xxyy=xx*yy;

	tmp=a*xxyy;

	isect1[0]=tmp+b*x1*yy;

	isect1[1]=tmp+c*x0*yy;

	tmp=d*xxyy;

	isect2[0]=tmp+e*xx*y1;

	isect2[1]=tmp+f*xx*y0;

	SORT(isect1);

	SORT(isect2);

	if(isect1[1]<isect2[0] || isect2[1]<isect1[0]) return 0;

	return 1;

}

and here is the kernel, very simple since i just want it to return 1 because those triangle do intersect. It works on CPU code and returns 1.

__global__ void kernelDetectarIntersecciones(Vertex* vbo, GLuint* eab, GLuint numCaras){

	int i = blockIdx.x * blockDim.x + threadIdx.x;

	if( i<numCaras  ){

		float V0[3], V1[3], V2[3];

		float U0[3], U1[3], U2[3];

		V0[0] = 1.0f; V0[1] = 0.0f; V0[2] = 0.0f;

		V1[0] = 0.0f; V1[1] = 1.0f; V1[2] = 0.0f;

		V2[0] = 0.0f; V2[1] = 0.0f; V2[2] = 0.0f;

		U0[0] = 1.0f; U0[1] = 1.0f; U0[2] = 1.0f;

		U1[0] = 1.0f; U1[1] = 1.0f; U1[2] = -1.0f;

		U2[0] = 0.0f; U2[1] = 0.0f; U2[2] = 0.0f;

		if( NoDivTriTriIsect(V0, V1, V2, U0, U1, U2)  ){

			//do something to prove that returned 1

		}

	}

}

if you check on the NoDivTriTrisect function, on the first “if” statement ( its marked with a big //---------------------------------HERE------------------------------ comment)

if( USE_EPSILON_TEST == 1){...

i discovered something very weird, the values of du0 du1 du2 before the “if” are (1.0f, -1.0f, 0.0f) and after the “if” they get modified even when the do not satisfy the nested if( fabs(du) < EPSILON ) ).

thats why i check before and after that “if” what are the values of du0, du1, du2.

am i doing something wrong?

i dont know if kernels have a limit of local variables to be defined, or something similar.

any help is welcome.