Unknown problem in cuda program throws exception

Hi all!

I have been coding in CUDA for a few months now and I managed to write a code that calculates a 3d distance grid to an object defined by a set of triangles. The algorithm is purely brute-force… but it should show the kind of speed-up you can achieve with CUDA. I have been running the program successfully for 64x64x64 3d-grids, divided as 64x1x1 for the block size and 64x64x1 for the grid size. The output is correct. I have done this for a few objects with around 50-100 faces and the same order of points. My problem is that when I try to calculate things for a more complex object(thousands of triangles and points)I get an exception: cudaError_enum at memory location 0x0012fac8… and even got the computer to display a blue screen of death, oops. The thing is that memory allocation is working fine and the memory needed should be well below the capabilities of my card(8800 GTS 512). The memory needed does not change with more triangles/points(except for the space to store the triangles and points, which is small). So the problem I have is: the same kernel(which basically loops through all triangles, each instance of the kernel calculating the distance to a particular observation point of the grid) is running fine with simple objects, but fails miserably with a more complex object. I have been looking into this for a while now, but my wits are at an end. I therefore resort to your infinite wisdom. If anyone has an inkling as to why this could be happening… here is the code that invokes the kernel, along with the kernel:(I have been setting N_DIV to 64 and even 128, and as I said it works correctly for simple objects)

//this code is in main

// nt is the number of triangles, np number of points, of the //object

//h.x is the x-spacing of the grid, h.y y-spacing, h-z z-spacing

//a.x: beginning x for grid, a.y: beginning y for grid, a.z: beginning z for grid

float* h_DistanceGrid; 

float* d_DistanceGrid; //device pointer to distance grid

int* h_IndexGrid;

int* d_IndexGrid; // device pointer to index grid(index of closest triangle)

float3* d_p; // device pointer to points

int3* d_t; // device pointer to triangles

int grid_size=N_DIV*N_DIV*N_DIV;

h_DistanceGrid=(float *)malloc(grid_size*sizeof(float));

h_IndexGrid=(int *)malloc(grid_size*sizeof(int));

h_initStructures();

CUDA_SAFE_CALL(cudaMalloc((void **)&d_p,np*sizeof(float3)));

CUDA_SAFE_CALL(cudaMalloc((void **)&d_t,nt*sizeof(int3)));

CUDA_SAFE_CALL(cudaMemcpy(d_p, h_p, np*sizeof(float3), cudaMemcpyHostToDevice));

CUDA_SAFE_CALL(cudaMemcpy(d_t, h_t, nt*sizeof(int3), cudaMemcpyHostToDevice));

CUDA_SAFE_CALL(cudaMalloc((void **)&d_DistanceGrid,grid_size*sizeof(float)));

CUDA_SAFE_CALL(cudaMalloc((void **)&d_IndexGrid,grid_size*sizeof(int)));

callKernel(d_DistanceGrid, d_IndexGrid, d_p, d_t, np, nt, N_DIV,

         h, a );

CUDA_SAFE_CALL(cudaMemcpy(h_DistanceGrid, d_DistanceGrid,

            grid_size*sizeof(float), cudaMemcpyDeviceToHost));

CUDA_SAFE_CALL(cudaMemcpy(h_IndexGrid, d_IndexGrid,

             grid_size*sizeof(int), cudaMemcpyDeviceToHost));

// this is the kernel invocation function and kernel:

extern "C" void callKernel(float* DistanceGrid, int* IndexGrid, float3* p, int3* t, 

    int np, int nt, int n_div, float3 h, float3 a )

{

   dim3 grid(n_div,n_div,1);

   dim3 block(n_div,1,1);

  distKernel<<<grid,block>>>(DistanceGrid, IndexGrid, p, t, np, nt, n_div, h, a);  

   

}

__global__ void distKernel( float* DistanceGrid, int* IndexGrid, float3* p, int3* t,

    int np, int nt, int n_div, float3 h , float3 a )

{

	int i;

	float3 cur_n; // normal to triangle 

	float3 cur_p1, cur_p2, cur_p3; // vertices of triangle

	float3 cur_n1; float infront_p1p2; // normal of side p1p2, infront_p1p2: boolean value saying if point is in front of p1p2(to the outside of the triangle)

	float3 cur_n2; float infront_p2p3; // idem p2p3

	float3 cur_n3; float infront_p3p1; //idem p3p1

	float3 cur_n1_; float infront_n1_p1; float infront_n1_p2; // vector normal to n1 and n

	float3 cur_n2_; float infront_n2_p2; float infront_n2_p3; 

	float3 cur_n3_; float infront_n3_p3; float infront_n3_p1;

	float distp1, distp2, distp3, distp1p2, distp2p3, distp3p1, distp1p2p3; //distance to points p1, p2, p3, distance to lines defined by p1p2, 

                  	//p2p3, p3p1, distance to plane defined by  p1p2p3, respectively

	float dist; //real distance to triangle

	

float3 cur_p=make_float3(a.x+blockIdx.x*h.x, a.y+blockIdx.y*h.y, a.z+threadIdx.x*h.z); //observation point, each thread looks at one point

   

          // first triangle

   cur_p1=p[t[0].x];

   cur_p2=p[t[0].y];

   cur_p3=p[t[0].z];

  //normals creation

   cur_n= normalize(cross(cur_p3-cur_p1,cur_p2-cur_p1));

   cur_n1= normalize(cross(cur_n,cur_p2-cur_p1));

   cur_n2= normalize(cross(cur_n,cur_p3-cur_p2));

   cur_n3= normalize(cross(cur_n,cur_p1-cur_p3));

   cur_n1_= normalize(cross(cur_n1,cur_n));

   cur_n2_= normalize(cross(cur_n2,cur_n));

   cur_n3_= normalize(cross(cur_n3,cur_n));

  // determine where in space is the observation point, so as to know which distance is valid

   infront_p1p2= ( dot(cur_p-cur_p1,cur_n1)>=0 )? 1.0 : 0.0; 

   infront_p2p3= ( dot(cur_p-cur_p2,cur_n2)>=0 )? 1.0 : 0.0; 

   infront_p3p1= ( dot(cur_p-cur_p3,cur_n3)>=0 )? 1.0 : 0.0;

   infront_n1_p1= ( dot(cur_p-cur_p1,cur_n1_)>=0 )? 1.0 : 0.0;

   infront_n1_p2= ( dot(cur_p-cur_p2,cur_n1_)>=0 )? 1.0 : 0.0;

   infront_n2_p2= ( dot(cur_p-cur_p2,cur_n2_)>=0 )? 1.0 : 0.0;

   infront_n2_p3= ( dot(cur_p-cur_p3,cur_n2_)>=0 )? 1.0 : 0.0;

   infront_n3_p3= ( dot(cur_p-cur_p3,cur_n3_)>=0 )? 1.0 : 0.0;

   infront_n3_p1= ( dot(cur_p-cur_p1,cur_n3_)>=0 )? 1.0 : 0.0;

  // calculation of distances

   distp1= length(cur_p-cur_p1); 

   distp2= length(cur_p-cur_p2);

   distp3= length(cur_p-cur_p3);

   distp1p2= sqrt(dot(cur_p-cur_p1,cur_n1)*dot(cur_p-cur_p1,cur_n1)+dot(cur_p-cur_p1,cur_n)*dot(cur_p-cur_p1,cur_n));

   distp2p3= sqrt(dot(cur_p-cur_p2,cur_n2)*dot(cur_p-cur_p2,cur_n2)+dot(cur_p-cur_p2,cur_n)*dot(cur_p-cur_p2,cur_n));

   distp3p1= sqrt(dot(cur_p-cur_p3,cur_n3)*dot(cur_p-cur_p3,cur_n3)+dot(cur_p-cur_p3,cur_n)*dot(cur_p-cur_p3,cur_n));

   distp1p2p3=fabs(dot(cur_p-cur_p1,cur_n));

  // calculation of real distance depending on which zone of space the point of observation is in

   dist= (1.0-infront_p1p2)*(1.0-infront_p2p3)*(1.0-infront_p3p1)*distp1p2p3 +

         infront_p1p2*infront_n1_p1*(1.0-infront_n1_p2)*distp1p2 +

         infront_p2p3*infront_n2_p2*(1.0-infront_n2_p3)*distp2p3 + 

         infront_p3p1*infront_n3_p3*(1.0-infront_n3_p1)*distp3p1 +

         infront_n1_p2*(1.0-infront_n2_p2)*distp2 + 

         infront_n2_p3*(1.0-infront_n3_p3)*distp3 +

         infront_n3_p1*(1.0-infront_n1_p1)*distp1;

  // store distance value and index of triangle      

   DistanceGrid[blockIdx.x+blockIdx.y*n_div+threadIdx.x*n_div*n_div]=dist;

   IndexGrid[blockIdx.x+blockIdx.y*n_div+threadIdx.x*n_div*n_div]=0;

	// loop through all triangles

	for (i=1;i<nt;i++){

   cur_p1=p[t[i].x];

   cur_p2=p[t[i].y];

   cur_p3=p[t[i].z];

  //normals creation

   cur_n= normalize(cross(cur_p3-cur_p1,cur_p2-cur_p1));

   cur_n1= normalize(cross(cur_n,cur_p2-cur_p1));

   cur_n2= normalize(cross(cur_n,cur_p3-cur_p2));

   cur_n3= normalize(cross(cur_n,cur_p1-cur_p3));

   cur_n1_= normalize(cross(cur_n1,cur_n));

   cur_n2_= normalize(cross(cur_n2,cur_n));

   cur_n3_= normalize(cross(cur_n3,cur_n));

   infront_p1p2= ( dot(cur_p-cur_p1,cur_n1)>=0 )? 1.0 : 0.0;

   infront_p2p3= ( dot(cur_p-cur_p2,cur_n2)>=0 )? 1.0 : 0.0;

   infront_p3p1= ( dot(cur_p-cur_p3,cur_n3)>=0 )? 1.0 : 0.0;

   infront_n1_p1= ( dot(cur_p-cur_p1,cur_n1_)>=0 )? 1.0 : 0.0;

   infront_n1_p2= ( dot(cur_p-cur_p2,cur_n1_)>=0 )? 1.0 : 0.0;

   infront_n2_p2= ( dot(cur_p-cur_p2,cur_n2_)>=0 )? 1.0 : 0.0;

   infront_n2_p3= ( dot(cur_p-cur_p3,cur_n2_)>=0 )? 1.0 : 0.0;

   infront_n3_p3= ( dot(cur_p-cur_p3,cur_n3_)>=0 )? 1.0 : 0.0;

   infront_n3_p1= ( dot(cur_p-cur_p1,cur_n3_)>=0 )? 1.0 : 0.0;

   distp1= length(cur_p-cur_p1);

   distp2= length(cur_p-cur_p2);

   distp3= length(cur_p-cur_p3);

   distp1p2= sqrt(dot(cur_p-cur_p1,cur_n1)*dot(cur_p-cur_p1,cur_n1)+dot(cur_p-cur_p1,cur_n)*dot(cur_p-cur_p1,cur_n));

   distp2p3= sqrt(dot(cur_p-cur_p2,cur_n2)*dot(cur_p-cur_p2,cur_n2)+dot(cur_p-cur_p2,cur_n)*dot(cur_p-cur_p2,cur_n));

   distp3p1= sqrt(dot(cur_p-cur_p3,cur_n3)*dot(cur_p-cur_p3,cur_n3)+dot(cur_p-cur_p3,cur_n)*dot(cur_p-cur_p3,cur_n));

   distp1p2p3=fabs(dot(cur_p-cur_p1,cur_n));

  // calculation of real distance depending on which zone of space the point of observation is in

   dist= (1.0-infront_p1p2)*(1.0-infront_p2p3)*(1.0-infront_p3p1)*distp1p2p3 +

         infront_p1p2*infront_n1_p1*(1.0-infront_n1_p2)*distp1p2 +

         infront_p2p3*infront_n2_p2*(1.0-infront_n2_p3)*distp2p3 + 

         infront_p3p1*infront_n3_p3*(1.0-infront_n3_p1)*distp3p1 +

         infront_n1_p2*(1.0-infront_n2_p2)*distp2 + 

         infront_n2_p3*(1.0-infront_n3_p3)*distp3 +

         infront_n3_p1*(1.0-infront_n1_p1)*distp1;

  // if this is the smallest distance yet, store value, also store triangle index in that case

   IndexGrid[blockIdx.x+blockIdx.y*n_div+threadIdx.x*n_div*n_div]=(DistanceGrid[blockIdx.x+blockIdx.y*n_div+threadIdx.x*n_div*n_div]>dist)? i : IndexGrid[blockIdx.x+blockIdx.y*n_div+threadIdx.x*n_div*n_div];

   DistanceGrid[blockIdx.x+blockIdx.y*n_div+threadIdx.x*n_div*n_div]=fminf(DistanceGrid[blockIdx.x+blockIdx.y*n_div+threadIdx.x*n_div*n_div],dist);

   

	}

}

Thank you all in advance! And sorry for the long post, but I am baffled… the only thing that changes in the kernel invocation(between a simple and a complex object) is the number of triangles and points(and the size of the arrays storing them) and therefore the length of the loop. By the way, if I put N_DIV equal to 16 then it works for objects with thousands of triangles/points too!

Well, I have been looking into this a little more. Now I am more confused than before. What I have done is I divided the loop over all the triangles in batches of 50 triangles. I call the same kernel as before, first with triangles 0 to 49, after that 50-99 until I complete all the triangles available. The curious thing is that with this method it works!! No more exceptions!! Maybe I should clarify: the first time I call the kernel all the structures have already been loaded to the global memory of the device. So, the ONLY difference between the working and non-working versions are the lengths of the loops within the kernel. Everything else is equal. If somebody is aware what could be causing such a strange behaviour, please let me know. Thank you!