Hi all!

I have been coding in CUDA for a few months now and I managed to write a code that calculates a 3d distance grid to an object defined by a set of triangles. The algorithm is purely brute-force… but it should show the kind of speed-up you can achieve with CUDA. I have been running the program successfully for 64x64x64 3d-grids, divided as 64x1x1 for the block size and 64x64x1 for the grid size. The output is correct. I have done this for a few objects with around 50-100 faces and the same order of points. My problem is that when I try to calculate things for a more complex object(thousands of triangles and points)I get an exception: cudaError_enum at memory location 0x0012fac8… and even got the computer to display a blue screen of death, oops. The thing is that memory allocation is working fine and the memory needed should be well below the capabilities of my card(8800 GTS 512). The memory needed does not change with more triangles/points(except for the space to store the triangles and points, which is small). So the problem I have is: the same kernel(which basically loops through all triangles, each instance of the kernel calculating the distance to a particular observation point of the grid) is running fine with simple objects, but fails miserably with a more complex object. I have been looking into this for a while now, but my wits are at an end. I therefore resort to your infinite wisdom. If anyone has an inkling as to why this could be happening… here is the code that invokes the kernel, along with the kernel:(I have been setting N_DIV to 64 and even 128, and as I said it works correctly for simple objects)

```
//this code is in main
// nt is the number of triangles, np number of points, of the //object
//h.x is the x-spacing of the grid, h.y y-spacing, h-z z-spacing
//a.x: beginning x for grid, a.y: beginning y for grid, a.z: beginning z for grid
float* h_DistanceGrid;
float* d_DistanceGrid; //device pointer to distance grid
int* h_IndexGrid;
int* d_IndexGrid; // device pointer to index grid(index of closest triangle)
float3* d_p; // device pointer to points
int3* d_t; // device pointer to triangles
int grid_size=N_DIV*N_DIV*N_DIV;
h_DistanceGrid=(float *)malloc(grid_size*sizeof(float));
h_IndexGrid=(int *)malloc(grid_size*sizeof(int));
h_initStructures();
CUDA_SAFE_CALL(cudaMalloc((void **)&d_p,np*sizeof(float3)));
CUDA_SAFE_CALL(cudaMalloc((void **)&d_t,nt*sizeof(int3)));
CUDA_SAFE_CALL(cudaMemcpy(d_p, h_p, np*sizeof(float3), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(d_t, h_t, nt*sizeof(int3), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMalloc((void **)&d_DistanceGrid,grid_size*sizeof(float)));
CUDA_SAFE_CALL(cudaMalloc((void **)&d_IndexGrid,grid_size*sizeof(int)));
callKernel(d_DistanceGrid, d_IndexGrid, d_p, d_t, np, nt, N_DIV,
h, a );
CUDA_SAFE_CALL(cudaMemcpy(h_DistanceGrid, d_DistanceGrid,
grid_size*sizeof(float), cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(h_IndexGrid, d_IndexGrid,
grid_size*sizeof(int), cudaMemcpyDeviceToHost));
// this is the kernel invocation function and kernel:
extern "C" void callKernel(float* DistanceGrid, int* IndexGrid, float3* p, int3* t,
int np, int nt, int n_div, float3 h, float3 a )
{
dim3 grid(n_div,n_div,1);
dim3 block(n_div,1,1);
distKernel<<<grid,block>>>(DistanceGrid, IndexGrid, p, t, np, nt, n_div, h, a);
}
__global__ void distKernel( float* DistanceGrid, int* IndexGrid, float3* p, int3* t,
int np, int nt, int n_div, float3 h , float3 a )
{
int i;
float3 cur_n; // normal to triangle
float3 cur_p1, cur_p2, cur_p3; // vertices of triangle
float3 cur_n1; float infront_p1p2; // normal of side p1p2, infront_p1p2: boolean value saying if point is in front of p1p2(to the outside of the triangle)
float3 cur_n2; float infront_p2p3; // idem p2p3
float3 cur_n3; float infront_p3p1; //idem p3p1
float3 cur_n1_; float infront_n1_p1; float infront_n1_p2; // vector normal to n1 and n
float3 cur_n2_; float infront_n2_p2; float infront_n2_p3;
float3 cur_n3_; float infront_n3_p3; float infront_n3_p1;
float distp1, distp2, distp3, distp1p2, distp2p3, distp3p1, distp1p2p3; //distance to points p1, p2, p3, distance to lines defined by p1p2,
//p2p3, p3p1, distance to plane defined by p1p2p3, respectively
float dist; //real distance to triangle
float3 cur_p=make_float3(a.x+blockIdx.x*h.x, a.y+blockIdx.y*h.y, a.z+threadIdx.x*h.z); //observation point, each thread looks at one point
// first triangle
cur_p1=p[t[0].x];
cur_p2=p[t[0].y];
cur_p3=p[t[0].z];
//normals creation
cur_n= normalize(cross(cur_p3-cur_p1,cur_p2-cur_p1));
cur_n1= normalize(cross(cur_n,cur_p2-cur_p1));
cur_n2= normalize(cross(cur_n,cur_p3-cur_p2));
cur_n3= normalize(cross(cur_n,cur_p1-cur_p3));
cur_n1_= normalize(cross(cur_n1,cur_n));
cur_n2_= normalize(cross(cur_n2,cur_n));
cur_n3_= normalize(cross(cur_n3,cur_n));
// determine where in space is the observation point, so as to know which distance is valid
infront_p1p2= ( dot(cur_p-cur_p1,cur_n1)>=0 )? 1.0 : 0.0;
infront_p2p3= ( dot(cur_p-cur_p2,cur_n2)>=0 )? 1.0 : 0.0;
infront_p3p1= ( dot(cur_p-cur_p3,cur_n3)>=0 )? 1.0 : 0.0;
infront_n1_p1= ( dot(cur_p-cur_p1,cur_n1_)>=0 )? 1.0 : 0.0;
infront_n1_p2= ( dot(cur_p-cur_p2,cur_n1_)>=0 )? 1.0 : 0.0;
infront_n2_p2= ( dot(cur_p-cur_p2,cur_n2_)>=0 )? 1.0 : 0.0;
infront_n2_p3= ( dot(cur_p-cur_p3,cur_n2_)>=0 )? 1.0 : 0.0;
infront_n3_p3= ( dot(cur_p-cur_p3,cur_n3_)>=0 )? 1.0 : 0.0;
infront_n3_p1= ( dot(cur_p-cur_p1,cur_n3_)>=0 )? 1.0 : 0.0;
// calculation of distances
distp1= length(cur_p-cur_p1);
distp2= length(cur_p-cur_p2);
distp3= length(cur_p-cur_p3);
distp1p2= sqrt(dot(cur_p-cur_p1,cur_n1)*dot(cur_p-cur_p1,cur_n1)+dot(cur_p-cur_p1,cur_n)*dot(cur_p-cur_p1,cur_n));
distp2p3= sqrt(dot(cur_p-cur_p2,cur_n2)*dot(cur_p-cur_p2,cur_n2)+dot(cur_p-cur_p2,cur_n)*dot(cur_p-cur_p2,cur_n));
distp3p1= sqrt(dot(cur_p-cur_p3,cur_n3)*dot(cur_p-cur_p3,cur_n3)+dot(cur_p-cur_p3,cur_n)*dot(cur_p-cur_p3,cur_n));
distp1p2p3=fabs(dot(cur_p-cur_p1,cur_n));
// calculation of real distance depending on which zone of space the point of observation is in
dist= (1.0-infront_p1p2)*(1.0-infront_p2p3)*(1.0-infront_p3p1)*distp1p2p3 +
infront_p1p2*infront_n1_p1*(1.0-infront_n1_p2)*distp1p2 +
infront_p2p3*infront_n2_p2*(1.0-infront_n2_p3)*distp2p3 +
infront_p3p1*infront_n3_p3*(1.0-infront_n3_p1)*distp3p1 +
infront_n1_p2*(1.0-infront_n2_p2)*distp2 +
infront_n2_p3*(1.0-infront_n3_p3)*distp3 +
infront_n3_p1*(1.0-infront_n1_p1)*distp1;
// store distance value and index of triangle
DistanceGrid[blockIdx.x+blockIdx.y*n_div+threadIdx.x*n_div*n_div]=dist;
IndexGrid[blockIdx.x+blockIdx.y*n_div+threadIdx.x*n_div*n_div]=0;
// loop through all triangles
for (i=1;i<nt;i++){
cur_p1=p[t[i].x];
cur_p2=p[t[i].y];
cur_p3=p[t[i].z];
//normals creation
cur_n= normalize(cross(cur_p3-cur_p1,cur_p2-cur_p1));
cur_n1= normalize(cross(cur_n,cur_p2-cur_p1));
cur_n2= normalize(cross(cur_n,cur_p3-cur_p2));
cur_n3= normalize(cross(cur_n,cur_p1-cur_p3));
cur_n1_= normalize(cross(cur_n1,cur_n));
cur_n2_= normalize(cross(cur_n2,cur_n));
cur_n3_= normalize(cross(cur_n3,cur_n));
infront_p1p2= ( dot(cur_p-cur_p1,cur_n1)>=0 )? 1.0 : 0.0;
infront_p2p3= ( dot(cur_p-cur_p2,cur_n2)>=0 )? 1.0 : 0.0;
infront_p3p1= ( dot(cur_p-cur_p3,cur_n3)>=0 )? 1.0 : 0.0;
infront_n1_p1= ( dot(cur_p-cur_p1,cur_n1_)>=0 )? 1.0 : 0.0;
infront_n1_p2= ( dot(cur_p-cur_p2,cur_n1_)>=0 )? 1.0 : 0.0;
infront_n2_p2= ( dot(cur_p-cur_p2,cur_n2_)>=0 )? 1.0 : 0.0;
infront_n2_p3= ( dot(cur_p-cur_p3,cur_n2_)>=0 )? 1.0 : 0.0;
infront_n3_p3= ( dot(cur_p-cur_p3,cur_n3_)>=0 )? 1.0 : 0.0;
infront_n3_p1= ( dot(cur_p-cur_p1,cur_n3_)>=0 )? 1.0 : 0.0;
distp1= length(cur_p-cur_p1);
distp2= length(cur_p-cur_p2);
distp3= length(cur_p-cur_p3);
distp1p2= sqrt(dot(cur_p-cur_p1,cur_n1)*dot(cur_p-cur_p1,cur_n1)+dot(cur_p-cur_p1,cur_n)*dot(cur_p-cur_p1,cur_n));
distp2p3= sqrt(dot(cur_p-cur_p2,cur_n2)*dot(cur_p-cur_p2,cur_n2)+dot(cur_p-cur_p2,cur_n)*dot(cur_p-cur_p2,cur_n));
distp3p1= sqrt(dot(cur_p-cur_p3,cur_n3)*dot(cur_p-cur_p3,cur_n3)+dot(cur_p-cur_p3,cur_n)*dot(cur_p-cur_p3,cur_n));
distp1p2p3=fabs(dot(cur_p-cur_p1,cur_n));
// calculation of real distance depending on which zone of space the point of observation is in
dist= (1.0-infront_p1p2)*(1.0-infront_p2p3)*(1.0-infront_p3p1)*distp1p2p3 +
infront_p1p2*infront_n1_p1*(1.0-infront_n1_p2)*distp1p2 +
infront_p2p3*infront_n2_p2*(1.0-infront_n2_p3)*distp2p3 +
infront_p3p1*infront_n3_p3*(1.0-infront_n3_p1)*distp3p1 +
infront_n1_p2*(1.0-infront_n2_p2)*distp2 +
infront_n2_p3*(1.0-infront_n3_p3)*distp3 +
infront_n3_p1*(1.0-infront_n1_p1)*distp1;
// if this is the smallest distance yet, store value, also store triangle index in that case
IndexGrid[blockIdx.x+blockIdx.y*n_div+threadIdx.x*n_div*n_div]=(DistanceGrid[blockIdx.x+blockIdx.y*n_div+threadIdx.x*n_div*n_div]>dist)? i : IndexGrid[blockIdx.x+blockIdx.y*n_div+threadIdx.x*n_div*n_div];
DistanceGrid[blockIdx.x+blockIdx.y*n_div+threadIdx.x*n_div*n_div]=fminf(DistanceGrid[blockIdx.x+blockIdx.y*n_div+threadIdx.x*n_div*n_div],dist);
}
}
```

Thank you all in advance! And sorry for the long post, but I am baffled… the only thing that changes in the kernel invocation(between a simple and a complex object) is the number of triangles and points(and the size of the arrays storing them) and therefore the length of the loop. By the way, if I put N_DIV equal to 16 then it works for objects with thousands of triangles/points too!