Collision Detection task partitioning strategy


I am very new to CUDA so maybe you can give me some advice on how to start this problem.

My goal is to check every triangle on two 3D meshes for intersection. I will have a global array for each mesh. then each thread will process a collision between triangles based on the thread index.
so if I have two meshes each with say 1000 triangles then it would be 1 million calculations per frame without optimization. but the polycount of the meshes would take a whole range of values. so what would be the best way to partition this into threads and blocks?

so I understand that you can have only 512 threads per block so my guess would be something like:

    dim3  grid( sizeMesh1/512 , sizeMesh2/512 , 1);
    dim3  threads( 512, 512, 1);
    Collide<<< grid, threads, mem_size >>>(m1,m2,t1,t2);

does it look like I am on the right track?
Could anyone give me pointers on how best to partition the problem?

Thanks in advance!

You don’t have 512 threads per block there, you have (2^9)^2 = 2^18 = 262,144 threads per block. It won’t launch.

But yeah in general that’s one idea on how to do it.

Ah… but here’s where algorithmic thinking will give you huge advantages. The brute force method of checking every triangle against every triangle will work (and can fit quite nicely into CUDA) but it’s likely a poor algorithm overall. Hierarchical methods will likely work much better, allowing you to work with groups of triangles and excluding huge swathes of interactions in a single compute.
This can be parallelized as well.

A reasonable GPU friendly work partitioning would choose the larger of the two objects, take the top 2000 nodes or so of its tree, and make each one of those a block of work, and the block would be responsible for intersecting that node (and its child nodes and leaves) versus the other object’s (entire) tree.

Still, for low poly counts (I’d guess something like number of polys in the low 100s) you could still do OK with brute force… but even there a hierarchical method would be faster (though by that small of a problem you may as well do it on the CPU).

Alternatively: does PhysX have this kind of function built in? Maybe there’s no need to roll your own at all.

AFAIK PhysX doesn’t do mesh-mesh collisions, only mesh-shape, except for one deprecated case with both meshes being “pmaps”.

Also the current version of PhysX does all rigid body calculations (including collisions) on the CPU, the GPU only does fluids, cloth and soft bodies.

Thanks for the response.

Well I can imagine that brute force collision detection will not be the optimal solution. I thought it would be a relatively simple introduction for me to program in cuda. I will most likely do some optimization as you mentioned, but for now I would like to just get it to work, even if it is slow.

I think I almost have it working but the problem comes when I try and return the results from the triangle intersection query in the kernel. the Cudata structure is to hold pointers to the host and device memory.

here is the relevant parts of the code:


struct Cudata{

//Host/CPU Memory

float* h_m1data;

    float* h_m2data;

int* h_resultM1;

int* h_resultM2;

//Device/GPU Memory

float* d_m1data;

float* d_m2data;

int* d_resultM1;

int* d_resultM2;

// total device memory size

int memsize;

//Triangle count

int sizeM1,sizeM2;


extern “C” void cuda_loadMesh(Cudata cd)


int s1 = cd.sizeM1; 

int s2 = cd.sizeM2; 

// allocate device memory

cudaMalloc( (void**) &cd.d_m1data ,3*3*sizeof(float)*s1 );

// copy mesh1 to device memory

cudaMemcpy( cd.d_m1data , cd.h_m1data , 3*3*sizeof(float)*s1 , cudaMemcpyHostToDevice) ;

cudaMalloc( (void**) &cd.d_m2data , 33sizeof(float)*s2 );

// copy mesh2 to device memory

cudaMemcpy( cd.d_m2data , cd.h_m2data  ,  3*3*sizeof(float)*s2 , cudaMemcpyHostToDevice) ;

// allocate device memory for result

cudaMalloc( (void**) &cd.d_resultM1, s1*sizeof(int) );

cudaMalloc( (void**) &cd.d_resultM2, s2*sizeof(int) );


global void kernelCol(float* t1, float* t2, Cudata cd)


const unsigned int tix = threadIdx.x;

const unsigned int tiy = threadIdx.y;

float V0[3],V1[3],V2[3],U0[3],U1[3],U2[3];

int i,result=1;



 V0[i] = (float)( cd.d_m1data[9*tix]    +  t1[i]);

 V1[i] = (float)( cd.d_m1data[9*tix+1]  +  t1[i]);

 V2[i] = (float)( cd.d_m1data[9*tix+2]  +  t1[i]);

 U0[i] = (float)( cd.d_m2data[9*tiy]    +  t2[i]);

 U1[i] = (float)( cd.d_m2data[9*tiy+1]  +  t2[i]);

 U2[i] = (float)( cd.d_m2data[9*tiy+2]  +  t2[i]);



result = cuda_collide(V0,V1,V2,U0,U1,U2);

cd.d_resultM1[tix] = result;

cd.d_resultM2[tiy] = result;



It seems that everything is executing ok, but when I try and put the results into the device global memory it gives me an error saying kernel execution failed. but it doesn’t say much else.

cd.d_resultM1[tix] = result;

cd.d_resultM2[tiy] = result;

does this look like a reasonable way to allocate the memory?

cudaMalloc( (void**) &cd.d_resultM1, s1*sizeof(int) );

cudaMalloc( (void**) &cd.d_resultM2, s2*sizeof(int) );

anyone see something obviously wrong with this?

I have narrowed down my problem to the copying the memory from host to the device. here is my procedure:

I first create a large array in C to hold all the vertex information in the form {v1x,v1y,v1z…vnx,vny,vnz}

then I allocate device memory and copy it from the host to the device. the Cudata structure holds pointers to the gpu and host memory locations.

[codebox]struct Cudata{

//Host Memory

   float* h_m1data;

   float* h_m2data;

//Device Memory

float* d_m1data;

float* d_m2data;

//Triangle count

int sizeM1,sizeM2;


this is the function I call once at the beginning to load all the data into global memory.

[codebox]extern “C” void cuda_loadMesh(Cudata cd)


int s1f = 9*sizeof(float)*cd.sizeM1; 

int s2f = 9*sizeof(float)*cd.sizeM2; 

// allocate device memory

CUDA_SAFE_CALL(cudaMalloc( (void**) &cd.d_m1data ,s1f));

// copy mesh1 to device memory

CUDA_SAFE_CALL(cudaMemcpy( cd.d_m1data , cd.h_m1data , s1f , cudaMemcpyHostToDevice)) ;

CUDA_SAFE_CALL(cudaMalloc( (void**) &cd.d_m2data , s2f ));

// copy mesh2 to device memory

CUDA_SAFE_CALL(cudaMemcpy( cd.d_m2data , cd.h_m2data  , s2f , cudaMemcpyHostToDevice)) ;


the kernel looks something like this

[codebox]global void kernelCol(float* t1, float* t2, Cudata cd) {


V0[1]=cd.d_m1data[1]+t1[1]; // this is where I get the error i am guessing that m1data was not allocated properly



for some reason any time I try and access the data in the gpu the thread terminates and it gives me an error(“Kernel execution failed: unknown error”). so I am guessing that somehow the data got corrupted somewhere. can anyone see what is wrong with this? Is there a better way to copy a large array of floats then this?


Maybe I’m being blind, but where are you initialising the t1 pointer?

I put all the variable initialization below. so one question I have is the following. is it ok to hold pointers to device memory in the c portion of code? for instance I want to keep all the mesh data in global memory and not have to reallocate it every time the function is called.


// only using 12 threads to test on a simple cube

int gx = 1;//(int)(cd.sizeM1/12);

int gy = 1;//(int)(cd.sizeM2/12);

// setup execution parameters

dim3  grid( gx, gy, 1);

dim3  threads( 12, 12, 1); 

float *d_t1,*d_t2;

CUDA_SAFE_CALL(cudaMalloc((void**) &d_t1, sizeof(float)*3 ));

CUDA_SAFE_CALL(cudaMalloc((void**) &d_t2, sizeof(float)*3 ));

// copy host memory to device

CUDA_SAFE_CALL(cudaMemcpy(d_t1, t1, sizeof(float)*3  ,cudaMemcpyHostToDevice) );

CUDA_SAFE_CALL(cudaMemcpy(d_t2, t2, sizeof(float)*3  ,cudaMemcpyHostToDevice) );

float *d_m1,*d_m2;

CUDA_SAFE_CALL(cudaMalloc((void**) &d_m1, sizeof(float)9s1 ));

CUDA_SAFE_CALL(cudaMalloc((void**) &d_m2, sizeof(float)9s2 ));

// copy host memory to device

CUDA_SAFE_CALL(cudaMemcpy(d_m1, m1, sizeof(float)*9*s1  ,cudaMemcpyHostToDevice) );

CUDA_SAFE_CALL(cudaMemcpy(d_m2, m2, sizeof(float)*9*s2  ,cudaMemcpyHostToDevice) );

int *d_r1,*d_r2;

CUDA_SAFE_CALL(cudaMalloc((void**) &d_r1, sizeof(int)*s1));

CUDA_SAFE_CALL(cudaMalloc((void**) &d_r2, sizeof(int)*s2 ));

CUDA_SAFE_CALL(cudaMemcpy(d_r1, h_r1, sizeof(int)*s1 ,cudaMemcpyHostToDevice) );

CUDA_SAFE_CALL(cudaMemcpy(d_r2, h_r2, sizeof(int)*s2 ,cudaMemcpyHostToDevice) );

kernelCol<<< grid, threads>>>(d_t1, d_t2, d_m1, d_m2, d_r1, d_r2);

// check if kernel execution generated and error

CUT_CHECK_ERROR("Kernel execution failed");

CUT_SAFE_CALL( cutStopTimer( timer));

printf( "time: %f (ms)\n", cutGetTimerValue( timer));

CUT_SAFE_CALL( cutDeleteTimer( timer));

cudaMemcpy( h_r1 , d_r1 , sizeof(int)*s1 , cudaMemcpyDeviceToHost) ;

cudaMemcpy( h_r2 , d_r2  ,  sizeof(int)*s2 , cudaMemcpyDeviceToHost) ;

printf("m1: ");

for(int i = 0;i<s1;i++)


	printf("%d ",h_r1[i]);


	printf("m2: ");

 for(int i = 0;i<s2;i++)


	printf("%d ",h_r2[i]);