Ray Mesh Intersection

Hi,
I m implementing ray triangle intersection for a huge mesh , and i want to get the benifit from the parallel processsing of the GPUs .
i am trying to develop a function that o provide for it a set of ray and triangle vertices and the function sould return each ray intersection with all the mesh if any.
but i am not getting the idea how can i get benifit from the CUDA libraries to implemet that.
below i wrote a script which is working for many rays with one triangle but i want also to get many rays with many triangle.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_atomic_functions.h"
#include <stdlib.h> 
#include <stdio.h>
#include <ctime>
#include<iostream>
#include <math.h>
#include <fstream>
#include <vector>


using namespace std;

#define epsilon 0.00000000001
__global__ void add(int *a , int *b, int *c)
{
	c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}
struct Ray {
	float3 orig;	// ray origin
	float3 dir;		// ray direction	
	__device__ Ray(float3 o_, float3 d_) : orig(o_), dir(d_) {}
};



__global__ void raytrigpu(float *p0x, float *p0y, float *p0z, float *p1x, float *p1y, float *p1z, float *p2x, float *p2y, float *p2z, float *orx, float *ory, float *orz, float *dx, float *dy, float *dz,bool *flag, float *dist , int N)
{
	int id = blockIdx.x*blockDim.x + threadIdx.x;
	if ( id < N)
	{
		float e1x = p1x[id] - p0x[id];
		float e1y = p1y[id] - p0y[id];
		float e1z = p1z[id] - p0z[id];
		float e2x = p2x[id] - p0x[id];
		float e2y = p2y[id] - p0y[id];
		float e2z = p2z[id] - p0z[id];

		float qx = dy[id] * e2z - dz[id] * e2y;
		float qy = dz[id] * e2x - dx[id] * e2z;
		float qz = dx[id] * e2y - dy[id] * e2x;



		float a = e1x*qx + e1y*qy + e1z*qz;
		if (a > -epsilon && a < epsilon)
		{
			flag[id] = false;
			dist[id] = NAN;
			return;
		}
		float f = 1 / a;

		float sx = orx[id] - p0x[id];
		float sy = ory[id] - p0y[id];
		float sz = orz[id] - p0z[id];

		float u = f*(sx*qx + sy*qy + sz*qz);

		if (u < 0.0)
		{
			flag[id] = false;
			dist[id] = NAN;
			return;
		}

		float rx = sy*e1z - sz*e1y;
		float ry = sz*e1x - sx*e1z;
		float rz = sx*e1y - sy*e1x;

		float v = f*(dx[id] * rx + dy[id] * ry + dz[id] * rz);

		if (v < 0.0 || u + v > 1.0)
		{
			flag[id] = false;
			dist[id] = NAN;
			return;
		}

		flag[id] = true;
		dist[id] = f*(e2x*rx + e2y*ry + e2z*rz);
	}
}




#define N 512
#define R 1 
int main()
{
	float P0x[N], P0y[N] , P0z[N];
	float P1x[N], P1y[N], P1z[N];
	float P2x[N], P2y[N], P2z[N];
	float Dx[R], Dy[R], Dz[R];
	float Orx[R], Ory[R], Orz[R];
	float  dist[N];
	bool flag[N];

	float *GPUP0x, *GPUP0y, *GPUP0z;
	float *GPUP1x, *GPUP1y, *GPUP1z;
	float *GPUP2x, *GPUP2y, *GPUP2z;
	float *GPUDx, *GPUDy, *GPUDz;
	float *GPUOrx, *GPUOry, *GPUOrz;
	float *GPUdist;
	bool  *GPUflag;

	cudaMalloc((void **)&GPUP0x, N * sizeof(float));
	cudaMalloc((void **)&GPUP0y, N * sizeof(float));
	cudaMalloc((void **)&GPUP0z, N * sizeof(float));
	cudaMalloc((void **)&GPUP1x, N * sizeof(float));
	cudaMalloc((void **)&GPUP1y, N * sizeof(float));
	cudaMalloc((void **)&GPUP1z, N * sizeof(float));
	cudaMalloc((void **)&GPUP2x, N * sizeof(float));
	cudaMalloc((void **)&GPUP2y, N * sizeof(float));
	cudaMalloc((void **)&GPUP2z, N * sizeof(float));

	cudaMalloc((void **)&GPUDx, R * sizeof(float));
	cudaMalloc((void **)&GPUDy, R * sizeof(float));
	cudaMalloc((void **)&GPUDz, R * sizeof(float));

	cudaMalloc((void **)&GPUOrx, R * sizeof(float));
	cudaMalloc((void **)&GPUOry, R * sizeof(float));
	cudaMalloc((void **)&GPUOrz, R * sizeof(float));
	
	cudaMalloc((void **)&GPUdist, N * sizeof(float));
	cudaMalloc((void **)&GPUflag, N * sizeof(bool));

	for (int i = 0; i<N; ++i) {
		P0x[i] = 0;
		P0y[i] = 0;
		P0z[i] = 0;
		P1x[i] = 1;
		P1y[i] = 0;
		P1z[i] = 0;
		P2x[i] = 0;
		P2y[i] = 1;
		P2z[i] = 0;
	}
	for (int j = 0; j<R; ++j)
	{
		Dx[j] = 0;
		Dy[j] = 0;
		Dz[j] = 1;
		Orx[j] = 0.5;
		Ory[j] = 0.5;
		Orz[j] = 1;
	}
	clock_t tic = clock();
	cudaMemcpy(GPUP0x, P0x, N * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(GPUP0y, P0y, N * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(GPUP0z, P0z, N * sizeof(float), cudaMemcpyHostToDevice);
	
	cudaMemcpy(GPUP1x, P1x, N * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(GPUP1y, P1y, N * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(GPUP1z, P1z, N * sizeof(float), cudaMemcpyHostToDevice);

	cudaMemcpy(GPUP2x, P2x, N * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(GPUP2y, P2y, N * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(GPUP2z, P2z, N * sizeof(float), cudaMemcpyHostToDevice);

	cudaMemcpy(GPUDx, Dx, R * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(GPUDy, Dy, R * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(GPUDz, Dz, R * sizeof(float), cudaMemcpyHostToDevice);

	cudaMemcpy(GPUOrx, Orx, R * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(GPUOry, Ory, R * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(GPUOrz, Orz, R * sizeof(float), cudaMemcpyHostToDevice);

	raytrigpu<<<R,N>>>(GPUP0x, GPUP0y, GPUP0z, GPUP1x, GPUP1y, GPUP1z, GPUP2x, GPUP2y, GPUP2z, GPUOrx, GPUOry, GPUOrz, GPUDx, GPUDy, GPUDz , GPUflag , GPUdist ,N);

	cudaMemcpy(flag, GPUflag, N * sizeof(bool), cudaMemcpyDeviceToHost);
	cudaMemcpy(dist, GPUdist, N * sizeof(float), cudaMemcpyDeviceToHost);


	cudaFree(GPUflag); cudaFree(GPUdist); 
	cudaFree(GPUP0x); cudaFree(GPUP0y); cudaFree(GPUP0z);
	cudaFree(GPUP1x); cudaFree(GPUP1y); cudaFree(GPUP1z);
	cudaFree(GPUP2x); cudaFree(GPUP2y); cudaFree(GPUP2z);
	cudaFree(GPUDx); cudaFree(GPUDy); cudaFree(GPUDz);
	cudaFree(GPUOrx); cudaFree(GPUOry); cudaFree(GPUOrz);

	printf("%d\n", flag[0]);
	printf("%lf\n", dist[0]);
	clock_t toc = clock();
	double time = (double)(toc - tic);
	cout << "\nTime taken: " << (1000 * (time / CLOCKS_PER_SEC)) << " (ms)";

    return 0;
}

please help me if there is better idea to implement it in a better way.
Thanks.

I think you need to take a step back and look at best practices in the industry. It is simply not practical to evaluate each ray against each triangle of each mesh for any kind of large 3D scene - even when making use of brute force parallelism in hardware. The speed you can reach will probably be completely memory bound (all the triangle data will overwhelm the caches and you simply run into bandwidth limits).

A very common approach is to introduce some bounding volume hierarchies (a recursive spatial subdivision of your scene). For example you can enter your meshes into BVH trees - some very large meshes could also be subdivided into smaller sub-meshes before registering them in the tree to improve efficiency. Then evaluate each ray against the BVH tree before doing any individual checks of the ray only against the meshes flagged by the BVH tree as potentially intersecting with the ray.

NOTE: The nVidia Optix libraries would provide complete implementations of such BVH trees (and maybe other common types of raytracig acceleration structures). The latest generation of nVidia RTX GPUs (Turing) is probably going to have some level of hardware support for BVH trees and ray-triangle intersections.

a bit of background info on CUDA accelerated realtime raytracing and related acceleration structures.

(This entire blog is dedicated to realtime raytracing with a strong focus on nVidia products.)