Hi,
I m implementing ray triangle intersection for a huge mesh , and i want to get the benifit from the parallel processsing of the GPUs .
i am trying to develop a function that o provide for it a set of ray and triangle vertices and the function sould return each ray intersection with all the mesh if any.
but i am not getting the idea how can i get benifit from the CUDA libraries to implemet that.
below i wrote a script which is working for many rays with one triangle but i want also to get many rays with many triangle.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_atomic_functions.h"
#include <stdlib.h>
#include <stdio.h>
#include <ctime>
#include<iostream>
#include <math.h>
#include <fstream>
#include <vector>
using namespace std;
#define epsilon 0.00000000001
__global__ void add(int *a , int *b, int *c)
{
c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}
struct Ray {
float3 orig; // ray origin
float3 dir; // ray direction
__device__ Ray(float3 o_, float3 d_) : orig(o_), dir(d_) {}
};
__global__ void raytrigpu(float *p0x, float *p0y, float *p0z, float *p1x, float *p1y, float *p1z, float *p2x, float *p2y, float *p2z, float *orx, float *ory, float *orz, float *dx, float *dy, float *dz,bool *flag, float *dist , int N)
{
int id = blockIdx.x*blockDim.x + threadIdx.x;
if ( id < N)
{
float e1x = p1x[id] - p0x[id];
float e1y = p1y[id] - p0y[id];
float e1z = p1z[id] - p0z[id];
float e2x = p2x[id] - p0x[id];
float e2y = p2y[id] - p0y[id];
float e2z = p2z[id] - p0z[id];
float qx = dy[id] * e2z - dz[id] * e2y;
float qy = dz[id] * e2x - dx[id] * e2z;
float qz = dx[id] * e2y - dy[id] * e2x;
float a = e1x*qx + e1y*qy + e1z*qz;
if (a > -epsilon && a < epsilon)
{
flag[id] = false;
dist[id] = NAN;
return;
}
float f = 1 / a;
float sx = orx[id] - p0x[id];
float sy = ory[id] - p0y[id];
float sz = orz[id] - p0z[id];
float u = f*(sx*qx + sy*qy + sz*qz);
if (u < 0.0)
{
flag[id] = false;
dist[id] = NAN;
return;
}
float rx = sy*e1z - sz*e1y;
float ry = sz*e1x - sx*e1z;
float rz = sx*e1y - sy*e1x;
float v = f*(dx[id] * rx + dy[id] * ry + dz[id] * rz);
if (v < 0.0 || u + v > 1.0)
{
flag[id] = false;
dist[id] = NAN;
return;
}
flag[id] = true;
dist[id] = f*(e2x*rx + e2y*ry + e2z*rz);
}
}
#define N 512
#define R 1
int main()
{
float P0x[N], P0y[N] , P0z[N];
float P1x[N], P1y[N], P1z[N];
float P2x[N], P2y[N], P2z[N];
float Dx[R], Dy[R], Dz[R];
float Orx[R], Ory[R], Orz[R];
float dist[N];
bool flag[N];
float *GPUP0x, *GPUP0y, *GPUP0z;
float *GPUP1x, *GPUP1y, *GPUP1z;
float *GPUP2x, *GPUP2y, *GPUP2z;
float *GPUDx, *GPUDy, *GPUDz;
float *GPUOrx, *GPUOry, *GPUOrz;
float *GPUdist;
bool *GPUflag;
cudaMalloc((void **)&GPUP0x, N * sizeof(float));
cudaMalloc((void **)&GPUP0y, N * sizeof(float));
cudaMalloc((void **)&GPUP0z, N * sizeof(float));
cudaMalloc((void **)&GPUP1x, N * sizeof(float));
cudaMalloc((void **)&GPUP1y, N * sizeof(float));
cudaMalloc((void **)&GPUP1z, N * sizeof(float));
cudaMalloc((void **)&GPUP2x, N * sizeof(float));
cudaMalloc((void **)&GPUP2y, N * sizeof(float));
cudaMalloc((void **)&GPUP2z, N * sizeof(float));
cudaMalloc((void **)&GPUDx, R * sizeof(float));
cudaMalloc((void **)&GPUDy, R * sizeof(float));
cudaMalloc((void **)&GPUDz, R * sizeof(float));
cudaMalloc((void **)&GPUOrx, R * sizeof(float));
cudaMalloc((void **)&GPUOry, R * sizeof(float));
cudaMalloc((void **)&GPUOrz, R * sizeof(float));
cudaMalloc((void **)&GPUdist, N * sizeof(float));
cudaMalloc((void **)&GPUflag, N * sizeof(bool));
for (int i = 0; i<N; ++i) {
P0x[i] = 0;
P0y[i] = 0;
P0z[i] = 0;
P1x[i] = 1;
P1y[i] = 0;
P1z[i] = 0;
P2x[i] = 0;
P2y[i] = 1;
P2z[i] = 0;
}
for (int j = 0; j<R; ++j)
{
Dx[j] = 0;
Dy[j] = 0;
Dz[j] = 1;
Orx[j] = 0.5;
Ory[j] = 0.5;
Orz[j] = 1;
}
clock_t tic = clock();
cudaMemcpy(GPUP0x, P0x, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(GPUP0y, P0y, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(GPUP0z, P0z, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(GPUP1x, P1x, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(GPUP1y, P1y, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(GPUP1z, P1z, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(GPUP2x, P2x, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(GPUP2y, P2y, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(GPUP2z, P2z, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(GPUDx, Dx, R * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(GPUDy, Dy, R * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(GPUDz, Dz, R * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(GPUOrx, Orx, R * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(GPUOry, Ory, R * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(GPUOrz, Orz, R * sizeof(float), cudaMemcpyHostToDevice);
raytrigpu<<<R,N>>>(GPUP0x, GPUP0y, GPUP0z, GPUP1x, GPUP1y, GPUP1z, GPUP2x, GPUP2y, GPUP2z, GPUOrx, GPUOry, GPUOrz, GPUDx, GPUDy, GPUDz , GPUflag , GPUdist ,N);
cudaMemcpy(flag, GPUflag, N * sizeof(bool), cudaMemcpyDeviceToHost);
cudaMemcpy(dist, GPUdist, N * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(GPUflag); cudaFree(GPUdist);
cudaFree(GPUP0x); cudaFree(GPUP0y); cudaFree(GPUP0z);
cudaFree(GPUP1x); cudaFree(GPUP1y); cudaFree(GPUP1z);
cudaFree(GPUP2x); cudaFree(GPUP2y); cudaFree(GPUP2z);
cudaFree(GPUDx); cudaFree(GPUDy); cudaFree(GPUDz);
cudaFree(GPUOrx); cudaFree(GPUOry); cudaFree(GPUOrz);
printf("%d\n", flag[0]);
printf("%lf\n", dist[0]);
clock_t toc = clock();
double time = (double)(toc - tic);
cout << "\nTime taken: " << (1000 * (time / CLOCKS_PER_SEC)) << " (ms)";
return 0;
}
please help me if there is better idea to implement it in a better way.
Thanks.