Hi all,
I’m trying to make an ray-triangle accelerator on GPU and according to the article Understanding the Efficiency of Ray Traversal on GPUs one of the best solution is to make persistent threads. So, I launch just enough threads to fill the machine and each thread is getting it’s work from global “work queue”.
Code from the mentioned article (www.tml.tkk.fi/~timo/publications/aila2009hpg_paper.pdf)
// global variables
const int B = 3*32; // example batch size
const int globalPoolRayCount;
int globalPoolNextRay = 0;
__global__ void kernel()
// variables shared by entire warp, place to shared memory
__shared__ volatile int nextRayArray[BLOCKDIM_Y];
__shared__ volatile int rayCountArray[BLOCKDIM_Y] = f0g;
volatile int& localPoolNextRay = nextRayArray[threadIdx.y];
volatile int& localPoolRayCount = rayCountArray[threadIdx.y];
while (true) f
// get rays from global to local pool
if (localPoolRayCount==0 && threadIdx.x==0) {
localPoolNextRay = atomicAdd(globalPoolNextRay, B);
localPoolRayCount = B; }
// get rays from local pool
int myRayIndex = localPoolNextRay + threadIdx.x;
if (myRayIndex >= globalPoolRayCount)
return;
if (threadIdx.x==0) {
localPoolNextRay += 32;
localPoolRayCount -= 32;
}
// init and execute, these must not exit the kernel
fetchAndInitRay(myRayIndex);
trace();
}
So I tried to port it to OpenCL, but I can’t get the desired result. What is wrong? Either I get CL_OUT_OF_RESOURCES or the generated image is wrong.
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#define B 3*32; // example batch size
// globalPoolNextRay is set to 0 in the cpp file
__kernel void kernel(__global int* globalPoolNextRay, int globalPoolRayCount){
__local volatile int localPoolNextRay;
__local volatile int localPoolRayCount;
if ( get_local_id(0) == 0){
localPoolNextRay = localPoolRayCount = 0;
}
mem_fence(CLK_LOCAL_MEM_FENCE);
while(true){
if ( localPoolRayCount == 0 && get_local_id(0) == 0){
localPoolNextRay = atom_add(globalPoolNextRay,B);
localPoolRayCount = B;
}
mem_fence(CLK_LOCAL_MEM_FENCE);
// get rays from local pool
myRayIndex = localPoolNextRay + get_local_id(0);
if ( myRayIndex > globalPoolRayCount)
return;
mem_fence(CLK_LOCAL_MEM_FENCE);
if ( get_local_id(0) == 0){
localPoolNextRay += 32;
localPoolRayCount -= 32;
}
mem_fence(CLK_LOCAL_MEM_FENCE);
// init and execute, these must not exit the kernel
fetchAndInitRay(myRayIndex);
trace();
}
}
I’ll be glad for any suggestion as I’m at my wit’s end.