Ray Tracing using persistent threads in CUDA

Hello everyone,

I am trying understand the persistent threads version of ray tracing in CUDA, the link to the code is in the following URL:

https://code.google.com/archive/p/understanding-the-efficiency-of-ray-traversal-on-gpus/source/default/source

When I look at the TRACE_FUNC in tesla_persistent_packet.cu, the size of the shared memory array s_stackPtr is only 6. Can someone help me explain why the size of the shared memory is so small ? Here is the code for the function:

TRACE_FUNC
{
    // Shared memory arrays.

    __shared__ RayStruct shared[32 * MaxBlockHeight + 1];
    __shared__ volatile int s_stack[STACK_SIZE * MaxBlockHeight];
    __shared__ volatile int s_stackPtr[MaxBlockHeight]; // NOTE: This could equally well be in a register.
    __shared__ volatile int s_red[32 * MaxBlockHeight];

    RayStruct*    aux      = shared + threadIdx.x + (blockDim.x * threadIdx.y);
    volatile int* stack    = s_stack + STACK_SIZE * threadIdx.y;
    volatile int* red      = s_red + 32 * threadIdx.y;
    volatile int& stackPtr = s_stackPtr[threadIdx.y];

    // Live state during traversal, stored in registers.

    int     tidx = threadIdx.x;     // Lane index within warp.
    int     widx = threadIdx.y;     // Warp index within block.

    int     rayidx;                 // Ray index.
    float   origx, origy, origz;    // Ray origin.
    bool    valid;                  // False if the ray is degenerate.

    int     nodeAddr;               // Current node, negative if leaf.
    bool    terminated;             // Whether the traversal has been terminated.
    float   hitT;                   // t-value of the closest intersection.

    // Initialize persistent threads.

    __shared__ volatile int nextRayArray[MaxBlockHeight]; // Current ray index in global buffer.
    __shared__ volatile int rayCountArray[MaxBlockHeight]; // Number of rays in the local pool.
    nextRayArray[threadIdx.y] = 0;
    rayCountArray[threadIdx.y] = 0;

    // Persistent threads: fetch and process rays in a loop.

    do
    {
        volatile int& localPoolRayCount = rayCountArray[widx];
        volatile int& localPoolNextRay = nextRayArray[widx];

        // Local pool is empty => fetch new rays from the global pool using lane 0.

        if (tidx == 0 && localPoolRayCount <= 0)
        {
            localPoolNextRay = atomicAdd(&g_warpCounter, LOAD_BALANCER_BATCH_SIZE);
            localPoolRayCount = LOAD_BALANCER_BATCH_SIZE;
        }

        // Pick 32 rays from the local pool.
        // Out of work => done.
        {
            rayidx = localPoolNextRay + tidx;
            if (rayidx >= numRays)
                break;

            if (tidx == 0)
            {
                localPoolNextRay += 32;
                localPoolRayCount -= 32;
            }

            // Fetch ray.

            float4 o = FETCH_GLOBAL(rays, rayidx * 2 + 0, float4);
            float4 d = FETCH_GLOBAL(rays, rayidx * 2 + 1, float4);
            origx = o.x, origy = o.y, origz = o.z;
            aux->tmin = o.w;
            valid = (o.w < d.w);

            float ooeps = exp2f(-80.0f); // Avoid div by zero.
            aux->idirx = 1.0f / (fabsf(d.x) > ooeps ? d.x : copysignf(ooeps, d.x));
            aux->idiry = 1.0f / (fabsf(d.y) > ooeps ? d.y : copysignf(ooeps, d.y));
            aux->idirz = 1.0f / (fabsf(d.z) > ooeps ? d.z : copysignf(ooeps, d.z));

            // Setup traversal.

            stackPtr = -1;                  // Stack is empty.
            nodeAddr = 0;                   // Start from the root.
            terminated = false;             // Not terminated yet.
            STORE_RESULT(rayidx, -1, 0.0f); // No triangle intersected so far.
            hitT = d.w;                     // tmax
        }

        // Traversal loop.

        while (valid)
        {
            // Internal node => intersect children.

            if (nodeAddr >= 0)
            {
                // Fetch AABBs of the two child nodes.

#ifdef NODES_ARRAY_OF_STRUCTURES
                float4 n0xy = FETCH_TEXTURE(nodesA, nodeAddr*4+0, float4);  // (c0.lo.x, c0.hi.x, c0.lo.y, c0.hi.y)
                float4 n1xy = FETCH_TEXTURE(nodesA, nodeAddr*4+1, float4);  // (c1.lo.x, c1.hi.x, c1.lo.y, c1.hi.y)
                float4 nz   = FETCH_TEXTURE(nodesA, nodeAddr*4+2, float4);  // (c0.lo.z, c0.hi.z, c1.lo.z, c1.hi.z)
                float4 cnodes=FETCH_TEXTURE(nodesA, nodeAddr*4+3, float4);
#else
                float4 n0xy = FETCH_TEXTURE(nodesA, nodeAddr, float4);      // (c0.lo.x, c0.hi.x, c0.lo.y, c0.hi.y)
                float4 n1xy = FETCH_TEXTURE(nodesB, nodeAddr, float4);      // (c1.lo.x, c1.hi.x, c1.lo.y, c1.hi.y)
                float4 nz   = FETCH_TEXTURE(nodesC, nodeAddr, float4);      // (c0.lo.z, c0.hi.z, c1.lo.z, c1.hi.z)
                float4 cnodes=FETCH_TEXTURE(nodesD, nodeAddr, float4);
#endif

                // Intersect the ray against the child nodes.

                float oodx  = origx * aux->idirx;
                float oody  = origy * aux->idiry;
                float oodz  = origz * aux->idirz;
                float c0lox = n0xy.x * aux->idirx - oodx;
                float c0hix = n0xy.y * aux->idirx - oodx;
                float c0loy = n0xy.z * aux->idiry - oody;
                float c0hiy = n0xy.w * aux->idiry - oody;
                float c0loz = nz.x   * aux->idirz - oodz;
                float c0hiz = nz.y   * aux->idirz - oodz;
                float c1loz = nz.z   * aux->idirz - oodz;
                float c1hiz = nz.w   * aux->idirz - oodz;
                float c0min = max4(fminf(c0lox, c0hix), fminf(c0loy, c0hiy), fminf(c0loz, c0hiz), aux->tmin);
                float c0max = min4(fmaxf(c0lox, c0hix), fmaxf(c0loy, c0hiy), fmaxf(c0loz, c0hiz), hitT);
                float c1lox = n1xy.x * aux->idirx - oodx;
                float c1hix = n1xy.y * aux->idirx - oodx;
                float c1loy = n1xy.z * aux->idiry - oody;
                float c1hiy = n1xy.w * aux->idiry - oody;
                float c1min = max4(fminf(c1lox, c1hix), fminf(c1loy, c1hiy), fminf(c1loz, c1hiz), aux->tmin);
                float c1max = min4(fmaxf(c1lox, c1hix), fmaxf(c1loy, c1hiy), fmaxf(c1loz, c1hiz), hitT);

                // Perform warp-wide vote to decide where to go.

                bool traverseChild0 = (c0max >= c0min);
                bool traverseChild1 = (c1max >= c1min);
                bool anyc0 = __any(traverseChild0);
                bool anyc1 = __any(traverseChild1);
                int nodeAddrChild0 = __float_as_int(cnodes.x); // Stored as int.
                int nodeAddrChild1 = __float_as_int(cnodes.y); // Stored as int.

                // Both children were intersected => vote which one to visit first.

                if (anyc0 && anyc1)
                {
                    red[tidx] = (c1min < c0min) ? 1 : -1;
                    reduceSum((int*)red, tidx);
                    if (red[tidx] >= 0)
                        swap(nodeAddrChild0, nodeAddrChild1);

                    nodeAddr = nodeAddrChild0;
                    if (tidx == 0)
                    {
                        stackPtr++;
                        stack[stackPtr] = nodeAddrChild1; // Lane 0 writes.
                    }
                }

                // Only one child was intersected => go there.

                else if (anyc0)
                {
                    nodeAddr = nodeAddrChild0;
                }
                else if (anyc1)
                {
                    nodeAddr = nodeAddrChild1;
                }

                // Neither child was intersected => pop.

                else
                {
                    if (stackPtr < 0)
                        break;
                    else
                    {
                        nodeAddr = stack[stackPtr]; // All lanes read.
                        if (tidx == 0)
                            stackPtr--; // Lane 0 decrements.
                    }
                }
            } // internal node

            // Leaf node => intersect triangles.

            if (nodeAddr < 0)
            {
                // Fetch the start and end of the triangle list.

                nodeAddr = -nodeAddr-1;
#ifdef NODES_ARRAY_OF_STRUCTURES
                float4 leaf = FETCH_TEXTURE(nodesA, nodeAddr*4+3, float4);
#else
                float4 leaf = FETCH_TEXTURE(nodesD, nodeAddr, float4);
#endif
                int triAddr  = __float_as_int(leaf.x); // Stored as int.
                int triAddr2 = __float_as_int(leaf.y); // Stored as int.

                // Intersect the ray against each triangle using Sven Woop's algorithm.

                for(; triAddr < triAddr2; triAddr++)
                {
                    // Compute and check intersection t-value.

#ifdef TRIANGLES_ARRAY_OF_STRUCTURES
                    float4 v00 = FETCH_GLOBAL(trisA, triAddr*4+0, float4);
                    float4 v11 = FETCH_GLOBAL(trisA, triAddr*4+1, float4);
#else
                    float4 v00 = FETCH_GLOBAL(trisA, triAddr, float4);
                    float4 v11 = FETCH_GLOBAL(trisB, triAddr, float4);
#endif
                    float dirx = 1.0f / aux->idirx;
                    float diry = 1.0f / aux->idiry;
                    float dirz = 1.0f / aux->idirz;

                    float Oz = v00.w - origx*v00.x - origy*v00.y - origz*v00.z;
                    float invDz = 1.0f / (dirx*v00.x + diry*v00.y + dirz*v00.z);
                    float t = Oz * invDz;

                    if (t > aux->tmin && t < hitT)
                    {
                        // Compute and check barycentric u.

                        float Ox = v11.w + origx*v11.x + origy*v11.y + origz*v11.z;
                        float Dx = dirx*v11.x + diry*v11.y + dirz*v11.z;
                        float u = Ox + t*Dx;

                        if (u >= 0.0f)
                        {
                            // Compute and check barycentric v.

#ifdef TRIANGLES_ARRAY_OF_STRUCTURES
                            float4 v22 = FETCH_GLOBAL(trisA, triAddr*4+2, float4);
#else
                            float4 v22 = FETCH_GLOBAL(trisC, triAddr, float4);
#endif
                            float Oy = v22.w + origx*v22.x + origy*v22.y + origz*v22.z;
                            float Dy = dirx*v22.x + diry*v22.y + dirz*v22.z;
                            float v = Oy + t*Dy;

                            if (v >= 0.0f && u + v <= 1.0f)
                            {
                                // Record intersection.
                                // Closest intersection not required => terminate.

                                hitT = t;
                                STORE_RESULT(rayidx, FETCH_GLOBAL(triIndices, triAddr, int), t);
                                if (anyHit)
                                    terminated = true; // NOTE: Cannot break because packet traversal!
                            }
                        }
                    }
                } // triangle

                // All lanes have terminated => traversal done.

                if (__all(terminated))
                    break;

                // Pop stack.

                if (stackPtr < 0)
                    break;
                else
                {
                    nodeAddr = stack[stackPtr]; // Everyone reads.
                    if (tidx == 0)
                        stackPtr--; // Lane 0 decrements.
                }
            } // leaf node
        } // traversal loop
    } while(aux); // persistent threads (always true)
}

Thank you very much in advance !

It’s an intrigueing question :)

The data structures involved are these:

__shared__ RayStruct shared[32 * MaxBlockHeight + 1];
__shared__ volatile int s_stack[STACK_SIZE * MaxBlockHeight];
__shared__ volatile int s_stackPtr[MaxBlockHeight]; // NOTE: This could equally well be in a register.
__shared__ volatile int s_red[32 * MaxBlockHeight];

RayStruct is unknown how many bytes these are.
int is probably 4 bytes
STACK_SIZE is unknown
MaxBlockHeight is 6 according to you.

So you should do some digging and figure out these unknowns.

Then multiply and sum them together and such to see how many bytes these data structures use in total.

Then compare this to the actually ammount of shared memory available on the hardware.

The ammount of shared memory available on hardware is probably not much something like 50 KB or something :)

(Quite a nice raytracer by the way ! it’s from 2010 though… perhaps their is a more modern version available with more graphics features ?! hmmm… or maybe even a faster version somewhere on the net ? :))