Hello!
I am running CUDA code on the RTX 3090 GPU. I am trying to find the sum of integers in an array using a full binary tree. I structured the binary tree in the same way one would structure a max-heap with 1-indexing (I just waste the 0th index item). In the code I have pasted below, all my threads are organized linearly, having block dimensions of dim3(256, 1, 1).
In the commented-out section, I had 8 steps of upsweep operating with __syncthreads() between each operation. This worked fine across many different inputs (over 4GB of inputs). However, I thought that for the first 5 steps of upsweep, I probably wouldn’t need a __syncthreads() operation since each of the operations are operating in lock-step within a warp, so I separated the for loop into two different loops, one that performs the first 5 steps of upsweep in lockstep, and one that performs the last 3 steps using __syncthreads() operations in between. However, this fails for some reason. Could somebody explain what is going on here?
#define threads 256
#define logthreads 8
__global__ void upsweep(int *arr, int *d_n, int *ws)
{
__shared__ int heap[2 * threads];
int left, right, pow2, parent_index;
heap[threadIdx.x] = 0;
heap[threadIdx.x + threads] = arr[blockIdx.x * threads + threadIdx.x];
__syncthreads();
// Only needs to be done if threadIdx.x % 2 == 0, but might as well compute it for the rest since there is no additional runtime
left = heap[threadIdx.x + threads];
right = heap[threadIdx.x + 1 + threads];
pow2 = 2;
parent_index = (threads + threadIdx.x) / pow2;
// This is where I substituted the code for the commented section below.
for (unsigned char i = 0; i < 5; i++)
{
if (threadIdx.x % pow2 == 0)
{
left += right;
heap[parent_index] = left;
right = heap[parent_index + 1]; // reassign for next layer
parent_index = parent_index >> 1; // reassign for next layer
}
pow2 = pow2 << 1;
}
__syncthreads();
for (unsigned char i = 5; i < logthreads; i++)
{
if (threadIdx.x % pow2 == 0)
{
left += right;
heap[parent_index] = left;
}
__syncthreads();
if (threadIdx.x % pow2 == 0)
{
right = heap[parent_index + 1]; // reassign for next layer
parent_index = parent_index >> 1; // reassign for next layer
}
pow2 = pow2 << 1;
}
/*
for (unsigned char i = 0; i < logthreads; i++)
{
if (threadIdx.x % pow2 == 0)
{
left += right;
heap[parent_index] = left;
}
__syncthreads();
if (threadIdx.x % pow2 == 0)
{
right = heap[parent_index + 1]; // reassign for next layer
parent_index = parent_index >> 1; // reassign for next layer
}
pow2 = pow2 << 1;
}
*/
// additional code after here...
}