I tried with shared memory:
// SoA-like design
// Without bank conflicts
// blockDim.x = blockDim.y = 8 => Stack capacity = 16
__shared__ float Stack[2048];
int BlockSize = blockDim.x * blockDim.y;
int ThreadId = threadIdx.y * blockDim.x + threadIdx.x;
int Pointer = 0;
// push
Stack[Pointer * BlockSize + ThreadId] = a;
Stack[Pointer * BlockSize + ThreadId + 1024] = b;
Pointer++;
// pop
Pointer--;
a = Stack[Pointer * BlockSize + ThreadId];
b = Stack[Pointer * BlockSize + ThreadId + 1024];
Or another variant:
struct StackStructure
{
float a[64];
float b[64];
}
// SoA-like design
// Without bank conflicts
__shared__ StackStructure Stack[16];
int ThreadId = threadIdx.y * blockDim.x + threadIdx.x;
int Pointer = 0;
// push
Stack[Pointer].a[ThreadId] = a;
Stack[Pointer].b[ThreadId] = b;
Pointer++;
// pop
Pointer--;
a = Stack[Pointer].a[ThreadId];
b = Stack[Pointer].b[ThreadId];
But it is a lot worse than local (which is bad too):
__local__ float2 Stack[16];
int Pointer = 0;
// push
Stack[Pointer].x = a;
Stack[Pointer].y = b;
Pointer++;
// pop
Pointer--;
a = Stack[Pointer].x
b = Stack[Pointer].y;
Maybe it would be better to use device labels? And coalesce the push/pops?