Has anyone implemented an effective stack?

I tried with shared memory:

// SoA-like design

// Without bank conflicts

// blockDim.x = blockDim.y = 8 => Stack capacity = 16

__shared__ float Stack[2048];

int BlockSize = blockDim.x * blockDim.y;

int ThreadId = threadIdx.y * blockDim.x + threadIdx.x;

int Pointer = 0;

// push

Stack[Pointer * BlockSize + ThreadId] = a;

Stack[Pointer * BlockSize + ThreadId + 1024] = b;

Pointer++;

// pop

Pointer--;

a = Stack[Pointer * BlockSize + ThreadId];

b = Stack[Pointer * BlockSize + ThreadId + 1024];

Or another variant:

struct StackStructure

{

   float a[64];

   float b[64];

}

// SoA-like design

// Without bank conflicts

__shared__ StackStructure Stack[16];

int ThreadId = threadIdx.y * blockDim.x + threadIdx.x;

int Pointer = 0;

// push

Stack[Pointer].a[ThreadId] = a;

Stack[Pointer].b[ThreadId] = b;

Pointer++;

// pop

Pointer--;

a = Stack[Pointer].a[ThreadId];

b = Stack[Pointer].b[ThreadId];

But it is a lot worse than local (which is bad too):

__local__ float2 Stack[16];

int Pointer = 0;

// push

Stack[Pointer].x = a;

Stack[Pointer].y = b;

Pointer++;

// pop

Pointer--;

a = Stack[Pointer].x

b = Stack[Pointer].y;

Maybe it would be better to use device labels? And coalesce the push/pops?