I have the following structure:
template <int size>
class MemoryTracker {
private:
int fieldEmpty;
int fieldPtr;
int freeField;
int highestNode;
public:
__device__ inline void init() {
if (threadIdx.x<32) {
freeField=0;
highestNode=0;
}
for (int tid=threadIdx.x;tid<size;tid+=blockDim.x)
fieldPtr[tid]=tid;
}
__device__ inline void reset();
__device__ inline bool isFree(int idx){return idx>=highestNode || fieldEmpty[idx]==1;}
__device__ inline int getSize() {return highestNode;}
__device__ inline int add() {
int idx=atomicAdd(&freeField,1);
atomicMax(&highestNode,idx+1);
idx=fieldPtr[idx];
fieldEmpty[idx]=0;
return idx;
}
__device__ int add_block(int amount) {
__shared__ int bidx;
if (threadIdx.x==0) {
bidx=atomicAdd(&freeField,amount);
atomicMax(&highestNode,bidx+amount);
}
__syncthreads();
int idx=-1;
if (threadIdx.x<amount) {
idx=fieldPtr[bidx+threadIdx.x];
fieldEmpty[idx]=0;
}
return idx;
}
__device__ inline void remove(int idx) {
fieldEmpty[idx]=1;
}
__device__ inline void compact(int *sharr) {
//TODO: you might want to implement something without atomic operations
if (threadIdx.x<32)
freeField=highestNode;
__syncthreads();
for (int i=threadIdx.x;i<highestNode;i+=blockDim.x) {
if (fieldEmpty[i]==1) {
int addr=atomicSub(&freeField,1);
fieldPtr[addr]=i;
}
}
}
};
template <typename T, int size>
class TrackedMemoryManager : public MemoryTracker<size> {
private:
T data;
public:
__device__ inline T &operator[](int idx){return data[idx];};
};
It is not very optimised, and it was not fully tested, so use it with care! The components are:
fieldEmpty array - just stores information is cell x is empty or not
fieldPtr array - for x>=freeField it holds an unique pointer to an empty cell
freeField - holds an index to the first pointer in fieldPtr which points to empty cells
highestNode - holds an index to a possibly highest nonempty cell (so that, for example, you can launch a kernel over all items stored)
void init() - you need to call it once in your program
int add() - allocates a single element. (one element per thread)
int add_block(int amount) - allocates amount elements. Whole block must call this one.
void remove(int idx) - marks given cell that it is now empty.
void compact(int *sharr) - call it with a separate kernel. It will compact freePtr array and reduce freeField by the elements which were deallocated, so that they can be reused in subsequent calls to add and add_block. The function as it is now is extremely slow. I just created it just to be working, but should be replaced by some reduction algorithm.
Use MemoryTracker if the data is stored in a separate array and you just need to know which indexes to acces.
Use TrackedMemoryManager if you do not allocate data storage yourself.