To preface, I need a linked list struct without explicit “dynamic” allocation as specified by cuda(new and delete dont count for some reason) which is thread safe. I want to, for example, call a push_back to my list from each thread(multiple per warp) and have it all work without any problems. I am on an RTX 4050, so I assume my cuda does support warp-level divergence.
I would assume that a device mutex in cuda is written like this:
inline __device__ bool attemptTake(const linkedListCU ls) {
return atomicCAS(ls.mutex, 0, 1) == 0;
}
inline __device__ void releaseList(const linkedListCU ls) {
atomicExch(ls.mutex, 0);
}
and will later be called in a while loop like this:
bool blocked = true;
while(blocked){
if(attemptTake()){
// do memory unsafe ops
releaseList();
blocked = false;
}
}
I implemented a similar structure here:
inline __device__ iteratorCU push_backLinkedCU(linkedListCU& ls, const particlePlaceholder p) {
bool blocked = true;
iteratorCU ret;
while (blocked) {
if (attemptTake(ls)) {
if (ls.size == 0) {
ls.firstNode = new node;
((node*)ls.firstNode)->val = p;
ls.curNode = ls.firstNode;
ret = getBeginningCU(ls);
goto releasePushLinked;
}
((node*)ls.curNode)->nextVal = new node;
ls.curNode = ((node*)ls.curNode)->nextVal;
((node*)ls.curNode)->val = p;
ret.curn = ((node*)ls.curNode)->nextVal;
ret.prevn = ((node*)ls.curNode);
releasePushLinked:
releaseList(ls);
blocked = false;
}
}
return ret;
}
// implementation of getBeginning:
inline __device__ iteratorCU getBeginningCU(const linkedListCU ls) {
bool blocked = true;
iteratorCU ret;
while (blocked) {
if (attemptTake(ls)) {
ret.curn = ((node*)ls.firstNode);
releaseList(ls);
blocked = false;
}
}
return ret;
}
The program cycles in an endless loop, and does not work with high thread counts for some reason.