atomicCAS mutexes not working properly

To preface, I need a linked list struct without explicit “dynamic” allocation as specified by cuda(new and delete dont count for some reason) which is thread safe. I want to, for example, call a push_back to my list from each thread(multiple per warp) and have it all work without any problems. I am on an RTX 4050, so I assume my cuda does support warp-level divergence.

I would assume that a device mutex in cuda is written like this:

inline __device__ bool attemptTake(const linkedListCU ls) {
	return atomicCAS(ls.mutex, 0, 1) == 0;
}

inline __device__ void releaseList(const linkedListCU ls) {
	atomicExch(ls.mutex, 0);
}

and will later be called in a while loop like this:

bool blocked = true;

while(blocked){
     if(attemptTake()){
           // do memory unsafe ops
           releaseList();
           blocked = false;
     }
}

I implemented a similar structure here:

inline __device__ iteratorCU push_backLinkedCU(linkedListCU& ls, const particlePlaceholder p) {
	bool blocked = true;
	iteratorCU ret;
	while (blocked) {
		if (attemptTake(ls)) {
			if (ls.size == 0) {
				ls.firstNode = new node;
				((node*)ls.firstNode)->val = p;
				ls.curNode = ls.firstNode;
				ret = getBeginningCU(ls);
				goto releasePushLinked;
			}
			((node*)ls.curNode)->nextVal = new node;

			ls.curNode = ((node*)ls.curNode)->nextVal;
			((node*)ls.curNode)->val = p;


			ret.curn = ((node*)ls.curNode)->nextVal;
			ret.prevn = ((node*)ls.curNode);
			releasePushLinked:
			releaseList(ls);
			blocked = false;
		}
	}
	return ret;
}

// implementation of getBeginning:
inline __device__ iteratorCU getBeginningCU(const linkedListCU ls) {
	bool blocked = true;
	iteratorCU ret;
	while (blocked) {
		if (attemptTake(ls)) {
			ret.curn = ((node*)ls.firstNode);
			releaseList(ls);
			blocked = false;
		}
	}

	return ret;
}

The program cycles in an endless loop, and does not work with high thread counts for some reason.

For additional context, I am on cuda version 12.7 and am working in visual studio 2022.

Is ls.mutex initialized?

are you compiling for an architecture consistent with your GPU (e.g. for cc8.9) ?