I try to use atomicCAS and atomicExch to simulate lock and unlock functions in troditional thread and block concurrcy programming. But I found some strange problems.
Here is my code.
The lock only works between thread block but not threads. It seems will cause dead lock between threads.
__global__ void lockAdd(int*val, int* mutex) {
while (0 != (atomicCAS(mutex, 0, 1))) {}//Similar to spin lock
(*val)++;//all threads add one to the value
atomicExch(mutex, 0);//unlock
}
int main() {
int* mutex;//all threads share on mutex.
cudaMallocManaged((void**)&mutex, sizeof(int));
*mutex = 0;
int* val;
cudaMallocManaged((void**)&val, sizeof(int));//val is on unified memory
*val = 0;
lockAdd << <1024, 1024 >> > (val,mutex);//1024 blocks,1024 threads per block
//lockAdd << <1024, 1 >> > (val,mutex);//If I only launch 1 thread per block, it works perfectly
cudaDeviceSynchronize();
std::cout << *val << std::endl;//the idea output should be 1 million. But it seems there is a dead lock and the driver is crashed.
cudaFree(val);
cudaFree(mutex);
}