I implemented a simple mutex mechanism:
__device__ void lock(int *mutex)
{
    // spin until we atomically swap 0 -> 1, i.e. until we own the lock
    while (atomicCAS(mutex, 0, 1) != 0);
}

__device__ void unlock(int *mutex)
{
    // release the lock by atomically writing 0 back
    atomicExch(mutex, 0);
}
And I'm running a simple kernel to test it:
__global__ void kernel(int *mutex, int *acc)
{
    lock(mutex);
    ++(*acc);   // critical section: one increment per thread
    unlock(mutex);
}
I invoke it with:
int *mutex, *acc, res;
res = -1;
CUDA_CALL(cudaMalloc((void **)&mutex, sizeof(int)));
CUDA_CALL(cudaMemset(mutex, 0, sizeof(int)));
CUDA_CALL(cudaMalloc((void **)&acc, sizeof(int)));
CUDA_CALL(cudaMemset(acc, 0, sizeof(int)));
kernel<<<blocks, threads>>>(mutex, acc);
CUDA_CALL(cudaMemcpy(&res, acc, sizeof(int), cudaMemcpyDeviceToHost));
printf("Result: %d\n", res);
So when I run with:
1. blocks(16,16); threads(1,1);
   - without the mutex: I get ~50 → as expected → OK
   - with the mutex: I get 256 → OK
2. blocks(1,1); threads(16,16);
   - without the mutex: I get 1 → plausible → OK
   - with the mutex: the program gets stuck, and I don't know why…
I know there may be some race conditions, but I don't see where.
It also works fine when using one thread per block, so in theory it
should work with one block and many threads.
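To show what I mean by "one thread per block", here is a hypothetical variant (kernelLeaderOnly is a made-up name, not in my real code) where only thread (0,0) of each block enters the critical section; I would expect it to behave like case 1:

// Hypothetical variant: only thread (0,0) of each block takes the lock;
// every other thread skips the critical section entirely.
__global__ void kernelLeaderOnly(int *mutex, int *acc)
{
    if (threadIdx.x == 0 && threadIdx.y == 0) {
        lock(mutex);
        ++(*acc);
        unlock(mutex);
    }
}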
I also can't debug it, as no breakpoint is hit in any of the kernels.
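In case it matters, this is the kind of launch check I can add right after the kernel call (cudaGetLastError and cudaDeviceSynchronize are the standard CUDA runtime calls):

kernel<<<blocks, threads>>>(mutex, acc);
CUDA_CALL(cudaGetLastError());       // catches invalid launch configurations
CUDA_CALL(cudaDeviceSynchronize());  // waits for the kernel; a deadlocked kernel hangs here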
Got any ideas/solutions? :)