All I want is implement a mutex on GPU. I find atomicCAS/atomicExch can be used to implement such a mutex.
Everything works fine, when I worked with old GPUs. Error occurs when I run my code on 2080ti.
Here is the code I use. This code use mutex and atomicAdd separately to do the same computation.
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define data_t int32_t
__global__ void method(
data_t* data,
unsigned int* lock
){
int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int* mutex = &lock[0];
atomicAdd(&data[1], i);
bool blocked = true;
while(blocked){
if(atomicCAS(mutex, 0u, 1u)==0u){
data[0] += i;
data[2] += 1;
blocked = false;
atomicExch(mutex, 0u);
}
}
}
int main(int argc, char* argv[]){
for(int i=0;i<argc;++i)
printf("argc[%d] %s\n", i, argv[i]);
int len = 69;
int dev_id = 0;
if(argc>1){
len = atoi(argv[1]);
}
if(argc>2){
dev_id = atoi(argv[2]);
}
cudaSetDevice(dev_id);
data_t* hst_a = (data_t*)malloc(4*sizeof(data_t));
data_t* dev_a;
unsigned int* hst_lock = (unsigned int*)malloc(4*sizeof(unsigned int));
unsigned int* dev_lock;
cudaMalloc(&dev_a, 4*sizeof(data_t));
cudaMalloc(&dev_lock, 4*sizeof(unsigned int));
memset(hst_a, 0, 4*sizeof(data_t));
memset(hst_lock, 0u, 4*sizeof(unsigned int));
cudaMemcpy(dev_a, hst_a, 4*sizeof(data_t), cudaMemcpyHostToDevice);
cudaMemcpy(dev_lock, hst_lock, 4*sizeof(unsigned int), cudaMemcpyHostToDevice);
method<<<len, 1>>>(dev_a, dev_lock);
cudaMemcpy(hst_a, dev_a, 4*sizeof(data_t), cudaMemcpyDeviceToHost);
for(int i=0;i<4;++i)
printf("%d, ", hst_a[i]);
printf("\n");
}
This code works fine on K40c and GTX970. But on 2080ti, atomicAdd works as expected, atomicCAS only works with very small grid size(~70).
Tested on 2080ti with cuda-10.0(driver-410), cuda10.1(driver-418), cuda10.2(driver-440). None of them works.
Could anyone tell me where the bug is(hardware/driver/cuda/code)? and how to fix it.