:mellow:
Hello, guys
As the title said, I get this deadlock on the following piece of code.
It is just a try to implement mutex using atomicExch() to lock the global memory to guarantee exclusively access some pieces of code.
And what’s wrong about this?
#include <stdio.h>
typedef struct Bus
{
int counter1;
int counter2;
}Bus;
__device__ int g_lock;
//#define SHARE_LOCK
#ifdef SHARE_LOCK
#define LOCK shareLock
#else
#define LOCK g_lock
#endif
__global__ void checkKernel(Bus *bus)
{
#ifdef SHARE_LOCK
__shared__ int shareLock;
#endif
if(threadIdx.x == 0)
{
LOCK = 0;
}
__syncthreads();
while(atomicExch(&LOCK, 1));
bus->counter1++;
bus->counter2--;
atomicExch(&LOCK,0);
}
int main()
{
Bus bus;
bus.counter1 = 0;
bus.counter2 = 0;
Bus *d_bus;
cudaMalloc ((void **)&d_bus, sizeof(Bus));
cudaMemcpy(d_bus,&bus,sizeof(Bus),cudaMemcpyHostToDevice);
cudaError_t err;
checkKernel<<<1,2>>>(d_bus);
err = cudaThreadSynchronize();
if(err != 0)
{
printf("Error : %s\n", cudaGetErrorString(cudaGetLastError()));
return -1;
}
cudaMemcpy(&bus,d_bus,sizeof(Bus),cudaMemcpyDeviceToHost);
printf("counter1 = %d\n counter2 = %d\n",bus.counter1,bus.counter2);
}
And my environment is GTX 280 + linux(Ubuntu) + cuda2.3, and compile with nvcc -g -keep -arch=sm_13 shareMemLock.cu -o shareMemLock.
Also I put the ptx code, since I know a little things about the ptx code, maybe this is helpful for you guys.
.global .s32 g_lock;
.entry _Z11checkKernelP3Bus (
.param .u32 __cudaparm__Z11checkKernelP3Bus_bus)
{
.reg .u32 %rv1;
.reg .u32 %r<21>;
.reg .pred %p<5>;
.loc 2 20 0
$LBB1__Z11checkKernelP3Bus:
mov.s32 %r1, 0;
ld.global.s32 %r2, [g_lock];
cvt.u32.u16 %r3, %tid.x;
mov.u32 %r4, 0;
setp.eq.u32 %p1, %r3, %r4;
selp.s32 %r5, %r1, %r2, %p1;
st.global.s32 [g_lock], %r5;
.loc 2 31 0
bar.sync 0;
.loc 2 33 0
mov.u32 %r6, g_lock;
mov.s32 %r7, 1;
atom.global.exch.b32 %rv1, [%r6], %r7;
mov.s32 %r8, %rv1;
mov.u32 %r9, 0;
setp.eq.s32 %p2, %r8, %r9;
@%p2 bra $Lt_0_2050;
$Lt_0_2562:
//<loop> Loop body line 33
mov.u32 %r10, g_lock;
mov.s32 %r11, 1;
atom.global.exch.b32 %rv1, [%r10], %r11;
mov.s32 %r8, %rv1;
mov.u32 %r12, 0;
setp.ne.s32 %p3, %r8, %r12;
@%p3 bra $Lt_0_2562;
$Lt_0_2050:
.loc 2 34 0
ld.param.u32 %r13, [__cudaparm__Z11checkKernelP3Bus_bus];
ld.global.s32 %r14, [%r13+0];
add.s32 %r15, %r14, 1;
st.global.s32 [%r13+0], %r15;
.loc 2 35 0
ld.global.s32 %r16, [%r13+4];
sub.s32 %r17, %r16, 1;
st.global.s32 [%r13+4], %r17;
.loc 15 123 0
mov.u32 %r18, g_lock;
mov.s32 %r19, 0;
atom.global.exch.b32 %rv1, [%r18], %r19;
.loc 2 37 0
exit;
$LDWend__Z11checkKernelP3Bus:
} // _Z11checkKernelP3Bus
Thanks for your time and reply!