This modified version seems to work better for me:
#include <cuda.h>
#include <stdio.h>

#define BLOCK_SIZE 512

typedef struct {
  int val;
  int next;
} MyStruct;

__device__ int HashIt(int loc) {
  return 1;
}

__global__ void kernel_function(int *hashMap, MyStruct *hashPool,
                                int *hashLock, int size)
{
  int loc = blockIdx.x*BLOCK_SIZE + threadIdx.x;
  if (loc < size) {
    int bucket = HashIt(loc);
    bool succeed = false;
    do {
      if (atomicCAS(&hashLock[bucket], 0, 1) == 0) {
        hashMap[bucket] = loc;
        hashPool[loc].val = 100;
        atomicExch(&hashLock[bucket], 0);
        succeed = true;
      }
    } while (!succeed);
  }
}

int main() {
  int* hashMap;
  int* hashLock;
  MyStruct* hashPool;
  int size = 2048;
  cudaMalloc((void**)&hashLock, 10 * sizeof(int));
  cudaMemset(hashLock, 0, 10 * sizeof(int));
  cudaMalloc((void**)&hashMap, 10 * sizeof(int));
  cudaMemset(hashMap, 0, 10 * sizeof(int));
  cudaMalloc((void**)&hashPool, size * sizeof(MyStruct));
  cudaMemset(hashPool, 0, size * sizeof(MyStruct));
  kernel_function<<<size/BLOCK_SIZE, BLOCK_SIZE>>>(hashMap, hashPool, hashLock, size);
  cudaDeviceSynchronize();
  cudaFree(hashLock);
  cudaFree(hashMap);
  cudaFree(hashPool);
  return 0;
}
The only thing I have done is refactor the while loop into a do-while loop.
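For reference, I assume the failing variant's loop looked roughly like this (my reconstruction, not necessarily your exact code; it is semantically equivalent to the do-while above, but the compiler schedules the branch differently):

// Reconstruction of the original while-loop form, for comparison only:
bool succeed = false;
while (!succeed) {
  if (atomicCAS(&hashLock[bucket], 0, 1) == 0) {
    hashMap[bucket] = loc;
    hashPool[loc].val = 100;
    atomicExch(&hashLock[bucket], 0);
    succeed = true;
  }
}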
The difference is in how the compiler schedules the various conditional paths; it can be seen by comparing the SASS output of the two variants with cuobjdump -sass.
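For example (lock.cu is just a placeholder name for the source file; -arch=sm_35 matches the SM35 header in the dumps below):

nvcc -arch=sm_35 -o lock lock.cu
cuobjdump -sass ./lock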
For the failing variant, I have marked what I believe is the critical section of code:
Function : _Z15kernel_functionPiP8MyStructS_i
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x08a0b8b0a0a08cc0 */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ S2R R0, SR_CTAID.X; /* 0x86400000129c0002 */
/*0018*/ S2R R3, SR_TID.X; /* 0x86400000109c000e */
/*0020*/ IMAD R0, R0, 0x200, R3; /* 0xa1080c01001c0001 */
/*0028*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x158], PT; /* 0x5b681c002b1c001e */
/*0030*/ @P0 EXIT; /* 0x180000000000003c */
/*0038*/ MOV R2, c[0x0][0x150]; /* 0x64c03c002a1c000a */
/* 0x08fcb8a0c0ac8010 */
/*0048*/ IADD32I R2.CC, R2, 0x4; /* 0x40800000021c0809 */
/*0050*/ MOV32I R5, 0x1; /* 0x74000000009fc016 */
/*0058*/ SSY 0xa0; /* 0x1480000020000000 */
/*0060*/ IADD.X R3, RZ, c[0x0][0x154]; /* 0x608040002a9ffc0e */
/*0068*/ MOV R4, RZ; /* 0xe4c03c007f9c0012 */ <-
/*0070*/ ATOM.E.CAS R4, [R2], R4, R5; /* 0x77880000021c0812 */ |
/*0078*/ CCTL.E.IV [R2]; /* 0x7b800000001c0816 */ |
/* 0x08a010b010bcb8b0 */ |
/*0088*/ ISETP.NE.AND P0, PT, R4, RZ, PT; /* 0xdb581c007f9c101e */ |
/*0090*/ @P0 BRA 0x68; /* 0x12007fffe800003c */ *** This loop allows warp deadlock.
/*0098*/ MOV.S R4, c[0x0][0x140]; /* 0x64c03c00285c0012 */ * In order to get past the above loop, all threads in the warp must get an identical response from the ATOM instruction.
/*00a0*/ IADD32I R4.CC, R4, 0x4; /* 0x40800000021c1011 */
/*00a8*/ MOV32I R7, 0x8; /* 0x74000000041fc01e */
/*00b0*/ IADD.X R5, RZ, c[0x0][0x144]; /* 0x60804000289ffc16 */
/*00b8*/ ISCADD R6.CC, R0, c[0x0][0x148], 0x3; /* 0x60c40c00291c001a */
/* 0x08bc10b8b8a88c10 */
/*00c8*/ MOV32I R8, 0x64; /* 0x74000000321fc022 */
/*00d0*/ ST.E [R4], R0; /* 0xe4800000001c1000 */
/*00d8*/ IMAD.HI.X R7, R0, R7, c[0x0][0x14c]; /* 0x93181c00299c001e */
/*00e0*/ ST.E [R6], R8; /* 0xe4800000001c1820 */
/*00e8*/ ATOM.E.EXCH R9, [R2], RZ; /* 0x6c0800007f9c0826 */
/*00f0*/ CCTL.E.IV [R2]; /* 0x7b800000001c0816 */
/*00f8*/ EXIT; /* 0x18000000001c003c */
In the refactored code, we see the desirable pattern: there is no branching (and therefore no possibility of warp deadlock) between acquisition of the lock in the ATOM.E.CAS instruction and release of the lock in the ATOM.E.EXCH instruction:
Function : _Z15kernel_functionPiP8MyStructS_i
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x08a0b8b0a0a08cc0 */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ S2R R0, SR_CTAID.X; /* 0x86400000129c0002 */
/*0018*/ S2R R3, SR_TID.X; /* 0x86400000109c000e */
/*0020*/ IMAD R0, R0, 0x200, R3; /* 0xa1080c01001c0001 */
/*0028*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x158], PT; /* 0x5b681c002b1c001e */
/*0030*/ @P0 EXIT; /* 0x180000000000003c */
/*0038*/ MOV R4, c[0x0][0x150]; /* 0x64c03c002a1c0012 */
/* 0x08a0c0b08010b010 */
/*0048*/ IADD32I R4.CC, R4, 0x4; /* 0x40800000021c1011 */
/*0050*/ MOV32I R9, 0x8; /* 0x74000000041fc026 */
/*0058*/ IADD.X R5, RZ, c[0x0][0x154]; /* 0x608040002a9ffc16 */
/*0060*/ MOV32I R3, 0x1; /* 0x74000000009fc00e */
/*0068*/ ISCADD R8.CC, R0, c[0x0][0x148], 0x3; /* 0x60c40c00291c0022 */
/*0070*/ IMAD.HI.X R9, R0, R9, c[0x0][0x14c]; /* 0x93182400299c0026 */
/*0078*/ MOV R2, RZ; /* 0xe4c03c007f9c000a */
/* 0x08a010bcb0fcb810 */
/*0088*/ ATOM.E.CAS R2, [R4], R2, R3; /* 0x77880000011c100a */ *** lock acquired
/*0090*/ SSY 0x100; /* 0x1480000034000000 */ ** no
/*0098*/ CCTL.E.IV [R4]; /* 0x7b800000001c1016 */ ** intervening
/*00a0*/ ISETP.NE.AND P0, PT, R2, RZ, PT; /* 0xdb581c007f9c081e */ ** branch
/*00a8*/ @P0 NOP.S; /* 0x8580000000403c02 */ ** instructions
/*00b0*/ MOV R6, c[0x0][0x140]; /* 0x64c03c00281c001a */ **
/*00b8*/ MOV32I R11, 0x64; /* 0x74000000321fc02e */ **
/* 0x08bc10b8b8b8a0b0 */ **
/*00c8*/ IADD32I R6.CC, R6, 0x4; /* 0x40800000021c1819 */ **
/*00d0*/ IADD.X R7, RZ, c[0x0][0x144]; /* 0x60804000289ffc1e */ **
/*00d8*/ ST.E [R6], R0; /* 0xe4800000001c1800 */ **
/*00e0*/ ST.E [R8], R11; /* 0xe4800000001c202c */ **
/*00e8*/ ATOM.E.EXCH R2, [R4], RZ; /* 0x6c0800007f9c100a */ *** lock released
/*00f0*/ CCTL.E.IV [R4]; /* 0x7b800000001c1016 */
/*00f8*/ NOP.S; /* 0x85800000005c3c02 */
/* 0x0800000000bc10b8 */
/*0108*/ @P0 BRA 0x78; /* 0x12007fffb400003c */
/*0110*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/*0118*/ EXIT; /* 0x18000000001c003c */
I’m not going to try to explain why the previous while() construct seemed to work at one point and why it does not now. My best guess is that the compiler is different; a careful study of the SASS generated by the different compiler versions, along the lines of the comparison above, would probably identify the difference.

As I pointed out in the thread you linked, warp-level contention for locks is extremely hazardous. I can’t give you a construct and guarantee that no future compiler optimization will refactor it into a warp-divergent (and therefore deadlock-prone) realization.
My recommendation is to have no more than one thread in a warp compete for a global lock. Use other warp-collective or communication techniques to manage intra-warp behavior.
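As a rough sketch of that recommendation (my own illustration, not a drop-in replacement for the kernel above), the following assumes CUDA 9+ warp intrinsics, a fully active warp (true in the example, since size is a multiple of the warp size), and a warp-uniform bucket (true for the stub HashIt() above). Only lane 0 of each warp ever touches the global lock:

#define FULL_MASK 0xffffffffu

__device__ void warp_guarded_update(int *hashMap, MyStruct *hashPool,
                                    int *hashLock, int bucket, int loc)
{
  int lane = threadIdx.x & 31;

  if (lane == 0) {
    // Only one thread per warp spins on the global lock, so intra-warp
    // contention for the lock (and the associated deadlock risk) goes away.
    while (atomicCAS(&hashLock[bucket], 0, 1) != 0) { }
  }
  __syncwarp(FULL_MASK);                 // wait until lane 0 holds the lock

  hashMap[bucket]   = loc;               // last writer in the warp wins, as in
  hashPool[loc].val = 100;               // the per-thread-lock version above

  __threadfence();                       // make the writes visible before release
  __syncwarp(FULL_MASK);                 // all lanes are done with the update
  if (lane == 0)
    atomicExch(&hashLock[bucket], 0);    // lane 0 releases on behalf of the warp
}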
The method described here:
https://stackoverflow.com/questions/18963293/cuda-atomics-change-flag/18968893#18968893
outlines one possible warp-level mechanism. You could also use something like warp aggregation:
https://devblogs.nvidia.com/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/
to manage how the individual threads in a warp compete.
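As a rough illustration of the warp-aggregation idea (a minimal sketch in the spirit of that pro-tip, not a drop-in fix for the hash-table code above; it assumes CUDA 9+ warp intrinsics and a 1D thread block), each warp issues a single atomicAdd and the participating lanes then compute their own slots:

__device__ int atomicAggInc(int *ctr)
{
  unsigned mask   = __activemask();            // lanes that want an increment
  int      leader = __ffs(mask) - 1;           // lowest active lane does the atomic
  int      lane   = threadIdx.x & 31;
  int      base;

  if (lane == leader)
    base = atomicAdd(ctr, __popc(mask));       // one atomic for the whole warp
  base = __shfl_sync(mask, base, leader);      // broadcast the base offset

  // Each lane gets a distinct slot: base plus its rank among the active lanes.
  return base + __popc(mask & ((1u << lane) - 1));
}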
This is a complex topic.