Problem with Hash Table Lock in CUDA

Greetings,

I am currently trying to implement a GPU hash table, but I am struggling to implement a proper lock on the table when inserting new data into a hash slot.

I am using an int array as a flag array, with atomicCAS and atomicExch operations to acquire and release the lock.

I tried the approach mentioned in https://devtalk.nvidia.com/default/topic/762279/problem-with-lock-using-atomiccas/ but I am still getting a deadlock. Here is a simplified version of my code:

struct pool_struct {
    int val;
    int next;
};

__global__ void mykernel(int* hash_entry,
                         pool_struct* hash_pool,
                         int* hash_lock,
                         int size) {
    int x = blockIdx.x*BLOCK_SIZE + threadIdx.x;

    if (x < size) {
        int hash_bucket = hash_it(x);
        bool succeed = false;

        while (!succeed) {
            if (atomicCAS(&hash_lock[hash_bucket], 0, 1) == 0 ) {
                hash_pool[hash_bucket].next = hash_entry[x];
                hash_pool[hash_bucket].val = x;
                hash_entry[x] = hash_bucket;

                atomicExch(&hash_lock[hash_bucket], 0);
                succeed = true;
            }
        }
    }
}

I really don’t know why it is giving me a deadlock… I have been stuck on this for 3 weeks and am getting frustrated. I hope someone can help!

Best

If you want to provide a complete runnable test case, as the other poster did in the thread you linked, I will take a look as time permits.

If your test case is longer than about 100 lines of code, I will probably not take a look.

Hi txbob, here is a runnable version of my code.

#include <cuda.h>
#include <stdio.h>

#define BLOCK_SIZE 512

typedef struct {
    int val;
    int next;
} MyStruct;

__device__ int HashIt(int loc) {
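    // NOTE: always returns bucket 1, so every thread contends for the same lock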
    return 1;
}

__global__ void kernel_function(int *hashMap, MyStruct *hashPool,
                                int *hashLock,
                                int size)
{
    int loc = blockIdx.x*BLOCK_SIZE + threadIdx.x;

    if (loc < size) {

        int bucket = HashIt(loc);

        bool succeed = false;
        while(!succeed) {
            if (atomicCAS(&hashLock[bucket], 0, 1) == 0 ) {
                hashMap[bucket] = loc;
                hashPool[loc].val = 100;

                atomicExch(&hashLock[bucket], 0);
                succeed = true;
            }
        }
    }
}

int main() {
    int* hashMap;
    int* hashLock;
    MyStruct* hashPool;
    int size = 2048;

    cudaMalloc((void**)&hashLock, 10 * sizeof(int));
    cudaMemset(hashLock, 0, 10 * sizeof(int));
    cudaMalloc((void**)&hashMap, 10 * sizeof(int));
    cudaMemset(hashMap, 0, 10 * sizeof(int));
    cudaMalloc((void**)&hashPool, size * sizeof(MyStruct));
    cudaMemset(hashPool, 0, size * sizeof(MyStruct));

    kernel_function<<<size/BLOCK_SIZE, BLOCK_SIZE>>>(hashMap, hashPool, hashLock, size);

    cudaFree(hashLock);
    cudaFree(hashMap);
    cudaFree(hashPool);
    
    return 0;
}

I am currently using a GTX 1050 in my laptop. The compile command is

nvcc test.cu -o test -gencode arch=compute_61,code=sm_61 -arch sm_61

Many thanks in advance

This modified version seems to work better for me:

#include <cuda.h>
#include <stdio.h>

#define BLOCK_SIZE 512

typedef struct {
    int val;
    int next;
} MyStruct;

__device__ int HashIt(int loc) {
    return 1;
}

__global__ void kernel_function(int *hashMap, MyStruct *hashPool,
                                int *hashLock,
                                int size)
{
    int loc = blockIdx.x*BLOCK_SIZE + threadIdx.x;

    if (loc < size) {

        int bucket = HashIt(loc);

        bool succeed = false;
        do {
            if (atomicCAS(&hashLock[bucket], 0, 1) == 0 ) {
                hashMap[bucket] = loc;
                hashPool[loc].val = 100;

                atomicExch(&hashLock[bucket], 0);
                succeed = true;
            }
        } while(!succeed);
    }
}

int main() {
    int* hashMap;
    int* hashLock;
    MyStruct* hashPool;
    int size = 2048;

    cudaMalloc((void**)&hashLock, 10 * sizeof(int));
    cudaMemset(hashLock, 0, 10 * sizeof(int));
    cudaMalloc((void**)&hashMap, 10 * sizeof(int));
    cudaMemset(hashMap, 0, 10 * sizeof(int));
    cudaMalloc((void**)&hashPool, size * sizeof(MyStruct));
    cudaMemset(hashPool, 0, size * sizeof(MyStruct));

    kernel_function<<<size/BLOCK_SIZE, BLOCK_SIZE>>>(hashMap, hashPool, hashLock, size);
    cudaDeviceSynchronize();
    cudaFree(hashLock);
    cudaFree(hashMap);
    cudaFree(hashPool);

    return 0;
}

Apart from adding a cudaDeviceSynchronize() after the kernel launch, the only thing I have done is refactor the while loop into a do-while loop.

The difference is in how the compiler schedules the various conditional paths. It can be seen by comparing the SASS output of the two variants from cuobjdump -sass.
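For reference, a command sequence along these lines produces SASS listings like the ones below (assuming the source is saved as test.cu and compiled for the sm_35 target shown in the dumps; the exact flags here are my assumption, not taken from the posts above):

nvcc -arch=sm_35 -o test test.cu
cuobjdump -sass test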

For the failing variant, I have marked what I believe is the critical section of code:

Function : _Z15kernel_functionPiP8MyStructS_i
        .headerflags    @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
                                                                                /* 0x08a0b8b0a0a08cc0 */
        /*0008*/                   MOV R1, c[0x0][0x44];                        /* 0x64c03c00089c0006 */
        /*0010*/                   S2R R0, SR_CTAID.X;                          /* 0x86400000129c0002 */
        /*0018*/                   S2R R3, SR_TID.X;                            /* 0x86400000109c000e */
        /*0020*/                   IMAD R0, R0, 0x200, R3;                      /* 0xa1080c01001c0001 */
        /*0028*/                   ISETP.GE.AND P0, PT, R0, c[0x0][0x158], PT;  /* 0x5b681c002b1c001e */
        /*0030*/               @P0 EXIT;                                        /* 0x180000000000003c */
        /*0038*/                   MOV R2, c[0x0][0x150];                       /* 0x64c03c002a1c000a */
                                                                                /* 0x08fcb8a0c0ac8010 */
        /*0048*/                   IADD32I R2.CC, R2, 0x4;                      /* 0x40800000021c0809 */
        /*0050*/                   MOV32I R5, 0x1;                              /* 0x74000000009fc016 */
        /*0058*/                   SSY 0xa0;                                    /* 0x1480000020000000 */
        /*0060*/                   IADD.X R3, RZ, c[0x0][0x154];                /* 0x608040002a9ffc0e */
        /*0068*/                   MOV R4, RZ;                                  /* 0xe4c03c007f9c0012 */  <-
        /*0070*/                   ATOM.E.CAS R4, [R2], R4, R5;                 /* 0x77880000021c0812 */    |
        /*0078*/                   CCTL.E.IV [R2];                              /* 0x7b800000001c0816 */    |
                                                                                /* 0x08a010b010bcb8b0 */    |
        /*0088*/                   ISETP.NE.AND P0, PT, R4, RZ, PT;             /* 0xdb581c007f9c101e */    |
        /*0090*/               @P0 BRA 0x68;                                    /* 0x12007fffe800003c */  *** This loop allows warp deadlock.
        /*0098*/                   MOV.S R4, c[0x0][0x140];                     /* 0x64c03c00285c0012 */  * In order to get past the above loop, all threads in the warp must get an identical response from the ATOM instruction
        /*00a0*/                   IADD32I R4.CC, R4, 0x4;                      /* 0x40800000021c1011 */
        /*00a8*/                   MOV32I R7, 0x8;                              /* 0x74000000041fc01e */
        /*00b0*/                   IADD.X R5, RZ, c[0x0][0x144];                /* 0x60804000289ffc16 */
        /*00b8*/                   ISCADD R6.CC, R0, c[0x0][0x148], 0x3;        /* 0x60c40c00291c001a */
                                                                                /* 0x08bc10b8b8a88c10 */
        /*00c8*/                   MOV32I R8, 0x64;                             /* 0x74000000321fc022 */
        /*00d0*/                   ST.E [R4], R0;                               /* 0xe4800000001c1000 */
        /*00d8*/                   IMAD.HI.X R7, R0, R7, c[0x0][0x14c];         /* 0x93181c00299c001e */
        /*00e0*/                   ST.E [R6], R8;                               /* 0xe4800000001c1820 */
        /*00e8*/                   ATOM.E.EXCH R9, [R2], RZ;                    /* 0x6c0800007f9c0826 */
        /*00f0*/                   CCTL.E.IV [R2];                              /* 0x7b800000001c0816 */
        /*00f8*/                   EXIT;                                        /* 0x18000000001c003c */

In the refactored code, we see the desirable pattern: there is no branching (and therefore no possibility of warp deadlock) between the acquisition of the lock in the ATOM.E.CAS instruction and the release of the lock in the ATOM.E.EXCH instruction:

Function : _Z15kernel_functionPiP8MyStructS_i
        .headerflags    @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
                                                                                /* 0x08a0b8b0a0a08cc0 */
        /*0008*/                   MOV R1, c[0x0][0x44];                        /* 0x64c03c00089c0006 */
        /*0010*/                   S2R R0, SR_CTAID.X;                          /* 0x86400000129c0002 */
        /*0018*/                   S2R R3, SR_TID.X;                            /* 0x86400000109c000e */
        /*0020*/                   IMAD R0, R0, 0x200, R3;                      /* 0xa1080c01001c0001 */
        /*0028*/                   ISETP.GE.AND P0, PT, R0, c[0x0][0x158], PT;  /* 0x5b681c002b1c001e */
        /*0030*/               @P0 EXIT;                                        /* 0x180000000000003c */
        /*0038*/                   MOV R4, c[0x0][0x150];                       /* 0x64c03c002a1c0012 */
                                                                                /* 0x08a0c0b08010b010 */
        /*0048*/                   IADD32I R4.CC, R4, 0x4;                      /* 0x40800000021c1011 */
        /*0050*/                   MOV32I R9, 0x8;                              /* 0x74000000041fc026 */
        /*0058*/                   IADD.X R5, RZ, c[0x0][0x154];                /* 0x608040002a9ffc16 */
        /*0060*/                   MOV32I R3, 0x1;                              /* 0x74000000009fc00e */
        /*0068*/                   ISCADD R8.CC, R0, c[0x0][0x148], 0x3;        /* 0x60c40c00291c0022 */
        /*0070*/                   IMAD.HI.X R9, R0, R9, c[0x0][0x14c];         /* 0x93182400299c0026 */
        /*0078*/                   MOV R2, RZ;                                  /* 0xe4c03c007f9c000a */
                                                                                /* 0x08a010bcb0fcb810 */
        /*0088*/                   ATOM.E.CAS R2, [R4], R2, R3;                 /* 0x77880000011c100a */  *** lock acquired
        /*0090*/                   SSY 0x100;                                   /* 0x1480000034000000 */  ** no
        /*0098*/                   CCTL.E.IV [R4];                              /* 0x7b800000001c1016 */  ** intervening
        /*00a0*/                   ISETP.NE.AND P0, PT, R2, RZ, PT;             /* 0xdb581c007f9c081e */  ** branch
        /*00a8*/               @P0 NOP.S;                                       /* 0x8580000000403c02 */  ** instructions
        /*00b0*/                   MOV R6, c[0x0][0x140];                       /* 0x64c03c00281c001a */  **
        /*00b8*/                   MOV32I R11, 0x64;                            /* 0x74000000321fc02e */  **
                                                                                /* 0x08bc10b8b8b8a0b0 */  **
        /*00c8*/                   IADD32I R6.CC, R6, 0x4;                      /* 0x40800000021c1819 */  **
        /*00d0*/                   IADD.X R7, RZ, c[0x0][0x144];                /* 0x60804000289ffc1e */  **
        /*00d8*/                   ST.E [R6], R0;                               /* 0xe4800000001c1800 */  **
        /*00e0*/                   ST.E [R8], R11;                              /* 0xe4800000001c202c */  **
        /*00e8*/                   ATOM.E.EXCH R2, [R4], RZ;                    /* 0x6c0800007f9c100a */  *** lock released
        /*00f0*/                   CCTL.E.IV [R4];                              /* 0x7b800000001c1016 */
        /*00f8*/                   NOP.S;                                       /* 0x85800000005c3c02 */
                                                                                /* 0x0800000000bc10b8 */
        /*0108*/               @P0 BRA 0x78;                                    /* 0x12007fffb400003c */
        /*0110*/                   MOV RZ, RZ;                                  /* 0xe4c03c007f9c03fe */
        /*0118*/                   EXIT;                                        /* 0x18000000001c003c */

I’m not going to try to explain why the previous while() construct seemed to work at one point and why it does not now. My best guess is that the compiler is different. Careful study of the SASS generated by the different compiler versions, similar to the comparison above, would probably identify the difference. As I pointed out in the thread you linked, warp-level contention for locks is extremely hazardous. I’m not sure I can give you a construct that some future compiler optimization will not refactor into a warp-divergent (and therefore deadlock-prone) realization.

My recommendation is to have no more than one thread in a warp compete for a global lock. Use other warp-collective or communication techniques to manage intra-warp behavior.
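For illustration only, here is a minimal sketch of that recommendation, written against the test case above (it reuses BLOCK_SIZE, MyStruct, HashIt and the same kernel parameters; the serialization loop is my own illustration, not code from either linked thread, and it carries no guarantee against future compiler refactoring):

__global__ void kernel_function_serialized(int *hashMap, MyStruct *hashPool,
                                           int *hashLock,
                                           int size)
{
    int loc = blockIdx.x*BLOCK_SIZE + threadIdx.x;

    if (loc < size) {
        int bucket = HashIt(loc);

        // Take the lanes of the warp one at a time, so at most one lane of
        // this warp is ever spinning on the bucket lock.  While lane i is in
        // the critical section, the other lanes are masked off at the branch,
        // so the warp cannot deadlock against itself.
        for (int lane = 0; lane < 32; lane++) {
            if ((threadIdx.x % 32) == lane) {
                while (atomicCAS(&hashLock[bucket], 0, 1) != 0) { }  // acquire
                hashMap[bucket]   = loc;
                hashPool[loc].val = 100;
                __threadfence();                    // make updates visible before release
                atomicExch(&hashLock[bucket], 0);   // release
            }
        }
    }
}

The obvious cost is that the inserts within a warp are fully serialized; the links below describe ways to reduce that cost.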

The method described here:

https://stackoverflow.com/questions/18963293/cuda-atomics-change-flag/18968893#18968893

outlines a possible warp-level mechanism. You could use something like warp-aggregation:

https://devblogs.nvidia.com/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/

to manage the behavior of individual threads in a warp competing.
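As a rough sketch of the warp-aggregation idea from that blog post (the helper name atomicAggInc and the counter parameter are my own; the blog post itself uses cooperative groups, which is the more robust formulation):

// One atomicAdd per warp instead of one per active thread (requires CUDA 9+).
__device__ int atomicAggInc(int *counter)
{
    unsigned mask = __activemask();               // lanes currently active in this warp
    int lane   = threadIdx.x % 32;
    int leader = __ffs(mask) - 1;                 // lowest active lane acts as leader
    int base;
    if (lane == leader)
        base = atomicAdd(counter, __popc(mask));  // leader reserves slots for all active lanes
    base = __shfl_sync(mask, base, leader);       // broadcast the base offset to the group
    return base + __popc(mask & ((1u << lane) - 1));  // add this lane's rank among active lanes
}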

This is a complex topic.

Thank you so much for the quick response, txbob! Although your new approach still doesn’t work on my laptop, your advice is pretty useful.

So now I have managed to fix the problem by setting the block size to 1, so that at most one thread per warp is competing for the lock. It’s probably not the best approach, but at least it works now! You are my hero! :-)

That will have a strongly negative performance effect on any CUDA code.

Which CUDA version are you using?

I believe the driver version is 390.67 for Ubuntu, and nvcc version is v9.1.85.