This modified version seems to work better for me:
#include <cuda.h>
#include <stdio.h>

#define BLOCK_SIZE 512

typedef struct {
  int val;
  int next;
} MyStruct;

__device__ int HashIt(int loc) {
  return 1;
}

__global__ void kernel_function(int *hashMap, MyStruct *hashPool,
                                int *hashLock, int size)
{
  int loc = blockIdx.x*BLOCK_SIZE + threadIdx.x;
  if (loc < size) {
    int bucket = HashIt(loc);
    bool succeed = false;
    do {
      if (atomicCAS(&hashLock[bucket], 0, 1) == 0) {
        hashMap[bucket] = loc;
        hashPool[loc].val = 100;
        atomicExch(&hashLock[bucket], 0);
        succeed = true;
      }
    } while (!succeed);
  }
}

int main() {
  int* hashMap;
  int* hashLock;
  MyStruct* hashPool;
  int size = 2048;
  cudaMalloc((void**)&hashLock, 10 * sizeof(int));
  cudaMemset(hashLock, 0, 10 * sizeof(int));
  cudaMalloc((void**)&hashMap, 10 * sizeof(int));
  cudaMemset(hashMap, 0, 10 * sizeof(int));
  cudaMalloc((void**)&hashPool, size * sizeof(MyStruct));
  cudaMemset(hashPool, 0, size * sizeof(MyStruct));
  kernel_function<<<size/BLOCK_SIZE, BLOCK_SIZE>>>(hashMap, hashPool, hashLock, size);
  cudaDeviceSynchronize();
  cudaFree(hashLock);
  cudaFree(hashMap);
  cudaFree(hashPool);
  return 0;
}
The only thing I have done is refactor the while loop into a do-while loop.
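For reference, I assume the failing variant's loop looked roughly like this (my reconstruction, not necessarily your exact code; it is semantically equivalent to the do-while above, but the compiler schedules the branch differently):

// Reconstruction of the original while-loop form, for comparison only:
bool succeed = false;
while (!succeed) {
  if (atomicCAS(&hashLock[bucket], 0, 1) == 0) {
    hashMap[bucket] = loc;
    hashPool[loc].val = 100;
    atomicExch(&hashLock[bucket], 0);
    succeed = true;
  }
}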
The difference is in how the compiler schedules the various conditional paths; it can be seen by comparing the SASS output of the two variants with cuobjdump -sass.
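For example (lock.cu is just a placeholder name for the source file; -arch=sm_35 matches the SM35 header in the dumps below):

nvcc -arch=sm_35 -o lock lock.cu
cuobjdump -sass ./lock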
For the failing variant, I have marked what I believe is the critical section of code:
Function : _Z15kernel_functionPiP8MyStructS_i
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x08a0b8b0a0a08cc0 */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ S2R R0, SR_CTAID.X; /* 0x86400000129c0002 */
/*0018*/ S2R R3, SR_TID.X; /* 0x86400000109c000e */
/*0020*/ IMAD R0, R0, 0x200, R3; /* 0xa1080c01001c0001 */
/*0028*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x158], PT; /* 0x5b681c002b1c001e */
/*0030*/ @P0 EXIT; /* 0x180000000000003c */
/*0038*/ MOV R2, c[0x0][0x150]; /* 0x64c03c002a1c000a */
/* 0x08fcb8a0c0ac8010 */
/*0048*/ IADD32I R2.CC, R2, 0x4; /* 0x40800000021c0809 */
/*0050*/ MOV32I R5, 0x1; /* 0x74000000009fc016 */
/*0058*/ SSY 0xa0; /* 0x1480000020000000 */
/*0060*/ IADD.X R3, RZ, c[0x0][0x154]; /* 0x608040002a9ffc0e */
/*0068*/ MOV R4, RZ; /* 0xe4c03c007f9c0012 */ <-
/*0070*/ ATOM.E.CAS R4, [R2], R4, R5; /* 0x77880000021c0812 */ |
/*0078*/ CCTL.E.IV [R2]; /* 0x7b800000001c0816 */ |
/* 0x08a010b010bcb8b0 */ |
/*0088*/ ISETP.NE.AND P0, PT, R4, RZ, PT; /* 0xdb581c007f9c101e */ |
/*0090*/ @P0 BRA 0x68; /* 0x12007fffe800003c */ *** This loop allows warp deadlock.
/*0098*/ MOV.S R4, c[0x0][0x140]; /* 0x64c03c00285c0012 */ * In order to get past the above loop, all threads in the warp must get an identical response from the ATOM instruction.
/*00a0*/ IADD32I R4.CC, R4, 0x4; /* 0x40800000021c1011 */
/*00a8*/ MOV32I R7, 0x8; /* 0x74000000041fc01e */
/*00b0*/ IADD.X R5, RZ, c[0x0][0x144]; /* 0x60804000289ffc16 */
/*00b8*/ ISCADD R6.CC, R0, c[0x0][0x148], 0x3; /* 0x60c40c00291c001a */
/* 0x08bc10b8b8a88c10 */
/*00c8*/ MOV32I R8, 0x64; /* 0x74000000321fc022 */
/*00d0*/ ST.E [R4], R0; /* 0xe4800000001c1000 */
/*00d8*/ IMAD.HI.X R7, R0, R7, c[0x0][0x14c]; /* 0x93181c00299c001e */
/*00e0*/ ST.E [R6], R8; /* 0xe4800000001c1820 */
/*00e8*/ ATOM.E.EXCH R9, [R2], RZ; /* 0x6c0800007f9c0826 */
/*00f0*/ CCTL.E.IV [R2]; /* 0x7b800000001c0816 */
/*00f8*/ EXIT; /* 0x18000000001c003c */
In the refactored code, we see the desirable pattern: there is no branching (and therefore no possibility of warp deadlock) between acquisition of the lock in the ATOM.E.CAS instruction and release of the lock in the ATOM.E.EXCH instruction:
Function : _Z15kernel_functionPiP8MyStructS_i
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x08a0b8b0a0a08cc0 */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ S2R R0, SR_CTAID.X; /* 0x86400000129c0002 */
/*0018*/ S2R R3, SR_TID.X; /* 0x86400000109c000e */
/*0020*/ IMAD R0, R0, 0x200, R3; /* 0xa1080c01001c0001 */
/*0028*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x158], PT; /* 0x5b681c002b1c001e */
/*0030*/ @P0 EXIT; /* 0x180000000000003c */
/*0038*/ MOV R4, c[0x0][0x150]; /* 0x64c03c002a1c0012 */
/* 0x08a0c0b08010b010 */
/*0048*/ IADD32I R4.CC, R4, 0x4; /* 0x40800000021c1011 */
/*0050*/ MOV32I R9, 0x8; /* 0x74000000041fc026 */
/*0058*/ IADD.X R5, RZ, c[0x0][0x154]; /* 0x608040002a9ffc16 */
/*0060*/ MOV32I R3, 0x1; /* 0x74000000009fc00e */
/*0068*/ ISCADD R8.CC, R0, c[0x0][0x148], 0x3; /* 0x60c40c00291c0022 */
/*0070*/ IMAD.HI.X R9, R0, R9, c[0x0][0x14c]; /* 0x93182400299c0026 */
/*0078*/ MOV R2, RZ; /* 0xe4c03c007f9c000a */
/* 0x08a010bcb0fcb810 */
/*0088*/ ATOM.E.CAS R2, [R4], R2, R3; /* 0x77880000011c100a */ *** lock acquired
/*0090*/ SSY 0x100; /* 0x1480000034000000 */ ** no
/*0098*/ CCTL.E.IV [R4]; /* 0x7b800000001c1016 */ ** intervening
/*00a0*/ ISETP.NE.AND P0, PT, R2, RZ, PT; /* 0xdb581c007f9c081e */ ** branch
/*00a8*/ @P0 NOP.S; /* 0x8580000000403c02 */ ** instructions
/*00b0*/ MOV R6, c[0x0][0x140]; /* 0x64c03c00281c001a */ **
/*00b8*/ MOV32I R11, 0x64; /* 0x74000000321fc02e */ **
/* 0x08bc10b8b8b8a0b0 */ **
/*00c8*/ IADD32I R6.CC, R6, 0x4; /* 0x40800000021c1819 */ **
/*00d0*/ IADD.X R7, RZ, c[0x0][0x144]; /* 0x60804000289ffc1e */ **
/*00d8*/ ST.E [R6], R0; /* 0xe4800000001c1800 */ **
/*00e0*/ ST.E [R8], R11; /* 0xe4800000001c202c */ **
/*00e8*/ ATOM.E.EXCH R2, [R4], RZ; /* 0x6c0800007f9c100a */ *** lock released
/*00f0*/ CCTL.E.IV [R4]; /* 0x7b800000001c1016 */
/*00f8*/ NOP.S; /* 0x85800000005c3c02 */
/* 0x0800000000bc10b8 */
/*0108*/ @P0 BRA 0x78; /* 0x12007fffb400003c */
/*0110*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/*0118*/ EXIT; /* 0x18000000001c003c */
I’m not going to try to explain why the previous while() construct seemed to work at one point and why it does not now. My best guess is that the compiler is different; a careful study of the SASS generated by the different compiler versions, along the lines of the comparison above, would probably identify the difference.

As I pointed out in the thread you linked, warp-level contention for locks is extremely hazardous. I can’t give you a construct and guarantee that no future compiler optimization will refactor it into a warp-divergent (and therefore deadlock-prone) realization.
My recommendation is to have no more than one thread in a warp compete for a global lock. Use other warp-collective or communication techniques to manage intra-warp behavior.
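As a rough sketch of that recommendation (my own illustration, not a drop-in replacement for the kernel above), the following assumes CUDA 9+ warp intrinsics, a fully active warp (true in the example, since size is a multiple of the warp size), and a warp-uniform bucket (true for the stub HashIt() above). Only lane 0 of each warp ever touches the global lock:

#define FULL_MASK 0xffffffffu

__device__ void warp_guarded_update(int *hashMap, MyStruct *hashPool,
                                    int *hashLock, int bucket, int loc)
{
  int lane = threadIdx.x & 31;

  if (lane == 0) {
    // Only one thread per warp spins on the global lock, so intra-warp
    // contention for the lock (and the associated deadlock risk) goes away.
    while (atomicCAS(&hashLock[bucket], 0, 1) != 0) { }
  }
  __syncwarp(FULL_MASK);                 // wait until lane 0 holds the lock

  hashMap[bucket]   = loc;               // last writer in the warp wins, as in
  hashPool[loc].val = 100;               // the per-thread-lock version above

  __threadfence();                       // make the writes visible before release
  __syncwarp(FULL_MASK);                 // all lanes are done with the update
  if (lane == 0)
    atomicExch(&hashLock[bucket], 0);    // lane 0 releases on behalf of the warp
}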
The method described here:
https://stackoverflow.com/questions/18963293/cuda-atomics-change-flag/18968893#18968893
outlines one possible warp-level mechanism. You could also use something like warp aggregation:
https://devblogs.nvidia.com/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/
to manage how the individual threads in a warp compete.
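As a rough illustration of the warp-aggregation idea (a minimal sketch in the spirit of that pro-tip, not a drop-in fix for the hash-table code above; it assumes CUDA 9+ warp intrinsics and a 1D thread block), each warp issues a single atomicAdd and the participating lanes then compute their own slots:

__device__ int atomicAggInc(int *ctr)
{
  unsigned mask   = __activemask();            // lanes that want an increment
  int      leader = __ffs(mask) - 1;           // lowest active lane does the atomic
  int      lane   = threadIdx.x & 31;
  int      base;

  if (lane == leader)
    base = atomicAdd(ctr, __popc(mask));       // one atomic for the whole warp
  base = __shfl_sync(mask, base, leader);      // broadcast the base offset

  // Each lane gets a distinct slot: base plus its rank among the active lanes.
  return base + __popc(mask & ((1u << lane) - 1));
}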
This is a complex topic.