Unexpected Behavior with __syncthreads() in CUDA Kernel Leading to Deadlock and Garbage Values

Hello everyone,

I am encountering a perplexing issue with a CUDA kernel I have written. The kernel updates a matrix and computes some intermediate results. It runs correctly on my laptop's GPU, but when I compile and run it on a server or on Google Colab, the application runs indefinitely, which suggests a deadlock.

Kernel Code

__global__ void updateMatrix(double* matrix, double* result, int lpivot, int m, int n, double* up, double* cl)
{
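    // Assumption: sumOfSquares (double) and counter (int) are file-scope
    // __device__ globals declared elsewhere; they are used and reset below
    // but are not shown in this excerpt.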
    double b = (*up) * matrix[(lpivot - 1) * n + lpivot - 1];
    if (b >= 0.0) return;
    b = 1 / b;

    extern __shared__ double sharedPivot[];
    __shared__ double sharedResults[ROW_PER_BLOCK][COL_PER_BLOCK];

    int tidx = threadIdx.x + blockIdx.x * blockDim.x;
    int tidy = threadIdx.y + blockIdx.y * blockDim.y;
    
    int row = lpivot + blockIdx.y * blockDim.y + threadIdx.y; 
    
    int iterations = (n - lpivot + COL_PER_BLOCK - 1) / COL_PER_BLOCK;
        
    for(int it = 0; it < iterations; it++)
    {
        int colStart = lpivot + it * blockDim.x; 
        
        if (threadIdx.y == 0 && colStart + threadIdx.x < n) 
        {   
            sharedPivot[threadIdx.x] = matrix[(lpivot - 1) * n + colStart + threadIdx.x];
        }    
        
        __syncthreads();  

        if (row < m && colStart + threadIdx.x < n) 
        {  
            sharedResults[threadIdx.y][threadIdx.x] = matrix[row * n + colStart + threadIdx.x] * sharedPivot[threadIdx.x];
        }
        else 
        {
            sharedResults[threadIdx.y][threadIdx.x] = 0;
        }
        __syncthreads();  
        
        reduceSum<COL_PER_BLOCK>(sharedResults[threadIdx.y]);
        
        if (row < m && threadIdx.x == 0) 
        {
            result[row - lpivot] += sharedResults[threadIdx.y][0];
        }

        __syncthreads();

        if(tidx < COL_PER_BLOCK && tidy < ROW_PER_BLOCK)
        {
            sharedPivot[tidx] = 0;
            sharedResults[tidy][tidx] = 0;
        }
        __syncthreads();
    }
   
    if(row < m && threadIdx.x == 0)
    {     
        result[row - lpivot] += matrix[row * n + lpivot - 1] * (*up);   
        if (result[row - lpivot] == 0) return;
        result[row - lpivot] *= b;
        matrix[row * n + lpivot - 1] += result[row - lpivot] * (*up);
    }
    __syncthreads();

    for (int it = 0; it < iterations; it++) 
    {
       int col = it * blockDim.x + threadIdx.x + lpivot;
       
       if (col < n && row < m)  
        {
            matrix[row * n + col] += result[row - lpivot] * matrix[(lpivot - 1) * n + col];
        }
        __syncthreads();

        if (row == lpivot && col < n)
        {
            sharedPivot[threadIdx.x] = matrix[row * n + col] * matrix[row * n + col];
        }
        else 
        {
            sharedPivot[threadIdx.x] = 0;
        }
        __syncthreads();

        reduceSum<COL_PER_BLOCK>(sharedPivot);
        if (threadIdx.x == 0) 
        {
            atomicAdd(&sumOfSquares, sharedPivot[0]);
        }     
        __syncthreads();
    }

    if (lpivot == 1)
        printf("sumOfSquares = %f\n", sumOfSquares);

    atomicAdd(&counter, 1);

    printf("lpivot = %d counter = %d\n", counter);

    if (counter == blockDim.x * blockDim.y * gridDim.y)
    {
        *cl = matrix[lpivot * n + lpivot];
        *cl = (*cl > 0.0) ? -sqrt(sumOfSquares) : sqrt(sumOfSquares);
        *up = matrix[lpivot * n + lpivot] - *cl;
        matrix[lpivot * n + lpivot] = *cl;
        sumOfSquares = 0; // Reset for the next iteration
        counter = 0;
    }
}
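
reduceSum is not included in this excerpt; it is a block-level sum reduction over a shared array. Below is a representative tree-reduction sketch for a one-dimensional thread block; the helper actually used may differ (for example, it must also cope with the 2D blocks above). Note that a helper of this shape contains its own barrier, so every thread of the block must call it:

// Representative sketch only -- the reduceSum actually used in the post
// is not shown and may differ.
template <int N>
__device__ void reduceSum(double* data)
{
    // Tree reduction over data[0..N); assumes N is a power of two and that
    // ALL threads of the block call this function, because of the
    // __syncthreads() inside the loop.
    for (int stride = N / 2; stride > 0; stride >>= 1)
    {
        if (threadIdx.x < stride)
            data[threadIdx.x] += data[threadIdx.x + stride];
        __syncthreads();
    }
}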

Deadlock Identification: I initially traced the hang to conditional synchronization: only the threads that enter the following block reach the __syncthreads() calls inside it, so they wait forever at barriers that the remaining threads never execute. (After the snippet I sketch the barrier placement I believe is required.)

if (row == lpivot && col < n)
{
    sharedPivot[threadIdx.x] = matrix[row * n + col] * matrix[row * n + col];
    __syncthreads();
    reduceSum<COL_PER_BLOCK>(sharedPivot);
    if (threadIdx.x == 0) 
    {
        sumOfSquares += sharedPivot[0];
    }
    sharedPivot[threadIdx.x] = 0;
    __syncthreads();
}
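
For reference, here is a minimal sketch of the barrier placement I understand to be required: every __syncthreads() is executed unconditionally by the whole block, and the conditions only select which threads do useful work. (The single atomicAdd from thread (0,0) is my assumption, to avoid one add per block row.)

// Sketch: barriers are unconditional; conditions only guard the work.
if (threadIdx.y == 0)
    sharedPivot[threadIdx.x] = 0;       // zero once so inactive columns contribute nothing
__syncthreads();

if (row == lpivot && col < n)           // at most one row of the block matches
    sharedPivot[threadIdx.x] = matrix[row * n + col] * matrix[row * n + col];
__syncthreads();

reduceSum<COL_PER_BLOCK>(sharedPivot);  // called by ALL threads, as in the kernel's own usage

if (threadIdx.x == 0 && threadIdx.y == 0)
    atomicAdd(&sumOfSquares, sharedPivot[0]);
__syncthreads();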

After moving __syncthreads() out of the conditional blocks, the kernel no longer deadlocks, but garbage values end up in shared memory, leading to an incorrect sumOfSquares, which is printed as zero. (The modified loop is below, followed by a note on a remaining hazard around reduceSum.)

for (int it = 0; it < iterations; it++)
{
    int col = it * blockDim.x + threadIdx.x + lpivot;

    if (col < n && row < m)
    {
        matrix[row * n + col] += result[row - lpivot] * matrix[(lpivot - 1) * n + col];
    }
    __syncthreads();

    if (row == lpivot && col < n)
    {
        if (lpivot == 1)
            printf("before shared pivot loaded sharedPivot[%d] = %f\n", threadIdx.x, sharedPivot[threadIdx.x]);
        sharedPivot[threadIdx.x] = matrix[row * n + col] * matrix[row * n + col];
        if (lpivot == 1)
            printf("after shared pivot loaded sharedPivot[%d] = %f\n", threadIdx.x, sharedPivot[threadIdx.x]);
    }
    else
        sharedPivot[threadIdx.x] = 0;
    __syncthreads();

    if (row == lpivot && col < n)
    {
        reduceSum<COL_PER_BLOCK>(sharedPivot);
        if (threadIdx.x == 0)
        {
            sumOfSquares += sharedPivot[0];
        }
        printf("sum of squares = %f\n", sumOfSquares);
    }
    sharedPivot[threadIdx.x] = 0;
    __syncthreads();
}
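
One hazard I am unsure about: if reduceSum itself contains __syncthreads() (as tree reductions usually do), then calling it under a divergent condition reintroduces exactly the conditional-barrier problem:

// Hazard sketch: a barrier hidden inside a helper is still a barrier.
if (row == lpivot && col < n)               // divergent across the block
{
    reduceSum<COL_PER_BLOCK>(sharedPivot);  // __syncthreads() inside => same deadlock risk
}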
  

How can I properly handle synchronization in this scenario to avoid deadlocks without introducing garbage values? Is there a more robust way to ensure correct accumulation of sumOfSquares across all threads?

Please post a complete minimal example that reproduces your problem.

The code does not compile for me using CUDA 12.4:

main.cu(180): error: identifier "gpuErrchk" is undefined
      gpuErrchk(cudaMalloc((void**)&matrix, sizeof(double) * m * n));
      ^

main.cu(196): error: identifier "start" is undefined
      cudaEventRecord(start);
                      ^

main.cu(198): error: identifier "updatePivotElement" is undefined
      updatePivotElement<<<1, 512, 512 * sizeof(double)>>>(matrix, 1, m, n, d_up_old, d_cl);
      ^

main.cu(225): error: expected a ";"
      householder(input_matrix);
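
As an aside, gpuErrchk is not part of the CUDA toolkit; it is the commonly used error-checking macro. If its definition was dropped from your post, the usual version is along these lines:

#include <cstdio>
#include <cstdlib>

// Standard CUDA error-checking idiom: wrap runtime API calls and abort
// with the file/line of the failing call.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}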

I am sorry, I have updated the code, please try again.

Now the code compiles, but execution gives a segmentation fault.
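
You may want to run it under compute-sanitizer (the CUDA 12.x successor to cuda-memcheck) to pinpoint any out-of-bounds device access; if the crash is on the host side, a regular debugger such as gdb will show the offending line.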