Hello everyone,
I am encountering a perplexing issue with a CUDA kernel I’ve written. The kernel is designed to update a matrix and compute some results. It runs perfectly on my local laptop’s GPU, but when I compile and run it on a server or Google Colab, the application runs indefinitely, suggesting a deadlock.
Kernel Code
__global__ void updateMatrix(double* matrix, double* result, int lpivot, int m, int n, double* up, double* cl)
{
double b = (*up) * matrix[(lpivot - 1) * n + lpivot - 1];
if (b >= 0.0) return;
b = 1 / b;
extern __shared__ double sharedPivot[];
__shared__ double sharedResults[ROW_PER_BLOCK][COL_PER_BLOCK];
int tidx = threadIdx.x + blockIdx.x * blockDim.x;
int tidy = threadIdx.y + blockIdx.y * blockDim.y;
int row = lpivot + blockIdx.y * blockDim.y + threadIdx.y;
int iterations = (n - lpivot + COL_PER_BLOCK - 1) / COL_PER_BLOCK;
for(int it = 0; it < iterations; it++)
{
int colStart = lpivot + it * blockDim.x;
if (threadIdx.y == 0 && colStart + threadIdx.x < n)
{
sharedPivot[threadIdx.x] = matrix[(lpivot - 1) * n + colStart + threadIdx.x];
}
__syncthreads();
if (row < m && colStart + threadIdx.x < n)
{
sharedResults[threadIdx.y][threadIdx.x] = matrix[row * n + colStart + threadIdx.x] * sharedPivot[threadIdx.x];
}
else
{
sharedResults[threadIdx.y][threadIdx.x] = 0;
}
__syncthreads();
reduceSum<COL_PER_BLOCK>(sharedResults[threadIdx.y]);
if (row < m && threadIdx.x == 0)
{
result[row - lpivot] += sharedResults[threadIdx.y][0];
}
__syncthreads();
if(tidx < COL_PER_BLOCK && tidy < ROW_PER_BLOCK)
{
sharedPivot[tidx] = 0;
sharedResults[tidy][tidx] = 0;
}
__syncthreads();
}
if(row < m && threadIdx.x == 0)
{
result[row - lpivot] += matrix[row * n + lpivot - 1] * (*up);
if (result[row - lpivot] == 0) return;
result[row - lpivot] *= b;
matrix[row * n + lpivot - 1] += result[row - lpivot] * (*up);
}
__syncthreads();
for (int it = 0; it < iterations; it++)
{
int col = it * blockDim.x + threadIdx.x + lpivot;
if (col < n && row < m)
{
matrix[row * n + col] += result[row - lpivot] * matrix[(lpivot - 1) * n + col];
}
__syncthreads();
if (row == lpivot && col < n)
{
sharedPivot[threadIdx.x] = matrix[row * n + col] * matrix[row * n + col];
}
else
{
sharedPivot[threadIdx.x] = 0;
}
__syncthreads();
reduceSum<COL_PER_BLOCK>(sharedPivot);
if (threadIdx.x == 0)
{
atomicAdd(&sumOfSquares, sharedPivot[0]);
}
__syncthreads();
}
if (lpivot == 1)
printf("sumOfSquares = %f\n", sumOfSquares);
atomicAdd(&counter, 1);
printf("lpivot = %d counter = %d\n", counter);
if (counter == blockDim.x * blockDim.y * gridDim.y)
{
*cl = matrix[lpivot * n + lpivot];
*cl = (*cl > 0.0) ? -sqrt(sumOfSquares) : sqrt(sumOfSquares);
*up = matrix[lpivot * n + lpivot] - *cl;
matrix[lpivot * n + lpivot] = *cl;
sumOfSquares = 0; // Reset for the next iteration
counter = 0;
}
}
Deadlock Identification: I initially identified that the threads entering the following block were not leaving it due to conditional synchronization.
if (row == lpivot && col < n)
{
sharedPivot[threadIdx.x] = matrix[row *
n + col] * matrix[row * n + col];
__syncthreads();
reduceSum<COL_PER_BLOCK>(sharedPivot);
if (threadIdx.x == 0)
{
sumOfSquares += sharedPivot[0];
}
sharedPivot[threadIdx.x] = 0;
__syncthreads();
}
After moving __syncthreads()
out of conditional blocks, the kernel avoids deadlock but results in garbage values being loaded into shared memory, leading to incorrect computation of sumOfSquares
, which is printed as zero.
for (int it = 0; it < iterations; it++)
{
int col = it * blockDim.x + threadIdx.x + lpivot;
if (col < n && row < m)
{
matrix[row * n + col] += result[row - lpivot] * matrix[(lpivot - 1) * n + col];
}
__syncthreads();
if(row == lpivot && col < n)
{
if(lpivot == 1)
printf("before shared pivot loaded sharedPivot[%d] = %f\n",threadIdx.x , sharedPivot[threadIdx.x]);
sharedPivot[threadIdx.x] = matrix[row * n + col] * matrix[row * n + col];
if(lpivot == 1)
printf("after shared pivot loaded sharedPivot[%d] = %f\n",threadIdx.x , sharedPivot[threadIdx.x]);
}
else
sharedPivot[threadIdx.x] = 0;
__syncthreads();
if(row == lpivot && col < n)
{
reduceSum<COL_PER_BLOCK>(sharedPivot);
if(threadIdx.x == 0)
{
sumOfSquares += sharedPivot[0];
}
printf("sum of sqaures = %f\n",sumOfSqaures);
}
sharedPivot[threadIdx.x] = 0;
__syncthreads();
}
How can I properly handle synchronization in this scenario to avoid deadlocks without introducing garbage values? Is there a more robust way to ensure correct accumulation of sumOfSquares
across all threads?