I have a minimal (non-)working example of a deadlock that I can reproduce with 2 blocks of 32 threads each.
It comes from a bigger problem where I use busy waiting to feed each block a piece of work until the end of the queue is reached. I know this could be done more cleverly for this reduced example; it is only meant to demonstrate the deadlock behaviour.
Whenever I add any other printf statement to my kernel, or when I set a breakpoint, the deadlock disappears; otherwise it occurs every time on my hardware.
NVIDIA RTX A500 Laptop GPU, driver 535.171.04, CUDA 12.2
The code uses a custom atomic that increments a global counter in memory as long as it is below a certain threshold. If the value was actually incremented, the previous value is the next work package for the block. (For clarity, a host-side analogue of this capped increment is sketched below, after the compile command.)
#include <iostream>
#include <cstdio>
#include <cuda_runtime.h>

#define gpuErrchk(ans) \
    { gpuAssert((ans), __FILE__, __LINE__); }

__inline__ void gpuAssert(cudaError_t code, const char *file, int line) {
    if (code != cudaSuccess) {
        printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        std::cout << std::endl;
    }
}
// a function that increments the device qhead for each block entering
// do not increment over maximum trail size
__device__ bool atomicAddThreshold(unsigned int* address, unsigned int* threshold, unsigned int& qhead) {
    __shared__ bool increased;
    increased = false;
    __syncthreads();
    if (threadIdx.x == 0) {
        unsigned int next;
        unsigned int old = *address, assumed;
        do {
            assumed = old;
            next = assumed + 1 > *threshold ? *threshold : assumed + 1;
            old = atomicCAS(address, assumed, next);
        } while (assumed != old);
        qhead = old;
        increased = next > assumed;
    }
    printf("before syncing threads in atomicAddThreshold, %d/%d has qhead %d\n", threadIdx.x, blockIdx.x, qhead);
    __syncthreads();
    return increased;
}
// stripped-down version of the busy-waiting work queue
// as long as there is work to do (qhead < trail_size), get the next index to work on
__global__ void foo(unsigned int* qhead, unsigned int* device_trail_size, unsigned int* inactive_blocks) {
    __shared__ bool finished;
    __shared__ bool all_finished;
    __shared__ unsigned int local_qhead;
    finished = false;
    all_finished = false;
    local_qhead = 0;
    __syncthreads();
    while (true) { // busy waiting loop
        if (!finished && atomicAddThreshold(qhead, device_trail_size, local_qhead)) {
            // work() // do some work
            printf("Block/Thread %d %d entering nary propagation with qhead %d\n", blockIdx.x, threadIdx.x, local_qhead);
            if (threadIdx.x == 0) local_qhead++;
        }
        __syncthreads();
        if (threadIdx.x == 0) {
            if (!finished) {
                finished = true;
                atomicAdd(inactive_blocks, 1);
            }
            if (finished && *inactive_blocks == gridDim.x)
                all_finished = true;
        }
        __syncthreads();
        if (all_finished) {
            printf("FINISHED PROPAGATION FOR BLOCK %d and THREAD %d\n", blockIdx.x, threadIdx.x);
            break;
        }
    }
}
int main() {
    unsigned int number = 0;
    unsigned int* qhead;
    unsigned int* device_trail_size;
    unsigned int* inactive_blocks;

    gpuErrchk(cudaMalloc((void **)&qhead, sizeof(unsigned int)));
    gpuErrchk(cudaMemcpy(qhead, &number, sizeof(unsigned int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMalloc((void **)&device_trail_size, sizeof(unsigned int)));
    gpuErrchk(cudaMemcpy(device_trail_size, &number, sizeof(unsigned int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMalloc((void **)&inactive_blocks, sizeof(unsigned int)));
    gpuErrchk(cudaMemcpy(inactive_blocks, &number, sizeof(unsigned int), cudaMemcpyHostToDevice));

    foo<<<2, 32>>>(qhead, device_trail_size, inactive_blocks);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    std::cout << "kernel finished" << std::endl;
}
Compiled using nvcc -g -G -std=c++14 "--generate-code=arch=compute_86,code=[compute_86,sm_86]" -o test test.cu
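For clarity, here is a host-side analogue of what atomicAddThreshold above is intended to do, written with std::atomic (the name addThresholdHost and the plain threshold parameter are just for illustration). It only documents the capped-increment semantics and is not part of the failing program:

#include <atomic>

// Host-side sketch: atomically advance the counter by one, but never past the
// threshold. Returns true if the counter actually advanced; the previous value
// (the claimed work index) is written to qhead.
bool addThresholdHost(std::atomic<unsigned int>& counter,
                      unsigned int threshold,
                      unsigned int& qhead) {
    unsigned int assumed = counter.load();
    unsigned int next;
    do {
        next = assumed + 1 > threshold ? threshold : assumed + 1;
        // on failure, compare_exchange_weak reloads the current value into 'assumed'
    } while (!counter.compare_exchange_weak(assumed, next));
    qhead = assumed;
    return next > assumed;
}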
Any help in understanding why this deadlocks is appreciated.
All threads reach the end of atomicAddThreshold, but only one block finishes (sometimes block 0, sometimes block 1). Since *device_trail_size is 0, I expect atomicAddThreshold to return false for both blocks right away, so each block should mark itself inactive and leave the loop on its first iteration; instead, the threads of the other block never print the FINISHED line, and I do not know where they get stuck.
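In case it helps, below is an even smaller sketch of just the termination handshake that, as far as I can tell, both blocks should effectively execute when *device_trail_size is 0 and no block ever gets work. It is a reduction of the loop in foo (the kernel name bar is only for illustration), not a separately tested reproduction:

// Sketch: the busy-waiting loop from foo with the work path removed.
// Thread 0 of each block reports the block as inactive once; all threads of the
// block keep spinning until every block has reported itself inactive.
__global__ void bar(unsigned int* inactive_blocks) {
    __shared__ bool finished;
    __shared__ bool all_finished;
    finished = false;
    all_finished = false;
    __syncthreads();
    while (true) {
        if (threadIdx.x == 0) {
            if (!finished) {
                finished = true;
                atomicAdd(inactive_blocks, 1);  // this block announces it has no more work
            }
            if (*inactive_blocks == gridDim.x)  // every block has announced itself inactive
                all_finished = true;
        }
        __syncthreads();
        if (all_finished) break;                // the whole block leaves the loop together
    }
}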