I have observed an unexpected CUDA behaviour which we can’t find documented, namely that certain CUDA on device API calls (printf, cudaMalloc etc) seem to cause a non-blocking stream synchronisation on an unrelated stream to behave like device synchronisation. This only happens the first time they are used. We noticed the problem when there is a persistent kernel running and the stream sync never exits. The workaround is to ensure that you exercise the problem APIs at least once and sync before the persistent kernel runs.
Specifically, I replicated it like this:
1 Launching WarmUpKernel to use cudaMalloc once (needed to avoid a hang below)
2 Launching Dummy persistent kernel
3 Launching DummyShortKernel in non-blobking stream B (also uses cudaMalloc)
4 cudaStreamSynchronize on stream B <- Hangs if no step 1
I replicated this in a standalone visual studio project (relocatable device code on)
#include <cuda_runtime_api.h>
#include <stdio.h>
// Persistent kernel - infinite loop, never returns.
__global__ void DummyPk()
{
while (1) { __nanosleep(1000); }
}
// This kernel uess the device APIs
__global__ void DummyShortKernel()
{
void* pTemp;
cudaMalloc(&pTemp, sizeof(int));
cudaFree(pTemp);
}
// this kernel exercises the deice APIs once before they are used for real.
__global__ void WarmUpKernel()
{
void* pTemp;
cudaMalloc(&pTemp, sizeof(int));
cudaFree(pTemp);
}
int main()
{
cudaStream_t StreamA = 0;
cudaStream_t StreamB = 0;
const bool bRunWarmup = false;
CUDA_CHECK(cudaStreamCreateWithFlags(&StreamA, cudaStreamNonBlocking));
CUDA_CHECK(cudaStreamCreateWithFlags(&StreamB, cudaStreamNonBlocking));
// Touch the function attributes to force JIT compilation now
cudaFuncAttributes attr;
CUDA_CHECK(cudaFuncGetAttributes(&attr, &DummyPk));
CUDA_CHECK(cudaFuncGetAttributes(&attr, &DummyShortKernel));
CUDA_CHECK(cudaFuncGetAttributes(&attr, &WarmUpKernel));
// launch warmup kernel to exercise APIs first time
if (bRunWarmup)
{
printf("Launching WarmUpKernel...\n");
WarmUpKernel << <1, 1, 0, StreamA >> > ();
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaStreamSynchronize(StreamA));
}
// Launch persistent kernel which never exits.
printf("Launching DummyPk (never exits)...\n");
DummyPk << <1, 1, 0, StreamA >> > ();
CUDA_CHECK(cudaGetLastError());
printf("Launching DummyShortKernel on StreamB...\n");
DummyShortKernel << <1, 1, 0, StreamB >> > ();
CUDA_CHECK(cudaGetLastError());
// Without the warmup Kernel call this stream sync hangs! Why?
printf("Calling cudaStreamSynchronize(StreamB)\n");
CUDA_CHECK(cudaStreamSynchronize(StreamB)); // Coda hangs here iff bRunWarmup is false
printf("cudaStreamSynchronize(StreamB) returned.\n");
printf("Done.\n");
return 0;
}
With const bool bRunWarmup = false;
Launching DummyPk (never exits)...
Launching DummyShortKernel on StreamB...
Calling cudaStreamSynchronize(StreamB)
and there the program hangs.
With const bool bRunWarmup = true;
Launching WarmUpKernel...
CDP warm-up
Launching DummyPk (never exits)...
Launching DummyShortKernel on StreamB...
Calling cudaStreamSynchronize(StreamB)
In Short kernel
cudaStreamSynchronize(StreamB) returned.
Done.
It seems to also apply to these fns: printf(), cudaMemsetAsync, cudaMemcpyAsync, cudaStreamCreateWithFlags, cudaStreamDestroy.
Can anyone explain this behaviour or find where it is documented?