In this example, I use cuda-gdb to set a conditional breakpoint in a CUDA kernel. For simplicity's sake, I chose a condition that will never evaluate true, but in general it could be a condition that just won't evaluate true for a while (e.g. a condition on the last element of an input vector). The kernel stops making progress and appears to hang: this program, which normally completes very quickly, never completes.
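(By "a condition on the last element" I mean something like the following, using the line numbers of the program below; only the final thread would ever satisfy it:)

(cuda-gdb) b 8 if i == N-1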
Here’s an example program (just another vectorAdd):
#include <stdio.h>
__global__ void VecInc(float* vec,
                       int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N) {
        vec[i] = vec[i] + 1.0f;
    }
}

#define blockCount 256
#define numBlocks 20

int main(int argc, char** argv)
{
    /* Pinned host buffer; its contents are never initialized, but the
       values don't matter for this repro. */
    float* h_vec;
    cudaHostAlloc((void**)&h_vec, numBlocks * blockCount * sizeof(float),
                  cudaHostAllocPortable | cudaHostAllocWriteCombined);

    float* d_vec;
    cudaMalloc((void**)&d_vec, numBlocks * blockCount * sizeof(float));
    cudaMemcpy(d_vec, h_vec, numBlocks * blockCount * sizeof(float),
               cudaMemcpyHostToDevice);

    VecInc<<<numBlocks, blockCount>>>(d_vec, numBlocks * blockCount);

    /* Check that the launch itself succeeded... */
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failure: %s (%d)\n",
                cudaGetErrorString(err), err);
        exit(-1);
    }

    /* ...and that the kernel ran to completion. (cudaThreadSynchronize
       is the deprecated alias of cudaDeviceSynchronize.) */
    cudaThreadSynchronize();
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel failure: %s (%d)\n",
                cudaGetErrorString(err), err);
        exit(-1);
    }

    cudaMemcpy(h_vec, d_vec, numBlocks * blockCount * sizeof(float),
               cudaMemcpyDeviceToHost);
    cudaFree(d_vec);
    cudaFreeHost(h_vec);
    cudaThreadExit();
    return 0;
}
And its makefile:
NVCC = /usr/local/cuda/bin/nvcc
NVCCGEN = -gencode=arch=compute_53,code=\"sm_53,compute_53\"
NVCCFLAGS = $(NVCCGEN) --compiler-options -fno-strict-aliasing -DUNIX -g -G -rdc=true
COMPILE.cu = $(NVCC) $(NVCCFLAGS) $(TARGET_ARCH) -c
NVLINK = $(NVCC) $(NVCCGEN) -rdc=true

it: it.o
	$(NVLINK) $^ $(LD35LIBS) -o $@

it.o: it.cu
	$(COMPILE.cu) $<

clean:
	rm -f it it.o
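(For reference, make here expands to roughly these two nvcc invocations; TARGET_ARCH and LD35LIBS are empty unless set elsewhere. The -g -G pair is what generates the host- and device-side debug info cuda-gdb needs:)

/usr/local/cuda/bin/nvcc -gencode=arch=compute_53,code=\"sm_53,compute_53\" --compiler-options -fno-strict-aliasing -DUNIX -g -G -rdc=true -c it.cu
/usr/local/cuda/bin/nvcc -gencode=arch=compute_53,code=\"sm_53,compute_53\" -rdc=true it.o -o it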
Build this and then debug it with cuda-gdb:
/usr/local/cuda/bin/cuda-gdb ./it
NVIDIA (R) CUDA Debugger
8.0 release
Portions Copyright (C) 2007-2016 NVIDIA Corporation
GNU gdb (GDB) 7.6.2
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "aarch64-elf-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
Reading symbols from /mag1/todd/test/detach/it...done.
(cuda-gdb) set cuda software_preemption on
(cuda-gdb) b 8 if 0
Breakpoint 1 at 0x403824: file it.cu, line 8.
(cuda-gdb) r
Starting program: /mag1/todd/test/detach/./it
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/aarch64-linux-gnu/libthread_db.so.1".
[New Thread 0x20016081e0 (LWP 23328)]
[New Thread 0x20018081e0 (LWP 23329)]
[Thread 0x20018081e0 (LWP 23329) exited]
That is as far as it will get. Now, this vectorAdd only has 20 * 256 = 5120 elements, i.e. 160 warps of 32 threads. So even if the debugger is stopping each warp, evaluating the condition, and then resuming, and is therefore slow, it shouldn't take too terribly long.
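For completeness, the usual workaround for slow (or, here, hung) debugger-evaluated conditions is to compile the condition into the kernel and trap with the __brkpt() intrinsic instead. A sketch against the kernel above, using a last-element condition:

__global__ void VecInc(float* vec,
                       int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i == N - 1) {
        __brkpt();  /* traps into cuda-gdb for this one thread */
    }
    if (i < N) {
        vec[i] = vec[i] + 1.0f;
    }
}

That obviously doesn't explain the hang, but it avoids the per-warp condition evaluation entirely.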
I encountered a similar problem on Intel platforms, with one specific card, back in CUDA 6.5; it disappeared in later releases. Perhaps this is related, or perhaps it's just a similar symptom. But in case it's helpful, it was reported here:
https://partners.nvidia.com/bug/viewbug/1598128
I am no longer able to access that link, but maybe you still can internally.