Attempts to detach from a CUDA application being debugged with cuda-gdb cause the application to fail with “unspecified launch failure (4)”. Here’s an example program:
#include <stdio.h>
__global__ void VecInc(float* vec,
int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N) {
vec[i] = vec[i] + 1.0;
}
}
#define blockCount 256
#define numBlocks 20
int main(int argc, char** argv)
{
float* h_vec;
cudaHostAlloc((void**)&h_vec, numBlocks * blockCount * sizeof(float),
cudaHostAllocPortable | cudaHostAllocWriteCombined);
float* d_vec;
cudaMalloc((void**)&d_vec, numBlocks * blockCount * sizeof(float));
cudaMemcpy(d_vec, h_vec, numBlocks * blockCount * sizeof(float),
cudaMemcpyHostToDevice);
VecInc<<<numBlocks, blockCount>>>(d_vec, numBlocks * blockCount);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf(stderr, "kernel launch failure: %s (%d)\n",
cudaGetErrorString(err), err);
exit(-1);
}
cudaThreadSynchronize();
err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf(stderr, "kernel failure: %s (%d)\n", cudaGetErrorString(err), err);
exit(-1);
}
cudaMemcpy(h_vec, d_vec, numBlocks * blockCount * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_vec);
cudaFreeHost(h_vec);
cudaThreadExit();
}
And its makefile:
NVCC = /usr/local/cuda/bin/nvcc
NVCCGEN = -gencode=arch=compute_53,code=\"sm_53,compute_53\"
NVCCFLAGS = $(NVCCGEN) --compiler-options -fno-strict-aliasing -DUNIX -g -G -rdc=true
COMPILE.cu = $(NVCC) $(NVCCFLAGS) $(TARGET_ARCH) -c
NVLINK = $(NVCC) $(NVCCGEN) -rdc=true
it: it.o
$(NVLINK) $^ $(LD35LIBS) -o $@
it.o: it.cu
$(COMPILE.cu) -c $<
clean:
rm -f it it.o
Here’s an example transcript:
/usr/local/cuda/bin/cuda-gdb ./it
NVIDIA (R) CUDA Debugger
7.0 release
Portions Copyright (C) 2007-2015 NVIDIA Corporation
GNU gdb (GDB) 7.6.2
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "aarch64-elf-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
Reading symbols from /mag1/todd/test/detach/it...done.
(cuda-gdb) b 8
Breakpoint 1 at 0x402eb0: file it.cu, line 8.
(cuda-gdb) r
Starting program: /mag1/todd/test/detach/./it
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/aarch64-linux-gnu/libthread_db.so.1".
[New Thread 0x200129d1f0 (LWP 11195)]
[New Thread 0x20014be1f0 (LWP 11196)]
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]
Breakpoint 1, VecInc<<<(20,1,1),(256,1,1)>>> (vec=0x1018e0000, N=5120)
at it.cu:8
8 vec[i] = vec[i] + 1.0;
(cuda-gdb) detach
$1 = 14823424
kernel failure: unspecified launch failure (4)
[Thread 0x20014be1f0 (LWP 11196) exited]
[Thread 0x2000020230 (LWP 11188) exited]
[Inferior 1 (process 11188) exited with code 0377]
warning: Unexpected CUDA API attach state.
Meanwhile, the driver emitted the following errors to the console:
nvmap_pgprot: PID 11188: it: TAG: 0x0800 WARNING: NVMAP_HANDLE_WRITE_COMBINE should be used in place of NVMAP_HANDLE_UNCACHEABLE on ARM64
gk20a gpu.0: gk20a_set_error_notifier: error notifier set to 43 for ch 503
gk20a gpu.0: gk20a_fifo_set_ctx_mmu_error_ch: channel 503 generated a mmu fault
gk20a gpu.0: __locked_fifo_preempt: preempt TSG 0 timeout
gk20a gpu.0: gk20a_set_error_notifier: error notifier set to 8 for ch 504
gk20a gpu.0: gk20a_set_error_notifier: error notifier set to 8 for ch 506
gk20a gpu.0: gk20a_set_error_notifier: error notifier set to 8 for ch 505
gk20a gpu.0: gk20a_set_error_notifier: error notifier set to 8 for ch 501
gk20a gpu.0: gk20a_fifo_handle_mmu_fault: mmu fault on engine 0, engine subid 0 (gpc), client 26 (t1 6), addr 0x0000009d:0xc7144000, type 9 (work creation), info 0x00003a09,inst_ptr 0x6acc45000
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_os_r : 0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_cpuctl_r : 0x40
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_idlestate_r : 0x1
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_mailbox0_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_mailbox1_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_irqstat_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_irqmode_r : 0x4
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_irqmask_r : 0x8704
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_irqdest_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_debug1_r : 0x40
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_debuginfo_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(0) : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(1) : 0x1
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(2) : 0x50009
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(3) : 0x20
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(4) : 0x2000a0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(5) : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(6) : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(7) : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_engctl_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_curctx_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_nxtctx_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_IMB : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_DMB : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_CSW : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_CTX : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_EXCI : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_PC : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_SP : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_PC : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_SP : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_PC : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_SP : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_PC : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_SP : 0xbadfbadf
gk20a gpu.0: gk20a_fifo_handle_mmu_fault: gr_status_r : 0x1000081
gk20a gpu.0: gk20a_fifo_set_ctx_mmu_error_tsg: TSG 0 generated a mmu fault
gk20a gpu.0: gk20a_fifo_handle_sched_error: fifo sched error : 0x0000000a, failed to find engine