cuda-gdb detach causes "unspecified launch failure"

Attempts to detach from a CUDA application being debugged with cuda-gdb cause the application to fail with “unspecified launch failure (4)”. Here’s an example program:

#include <stdio.h>

__global__ void VecInc(float*  vec,
                       int     N)
{
   int i = blockDim.x * blockIdx.x + threadIdx.x;
   if (i < N) {
      vec[i] = vec[i] + 1.0;
   }
}

#define blockCount   256
#define numBlocks     20

int main(int argc, char** argv)
{
   float*  h_vec;
   cudaHostAlloc((void**)&h_vec, numBlocks * blockCount * sizeof(float),
                 cudaHostAllocPortable | cudaHostAllocWriteCombined);
   float*  d_vec;
   cudaMalloc((void**)&d_vec, numBlocks * blockCount * sizeof(float));
   cudaMemcpy(d_vec, h_vec, numBlocks * blockCount * sizeof(float),
              cudaMemcpyHostToDevice);

   VecInc<<<numBlocks, blockCount>>>(d_vec, numBlocks * blockCount);
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
      fprintf(stderr, "kernel launch failure: %s (%d)\n",
              cudaGetErrorString(err), err);
      exit(-1);
   }                                                             
   cudaThreadSynchronize();
   err = cudaGetLastError();
   if (err != cudaSuccess) {
      fprintf(stderr, "kernel failure: %s (%d)\n", cudaGetErrorString(err), err);
      exit(-1);
   }                                                             
   cudaMemcpy(h_vec, d_vec, numBlocks * blockCount * sizeof(float), cudaMemcpyDeviceToHost);
   cudaFree(d_vec);
   cudaFreeHost(h_vec);
   cudaThreadExit();
}
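
As an aside, the repeated error-check blocks in the repro can be factored into a small helper. Here is a minimal sketch (the CUDA_CHECK name, and the use of cudaDeviceSynchronize in place of the now-deprecated cudaThreadSynchronize, are my own and not part of the original report):

#include <cstdio>
#include <cstdlib>

// Hypothetical helper: check a CUDA runtime call and abort with a readable message.
#define CUDA_CHECK(call)                                              \
   do {                                                               \
      cudaError_t err_ = (call);                                      \
      if (err_ != cudaSuccess) {                                      \
         fprintf(stderr, "%s:%d: %s (%d)\n", __FILE__, __LINE__,      \
                 cudaGetErrorString(err_), (int)err_);                \
         exit(EXIT_FAILURE);                                          \
      }                                                               \
   } while (0)

// Usage after a kernel launch: check the launch itself, then the kernel execution.
//    VecInc<<<numBlocks, blockCount>>>(d_vec, numBlocks * blockCount);
//    CUDA_CHECK(cudaGetLastError());       // launch-time errors
//    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel runs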

And its makefile:

NVCC = /usr/local/cuda/bin/nvcc
NVCCGEN = -gencode=arch=compute_53,code=\"sm_53,compute_53\"
NVCCFLAGS = $(NVCCGEN) --compiler-options -fno-strict-aliasing -DUNIX -g -G -rdc=true

COMPILE.cu = $(NVCC) $(NVCCFLAGS) $(TARGET_ARCH) -c
NVLINK = $(NVCC) $(NVCCGEN) -rdc=true

it: it.o
	$(NVLINK) $^ $(LD35LIBS) -o $@

it.o: it.cu
	$(COMPILE.cu) -c $<

clean:
	rm -f it it.o

Here’s an example transcript:

/usr/local/cuda/bin/cuda-gdb ./it
NVIDIA (R) CUDA Debugger
7.0 release
Portions Copyright (C) 2007-2015 NVIDIA Corporation
GNU gdb (GDB) 7.6.2
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
and "show warranty" for details.
This GDB was configured as "aarch64-elf-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
Reading symbols from /mag1/todd/test/detach/it...done.
(cuda-gdb) b 8
Breakpoint 1 at 0x402eb0: file it.cu, line 8.
(cuda-gdb) r
Starting program: /mag1/todd/test/detach/./it 
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/aarch64-linux-gnu/libthread_db.so.1".
[New Thread 0x200129d1f0 (LWP 11195)]
[New Thread 0x20014be1f0 (LWP 11196)]
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]

Breakpoint 1, VecInc<<<(20,1,1),(256,1,1)>>> (vec=0x1018e0000, N=5120)
    at it.cu:8
8             vec[i] = vec[i] + 1.0;
(cuda-gdb) detach
$1 = 14823424
kernel failure: unspecified launch failure (4)
[Thread 0x20014be1f0 (LWP 11196) exited]
[Thread 0x2000020230 (LWP 11188) exited]
[Inferior 1 (process 11188) exited with code 0377]
warning: Unexpected CUDA API attach state.

Meanwhile, the driver emitted the following errors to the console:

nvmap_pgprot: PID 11188: it: TAG: 0x0800 WARNING: NVMAP_HANDLE_WRITE_COMBINE should be used in place of NVMAP_HANDLE_UNCACHEABLE on ARM64
gk20a gpu.0: gk20a_set_error_notifier: error notifier set to 43 for ch 503
gk20a gpu.0: gk20a_fifo_set_ctx_mmu_error_ch: channel 503 generated a mmu fault
gk20a gpu.0: __locked_fifo_preempt: preempt TSG 0 timeout

gk20a gpu.0: gk20a_set_error_notifier: error notifier set to 8 for ch 504
gk20a gpu.0: gk20a_set_error_notifier: error notifier set to 8 for ch 506
gk20a gpu.0: gk20a_set_error_notifier: error notifier set to 8 for ch 505
gk20a gpu.0: gk20a_set_error_notifier: error notifier set to 8 for ch 501
gk20a gpu.0: gk20a_fifo_handle_mmu_fault: mmu fault on engine 0, engine subid 0 (gpc), client 26 (t1 6), addr 0x0000009d:0xc7144000, type 9 (work creation), info 0x00003a09,inst_ptr 0x6acc45000

gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_os_r : 0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_cpuctl_r : 0x40
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_idlestate_r : 0x1
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_mailbox0_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_mailbox1_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_irqstat_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_irqmode_r : 0x4
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_irqmask_r : 0x8704
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_irqdest_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_debug1_r : 0x40
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_debuginfo_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(0) : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(1) : 0x1
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(2) : 0x50009
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(3) : 0x20
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(4) : 0x2000a0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(5) : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(6) : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(7) : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_engctl_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_curctx_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_nxtctx_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_IMB : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_DMB : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_CSW : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_CTX : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_EXCI : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_PC : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_SP : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_PC : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_SP : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_PC : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_SP : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_PC : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_SP : 0xbadfbadf
gk20a gpu.0: gk20a_fifo_handle_mmu_fault: gr_status_r : 0x1000081
gk20a gpu.0: gk20a_fifo_set_ctx_mmu_error_tsg: TSG 0 generated a mmu fault
gk20a gpu.0: gk20a_fifo_handle_sched_error: fifo sched error : 0x0000000a, failed to find engine

Thank you, Todd, for reporting this as well; an investigation into this issue is underway.

Hi Todd,

The “detach” command is not supported by cuda-gdb on mobile. Have you tried running the program all the way through, like this:

/usr/local/cuda/bin/cuda-gdb ./it
(cuda-gdb) r

If the program hits a device-side fault, you should get output similar to this:

CUDA Exception: Warp Out-of-range Address
The exception was triggered at PC 0xb84f48 (memexceptions_kernel.cu:22)

Program received signal CUDA_EXCEPTION_5, Warp Out-of-range Address.
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]
0x0000000000b84f58 in exception_kernel<<<(1,1,1),(1,1,1)>>> (data=0x707080000,
    exception=OOR_SHARED) at memexceptions_kernel.cu:24
24                  while(gridDim.x);
(cuda-gdb)

Please share more details.

Thanks

No, when I run the test program, it generates no exceptions. But the test program itself is uninteresting; I presented it only to demonstrate that detach doesn’t work. I wasn’t aware that this was intentional.

Hi Todd Allen,

If you run the program as described above, you should be able to debug the issue from that point on.

For specific CUDA programming issues, please post to the CUDA Programming and Performance board for more assistance:
[url]https://devtalk.nvidia.com/default/board/57/[/url]

Thanks

The point was not that it couldn’t be debugged. It was that it couldn’t be detached, and left to run on its own without the debugger.

In R24.2, the behavior of this example has changed. It no longer fails with an “unspecified launch failure (4)”, as before. Now, it seems to actually detach from the process. But that process seems unable to make further progress after that. So the detach was not actually useful.

Consider the above program. When not being debugged, it runs and completes fairly quickly. But if you debug it and then detach, the detached process hangs around forever:

/usr/local/cuda/bin/cuda-gdb ./it
NVIDIA (R) CUDA Debugger
8.0 release
Portions Copyright (C) 2007-2016 NVIDIA Corporation
GNU gdb (GDB) 7.6.2
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
and "show warranty" for details.
This GDB was configured as "aarch64-elf-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
Reading symbols from /mag1/todd/test/detach/it...done.
(cuda-gdb) set cuda software_preemption on
(cuda-gdb) b 8
Breakpoint 1 at 0x403824: file it.cu, line 8.
(cuda-gdb) r
Starting program: /mag1/todd/test/detach/./it 
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/aarch64-linux-gnu/libthread_db.so.1".
[New Thread 0x20016081e0 (LWP 23023)]
[New Thread 0x20018081e0 (LWP 23024)]
[Thread 0x20018081e0 (LWP 23024) exited]
[Switching focus to CUDA kernel 0, grid 1, block (2,0,0), thread (0,0,0), device 0, sm 0, warp 9, lane 0]

Breakpoint 1, VecInc<<<(20,1,1),(256,1,1)>>> (vec=0x104280000, N=5120)
    at it.cu:8
8             vec[i] = vec[i] + 1.0;
(cuda-gdb) d 1
(cuda-gdb) detach
$1 = 19664896
0x0000002000305998 in sched_yield () from /lib/aarch64-linux-gnu/libc.so.6
Detaching from program: /mag1/todd/test/detach/./it, process 23008
(cuda-gdb) q

... wait as long as you like ...

/bin/ps -Lfe | grep ./it | grep -v grep
todd     23008     1 23008 97    2 15:08 pts/4    00:01:49 /mag1/todd/test/detach/./it
todd     23008     1 23023  0    2 15:08 pts/4    00:00:00 /mag1/todd/test/detach/./it
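
For what it's worth, here is a hedged variant of the repro that makes the lack of progress easier to observe after a detach (the loop, the iteration counts, and the progress prints are my own additions, not part of the original report): launch the kernel repeatedly and print a progress marker from the host, then watch the program's stdout after detaching.

#include <cstdio>

__global__ void VecInc(float* vec, int N)
{
   int i = blockDim.x * blockIdx.x + threadIdx.x;
   if (i < N) {
      vec[i] = vec[i] + 1.0f;
   }
}

int main()
{
   const int N = 20 * 256;
   float* d_vec;
   if (cudaMalloc((void**)&d_vec, N * sizeof(float)) != cudaSuccess) {
      fprintf(stderr, "cudaMalloc failed\n");
      return 1;
   }

   for (int iter = 1; iter <= 1000; ++iter) {
      VecInc<<<20, 256>>>(d_vec, N);
      cudaError_t err = cudaDeviceSynchronize();   // surfaces kernel-side failures
      if (err != cudaSuccess) {
         fprintf(stderr, "iteration %d: %s (%d)\n",
                 iter, cudaGetErrorString(err), (int)err);
         return 1;
      }
      if (iter % 100 == 0) {
         printf("completed iteration %d\n", iter);  // visible progress marker
         fflush(stdout);
      }
   }

   cudaFree(d_vec);
   return 0;
}

If the detached process is healthy, the iteration messages keep appearing; if it is stuck as described above, the output stops at whatever iteration was in flight when the debugger detached.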

Hi Todd Allen,

This issue appears to be related to software_preemption being enabled, which is untested on mobile and can therefore be expected to fail in odd ways.
We’re investigating the proper solution for this issue and will include the fix in an upcoming release. Once we have a clear schedule, I will post an update here.

Thanks