This happens with L4T R24.1 (64-bit). Here is a small example that reproduces the problem:
#include <stdio.h>

__global__ void VecInc(float* vec, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N) {
        vec[i] = vec[i] + 1.0;
    }
}

#define blockCount 256
#define numBlocks 5000

int main(int argc, char** argv)
{
    // Arrays are only size blockCount.
    float* h_vec;
    cudaHostAlloc((void**)&h_vec, blockCount * sizeof(float),
                  cudaHostAllocPortable | cudaHostAllocWriteCombined);
    float* d_vec;
    cudaMalloc((void**)&d_vec, blockCount * sizeof(float));
    cudaMemcpy(d_vec, h_vec, blockCount * sizeof(float), cudaMemcpyHostToDevice);

    // Claim that the arrays have numBlocks * blockCount elements. Gonna be ugly.
    VecInc<<<numBlocks, blockCount>>>(d_vec, numBlocks * blockCount);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failure: %s (%d)\n",
                cudaGetErrorString(err), err);
        exit(-1);
    }
    cudaThreadSynchronize();
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel failure: %s (%d)\n", cudaGetErrorString(err), err);
        exit(-1);
    }

    cudaMemcpy(h_vec, d_vec, blockCount * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_vec);
    cudaFreeHost(h_vec);
    cudaThreadExit();
}
NVCC = /usr/local/cuda/bin/nvcc
NVCCGEN = -gencode=arch=compute_53,code=\"sm_53,compute_53\"
NVCCFLAGS = $(NVCCGEN) --compiler-options -fno-strict-aliasing -DUNIX -g -G -rdc=true
COMPILE.cu = $(NVCC) $(NVCCFLAGS) $(TARGET_ARCH) -c
NVLINK = $(NVCC) $(NVCCGEN) -rdc=true

runit: it
	CUDA_DEVICE_WAITS_ON_EXCEPTION=1 ./it

it: it.o
	$(NVLINK) $^ $(LD35LIBS) -o $@

it.o: it.cu
	$(COMPILE.cu) -c $<

clean:
	rm -f it it.o
The Makefile builds and runs “it” with CUDA_DEVICE_WAITS_ON_EXCEPTION=1 set. The program has an intentional bug: the kernel launch claims the array has numBlocks * blockCount elements when it is only blockCount long, so the kernel overruns the end of the array by an extreme amount and triggers a memory exception. The idea is that this should stop the process and allow you to attach with cuda-gdb.
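(For reference, the overrun comes purely from the bogus element count passed at launch; a corrected call, which is of course not what the repro uses since the whole point is to trigger the fault, would look roughly like this:)

// Corrected launch sketch: pass the real element count so the kernel's
// bounds check (i < N) actually protects the blockCount-sized allocation.
// A single block is enough to cover blockCount elements here.
VecInc<<<1, blockCount>>>(d_vec, blockCount);
cudaError_t err2 = cudaGetLastError();
if (err2 != cudaSuccess) {
    fprintf(stderr, "kernel launch failure: %s (%d)\n",
            cudaGetErrorString(err2), err2);
}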
Once the exception happens, the driver will start throwing out errors like this:
gk20a gpu.0: gr_gk20a_ctx_wait_ucode: timeout waiting on ucode response
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_os_r : 0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_cpuctl_r : 0x40
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_idlestate_r : 0x1
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_mailbox0_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_mailbox1_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_irqstat_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_irqmode_r : 0x4
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_irqmask_r : 0x8704
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_irqdest_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_debug1_r : 0x40
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_debuginfo_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(0) : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(1) : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(2) : 0x89
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(3) : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(4) : 0x2000a0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(5) : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(6) : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_ctxsw_mailbox_r(7) : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_engctl_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_curctx_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: gr_fecs_nxtctx_r : 0x0
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_IMB : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_DMB : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_CSW : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_CTX : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_EXCI : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_PC : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_SP : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_PC : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_SP : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_PC : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_SP : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_PC : 0xbadfbadf
gk20a gpu.0: gk20a_fecs_dump_falcon_stats: FECS_FALCON_REG_SP : 0xbadfbadf
gk20a gpu.0: gk20a_fifo_wait_engine_idle: cannot idle engine 0
This should look similar to another problem I reported about persistent kernels, and it may even be the same root problem.
Anyway, if you attempt to attach to this process with cuda-gdb’s attach command, things get worse. cuda-gdb sometimes just hangs, but usually aborts with:
Error: Internal error reported by CUDA debugger API (error=CUDBG_ERROR_INTERNAL(0xa)). The application cannot be further debugged.
And the driver then throws out worse errors in addition to the ones above:
gk20a gpu.0: gr_gk20a_exec_ctx_ops: unable to stop gr ctxsw
gk20a gpu.0: nvgpu_ioctl_channel_reg_ops: dbg regops failed
gk20a gpu.0: __locked_fifo_preempt: preempt TSG 0 timeout
...
gk20a gpu.0: gk20a_set_error_notifier: error notifier set to 8 for ch 504
gk20a gpu.0: gk20a_set_error_notifier: error notifier set to 8 for ch 503
gk20a gpu.0: gk20a_set_error_notifier: error notifier set to 8 for ch 502
gk20a gpu.0: gk20a_set_error_notifier: error notifier set to 8 for ch 501
...
gk20a gpu.0: gk20a_fifo_handle_mmu_fault: mmu fault on engine 0, engine subid 0
(gpc), client 1 (t1 0), addr 0x00000005:0x01481000, type 2 (pte), info 0x00000182,inst_ptr 0x15ed1d000
...
gk20a gpu.0: gk20a_fifo_reset_engine: failed to HALT gr pipe
gk20a gpu.0: gk20a_fifo_set_ctx_mmu_error_tsg: TSG 0 generated a mmu fault
gk20a gpu.0: gk20a_fifo_handle_sched_error: fifo sched error : 0x0000000a, failed to find engine