Cuda core dump with skip_abort will ignore an illegal memory access error

youkaichao1 · August 11, 2025, 3:38am

I’m trying the cuda core dump feature with the following simple code:

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// CUDA error checking macro
#define cuda_check(call) do { \
    cudaError_t err = call; \
    if (err != cudaSuccess) { \
        printf("CUDA Error at %s:%d - %s: %s\n", __FILE__, __LINE__, #call, cudaGetErrorString(err)); \
        exit(EXIT_FAILURE); \
    } \
} while(0)

// Kernel with illegal memory access - accesses memory beyond allocated bounds
__global__ void illegalMemoryAccessKernel(int* data, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    
    // This will cause illegal memory access - accessing beyond allocated memory
    // We allocate 'size' elements but access up to size * 2
    if (idx < size * 2) {  // Access twice the allocated size
        data[idx - 1000000000] = idx;   // This will cause illegal access for idx == 0
    }
}

int main() {
    printf("CUDA Illegal Memory Access Test\n");
    printf("===============================\n\n");
    
    int size = 100;
    int* h_data = (int*)malloc(size * sizeof(int));
    int* d_data;
    
    // Initialize host memory
    for (int i = 0; i < size; i++) {
        h_data[i] = 0;
    }
    
    // Allocate device memory
    cuda_check(cudaMalloc(&d_data, (unsigned long long)(size) * sizeof(int)));
    cuda_check(cudaMemcpy(d_data, h_data, size * sizeof(int), cudaMemcpyHostToDevice));
    
    // Launch kernel with illegal memory access
    int blockSize = 256;
    int numBlocks = (size + blockSize - 1) / blockSize;
    
    printf("Launching kernel with out-of-bounds access...\n");
    illegalMemoryAccessKernel<<<numBlocks, blockSize>>>(d_data, size);
    
    // Check for errors
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
        printf("CUDA Error: %s\n", cudaGetErrorString(error));
    }
    
    // Synchronize to catch any runtime errors
    cuda_check(cudaDeviceSynchronize());
    
    printf("Test completed.\n");
    
    // Cleanup
    cuda_check(cudaFree(d_data));
    free(h_data);
    
    return 0;
}

Compile and run it directly, I get

CUDA Illegal Memory Access Test
===============================

Launching kernel with out-of-bounds access...
CUDA Error at test3.cu:56 - cudaDeviceSynchronize(): an illegal memory access was encountered

Expected.

Run it with `CUDA_ENABLE_COREDUMP_ON_EXCEPTION=1` , I get:

CUDA Illegal Memory Access Test
===============================

Launching kernel with out-of-bounds access...
Starting GPU coredump generation, set the CUDA_COREDUMP_SHOW_PROGRESS environment variable to 1 to enable more detailed output
[1]    946274 IOT instruction (core dumped)  CUDA_ENABLE_COREDUMP_ON_EXCEPTION=1 ./test3

Expected.

However, I don’t want abort right after cuda core dump, I want more information on the host side error trace, so i want to skip abort. So I run it with `CUDA_COREDUMP_GENERATION_FLAGS=“skip_abort” CUDA_ENABLE_COREDUMP_ON_EXCEPTION=1` . This time, cuda core dump is generated, but i notice something very strange:

CUDA Illegal Memory Access Test
===============================

Launching kernel with out-of-bounds access...
Starting GPU coredump generation, set the CUDA_COREDUMP_SHOW_PROGRESS environment variable to 1 to enable more detailed output
Test completed.

The host code runs without noticing the illegal memory access error! In theory, the illegal memory access error should corrupt the cuda context, and all my later calls to driver APIs should error out.

I’m using Driver Version: 570.133.20, on a DGX H100 box.

youkaichao1 · August 11, 2025, 3:43am

Some additional information, if i add `asm(“trap;“);` in the kernel to trigger “unspecified launch failure“ error, then cuda core dump works as expected, even if i use skip_abort flag. so i think this is specific to the illegal memory access error.

youkaichao1 · August 11, 2025, 4:37am

What’s more concerning, the illegal memory access error seems to be totally ignored, and I can launch kernels as usual, and they work as if no illegal memory access errors occur:

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// CUDA error checking macro
#define cuda_check(call) do { \
    cudaError_t err = call; \
    if (err != cudaSuccess) { \
        printf("CUDA Error at %s:%d - %s: %s\n", __FILE__, __LINE__, #call, cudaGetErrorString(err)); \
        exit(EXIT_FAILURE); \
    } \
} while(0)

// Kernel with illegal memory access - accesses memory beyond allocated bounds
__global__ void illegalMemoryAccessKernel(int* data, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    
    // This will cause illegal memory access - accessing beyond allocated memory
    // We allocate 'size' elements but access up to size * 2
    if (idx < size * 2) {  // Access twice the allocated size
        data[idx - 1000000000] = idx;   // This will cause illegal access for idx == 0
    }
}

// Kernel with illegal memory access - accesses memory beyond allocated bounds
__global__ void normalKernel(int* data, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    
    // This will cause illegal memory access - accessing beyond allocated memory
    // We allocate 'size' elements but access up to size * 2
    if (idx < size) {  // Access twice the allocated size
        data[idx] = idx;   // 
    }
}

int main() {
    printf("CUDA Illegal Memory Access Test\n");
    printf("===============================\n\n");
    
    int size = 100;
    int* h_data = (int*)malloc(size * sizeof(int));
    int* d_data;
    
    // Initialize host memory
    for (int i = 0; i < size; i++) {
        h_data[i] = 0;
    }
    
    // Allocate device memory
    cuda_check(cudaMalloc(&d_data, (unsigned long long)(size) * sizeof(int)));
    cuda_check(cudaMemcpy(d_data, h_data, size * sizeof(int), cudaMemcpyHostToDevice));
    
    // Launch kernel with illegal memory access
    int blockSize = 256;
    int numBlocks = (size + blockSize - 1) / blockSize;
    
    printf("Launching kernel with out-of-bounds access...\n");
    illegalMemoryAccessKernel<<<numBlocks, blockSize>>>(d_data, size);
    
    // Check for errors
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
        printf("CUDA Error: %s\n", cudaGetErrorString(error));
    }

    normalKernel<<<numBlocks, blockSize>>>(d_data, size);
    error = cudaGetLastError();
    if (error != cudaSuccess) {
        printf("CUDA Error: %s\n", cudaGetErrorString(error));
    }

    cuda_check(cudaMemcpy(h_data, d_data, size * sizeof(int), cudaMemcpyDeviceToHost));
    for (int i = 0; i < 5; i++) {
        printf("%d ", h_data[i]);
    }
    printf("\n");
    
    // Synchronize to catch any runtime errors
    cuda_check(cudaDeviceSynchronize());
    
    printf("Test completed.\n");
    
    // Cleanup
    cuda_check(cudaFree(d_data));
    free(h_data);
    
    return 0;
}

Run it normally:

CUDA Illegal Memory Access Test
===============================

Launching kernel with out-of-bounds access...
CUDA Error at test3.cu:72 - cudaMemcpy(h_data, d_data, size * sizeof(int), cudaMemcpyDeviceToHost): an illegal memory access was encountered

Run it with `CUDA_COREDUMP_GENERATION_FLAGS=“skip_abort” CUDA_ENABLE_COREDUMP_ON_EXCEPTION=1`:

CUDA Illegal Memory Access Test
===============================

Launching kernel with out-of-bounds access...
Starting GPU coredump generation, set the CUDA_COREDUMP_SHOW_PROGRESS environment variable to 1 to enable more detailed output
0 1 2 3 4 
Test completed.

Core dump happens, but the following kernel can still run.

Robert_Crovella · August 12, 2025, 2:52pm

you could file a bug

You could probably just put a link in the bug to this posting, but you may also want to mention the CUDA version you are using:

This looks like all runtime API code to me.

Topic		Replies	Views
Cannot generate cuda coredump when cuda kernel access illegal address CUDA-GDB	8	1715	December 9, 2021
zero-copy Out of bound access CUDA Programming and Performance	2	2382	July 7, 2009
Tracking down CUDA illegal memory access CUDA Programming and Performance	1	1288	February 20, 2015
Debugging illegal accesses CUDA Programming and Performance	4	786	June 15, 2019
cuda-memcheck.exe caused an illegal memory access error. CUDA-MEMCHECK	0	1839	October 27, 2016
Alternately Encountering "Illegal Memory Access" Legacy PGI Compilers	4	6671	August 28, 2017
Should out of bounds shared memory accesses be errors? CUDA Programming and Performance	5	7184	April 6, 2009
CUDA and exceptions CUDA Programming and Performance	1	1798	August 5, 2010
Tracking down: bad memory request CUDA Programming and Performance	1	3994	November 22, 2008
Non-strict cuda driver ? in terms of memory accesses CUDA Programming and Performance	1	2881	January 29, 2012

Cuda core dump with skip_abort will ignore an illegal memory access error

Related topics