Cuda core dump with skip_abort will ignore an illegal memory access error

I’m trying the cuda core dump feature with the following simple code:

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// CUDA error checking macro
#define cuda_check(call) do { \
    cudaError_t err = call; \
    if (err != cudaSuccess) { \
        printf("CUDA Error at %s:%d - %s: %s\n", __FILE__, __LINE__, #call, cudaGetErrorString(err)); \
        exit(EXIT_FAILURE); \
    } \
} while(0)

// Kernel with illegal memory access - accesses memory beyond allocated bounds
__global__ void illegalMemoryAccessKernel(int* data, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    
    // This will cause illegal memory access - accessing beyond allocated memory
    // We allocate 'size' elements but access up to size * 2
    if (idx < size * 2) {  // Access twice the allocated size
        data[idx - 1000000000] = idx;   // This will cause illegal access for idx == 0
    }
}

int main() {
    printf("CUDA Illegal Memory Access Test\n");
    printf("===============================\n\n");
    
    int size = 100;
    int* h_data = (int*)malloc(size * sizeof(int));
    int* d_data;
    
    // Initialize host memory
    for (int i = 0; i < size; i++) {
        h_data[i] = 0;
    }
    
    // Allocate device memory
    cuda_check(cudaMalloc(&d_data, (unsigned long long)(size) * sizeof(int)));
    cuda_check(cudaMemcpy(d_data, h_data, size * sizeof(int), cudaMemcpyHostToDevice));
    
    // Launch kernel with illegal memory access
    int blockSize = 256;
    int numBlocks = (size + blockSize - 1) / blockSize;
    
    printf("Launching kernel with out-of-bounds access...\n");
    illegalMemoryAccessKernel<<<numBlocks, blockSize>>>(d_data, size);
    
    // Check for errors
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
        printf("CUDA Error: %s\n", cudaGetErrorString(error));
    }
    
    // Synchronize to catch any runtime errors
    cuda_check(cudaDeviceSynchronize());
    
    printf("Test completed.\n");
    
    // Cleanup
    cuda_check(cudaFree(d_data));
    free(h_data);
    
    return 0;
}

Compile and run it directly, I get

CUDA Illegal Memory Access Test
===============================

Launching kernel with out-of-bounds access...
CUDA Error at test3.cu:56 - cudaDeviceSynchronize(): an illegal memory access was encountered

Expected.

Run it with `CUDA_ENABLE_COREDUMP_ON_EXCEPTION=1` , I get:

CUDA Illegal Memory Access Test
===============================

Launching kernel with out-of-bounds access...
Starting GPU coredump generation, set the CUDA_COREDUMP_SHOW_PROGRESS environment variable to 1 to enable more detailed output
[1]    946274 IOT instruction (core dumped)  CUDA_ENABLE_COREDUMP_ON_EXCEPTION=1 ./test3

Expected.

However, I don’t want abort right after cuda core dump, I want more information on the host side error trace, so i want to skip abort. So I run it with `CUDA_COREDUMP_GENERATION_FLAGS=“skip_abort” CUDA_ENABLE_COREDUMP_ON_EXCEPTION=1` . This time, cuda core dump is generated, but i notice something very strange:

CUDA Illegal Memory Access Test
===============================

Launching kernel with out-of-bounds access...
Starting GPU coredump generation, set the CUDA_COREDUMP_SHOW_PROGRESS environment variable to 1 to enable more detailed output
Test completed.

The host code runs without noticing the illegal memory access error! In theory, the illegal memory access error should corrupt the cuda context, and all my later calls to driver APIs should error out.

I’m using Driver Version: 570.133.20, on a DGX H100 box.

Some additional information, if i add `asm(“trap;“);` in the kernel to trigger “unspecified launch failure“ error, then cuda core dump works as expected, even if i use skip_abort flag. so i think this is specific to the illegal memory access error.

What’s more concerning, the illegal memory access error seems to be totally ignored, and I can launch kernels as usual, and they work as if no illegal memory access errors occur:

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// CUDA error checking macro
#define cuda_check(call) do { \
    cudaError_t err = call; \
    if (err != cudaSuccess) { \
        printf("CUDA Error at %s:%d - %s: %s\n", __FILE__, __LINE__, #call, cudaGetErrorString(err)); \
        exit(EXIT_FAILURE); \
    } \
} while(0)

// Kernel with illegal memory access - accesses memory beyond allocated bounds
__global__ void illegalMemoryAccessKernel(int* data, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    
    // This will cause illegal memory access - accessing beyond allocated memory
    // We allocate 'size' elements but access up to size * 2
    if (idx < size * 2) {  // Access twice the allocated size
        data[idx - 1000000000] = idx;   // This will cause illegal access for idx == 0
    }
}

// Kernel with illegal memory access - accesses memory beyond allocated bounds
__global__ void normalKernel(int* data, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    
    // This will cause illegal memory access - accessing beyond allocated memory
    // We allocate 'size' elements but access up to size * 2
    if (idx < size) {  // Access twice the allocated size
        data[idx] = idx;   // 
    }
}

int main() {
    printf("CUDA Illegal Memory Access Test\n");
    printf("===============================\n\n");
    
    int size = 100;
    int* h_data = (int*)malloc(size * sizeof(int));
    int* d_data;
    
    // Initialize host memory
    for (int i = 0; i < size; i++) {
        h_data[i] = 0;
    }
    
    // Allocate device memory
    cuda_check(cudaMalloc(&d_data, (unsigned long long)(size) * sizeof(int)));
    cuda_check(cudaMemcpy(d_data, h_data, size * sizeof(int), cudaMemcpyHostToDevice));
    
    // Launch kernel with illegal memory access
    int blockSize = 256;
    int numBlocks = (size + blockSize - 1) / blockSize;
    
    printf("Launching kernel with out-of-bounds access...\n");
    illegalMemoryAccessKernel<<<numBlocks, blockSize>>>(d_data, size);
    
    // Check for errors
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
        printf("CUDA Error: %s\n", cudaGetErrorString(error));
    }

    normalKernel<<<numBlocks, blockSize>>>(d_data, size);
    error = cudaGetLastError();
    if (error != cudaSuccess) {
        printf("CUDA Error: %s\n", cudaGetErrorString(error));
    }

    cuda_check(cudaMemcpy(h_data, d_data, size * sizeof(int), cudaMemcpyDeviceToHost));
    for (int i = 0; i < 5; i++) {
        printf("%d ", h_data[i]);
    }
    printf("\n");
    
    // Synchronize to catch any runtime errors
    cuda_check(cudaDeviceSynchronize());
    
    printf("Test completed.\n");
    
    // Cleanup
    cuda_check(cudaFree(d_data));
    free(h_data);
    
    return 0;
}

Run it normally:

CUDA Illegal Memory Access Test
===============================

Launching kernel with out-of-bounds access...
CUDA Error at test3.cu:72 - cudaMemcpy(h_data, d_data, size * sizeof(int), cudaMemcpyDeviceToHost): an illegal memory access was encountered

Run it with `CUDA_COREDUMP_GENERATION_FLAGS=“skip_abort” CUDA_ENABLE_COREDUMP_ON_EXCEPTION=1`:

CUDA Illegal Memory Access Test
===============================

Launching kernel with out-of-bounds access...
Starting GPU coredump generation, set the CUDA_COREDUMP_SHOW_PROGRESS environment variable to 1 to enable more detailed output
0 1 2 3 4 
Test completed.

Core dump happens, but the following kernel can still run.

you could file a bug

You could probably just put a link in the bug to this posting, but you may also want to mention the CUDA version you are using:

This looks like all runtime API code to me.