CUDA core dump with skip_abort ignores an illegal memory access error

What is more concerning is that the illegal memory access error seems to be ignored entirely. I can launch kernels as usual, and they work as if no illegal memory access had occurred:

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// CUDA error checking macro.
// Evaluates `call` exactly once. If the result is not cudaSuccess, prints the
// source file, line, the literal text of the call and the CUDA error string to
// stdout, then terminates the process with EXIT_FAILURE.
// The argument is parenthesized and the temporary has a macro-specific name so
// that an argument mentioning a caller variable called `err` is not captured.
#define cuda_check(call) do { \
    cudaError_t cuda_check_err_ = (call); \
    if (cuda_check_err_ != cudaSuccess) { \
        printf("CUDA Error at %s:%d - %s: %s\n", __FILE__, __LINE__, #call, cudaGetErrorString(cuda_check_err_)); \
        exit(EXIT_FAILURE); \
    } \
} while(0)

// Kernel that is intended to trigger an illegal memory access: every thread
// that passes the guard writes at a large negative element offset from `data`.
__global__ void illegalMemoryAccessKernel(int* data, int size) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Only threads whose global index is below twice `size` perform the write.
    if (tid >= size * 2) {
        return;
    }

    // Intentional out-of-bounds store: the element index is the thread index
    // minus one billion, i.e. far before the first element of `data`.
    const int offset = tid - 1000000000;
    data[offset] = tid;
}

// Well-behaved kernel: each thread whose global index is below `size` stores
// that index into the matching element of `data`. The accesses stay in bounds
// provided `data` holds at least `size` ints.
__global__ void normalKernel(int* data, int size) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads (the launch may cover more threads than elements) do nothing.
    if (tid >= size) {
        return;
    }

    data[tid] = tid;
}

// Reproduction driver: launches a kernel that performs an out-of-bounds write,
// then a normal kernel on the same buffer, copies the buffer back and prints
// the first five elements. Returns 0 on success; any failing checked CUDA call
// terminates the process through cuda_check.
int main() {
    printf("CUDA Illegal Memory Access Test\n");
    printf("===============================\n\n");
    
    int size = 100;
    size_t bytes = (size_t)size * sizeof(int);
    
    // Zero-initialized host buffer. The allocation is checked so that a failed
    // allocation is reported instead of being dereferenced.
    int* h_data = (int*)calloc(size, sizeof(int));
    if (h_data == NULL) {
        printf("Host allocation of %zu bytes failed\n", bytes);
        return EXIT_FAILURE;
    }
    int* d_data = NULL;
    
    // Allocate device memory and upload the zeroed buffer
    cuda_check(cudaMalloc(&d_data, bytes));
    cuda_check(cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice));
    
    // One thread per element, rounded up to whole blocks
    int blockSize = 256;
    int numBlocks = (size + blockSize - 1) / blockSize;
    
    printf("Launching kernel with out-of-bounds access...\n");
    illegalMemoryAccessKernel<<<numBlocks, blockSize>>>(d_data, size);
    
    // No synchronization happens here, so this only reports errors that are
    // already known at launch time; a fault inside the kernel surfaces at a
    // later synchronizing call.
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
        printf("CUDA Error: %s\n", cudaGetErrorString(error));
    }

    // Second launch on the same buffer, issued without waiting for the first.
    normalKernel<<<numBlocks, blockSize>>>(d_data, size);
    error = cudaGetLastError();
    if (error != cudaSuccess) {
        printf("CUDA Error: %s\n", cudaGetErrorString(error));
    }

    // Blocking copy back to the host; this is the first checked call after
    // the launches.
    cuda_check(cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost));
    for (int i = 0; i < 5; i++) {
        printf("%d ", h_data[i]);
    }
    printf("\n");
    
    // Synchronize to catch any runtime errors
    cuda_check(cudaDeviceSynchronize());
    
    printf("Test completed.\n");
    
    // Cleanup
    cuda_check(cudaFree(d_data));
    free(h_data);
    
    return 0;
}

Run it normally:

CUDA Illegal Memory Access Test
===============================

Launching kernel with out-of-bounds access...
CUDA Error at test3.cu:72 - cudaMemcpy(h_data, d_data, size * sizeof(int), cudaMemcpyDeviceToHost): an illegal memory access was encountered

Run it with `CUDA_COREDUMP_GENERATION_FLAGS="skip_abort" CUDA_ENABLE_COREDUMP_ON_EXCEPTION=1`:

CUDA Illegal Memory Access Test
===============================

Launching kernel with out-of-bounds access...
Starting GPU coredump generation, set the CUDA_COREDUMP_SHOW_PROGRESS environment variable to 1 to enable more detailed output
0 1 2 3 4 
Test completed.

The core dump is generated, but the subsequent kernel still runs.