I’m trying the cuda core dump feature with the following simple code:
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
// CUDA error checking macro
#define cuda_check(call) do { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
printf("CUDA Error at %s:%d - %s: %s\n", __FILE__, __LINE__, #call, cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
} while(0)
// Kernel with illegal memory access - accesses memory beyond allocated bounds
__global__ void illegalMemoryAccessKernel(int* data, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// This will cause illegal memory access - accessing beyond allocated memory
// We allocate 'size' elements but access up to size * 2
if (idx < size * 2) { // Access twice the allocated size
data[idx - 1000000000] = idx; // This will cause illegal access for idx == 0
}
}
int main() {
printf("CUDA Illegal Memory Access Test\n");
printf("===============================\n\n");
int size = 100;
int* h_data = (int*)malloc(size * sizeof(int));
int* d_data;
// Initialize host memory
for (int i = 0; i < size; i++) {
h_data[i] = 0;
}
// Allocate device memory
cuda_check(cudaMalloc(&d_data, (unsigned long long)(size) * sizeof(int)));
cuda_check(cudaMemcpy(d_data, h_data, size * sizeof(int), cudaMemcpyHostToDevice));
// Launch kernel with illegal memory access
int blockSize = 256;
int numBlocks = (size + blockSize - 1) / blockSize;
printf("Launching kernel with out-of-bounds access...\n");
illegalMemoryAccessKernel<<<numBlocks, blockSize>>>(d_data, size);
// Check for errors
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) {
printf("CUDA Error: %s\n", cudaGetErrorString(error));
}
// Synchronize to catch any runtime errors
cuda_check(cudaDeviceSynchronize());
printf("Test completed.\n");
// Cleanup
cuda_check(cudaFree(d_data));
free(h_data);
return 0;
}
Compile and run it directly, I get
CUDA Illegal Memory Access Test
===============================
Launching kernel with out-of-bounds access...
CUDA Error at test3.cu:56 - cudaDeviceSynchronize(): an illegal memory access was encountered
Expected.
Run it with `CUDA_ENABLE_COREDUMP_ON_EXCEPTION=1` , I get:
CUDA Illegal Memory Access Test
===============================
Launching kernel with out-of-bounds access...
Starting GPU coredump generation, set the CUDA_COREDUMP_SHOW_PROGRESS environment variable to 1 to enable more detailed output
[1] 946274 IOT instruction (core dumped) CUDA_ENABLE_COREDUMP_ON_EXCEPTION=1 ./test3
Expected.
However, I don’t want abort right after cuda core dump, I want more information on the host side error trace, so i want to skip abort. So I run it with `CUDA_COREDUMP_GENERATION_FLAGS=“skip_abort” CUDA_ENABLE_COREDUMP_ON_EXCEPTION=1` . This time, cuda core dump is generated, but i notice something very strange:
CUDA Illegal Memory Access Test
===============================
Launching kernel with out-of-bounds access...
Starting GPU coredump generation, set the CUDA_COREDUMP_SHOW_PROGRESS environment variable to 1 to enable more detailed output
Test completed.
The host code runs without noticing the illegal memory access error! In theory, the illegal memory access error should corrupt the cuda context, and all my later calls to driver APIs should error out.
I’m using Driver Version: 570.133.20, on a DGX H100 box.