I noticed (by chance) that if an out-of-bound memory read happens inside a kernel, the kernel will not return a cudaError. I used this code snippet for tests on Windows 11 with CUDA Toolkit v12.4 and an RTX 2050 GPU (cc 8.6).
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <vector>
#define gpuErrchk() { gpuAssert(__FILE__, __LINE__); }
inline void gpuAssert(const char* file, int line, bool abort = true)
{
cudaDeviceSynchronize();
cudaError_t code = cudaGetLastError();
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void copy(int* output, const int* input, const int len)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
while (i < len)
{
output[i] = input[i];
i += (blockDim.x * gridDim.x);
}
}
int main()
{
cudaError_t cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
return 1;
}
const int len{ (1 << 20) };
int* input = new int[len];
for (int i{}; i < len; ++i) input[i] = 5;
int* output = new int[len];
int* d_input, * d_output;
cudaMalloc(&d_input, sizeof(int) * len);
cudaMalloc(&d_output, sizeof(int) * len);
gpuErrchk();
cudaMemcpy(d_input, input, len * sizeof(int), cudaMemcpyHostToDevice);
copy << <32, 768 >> > (d_output, d_input, len);
gpuErrchk();
cudaMemcpy(output, d_output, len * sizeof(int), cudaMemcpyDeviceToHost);
gpuErrchk();
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
The above code works fine and no error is expected. Now, I change the code such that an out-of-bound mem access (read) happens and expect “an illegal memory access was encountered” error by the gpuErrchk()
after the copy
kernel. I change
copy << <32, 768 >> > (d_output, d_input, len);
To
copy << <32, 768 >> > (d_output, d_input + 1, len);
and I expect the illegal mem access error because there’s an out-of-bound memory read but this doesn’t happen. However, if I do the exact same thing on the output, i.e. change the kernel call to
copy << <32, 768 >> > (d_output + 1, d_input, len);
it results in an illegal memory access error! What’s the deal here? If it had been pure C++, I would’ve said it’s undefined behavior and we may or may not get a seg fault. But as for CUDA, I thought these errors were detected by the cudaGetLastError()
API.
So to wrap it up, can anyone explain this observation? Is it my error checking method, some wrong expectation, my platform, or is it just a bug?
Thank you.