While allocating many small (~2MB) buffers, I noticed that for allocations of 2MB + 1 byte it is the subsequent memset that fails rather than the cudaMalloc itself. On devices with limited GPU memory, such as an RTX 3070, this crashes programs that need more allocations than this failure threshold allows, even though the total requested is still well below the GPU's reported memory.
Here is a simple program that replicates this issue:
#include "cuda_runtime.h"

#include <stdint.h>
#include <stdio.h>
#include <vector>
#define MB (1 << 20)
// Repeatedly cudaMalloc's chunk_size-byte buffers until allocation fails,
// memsetting each buffer and synchronizing after every step so that any
// asynchronous error is surfaced immediately and attributed to the right call.
// Returns true iff every successfully-allocated buffer was freed cleanly.
bool floodMemory(size_t chunk_size)
{
    // %zu is the correct printf specifier for size_t; %d is undefined
    // behavior on platforms where size_t is wider than int (all 64-bit ABIs).
    printf("\nStarting memory flood with chunk size %zu\n\n", chunk_size);
    std::vector<uint8_t*> dev_ptrs;
    size_t allocated_mem = 0;
    while (1)
    {
        uint8_t* temp = 0;
        // Capture the error returned by each call directly instead of
        // re-fetching it with cudaGetLastError(), which could report an
        // unrelated sticky error.
        cudaError_t err = cudaMalloc(&temp, chunk_size);
        if (err != cudaSuccess)
        {
            printf("cudaMalloc failed! this is expected. allocated mem: %zumb\n", allocated_mem / MB);
            printf("cudaMalloc returned error: %s\n", cudaGetErrorString(err));
            break;
        }
        // Track the pointer immediately so the cleanup loop frees it even if
        // a later step in this iteration fails (the original leaked it on break).
        dev_ptrs.push_back(temp);
        err = cudaDeviceSynchronize();
        if (err != cudaSuccess)
        {
            printf("cudaDeviceSynchronize after malloc failed! allocated mem: %zumb\n", allocated_mem / MB);
            printf("cudaDeviceSynchronize returned error: %s\n", cudaGetErrorString(err));
            break;
        }
        allocated_mem += chunk_size;
        err = cudaMemset(temp, 0, chunk_size);
        if (err != cudaSuccess)
        {
            printf("cudaMemset failed! allocated mem: %zumb\n", allocated_mem / MB);
            printf("cudaMemset returned error: %s\n", cudaGetErrorString(err));
            break;
        }
        // A second sync catches asynchronous faults from the memset itself.
        err = cudaDeviceSynchronize();
        if (err != cudaSuccess)
        {
            printf("cudaDeviceSynchronize after memset failed! This shouldn't have happened. allocated mem: %zumb\n", allocated_mem / MB);
            printf("cudaDeviceSynchronize returned error: %s\n", cudaGetErrorString(err));
            break;
        }
    }
    printf("\nflood completed, attempting to clean up...\n");
    int i = 0;
    for (uint8_t* ptr : dev_ptrs)
    {
        cudaError_t err = cudaFree(ptr);
        if (err != cudaSuccess)
        {
            printf("cudaFree failed on item %d!\n", i);
            printf("cudaFree returned error: %s\n", cudaGetErrorString(err));
            printf("unable to recover\n");
            return false;
        }
        i++;
    }
    printf("succeeded cleaning up\n");
    return true;
}
// Repro driver: floods with 2MB chunks (expected to pass everywhere), then
// with 2MB + 1 byte chunks, which reproduces the memset-failure regression
// on newer drivers.
int main()
{
    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaError_t cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        // Include the actual error string and a newline (the original message
        // reported neither, making the failure hard to diagnose).
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed? (%s)\n",
                cudaGetErrorString(cudaStatus));
        return 1;
    }
    size_t chunk_size = (size_t)1 << 21; // exactly 2 MB
    if (!floodMemory(chunk_size)) return 1; // Should pass on all drivers
    chunk_size++; // 2 MB + 1 byte
    floodMemory(chunk_size); // fails on drivers >461.92 (game-ready) and >462.59 (studio)
    return 0;
}
I was able to replicate this on a 1660Ti, 3070, 1080Ti, and a 3080 for drivers 465.89, 466.11, 466.27, 466.47, 466.77, and 471.11, but on driver 461.92 and earlier the issue wasn’t present.