cudaMallocAsync after cudaFreeAsync on a stream results in OOM

I have a stream, and I do something like:

cudaMallocAsync(A, stream)
cudaFreeAsync(A, stream)
cudaMallocAsync(B, stream)

where A and B each fit in device memory on their own, but A+B together cannot.

I get an out-of-memory error when running the program without synchronizing after the free. My question is: how can I solve this problem without calling cudaDeviceSynchronize()?
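A stripped-down version of the pattern looks like this (just a sketch; the 12 GB size is a placeholder for "more than half of device memory"):

#include <iostream>
#include <cuda_runtime.h>

int main() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // each buffer fits on its own, but two of them together would not
    const size_t big = size_t(12) << 30;       // placeholder size

    void *A = nullptr, *B = nullptr;
    cudaMallocAsync(&A, big, stream);          // alloc A (stream-ordered)
    cudaFreeAsync(A, stream);                  // free A (stream-ordered)

    // without a synchronization here, this is where I see out of memory
    cudaError_t err = cudaMallocAsync(&B, big, stream);
    std::cout << cudaGetErrorString(err) << std::endl;

    if (err == cudaSuccess) cudaFreeAsync(B, stream);
    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
    return 0;
}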


Edit:
What I'm actually doing is:

stream
record event1 on stream
childstream wait event1
MallocAsync on childstream
FreeAsync on childstream
record event2 on childstream
stream wait event2
MallocAsync on stream

From what I read about stream-ordered allocation, this should work because there is a dependency between childstream and stream.
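In code, the dependency chain looks roughly like this (a stripped-down sketch of my setup; the size and the work in between are placeholders):

#include <cuda_runtime.h>

int main() {
    cudaStream_t stream, childstream;
    cudaStreamCreate(&stream);
    cudaStreamCreate(&childstream);

    cudaEvent_t event1, event2;
    cudaEventCreate(&event1);
    cudaEventCreate(&event2);

    const size_t big = size_t(12) << 30;            // placeholder size

    cudaEventRecord(event1, stream);                // stream -> event1
    cudaStreamWaitEvent(childstream, event1, cudaEventWaitDefault);

    void *A = nullptr;
    cudaMallocAsync(&A, big, childstream);          // alloc on childstream
    // ... work that uses A on childstream ...
    cudaFreeAsync(A, childstream);                  // free on childstream

    cudaEventRecord(event2, childstream);           // childstream -> event2
    cudaStreamWaitEvent(stream, event2, cudaEventWaitDefault);

    void *B = nullptr;
    cudaMallocAsync(&B, big, stream);               // alloc on stream, ordered after the free

    if (B) cudaFreeAsync(B, stream);
    cudaStreamSynchronize(stream);
    cudaStreamDestroy(childstream);
    cudaStreamDestroy(stream);
    return 0;
}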

You could perform a single allocation of size max(A, B).
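For example (just a sketch, assuming the two buffers are not needed at the same time; bytesA, bytesB and stream stand for whatever you actually use):

size_t bytes = std::max(bytesA, bytesB);   // one buffer, sized for the larger of the two
void *buf = nullptr;
cudaMallocAsync(&buf, bytes, stream);
// ... use buf as A ...
// ... later, reuse buf as B ...
cudaFreeAsync(buf, stream);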

That defeats the purpose of stream-ordered allocation, as described in Using the NVIDIA CUDA Stream-Ordered Memory Allocator, Part 1 | NVIDIA Technical Blog, and I'm still confused about why this goes wrong.

Could you post a complete minimal reproducer? Which CUDA version / driver / GPU are you using?

I have tried to remove unrelated items, but it's still a bit messy.

#include <iostream>
#include <fstream>
#include <algorithm>
#include <cuda_runtime.h>

#define PROPAGATE_CUDA_ERROR(x)                                                                                    \
{                                                                                                                \
    err = x;                                                                                                       \
    if (err != cudaSuccess)                                                                                        \
    {                                                                                                              \
        std::cerr << "CUDA Error [" << __FILE__ << ":" << __LINE__ << "]: " << cudaGetErrorString(err) << std::endl; \
        return err;                                                                                                  \
    }                                                                                                              \
}

cudaError_t precompute(
    const uint *h_points,
    unsigned long long len,
    uint *&h_points_precompute,
    cudaStream_t stream = 0
) {
    cudaError_t err;
    PROPAGATE_CUDA_ERROR(cudaHostAlloc(&h_points_precompute, 64 * len * 12, cudaHostAllocDefault)); // pinned host buffer, 64 * len * 12 bytes (~12 GB for len = 1 << 24)
    cudaEvent_t begin_precompute;
    PROPAGATE_CUDA_ERROR(cudaEventCreate(&begin_precompute));
    PROPAGATE_CUDA_ERROR(cudaEventRecord(begin_precompute, stream));
    cudaStream_t child_stream[2];
    PROPAGATE_CUDA_ERROR(cudaStreamCreate(&child_stream[0]));
    PROPAGATE_CUDA_ERROR(cudaStreamWaitEvent(child_stream[0], begin_precompute, cudaEventWaitDefault));
    uint *points[2];
    PROPAGATE_CUDA_ERROR(cudaMallocAsync(&points[0], 64 * len * 12, child_stream[0])); // first ~12 GB device allocation, stream-ordered on the child stream
    uint stage = 0;
    unsigned long long cur_len = std::min(len, len - 0);
    PROPAGATE_CUDA_ERROR(cudaMemcpyAsync(points[stage], h_points, 64 * len, cudaMemcpyHostToDevice, child_stream[stage]));
    PROPAGATE_CUDA_ERROR(cudaMemcpyAsync(h_points_precompute, points[stage], 64 * len * 12, cudaMemcpyDeviceToHost, child_stream[stage]));
    stage ^= 1;
    
    PROPAGATE_CUDA_ERROR(cudaFreeAsync(points[stage^1], child_stream[stage^1])); // stream-ordered free of the ~12 GB buffer on the child stream
    cudaEvent_t end_precompute[2];
    PROPAGATE_CUDA_ERROR(cudaEventCreate(&end_precompute[0]));
    PROPAGATE_CUDA_ERROR(cudaEventRecord(end_precompute[0], child_stream[0]));
    PROPAGATE_CUDA_ERROR(cudaStreamWaitEvent(stream, end_precompute[0], cudaEventWaitDefault));
    return cudaSuccess;
}
__host__ cudaError_t run(
    const unsigned long long len,
    const uint *h_scalers,
    const uint *h_points_precompute,
    cudaStream_t stream = 0
) {
    cudaError_t err;
    // smaller stream-ordered allocations on the parent stream come first
    uint *buckets_sum_buf;
    PROPAGATE_CUDA_ERROR(cudaMallocAsync(&buckets_sum_buf, 128ull * 1 * ((1 << 21)), stream));
    unsigned short *mutex_buf;
    PROPAGATE_CUDA_ERROR(cudaMallocAsync(&mutex_buf, sizeof(unsigned short) * ((1 << 21)) * 1, stream));
    PROPAGATE_CUDA_ERROR(cudaMemsetAsync(mutex_buf, 0, sizeof(unsigned short) * ((1 << 21)) * 1, stream));
    unsigned short *initialized_buf;
    PROPAGATE_CUDA_ERROR(cudaMallocAsync(&initialized_buf, sizeof(unsigned short) * ((1 << 21)) * 1, stream));
    PROPAGATE_CUDA_ERROR(cudaMemsetAsync(initialized_buf, 0, sizeof(unsigned short) * ((1 << 21)) * 1, stream));
    uint *cnt_zero;
    PROPAGATE_CUDA_ERROR(cudaMallocAsync(&cnt_zero, sizeof(uint), stream));
    unsigned long long *indexs;
    PROPAGATE_CUDA_ERROR(cudaMallocAsync(&indexs, sizeof(unsigned long long) * 12 * len * 2, stream));
    uint *scalers;
    PROPAGATE_CUDA_ERROR(cudaMallocAsync(&scalers, sizeof(uint) * 8 * len, stream));
    uint* points;
    PROPAGATE_CUDA_ERROR(cudaMallocAsync(&points, 64 * len * 12, stream)); // second ~12 GB allocation on the parent stream; this is the one that fails with OOM
    return cudaSuccess;
}

int main(int argc, char *argv[])
{
  auto len = 1 << 24;

  uint* scalers = new uint[len * 8];
  uint* points = new uint[len * 64];

  cudaHostRegister((void*)scalers, len * sizeof(uint) * 8, cudaHostRegisterDefault);
  cudaHostRegister((void*)points, len * sizeof(uint) * 16, cudaHostRegisterDefault);

  uint *h_points_precompute, head;
  uint * h_points = (uint*)malloc(64 * len * sizeof(uint));

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  precompute((uint*)points, len, h_points_precompute, stream);
  // cudaDeviceSynchronize();
  run(len, (uint*)scalers, h_points_precompute, stream);

  cudaDeviceSynchronize();

  cudaHostUnregister((void*)scalers);
  cudaHostUnregister((void*)points);
  cudaFreeHost(h_points_precompute);

  return 0;
}

This code was run on an RTX 4090 with driver 550.107.02 and CUDA 12.3. Uncommenting the cudaDeviceSynchronize() between precompute() and run() makes it succeed; without it, it fails with out of memory.

I guess the initial 12 GB chunk is reused for the small allocations. The second 12 GB cudaMallocAsync then needs a fresh allocation, which fails. If the second 12 GB buffer is allocated before the smaller buffers, it seems to work.
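For example, in run() you could request the large buffer first (just a sketch based on the code above):

    uint* points;
    PROPAGATE_CUDA_ERROR(cudaMallocAsync(&points, 64 * len * 12, stream));   // ~12 GB requested first
    uint *buckets_sum_buf;
    PROPAGATE_CUDA_ERROR(cudaMallocAsync(&buckets_sum_buf, 128ull * 1 * ((1 << 21)), stream));
    // ... then the remaining small buffers as before ...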

You could create multiple memory pools which serve different-sized allocations.
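Roughly like this (a sketch; the pool sizes, the ~12 GB buffer and the 4 MB buffer are placeholders, and the large allocations keep using the device's default pool):

#include <cstdint>
#include <cuda_runtime.h>

int main() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // explicit pool for the small buffers
    cudaMemPoolProps props = {};
    props.allocType = cudaMemAllocationTypePinned;
    props.handleTypes = cudaMemHandleTypeNone;
    props.location.type = cudaMemLocationTypeDevice;
    props.location.id = 0;                      // device ordinal
    cudaMemPool_t small_pool;
    cudaMemPoolCreate(&small_pool, &props);

    // keep freed memory cached in the pool instead of returning it to the driver
    uint64_t threshold = UINT64_MAX;
    cudaMemPoolSetAttribute(small_pool, cudaMemPoolAttrReleaseThreshold, &threshold);

    // small buffers come from the explicit pool ...
    unsigned short *mutex_buf = nullptr;
    cudaMallocFromPoolAsync((void**)&mutex_buf, size_t(1) << 22, small_pool, stream);

    // ... while the large buffers keep using the device's default pool
    void *points = nullptr;
    cudaMallocAsync(&points, size_t(12) << 30, stream);   // placeholder ~12 GB

    cudaFreeAsync(points, stream);
    cudaFreeAsync(mutex_buf, stream);
    cudaStreamSynchronize(stream);
    cudaMemPoolDestroy(small_pool);
    cudaStreamDestroy(stream);
    return 0;
}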


You are right, but it still seems strange that the CUDA memory pool can't just extend itself. For example, if 12 GB is allocated, only 2 GB of it is in use, and another 12 GB is needed, the pool could just allocate the missing 2 GB. From what I read, the memory pool uses virtual addresses, which should make this easy to do.

The virtual memory API has some restrictions. For example:

cuMemUnmap cannot unmap a sub-range of an address range mapped by cuMemCreate / cuMemMap.
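As an illustration, here is a minimal driver-API sketch of those building blocks (assuming device 0 and a single minimum-granularity chunk; error checking omitted, link with -lcuda). The commented-out call shows the sub-range restriction:

#include <cuda.h>

int main() {
    cuInit(0);
    CUdevice dev;
    cuDeviceGet(&dev, 0);
    CUcontext ctx;
    cuDevicePrimaryCtxRetain(&ctx, dev);
    cuCtxSetCurrent(ctx);

    CUmemAllocationProp prop = {};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = 0;

    size_t gran = 0;
    cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);

    CUdeviceptr va = 0;
    cuMemAddressReserve(&va, gran, 0, 0, 0);    // reserve a virtual address range

    CUmemGenericAllocationHandle handle;
    cuMemCreate(&handle, gran, &prop, 0);       // physical backing memory
    cuMemMap(va, gran, 0, handle, 0);           // map the whole allocation

    CUmemAccessDesc access = {};
    access.location = prop.location;
    access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    cuMemSetAccess(va, gran, &access, 1);

    // cuMemUnmap(va, gran / 2);   // not allowed: can't unmap a sub-range of the mapping
    cuMemUnmap(va, gran);          // must unmap the full mapped range
    cuMemRelease(handle);
    cuMemAddressFree(va, gran);
    cuDevicePrimaryCtxRelease(dev);
    return 0;
}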

Nevertheless, you could open a bug report with suggestions for improvements.