cudaMallocAsync after cudaFreeAsync on a stream results in OOM

I have a stream, and I do something like:

cudaMallocAsync(A, stream)
cudaFreeAsync(A, stream)
cudaMallocAsync(B, stream)

where A and B each fit in device memory on their own, but A+B together cannot.

I get an out-of-memory error when running the program without synchronizing after the free. My question is: how can I solve this problem without calling cudaDeviceSynchronize()?
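A stripped-down version of the pattern looks like this (just a sketch; the 12 GB size is a placeholder for "more than half of device memory"):

#include <iostream>
#include <cuda_runtime.h>

int main() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // each buffer fits on its own, but two of them together would not
    const size_t big = size_t(12) << 30;       // placeholder size

    void *A = nullptr, *B = nullptr;
    cudaMallocAsync(&A, big, stream);          // alloc A (stream-ordered)
    cudaFreeAsync(A, stream);                  // free A (stream-ordered)

    // without a synchronization here, this is where I see out of memory
    cudaError_t err = cudaMallocAsync(&B, big, stream);
    std::cout << cudaGetErrorString(err) << std::endl;

    if (err == cudaSuccess) cudaFreeAsync(B, stream);
    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
    return 0;
}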


Edit:
What I'm actually doing is:

stream
record event1 on stream
childstream wait event1
MallocAsync on childstream
FreeAsync on childstream
record event2 on childstream
stream wait event2
MallocAsync on stream

From what I read about stream-ordered allocation, this should work because there is a dependency between childstream and stream.
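In code, the dependency chain looks roughly like this (a stripped-down sketch of my setup; the size and the work in between are placeholders):

#include <cuda_runtime.h>

int main() {
    cudaStream_t stream, childstream;
    cudaStreamCreate(&stream);
    cudaStreamCreate(&childstream);

    cudaEvent_t event1, event2;
    cudaEventCreate(&event1);
    cudaEventCreate(&event2);

    const size_t big = size_t(12) << 30;            // placeholder size

    cudaEventRecord(event1, stream);                // stream -> event1
    cudaStreamWaitEvent(childstream, event1, cudaEventWaitDefault);

    void *A = nullptr;
    cudaMallocAsync(&A, big, childstream);          // alloc on childstream
    // ... work that uses A on childstream ...
    cudaFreeAsync(A, childstream);                  // free on childstream

    cudaEventRecord(event2, childstream);           // childstream -> event2
    cudaStreamWaitEvent(stream, event2, cudaEventWaitDefault);

    void *B = nullptr;
    cudaMallocAsync(&B, big, stream);               // alloc on stream, ordered after the free

    if (B) cudaFreeAsync(B, stream);
    cudaStreamSynchronize(stream);
    cudaStreamDestroy(childstream);
    cudaStreamDestroy(stream);
    return 0;
}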

You could perform a single allocation of size max(A, B).
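For example (just a sketch, assuming the two buffers are not needed at the same time; bytesA, bytesB and stream stand for whatever you actually use):

size_t bytes = std::max(bytesA, bytesB);   // one buffer, sized for the larger of the two
void *buf = nullptr;
cudaMallocAsync(&buf, bytes, stream);
// ... use buf as A ...
// ... later, reuse buf as B ...
cudaFreeAsync(buf, stream);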

That defeats the purpose of stream-ordered allocation, as described in Using the NVIDIA CUDA Stream-Ordered Memory Allocator, Part 1 | NVIDIA Technical Blog, and I'm still confused about why this goes wrong.

Could you post a complete minimal reproducer? Which CUDA version / driver / GPU are you using?

I have tried to remove unrelated items, but it's still a bit messy.

#include <iostream>
#include <fstream>
#include <algorithm>
#include <cuda_runtime.h>

#define PROPAGATE_CUDA_ERROR(x)                                                                                    \
{                                                                                                                \
    err = x;                                                                                                       \
    if (err != cudaSuccess)                                                                                        \
    {                                                                                                              \
        std::cerr << "CUDA Error [" << __FILE__ << ":" << __LINE__ << "]: " << cudaGetErrorString(err) << std::endl; \
        return err;                                                                                                  \
    }                                                                                                              \
}

cudaError_t precompute(
    const uint *h_points,
    unsigned long long len,
    uint *&h_points_precompute,
    cudaStream_t stream = 0
) {
    cudaError_t err;
    PROPAGATE_CUDA_ERROR(cudaHostAlloc(&h_points_precompute, 64 * len * 12, cudaHostAllocDefault)); // pinned host buffer, 64 * len * 12 bytes (~12 GB for len = 1 << 24)
    cudaEvent_t begin_precompute;
    PROPAGATE_CUDA_ERROR(cudaEventCreate(&begin_precompute));
    PROPAGATE_CUDA_ERROR(cudaEventRecord(begin_precompute, stream));
    cudaStream_t child_stream[2];
    PROPAGATE_CUDA_ERROR(cudaStreamCreate(&child_stream[0]));
    PROPAGATE_CUDA_ERROR(cudaStreamWaitEvent(child_stream[0], begin_precompute, cudaEventWaitDefault));
    uint *points[2];
    PROPAGATE_CUDA_ERROR(cudaMallocAsync(&points[0], 64 * len * 12, child_stream[0])); // first ~12 GB device allocation, stream-ordered on the child stream
    uint stage = 0;
    unsigned long long cur_len = std::min(len, len - 0);
    PROPAGATE_CUDA_ERROR(cudaMemcpyAsync(points[stage], h_points, 64 * len, cudaMemcpyHostToDevice, child_stream[stage]));
    PROPAGATE_CUDA_ERROR(cudaMemcpyAsync(h_points_precompute, points[stage], 64 * len * 12, cudaMemcpyDeviceToHost, child_stream[stage]));
    stage ^= 1;
    
    PROPAGATE_CUDA_ERROR(cudaFreeAsync(points[stage^1], child_stream[stage^1])); // stream-ordered free of the ~12 GB buffer on the child stream
    cudaEvent_t end_precompute[2];
    PROPAGATE_CUDA_ERROR(cudaEventCreate(&end_precompute[0]));
    PROPAGATE_CUDA_ERROR(cudaEventRecord(end_precompute[0], child_stream[0]));
    PROPAGATE_CUDA_ERROR(cudaStreamWaitEvent(stream, end_precompute[0], cudaEventWaitDefault));
    return cudaSuccess;
}
__host__ cudaError_t run(
    const unsigned long long len,
    const uint *h_scalers,
    const uint *h_points_precompute,
    cudaStream_t stream = 0
) {
    cudaError_t err;
    // smaller stream-ordered allocations on the parent stream come first
    uint *buckets_sum_buf;
    PROPAGATE_CUDA_ERROR(cudaMallocAsync(&buckets_sum_buf, 128ull * 1 * ((1 << 21)), stream));
    unsigned short *mutex_buf;
    PROPAGATE_CUDA_ERROR(cudaMallocAsync(&mutex_buf, sizeof(unsigned short) * ((1 << 21)) * 1, stream));
    PROPAGATE_CUDA_ERROR(cudaMemsetAsync(mutex_buf, 0, sizeof(unsigned short) * ((1 << 21)) * 1, stream));
    unsigned short *initialized_buf;
    PROPAGATE_CUDA_ERROR(cudaMallocAsync(&initialized_buf, sizeof(unsigned short) * ((1 << 21)) * 1, stream));
    PROPAGATE_CUDA_ERROR(cudaMemsetAsync(initialized_buf, 0, sizeof(unsigned short) * ((1 << 21)) * 1, stream));
    uint *cnt_zero;
    PROPAGATE_CUDA_ERROR(cudaMallocAsync(&cnt_zero, sizeof(uint), stream));
    unsigned long long *indexs;
    PROPAGATE_CUDA_ERROR(cudaMallocAsync(&indexs, sizeof(unsigned long long) * 12 * len * 2, stream));
    uint *scalers;
    PROPAGATE_CUDA_ERROR(cudaMallocAsync(&scalers, sizeof(uint) * 8 * len, stream));
    uint* points;
    PROPAGATE_CUDA_ERROR(cudaMallocAsync(&points, 64 * len * 12, stream)); // second ~12 GB allocation on the parent stream; this is the one that fails with OOM
    return cudaSuccess;
}

int main(int argc, char *argv[])
{
  auto len = 1 << 24;

  uint* scalers = new uint[len * 8];
  uint* points = new uint[len * 64];

  cudaHostRegister((void*)scalers, len * sizeof(uint) * 8, cudaHostRegisterDefault);
  cudaHostRegister((void*)points, len * sizeof(uint) * 16, cudaHostRegisterDefault);

  uint *h_points_precompute, head;
  uint * h_points = (uint*)malloc(64 * len * sizeof(uint));

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  precompute((uint*)points, len, h_points_precompute, stream);
  // cudaDeviceSynchronize();
  run(len, (uint*)scalers, h_points_precompute, stream);

  cudaDeviceSynchronize();

  cudaHostUnregister((void*)scalers);
  cudaHostUnregister((void*)points);
  cudaFreeHost(h_points_precompute);

  return 0;
}

This code was run on an RTX 4090 with driver 550.107.02 and CUDA 12.3. Uncommenting the cudaDeviceSynchronize() between precompute() and run() makes it succeed; without it, it fails with out of memory.

I guess the initial 12 GB chunk is reused for the small allocations. The second 12 GB cudaMallocAsync then needs a fresh allocation, which fails. If the second 12 GB buffer is allocated before the smaller buffers, it seems to work.
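For example, in run() you could request the large buffer first (just a sketch based on the code above):

    uint* points;
    PROPAGATE_CUDA_ERROR(cudaMallocAsync(&points, 64 * len * 12, stream));   // ~12 GB requested first
    uint *buckets_sum_buf;
    PROPAGATE_CUDA_ERROR(cudaMallocAsync(&buckets_sum_buf, 128ull * 1 * ((1 << 21)), stream));
    // ... then the remaining small buffers as before ...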

You could create multiple memory pools which serve different-sized allocations.
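Roughly like this (a sketch; the pool sizes, the ~12 GB buffer and the 4 MB buffer are placeholders, and the large allocations keep using the device's default pool):

#include <cstdint>
#include <cuda_runtime.h>

int main() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // explicit pool for the small buffers
    cudaMemPoolProps props = {};
    props.allocType = cudaMemAllocationTypePinned;
    props.handleTypes = cudaMemHandleTypeNone;
    props.location.type = cudaMemLocationTypeDevice;
    props.location.id = 0;                      // device ordinal
    cudaMemPool_t small_pool;
    cudaMemPoolCreate(&small_pool, &props);

    // keep freed memory cached in the pool instead of returning it to the driver
    uint64_t threshold = UINT64_MAX;
    cudaMemPoolSetAttribute(small_pool, cudaMemPoolAttrReleaseThreshold, &threshold);

    // small buffers come from the explicit pool ...
    unsigned short *mutex_buf = nullptr;
    cudaMallocFromPoolAsync((void**)&mutex_buf, size_t(1) << 22, small_pool, stream);

    // ... while the large buffers keep using the device's default pool
    void *points = nullptr;
    cudaMallocAsync(&points, size_t(12) << 30, stream);   // placeholder ~12 GB

    cudaFreeAsync(points, stream);
    cudaFreeAsync(mutex_buf, stream);
    cudaStreamSynchronize(stream);
    cudaMemPoolDestroy(small_pool);
    cudaStreamDestroy(stream);
    return 0;
}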


You are right, but it still seems strange that the CUDA memory pool can't just extend itself. For example, if 12 GB is allocated, only 2 GB of it is in use, and another 12 GB is needed, the pool could just allocate the missing 2 GB. From what I read, the memory pool uses virtual addresses, which should make this easy to do.

The virtual memory API has some restrictions. For example:

cuMemUnmap cannot unmap a sub-range of an address range mapped by cuMemCreate / cuMemMap.
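As an illustration, here is a minimal driver-API sketch of those building blocks (assuming device 0 and a single minimum-granularity chunk; error checking omitted, link with -lcuda). The commented-out call shows the sub-range restriction:

#include <cuda.h>

int main() {
    cuInit(0);
    CUdevice dev;
    cuDeviceGet(&dev, 0);
    CUcontext ctx;
    cuDevicePrimaryCtxRetain(&ctx, dev);
    cuCtxSetCurrent(ctx);

    CUmemAllocationProp prop = {};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = 0;

    size_t gran = 0;
    cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);

    CUdeviceptr va = 0;
    cuMemAddressReserve(&va, gran, 0, 0, 0);    // reserve a virtual address range

    CUmemGenericAllocationHandle handle;
    cuMemCreate(&handle, gran, &prop, 0);       // physical backing memory
    cuMemMap(va, gran, 0, handle, 0);           // map the whole allocation

    CUmemAccessDesc access = {};
    access.location = prop.location;
    access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    cuMemSetAccess(va, gran, &access, 1);

    // cuMemUnmap(va, gran / 2);   // not allowed: can't unmap a sub-range of the mapping
    cuMemUnmap(va, gran);          // must unmap the full mapped range
    cuMemRelease(handle);
    cuMemAddressFree(va, gran);
    cuDevicePrimaryCtxRelease(dev);
    return 0;
}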

Nevertheless, you could open a bug report with suggestions for improvements.