CUDA memory pool performance issue

I’ve noticed something that probably negatively impacts the performance of the memory pool.
When specifying the attribute cudaMemPoolAttrReleaseThreshold, I noticed the value must be a multiple of 32 MiB on my RTX 2080; otherwise the benchmark below performs very poorly. I arrived at this number by trial and error, and I have failed to find it documented anywhere. The only guidance I could find is to set the threshold to the maximum uint64 value to disable all freeing on stream synchronization.
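For reference, the one usage I did find documented is disabling release entirely; a minimal sketch (mem_pool here stands for a valid cudaMemPool_t, and UINT64_MAX comes from <cstdint>):

uint64_t threshold = UINT64_MAX;  // documented: the pool never releases memory back to the OS
cudaMemPoolSetAttribute(mem_pool, cudaMemPoolAttrReleaseThreshold, &threshold);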

Are there any suggestions for how this value should be set? Is there any documentation I’m missing? Thanks for any help.

Compiled and ran with CUDART 11.2. Driver version 460.32.03. GPU is RTX 2080.

$ uname -a 
Linux desktop-18 5.4.0-91-generic #102~18.04.1-Ubuntu SMP Thu Nov 11 14:46:36 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux
template <typename T>
void bm(benchmark::State& state) {  // NOLINT(runtime/references)
  const int64_t data_length_0 = state.range(0); // 1920 x 1080 x 3
  const int64_t data_length_1 = state.range(1); // 2560 x 1920 x 3
  const int64_t num_allocations = state.range(2);  // 32
  const int64_t allocation_size_in_bytes_0 = data_length_0 * sizeof(T);
  const int64_t allocation_size_in_bytes_1 = data_length_1 * sizeof(T);

  cudaStream_t stream = ...; // omitted

  cudaMemPoolProps pool_props{.allocType = cudaMemAllocationTypePinned,
                              .location = {.type = cudaMemLocationTypeDevice,
                                           .id = 0}};  // designators must follow declaration order: .type before .id

  cudaMemPool_t mem_pool;

  cudaMemPoolCreate(&mem_pool, &pool_props);
  
  constexpr int64_t kMemoryPoolSizeUnit = 1ul << 25;  // 32 MiB, undocumented???
  // Round the total requested size up to the next multiple of kMemoryPoolSizeUnit.
  uint64_t memory_pool_size =
      ((allocation_size_in_bytes_0 + allocation_size_in_bytes_1) * num_allocations +
       kMemoryPoolSizeUnit - 1) /
      kMemoryPoolSizeUnit * kMemoryPoolSizeUnit;

  // uint64_t memory_pool_size = (allocation_size_in_bytes_0 + allocation_size_in_bytes_1) * num_allocations; // much slower!!
  cudaMemPoolSetAttribute(mem_pool, cudaMemPoolAttrReleaseThreshold, &memory_pool_size);

  cudaStreamSynchronize(stream);

  void* allocated_ptrs[num_allocations * 2];

  for (auto _ : state) {
    for (int count = 0; count < num_allocations; count++) {
      cudaMallocFromPoolAsync(&allocated_ptrs[count * 2],
                              allocation_size_in_bytes_0,
                              mem_pool,
                              stream);
      cudaMallocFromPoolAsync(&allocated_ptrs[count * 2 + 1],
                              allocation_size_in_bytes_1,
                              mem_pool,
                              stream);
      benchmark::DoNotOptimize(allocated_ptrs[count * 2]);
      benchmark::DoNotOptimize(allocated_ptrs[count * 2 + 1]);
    }
    cudaStreamSynchronize(stream);
    for (int count = 0; count < num_allocations * 2; count++) {
      cudaFreeAsync(allocated_ptrs[count], stream);
    }
    cudaStreamSynchronize(stream);
  }
  cudaMemPoolDestroy(mem_pool);
}

Bump for visibility.

If you’d like help, I suggest providing more information.

When people ask for help and don’t provide a complete, compilable test case that I can use directly, without having to add or change anything, it immediately becomes more difficult for me to help. Oftentimes I simply don’t have the time to invest in such inquiries, and I will certainly prioritize other questions above those that lack important information.

I can’t run your code. Moreover, you seem to be asking about benchmark performance, and the full extent of the description of that benchmark that I can find in your post is this:

  benchmark::DoNotOptimize(allocated_ptrs[count * 2]);
  benchmark::DoNotOptimize(allocated_ptrs[count * 2 + 1]);

If that is supposed to be descriptive or informative, in my view it is not.

Additional information that would be useful includes the CUDA version, the operating system, how you measured performance (host-based timing, a profiler, etc.), and what the actual performance data was. For some folks, a description like “perform very poorly” might mean a 25% reduction in performance; for others it might mean a 10x reduction in performance.

Quite simply, if you want help, make it easy for others to help you.

I went ahead and wrote my own test case, attempting to follow what you provided as best I could. Obviously I have no idea what the benchmark code is. Nevertheless, the baseline performance of the pool allocate/free mechanism, plus a relatively simple memory test, seems to be unaffected by kMemoryPoolSizeUnit. I have not done any exhaustive testing; this is just one data point:

$ cat t1958.cu
#include <cstdint>
#include <cstring>   // memset
#include <iostream>
#include <cstdlib>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

unsigned long long dtime_usec(unsigned long long start=0){

  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
__global__ void copy_kernel(int *data, uint64_t sz){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < sz)  data[idx] = data[idx+sz];
}

typedef unsigned char T;
uint64_t test(void *ptr){
  uint64_t t = dtime_usec(0);
  const int blocks = 160;
  const int threads = 1024;
  copy_kernel<<<blocks,threads>>>((int *)ptr, blocks*threads);
  cudaDeviceSynchronize();
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {std::cout << "err 100: " << cudaGetErrorString(err) << std::endl;  return 0;}
  t = dtime_usec(t);
  return t;
}

int main(int argc, char *argv[]){
  bool supported = false;
  int device = 0;
  int driverVersion = 0;
  int deviceSupportsMemoryPools = 0;
  int poolSupportedHandleTypes = 0;
  cudaDriverGetVersion(&driverVersion);
  if (driverVersion >= 11020) {
    cudaDeviceGetAttribute(&deviceSupportsMemoryPools,
                           cudaDevAttrMemoryPoolsSupported, device);
    }
  if (deviceSupportsMemoryPools != 0) {
    // `device` supports the Stream Ordered Memory Allocator
    supported = true;
    std::cout << "Memory Pools supported!" << std::endl;
    }

  if (driverVersion >= 11030) {
    cudaDeviceGetAttribute(&poolSupportedHandleTypes,
              cudaDevAttrMemoryPoolSupportedHandleTypes, device);
    }
  if (poolSupportedHandleTypes & cudaMemHandleTypePosixFileDescriptor) {
    // Pools on the specified device can be created with posix file descriptor-based IPC
    std::cout << "including IPC!" << std::endl;
  }

  if (!supported) return 0;
  int loops = 2;
  int incr = 0;
  if (argc > 1) loops = atoi(argv[1]);
  if (argc > 2) incr = atoi(argv[2]);
  const int64_t data_length_0 = 1920 * 1080 * 3;
  const int64_t data_length_1 = 2560 * 1920 * 3;
  const int64_t num_allocations = 32;
  const int64_t allocation_size_in_bytes_0 = data_length_0 * sizeof(T);
  const int64_t allocation_size_in_bytes_1 = data_length_1 * sizeof(T);

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  cudaMemPoolProps pool_props;
  memset(&pool_props, 0, sizeof(pool_props));
  pool_props.allocType = cudaMemAllocationTypePinned;
  pool_props.location.id = 0;
  pool_props.location.type = cudaMemLocationTypeDevice;
  cudaMemPool_t mem_pool;

  cudaError_t err = cudaMemPoolCreate(&mem_pool, &pool_props);
  if (err != cudaSuccess) {std::cout << "err 1: " << cudaGetErrorString(err) << std::endl;  return 0;}

  int64_t kMemoryPoolSizeUnit = (1ul << 25)+incr;  // 32 MiB, undocumented???
  uint64_t memory_pool_size =
      ((allocation_size_in_bytes_0 + allocation_size_in_bytes_1) * num_allocations +
       kMemoryPoolSizeUnit - 1) /
      kMemoryPoolSizeUnit * kMemoryPoolSizeUnit;

  // uint64_t memory_pool_size = (allocation_size_in_bytes_0 + allocation_size_in_bytes_1) * num_allocations; // much slower!!
  err = cudaMemPoolSetAttribute(mem_pool, cudaMemPoolAttrReleaseThreshold, &memory_pool_size);
  if (err != cudaSuccess) {std::cout << "err 2: " << cudaGetErrorString(err) << std::endl;  return 0;}

  err = cudaStreamSynchronize(stream);
  if (err != cudaSuccess) {std::cout << "err 3: " << cudaGetErrorString(err) << std::endl;  return 0;}

  void* allocated_ptrs[num_allocations * 2];
  uint64_t time_sum = 0;
  for (int qq = 0; qq < loops; qq++) {
    for (int count = 0; count < num_allocations; count++) {
      err = cudaMallocFromPoolAsync(&allocated_ptrs[count * 2],
                              allocation_size_in_bytes_0,
                              mem_pool,
                              stream);
      if (err != cudaSuccess) {std::cout << "err 4: " << cudaGetErrorString(err) << std::endl;  return 0;}
      err = cudaMallocFromPoolAsync(&allocated_ptrs[count * 2 + 1],
                              allocation_size_in_bytes_1,
                              mem_pool,
                              stream);
      if (err != cudaSuccess) {std::cout << "err 5: " << cudaGetErrorString(err) << std::endl;  return 0;}
      time_sum += test(allocated_ptrs[count * 2]);
      time_sum += test(allocated_ptrs[count * 2 + 1]);
    }
    err = cudaStreamSynchronize(stream);
    if (err != cudaSuccess) {std::cout << "err 6: " << cudaGetErrorString(err) << std::endl;  return 0;}
    for (int count = 0; count < num_allocations * 2; count++) {
      err = cudaFreeAsync(allocated_ptrs[count], stream);
      if (err != cudaSuccess) {std::cout << "err 7: " << cudaGetErrorString(err) << std::endl;  return 0;}
    }
    err = cudaStreamSynchronize(stream);
    if (err != cudaSuccess) {std::cout << "err 8: " << cudaGetErrorString(err) << std::endl;  return 0;}
  }
  err = cudaMemPoolDestroy(mem_pool);
  if (err != cudaSuccess) {std::cout << "err 9: " << cudaGetErrorString(err) << std::endl;  return 0;}
  std::cout << "elapsed time: " << time_sum << "us" <<  std::endl;
}
$ nvcc -o t1958 t1958.cu
$ ./t1958 10
Memory Pools supported!
including IPC!
elapsed time: 8786us
$ ./t1958 10 1024
Memory Pools supported!
including IPC!
elapsed time: 8373us
$ ./t1958 100
Memory Pools supported!
including IPC!
elapsed time: 79157us
$ ./t1958 100 1024
Memory Pools supported!
including IPC!
elapsed time: 77558us
$

Tesla V100, CentOS 7, CUDA 11.4, 470.57.02

The above data simply indicates the performance of the memory test. I observed the overall application performance as follows:

$ time ./t1958 10000
Memory Pools supported!
including IPC!
elapsed time: 6850860us

real    0m8.507s
user    0m6.916s
sys     0m1.586s
$ time ./t1958 10000 1024
Memory Pools supported!
including IPC!
elapsed time: 6817939us

real    0m8.517s
user    0m6.902s
sys     0m1.609s
$

The overall measured wall-clock time of the application showed no significant difference between the case where kMemoryPoolSizeUnit was 32 MiB and the case where it was 32 MiB + 1024. Therefore I conclude that there was no significant difference in the aggregate time for the allocation/free mechanism.
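If you wanted to confirm whether the pool is actually releasing and re-reserving memory between iterations, you could also query the pool usage attributes (available from CUDA 11.3 onward) around the synchronize calls; a minimal sketch, assuming mem_pool from the code above:

uint64_t reserved = 0, used = 0;
cudaMemPoolGetAttribute(mem_pool, cudaMemPoolAttrReservedMemCurrent, &reserved);
cudaMemPoolGetAttribute(mem_pool, cudaMemPoolAttrUsedMemCurrent, &used);
std::cout << "pool reserved: " << reserved << " bytes, in use: " << used << " bytes" << std::endl;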


Thank you @Robert_Crovella for your in-depth support!

I have reread my post, and I did miss mentioning that I was using google/benchmark as my benchmarking framework. My bad.

// The DoNotOptimize(...) function can be used to prevent a value or
// expression from being optimized away by the compiler. This function is
// intended to add little to no overhead.
// See: https://youtu.be/nXaxk27zwlk?t=2441
#ifndef BENCHMARK_HAS_NO_INLINE_ASSEMBLY
template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
  asm volatile("" : : "r,m"(value) : "memory");
}
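For completeness, the templated function bm from my original post is registered through the usual google/benchmark macros; a hypothetical registration matching the ranges in the code comments (assuming T = uint8_t) would look like:

BENCHMARK_TEMPLATE(bm, uint8_t)->Args({1920 * 1080 * 3, 2560 * 1920 * 3, 32});
BENCHMARK_MAIN();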

As for “very poorly”: in Nsight Systems I’m observing >10x the time for the cudaMallocFromPoolAsync API call.
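For anyone reproducing this, an Nsight Systems capture along these lines shows the per-call API times (the binary name is a placeholder):

nsys profile --trace=cuda --stats=true ./pool_benchmark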

In the original post, I wrote:

Compiled and ran with CUDART 11.2. Driver version 460.32.03. GPU is RTX 2080.

$ uname -a 
Linux desktop-18 5.4.0-91-generic #102~18.04.1-Ubuntu SMP Thu Nov 11 14:46:36 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux

I can’t think of any more information you might require, and the original post already contains everything you mentioned in your reply. Should you need anything else, please let me know. Thank you!

I made a small modification to your code to measure the API call time, and with it I was able to reproduce the observation from my original post.

#include <cstdint>
#include <cstdio>    // printf
#include <cstring>   // memset
#include <iostream>
#include <cstdlib>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

unsigned long long dtime_usec(unsigned long long start=0){

  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

int main(int argc, char *argv[]){
  bool supported = false;
  int device = 0;
  int driverVersion = 0;
  int deviceSupportsMemoryPools = 0;
  int poolSupportedHandleTypes = 0;
  cudaDriverGetVersion(&driverVersion);
  if (driverVersion >= 11020) {
    cudaDeviceGetAttribute(&deviceSupportsMemoryPools,
                           cudaDevAttrMemoryPoolsSupported, device);
    }
  if (deviceSupportsMemoryPools != 0) {
    // `device` supports the Stream Ordered Memory Allocator
    supported = true;
    std::cout << "Memory Pools supported!" << std::endl;
    }

  if (!supported) return 0;
  int loops = 2;
  int incr = 0;
  if (argc > 1) loops = atoi(argv[1]);
  if (argc > 2) incr = atoi(argv[2]);
  printf("Loops = %d , incr = %d\n", loops, incr);
  const int64_t data_length_0 = 1920 * 1080 * 3;
  const int64_t data_length_1 = 2560 * 1920 * 3;
  const int64_t num_allocations = 32;
  const int64_t allocation_size_in_bytes_0 = data_length_0;
  const int64_t allocation_size_in_bytes_1 = data_length_1;

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  cudaMemPoolProps pool_props;
  memset(&pool_props, 0, sizeof(pool_props));
  pool_props.allocType = cudaMemAllocationTypePinned;
  pool_props.location.id = 0;
  pool_props.location.type = cudaMemLocationTypeDevice;
  cudaMemPool_t mem_pool;

  cudaError_t err = cudaMemPoolCreate(&mem_pool, &pool_props);
  if (err != cudaSuccess) {std::cout << "err 1: " << cudaGetErrorString(err) << std::endl;  return 0;}

  int64_t kMemoryPoolSizeUnit = (1ul << 25)+incr;  // 32 MiB, undocumented???
  uint64_t memory_pool_size =
      ((allocation_size_in_bytes_0 + allocation_size_in_bytes_1) * num_allocations +
       kMemoryPoolSizeUnit - 1) /
      kMemoryPoolSizeUnit * kMemoryPoolSizeUnit;
  printf ("Memory pool size: (dec) %lu , (hex) %lX\n",memory_pool_size, memory_pool_size);

  // memory_pool_size = (allocation_size_in_bytes_0 + allocation_size_in_bytes_1) * num_allocations; // much slower!!
  err = cudaMemPoolSetAttribute(mem_pool, cudaMemPoolAttrReleaseThreshold, &memory_pool_size);
  if (err != cudaSuccess) {std::cout << "err 2: " << cudaGetErrorString(err) << std::endl;  return 0;}

  err = cudaStreamSynchronize(stream);
  if (err != cudaSuccess) {std::cout << "err 3: " << cudaGetErrorString(err) << std::endl;  return 0;}

  void* allocated_ptrs[num_allocations * 2];
  uint64_t time_sum = 0;
  for (int qq = 0; qq < loops; qq++) {
    for (int count = 0; count < num_allocations; count++) {
      uint64_t t = dtime_usec(0);
      err = cudaMallocFromPoolAsync(&allocated_ptrs[count * 2],
                              allocation_size_in_bytes_0,
                              mem_pool,
                              stream);
      if (err != cudaSuccess) {std::cout << "err 4: " << cudaGetErrorString(err) << std::endl;  return 0;}
      err = cudaMallocFromPoolAsync(&allocated_ptrs[count * 2 + 1],
                              allocation_size_in_bytes_1,
                              mem_pool,
                              stream);
      if (err != cudaSuccess) {std::cout << "err 5: " << cudaGetErrorString(err) << std::endl;  return 0;}
      t = dtime_usec(t);
      time_sum += t;
    }
    err = cudaStreamSynchronize(stream);
    if (err != cudaSuccess) {std::cout << "err 6: " << cudaGetErrorString(err) << std::endl;  return 0;}
    for (int count = 0; count < num_allocations * 2; count++) {
      err = cudaFreeAsync(allocated_ptrs[count], stream);
      if (err != cudaSuccess) {std::cout << "err 7: " << cudaGetErrorString(err) << std::endl;  return 0;}
    }
    err = cudaStreamSynchronize(stream);
    if (err != cudaSuccess) {std::cout << "err 8: " << cudaGetErrorString(err) << std::endl;  return 0;}
  }
  err = cudaMemPoolDestroy(mem_pool);
  if (err != cudaSuccess) {std::cout << "err 9: " << cudaGetErrorString(err) << std::endl;  return 0;}
  std::cout << "elapsed time: " << time_sum << "us" <<  std::endl;
}

With incr = -10 the elapsed time is >20x longer (note that the resulting threshold, 0x27FFFF38, falls just below the 32 MiB-aligned value 0x28000000):

$ ./a.out 10000 -10
Memory Pools supported!
Loops = 10000 , incr = -10
Memory pool size: (dec) 671088440 , (hex) 27FFFF38
elapsed time: 6343497us

$ ./a.out 10000 10 
Memory Pools supported!
Loops = 10000 , incr = 10
Memory pool size: (dec) 671088840 , (hex) 280000C8
elapsed time: 269383us

$ ./a.out 10000 0 
Memory Pools supported!
Loops = 10000 , incr = 0
Memory pool size: (dec) 671088640 , (hex) 28000000
elapsed time: 267576us

I suggest filing a bug.