Cuda memory pool not thread safe?

Hi forum, I suspect that this is a bug or am I using it wrong? I’m trying to use the cudaMemPool in a multithreaded setup.


#include <pthread.h>
#include <iostream>
#include <string>
#include <thread>

#include "cuda_runtime.h"

struct UserData{
  cudaMemPool_t mem_pool;
  cudaStream_t stream;
};

void* f(void* data) {
  UserData* user_data = reinterpret_cast<UserData*>(data);
  cudaMemPool_t& mem_pool = user_data->mem_pool;
  cudaStream_t& stream = user_data->stream;
  void* ptr;
  cudaMallocFromPoolAsync(&ptr, 262144, mem_pool, stream);
  cudaStreamSynchronize(stream);
  cudaFreeAsync(ptr, stream);
  cudaStreamSynchronize(stream);
  return nullptr;
}

int main(int, char**) {
  cudaStream_t stream;
  cudaStreamCreateWithPriority(&stream, 0, 0);
  cudaMemPool_t mem_pool;
  cudaMemPoolProps pool_props{
      .allocType = cudaMemAllocationTypePinned,
      .location = {.type = cudaMemLocationTypeDevice, .id = 0}};
  cudaMemPoolCreate(&mem_pool, &pool_props);
  uint64_t threshold = 1048576;
  cudaMemPoolSetAttribute(
      mem_pool, cudaMemPoolAttrReleaseThreshold, &threshold);
  UserData data{
    .mem_pool = mem_pool, .stream = stream
  };
  pthread_t t1;
  pthread_create(&t1, nullptr, f, &data);
  pthread_t t2;
  pthread_create(&t2, nullptr, f, &data);
  pthread_join(t1, nullptr);
  pthread_join(t2, nullptr);
  return 0;
}

/*
Build:
$ /usr/bin/clang-6.0 -iquote /usr/local/cuda-11.2/targets/x86_64-linux/include -std=c++17 dynamic_mem_pool.cc \
-lstdc++ -lcudart -L"/usr/local/cuda-11.2/lib64" -ldl -lpthread -fsanitize=thread -o dynamic_mem_pool
Run:
$ ./dynamic_mem_pool
==================
WARNING: ThreadSanitizer: data race (pid=27748)
  Atomic read of size 1 at 0x7b140000cd28 by thread T5:
    #0 pthread_mutex_lock ??:? (dynamic_mem_pool+0x434027)
    #1 cuGetErrorString ??:? (libcuda.so.1+0x34803c)
  Previous write of size 8 at 0x7b140000cd28 by thread T4 (mutexes: read M63):
    #0 calloc ??:? (dynamic_mem_pool+0x44817c)
    #1 cuGetErrorString ??:? (libcuda.so.1+0x33d60a)
  Location is heap block of size 80 at 0x7b140000cd00 allocated by thread T4:
    #0 calloc ??:? (dynamic_mem_pool+0x44817c)
    #1 cuGetErrorString ??:? (libcuda.so.1+0x33d60a)
  Mutex M63 (0x7b1000000140) created at:
    #0 pthread_rwlock_init ??:? (dynamic_mem_pool+0x44d163)
    #1 cuGetErrorString ??:? (libcuda.so.1+0x28dda8)
    #2 cudaGraphicsVDPAURegisterOutputSurface ??:? (libcudart.so.11.0+0x65af8)
    #3 __libc_start_main /build/glibc-uZu3wS/glibc-2.27/csu/../csu/libc-start.c:310 (libc.so.6+0x21c86)
  Thread T5 (tid=27759, running) created by main thread at:
    #0 pthread_create ??:? (dynamic_mem_pool+0x427556)
    #1 main ??:? (dynamic_mem_pool+0x4b8f09)
  Thread T4 (tid=27758, running) created by main thread at:
    #0 pthread_create ??:? (dynamic_mem_pool+0x427556)
    #1 main ??:? (dynamic_mem_pool+0x4b8ee2)
SUMMARY: ThreadSanitizer: data race ??:? in __interceptor_pthread_mutex_lock
==================
ThreadSanitizer: reported 1 warnings
*/
$ uname -a
Linux desktop-18 5.4.0-100-generic #113~18.04.1-Ubuntu SMP Mon Feb 7 15:02:59 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux
$ nvidia-smi -q
Timestamp                                 : Tue Mar 15 19:19:31 2022
Driver Version                            : 460.32.03
CUDA Version                              : 11.2

Attached GPUs                             : 1
GPU 00000000:01:00.0
    Product Name                          : GeForce RTX 2080
    Product Brand                         : GeForce
    Display Mode                          : Enabled
    Display Active                        : Enabled
    Persistence Mode                      : Enabled
    MIG Mode
        Current                           : N/A
        Pending                           : N/A
    Accounting Mode                       : Disabled
    Accounting Mode Buffer Size           : 4000
    Driver Model
        Current                           : N/A
        Pending                           : N/A
    Serial Number                         : N/A
    GPU UUID                              : GPU-d26b6c5b-df3d-f186-214b-363ab0ef7692
    Minor Number                          : 0
    VBIOS Version                         : 90.04.23.40.AA
    MultiGPU Board                        : No
    Board ID                              : 0x100
    GPU Part Number                       : N/A
    Inforom Version
        Image Version                     : G001.0000.02.04
        OEM Object                        : 1.1
        ECC Object                        : N/A
        Power Management Object           : N/A
    GPU Operation Mode
        Current                           : N/A
        Pending                           : N/A
    GPU Virtualization Mode
        Virtualization Mode               : None
        Host VGPU Mode                    : N/A
    IBMNPU
        Relaxed Ordering Mode             : N/A
    PCI
        Bus                               : 0x01
        Device                            : 0x00
        Domain                            : 0x0000
        Device Id                         : 0x1E8210DE
        Bus Id                            : 00000000:01:00.0
        Sub System Id                     : 0x37C11458
        GPU Link Info
            PCIe Generation
                Max                       : 3
                Current                   : 2
            Link Width
                Max                       : 16x
                Current                   : 16x
        Bridge Chip
            Type                          : N/A
            Firmware                      : N/A
        Replays Since Reset               : 0
        Replay Number Rollovers           : 0
        Tx Throughput                     : 2000 KB/s
        Rx Throughput                     : 12000 KB/s
    Fan Speed                             : 0 %
    Performance State                     : P5
    Clocks Throttle Reasons
        Idle                              : Active
        Applications Clocks Setting       : Not Active
        SW Power Cap                      : Not Active
        HW Slowdown                       : Not Active
            HW Thermal Slowdown           : Not Active
            HW Power Brake Slowdown       : Not Active
        Sync Boost                        : Not Active
        SW Thermal Slowdown               : Not Active
        Display Clock Setting             : Not Active
    FB Memory Usage
        Total                             : 7948 MiB
        Used                              : 1717 MiB
        Free                              : 6231 MiB
    BAR1 Memory Usage
        Total                             : 256 MiB
        Used                              : 105 MiB
        Free                              : 151 MiB
    Compute Mode                          : Default
    Utilization
        Gpu                               : 1 %
        Memory                            : 6 %
        Encoder                           : 0 %
        Decoder                           : 0 %
    Encoder Stats
        Active Sessions                   : 0
        Average FPS                       : 0
        Average Latency                   : 0
    FBC Stats
        Active Sessions                   : 0
        Average FPS                       : 0
        Average Latency                   : 0
    Ecc Mode
        Current                           : N/A
        Pending                           : N/A
    ECC Errors
        Volatile
            SRAM Correctable              : N/A
            SRAM Uncorrectable            : N/A
            DRAM Correctable              : N/A
            DRAM Uncorrectable            : N/A
        Aggregate
            SRAM Correctable              : N/A
            SRAM Uncorrectable            : N/A
            DRAM Correctable              : N/A
            DRAM Uncorrectable            : N/A
    Retired Pages
        Single Bit ECC                    : N/A
        Double Bit ECC                    : N/A
        Pending Page Blacklist            : N/A
    Remapped Rows                         : N/A
    Temperature
        GPU Current Temp                  : 58 C
        GPU Shutdown Temp                 : 100 C
        GPU Slowdown Temp                 : 97 C
        GPU Max Operating Temp            : 88 C
        GPU Target Temperature            : 83 C
        Memory Current Temp               : N/A
        Memory Max Operating Temp         : N/A
    Power Readings
        Power Management                  : Supported
        Power Draw                        : 18.55 W
        Power Limit                       : 215.00 W
        Default Power Limit               : 215.00 W
        Enforced Power Limit              : 215.00 W
        Min Power Limit                   : 125.00 W
        Max Power Limit                   : 240.00 W
    Clocks
        Graphics                          : 720 MHz
        SM                                : 720 MHz
        Memory                            : 810 MHz
        Video                             : 675 MHz
    Applications Clocks
        Graphics                          : N/A
        Memory                            : N/A
    Default Applications Clocks
        Graphics                          : N/A
        Memory                            : N/A
    Max Clocks
        Graphics                          : 2100 MHz
        SM                                : 2100 MHz
        Memory                            : 7000 MHz
        Video                             : 1950 MHz
    Max Customer Boost Clocks
        Graphics                          : N/A
    Clock Policy
        Auto Boost                        : N/A
        Auto Boost Default                : N/A
    Processes
        <Processes omitted>

I tried to reproduce your issue locally and found the same results. However, when linking with a debug CUDA driver, the issue was no longer reproducible.

My assumption is that the driver would need to be compiled with -fsanitize=thread as well for this setup to report race conditions accurately.

1 Like

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.