Hi forum, I suspect that this is a bug or am I using it wrong? I’m trying to use the cudaMemPool in a multithreaded setup.
#include <pthread.h>
#include <iostream>
#include <string>
#include <thread>
#include "cuda_runtime.h"
struct UserData{
cudaMemPool_t mem_pool;
cudaStream_t stream;
};
void* f(void* data) {
UserData* user_data = reinterpret_cast<UserData*>(data);
cudaMemPool_t& mem_pool = user_data->mem_pool;
cudaStream_t& stream = user_data->stream;
void* ptr;
cudaMallocFromPoolAsync(&ptr, 262144, mem_pool, stream);
cudaStreamSynchronize(stream);
cudaFreeAsync(ptr, stream);
cudaStreamSynchronize(stream);
return nullptr;
}
int main(int, char**) {
cudaStream_t stream;
cudaStreamCreateWithPriority(&stream, 0, 0);
cudaMemPool_t mem_pool;
cudaMemPoolProps pool_props{
.allocType = cudaMemAllocationTypePinned,
.location = {.type = cudaMemLocationTypeDevice, .id = 0}};
cudaMemPoolCreate(&mem_pool, &pool_props);
uint64_t threshold = 1048576;
cudaMemPoolSetAttribute(
mem_pool, cudaMemPoolAttrReleaseThreshold, &threshold);
UserData data{
.mem_pool = mem_pool, .stream = stream
};
pthread_t t1;
pthread_create(&t1, nullptr, f, &data);
pthread_t t2;
pthread_create(&t2, nullptr, f, &data);
pthread_join(t1, nullptr);
pthread_join(t2, nullptr);
return 0;
}
/*
Build:
$ /usr/bin/clang-6.0 -iquote /usr/local/cuda-11.2/targets/x86_64-linux/include -std=c++17 dynamic_mem_pool.cc \
-lstdc++ -lcudart -L"/usr/local/cuda-11.2/lib64" -ldl -lpthread -fsanitize=thread -o dynamic_mem_pool
Run:
$ ./dynamic_mem_pool
==================
WARNING: ThreadSanitizer: data race (pid=27748)
Atomic read of size 1 at 0x7b140000cd28 by thread T5:
#0 pthread_mutex_lock ??:? (dynamic_mem_pool+0x434027)
#1 cuGetErrorString ??:? (libcuda.so.1+0x34803c)
Previous write of size 8 at 0x7b140000cd28 by thread T4 (mutexes: read M63):
#0 calloc ??:? (dynamic_mem_pool+0x44817c)
#1 cuGetErrorString ??:? (libcuda.so.1+0x33d60a)
Location is heap block of size 80 at 0x7b140000cd00 allocated by thread T4:
#0 calloc ??:? (dynamic_mem_pool+0x44817c)
#1 cuGetErrorString ??:? (libcuda.so.1+0x33d60a)
Mutex M63 (0x7b1000000140) created at:
#0 pthread_rwlock_init ??:? (dynamic_mem_pool+0x44d163)
#1 cuGetErrorString ??:? (libcuda.so.1+0x28dda8)
#2 cudaGraphicsVDPAURegisterOutputSurface ??:? (libcudart.so.11.0+0x65af8)
#3 __libc_start_main /build/glibc-uZu3wS/glibc-2.27/csu/../csu/libc-start.c:310 (libc.so.6+0x21c86)
Thread T5 (tid=27759, running) created by main thread at:
#0 pthread_create ??:? (dynamic_mem_pool+0x427556)
#1 main ??:? (dynamic_mem_pool+0x4b8f09)
Thread T4 (tid=27758, running) created by main thread at:
#0 pthread_create ??:? (dynamic_mem_pool+0x427556)
#1 main ??:? (dynamic_mem_pool+0x4b8ee2)
SUMMARY: ThreadSanitizer: data race ??:? in __interceptor_pthread_mutex_lock
==================
ThreadSanitizer: reported 1 warnings
*/
$ uname -a
Linux desktop-18 5.4.0-100-generic #113~18.04.1-Ubuntu SMP Mon Feb 7 15:02:59 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux
$ nvidia-smi -q
Timestamp : Tue Mar 15 19:19:31 2022
Driver Version : 460.32.03
CUDA Version : 11.2
Attached GPUs : 1
GPU 00000000:01:00.0
Product Name : GeForce RTX 2080
Product Brand : GeForce
Display Mode : Enabled
Display Active : Enabled
Persistence Mode : Enabled
MIG Mode
Current : N/A
Pending : N/A
Accounting Mode : Disabled
Accounting Mode Buffer Size : 4000
Driver Model
Current : N/A
Pending : N/A
Serial Number : N/A
GPU UUID : GPU-d26b6c5b-df3d-f186-214b-363ab0ef7692
Minor Number : 0
VBIOS Version : 90.04.23.40.AA
MultiGPU Board : No
Board ID : 0x100
GPU Part Number : N/A
Inforom Version
Image Version : G001.0000.02.04
OEM Object : 1.1
ECC Object : N/A
Power Management Object : N/A
GPU Operation Mode
Current : N/A
Pending : N/A
GPU Virtualization Mode
Virtualization Mode : None
Host VGPU Mode : N/A
IBMNPU
Relaxed Ordering Mode : N/A
PCI
Bus : 0x01
Device : 0x00
Domain : 0x0000
Device Id : 0x1E8210DE
Bus Id : 00000000:01:00.0
Sub System Id : 0x37C11458
GPU Link Info
PCIe Generation
Max : 3
Current : 2
Link Width
Max : 16x
Current : 16x
Bridge Chip
Type : N/A
Firmware : N/A
Replays Since Reset : 0
Replay Number Rollovers : 0
Tx Throughput : 2000 KB/s
Rx Throughput : 12000 KB/s
Fan Speed : 0 %
Performance State : P5
Clocks Throttle Reasons
Idle : Active
Applications Clocks Setting : Not Active
SW Power Cap : Not Active
HW Slowdown : Not Active
HW Thermal Slowdown : Not Active
HW Power Brake Slowdown : Not Active
Sync Boost : Not Active
SW Thermal Slowdown : Not Active
Display Clock Setting : Not Active
FB Memory Usage
Total : 7948 MiB
Used : 1717 MiB
Free : 6231 MiB
BAR1 Memory Usage
Total : 256 MiB
Used : 105 MiB
Free : 151 MiB
Compute Mode : Default
Utilization
Gpu : 1 %
Memory : 6 %
Encoder : 0 %
Decoder : 0 %
Encoder Stats
Active Sessions : 0
Average FPS : 0
Average Latency : 0
FBC Stats
Active Sessions : 0
Average FPS : 0
Average Latency : 0
Ecc Mode
Current : N/A
Pending : N/A
ECC Errors
Volatile
SRAM Correctable : N/A
SRAM Uncorrectable : N/A
DRAM Correctable : N/A
DRAM Uncorrectable : N/A
Aggregate
SRAM Correctable : N/A
SRAM Uncorrectable : N/A
DRAM Correctable : N/A
DRAM Uncorrectable : N/A
Retired Pages
Single Bit ECC : N/A
Double Bit ECC : N/A
Pending Page Blacklist : N/A
Remapped Rows : N/A
Temperature
GPU Current Temp : 58 C
GPU Shutdown Temp : 100 C
GPU Slowdown Temp : 97 C
GPU Max Operating Temp : 88 C
GPU Target Temperature : 83 C
Memory Current Temp : N/A
Memory Max Operating Temp : N/A
Power Readings
Power Management : Supported
Power Draw : 18.55 W
Power Limit : 215.00 W
Default Power Limit : 215.00 W
Enforced Power Limit : 215.00 W
Min Power Limit : 125.00 W
Max Power Limit : 240.00 W
Clocks
Graphics : 720 MHz
SM : 720 MHz
Memory : 810 MHz
Video : 675 MHz
Applications Clocks
Graphics : N/A
Memory : N/A
Default Applications Clocks
Graphics : N/A
Memory : N/A
Max Clocks
Graphics : 2100 MHz
SM : 2100 MHz
Memory : 7000 MHz
Video : 1950 MHz
Max Customer Boost Clocks
Graphics : N/A
Clock Policy
Auto Boost : N/A
Auto Boost Default : N/A
Processes
<Processes omitted>