TL;DR
On GB200 (GB200 NVL72), cuMemCreate takes ~88–89 ms for a 1 GiB VMM allocation, while on H200 NVL it’s ~0.08–0.13 ms under similar conditions. Repro code and Nsight Systems excerpts below.
I’d like to confirm whether this is expected on GB200 (e.g., architectural/firmware behavior), a known issue, or if there are recommended props/flags to reduce cuMemCreate latency.
Testbeds
GB200 (part of GB200 NVL72)
Tried multiple driver/CUDA combos; results are similar.
NVIDIA-SMI version : 570.133.20
NVML version : 570.133
DRIVER version : 570.133.20
CUDA Version : 12.8
NVCC Version : cuda_12.8.r12.8/compiler.35404655_0
NVIDIA-SMI version : 570.158.01
NVML version : 570.158
DRIVER version : 570.158.01
CUDA Version : 12.8
NVCC Version : cuda_12.8.r12.8/compiler.35404655_0
NVIDIA-SMI version : 580.95.05
NVML version : 580.95
DRIVER version : 580.95.05
CUDA Version : 13.0
NVCC Version : cuda_13.0.r13.0/compiler.36424714_0
H200 NVL
NVIDIA-SMI version : 575.57.08
NVML version : 575.57
DRIVER version : 575.57.08
CUDA Version : 12.9
NVCC Version : cuda_12.4.r12.4/compiler.34097967_0
I also tested an RTX 4090; cuMemCreate(1 GiB) is sub-millisecond there (similar to H200) in my tests.
Reproduction
Code
memory_bench.cu
#include <cuda.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
#include <algorithm>
#include <chrono>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <limits>
#include <memory>
#include <string>
#include <thread>
#include <vector>
// ===================================================================
// Constants
// ===================================================================
constexpr size_t GIB_TO_BYTES = 1ULL << 30;      // bytes in one GiB
constexpr size_t MIB_TO_BYTES = 1ULL << 20;      // bytes in one MiB
constexpr size_t WARMUP_SIZE = 8 * MIB_TO_BYTES; // buffer size used by the warmup helpers
constexpr int PRECISION = 3;                     // decimal places for printed timings (ms)
// ===================================================================
// Error Handling
// ===================================================================
// Print a fatal error message to stderr and terminate the process with status 1.
[[noreturn]] static void die(const std::string &msg) {
  std::cerr << "FATAL: " << msg << "\n";
  std::exit(1);
}
// Wrap a CUDA driver / runtime call; on failure, report the failing expression
// with file:line context and exit.
#define CHECK_CU(expr) check_cu_error((expr), #expr, __FILE__, __LINE__)
#define CHECK_RT(expr) check_rt_error((expr), #expr, __FILE__, __LINE__)
// Abort with the driver's error name/description if `result` is not CUDA_SUCCESS.
static void check_cu_error(CUresult result, const char *expr, const char *file, int line) {
  if (result != CUDA_SUCCESS) {
    const char *name = nullptr, *str = nullptr;
    // These lookups can fail for unrecognized codes; fall back to "unknown".
    cuGetErrorName(result, &name);
    cuGetErrorString(result, &str);
    std::cerr << "Driver API error at " << file << ":" << line << "\n"
              << " Expression: " << expr << "\n"
              << " Error: " << (name ? name : "unknown") << " - "
              << (str ? str : "unknown") << "\n";
    std::exit(1);
  }
}
// Abort with a diagnostic if a CUDA runtime API call did not succeed.
static void check_rt_error(cudaError_t result, const char *expr, const char *file, int line) {
  if (result == cudaSuccess) {
    return;
  }
  std::cerr << "Runtime API error at " << file << ":" << line << "\n"
            << " Expression: " << expr << "\n"
            << " Error: " << cudaGetErrorString(result) << "\n";
  std::exit(1);
}
// ===================================================================
// Timing Utilities
// ===================================================================
// Timestamps come from the monotonic steady clock, so measured intervals are
// unaffected by wall-clock adjustments.
using TimePoint = std::chrono::steady_clock::time_point;

// Elapsed time from `a` to `b`, expressed in fractional milliseconds.
static inline double ms_between(TimePoint a, TimePoint b) {
  const std::chrono::duration<double, std::milli> elapsed = b - a;
  return elapsed.count();
}
// ===================================================================
// CLI Configuration
// ===================================================================
// Parsed command-line options with their defaults (see CLIParser).
struct CLI {
  std::string mode = "vmm"; // benchmark to run: "vmm" or "pool" (dispatched in main)
  int device = 0;           // CUDA device ordinal
  bool warmup = true;       // run a small warmup allocation before each timed run
  // VMM mode
  size_t gib = 1;           // VA arena size in GiB
  size_t chunk_mib = 0;     // map/SetAccess chunk size in MiB; 0 = single batch
  int repeats = 3;          // number of timed runs
  // Pool mode
  int pool_iters = 1000;     // alloc/free iterations
  size_t pool_block_mib = 8; // allocation size per op in MiB
  // SetAcc mode
  // NOTE(review): no "setacc" mode is dispatched in main and no CLI flags
  // populate these fields, so they appear unused in this file — confirm
  // before removing.
  size_t setacc_total_mib = 4096;
  size_t setacc_slice_mib = 16;
};
// Parses argv into a CLI config. Handles --help/-h, value-carrying options,
// and the value-less --no-warmup flag.
class CLIParser {
public:
  // Parse command-line arguments. Prints help and exits on --help/-h; dies on
  // unknown options or on a value-carrying option missing its value.
  static CLI parse(int argc, char **argv) {
    CLI result;
    for (int i = 1; i < argc; ++i) {
      std::string arg(argv[i]);
      if (arg == "--help" || arg == "-h") {
        print_help();
        std::exit(0);
      }
      parse_arg(result, arg, i, argc, argv);
    }
    return result;
  }

private:
  // Dispatch a single option; advances `i` past the option's value when the
  // option takes one.
  static void parse_arg(CLI &c, const std::string &arg, int &i, int argc, char **argv) {
    // Fetch the value for a value-carrying option, dying if it is absent.
    // The missing-value check lives here (not up front in parse()) so that
    // value-less flags such as --no-warmup are accepted even when they are
    // the last argument on the command line.
    auto get_value = [&]() {
      if (i + 1 >= argc)
        die("Missing value for " + arg);
      return std::string(argv[++i]);
    };
    if (arg == "--mode")
      c.mode = get_value();
    else if (arg == "--gib")
      c.gib = std::stoull(get_value());
    else if (arg == "--chunk")
      c.chunk_mib = std::stoull(get_value());
    else if (arg == "--repeats")
      c.repeats = std::stoi(get_value());
    else if (arg == "--no-warmup")
      c.warmup = false;
    else if (arg == "--iters")
      c.pool_iters = std::stoi(get_value());
    else if (arg == "--block-mib")
      c.pool_block_mib = std::stoull(get_value());
    else if (arg == "--dev")
      c.device = std::stoi(get_value());
    else
      die("Unknown argument: " + arg);
  }
  // Print usage. Only the modes actually dispatched in main() are advertised.
  static void print_help() {
    std::cout << R"(Usage: ./memory_bench [--mode vmm|pool] [options]
Modes and options:
--mode vmm Virtual Memory Management benchmark
--gib N : VA arena size in GiB (default 1)
--chunk M : chunk size for map/SetAccess in MiB; 0 = single batch (default 0)
--repeats R : number of runs (default 3)
--no-warmup : disable pre-test warmup
--dev D : device ordinal (default 0)
--mode pool Async Memory Pool benchmark
--iters I : alloc/free iterations (default 1000)
--block-mib B : allocation size per op in MiB (default 8)
--dev D : device ordinal (default 0)
Examples:
./memory_bench --mode vmm --gib 1 --chunk 0 --repeats 3 --dev 0
./memory_bench --mode pool --iters 1000 --block-mib 8 --dev 0
)";
  }
};
// ===================================================================
// CUDA Utilities
// ===================================================================
// RAII holder for a device's primary CUDA context: retained and made current
// on construction, released on destruction.
class CUDAContext {
public:
  explicit CUDAContext(int device_id) : device_id_(device_id), context_(nullptr) {
    CHECK_CU(cuInit(0));
    CUdevice dev;
    CHECK_CU(cuDeviceGet(&dev, device_id));
    CHECK_CU(cuDevicePrimaryCtxRetain(&context_, dev));
    CHECK_CU(cuCtxSetCurrent(context_));
  }

  // Non-copyable / non-movable: a copy would release the primary context
  // twice in the destructors.
  CUDAContext(const CUDAContext &) = delete;
  CUDAContext &operator=(const CUDAContext &) = delete;
  CUDAContext(CUDAContext &&) = delete;
  CUDAContext &operator=(CUDAContext &&) = delete;

  ~CUDAContext() {
    if (context_) {
      // Best-effort teardown: errors are deliberately ignored here because
      // throwing/exiting from a destructor during shutdown helps nobody.
      cuCtxSetCurrent(nullptr);
      CUdevice dev;
      cuDeviceGet(&dev, device_id_);
      cuDevicePrimaryCtxRelease(dev);
    }
  }

private:
  int device_id_;     // device ordinal whose primary context is held
  CUcontext context_; // retained primary context (nullptr if never acquired)
};
// Empty kernel launched during warmup; performs no work.
__global__ void noop_kernel() {}
// Warm up the driver-API allocation path: allocate, touch (memset), and free
// a small buffer so later timings are not polluted by first-use setup costs.
static void legacy_warmup_driver() {
  CUdeviceptr d = 0;
  CHECK_CU(cuMemAlloc(&d, WARMUP_SIZE));
  CHECK_CU(cuMemsetD8(d, 0, WARMUP_SIZE));
  CHECK_CU(cuMemFree(d));
}
// Warm up the runtime-API / stream-ordered path on stream `s`: one async
// alloc + memset + kernel launch + free, then synchronize so all setup work
// completes before any timed section starts.
static void runtime_warmup(cudaStream_t s) {
  void *p = nullptr;
  CHECK_RT(cudaMallocAsync(&p, WARMUP_SIZE, s));
  CHECK_RT(cudaMemsetAsync(p, 0, WARMUP_SIZE, s));
  noop_kernel<<<1, 1, 0, s>>>();
  CHECK_RT(cudaFreeAsync(p, s));
  CHECK_RT(cudaStreamSynchronize(s));
}
// ===================================================================
// Size Utilities
// ===================================================================
// Round `value` up to the next multiple of `alignment` and narrow to size_t.
// Dies on alignment == 0 and on overflow instead of silently wrapping.
static size_t align_up(uint64_t value, uint64_t alignment) {
  if (alignment == 0) {
    die("Alignment must be non-zero (value=" + std::to_string(value) + ")");
  }
  // Guard the `value + alignment - 1` addition against uint64 wrap-around;
  // a wrapped sum would produce a small "aligned" result that passes the
  // range check below while being wrong.
  if (value > std::numeric_limits<uint64_t>::max() - (alignment - 1)) {
    die("Size overflow or alignment error (value=" + std::to_string(value) + ")");
  }
  const uint64_t aligned = ((value + alignment - 1) / alignment) * alignment;
  if (aligned == 0 || aligned > static_cast<uint64_t>(std::numeric_limits<size_t>::max())) {
    die("Size overflow or alignment error (value=" + std::to_string(value) + ")");
  }
  return static_cast<size_t>(aligned);
}
// ===================================================================
// VMM Benchmark
// ===================================================================
// Times each phase of a CUDA virtual-memory-management allocation over a
// c.gib GiB VA arena, repeated c.repeats times: cuMemCreate (physical
// backing), cuMemMap (VA -> physical), cuMemSetAccess (page permissions),
// and teardown (unmap + release + address free).
void run_vmm_benchmark(const CLI &c) {
  CUDAContext ctx(c.device);
  // Device info
  CUdevice dev;
  CHECK_CU(cuDeviceGet(&dev, c.device));
  char dev_name[256] = {};
  CHECK_CU(cuDeviceGetName(dev_name, 255, dev));
  int compute_major = 0, compute_minor = 0;
  CHECK_CU(cuDeviceGetAttribute(&compute_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
  CHECK_CU(cuDeviceGetAttribute(&compute_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));
  std::cout << "Device : " << c.device << " (" << dev_name
            << ", Compute " << compute_major << "." << compute_minor << ")\n"
            << "Arena : " << c.gib << " GiB\n"
            << "Mapping mode : "
            << (c.chunk_mib == 0 ? "single batch" : "chunks of " + std::to_string(c.chunk_mib) + " MiB") << "\n"
            << "Repeats : " << c.repeats << "\n"
            << "Warmup : " << (c.warmup ? "enabled" : "disabled") << "\n\n";
  // Allocation properties: pinned device memory on device c.device, with no
  // exportable (IPC/shareable) handle requested.
  CUmemAllocationProp prop{};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = c.device;
  prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_NONE;
  // All sizes passed to the VMM APIs must be multiples of the granularity.
  size_t granularity = 0;
  CHECK_CU(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
  const size_t total_bytes = align_up(static_cast<uint64_t>(c.gib) * GIB_TO_BYTES, granularity);
  // chunk_bytes controls the stride of the map/SetAccess/unmap loops below;
  // 0 from the CLI means "one single batch covering the whole arena".
  size_t chunk_bytes = total_bytes;
  if (c.chunk_mib != 0) {
    const uint64_t chunk_u64 = static_cast<uint64_t>(c.chunk_mib) * MIB_TO_BYTES;
    chunk_bytes = align_up(chunk_u64, granularity);
    if (chunk_bytes > total_bytes)
      chunk_bytes = total_bytes;
  }
  // Main benchmark loop
  for (int rep = 0; rep < c.repeats; ++rep) {
    if (c.warmup)
      legacy_warmup_driver();
    // Reserve virtual address range
    CUdeviceptr base = 0;
    CHECK_CU(cuMemAddressReserve(&base, total_bytes, granularity, 0, 0));
    // Measure cuMemCreate
    auto t0 = std::chrono::steady_clock::now();
    // describes the properties of the memory to allocate, e.g. physical location
    CUmemGenericAllocationHandle handle;
    // create the chunk of memory according to 'prop' and get a handle to it
    CHECK_CU(cuMemCreate(&handle, total_bytes, &prop, 0));
    auto t1 = std::chrono::steady_clock::now();
    // Measure cuMemMap
    auto t2 = std::chrono::steady_clock::now();
    for (size_t off = 0; off < total_bytes; off += chunk_bytes) {
      const size_t sz = std::min(chunk_bytes, total_bytes - off);
      // Map the virtual address range to the physical location described by 'handle'
      // NOTE(review): the handle offset argument is fixed at 0, so with
      // --chunk != 0 every chunk maps the first `sz` bytes of the same
      // handle (chunks alias the same physical pages). That is fine for
      // latency timing but is not a faithful chunked allocation — confirm
      // this is intended.
      CHECK_CU(cuMemMap(base + off, sz, 0, handle, 0));
    }
    auto t3 = std::chrono::steady_clock::now();
    // Measure cuMemSetAccess
    CUmemAccessDesc acc{};
    acc.location = prop.location;
    acc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    auto t4 = std::chrono::steady_clock::now();
    for (size_t off = 0; off < total_bytes; off += chunk_bytes) {
      const size_t sz = std::min(chunk_bytes, total_bytes - off);
      // To enable access to the mapped region, set the access permissions
      CHECK_CU(cuMemSetAccess(base + off, sz, &acc, 1));
    }
    auto t5 = std::chrono::steady_clock::now();
    // Measure cleanup: unmap all chunks, release the physical handle, and
    // free the VA reservation.
    auto t6 = std::chrono::steady_clock::now();
    for (size_t off = 0; off < total_bytes; off += chunk_bytes) {
      const size_t sz = std::min(chunk_bytes, total_bytes - off);
      CHECK_CU(cuMemUnmap(base + off, sz));
    }
    CHECK_CU(cuMemRelease(handle));
    CHECK_CU(cuMemAddressFree(base, total_bytes));
    auto t7 = std::chrono::steady_clock::now();
    std::cout << std::fixed << std::setprecision(PRECISION)
              << "Run " << (rep + 1) << "/" << c.repeats << ":\n"
              << " cuMemCreate : " << ms_between(t0, t1) << " ms\n"
              << " cuMemMap : " << ms_between(t2, t3) << " ms\n"
              << " cuMemSetAccess : " << ms_between(t4, t5) << " ms\n"
              << " Unmap + AddressFree : " << ms_between(t6, t7) << " ms\n\n";
  }
}
// ===================================================================
// Pool Benchmark
// ===================================================================
// Times cudaMallocAsync/cudaFreeAsync against the device's default memory
// pool: a first batch of allocations (starting from a trimmed pool), a batch
// of frees, then a post-idle alloc+free batch that should be served from the
// retained pool (the release threshold below prevents trimming).
void run_pool_benchmark(const CLI &c) {
  CHECK_RT(cudaSetDevice(c.device));
  cudaDeviceProp prop{};
  CHECK_RT(cudaGetDeviceProperties(&prop, c.device));
  std::cout << "Device : " << c.device << " (" << prop.name << ")\n";
  cudaMemPool_t pool;
  CHECK_RT(cudaDeviceGetDefaultMemPool(&pool, c.device));
  // Configure pool to never trim and allow reuse
  unsigned long long threshold = ~0ULL; // max threshold => freed memory stays pooled
  CHECK_RT(cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold));
  int one = 1;
  CHECK_RT(cudaMemPoolSetAttribute(pool, cudaMemPoolReuseAllowOpportunistic, &one));
  CHECK_RT(cudaMemPoolSetAttribute(pool, cudaMemPoolReuseAllowInternalDependencies, &one));
  // Start from an empty pool so the first alloc batch includes pool growth.
  CHECK_RT(cudaMemPoolTrimTo(pool, 0));
  cudaStream_t stream;
  CHECK_RT(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  runtime_warmup(stream);
  const size_t block_bytes = c.pool_block_mib * MIB_TO_BYTES;
  std::vector<void *> ptrs;
  ptrs.reserve(c.pool_iters);
  // Allocate
  auto t0 = std::chrono::steady_clock::now();
  for (int i = 0; i < c.pool_iters; ++i) {
    void *p = nullptr;
    CHECK_RT(cudaMallocAsync(&p, block_bytes, stream));
    ptrs.push_back(p);
  }
  CHECK_RT(cudaStreamSynchronize(stream));
  auto t1 = std::chrono::steady_clock::now();
  // Free
  for (void *p : ptrs)
    CHECK_RT(cudaFreeAsync(p, stream));
  CHECK_RT(cudaStreamSynchronize(stream));
  auto t2 = std::chrono::steady_clock::now();
  std::cout << std::fixed << std::setprecision(PRECISION)
            << "Allocated " << c.pool_iters << " x " << c.pool_block_mib
            << " MiB blocks : " << ms_between(t0, t1) << " ms\n"
            << "Freed " << c.pool_iters << " blocks : "
            << ms_between(t1, t2) << " ms\n";
  // Simulate idle time
  std::this_thread::sleep_for(std::chrono::milliseconds(500));
  runtime_warmup(stream);
  // Post-idle alloc+free: with the max release threshold set above, these
  // ops should be served out of already-pooled memory.
  auto t3 = std::chrono::steady_clock::now();
  for (int i = 0; i < c.pool_iters; ++i) {
    void *p = nullptr;
    CHECK_RT(cudaMallocAsync(&p, block_bytes, stream));
    CHECK_RT(cudaFreeAsync(p, stream));
  }
  CHECK_RT(cudaStreamSynchronize(stream));
  auto t4 = std::chrono::steady_clock::now();
  std::cout << "Post-sleep alloc+free (" << c.pool_iters << " ops) : "
            << ms_between(t3, t4) << " ms\n";
  CHECK_RT(cudaStreamDestroy(stream));
}
// ===================================================================
// Main
// ===================================================================
int main(int argc, char **argv) {
CLI cli = CLIParser::parse(argc, argv);
cudaProfilerStart();
try {
if (cli.mode == "vmm") {
run_vmm_benchmark(cli);
} else if (cli.mode == "pool") {
run_pool_benchmark(cli);
} else {
die("Unknown benchmark mode: " + cli.mode);
}
} catch (const std::exception &e) {
std::cerr << "Exception: " << e.what() << "\n";
cudaProfilerStop();
return 1;
}
cudaProfilerStop();
return 0;
}
Notes
- VMM path does cuMemAddressReserve → cuMemCreate(total_bytes) → cuMemMap → cuMemSetAccess → unmap + free.
- No peer access, single GPU (--dev 0), single thread.
- Warmup clears driver/runtime cold-start effects for basic ops; cuMemCreate remains dominant on GB200.
Reproduction steps
# GB200 (Blackwell, sm_100)
nvcc -O3 -std=c++17 -gencode arch=compute_100,code=sm_100 memory_bench.cu -o memory_bench -lcuda
# H200 NVL (Hopper, sm_90)
nvcc -O3 -std=c++17 -gencode arch=compute_90,code=sm_90 memory_bench.cu -o memory_bench -lcuda
# RTX 4090 (Ada, sm_89)
# nvcc -O3 -std=c++17 -gencode arch=compute_89,code=sm_89 memory_bench.cu -o memory_bench -lcuda
# run vmm benchmark
./memory_bench --mode vmm --gib 1 --chunk 0 --dev 0 --repeats 3
# run async pool benchmark
./memory_bench --mode pool --iters 1000 --block-mib 8 --dev 0
Results
Full output logs
GB200
nvcc -O3 -std=c++17 -gencode arch=compute_100,code=sm_100 memory_bench.cu -o memory_bench -lcuda
./memory_bench --mode vmm --gib 1 --chunk 0 --dev 0 --repeats 3
Device : 0 (NVIDIA GB200, Compute 10.0)
Arena : 1 GiB
Mapping mode : single batch
Repeats : 3
Warmup : enabled
Run 1/3:
cuMemCreate : 87.823 ms
cuMemMap : 0.009 ms
cuMemSetAccess : 0.152 ms
Unmap + AddressFree : 1.008 ms
Run 2/3:
cuMemCreate : 87.784 ms
cuMemMap : 0.008 ms
cuMemSetAccess : 0.139 ms
Unmap + AddressFree : 0.985 ms
Run 3/3:
cuMemCreate : 89.189 ms
cuMemMap : 0.009 ms
cuMemSetAccess : 1.277 ms
Unmap + AddressFree : 1.438 ms
./memory_bench --mode pool --iters 1000 --block-mib 8 --dev 0
Device : 0 (NVIDIA GB200)
Allocated 1000 x 8 MiB blocks : 690.458 ms
Freed 1000 blocks : 0.512 ms
Post-sleep alloc+free (1000 ops) : 0.494 ms
H200 NVL
nvcc -O3 -std=c++17 -gencode arch=compute_90,code=sm_90 memory_bench.cu -o memory_bench -lcuda
./memory_bench --mode vmm --gib 1 --chunk 0 --dev 0 --repeats 3
Device : 0 (NVIDIA H200 NVL, Compute 9.0)
Arena : 1 GiB
Mapping mode : single batch
Repeats : 3
Warmup : enabled
Run 1/3:
cuMemCreate : 0.132 ms
cuMemMap : 0.003 ms
cuMemSetAccess : 0.085 ms
Unmap + AddressFree : 0.449 ms
Run 2/3:
cuMemCreate : 0.089 ms
cuMemMap : 0.001 ms
cuMemSetAccess : 0.078 ms
Unmap + AddressFree : 0.430 ms
Run 3/3:
cuMemCreate : 0.081 ms
cuMemMap : 0.001 ms
cuMemSetAccess : 0.077 ms
Unmap + AddressFree : 0.429 ms
./memory_bench --mode pool --iters 1000 --block-mib 8 --dev 0
Device : 0 (NVIDIA H200 NVL)
Allocated 1000 x 8 MiB blocks : 62.640 ms
Freed 1000 blocks : 0.279 ms
Post-sleep alloc+free (1000 ops) : 0.478 ms
VMM (1 GiB, single batch)
| GPU | cuMemCreate (ms) | cuMemMap (ms) | cuMemSetAccess (ms) | Unmap+Free (ms) |
|---|---|---|---|---|
| GB200 | 87.8 / 87.8 / 89.2 | 0.009 / 0.008 / 0.009 | 0.152 / 0.139 / 1.277 | 1.008 / 0.985 / 1.438 |
| H200 NVL | 0.132 / 0.089 / 0.081 | 0.003 / 0.001 / 0.001 | 0.085 / 0.078 / 0.077 | 0.449 / 0.430 / 0.429 |
cuMemCreate on GB200 is ~10^3x slower than on H200 for the same size.
Async pool sanity check (1000 × 8 MiB)
| GPU | Alloc batch (ms) | Free batch (ms) | Post-sleep alloc+free (ms, 1000 ops) |
|---|---|---|---|
| GB200 | 690.458 | 0.512 | 0.494 |
| H200 NVL | 62.640 | 0.279 | 0.478 |
Nsight Systems (CUDA API Summary excerpts)
GB200
** CUDA API Summary (cuda_api_sum):
Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- ------------- ------------- ----------- ----------- ----------- ----------------------------------
59.1 710,954,720 2,002 355,122.2 576.0 224 5,634,112 944,383.8 cudaMallocAsync_v11020
* 22.2 266,906,592 3 88,968,864.0 89,051,552.0 88,327,744 89,527,296 604,035.8 cuMemCreate
17.7 213,328,960 1 213,328,960.0 213,328,960.0 213,328,960 213,328,960 0.0 cudaProfilerStart
0.2 2,883,488 3 961,162.7 959,968.0 950,080 973,440 11,725.7 cuMemRelease
0.2 2,605,920 3 868,640.0 826,944.0 810,176 968,800 87,145.3 cuMemAlloc_v2
0.2 2,154,368 3 718,122.7 192,832.0 167,968 1,793,568 931,445.9 cuMemSetAccess
0.1 1,146,560 1 1,146,560.0 1,146,560.0 1,146,560 1,146,560 0.0 cudaGetDeviceProperties_v2_v12000
0.1 964,896 2,002 482.0 448.0 320 6,176 237.6 cudaFreeAsync_v11020
0.0 330,464 3 110,154.7 103,232.0 90,560 136,672 23,822.7 cuMemFree_v2
0.0 192,960 3 64,320.0 64,064.0 59,904 68,992 4,549.4 cuMemUnmap
0.0 116,480 3 38,826.7 37,056.0 36,384 43,040 3,664.3 cuMemsetD8_v2
0.0 79,712 1 79,712.0 79,712.0 79,712 79,712 0.0 cudaDeviceGetDefaultMemPool_v11020
0.0 79,072 2 39,536.0 39,536.0 9,312 69,760 42,743.2 cudaLaunchKernel
0.0 58,240 3 19,413.3 19,776.0 17,440 21,024 1,819.3 cuMemAddressReserve
0.0 55,168 3 18,389.3 18,688.0 17,568 18,912 720.1 cuMemAddressFree
0.0 52,768 2 26,384.0 26,384.0 23,104 29,664 4,638.6 cudaMemsetAsync
0.0 32,576 5 6,515.2 6,624.0 3,296 8,640 2,169.0 cudaStreamSynchronize
0.0 31,008 3 10,336.0 10,112.0 9,344 11,552 1,120.9 cuMemMap
0.0 14,240 1 14,240.0 14,240.0 14,240 14,240 0.0 cudaStreamDestroy
0.0 9,664 1 9,664.0 9,664.0 9,664 9,664 0.0 cudaStreamCreateWithFlags
0.0 4,384 3 1,461.3 352.0 288 3,744 1,977.1 cudaMemPoolSetAttribute_v11020
0.0 2,656 2 1,328.0 1,328.0 1,024 1,632 429.9 cuCtxSetCurrent
0.0 2,304 1 2,304.0 2,304.0 2,304 2,304 0.0 cuProfilerStart
0.0 2,080 1 2,080.0 2,080.0 2,080 2,080 0.0 cuMemGetAllocationGranularity
0.0 1,952 1 1,952.0 1,952.0 1,952 1,952 0.0 cuInit
0.0 1,184 1 1,184.0 1,184.0 1,184 1,184 0.0 cudaProfilerStop
0.0 1,120 1 1,120.0 1,120.0 1,120 1,120 0.0 cudaMemPoolTrimTo_v11020
0.0 1,056 1 1,056.0 1,056.0 1,056 1,056 0.0 cuModuleGetLoadingMode
** CUDA GPU MemOps Summary (by Size) (cuda_gpu_mem_size_sum):
Time (%) Total Time (ns) Count Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Operation
-------- --------------- ----- -------- -------- -------- -------- ----------- -------------
100.0 26,016 5 5,203.2 5,248.0 4,640 5,568 347.3 [CUDA memset]
H200
** CUDA API Summary (cuda_api_sum):
Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- ------------- ------------- ----------- ----------- ----------- ----------------------------------
88.7 319,144,097 1 319,144,097.0 319,144,097.0 319,144,097 319,144,097 0.0 cudaProfilerStart
7.2 25,932,255 2,002 12,953.2 400.0 320 2,981,070 76,505.2 cudaMallocAsync_v11020
1.1 3,904,773 5 780,954.6 3,760.0 1,980 2,410,288 1,114,488.0 cudaStreamSynchronize
1.0 3,578,870 3 1,192,956.7 1,104,783.0 516,521 1,957,566 724,557.5 cuMemFree_v2
0.6 2,017,217 2 1,008,608.5 1,008,608.5 9,490 2,007,727 1,412,966.9 cudaLaunchKernel
0.4 1,430,184 3 476,728.0 386,811.0 382,601 660,772 159,400.7 cuMemRelease
0.4 1,418,725 1 1,418,725.0 1,418,725.0 1,418,725 1,418,725 0.0 cudaGetDeviceProperties_v2_v12000
0.2 748,504 2,002 373.9 340.0 300 4,190 121.2 cudaFreeAsync_v11020
0.1 455,111 3 151,703.7 93,470.0 76,250 285,391 116,096.3 cuMemAlloc_v2
* 0.1 389,271 3 129,757.0 97,891.0 83,690 207,690 67,864.4 cuMemCreate
0.1 281,992 3 93,997.3 80,440.0 79,941 121,611 23,915.4 cuMemSetAccess
0.0 148,620 3 49,540.0 45,830.0 44,800 57,990 7,336.0 cuMemUnmap
0.0 130,720 1 130,720.0 130,720.0 130,720 130,720 0.0 cudaDeviceGetDefaultMemPool_v11020
0.0 91,543 3 30,514.3 9,561.0 5,881 76,101 39,522.1 cuMemsetD8_v2
0.0 67,050 2 33,525.0 33,525.0 24,190 42,860 13,201.7 cudaMemsetAsync
0.0 25,850 3 8,616.7 3,610.0 2,640 19,600 9,524.2 cuMemAddressReserve
0.0 16,610 3 5,536.7 2,900.0 2,390 11,320 5,015.0 cuMemAddressFree
0.0 11,470 1 11,470.0 11,470.0 11,470 11,470 0.0 cudaStreamDestroy
0.0 10,700 1 10,700.0 10,700.0 10,700 10,700 0.0 cudaStreamCreateWithFlags
0.0 10,690 3 3,563.3 330.0 110 10,250 5,791.9 cudaMemPoolSetAttribute_v11020
0.0 8,250 1 8,250.0 8,250.0 8,250 8,250 0.0 cudaProfilerStop
0.0 6,800 1 6,800.0 6,800.0 6,800 6,800 0.0 cuMemGetAllocationGranularity
0.0 6,440 3 2,146.7 1,290.0 1,050 4,100 1,695.9 cuMemMap
0.0 4,850 1 4,850.0 4,850.0 4,850 4,850 0.0 cuProfilerStart
0.0 4,030 1 4,030.0 4,030.0 4,030 4,030 0.0 cuInit
0.0 1,560 1 1,560.0 1,560.0 1,560 1,560 0.0 cuModuleGetLoadingMode
0.0 1,540 2 770.0 770.0 400 1,140 523.3 cuCtxSetCurrent
0.0 1,460 1 1,460.0 1,460.0 1,460 1,460 0.0 cudaMemPoolTrimTo_v11020
** CUDA GPU MemOps Summary (by Size) (cuda_gpu_mem_size_sum):
Total (MB) Count Avg (MB) Med (MB) Min (MB) Max (MB) StdDev (MB) Operation
---------- ----- -------- -------- -------- -------- ----------- -------------
41.943 5 8.389 8.389 8.389 8.389 0.000 [CUDA memset]
Question
Is the ~80–90 ms cuMemCreate(1 GiB) on GB200 expected?
If not, are there GB200-specific CUmemAllocationProp settings, handle types, or platform knobs recommended to reduce cuMemCreate latency?
Any insights or pointers to documentation/known issues would be greatly appreciated. Thanks!