Please be aware that I’m a total CUDA newbie and may be doing something completely stupid.
I’m running these experiments on Linux (Ubuntu 19.04) using this CUDA version:
$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Fri_Feb__8_19:08:17_PST_2019
Cuda compilation tools, release 10.1, V10.1.105
My test program is very simple: It allocates and frees larger and larger amounts of host memory using cudaMallocHost and cudaFreeHost. I noticed that around 2GB, the cudaFreeHost gets extremely slow.
My machine is equipped with a GeForce RTX 2070 and has driver version 418.56.
The behavior is the same when I run the code as the root user, and also when I enlarge the ulimit for locked memory.
Any help here would be greatly appreciated!
Thanks a lot
Stefan
Here is my test code:
#include <cstdio>
#include <cstdlib>
#include <iostream>

#include "cuda_runtime.h"
// Abort the process if a CUDA runtime call failed.
//
// Parameters:
//   result - return code of the CUDA API call being checked
//   func   - stringified source text of the failing call (from the macro)
//   file   - source file of the call site (__FILE__)
//   line   - source line of the call site (__LINE__)
//
// Prints the numeric error code, the human-readable error string from
// cudaGetErrorString, and the failing expression, then exits.
// Only use this exit-on-error policy in test applications.
void checkError(cudaError_t result, char const *const func, const char *const file, int const line)
{
if (result != cudaSuccess) {
// Include cudaGetErrorString so the actual failure reason is visible,
// not just the stringified call expression.
fprintf(stderr, "CUDA error at %s:%d code=%d (%s) \"%s\" \n", file, line, static_cast<unsigned int>(result), cudaGetErrorString(result), func);
exit(EXIT_FAILURE);
}
}
#define checkCudaErrors(val) checkError((val), #val, __FILE__, __LINE__)
// Benchmark allocation and free times for increasingly large buffers to
// reproduce the reported cudaFreeHost slowdown above ~2 GB of pinned memory.
//
// For each size from 100 MB up to 2100 MB (step 250 MB) this times, via CUDA
// events: pinned host alloc/free (cudaMallocHost/cudaFreeHost) and optionally
// device alloc/free (cudaMalloc/cudaFree). Durations are reported in seconds
// on stderr.
int main(void)
{
for (size_t size_mb = 100; size_mb <= 2000+100; size_mb += 250) {
size_t const size_kb = size_mb * 1024;
size_t const size_b = size_kb * 1024;
bool use_gpu_host_alloc = true;   // pinned (cudaMallocHost) vs. plain malloc
bool do_host_alloc = true;
bool do_device_alloc = false;
cudaEvent_t start, stop;
void * host = nullptr;
void * device = nullptr;
// Create the timing events for this iteration; they are destroyed at
// the bottom of the loop so they are not leaked on every pass.
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
float elapsed_time = 0.0f;
if (do_host_alloc) {
// --- Time host allocation ---
checkCudaErrors(cudaEventRecord(start, NULL));
checkCudaErrors(cudaEventSynchronize(start));
if (use_gpu_host_alloc) {
// Pinned, portable host memory (overload with flags from cuda_runtime.h).
checkCudaErrors(cudaMallocHost(&host, size_b, cudaHostAllocPortable));
} else {
host = malloc(size_b);
}
checkCudaErrors(cudaEventRecord(stop, NULL));
checkCudaErrors(cudaEventSynchronize(stop));
elapsed_time = 0.0f;
checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start, stop));
// cudaEventElapsedTime reports milliseconds; divide to report seconds.
std::cerr << "ALLOC_HOST SizeMB=" << size_mb << " Duration=" << elapsed_time / 1000.0
<< std::endl;
// --- Time host free ---
checkCudaErrors(cudaEventRecord(start, NULL));
checkCudaErrors(cudaEventSynchronize(start));
if (use_gpu_host_alloc) {
checkCudaErrors(cudaFreeHost(host));
} else {
free(host);
}
checkCudaErrors(cudaEventRecord(stop, NULL));
checkCudaErrors(cudaEventSynchronize(stop));
elapsed_time = 0.0f;
checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start, stop));
std::cerr << "FREE_HOST SizeMB=" << size_mb << " Duration=" << elapsed_time / 1000.0
<< std::endl;
}
if (do_device_alloc) {
// --- Time device allocation ---
checkCudaErrors(cudaEventRecord(start, NULL));
checkCudaErrors(cudaEventSynchronize(start));
checkCudaErrors(cudaMalloc(&device, size_b));
checkCudaErrors(cudaEventRecord(stop, NULL));
checkCudaErrors(cudaEventSynchronize(stop));
elapsed_time = 0.0f;
checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start, stop));
std::cerr << "ALLOC_DEVICE SizeMB=" << size_mb << " Duration=" << elapsed_time / 1000.0
<< std::endl;
// --- Time device free ---
checkCudaErrors(cudaEventRecord(start, NULL));
checkCudaErrors(cudaEventSynchronize(start));
checkCudaErrors(cudaFree(device));
checkCudaErrors(cudaEventRecord(stop, NULL));
checkCudaErrors(cudaEventSynchronize(stop));
elapsed_time = 0.0f;
checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start, stop));
std::cerr << "FREE_DEVICE SizeMB=" << size_mb << " Duration=" << elapsed_time / 1000.0
<< std::endl << std::endl;
}
// Fix: the original leaked two events per iteration (cudaEventCreate
// with no matching cudaEventDestroy).
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaEventDestroy(stop));
}
checkCudaErrors(cudaDeviceReset());
return 0;
}
And here is the output of the program on my machines:
$ ./a.out
ALLOC_HOST SizeMB=100 Duration=0.0283465
FREE_HOST SizeMB=100 Duration=0.00904362
ALLOC_HOST SizeMB=350 Duration=0.086401
FREE_HOST SizeMB=350 Duration=0.0304353
ALLOC_HOST SizeMB=600 Duration=0.143823
FREE_HOST SizeMB=600 Duration=0.0518581
ALLOC_HOST SizeMB=850 Duration=0.205972
FREE_HOST SizeMB=850 Duration=0.0725446
ALLOC_HOST SizeMB=1100 Duration=0.263455
FREE_HOST SizeMB=1100 Duration=0.0935576
ALLOC_HOST SizeMB=1350 Duration=0.320504
FREE_HOST SizeMB=1350 Duration=0.114782
ALLOC_HOST SizeMB=1600 Duration=0.38255
FREE_HOST SizeMB=1600 Duration=0.134775
ALLOC_HOST SizeMB=1850 Duration=0.440723
FREE_HOST SizeMB=1850 Duration=0.15505
ALLOC_HOST SizeMB=2100 Duration=0.493754
FREE_HOST SizeMB=2100 Duration=36.0846
Please note the 36 seconds that the last FREE_HOST needs!
If I use native malloc instead of cudaMallocHost, this effect is not observed.