Hi to all,
I have a problem on using cudaHostRegister() under a Windows 10 64 bits system.
On the same PC just switching to Windows 7 we never experience such problem.
I try to explain it.
Our application allocates a lot of memory buffers and pins them using cudaHostRegister().
Under Windows 7 we can pin as much memory as we want, the limit it is just the physical available memory.
Under Windows 10, as soon as the pinned memory reaches about half of the physical memory, cudaHostRegister() fails with an “out of memory” error.
The PC has:
- Supermicro X10DRG-Q MB with two numa nodes
- 2 Xeon E5-2690 CPU
- 64 GB of RAM (32GB installed for each node).
- 2 RTX2080Ti GPU (each one connected to the PCI bus directly handled by a CPU)
- an nVidia 710 as video card
Both Windows 10 and Windows 7 are 64-bit, with the latest CUDA 10 and the latest nVidia drivers.
To generate the problem, in a cycle
- allocate a memory buffer (for instance of 0.5GB)
- pin it using cudaHostRegister()
Under Windows 7 I can pin as much as I want (I just stop at 55GB to avoid blocking the system)
Under Windows 10, cudaHostRegister() fails once the total pinned memory is around half of the system memory: with 64GB it stops around 28.5GB, with 32GB (I removed some RAM) it stops at 14.5GB
Has anyone experienced a similar problem?
I notice another difference between windows 7 and windows 10.
If I look at the properties of the display, under Windows 7 the shared system memory for the video card is 3GB, while under Windows 10 it is 32GB. Anyway, I don’t think this is the problem; I suspect it is something related to the maximum non-pageable memory.
Here my code.
#include <windows.h>
#include <stdio.h>
#include <tchar.h>
#include <exception>
#include <cuda_runtime.h>
//#include "mycuda.h"
#include <vector>
#include <conio.h>
#pragma comment(lib, "cudart.lib")
// Device count written by the cudaGetDeviceCount() call in main().
int numDevices = 0;
struct MYBUF {
BYTE *pBuf;
size_t size;
bool pinned;
MYBUF()
{
pBuf = NULL;
size = 0;
pinned = false;
}
};
// Records for every buffer main() allocated, kept so the cleanup pass can unpin and free each one.
std::vector<MYBUF> myBufs;
/*
 * Pinned-memory stress test.
 *
 * Repeatedly allocates a 512 MB host buffer and pins it with
 * cudaHostRegister() until one of the following happens:
 *   - pinning fails (the error code and its description are printed),
 *   - the host allocation itself fails (std::bad_alloc),
 *   - 55 GB or more have been pinned.
 * It then waits for a key press, unpins every pinned buffer and frees all
 * buffers.
 *
 * Returns 0 after the cleanup pass, or -1 when the device query fails or
 * reports no device.
 */
int main(_In_ int _Argc, _In_reads_(_Argc) _Pre_z_ char ** _Argv, _In_z_ char ** _Env)
{
    const cudaError_t countRet = cudaGetDeviceCount(&numDevices);
    if (countRet != cudaSuccess || numDevices < 1)
    {
        // Print the API return code, which is what the "Ret:" label refers to.
        _tprintf(TEXT("No cuda devices detected. Ret: %d\n"), static_cast<int>(countRet));
        return -1;
    }
    _tprintf(TEXT("Detected devices: %d\n"), numDevices);

    const size_t ONE_KB = 1024;
    const size_t ONE_MB = 1024 * ONE_KB;
    const size_t ONE_GB = 1024 * ONE_MB;
    const size_t step = 512 * ONE_MB;     // size of each buffer that gets pinned
    const size_t pinLimit = 55 * ONE_GB;  // stop here so the machine stays usable

    bool ok = true;
    size_t total_size = 0;
    while (ok)
    {
        MYBUF buf;
        buf.size = step;
        try
        {
            // size() is a size_t; cast so the argument matches the %u conversion.
            _tprintf(TEXT("Allocate and pin buf %03u...."),
                     static_cast<unsigned int>(myBufs.size() + 1));
            buf.pBuf = new BYTE[buf.size];

            const cudaError_t pinRet = cudaHostRegister(buf.pBuf, buf.size, 0);
            if (pinRet != cudaSuccess)
            {
                // cudaGetErrorString() returns a const char*, so this message
                // goes through the narrow printf.
                printf("Pinning of %.3f GB failed. ret = %d (%s).\n",
                       (float)buf.size / ONE_GB, static_cast<int>(pinRet),
                       cudaGetErrorString(pinRet));
                ok = false;
            }
            else
            {
                total_size += buf.size;
                _tprintf(TEXT("Pinning ok. Total size: %.3fGB\n"), (float)total_size / ONE_GB);
                buf.pinned = true;
            }

            // Recorded even when pinning failed, so the cleanup pass still
            // frees the allocation.
            myBufs.push_back(buf);

            if (total_size >= pinLimit)
            {
                _tprintf(TEXT("Stop\n"));
                ok = false;
            }
        }
        catch (const std::bad_alloc &)
        {
            _tprintf(TEXT("Unable to allocate %.1f GB.\n"), (float)buf.size / ONE_GB);
            buf.pBuf = NULL;
            ok = false;
        }
    }

    _tprintf(TEXT("Press any key to continue\n"));
    _getch();

    for (size_t i = 0; i < myBufs.size(); i++)
    {
        // 1-based buffer number, narrowed to match the %u conversions below.
        const unsigned int bufNo = static_cast<unsigned int>(i + 1);

        if (myBufs[i].pinned)
        {
            _tprintf(TEXT("Unpin buf %03u..."), bufNo);
            const cudaError_t unpinRet = cudaHostUnregister(myBufs[i].pBuf);
            if (unpinRet != cudaSuccess)
            {
                _tprintf(TEXT("Failed. Ret: %d."), static_cast<int>(unpinRet));
            }
            else
            {
                _tprintf(TEXT("Ok."));
            }
        }
        else
        {
            _tprintf(TEXT("Buf %03u is not pinned."), bufNo);
        }

        if (myBufs[i].pBuf != NULL)
        {
            _tprintf(TEXT(" Deallocate buffer memory\n"));
            delete [] myBufs[i].pBuf;
        }
    }
    return 0;
}
This is the output (just the allocation part)
Allocate and pin buf 001....Pinning ok. Total size: 0.500GB
Allocate and pin buf 002....Pinning ok. Total size: 1.000GB
Allocate and pin buf 003....Pinning ok. Total size: 1.500GB
[...]
Allocate and pin buf 046....Pinning ok. Total size: 23.000GB
Allocate and pin buf 047....Pinning ok. Total size: 23.500GB
Allocate and pin buf 048....Pinning ok. Total size: 24.000GB
Allocate and pin buf 049....Pinning ok. Total size: 24.500GB
Allocate and pin buf 050....Pinning ok. Total size: 25.000GB
Allocate and pin buf 051....Pinning ok. Total size: 25.500GB
Allocate and pin buf 052....Pinning ok. Total size: 26.000GB
Allocate and pin buf 053....Pinning ok. Total size: 26.500GB
Allocate and pin buf 054....Pinning ok. Total size: 27.000GB
Allocate and pin buf 055....Pinning ok. Total size: 27.500GB
Allocate and pin buf 056....Pinning ok. Total size: 28.000GB
Allocate and pin buf 057....Pinning of 0.500 GB failed. ret = 2 (out of memory).