I am trying to allocate pinned host memory and de-allocate it in a two-GPU setup, running 5 iterations. One GPU shows its ‘Shared Memory’ usage going up and down as expected, but the other GPU's usage only grows. The simple reproduction code is shown below: in Task Manager, the “shared memory” of GPU 0 goes up and down, but the “shared memory” of GPU 1 keeps increasing until the program crashes. I am using Windows 10, Visual Studio 2019, and CUDA v10.1.
The graphic cards are the same, properties are shown below.
Properties of GPU 0 : name= GeForce GTX 1080 Ti, uuid= 1484675040, major= 6, minor= 1, integrated= 0, canMapHostMemory= 1, managedMemory= 1, memoryClockRate= 5505000, memoryBusWidth= 352, sharedMemPerBlockOptin= 49152, sharedMemPerBlock= 49152, sharedMemPerMultiprocessor= 98304, computeMode= 0, ECCEnabled= 0, tccDriver= 0, deviceOverlap= 1, asyncEngineCount= 2, unifiedAddressing= 1, globalL1CacheSupported= 1, localL1CacheSupported= 1, isMultiGpuBoard= 0, multiGpuBoardGroupID = 0, hostNativeAtomicSupported= 0, pageableMemoryAccess= 0, pageableMemoryAccessUsesHostPageTables= 0, concurrentManagedAccess= 0, computePreemptionSupported= 1, canUseHostPointerForRegisteredMem= 0, directManagedMemAccessFromHost= 0
Properties of GPU 1 : name= GeForce GTX 1080 Ti, uuid= 1484675040, major= 6, minor= 1, integrated= 0, canMapHostMemory= 1, managedMemory= 1, memoryClockRate= 5505000, memoryBusWidth= 352, sharedMemPerBlockOptin= 49152, sharedMemPerBlock= 49152, sharedMemPerMultiprocessor= 98304, computeMode= 0, ECCEnabled= 0, tccDriver= 0, deviceOverlap= 1, asyncEngineCount= 2, unifiedAddressing= 1, globalL1CacheSupported= 1, localL1CacheSupported= 1, isMultiGpuBoard= 0, multiGpuBoardGroupID = 1, hostNativeAtomicSupported= 0, pageableMemoryAccess= 0, pageableMemoryAccessUsesHostPageTables= 0, concurrentManagedAccess= 0, computePreemptionSupported= 1, canUseHostPointerForRegisteredMem= 0, directManagedMemAccessFromHost= 0
The project is built with Visual Studio 2019 targeting the x64 platform. CUDA compilation is invoked as follows:
"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\bin\nvcc.exe" -gencode=arch=compute_30,code="sm_30,compute_30" -gencode=arch=compute_35,code="sm_35,compute_35" -gencode=arch=compute_37,code="sm_37,compute_37" -gencode=arch=compute_50,code="sm_50,compute_50" -gencode=arch=compute_52,code="sm_52,compute_52" -gencode=arch=compute_60,code="sm_60,compute_60" -gencode=arch=compute_61,code="sm_61,compute_61" -gencode=arch=compute_70,code="sm_70,compute_70" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.21.27702\bin\HostX86\x64" -x cu -I..\E57ToPly -I..\mapmap -I..\rayint -I..\mve -I"mvs-texturing" -I..\eigen -I..\libpng -I..\tbb\include -IC:\dev\boost_1_71_0 -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\include" -I"C:\Program Files\NVIDIA Corporation\NvToolsExt\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\include" --keep-dir x64\Release -maxrregcount=0 --machine 64 --compile -cudart static -DCPU_REPLACE_GPU_WHEN_RAM_LACK -DNOMINMAX -DNDEBUG -DBOOST_FILESYSTEM_NO_DEPRECATED -D_WINDLL -D_UNICODE -DUNICODE -Xcompiler "/EHsc /W3 /nologo /Ox /Fdx64\Release\vc142.pdb /Zi /MD " -o x64\Release\x.cu.obj "x.cu"
main()
{
for (int ii = 0; ii < 5; ++ii)
{
int gpuIndex = 0;
gpuErrchk(cudaSetDevice(gpuIndex));
std::vector<uint32_t*> textures;
textures.resize(50);
for (int i = 0; i < textures.size(); ++i)
{
gpuErrchk(cudaHostAlloc((void**)& textures[i], 4096 * 4096 * sizeof(uint32_t), cudaHostAllocDefault));
}
// set GPU 1 and get memory info
gpuIndex = 1;
gpuErrchk(cudaSetDevice(gpuIndex));
size_t freebyte;
size_t totalbyte;
cudaError_t cuda_status;
cuda_status = cudaMemGetInfo(&freebyte, &totalbyte); // This causes "shared memory" GPU 1 up and up
// back to GPU 0
gpuIndex = 0;
gpuErrchk(cudaSetDevice(gpuIndex));
for (int i = 0; i < textures.size(); ++i)
{
gpuErrchk(cudaFreeHost(textures[i]));
}
batchHighTextures.clear();
}
auto cudaStatus = cudaDeviceReset();
return;
}