Multi-GPU sample code showing the issue(s): console output up top, code down below. Note that on a dual-GPU setup under WDDM, if you force the single-GPU path by uncommenting the "iDeviceCount = 1" line (so peer-to-peer is never enabled), everything works fine.
Just by enabling peer-to-peer mode we lose the ability to copy from buffer A (allocated via cudaMalloc) to buffer B (allocated via cudaMallocAsync), even with both on device 0!
Peer-to-peer mode seems to be what triggers the invalid-value errors on the same device. It's unclear whether such copies are "legal" between devices, but they fail even within a single device.
Is anyone able to test the sample code on a Linux dual-GPU platform and report back whether this is WDDM-specific or general 11.2 behavior?
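For reference: as far as I can tell, cudaDeviceEnablePeerAccess only covers regular cudaMalloc allocations; for cudaMallocAsync pools the documented mechanism is cudaMemPoolSetAccess. A minimal sketch of what I'd expect to grant device 1 read/write access to device 0's default pool (untested on my side, and it shouldn't matter for the same-device failure anyway):

// Sketch (assumes CUDA 11.2+): grant device 1 access to device 0's default pool.
cudaMemPool_t pool0 = nullptr;
gpuErrchk(cudaDeviceGetDefaultMemPool(&pool0, 0));
cudaMemAccessDesc desc = {};
desc.location.type = cudaMemLocationTypeDevice;
desc.location.id = 1;                         // the peer device
desc.flags = cudaMemAccessFlagsProtReadWrite; // allow read+write from device 1
gpuErrchk(cudaMemPoolSetAccess(pool0, &desc, 1));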
Output:
Runtime = 11.020000
Driver = 11.020000
Device Count = 2
Peer-2-Peer: Device 0 can access device 1 => YES
INFO :: Peer-2-Peer: Device 0 -> Device 1 enabled!
Peer-2-Peer: Device 1 can access device 0 => YES
INFO :: Peer-2-Peer: Device 1 -> Device 0 enabled!
Async Memory Pools: Device 0 API support => YES
cudaMemPoolGetAccess[0] => READ+WRITE
cudaMemPoolGetAccess[1] => NONE
Async Memory Pools: Device 1 API support => YES
cudaMemPoolGetAccess[0] => NONE
cudaMemPoolGetAccess[1] => READ+WRITE
WARNING :: cudaMemcpyAsync doesn't work between buffers from cudaMalloc and cudaMallocAsync { errors = (1,1) } on the SAME device!
WARNING :: cudaMemcpyAsync doesn't work between buffers from cudaMalloc and cudaMallocAsync { errors = (1,1) } on DIFFERENT devices!
Finished!
Code:
#include <cuda.h>
#include <stdio.h>
#include <stdint.h>
#include <thread>
//
// Error Checking
//
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
int main()
{
int iRuntimeVersion = 0;
int iDriverVersion = 0;
int iDeviceCount = 0;
gpuErrchk(cudaRuntimeGetVersion(&iRuntimeVersion));
gpuErrchk(cudaDriverGetVersion(&iDriverVersion));
gpuErrchk(cudaGetDeviceCount(&iDeviceCount));
//iDeviceCount = 1; // Uncomment to force the single-GPU path: on a dual-GPU setup under WDDM, if we do *not* enable peer-to-peer mode we can copy between cudaMalloc and cudaMallocAsync!
::printf("Runtime = %f\n", iRuntimeVersion / 1000.0f);
::printf("Driver = %f\n", iDriverVersion / 1000.0f);
::printf("Device Count = %i\n", iDeviceCount);
//
// Enable peer2peer mode...
//
for (int i = 0; i < iDeviceCount; i++)
{
for (int j = 0; j < iDeviceCount; j++)
{
if (i != j)
{
int iCanAccessPeer = 0;
cudaError_t error = cudaDeviceCanAccessPeer(&iCanAccessPeer, i, j);
if (error == cudaSuccess)
{
::printf("Peer-2-Peer: Device %i can access device %i => %s", i, j, iCanAccessPeer ? "YES\n" : "NO\n");
cudaError_t error1 = cudaSetDevice(i);
cudaError_t error2 = cudaDeviceEnablePeerAccess(j, 0);
if (error1 == cudaSuccess &&
error2 == cudaSuccess)
{
::printf("\tINFO :: Peer-2-Peer: Device %i -> Device %i enabled!\n", i, j);
}
else
{
::printf("\tERROR :: Peer-2-Peer: Device %i -> %i errors enabling { %i , %i }\n", i, j, error1, error2);
}
}
else
{
::printf("Peer-2-Peer: Device %i can access device %i => ERROR CODE 0x%X\n", i, j, error);
}
}
}
}
gpuErrchk( cudaSetDevice(0) );
//
// Async Memory APIs...
//
for (int i = 0; i < iDeviceCount; i++)
{
int iValue = 0;
cudaError_t error = cudaDeviceGetAttribute(&iValue, cudaDevAttrMemoryPoolsSupported, i);
bool bSupported = (error == cudaSuccess && iValue != 0);
::printf("Async Memory Pools: Device %i API support => %s", i, bSupported ? "YES\n" : "NO\n" );
if (bSupported)
{
cudaMemPool_t defaultMemPool = nullptr;
gpuErrchk(cudaDeviceGetDefaultMemPool(&defaultMemPool, i));
for (int j = 0; j < iDeviceCount; j++)
{
cudaMemLocation location = { cudaMemLocationTypeDevice , j };
cudaMemAccessFlags flags = cudaMemAccessFlagsProtNone;
error = cudaMemPoolGetAccess(&flags, defaultMemPool, &location);
if (error != cudaSuccess)
{
::printf("cudaMemPoolGetAccess[%i] => error %i\n", j, error);
}
else if (flags == cudaMemAccessFlagsProtNone)
{
::printf("cudaMemPoolGetAccess[%i] => NONE\n", j);
}
else if (flags == cudaMemAccessFlagsProtRead)
{
::printf("cudaMemPoolGetAccess[%i] => READ\n", j);
}
else if (flags == cudaMemAccessFlagsProtReadWrite)
{
::printf("cudaMemPoolGetAccess[%i] => READ+WRITE\n", j);
}
else
{
::printf("cudaMemPoolGetAccess[%i] => UNKNOWN\n" , j );
}
}
}
}
//
//
//
void* pPtr0 = nullptr;
void* pPtr1 = nullptr;
void* pAsyncPtr0 = nullptr;
void* pAsyncPtr1 = nullptr;
cudaStream_t stream0 = nullptr; // created below
cudaStream_t stream1 = nullptr; // created in the dual-GPU path; otherwise the NULL (default) stream
const size_t kBytes = 4 * 1024;
//
// Allocate
//
gpuErrchk(cudaSetDevice(0));
gpuErrchk(cudaMalloc(&pPtr0, kBytes));
gpuErrchk(cudaStreamCreateWithFlags(&stream0, cudaStreamNonBlocking));
gpuErrchk(cudaMallocAsync(&pAsyncPtr0, kBytes, stream0));
gpuErrchk(cudaDeviceSynchronize());
if (iDeviceCount == 1)
{
//
// Single GPU mode!
//
gpuErrchk(cudaMalloc(&pPtr1, kBytes));
//
// Traditional async copy from cudaMalloc -- all good!
//
gpuErrchk(cudaMemcpyAsync(pPtr1, pPtr0, kBytes, cudaMemcpyDeviceToDevice, stream0));
gpuErrchk(cudaMemcpyAsync(pPtr0, pPtr1, kBytes, cudaMemcpyDeviceToDevice, stream0));
gpuErrchk(cudaMemcpyAsync(pPtr0, pPtr1, kBytes, cudaMemcpyDeviceToDevice, stream1));
gpuErrchk(cudaMemcpyAsync(pPtr0, pPtr1, kBytes, cudaMemcpyDeviceToDevice, stream1));
//
// Async copy between the cudaMalloc buffer and the cudaMallocAsync (stream-ordered) buffer on the same device FAILS!
//
cudaError_t copy_ptr0_to_asyncptr0 = cudaMemcpyAsync(pAsyncPtr0, pPtr0, kBytes, cudaMemcpyDeviceToDevice, stream0);
cudaError_t copy_asyncptr0_to_ptr0 = cudaMemcpyAsync(pPtr0, pAsyncPtr0, kBytes, cudaMemcpyDeviceToDevice, stream0);
if (copy_ptr0_to_asyncptr0 != cudaSuccess || copy_asyncptr0_to_ptr0 != cudaSuccess)
{
::printf("WARNING :: cudaMemcpyAsync doesn't work between buffers from cudaMalloc and cudaMallocAsync { errors = (%i,%i) } on the SAME device!\n", copy_ptr0_to_asyncptr0, copy_asyncptr0_to_ptr0);
}
else
{
::printf("SUCCESS :: cudaMemcpyAsync between buffers from cudaMalloc and cudaMallocAsync on the SAME device is good! [single-GPU scenario]\n");
}
}
else if (iDeviceCount > 1)
{
//
// Dual GPU mode!
//
gpuErrchk(cudaSetDevice(1));
gpuErrchk(cudaMalloc(&pPtr1, kBytes));
gpuErrchk(cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking));
gpuErrchk(cudaMallocAsync(&pAsyncPtr1, kBytes, stream1));
gpuErrchk(cudaDeviceSynchronize());
//
// Traditional async copy from cudaMalloc -- all good!
//
gpuErrchk(cudaMemcpyAsync(pPtr1, pPtr0, kBytes, cudaMemcpyDeviceToDevice, stream0));
gpuErrchk(cudaMemcpyAsync(pPtr0, pPtr1, kBytes, cudaMemcpyDeviceToDevice, stream0));
gpuErrchk(cudaMemcpyAsync(pPtr0, pPtr1, kBytes, cudaMemcpyDeviceToDevice, stream1));
gpuErrchk(cudaMemcpyAsync(pPtr0, pPtr1, kBytes, cudaMemcpyDeviceToDevice, stream1));
//
// Async copy between the cudaMalloc buffer and the cudaMallocAsync (stream-ordered) buffer on the same device FAILS!
//
cudaError_t copy_ptr0_to_asyncptr0 = cudaMemcpyAsync(pAsyncPtr0, pPtr0, kBytes, cudaMemcpyDeviceToDevice, stream0);
cudaError_t copy_asyncptr0_to_ptr0 = cudaMemcpyAsync(pPtr0, pAsyncPtr0, kBytes, cudaMemcpyDeviceToDevice, stream0);
if (copy_ptr0_to_asyncptr0 != cudaSuccess || copy_asyncptr0_to_ptr0 != cudaSuccess)
{
::printf("WARNING :: cudaMemcpyAsync doesn't work between buffers from cudaMalloc and cudaMallocAsync { errors = (%i,%i) } on the SAME device!\n", copy_ptr0_to_asyncptr0, copy_asyncptr0_to_ptr0);
}
else
{
::printf("SUCCESS :: cudaMemcpyAsync between buffers from cudaMalloc and cudaMallocAsync on the SAME device is good! [Multi-GPU scenario]\n");
}
//
// Async copy between the cudaMalloc buffer on one device and the cudaMallocAsync buffer on the other FAILS!
//
cudaError_t copy_ptr1_to_asyncptr0 = cudaMemcpyAsync(pAsyncPtr0, pPtr1, kBytes, cudaMemcpyDeviceToDevice, stream0);
cudaError_t copy_asyncptr0_to_ptr1 = cudaMemcpyAsync(pPtr1, pAsyncPtr0, kBytes, cudaMemcpyDeviceToDevice, stream0);
if (copy_ptr1_to_asyncptr0 != cudaSuccess || copy_asyncptr0_to_ptr1 != cudaSuccess)
{
::printf("WARNING :: cudaMemcpyAsync doesn't work between buffers from cudaMalloc and cudaMallocAsync { errors = (%i,%i) } on DIFFERENT devices!\n", copy_ptr1_to_asyncptr0, copy_asyncptr0_to_ptr1);
}
else
{
::printf("SUCCESS :: cudaMemcpyAsync between buffers from cudaMalloc and cudaMallocAsync on the DIFFERENT devices is good!\n");
}
/*
All variations show the same issues...
gpuErrchk(cudaMemcpyAsync(pPtr0, pAsyncPtr0, kBytes, cudaMemcpyDeviceToDevice, stream0));
gpuErrchk(cudaMemcpyAsync(pAsyncPtr0, pPtr1, kBytes, cudaMemcpyDeviceToDevice, stream0));
gpuErrchk(cudaMemcpyAsync(pPtr1, pAsyncPtr0, kBytes, cudaMemcpyDeviceToDevice, stream0));
gpuErrchk(cudaMemcpyAsync(pAsyncPtr1, pPtr0, kBytes, cudaMemcpyDeviceToDevice, stream0));
gpuErrchk(cudaMemcpyAsync(pPtr0, pAsyncPtr1, kBytes, cudaMemcpyDeviceToDevice, stream0));
gpuErrchk(cudaMemcpyAsync(pAsyncPtr1, pPtr1, kBytes, cudaMemcpyDeviceToDevice, stream0));
gpuErrchk(cudaMemcpyAsync(pPtr1, pAsyncPtr1, kBytes, cudaMemcpyDeviceToDevice, stream0));
*/
//
// Clean up...
//
gpuErrchk(cudaFreeAsync(pAsyncPtr1, stream1)); // release the stream-ordered allocation on device 1
gpuErrchk(cudaStreamSynchronize(stream1));
gpuErrchk(cudaStreamSynchronize(stream0));
gpuErrchk(cudaStreamDestroy(stream1));
}
gpuErrchk(cudaFreeAsync(pAsyncPtr0, stream0)); // release the stream-ordered allocation on device 0
gpuErrchk(cudaStreamSynchronize(stream0));
gpuErrchk(cudaStreamDestroy(stream0));
gpuErrchk(cudaFree(pPtr1));
gpuErrchk(cudaFree(pPtr0));
//
// Finished
//
::printf("Finished!\n");
return 0;
}
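P.S. A quick sanity check I'd try right before the failing copies is cudaPointerGetAttributes, to confirm what the runtime thinks each pointer is. A sketch, assuming the CUDA 11 cudaPointerAttributes fields:

cudaPointerAttributes attr = {};
gpuErrchk(cudaPointerGetAttributes(&attr, pAsyncPtr0));
// For the pool allocation I'd expect type == cudaMemoryTypeDevice and device == 0.
::printf("pAsyncPtr0: type = %i, device = %i\n", (int)attr.type, attr.device);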