11.2 > cudaMemPool_t and Peer2Peer

The new async memory APIs in 11.2 are a great addition to CUDA greatly simplifying memory management that the user normally has to manage on the host side. We’ve just started investigating them but are seeing good performance and reduced memory usage vs. doing intelligent caching on the host side.

But there doesn’t seem to be a way to use them directly in a multi-GPU environment with peer2peer access enabled.

Using an async device-to-device copy between stream A on GPU 0, and stream B on GPU 1, returns an “cudaErrorInvalidValue” when using the new async memory APIs (cudaMallocAsync/cudaFreeAsync). Falling back to cudaMalloc and cudaFree work fine.

Clearly the user needs to use streams/events to ensure that their is safety – e.g. cudaStreamWaitForEvent along with create/record event just like in other peer2peer scenarios with streams, buffers, and async cmds.

Looking at the “default mem-pool” on a dual-GPU setup (under WDDM), the default pool returned by cudaDeviceGetDefaultMemPool() reports read+write access for itself (cudaMemAccessFlagsProtReadWrite), but none (cudaMemAccessFlagsProtNone) for the other GPU when the default pool is queried via cudaMemPoolGetAccess().

Likewise, the cudaMemPoolCreate API which takes a cudaMemPoolProps structure that contains a cudaMemLocation structure only appears to allow making a pool on a single device.

I’m sure we can fall back to using cudaMalloc() to create “staging buffers” to copy from pool0 -> staging0 and then from staging0-> staging1 (p2p), and then staging1->pool1…but that seems unnecessary.

Is there a reason that peer2peer access isn’t supported for the memory pools? It seems similar constraints should exist on cudaFreeAsync regardless of whether that stream contains events/cmds that are local (same gpu) or remote (diff. gpu). That stream passed to cudaFreeAsync may contain previously executed cudaStreamWaitForEvent() for other streams on local or remote GPUs.

OK, so perhaps we can’t work around that limitation of peer2peer not being support for buffers allocated via cudaMallocAsync.

It appears that we can’t even copy within the same device with a buffer allocate via cudaMalloc and a buffer allocated via cudaMallocAsync as it also returns an invalid-value error.

This poses a major limitation when working with 3rd party libraries – we can’t use cudaMemcpyAsync to transfer between two buffers on the same device!

Multi-GPU sample code showing the issue(s): output from console up top, code down below. Note that on a dual-GPU setup under WDDM if you comment out the line setting “iDeviceCount=1” then the single-GPU scenario if fine.

Just by enabling peer-2-peer mode we lost the ability to copy from buffer A (via cudaMalloc) to buffer B (via cudaMallocAsync) with both being on device 0!

Peer-2-peer mode seems to cause the invalid-value errors on same device. Not clear if it’s “legal” on between devices, but it’s failing even within the same device.

Anyone able to test the sample code under a Linux dual-GPU platform and report back if this is WDDM specfic, or a general 11.2 behavior?

Output:

Runtime = 11.020000
Driver = 11.020000
Device Count = 2
Peer-2-Peer: Device 0 can access device 1 => YES
        INFO :: Peer-2-Peer: Device 0 -> Device 1 enabled!
Peer-2-Peer: Device 1 can access device 0 => YES
        INFO :: Peer-2-Peer: Device 1 -> Device 0 enabled!
Async Memory Pools: Device 0 API support => YES
cudaMemPoolGetAccess[0] => READ+WRITE
cudaMemPoolGetAccess[1] => NONE
Async Memory Pools: Device 1 API support => YES
cudaMemPoolGetAccess[0] => NONE
cudaMemPoolGetAccess[1] => READ+WRITE
WARNING :: cudaMemcpyAsync doesn't work between buffers from cudaMalloc and cudaMallocAsync { errors = (1,1) } on the SAME device!
WARNING :: cudaMemcpyAsync doesn't work between buffers from cudaMalloc and cudaMallocAsync { errors = (1,1) } on DIFFERENT devices!
Finished!

Code:

#include <cuda.h>

#include <stdio.h>
#include <stdint.h>
#include <thread>


//
// Error Checking
//

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
	if (code != cudaSuccess)
	{
		fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
		if (abort) exit(code);
	}
}

int main()
{
	int iRuntimeVersion = 0;
	int iDriverVersion = 0;
	int iDeviceCount = 0;

	gpuErrchk(cudaRuntimeGetVersion(&iRuntimeVersion));
	gpuErrchk(cudaDriverGetVersion(&iDriverVersion));
	gpuErrchk(cudaGetDeviceCount(&iDeviceCount));

	//iDeviceCount = 1;	// On a dual GPU setup under WDDM, if we do *not* enable peer-to-peer mode we can copy between cudaMalloc and cudaMallocAsync!

	::printf("Runtime = %f\n", iRuntimeVersion / 1000.0f);
	::printf("Driver = %f\n", iDriverVersion / 1000.0f);
	::printf("Device Count = %i\n", iDeviceCount);


	//
	// Enable peer2peer mode...
	//

	for (int i = 0; i < iDeviceCount; i++)
	{
		for (int j = 0; j < iDeviceCount; j++)
		{
			if (i != j)
			{
				int iCanAccessPeer = 0;

				cudaError_t error = cudaDeviceCanAccessPeer(&iCanAccessPeer, i, j);

				if (error == cudaSuccess)
				{
					::printf("Peer-2-Peer: Device %i can access device %i => %s", i, j, iCanAccessPeer ? "YES\n" : "NO\n");

					cudaError_t error1 = cudaSetDevice(i);
					cudaError_t error2 = cudaDeviceEnablePeerAccess(j, 0);

					if (error1 == cudaSuccess &&
						error2 == cudaSuccess)
					{
						::printf("\tINFO :: Peer-2-Peer: Device %i -> Device %i enabled!\n", i, j);
					}
					else
					{
						::printf("\tERROR :: Peer-2-Peer: Device %i -> %i errors enabling { %i , %i }\n", i, j, error1, error2);
					}
				}
				else
				{
					::printf("Peer-2-Peer: Device %i can access device %i => ERROR CODE 0x%X\n", i, j, error);
				}
			}
		}
	}

	gpuErrchk( cudaSetDevice(0) );


	//
	// Async Memory APIs...
	//

	for (int i = 0; i < iDeviceCount; i++)
	{
		int iValue = 0;

		cudaError_t error = cudaDeviceGetAttribute(&iValue, cudaDevAttrMemoryPoolsSupported, i);

		bool bSupported = (error == cudaSuccess && iValue != 0);

		::printf("Async Memory Pools: Device %i API support => %s", i, bSupported ? "YES\n" : "NO\n" );

		if (bSupported)
		{
			cudaMemPool_t defaultMemPool = (cudaMemPool_t)0;

			error = cudaDeviceGetDefaultMemPool(&defaultMemPool, i);

			for (int j = 0; j < iDeviceCount; j++)
			{
				cudaMemLocation location = { cudaMemLocationTypeDevice , j };
				cudaMemAccessFlags flags = cudaMemAccessFlagsProtNone;

				error = cudaMemPoolGetAccess(&flags, defaultMemPool, &location);

				if (error != cudaSuccess)
				{
					::printf("cudaMemPoolGetAccess[%i] => error %i\n", j, error);
				}
				else if (flags == cudaMemAccessFlagsProtNone)
				{
					::printf("cudaMemPoolGetAccess[%i] => NONE\n", j);
				}
				else if (flags == cudaMemAccessFlagsProtRead)
				{
					::printf("cudaMemPoolGetAccess[%i] => READ\n", j);
				}
				else if (flags == cudaMemAccessFlagsProtReadWrite)
				{
					::printf("cudaMemPoolGetAccess[%i] => READ+WRITE\n", j);
				}
				else
				{
					::printf("cudaMemPoolGetAccess[%i] => UNKNOWN\n" , j );
				}
			}
		}
	}


	//
	//
	//

	void* pPtr0 = nullptr;
	void* pPtr1 = nullptr;

	void* pAsyncPtr0 = nullptr;
	void* pAsyncPtr1 = nullptr;

	cudaStream_t stream0 = (cudaStream_t)0;
	cudaStream_t stream1 = (cudaStream_t)1;

	const size_t kBytes = 4 * 1024;


	//
	// Allocate
	//

	gpuErrchk(cudaSetDevice(0));
	gpuErrchk(cudaMalloc(&pPtr0, kBytes));
	gpuErrchk(cudaStreamCreateWithFlags(&stream0, cudaStreamNonBlocking));
	gpuErrchk(cudaMallocAsync(&pAsyncPtr0, kBytes, stream0));
	gpuErrchk(cudaDeviceSynchronize());

	if (iDeviceCount == 1)
	{
		//
		// Single GPU mode!
		//

		gpuErrchk(cudaMalloc(&pPtr1, kBytes));

		//
		// Traditional async copy from cudaMalloc -- all good!
		//

		gpuErrchk(cudaMemcpyAsync(pPtr1, pPtr0, kBytes, cudaMemcpyDeviceToDevice, stream0));
		gpuErrchk(cudaMemcpyAsync(pPtr0, pPtr1, kBytes, cudaMemcpyDeviceToDevice, stream0));

		gpuErrchk(cudaMemcpyAsync(pPtr0, pPtr1, kBytes, cudaMemcpyDeviceToDevice, stream1));
		gpuErrchk(cudaMemcpyAsync(pPtr0, pPtr1, kBytes, cudaMemcpyDeviceToDevice, stream1));


		//
		// Async copy with cudaMalloc and the 'natively' allocated pointers within each device FAILS!
		//

		cudaError_t copy_ptr0_to_asyncptr0 = cudaMemcpyAsync(pAsyncPtr0, pPtr0, kBytes, cudaMemcpyDeviceToDevice, stream0);
		cudaError_t copy_asyncptr0_to_ptr0 = cudaMemcpyAsync(pPtr0, pAsyncPtr0, kBytes, cudaMemcpyDeviceToDevice, stream0);

		if (copy_ptr0_to_asyncptr0 != cudaSuccess || copy_asyncptr0_to_ptr0 != cudaSuccess)
		{
			::printf("WARNING :: cudaMemcpyAsync doesn't work between buffers from cudaMalloc and cudaMallocAsync { errors = (%i,%i) } on the SAME device!\n", copy_ptr0_to_asyncptr0, copy_asyncptr0_to_ptr0);
		}
		else
		{
			::printf("SUCCESS :: cudaMemcpyAsync between buffers from cudaMalloc and cudaMallocAsync on the SAME device is good! [single-GPU scenario]\n");
		}
	}
	else if (iDeviceCount > 1)
	{
		//
		// Dual GPU mode!
		//

		gpuErrchk(cudaSetDevice(1));
		gpuErrchk(cudaMalloc(&pPtr1, kBytes));
		gpuErrchk(cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking));
		gpuErrchk(cudaMallocAsync(&pAsyncPtr1, kBytes, stream1));
		gpuErrchk(cudaDeviceSynchronize());


		//
		// Traditional async copy from cudaMalloc -- all good!
		//

		gpuErrchk(cudaMemcpyAsync(pPtr1, pPtr0, kBytes, cudaMemcpyDeviceToDevice, stream0));
		gpuErrchk(cudaMemcpyAsync(pPtr0, pPtr1, kBytes, cudaMemcpyDeviceToDevice, stream0));

		gpuErrchk(cudaMemcpyAsync(pPtr0, pPtr1, kBytes, cudaMemcpyDeviceToDevice, stream1));
		gpuErrchk(cudaMemcpyAsync(pPtr0, pPtr1, kBytes, cudaMemcpyDeviceToDevice, stream1));


		//
		// Async copy with cudaMalloc and the 'natively' allocated pointers within each device FAILS!
		//

		cudaError_t copy_ptr0_to_asyncptr0 = cudaMemcpyAsync(pAsyncPtr0, pPtr0, kBytes, cudaMemcpyDeviceToDevice, stream0);
		cudaError_t copy_asyncptr0_to_ptr0 = cudaMemcpyAsync(pPtr0, pAsyncPtr0, kBytes, cudaMemcpyDeviceToDevice, stream0);

		if (copy_ptr0_to_asyncptr0 != cudaSuccess || copy_asyncptr0_to_ptr0 != cudaSuccess)
		{
			::printf("WARNING :: cudaMemcpyAsync doesn't work between buffers from cudaMalloc and cudaMallocAsync { errors = (%i,%i) } on the SAME device!\n", copy_ptr0_to_asyncptr0, copy_asyncptr0_to_ptr0);
		}
		else
		{
			::printf("SUCCESS :: cudaMemcpyAsync between buffers from cudaMalloc and cudaMallocAsync on the SAME device is good! [Multi-GPU scenario]\n");
		}

		//
		// Async copy with cudaMalloc and the 'natively' allocated pointers across different device FAILS!
		//

		cudaError_t copy_ptr1_to_asyncptr0 = cudaMemcpyAsync(pAsyncPtr0, pPtr1, kBytes, cudaMemcpyDeviceToDevice, stream0);
		cudaError_t copy_asyncptr0_to_ptr1 = cudaMemcpyAsync(pPtr1, pAsyncPtr0, kBytes, cudaMemcpyDeviceToDevice, stream0);

		if (copy_ptr1_to_asyncptr0 != cudaSuccess || copy_asyncptr0_to_ptr1 != cudaSuccess)
		{
			::printf("WARNING :: cudaMemcpyAsync doesn't work between buffers from cudaMalloc and cudaMallocAsync { errors = (%i,%i) } on DIFFERENT devices!\n", copy_ptr1_to_asyncptr0, copy_asyncptr0_to_ptr1);
		}
		else
		{
			::printf("SUCCESS :: cudaMemcpyAsync between buffers from cudaMalloc and cudaMallocAsync on the DIFFERENT devices is good!\n");
		}

		/*

		All variations show the same issues...

		gpuErrchk(cudaMemcpyAsync(pPtr0, pAsyncPtr0, kBytes, cudaMemcpyDeviceToDevice, stream0));
		gpuErrchk(cudaMemcpyAsync(pAsyncPtr0, pPtr1, kBytes, cudaMemcpyDeviceToDevice, stream0));
		gpuErrchk(cudaMemcpyAsync(pPtr1, pAsyncPtr0, kBytes, cudaMemcpyDeviceToDevice, stream0));

		gpuErrchk(cudaMemcpyAsync(pAsyncPtr1, pPtr0, kBytes, cudaMemcpyDeviceToDevice, stream0));
		gpuErrchk(cudaMemcpyAsync(pPtr0, pAsyncPtr1, kBytes, cudaMemcpyDeviceToDevice, stream0));
		gpuErrchk(cudaMemcpyAsync(pAsyncPtr1, pPtr1, kBytes, cudaMemcpyDeviceToDevice, stream0));
		gpuErrchk(cudaMemcpyAsync(pPtr1, pAsyncPtr1, kBytes, cudaMemcpyDeviceToDevice, stream0));
		*/

		//
		// Clean up...
		//

		gpuErrchk(cudaStreamSynchronize(stream1));
		gpuErrchk(cudaStreamSynchronize(stream0));

		gpuErrchk(cudaStreamDestroy(stream1));
	}


	gpuErrchk(cudaStreamDestroy(stream0));

	gpuErrchk(cudaFree(pPtr1));
	gpuErrchk(cudaFree(pPtr0));

	//
	// Finished
	//

	::printf("Finished!\n");

	return 0;
}

Hi @stephen.nowalk , thank you very much for bringing this issue to our attention. We are still discussing how to best address it internally, as soon as we find a good solution we will post more details here, including when it will be publicly available.

As a workaround, and until a fix is available, we would suggest you set the current context to match the stream’s context, that should allow the memory copy to work.

Thanks for the feedback and update @entschev, along with the workaround suggestion.

FYI, in local testing, cudaMemcpyAsync with D2H and H2D seem to suffer the same issue(s) as well.

The cudaMemcpyPeerAsync() API does not seem to be impacted – in testing this works using buffers allocated via cudaMalloc() vs cudaMallocAsync()…on both the same device, as well as different devices.

Added the same notes to the bug here: https://developer.nvidia.com/nvidia_bug/3210768