problem with cudaMemcpy3DPeer

Hi!

In the example shown below I am unable to get cudaMemcpy3DPeer to work (it works fine with cudaArray).

(Windows Server 2008 R2, TCC driver 270.90).

Am I using it incorrectly? (The extent is given in bytes, since no cudaArray is involved in the copy.)

No matter what I do I get an “invalid argument” error from the call to cudaMemcpy3DPeer.

// Repro: round-trip a 672 x 128 x 256 float volume
//   host -> device 0 -> device 1 -> host
// through pitched (cudaMalloc3D) device allocations, then verify it on the host.
// Device ordinals 0 and 1 are hard-coded.
const size_t volWidth  = 672;   // elements per row
const size_t volHeight = 128;   // rows per slice
const size_t volDepth  = 256;   // slices
const size_t volElems  = volWidth * volHeight * volDepth;

cudaPitchedPtr mem0, mem1;
float *hostmem;

// No cudaArray takes part in any of the copies, so the extent width is in bytes.
cudaExtent extentByte = make_cudaExtent(volWidth * sizeof(float), volHeight, volDepth);

hostmem = new float[volElems];

// allocate memory for image, also with pitch malloc (one allocation per device)
CUDA_SAFE_CALL(cudaSetDevice(0));
CUDA_SAFE_CALL(cudaMalloc3D(&mem0, extentByte));

CUDA_SAFE_CALL(cudaSetDevice(1));
CUDA_SAFE_CALL(cudaMalloc3D(&mem1, extentByte));

printf("\n");

// Diagnostic: report whether each device can directly access the other's memory.
// The cudaMemcpy3DPeer failure below was observed to depend on peer access.
int canAccess0to1 = 0;
int canAccess1to0 = 0;
CUDA_SAFE_CALL(cudaDeviceCanAccessPeer(&canAccess0to1, 0, 1));
CUDA_SAFE_CALL(cudaDeviceCanAccessPeer(&canAccess1to0, 1, 0));
printf("peer access 0->1: %s, 1->0: %s\n",
       canAccess0to1 ? "yes" : "no",
       canAccess1to0 ? "yes" : "no");

// peer to peer with pitched memory

// Fill the host buffer with its own indices. Indices above 2^24 are not exactly
// representable as float; the check at the end applies the same conversion, so
// the comparison is still exact.
for (size_t i = 0; i < volElems; ++i) {
    hostmem[i] = (float)i;
}

// Tightly packed host volume: row pitch equals the row width in bytes.
const cudaPitchedPtr hostPtr =
    make_cudaPitchedPtr((float *)hostmem, volWidth * sizeof(float), volWidth, volHeight);

// host -> device 0
cudaMemcpy3DParms hParms = {0};
hParms.srcPtr = hostPtr;
hParms.dstPtr = mem0;
hParms.extent = extentByte;
hParms.kind   = cudaMemcpyHostToDevice;

CUDA_SAFE_CALL(cudaSetDevice(0));
CUDA_SAFE_CALL(cudaMemcpy3D(&hParms));
printf("copied to device pitchedPtr memory\n");

// device 0 -> device 1: parameters for the 3D peer copy under investigation
cudaMemcpy3DPeerParms pParms = {0};
pParms.srcDevice = 0;
pParms.srcPtr    = mem0;
pParms.dstDevice = 1;
pParms.dstPtr    = mem1;
pParms.extent    = extentByte;

// CUDA_SAFE_CALL (cudaMemcpy3DPeer (&pParms)); //gives "invalid argument"
// printf("copied from device 0 to 1 via cudaMemcpy3DPeer\n");

// Workaround: copy the whole pitched allocation as one flat byte range.
// This is only valid when both allocations use the same row pitch; otherwise
// rows would land at the wrong offsets and the copy could overrun mem1.
if (mem0.pitch != mem1.pitch) {
    std::cerr << "pitch mismatch (" << mem0.pitch << " vs " << mem1.pitch
              << "): skipping flat cudaMemcpyPeer\n";
} else {
    CUDA_SAFE_CALL(cudaMemcpyPeer(mem1.ptr, 1, mem0.ptr, 0,
                                  mem0.pitch * volHeight * volDepth)); // seems to work
    printf("copied cudaPitchedPtr from device 0 to 1 via cudaMemcpyPeer\n");
}

// Clear the host buffer so the read-back cannot pass on stale data.
memset(hostmem, 0, sizeof(float) * volElems);

// device 1 -> host
memset(&hParms, 0, sizeof(hParms));
hParms.srcPtr = mem1;
hParms.dstPtr = hostPtr;
hParms.extent = extentByte;
hParms.kind   = cudaMemcpyDeviceToHost;

CUDA_SAFE_CALL(cudaSetDevice(1));
CUDA_SAFE_CALL(cudaMemcpy3D(&hParms));
printf("copied from device 1 to host\n");

// Verify the round trip; report only the first mismatch.
for (size_t i = 0; i < volElems; ++i) {
    if (hostmem[i] != (float)i) {
        std::cerr << "copy error at " << i << ": got " << hostmem[i] << "\n";
        break;
    }
}

printf("checked result\n\n");

In addition to the above I also get an “invalid argument” when simply replacing cudaMemcpyPeer by cudaMemcpyPeerAsync.

I have also upgraded the driver to the current version, 276.14, but that made no difference.

I actually figured out the problem (or part of it):

On the machine we have, for some reason, peer-to-peer access is only possible between GPU0 & 1 and between GPU2 & 3.
Driver 270.81 reported this correctly.

Also, when peer-to-peer access is not enabled, cudaMemcpy3DPeer returns an error (although according to the documentation the copy should in that case be staged through the host).

The latest driver version, however, reports that full peer-to-peer access is possible between all GPUs. On our machine, attempting such a copy crashes the system completely and requires a reboot.