Hi!
In the example shown below, I am unable to get cudaMemcpy3DPeer to work with pitched memory allocated by cudaMalloc3D (it works fine with cudaArray).
(Windows Server 2008 R2, TCC driver 270.90).
Am I using it incorrectly? (I specify the extent width in bytes, since no cudaArray is involved in the copy.)
No matter what I do I get an “invalid argument” error from the call to cudaMemcpy3DPeer.
cudaPitchedPtr mem0, mem1;
float *hostmem;
cudaExtent extentByte = make_cudaExtent(672 * sizeof(float), 128, 256);
hostmem = new float[672 * 128 * 256];
// allocate memory for image, also with pitch malloc
CUDA_SAFE_CALL(cudaSetDevice(0));
CUDA_SAFE_CALL(cudaMalloc3D (&(mem0), extentByte));
CUDA_SAFE_CALL(cudaSetDevice(1));
CUDA_SAFE_CALL(cudaMalloc3D (&(mem1), extentByte));
printf("\n");
// peer to peer with pitched memory
for ( size_t i=0 ; i<672 * 128 * 256 ; ++i ) {
hostmem[i] = i;
}
cudaMemcpy3DParms hParms = {0};
hParms.srcPtr = make_cudaPitchedPtr ((float *)hostmem, 672 * sizeof(float), 672, 128);
hParms.dstPtr = mem0;
hParms.extent = extentByte;
hParms.kind = cudaMemcpyHostToDevice;
CUDA_SAFE_CALL (cudaSetDevice (0));
CUDA_SAFE_CALL (cudaMemcpy3D (&hParms));
printf("copied to device pitchedPtr memory\n");
cudaMemcpy3DPeerParms pParms = {0};
pParms.srcDevice = 0;
pParms.srcPtr = mem0;
pParms.dstDevice = 1;
pParms.dstPtr = mem1;
pParms.extent = extentByte;
// CUDA_SAFE_CALL (cudaMemcpy3DPeer (&pParms)); //gives "invalid argument"
// printf("copied from device 0 to 1 via cudaMemcpy3DPeer\n");
CUDA_SAFE_CALL (cudaMemcpyPeer (mem1.ptr, 1, mem0.ptr, 0, mem0.pitch * 128 * 256)); // seems to work
printf("copied cudaPitchedPtr from device 0 to 1 via cudaMemcpyPeer\n");
memset( hostmem, 0, sizeof(float)*672*128*256);
memset( &hParms, 0, sizeof(hParms));
hParms.srcPtr = mem1;
hParms.dstPtr = make_cudaPitchedPtr ((float *)hostmem, 672 * sizeof(float), 672, 128);
hParms.extent = extentByte;
hParms.kind = cudaMemcpyDeviceToHost;
CUDA_SAFE_CALL (cudaSetDevice (1));
CUDA_SAFE_CALL (cudaMemcpy3D (&hParms));
printf("copied from device 1 to host\n");
for ( size_t i=0 ; i<672 * 128 * 256 ; ++i ) {
if ( hostmem[i] != (float)i ) {
std::cerr << "copy error at " << i << ": got " << hostmem[i] << "\n";
break;
}
}
printf("checked result\n\n");