Now I am testing data transfer between GPUs on the cluster, but it always prints an error.
the code is :
real,device,allocatable :: d_0(:),d_1(:)
istat = cudaSetDevice(0)
istat = cudaGetDevice(j)
print * ,'the first j is' ,j
allocate(d_0(1:100))
istat = cudaSetDevice(1)
istat = cudaGetDevice(j)
print * ,'the second j is' ,j
allocate(d_1(1:100))
istat = cudaSetDevice(0)
!call test_kernel<<<1,100>>>(d_0)
istat =cudaDeviceCanAccessPeer( canAccessPeer, 0, 1 )
istat =cudaDeviceEnablePeerAccess (1,0)
print * ,'cudaDeviceEnablePeerAccess''s istat is ',istat
write(fh,"('cudaDeviceEnablePeerAccess is ',i3)") istat
!istat = cudaMemcpyPeer(d_1, 1, d_0, 0,100) !This method fail
istat = cudaMemcpy2D(d_1(:),100, d_0(:),100,100,100) !This method fail too
end subroutine test_diff_kernel_cop
the print on the log:
myid is 0
0.log
0
devicenum is 2
the first j is 0
the second j is 1
cudaDeviceEnablePeerAccess’s istat is 0
[enode25:454139] *** Process received signal ***
[enode25:454139] Signal: Segmentation fault (11)
[enode25:454139] Signal code: Address not mapped (1)
[enode25:454139] Failing at address: (nil)
[enode25:454139] [ 0] /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libpthread.so.0(+0xf100)[0x2ad0e726c100]
[enode25:454139] [ 1] ./a.exe[0x403397]
[enode25:454139] [ 2] ./a.exe[0x40295a]
[enode25:454139] [ 3] ./a.exe[0x402316]
[enode25:454139] [ 4] /usr/lib/gcc/x86_64-redhat-linux/4.8.5/../../../../lib64/libc.so.6(__libc_start_main+0xf5)[0x2ad0e810cb15]
[enode25:454139] [ 5] ./a.exe[0x402189]
I wouldn't think the cudaMemcpy2D call would work since the arrays are on different devices, but the cudaMemcpyPeer should be fine. I just wrote an example using your snippet and it worked as expected.
If this doesn’t help you solve your issue, can you please post a complete minimal reproducing example?
% cat peer.cuf
program testpeer
  !> Minimal reproducer: copy a device array from GPU 0 to GPU 1 with
  !! cudaMemcpyPeer, then copy the result back to the host from GPU 1
  !! and print it to verify the peer-to-peer transfer succeeded.
  use cudafor
  implicit none

  integer, parameter :: n = 100          ! element count of every array

  real, device, allocatable :: d_0(:), d_1(:)
  real, allocatable :: d_h(:)            ! host buffer used to check the result
  integer :: canAccessPeer               ! 1 if device 0 can access device 1
  integer :: istat, j

  allocate(d_h(1:n))

  ! Allocate and fill an array on device 0.
  istat = cudaSetDevice(0)
  istat = cudaGetDevice(j)
  print *, "the first j is", j
  allocate(d_0(1:n))
  d_0 = 222.2

  ! Allocate and fill an array on device 1 with a sentinel value, so a
  ! failed copy is visibly distinct from a successful one.
  istat = cudaSetDevice(1)
  istat = cudaGetDevice(j)
  print *, "the second j is", j
  allocate(d_1(1:n))
  d_1 = 999999.0

  ! From device 0: query and enable peer access to device 1, then copy
  ! d_0 (device 0) into d_1 (device 1). Count is in array elements.
  istat = cudaSetDevice(0)
  istat = cudaDeviceCanAccessPeer(canAccessPeer, 0, 1)
  istat = cudaDeviceEnablePeerAccess(1, 0)
  print *, "cudaDeviceEnablePeerAccess's istat is ", istat, canAccessPeer
  istat = cudaMemcpyPeer(d_1, 1, d_0, 0, n)

  ! Copy back to the host from device 1; expect every element == 222.2.
  istat = cudaSetDevice(1)
  d_h = d_1
  print *, d_h

  deallocate(d_h)
  deallocate(d_0)
  deallocate(d_1)
end program testpeer
% nvfortran peer.cuf -V20.9; a.out
the first j is 0
the second j is 1
cudaDeviceEnablePeerAccess’s istat is 0 1
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
222.2000 222.2000 222.2000 222.2000
-Mat