Transform Intel MPI to OpenMPI when using CUDA

I am now testing data transfer between GPUs on the cluster, but it always prints an error.
The code is:
real,device,allocatable :: d_0(:),d_1(:)
istat = cudaSetDevice(0)
istat = cudaGetDevice(j)
print * ,‘the first j is’ ,j
allocate(d_0(1:100))
istat = cudaSetDevice(1)
istat = cudaGetDevice(j)
print * ,‘the second j is’ ,j
allocate(d_1(1:100))
istat = cudaSetDevice(0)
!call test_kernel<<<1,100>>>(d_0)
istat =cudaDeviceCanAccessPeer( canAccessPeer, 0, 1 )
istat =cudaDeviceEnablePeerAccess (1,0)
print * ,'cudaDeviceEnablePeerAccess’s istat is ',istat
write(fh,"('cudaDeviceEnablePeerAccess is ',i3)") istat
!istat = cudaMemcpyPeer(d_1, 1, d_0, 0,100) !This method fail
istat = cudaMemcpy2D(d_1(:),100, d_0(:),100,100,100) !This method fail too
end subroutine test_diff_kernel_cop
The output in the log is:
myid is 0
0.log
0
devicenum is 2
the first j is 0
the second j is 1
cudaDeviceEnablePeerAccess’s istat is 0
[enode25:454139] *** Process received signal ***
[enode25:454139] Signal: Segmentation fault (11)
[enode25:454139] Signal code: Address not mapped (1)
[enode25:454139] Failing at address: (nil)
[enode25:454139] [ 0] /usr/lib/gcc/x86_64-redhat-linux/4.8.5/…/…/…/…/lib64/libpthread.so.0(+0xf100)[0x2ad0e726c100]
[enode25:454139] [ 1] ./a.exe[0x403397]
[enode25:454139] [ 2] ./a.exe[0x40295a]
[enode25:454139] [ 3] ./a.exe[0x402316]
[enode25:454139] [ 4] /usr/lib/gcc/x86_64-redhat-linux/4.8.5/…/…/…/…/lib64/libc.so.6(__libc_start_main+0xf5)[0x2ad0e810cb15]
[enode25:454139] [ 5] ./a.exe[0x402189]

I wouldn’t expect the cudaMemcpy2D call to work since the arrays are on different devices, but the cudaMemcpyPeer call should be fine. I just wrote an example using your snippet and it worked as expected.

If this doesn’t help you solve your issue, can you please post a complete minimal reproducing example?

% cat peer.cuf

!> Minimal CUDA Fortran example of a device-to-device copy between two GPUs.
!!
!! Allocates one array on device 0 and one on device 1, fills them with
!! distinct values, copies the device-0 array onto the device-1 array with
!! cudaMemcpyPeer, then copies the result back to the host and prints it.
!! On success every printed element equals the value written on device 0.
program testpeer

use cudafor
use, intrinsic :: iso_fortran_env, only: error_unit
implicit none

! Number of elements in each array (also the element count passed to
! cudaMemcpyPeer, which takes a count in elements for typed arrays).
integer, parameter :: n = 100

real,device,allocatable :: d_0(:),d_1(:)
real,allocatable :: d_h(:)
integer :: canAccessPeer
integer :: istat, j

allocate(d_h(1:n))

! Device allocations land on whichever device is current, so select the
! device first and then allocate.
istat = cudaSetDevice(0)
call check(istat, "cudaSetDevice(0)")
istat = cudaGetDevice(j)
call check(istat, "cudaGetDevice")
print * ,"the first j is" ,j
allocate(d_0(1:n))
d_0=222.2

istat = cudaSetDevice(1)
call check(istat, "cudaSetDevice(1)")
istat = cudaGetDevice(j)
call check(istat, "cudaGetDevice")
print * ,"the second j is" ,j
allocate(d_1(1:n))
d_1=999999.0

istat = cudaSetDevice(0)
call check(istat, "cudaSetDevice(0)")
istat = cudaDeviceCanAccessPeer( canAccessPeer, 0, 1 )
call check(istat, "cudaDeviceCanAccessPeer")

! Enabling direct peer access is reported but not treated as fatal here:
! the status and capability flag are printed so the reader can see them.
! NOTE(review): cudaMemcpyPeer is expected to still work (staged copy) when
! direct peer access is unavailable - confirm against the CUDA documentation.
istat = cudaDeviceEnablePeerAccess (1,0)
print *,"cudaDeviceEnablePeerAccess’s istat is ",istat, canAccessPeer

! Copy n elements from d_0 (device 0) to d_1 (device 1).
istat = cudaMemcpyPeer(d_1, 1, d_0, 0, n)
call check(istat, "cudaMemcpyPeer")

! Read the destination array back to the host to verify the copy.
istat = cudaSetDevice(1)
call check(istat, "cudaSetDevice(1)")
d_h=d_1
print *, d_h

deallocate(d_h)
deallocate(d_0)
deallocate(d_1)

contains

  !> Abort with a readable message if a CUDA runtime call did not succeed.
  !! stat : status code returned by the CUDA runtime function
  !! what : short label naming the call, used in the error message
  subroutine check(stat, what)
    integer, intent(in) :: stat
    character(len=*), intent(in) :: what

    if (stat /= cudaSuccess) then
      write(error_unit, '(a,a,a,i0,a,a)') "CUDA error in ", what, &
           " (", stat, "): ", trim(cudaGetErrorString(stat))
      error stop 1
    end if
  end subroutine check

end program testpeer
% nvfortran peer.cuf -V20.9; a.out
 the first j is            0
 the second j is            1
 cudaDeviceEnablePeerAccess’s istat is             0            1
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000
    222.2000        222.2000        222.2000        222.2000

-Mat