Issue with acc_memcpy_device

Hi,
I need to create a copy of an array on the device. The following code stops with an error:

      program Bug
      use openacc
      implicit none

      Integer N, i, Asize
      Parameter(N=4096)
      Parameter(Asize=N*8)
      Real*8 A(N)
      Real*8 Anew(N)

      do i=1,N
       A(i) = 17
      enddo

 666  format(I)
      write(*, 666) sizeof(A)
!$acc data copyin(A) create(Anew)
      
!     Purpose of this program: reproduce a run-time failure of
!     acc_memcpy_device when it is called from Fortran.
!
!     Inside the data region A has been copied to the device and
!     Anew only allocated there. The call below is meant to copy
!     Asize = N*8 = 32768 bytes from the device copy of A to the
!     device copy of Anew.
!
!     NOTE(review): no explicit interface for acc_memcpy_device is
!     declared in this program. The debug log of the failing run
!     reports bytes=6306356 rather than 32768, so presumably Asize
!     is passed by reference and the routine receives an address
!     instead of the byte count. Confirm against the compiler docs.
!
!     The commented-out line is an earlier attempt that passed the
!     host arrays and sizeof(A) directly.
c     call acc_memcpy_device(Anew, A, sizeof(A))
      CALL acc_memcpy_device(acc_deviceptr(Anew), acc_deviceptr(A),
     & Asize)
!     Three-point stencil update of the interior points only;
!     Anew(1) and Anew(N) are not written by this loop.
!$acc kernels
      do i=2,N-1
       Anew(i) = Anew(i) - 0.5 * (A(i-1) - 2*A(i) + A(i+1))
      enddo
!$acc end kernels
!$acc end data

      end program



$ pgfortran --version
pgfortran 18.10-1 64-bit target on x86-64 Linux -tp haswell 
PGI Compilers and Tools
Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.

$ pgfortran -acc -ta=tesla,cc50 Bug.F
$ PGI_ACC_DEBUG=1 ./a.out
       32768
ACC: detected 1 CUDA devices
cuda_initdev thread:0 data.default_device_num:0 pdata.cuda.default_device_num:0
ACC: device[1] is NVIDIA CUDA device 0 compute capability 5.0
ACC: initialized 1 CUDA devices
ACC: device[2] is PGI native
pinitialize (threadid=1)
cuda_init_device thread:1 data.default_device_num:1 pdata.cuda.default_device_num:1
cuda_init_device(threadid=1, device 0) dindex=1, api_context=(nil)
cuda_init_device(threadid=1, device 0) dindex=1, setting api_context=(nil)
cuda_init_device(threadid=1, device 0) dindex=1, new api_context=0x1f39380
argument memory for queue 16 device:0x701b40000 host:0x203860000
curr_devid for threadid=1 is 1
pgi_uacc_dataenterstart( file=/home/alexander/Bachelorarbeit/Bug.F, function=bug, line=1:1, line=17, devid=0,threadid=1 )
cuda_init_device thread:1 data.default_device_num:1 pdata.cuda.default_device_num:1
cuda_init_device(threadid=1, device 0) dindex=1, api_context=0x1f39380
pgi_uacc_dataon(hostptr=0x60bce0,stride=1,size=4096,eltsize=8,lineno=17,name=a,flags=0x700=present+create+copyin,async=-1,threadid=1)
cuda_shared( 0x60bce0 ) is not-managed
pgi_uacc_alloc(size=32768,devid=1,threadid=1)
allocate device memory 0x701c40000(32768B,threadid=1)
pgi_uacc_alloc(size=32768,devid=1,threadid=1) returns 0x701c40000
map    dev:0x701c40000 host:0x60bce0 dindex:1 size:32768 offset:0  (line:17 name:a) thread:1
alloc done with devptr at 0x701c40000 (threadid=1)
pgi_uacc_dataupx(devptr=0x701c40000,hostptr=0x60bce0,stride=1,size=4096,eltsize=8,lineno=17,name=a,async=-1,threadid=1)
pgi_uacc_cuda_dataup1(devdst=0x701c40000,hostsrc=0x60bce0,offset=0,stride=1,size=4096,eltsize=8,lineno=17,name=a,threadid=1)
pgi_uacc_dataon(hostptr=0x603ce0,stride=1,size=4096,eltsize=8,lineno=17,name=anew,flags=0x300=present+create,async=-1,threadid=1)
cuda_shared( 0x603ce0 ) is not-managed
pgi_uacc_alloc(size=32768,devid=1,threadid=1)
allocate device memory 0x701c48000(32768B,threadid=1)
pgi_uacc_alloc(size=32768,devid=1,threadid=1) returns 0x701c48000
map    dev:0x701c48000 host:0x603ce0 dindex:1 size:32768 offset:0  (line:17 name:anew) thread:1
alloc done with devptr at 0x701c48000 (threadid=1)
pgi_uacc_dataenterdone(devid=1,threadid=1)
pgi_uacc_cuda_wait(lineno=-99,async=-1,dindex=1,threadid=1)
pgi_uacc_cuda_wait(sync on stream=0x228c9d0,threadid=1)
pgi_uacc_cuda_wait done (threadid=1)
cuda_shared( 0x603ce0 ) is not-managed (cached)
cuda_shared( 0x60bce0 ) is not-managed (cached)
pgi_uacc_cuda_memcpy(devdst=0x701c48000,devsrc=0x701c40000,bytes=6306356,async=-1,dindex=1)
call to cuMemcpyDtoDAsync returned error 1: Invalid value

The pointers are correct, but the size is too large: the log shows bytes=6306356 instead of the expected 32768.

-Alex

Hi Alex,

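It looks like we missed adding an interface for “acc_memcpy_device” to the OpenACC module when the routine was added in the OpenACC 2.5 standard. Since it’s a C/C++ routine, an explicit interface is needed in order for it to be called correctly from Fortran; without one, the byte count is passed by reference rather than by value, so the routine receives an address instead of the size.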

I added a problem report (TPR#26706) requesting that the interface be added to the OpenACC module.

In the meantime, you can add the following interface to your code to get it to work properly.

% cat test.F
      program Bug
!     Copies A into Anew on the device with acc_memcpy_device and
!     then applies a three-point stencil update to Anew.
!     An explicit interface for the C routine is declared locally.
!     Without OpenACC (_OPENACC undefined) the copy is done on the
!     host with a plain array assignment instead.
#ifdef _OPENACC
      use openacc
#endif
      implicit none

      integer, parameter :: N = 4096
!     Size of one array in bytes: N elements of 8 bytes each.
      integer, parameter :: Asize = N*8
      real*8 :: A(N)
      real*8 :: Anew(N)
      integer :: idx
#ifdef _OPENACC
!     Interface body and its directive are kept exactly as posted.
!     NOTE(review): ignore_tkr (k) presumably relaxes kind checking
!     for bytes so a default integer can be passed; confirm in the
!     compiler documentation.
      interface
        subroutine acc_memcpy_device( dest, src, bytes )
     &     bind(c,name='acc_memcpy_device_')
           import c_ptr, c_devptr
           type(c_devptr) :: dest
           type(c_devptr) :: src
!dir$ ignore_tkr (k) bytes
           integer(kind=int_ptr_kind()), value :: bytes
        end subroutine
      end interface
#endif

!     Fill the source array on the host.
      A = 17

!     Report the size of A in bytes.
 666  format(I)
      write(*, 666) sizeof(A)
!$acc data copyin(A) create(Anew)

!     Device build: device-to-device copy of Asize bytes from A to
!     Anew. Host build: ordinary array assignment.
#ifdef _OPENACC
      call acc_memcpy_device(acc_deviceptr(Anew), acc_deviceptr(A),
     & Asize)
#else
      Anew=A
#endif

!     Stencil update of interior points; the two end points of Anew
!     keep the values they received from the copy.
!$acc kernels
      do idx = 2, N-1
       Anew(idx) = Anew(idx)
     &   - 0.5 * (A(idx-1) - 2*A(idx) + A(idx+1))
      end do
!$acc end kernels
!     Anew was only created on the device, so fetch it explicitly
!     before the data region ends.
!$acc update host (Anew)
!$acc end data

      print *, Anew(1), Anew(N)
      end program
% pgfortran -ta=tesla test.F; a.out
       32768
    17.00000000000000         17.00000000000000

Hope this helps,
Mat

Hi Mat,

adding the interface fixed my problem. Thanks for the quick reply.

Alex

The fix for TPR#26706 should be available in compiler release 19.7 and above.