cudaMemcpy2DAsync

Hi there,

I’m trying to use the cudaMemcpy2DAsync extension in CUDA Fortran to copy my 2D arrays, but I’m struggling to work out what dpitch and spitch represent.

If anyone could give me some advice it would be much appreciated.

Here’s an example of an array I’m trying to copy:

! Host array
double precision, allocatable, pinned,dimension(:,:) :: ARRAY_host

! Device array
double precision, device,dimension(MAXELMNT,3):: ARRAY_device

allocate(ARRAY_host(MAXELMNT,3))

! So far I have

istat=cudaMemcpy2DAsync(dst=ARRAY_device,
dpitch= ,
src=ARRAY_host,
spitch= ,
width=3,
height=MAXELMNT,
kdir=cudaMemcpyHostToDevice,
stream=0)

I’m not sure if I have the height and width the right way round and what to put for dpitch and spitch.

Thanks in advance for your help,
Crip_crop

Hi crip_crop,

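The ‘pitch’ is the size of the leading dimension, which is MAXELMNT in your case. However, ‘width’ is the number of elements along that leading (contiguous) dimension (MAXELMNT) and ‘height’ is the extent of the second dimension (3), i.e. the opposite of what you have in your call.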

Here’s a small example program:

% cat mul2.cuf 
module mul2_mod
   ! Small CUDA Fortran example: copy a 2D host array to the device with
   ! cudaMemcpy2DAsync, then double every element in a kernel.
   use cudafor
   implicit none

   ! Device arrays: the kernel reads b_d and writes a_d.
   double precision, device, allocatable:: a_d(:,:), b_d(:,:)
   ! Host arrays, declared pinned; b is the copy source, a receives results.
   double precision, pinned, allocatable:: a(:,:), b(:,:)

contains

   ! Kernel: a_d(i,j) = 2 * b_d(i,j) for 1 <= i <= N and 1 <= j <= M.
   ! One thread handles one element; threads outside the N x M range do
   ! nothing.
   attributes(global) subroutine mymul2 (N,M)
      integer, value :: N,M
      integer :: idx, idy
      idx = (blockidx%x-1)*blockdim%x + threadidx%x
      ! The second index must be built from the y components only; using
      ! threadidx%x here ties idy to idx and leaves most elements unwritten.
      idy = (blockidx%y-1)*blockdim%y + threadidx%y
      if (idx <= N .and. idy <= M) then
        a_d(idx,idy) = b_d(idx,idy)*2.0d0
      endif
   end subroutine mymul2

   ! Allocate the N x M host and device arrays, fill b with ones and start
   ! the host-to-device copy of b into b_d. Stops the program if the copy
   ! call reports an error.
   subroutine init_GPU(N,M)
     integer, intent(in) :: N, M
     integer :: rv
     allocate(a(N,M), a_d(N,M))
     allocate(b(N,M), b_d(N,M))
     b = 1.0d0
     ! N is the leading dimension of both arrays, so it is used for both
     ! pitches and for the width; M is the height. No stream argument is
     ! passed.
     rv = cudaMemcpy2DAsync(b_d,N,b,N,N,M,cudaMemcpyHostToDevice)
     if (rv /= cudaSuccess) then
       print *, 'Error in init_GPU: ', cudaGetErrorString(rv)
       stop
     endif

   end subroutine init_GPU

   ! Release every array allocated by init_GPU.
   subroutine cleanup_GPU()
      deallocate(a_d,a,b_d,b)
   end subroutine cleanup_GPU


end module mul2_mod

program mul2
   ! Driver: set up the arrays, launch the doubling kernel over an N x M
   ! index space, copy the result back and print two corner elements.
   use mul2_mod
   implicit none
   ! Number of threads along each of the two block dimensions.
   integer, parameter :: block_edge = 16
   integer :: N,M,rv
   type(dim3) :: grid, tblock
   N=1024
   M=3

   call init_GPU(N,M)
   ! Round the block counts up so the grid covers all N x M elements.
   grid = dim3((N+block_edge-1)/block_edge,(M+block_edge-1)/block_edge,1)
   tblock = dim3(block_edge,block_edge,1)

! Go do something else
   call mymul2<<<grid,tblock>>>(N,M)
   ! A failed launch does not stop the program by itself; query it here.
   rv = cudaGetLastError()
   if (rv /= cudaSuccess) then
      print *, 'Error launching mymul2: ', cudaGetErrorString(rv)
      stop
   endif
   ! Device-to-host copy of the result.
   a=a_d
   print *, a(1,1), a(N,M)
   call cleanup_GPU()

end program mul2
   
  
% pgf90 mul2.cuf -V11.4 ; a.out
    2.000000000000000         2.000000000000000

Hope this helps,
Mat

Hi Mat,

I modified your code as follows:

rv=cudaStreamCreate(stream1)

rv = cudaMemcpy2DAsync(b_d,N,b,N,N,M,cudaMemcpyHostToDevice,stream1)

Then I got an error:

PGF90-S-0155-Could not resolve generic procedure cudamemcpy2dasync

The same error happened for:

rv = cudaMemcpy2DAsync(b_d,N,b,N,N,M,cudaMemcpyHostToDevice,stream=0)

Can you explain why I got this error? As far as I know, the syntax is:

cudaMemcpy2DAsync( dst, dpitch, src, spitch, width, height, kdir, stream).

Thanks,

Lam