How to operate variables on GPU

I have two variables in GPU:
real, device, allocatable:: a_d(:,:), b_d(:,:)

which I use in a calculation with cuFFT. I want to do a_d=b_d*2, but the compiler reports:

PGF90-S-0519-More than one device-resident object in assignment

For now I transfer b_d to a host variable, multiply it by 2, and then copy the result to a_d. But that costs too much time in data transfer. Could I do this directly on the GPU?

Hi hzhcapricorn,

Device variables can only be operated on within device code so you either need to write a device kernel to perform the operation or use the PGI Accelerator Model directives to have the compiler write one for you.

For now I transfer b_d to a host variable, multiply it by 2, and then copy the result to a_d. But that costs too much time in data transfer. Could I do this directly on the GPU?

Yes. Here are two examples:

module mul2_mod
   use cudafor
   implicit none

   ! Device-resident arrays: the kernel reads b_d and writes a_d.
   real, device, allocatable:: a_d(:,:), b_d(:,:)
   ! Host-side counterparts used by the driver program for upload/download.
   real, allocatable:: a(:,:), b(:,:)

contains

   ! Kernel: a_d(i,j) = 2 * b_d(i,j) for 1 <= i <= N, 1 <= j <= M.
   !
   ! Each thread handles one element. The first array index is taken from
   ! the x dimension of the launch configuration and the second index from
   ! the y dimension.
   !
   !   N, M : extents to process in the first and second dimension
   !          (passed by value).
   attributes(global) subroutine mymul2 (N,M)
      integer, value :: N,M
      integer :: idx, idy

      ! Global 1-based indices of this thread.
      idx = (blockidx%x-1)*blockdim%x + threadidx%x
      ! The second index must use threadidx%y; using threadidx%x here would
      ! make every thread in a block row hit only the tile diagonal and leave
      ! the remaining elements of a_d unwritten.
      idy = (blockidx%y-1)*blockdim%y + threadidx%y

      ! Guard against threads that fall outside the requested extents
      ! (happens when the grid is rounded up to whole blocks).
      if (idx.le.N.and.idy.le.M) then
        a_d(idx,idy)=b_d(idx,idy)*2.0
      endif
   end subroutine mymul2


end module mul2_mod

! Driver for the mymul2 kernel: uploads b, computes a_d = 2*b_d on the
! device, downloads the result and prints one element.
program mul2
   use mul2_mod
   implicit none

   ! Threads per block along each of the two dimensions.
   integer, parameter :: tile = 16

   integer :: N,M
   integer :: ierr
   type(dim3) :: grid, block
   N=1024
   M=1024

   allocate(a(N,M), a_d(N,M))
   allocate(b(N,M), b_d(N,M))

   block = dim3(tile,tile,1)
   ! Round the block count up so that extents which are not a multiple of
   ! the tile size are still fully covered; the kernel's bounds check
   ! discards the surplus threads.
   grid = dim3((N+tile-1)/tile,(M+tile-1)/tile,1)

   b=1.0
   b_d=b                                 ! host -> device copy
   call mymul2<<<grid,block>>>(N,M)

   ! Report launch failures (e.g. an invalid configuration) instead of
   ! silently printing whatever happens to be in a_d.
   ierr = cudaGetLastError()
   if (ierr /= cudaSuccess) then
      print *, 'mymul2 launch failed: ', cudaGetErrorString(ierr)
      stop 1
   endif

   a=a_d                                 ! device -> host copy
   print *, a(1,1)

   deallocate(a, b, a_d, b_d)

end program mul2
% pgf90 mul2.cuf ; a.out
    2.000000

% cat mul2_acc.cuf

program mul2
   implicit none

   ! Extent of the square arrays in each dimension.
   integer, parameter :: n = 1024

   real, allocatable :: a(:,:), b(:,:)               ! host copies
   real, device, allocatable :: a_d(:,:), b_d(:,:)   ! device copies

   allocate(b(n,n), b_d(n,n))
   allocate(a(n,n), a_d(n,n))

   ! Initialise the input on the host and upload it.
   b = 1.0
   b_d = b

   ! The array assignment on device data is wrapped in an accelerator
   ! region so that the compiler generates the kernel for it.
!$acc region
   a_d = b_d*2
!$acc end region

   ! Download the result and show one element.
   a = a_d
   print *, a(1,1)

end program mul2
   
  % pgf90 -ta=nvidia,cuda3.2 -Minfo=accel mul2_acc.cuf ; a.out
mul2:
     14, Loop is parallelizable
         Accelerator kernel generated
         14, !$acc do parallel, vector(16) ! blockidx%x threadidx%x
             !$acc do parallel, vector(16) ! blockidx%y threadidx%y
    2.000000

Hope this helps,
Mat