Hi hzhcapricorn,
Device variables can only be operated on from within device code, so you either need to write a device kernel to perform the operation yourself, or use the PGI Accelerator Model directives to have the compiler generate one for you.
Now I transfer b_d to another variable on the host, multiply it by 2, and then pass it to a_d. But that costs too much time in data transfer. Could I do that directly on the GPU?
Yes. Here are two examples:
module mul2_mod
  use cudafor
  real, device, allocatable :: a_d(:,:), b_d(:,:)
  real, allocatable :: a(:,:), b(:,:)
contains
  attributes(global) subroutine mymul2 (N,M)
    integer, value :: N, M
    integer :: idx, idy
    ! Map each thread to one element of the 2-D arrays
    idx = (blockidx%x-1)*blockdim%x + threadidx%x
    idy = (blockidx%y-1)*blockdim%y + threadidx%y
    if (idx.le.N .and. idy.le.M) then
       a_d(idx,idy) = b_d(idx,idy)*2.0
    endif
  end subroutine mymul2
end module mul2_mod

program mul2
  use mul2_mod
  integer :: N, M
  type(dim3) :: grid, block
  N = 1024
  M = 1024
  allocate(a(N,M), a_d(N,M))
  allocate(b(N,M), b_d(N,M))
  grid  = dim3(N/16, M/16, 1)    ! assumes N and M are multiples of 16
  block = dim3(16, 16, 1)
  b = 1.0
  b_d = b                        ! copy b to the device
  call mymul2<<<grid,block>>>(N,M)
  a = a_d                        ! copy the result back to the host
  print *, a(1,1)
end program mul2
% pgf90 mul2.cuf ; a.out
2.000000
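One note on the launch configuration: grid = dim3(N/16,M/16,1) assumes N and M are exact multiples of 16. If they might not be, you can round the grid up and let the bounds check in the kernel discard the extra threads, e.g. (just a sketch):

  block = dim3(16, 16, 1)
  ! Round the grid size up so every element is covered even when
  ! N or M is not a multiple of the block size; the if (idx.le.N ...)
  ! test in mymul2 masks off the surplus threads.
  grid  = dim3((N + block%x - 1)/block%x, (M + block%y - 1)/block%y, 1)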
% cat mul2_acc.cuf
program mul2
  real, device, allocatable :: a_d(:,:), b_d(:,:)
  real, allocatable :: a(:,:), b(:,:)
  allocate(a(1024,1024), a_d(1024,1024))
  allocate(b(1024,1024), b_d(1024,1024))
  b = 1.0
  b_d = b               ! copy b to the device
!$acc region
  a_d = b_d*2           ! the compiler generates a kernel for this assignment
!$acc end region
  a = a_d               ! copy the result back to the host
  print *, a(1,1)
end program mul2
% pgf90 -ta=nvidia,cuda3.2 -Minfo=accel mul2_acc.cuf ; a.out
mul2:
14, Loop is parallelizable
Accelerator kernel generated
14, !$acc do parallel, vector(16) ! blockidx%x threadidx%x
!$acc do parallel, vector(16) ! blockidx%y threadidx%y
2.000000
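Since a_d and b_d already have the device attribute, the region only generates the kernel; the only data movement is the explicit b_d=b and a=a_d assignments. For contrast, if you operated on the host arrays a and b inside the region, the compiler would also have to copy them to and from the device around the region. Something along these lines should work (a sketch only, with the data clauses spelled out):

!$acc region copyin(b) copyout(a)
  ! The compiler generates both the kernel and the host<->device
  ! copies: b is copied in before the region, a is copied out after.
  a = b*2.0
!$acc end region

For your case, where you just want to avoid the extra host round trip, the device-array version above is the one you want.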
Hope this helps,
Mat