Hi Silvio,
In this case, the simplest thing to do is to use the PGI Accelerator Model and have the compiler accelerate the loop for you. However, you may want the CUDA Fortran version for reference. Both versions are below:
First the PGI Accelerator Model:
% cat test.f90
!> Fill a 2-D integer array inside a PGI Accelerator region and print one
!> element as a spot check.
program foo
  implicit none

  ! Array extents, named once so the declaration and the loop bounds
  ! cannot drift apart.
  integer, parameter :: n = 1000
  integer, parameter :: m = 512

  integer :: i, k
  integer, dimension(n,m) :: valore

  ! Everything between the region directives is handed to the compiler
  ! to turn into accelerator code.
  !$acc region
  do i = 1, n
    do k = 1, m
      ! Each element stores its own second (column) index.
      valore(i,k) = k
    end do
  end do
  !$acc end region

  ! Column 213 was assigned the value 213 above.
  print *, 'Result:', valore(123,213)
end program foo
% pgfortran -ta=nvidia -Minfo=accel -o testacc.out test.f90 ; testacc.out
foo:
7, Generating copyout(valore(:,:))
Generating compute capability 1.0 binary
Generating compute capability 2.0 binary
8, Loop is parallelizable
9, Loop is parallelizable
Accelerator kernel generated
8, !$acc do parallel, vector(16) ! blockidx%x threadidx%x
9, !$acc do parallel, vector(16) ! blockidx%y threadidx%y
CC 1.0 : 8 registers; 40 shared, 16 constant, 0 local memory bytes; 100% occupancy
CC 2.0 : 10 registers; 16 shared, 40 constant, 0 local memory bytes; 100% occupancy
Result: 213
And the CUDA Fortran version:
% cat test.cuf
!> Device code for the CUDA Fortran example: holds the kernel that fills
!> a 2-D integer array with its column index.
module testme
  implicit none
contains

  !> Kernel: each thread handles at most one element and sets v(i,k) = k.
  !!
  !! v    : array to fill
  !! N, M : number of elements to fill along the first and second
  !!        dimension of v; passed by value
  attributes(global) subroutine setvalore(v,N,M)
    integer, value :: N, M
    ! inout rather than out: threads that fail the bounds guard write
    ! nothing, so any element they would have addressed keeps its value.
    integer, dimension(:,:), intent(inout) :: v
    integer :: i,k

    ! Convert the 1-based block/thread coordinates into 1-based array
    ! indices: x maps to the first dimension, y to the second.
    i = (blockIdx%x-1)*blockDim%x + threadIdx%x
    k = (blockIdx%y-1)*blockDim%y + threadIdx%y

    ! The launch grid may overshoot N or M; surplus threads must not
    ! touch memory.
    if (i .le. N .and. k .le. M) then
      v(i,k) = k
    end if
  end subroutine setvalore

end module testme
!> Host driver for the CUDA Fortran example: allocates host and device
!> arrays, launches setvalore over an N x M index space, checks for a
!> launch error, then copies the result back and prints one element.
program foo
  use testme, only: setvalore
  use cudafor
  implicit none

  integer, parameter :: N=1000
  integer, parameter :: M=512
  ! Threads per block along each of the two launch dimensions.
  integer, parameter :: tile = 16

  integer, dimension(:,:), allocatable, device :: valore_d
  integer, dimension(:,:), allocatable :: valore
  type(dim3) :: dimGrid, dimBlock
  integer :: rc

  allocate(valore_d(N,M), valore(N,M))
  valore = 0
  valore_d = 0

  dimBlock = dim3(tile,tile,1)
  ! Round the block counts up so a partial tile at either edge is still
  ! covered; the kernel's bounds test discards the surplus threads.
  dimGrid = dim3((N+tile-1)/tile,(M+tile-1)/tile,1)

  ! Launch with the real CUDA Fortran chevron syntax (the forum listing
  ! had to spell the chevrons with curly braces).
  call setvalore<<<dimGrid,dimBlock>>>(valore_d,N,M)

  rc = cudaGetLastError()
  if (rc .ne. cudaSuccess) then
    print *, 'Error:', cudaGetErrorString(rc)
  else
    ! Assignment from a device array to a host array copies the data
    ! back to the host.
    valore = valore_d
    print *, 'Result:', valore(123,213)
  end if

  deallocate(valore_d, valore)
end program foo
% pgfortran -o testcuf.out test.cuf -V12.2 ; testcuf.out
Result: 213
Hope this helps,
Mat