code_parallel_loop

Hello!
I’m a complete beginner at CUDA Fortran programming.
I’m trying to parallelize this serial Fortran code:
! Serial reference loop: every element in row i receives its column index k.
do i = 0, 1000
   do k = 0, 512
      valore(i, k) = k
   end do
end do

I’d like to parallelize the loop over the k index with CUDA Fortran.

Could anyone help me write the parallel code in CUDA Fortran?

Thanks.

Silvio.

Hi Silvio,

In this case, the simplest thing to do is to use the PGI Accelerator Model and have the compiler accelerate the loop for you. However, you may want the CUDA Fortran version for reference. Both versions are below:

First the PGI Accelerator Model:

% cat test.f90

program foo   ! Fill valore(i,k) = k inside an accelerator region, then print one element.
implicit none
integer, parameter :: n = 1000, m = 512   ! array extents: n rows (i index), m columns (k index)
integer :: i, k
integer, dimension(n,m) :: valore
!$acc region
do i = 1, n
  do k = 1, m
    valore(i,k) = k
  end do
end do
!$acc end region
! The directive pair above marks the loop nest for offload by the compiler;
! without accelerator support the directives are plain comments and the
! loops run serially on the host with the same result.

! Spot check of a single element: the expected value is the column index, 213.
print *, 'Result:', valore(123,213)

end program foo
% pgfortran -ta=nvidia -Minfo=accel -o testacc.out test.f90 ; testacc.out
foo:
      7, Generating copyout(valore(:,:))
         Generating compute capability 1.0 binary
         Generating compute capability 2.0 binary
      8, Loop is parallelizable
      9, Loop is parallelizable
         Accelerator kernel generated
          8, !$acc do parallel, vector(16) ! blockidx%x threadidx%x
          9, !$acc do parallel, vector(16) ! blockidx%y threadidx%y
             CC 1.0 : 8 registers; 40 shared, 16 constant, 0 local memory bytes; 100% occupancy
             CC 2.0 : 10 registers; 16 shared, 40 constant, 0 local memory bytes; 100% occupancy
 Result:          213

And the CUDA Fortran version:

% cat test.cuf
!> Device code for the valore example: a single kernel that fills an array
!> with its column index.
module testme
implicit none

contains

!> CUDA Fortran kernel: each thread computes one (i,k) pair from its block and
!> thread indices and stores v(i,k) = k.
!>
!> v : integer rank-2 array to fill (the caller passes a device array)
!> N : number of rows to fill    (first dimension, index i)
!> M : number of columns to fill (second dimension, index k)
attributes(global) subroutine setvalore(v,N,M)

integer, value :: N, M                        ! scalars are passed by value to the kernel
integer, dimension(:,:), intent(inout) :: v   ! only elements with i <= N and k <= M are written
integer :: i, k

! Global 1-based indices: block and thread indices start at 1 in CUDA Fortran,
! hence the (blockIdx - 1) offset.
i = (blockIdx%x - 1)*blockDim%x + threadIdx%x
k = (blockIdx%y - 1)*blockDim%y + threadIdx%y

! The grid may overshoot the requested extents when they are not a multiple
! of the block size, so surplus threads must do nothing.
if (i <= N .and. k <= M) then
  v(i,k) = k
end if

end subroutine setvalore

end module testme


!> Host driver: fills an N x M integer array on the GPU with the setvalore
!> kernel (each element receives its column index), copies the result back to
!> the host and prints one sample element.
program foo
use testme, only: setvalore
use cudafor
implicit none

integer, parameter :: N = 1000      ! rows    (i index)
integer, parameter :: M = 512       ! columns (k index)
integer, parameter :: tile = 16     ! threads per block along each of x and y
integer, dimension(:,:), allocatable, device :: valore_d   ! device copy
integer, dimension(:,:), allocatable :: valore             ! host copy
type(dim3) :: dimGrid, dimBlock
integer :: rc

allocate(valore_d(N,M), valore(N,M))
valore = 0
valore_d = 0

! One thread per array element. The grid size is rounded up so that the whole
! array is covered even when N or M is not a multiple of the block size; the
! kernel itself discards the surplus threads.
dimBlock = dim3(tile, tile, 1)
dimGrid = dim3((N + tile - 1)/tile, (M + tile - 1)/tile, 1)

call setvalore<<<dimGrid,dimBlock>>>(valore_d, N, M)

! cudaGetLastError reports launch failures (e.g. an invalid configuration).
! Kernels run asynchronously, so synchronise as well to surface any error
! raised while the kernel was executing.
rc = cudaGetLastError()
if (rc == cudaSuccess) rc = cudaDeviceSynchronize()

if (rc /= cudaSuccess) then
   print *, 'Error:', cudaGetErrorString(rc)
else
   valore = valore_d      ! device-to-host copy
   print *, 'Result:', valore(123,213)
end if

deallocate(valore_d, valore)

end program foo

% pgfortran -o testcuf.out test.cuf -V12.2 ; testcuf.out
 Result:          213

Hope this helps,
Mat