Hello,
I’m exploring the possibility of porting my Fortran codes to the CUDA world. Using the example code (f3.f90) shipped with PGI 10.1, I tried to compare GPU and host processing speeds on a 3D array. My modified version of f3.f90 is below. Compilation works fine (the compiler output is below). However, I can’t get it to run: the runtime error message is “call to cuMemcpy2D returned error 700: Launch failed”.
Any insight into this will be appreciated.
Thanks,
Rooni.
!> Nine-point weighted smoothing of a stack of 2D slices.
!!
!! Provides an accelerator version (smooth) and a plain host version
!! (smoothhost) that perform the same arithmetic, so the two results can be
!! compared element by element.
module sm
  implicit none
contains

  !> Smooth every slice of b inside a PGI Accelerator compute region.
  !!
  !! For each iter in 1..niters and each interior point (i,j) with
  !! 2 <= i <= n-1 and 2 <= j <= m-1, a(i,j,iter) receives
  !!   w0 * centre + w1 * (sum of 4 edge neighbours)
  !!               + w2 * (sum of 4 corner neighbours)
  !! taken from b(:,:,iter); the interior of b(:,:,iter) is then overwritten
  !! with the new values of a. Boundary elements of a and b are not touched.
  !!
  !! @param a       output field; only the interior is written
  !! @param b       input field; its interior is overwritten with a
  !! @param w0      weight of the centre point
  !! @param w1      weight of each edge neighbour
  !! @param w2      weight of each corner neighbour
  !! @param n       number of points used along the first dimension
  !! @param m       number of points used along the second dimension
  !! @param niters  number of slices processed along the third dimension
  !!
  !! NOTE(review): n, m and niters are not checked against the actual extents
  !! of a and b; the caller must keep them within bounds.
  subroutine smooth( a, b, w0, w1, w2, n, m, niters )
    real, dimension(:,:,:), intent(inout) :: a, b
    real, intent(in) :: w0, w1, w2
    integer, intent(in) :: n, m, niters
    integer :: i, j, iter
    !$acc region
    do iter = 1, niters
      ! Innermost loop runs over the first index (column-major, stride 1).
      do j = 2, m-1
        do i = 2, n-1
          a(i,j,iter) = w0 * b(i,j,iter) + &
              w1 * (b(i-1,j,iter) + b(i,j-1,iter) + b(i+1,j,iter) + b(i,j+1,iter)) + &
              w2 * (b(i-1,j-1,iter) + b(i-1,j+1,iter) + b(i+1,j-1,iter) + b(i+1,j+1,iter))
        end do
      end do
      ! Copy the smoothed interior back only after the whole slice has been
      ! computed, so the stencil above always reads the original b values.
      do j = 2, m-1
        do i = 2, n-1
          b(i,j,iter) = a(i,j,iter)
        end do
      end do
    end do
    !$acc end region
  end subroutine smooth

  !> Host reference implementation of smooth: identical arithmetic, no
  !! accelerator directives.
  !!
  !! @param a       output field; only the interior is written
  !! @param b       input field; its interior is overwritten with a
  !! @param w0      weight of the centre point
  !! @param w1      weight of each edge neighbour
  !! @param w2      weight of each corner neighbour
  !! @param n       number of points used along the first dimension
  !! @param m       number of points used along the second dimension
  !! @param niters  number of slices processed along the third dimension
  !!
  !! NOTE(review): n, m and niters are not checked against the actual extents
  !! of a and b; the caller must keep them within bounds.
  subroutine smoothhost( a, b, w0, w1, w2, n, m, niters )
    real, dimension(:,:,:), intent(inout) :: a, b
    real, intent(in) :: w0, w1, w2
    integer, intent(in) :: n, m, niters
    integer :: i, j, iter
    do iter = 1, niters
      ! Innermost loop runs over the first index (column-major, stride 1).
      do j = 2, m-1
        do i = 2, n-1
          a(i,j,iter) = w0 * b(i,j,iter) + &
              w1 * (b(i-1,j,iter) + b(i,j-1,iter) + b(i+1,j,iter) + b(i,j+1,iter)) + &
              w2 * (b(i-1,j-1,iter) + b(i-1,j+1,iter) + b(i+1,j-1,iter) + b(i+1,j+1,iter))
        end do
      end do
      ! Copy back after the full slice is computed (see smooth).
      do j = 2, m-1
        do i = 2, n-1
          b(i,j,iter) = a(i,j,iter)
        end do
      end do
    end do
  end subroutine smoothhost

end module sm
!> Driver that times the accelerator and host smoothing routines on identical
!! 3D data and counts the elements whose results differ.
!!
!! Command line: [n [m [device]]]
!!   n       extent of the first dimension  (default 1000)
!!   m       extent of the second dimension (default n)
!!   device  'host'/'HOST' or 'nvidia'/'NVIDIA'; anything else keeps the default
program main
  use sm
  use accel_lib
  implicit none
  ! Number of slices along the third dimension. Kept as a named constant so
  ! the allocation, the initialisation, the smoothing calls and the check all
  ! agree, and so the loop index k is never also used as an array extent.
  integer, parameter :: nslices = 11
  real, dimension(:,:,:), allocatable :: aa, bb
  real, dimension(:,:,:), allocatable :: aahost, bbhost
  real :: w0, w1, w2
  integer :: i, j, n, m, k
  integer :: c1, c2, c3, cgpu, chost
  integer :: errs, args, ios
  character(10) :: arg
  real :: dif, tol

  n = 0
  m = 0
  args = command_argument_count()
  if( args .gt. 0 )then
    call get_command_argument( 1, arg )
    read(arg,'(i10)',iostat=ios) n
    ! An unreadable size falls back to the default chosen below.
    if( ios .ne. 0 ) n = 0
    if( args .gt. 1 )then
      call get_command_argument( 2, arg )
      read(arg,'(i10)',iostat=ios) m
      if( ios .ne. 0 ) m = 0
      if( args .gt. 2 )then
        call get_command_argument( 3, arg )
        if( arg .eq. 'host' .or. arg .eq. 'HOST' )then
          call acc_set_device( acc_device_host )
          print *, 'set host'
        else if( arg .eq. 'nvidia' .or. arg .eq. 'NVIDIA' )then
          call acc_set_device( acc_device_nvidia )
          call acc_init( acc_device_nvidia )
          print *, 'initialize nvidia'
        else
          print *, 'unknown device:', arg
          print *, 'using default'
        endif
      endif
    endif
  endif
  if( n .le. 0 ) n = 1000
  if( m .le. 0 ) m = n

  allocate( aa(n,m,nslices) )
  allocate( bb(n,m,nslices) )
  allocate( aahost(n,m,nslices) )
  allocate( bbhost(n,m,nslices) )

  ! Same initial data for both versions: every slice holds i*1000 + j.
  ! Innermost loop runs over the first index (column-major, stride 1).
  do k = 1, nslices
    do j = 1, m
      do i = 1, n
        aa(i,j,k) = 0
        bb(i,j,k) = i*1000 + j
        aahost(i,j,k) = 0
        bbhost(i,j,k) = i*1000 + j
      enddo
    enddo
  enddo

  w0 = 0.5
  w1 = 0.3
  w2 = 0.2

  call system_clock( count=c1 )
  call smooth( aa, bb, w0, w1, w2, n, m, nslices )
  call system_clock( count=c2 )
  cgpu = c2 - c1
  call smoothhost( aahost, bbhost, w0, w1, w2, n, m, nslices )
  call system_clock( count=c3 )
  chost = c3 - c2

  ! check the results: relative difference where the reference is non-zero,
  ! absolute difference otherwise; only the first ten mismatches are printed.
  errs = 0
  tol = 0.000005
  do k = 1, nslices
    do j = 1, m
      do i = 1, n
        dif = abs(aa(i,j,k) - aahost(i,j,k))
        if( aahost(i,j,k) .ne. 0 ) dif = abs(dif/aahost(i,j,k))
        if( dif .gt. tol )then
          errs = errs + 1
          if( errs .le. 10 )then
            print *, i, j, aa(i,j,k), aahost(i,j,k)
          endif
        endif
      enddo
    enddo
  enddo
  print *, errs, ' errors found'
  ! NOTE(review): system_clock counts are in processor-dependent ticks; the
  ! "microseconds" label assumes a count rate of 1,000,000 per second -
  ! confirm with system_clock(count_rate=...) for the compiler in use.
  print *, cgpu, ' microseconds on GPU'
  print *, chost, ' microseconds on host'
end program main
[localhost@localhost TEST]$ make f3.exe
pgfortran -o f3.exe f3.f90 -ta=nvidia -Minfo=accel -O2
NOTE: your trial license will expire in 14 days, 7.7 hours.
NOTE: your trial license will expire in 14 days, 7.7 hours.
smooth:
24, Generating copyout(a(2:n-1,2:m-1,1:niters))
Generating copyin(b(1:n,1:m,1:niters))
Generating copyout(b(2:n-1,2:m-1,1:niters))
25, Loop is parallelizable
26, Loop is parallelizable
27, Loop is parallelizable
Accelerator kernel generated
25, !$acc do parallel
Cached references to size [18x18] block of ‘b’
26, !$acc do parallel, vector(16)
27, !$acc do vector(16)
33, Loop is parallelizable
34, Loop is parallelizable
Accelerator kernel generated
25, !$acc do parallel, vector(4)
33, !$acc do parallel, vector(16)
34, !$acc do vector(4)