! This code is for testing GPU offloading @NERSC
PROGRAM CK
USE OMP_LIB
IMPLICIT NONE
call GPU_test()
Contains
SUBROUTINE GPU_test()
implicit none
REAL(kind=8), allocatable :: x(:) ! data … cannot be double complex.
REAL(KIND=8) :: startTime ! for timing
INTEGER(kind=8) :: maxloop=100000000 ! the loop number=100M for testing
INTEGER(kind=8) :: i, j ! loop counter
INTEGER(kind=8) :: N = 1000 ! array size
allocate(x(1:N)); x=0d0
! serial …
x=0d0; startTime= omp_get_wtime()
Do i=1,maxloop !
Do j=1,N
x(j) = x(j)+1D0/maxloop ! every element has the same value.
END DO
END DO
print *, 'CPU calculation time (sec) = ', sngl(omp_get_wtime()-startTime) , sum(x)
! threaded ... GPU offloading ....
x=0d0 ; startTime= omp_get_wtime()
!$OMP target teams distribute parallel do reduction(+:x) map(tofrom:x) private(i,j)
Do i=1,maxloop !
Do j=1,N !
x(j) = x(j)+1D0/maxloop ! every element has the same value.
END DO
END DO
!$OMP end target teams distribute parallel do
print *, 'OMP calculation time (sec) = ', sngl(omp_get_wtime()-startTime) , sum(x)
END SUBROUTINE GPU_test
END PROGRAM ck
I am testing GPU acceleration with this code which is basically doing an array reduction after huge number of loops. The error I got: “Accelerator Fatal Error: call to cuMemcpyDtoHAsync returned error 700: Illegal address during kernel execution”, or sometimes it is the cuStreamSynchronize error
if I put num_teams(2) in the directive:
!$OMP target teams distribute parallel do num_teams(2) reduction(+:x) private(i,j)
The code works but it is very slow. If any number is larger than 4 in num_teams(), it will give me the same cuStreamSynchronize/cuMemcpyDtoHAsync error.
Could you please advice me on how to fix the error and speed up this simple test using GPU offloading?
Thank you!