Larger arrays fail but smaller ones work in CUDA Fortran

Hi NVIDIA Experts,

Please take a look at this simple code, which works for smaller arrays (32,32,32) but not for larger arrays (128,128,128):

module KCUF

implicit none

contains

 subroutine GetArg(argc, argv)
     implicit none
     integer :: argc, ix
     character (len=16), dimension(:), allocatable  :: argv

     argc = command_argument_count()
     allocate(argv(argc))

     do  ix=1, argc
         call get_command_argument(ix, argv(ix))
     end do
 end subroutine GetArg

 elemental subroutine str2int(in_str,out_int,out_stat)
    implicit none
    ! Arguments
    character(len=*),intent(in) :: in_str
    integer*4,intent(out)         :: out_int
    integer*4,intent(out)         :: out_stat

    read(in_str,*,iostat=out_stat)  out_int
  end subroutine str2int

 elemental subroutine str2real(in_str,out_real,out_stat)
    implicit none
    ! Arguments
    character(len=*),intent(in) :: in_str
    real,intent(out)         :: out_real
    integer,intent(out)         :: out_stat

   read(in_str,*,iostat=out_stat)  out_real
  end subroutine str2real


attributes(global) subroutine turb_kernel_kj(IBP1,JBP1,KBP1, HAT)

        implicit none

        integer*4, value :: IBP1, JBP1, KBP1
        real*8 :: HAT(0:IBP1, 0:JBP1, 0:KBP1)

        integer*4 :: I, J, K

        !blockIdx%x starts from 1, which is different from CUDA C/C++ code.
        !threadIdx%x also starts from 1; since HAT starts from 0 in the K and J dimensions, we use threadIdx%x - 1.

        K = (blockIdx%x-1) * blockDim%x + threadIdx%x -1
        J = (blockIdx%y-1) * blockDim%y + threadIdx%y -1

        HAT(0,J,K) = 2.0 * HAT(0+1,J,K) - HAT(0+2,J,K)
        HAT(IBP1,J,K) = 2.0 * HAT(IBP1-1,J,K) - HAT(IBP1-2,J,K)

end subroutine turb_kernel_kj

subroutine turb_kw(IBP1, JBP1, KBP1, HAT)
         !@cuf use cudafor
    implicit none

    integer :: IBP1, KBP1, JBP1, istat
    REAL(8),  DIMENSION(:,:,:) :: HAT
    REAL(8), device, DIMENSION(0:IBP1,0:JBP1,0:KBP1) :: D_HAT

    integer*4 ::  blockDim_x=16, blockDim_y=16
    integer*4 :: gridDim_x, gridDim_y

    type(dim3) :: grid, block
    type(cudaEvent) :: start, stop

    !KBP1+1 is the real dimension length, since D_HAT here starts from 0 rather than the Fortran default of 1.
    !For example, if KBP1=16, D_HAT really has 17 elements in the K dimension, so gridDim_x should be (16+1+16-1)/16 = 2. If we do not use
    !KBP1+1, we will have gridDim_x = 1, the kernel will need a stride, and things become confusing  --- WHG 01/02/2024

    D_HAT = HAT


    !Call KJ kernel
    gridDim_x = (KBP1+1+blockDim_x-1)/blockDim_x
    gridDim_y = (JBP1+1+blockDim_y-1)/blockDim_y
    grid = dim3(gridDim_x, gridDim_y,1)
    block= dim3(blockDim_x, blockDim_y,1)
    call turb_kernel_kj<<<grid,block>>>(IBP1, JBP1, KBP1, D_HAT)
    HAT=D_HAT
    istat = cudaDeviceSynchronize()
end subroutine turb_kw

Subroutine test_gpu(IBP1, KBP1, JBP1)

   !@cuf use cudafor
   IMPLICIT NONE

    integer :: i,j,k
    integer :: IBP1, KBP1, JBP1, istat
    REAL(8), DIMENSION(0:IBP1,0:JBP1,0:KBP1) :: HAT


   real*8 :: Terror=0.0
   real e, etime, t(2), start_time, stop_time

   !istat=cudaMemcpy(a, h_a, N_i*N_j*N_k)
   print*, size(HAT), sizeof(HAT)

   HAT = 0.0

   call cpu_time(start_time)
   print *, 'before kernel cpu time :', start_time


  CALL turb_kw(IBP1, JBP1, KBP1, HAT)

  call cpu_time(stop_time)
  print *, 'test_128 after kernel cpu time :', stop_time

  !e = etime(t) 
  !print *, 'Time elapsed after kernal :', e, ',user:', t(1), ', sys:', t(2)

  END subroutine

end module

program test_no_mpi

Compiled with:
nvfortran -g -m64 -O0 -Wall -Werror -gpu=ccnative -cuda -traceback -Mrecursive -o test_128 test_128.f90

works for small arrays:

test_device# ./test_128 32 32 32
32 32 32
35937 287496
before kernel cpu time : 0.000000
test_128 after kernel cpu time : 0.3167009
test_device# ./test_128 64 64 64
64 64 64
274625 2197000
before kernel cpu time : 0.000000
test_128 after kernel cpu time : 0.3029881

But it does not work for larger arrays:

test_device# ./test_128 128 128 128
128 128 128
2146689 17173512
before kernel cpu time : 0.000000
0: copyout Memcpy (host=0x2d84de0, dev=0x7f3012000000, size=17173512) FAILED: 700(an illegal memory access was encountered)

Note: setting export NV_ACC_CUDA_STACKSIZE=64MB does not help.

Thank you so much!

Hi honggangwang1979,

It looks like the code was cut off so I can't compile it myself, but my best guess is that it's an out-of-bounds access issue. You have a schedule of (9,9)x(16,16), i.e. 144 threads in the x-dimension and 144 in the y-dimension, while the valid indices only run from 0 to 128 (129 elements per dimension). Since there's no bounds check in the kernel, the threads that compute J or K in the range 129 to 143 access memory beyond the end of the array.

Try adding an if condition to check the bounds. Something like:

        K = (blockIdx%x-1) * blockDim%x + threadIdx%x -1
        J = (blockIdx%y-1) * blockDim%y + threadIdx%y -1

        if (J.le.JBP1.and.K.le.KBP1) then
          HAT(0,J,K) = 2.0 * HAT(0+1,J,K) - HAT(0+2,J,K)
          HAT(IBP1,J,K) = 2.0 * HAT(IBP1-1,J,K) - HAT(IBP1-2,J,K)
        end if

Super thanks.

It works!

This reminds me of the same 700 error I got when I was using !$cuf and/or !$acc directives. So do they check the bounds in the automatically generated kernels?

Thanks.

Yes, with OpenACC and CUF kernels, the bounds check is implicitly added.
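
For example, the boundary update from your turb_kernel_kj could be written as a CUF kernel. A minimal sketch, assuming the same D_HAT, IBP1, JBP1, and KBP1 as in your turb_kw (plus integer loop variables J and K); the compiler derives the launch configuration and the bounds checks from the loop limits:

    !$cuf kernel do(2) <<<*,*>>>
    do K = 0, KBP1
       do J = 0, JBP1
          D_HAT(0,J,K)    = 2.0 * D_HAT(1,J,K)      - D_HAT(2,J,K)
          D_HAT(IBP1,J,K) = 2.0 * D_HAT(IBP1-1,J,K) - D_HAT(IBP1-2,J,K)
       end do
    end do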

Note, an illegal access error is very generic and just means a bad address was accessed. So the earlier errors, especially if you're referring to the issues you were having with pointers, were likely unrelated.
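
If you hit a generic 700 again, it helps to check the status right after the launch to narrow down where the bad access happens. Something like:

    call turb_kernel_kj<<<grid,block>>>(IBP1, JBP1, KBP1, D_HAT)
    istat = cudaGetLastError()
    if (istat /= cudaSuccess) print *, 'launch error: ', cudaGetErrorString(istat)
    istat = cudaDeviceSynchronize()
    if (istat /= cudaSuccess) print *, 'kernel error: ', cudaGetErrorString(istat)

Running the binary under compute-sanitizer (for example, "compute-sanitizer ./test_128 128 128 128") will also report the exact kernel and thread that touched the bad address.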

OK, thank you so much!

Hi Mat, would you please help me look at this code? I cannot get the correct answer. Thanks.

test# cat test.f90
attributes(global) subroutine mass_kernel_5(N_TOTAL_SCALARS, IBAR, JBAR, KBAR, IBP1, JBP1, KBP1, SOLID, RHOS, ZZS )

  implicit none

integer*4 :: I, J, K,N

integer, value :: IBAR, JBAR, KBAR, IBP1, JBP1, KBP1, N_TOTAL_SCALARS

LOGICAL, intent(in) :: SOLID(0:IBP1,0:JBP1,0:KBP1)
REAL(8), intent(in), DIMENSION(0:IBP1,0:JBP1,0:KBP1) :: RHOS
REAL(8), intent(inout), DIMENSION(0:IBP1,0:JBP1,0:KBP1,1:N_TOTAL_SCALARS) :: ZZS

    !blockIdx%x starts from 1, which is different from CUDA C/C++ code.
    !threadIdx%x also starts from 1; in this case the main arrays start from 1 in the K, J, I, N dimensions, so we use threadIdx%x as-is.
    !Note that some arrays start from 0, but they are not main arrays.

I = (blockIdx%x-1) * blockDim%x + threadIdx%x 
J = (blockIdx%y-1) * blockDim%y + threadIdx%y
K = (blockIdx%z-1) * blockDim%z + threadIdx%z

if (I .ge. 1 .and. I .le. IBAR &
        .and. J .ge. 1 .and. J .le. JBAR &
        .and. K .ge. 1 .and. K .le. KBAR &
        .and. .not. SOLID(I,J,K)) then
            DO N=1, N_TOTAL_SCALARS
               ZZS(I,J,K,N) = ZZS(I,J,K,N)/RHOS(I,J,K)
            ENDDO
endif

end subroutine mass_kernel_5

program mass_kw_5

!@cuf use cudafor

implicit none

integer, PARAMETER :: IBAR=36, JBAR=24, KBAR=24, IBP1=37, JBP1=25, KBP1=25, N_TOTAL_SCALARS=3

REAL(8),  DIMENSION(0:IBP1,0:JBP1,0:KBP1,1:N_TOTAL_SCALARS):: ZZS
REAL(8),  DIMENSION(0:IBP1,0:JBP1,0:KBP1) :: RHOS
LOGICAL :: SOLID(0:IBP1,0:JBP1,0:KBP1)

REAL(8), device, DIMENSION(0:IBP1,0:JBP1,0:KBP1,1:N_TOTAL_SCALARS) :: D_ZZS
REAL(8), device, DIMENSION(0:IBP1,0:JBP1,0:KBP1) :: D_RHOS
LOGICAL, device :: D_SOLID(0:IBP1,0:JBP1,0:KBP1)

integer :: I,J,K,istat

integer*4 ::  blockDim_x=8, blockDim_y=8, blockDim_z=8
integer*4 :: gridDim_x, gridDim_y, gridDim_z

type(dim3) :: grid, block

SOLID = .false.

ZZS=0.0
D_ZZS = ZZS
ZZS(1,13,3,1)=0.99576610439851176
ZZS(1,13,3,2)=0.0
ZZS(1,13,3,3)=0.030113580765101414

RHOS=0.92710234959020099

D_SOLID = SOLID
D_RHOS= RHOS

gridDim_x = (IBAR+blockDim_x-1)/blockDim_x
gridDim_y = (JBAR+blockDim_y-1)/blockDim_y
gridDim_z = (KBAR+blockDim_z-1)/blockDim_z

grid = dim3(gridDim_x, gridDim_y,gridDim_z)
block= dim3(blockDim_x, blockDim_y,blockDim_z)

!call mass_kernel_5<<<grid,block,0,cudaforGetDefaultStream()>>>(N_TOTAL_SCALARS, &
call mass_kernel_5<<<grid,block>>>(N_TOTAL_SCALARS, &
        IBAR, JBAR, KBAR, IBP1, JBP1, KBP1, D_SOLID, D_RHOS, D_ZZS )

istat = cudaDeviceSynchronize()

ZZS= D_ZZS

print *, ZZS(1,13,3,1), ZZS(1,13,3,2), ZZS(1,13,3,3)

end program mass_kw_5

compiled with:
nvfortran -g -pg -O0 -Mpreprocess -Mlarge_arrays -m64 -Wall -Werror -gpu=ccall,nomanaged,implicitsections -stdpar -traceback -Minfo=accel -cpp -cuda -o test ./test.f90

./test
0.000000000000000 0.000000000000000 0.000000000000000

Thanks.

You need to move the copy to the device for ZZS to after you initialize the elements at (1,13,3,:); otherwise the device copy D_ZZS will be all zeros. Just move the D_ZZS=ZZS assignment down to just before D_SOLID=SOLID.
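
In other words, the initialization sequence becomes:

ZZS=0.0
ZZS(1,13,3,1)=0.99576610439851176
ZZS(1,13,3,2)=0.0
ZZS(1,13,3,3)=0.030113580765101414

RHOS=0.92710234959020099

D_ZZS = ZZS    ! copy to the device after the host values are set
D_SOLID = SOLID
D_RHOS= RHOS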

./test
1.074062780229849 0.000000000000000 3.2481400035077351E-002

Oh sorry, my bad.

Thank you so much!