Hi NVIDIA Experts,
Look at this simple code which works for smaller arrays (32, 32,32) but not for bigger arrays (128,128,128):
module KCUF
implicit none
contains
subroutine GetArg(argc, argv)
implicit none
integer :: argc, ix
character (len=16), dimension(:), allocatable :: argv
argc = command_argument_count()
allocate(argv(argc))
do ix=1, argc
call get_command_argument(ix, argv(ix))
end do
end subroutine GetArg
elemental subroutine str2int(in_str,out_int,out_stat)
implicit none
! Arguments
character(len=*),intent(in) :: in_str
integer*4,intent(out) :: out_int
integer*4,intent(out) :: out_stat
read(in_str,*,iostat=out_stat) out_int
end subroutine str2int
elemental subroutine str2real(in_str,out_real,out_stat)
implicit none
! Arguments
character(len=*),intent(in) :: in_str
real,intent(out) :: out_real
integer,intent(out) :: out_stat
read(in_str,*,iostat=out_stat) out_real
end subroutine str2real
attributes(global) subroutine turb_kernel_kj(IBP1,JBP1,KBP1, HAT)
implicit none
real*8 :: HAT(0:IBP1, 0:JBP1, 0:KBP1)
integer*4, value :: IBP1, JBP1, KBP1
integer*4 :: I, J, K
!blockIdx%x starts from 1, which is different from cuda c/c++ code
!threadIdx%x also starts from 1, in this case HAT starts from 0 in k and j dimentions, so we use threadIdx%x -1
K = (blockIdx%x-1) * blockDim%x + threadIdx%x -1
J = (blockIdx%y-1) * blockDim%y + threadIdx%y -1
HAT(0,J,K) = 2.0 * HAT(0+1,J,K) - HAT(0+2,J,K)
HAT(IBP1,J,K) = 2.0 * HAT(IBP1-1,J,K) - HAT(IBP1-2,J,K)
end subroutine turb_kernel_kj
subroutine turb_kw(IBP1, JBP1, KBP1, HAT)
!@cuf use cudafor
implicit none
integer :: IBP1, KBP1, JBP1, istat
REAL(8), DIMENSION(:,:,:) :: HAT
REAL(8), device, DIMENSION(0:IBP1,0:JBP1,0:KBP1) :: D_HAT
integer*4 :: blockDim_x=16, blockDim_y=16
integer*4 :: gridDim_x, gridDim_y
type(dim3) :: grid, block
type(cudaEvent) :: start, stop
!KBP1+1 is the real dimension length since here D_HAT starts from 0, not default 1 in Fortran
!For example, if KBP1=16, D_HAT really has 17 elements in K dimension, so girdDim_x should be (16+1+16-1/16)=2, if we do not use
!KBP1+1, we will have gridDim_x = 1, the kernel will need stride, and things become confusing --- WHG 01/02/2024
D_HAT = HAT
!Call KJ kernel
gridDim_x = (KBP1+1+blockDim_x-1)/blockDim_x
gridDim_y = (JBP1+1+blockDim_y-1)/blockDim_y
grid = dim3(gridDim_x, gridDim_y,1)
block= dim3(blockDim_x, blockDim_y,1)
call turb_kernel_kj<<<grid,block>>>(IBP1, JBP1, KBP1, D_HAT)
HAT=D_HAT
istat = cudaDeviceSynchronize()
end subroutine turb_kw
Subroutine test_gpu(IBP1, KBP1, JBP1)
!@cuf use cudafor
IMPLICIT NONE
integer :: i,j,k
integer :: IBP1, KBP1, JBP1, istat
REAL(8), DIMENSION(0:IBP1,0:JBP1,0:KBP1) :: HAT
real*8 :: Terror=0.0
real e, etime, t(2), start_time, stop_time
!istat=cudaMemcpy(a, h_a, N_i*N_j*N_k)
print*, size(HAT), sizeof(HAT)
HAT = 0.0
call cpu_time(start_time)
print *, 'before kernal cpu time :', start_time
CALL turb_kw(IBP1, JBP1, KBP1, HAT)
call cpu_time(stop_time)
print *, 'test_128 after kernal cpu time :',stop_time
!e = etime(t)
!print *, 'Time elapsed after kernal :', e, ',user:', t(1), ', sys:', t(2)
END subroutine
end module
program test_no_mpi
Compiled with:
nvfortran -g -m64 -O0 -Wall -Werror -gpu=ccnative -cuda -traceback -Mrecursive -o test_128 test_128.f90
works for small arrays:
test_device# ./test_128 32 32 32
32 32 32
35937 287496
before kernal cpu time : 0.000000
test_128 after kernal cpu time : 0.3167009
(rapids) root@n6k4hh8336:/notebooks/CUDA/cuda_fortran/test_device# ./test_128 64 64 64
64 64 64
274625 2197000
before kernal cpu time : 0.000000
test_128 after kernal cpu time : 0.3029881
But it does not work for larger arrays:
test_device# ./test_128 128 128 128
128 128 128
2146689 17173512
before kernal cpu time : 0.000000
0: copyout Memcpy (host=0x2d84de0, dev=0x7f3012000000, size=17173512) FAILED: 700(an illegal memory access was encountered)
Note: export NV_ACC_CUDA_STACKSIZE=64MB does not work.
Thank you so much!