Failure of the CUDA Fortran code during runtime

Hi,
Following is my code in CUDA fortran. I am using NVIDIA Tesla S1070 for running my code. Here I am using global memory to transfer my arrays. The code works fine when the device arrays are small, but it fails during the runtime for larger arrays. I don’t think the size of the arrays is big enough to occupy the device memory. I have removed details of calculation to make code shorter.

!
module fouriercalc
Use ewald
Use xyz
Use sp
Use cudafor
!Storing the constant arrays in constant memory of GPUs
implicit none
integer, constant :: nwallsdev
real, constant :: alphadev
real, constant :: rrlcutsqdev
real, constant :: axdev, aydev, azdev
real, constant :: box_xdev, box_ydev, box_zdev
real, constant, dimension(3) :: Xdevnew, Ydevnew, Zdevnew
real, constant, dimension(3) :: Xdevold, Ydevold, Zdevold
real, constant, dimension(3) :: chargedev
integer, constant :: Nionpmdev
integer, constant :: Kxmaxdev
integer, constant :: Kymaxdev
integer, constant :: Kzmaxdev
integer, constant :: Nkvecdev
integer, device, allocatable, dimension(:) :: KXdev, KYdev, KZdev
!complex, device, allocatable, dimension(:,:) :: EXPXdev, EXPYdev, EXPZdev
!complex, device, allocatable, dimension(:,:) :: EXPX_Tdev, EXPY_Tdev, EXPZ_Tdev
complex, device, allocatable, dimension(:) :: SUMQEXPVdev
complex, device, allocatable, dimension(:) :: SUMQEXPV_Tdev
integer, constant :: flagdev2, flagdev1
contains

subroutine fourier_move(mol, icb, de)

Use sp
Use ewald
Use xyz

implicit none

! This routine calculates the change in the Fourier term of the
! Ewald.

integer :: mol
integer :: icb
real :: de


integer :: i, k
integer :: iflag
integer, device :: iflagdev
integer :: kkx, kky, kkz
real :: xii, yii, zii
real :: b
complex :: expv_diff
real, device, allocatable, dimension(:) :: dedev
type(dim3) :: dimGrid, dimBlock


de = 0.0
!dedev = 0.0
if( icb < 0 ) then
iflag = 1
flagdev2 = 1
icb = -icb
SUMQEXPV_T(icb,:) = SUMQEXPV_T(1,:)
else
iflag = 0
flagdev2 = 0
SUMQEXPV_T(icb,:) = SUMQEXPV(:)
end if

EXPX_T(icb,:,0) = (1.0,0.0)
EXPY_T(icb,:,0) = (1.0,0.0)
EXPZ_T(icb,:,0) = (1.0,0.0)
allocate(dedev(1:Nkvec))
dedev = 0.0
!!!
! Assigning the values for device variables

do i = 1, Nionpm

Xdevnew(i) = Xi(mol,i)
Ydevnew(i) = Yi(mol,i)
Zdevnew(i) = Zi(mol,i)

end do

do i = 1, Nkvec

SUMQEXPVdev(i) = SUMQEXPV(i)
SUMQEXPV_Tdev(i) = SUMQEXPV_T(icb,i)
end do

! Allocating the number of threads and invoking the kernel

dimGrid = dim3(1, 1, 1)
dimBlock = dim3(Nkvec, 1, 1)

call fourier_kernel<<<dimGrid,dimBlock>>>( SUMQEXPVdev, &
SUMQEXPV_Tdev, KXdev, KYdev, KZdev, dedev)

! Assigning the output back to the host variable
SUMQEXPV_T(icb,:) = SUMQEXPV_Tdev(:)



de = sum(dedev(1:Nkvec/2))
de = de * 4.0 * pi / volume_e
!write(,) de, Nmol
return

end subroutine fourier_move

!!!
!Starting with the Kernel subroutine

attributes(global) subroutine fourier_kernel( SUMQEXPVdev, &
SUMQEXPV_T, KXdev, KYdev, KZdev, de )
implicit none
integer :: k, i, j
real :: dethread
real :: de(Nkvecdev)
integer :: KXdev(Nkvecdev), KYdev(Nkvecdev), KZdev(Nkvecdev)
!complex :: EXPX(Nionpmdev, 0:kxmaxdev), EXPY(Nionpmdev, -kymaxdev:kymaxdev), EXPZ(Nionpmdev, -kzmaxdev:kzmaxdev)
!complex :: EXPX_T(Nionpmdev, 0:kxmaxdev), EXPY_T(Nionpmdev, -kymaxdev:kymaxdev), EXPZ_T(Nionpmdev, -kzmaxdev:kzmaxdev)
complex :: EXPX, EXPY, EXPZ
complex :: EXPX_T, EXPY_T, EXPZ_T
complex :: SUMQEXPVdev(Nkvecdev)
complex :: SUMQEXPV
complex :: SUMQEXPV_TS
complex :: SUMQEXPV_T(Nkvecdev)
complex :: expv_diff
integer :: kkx, kky, kkz
real :: b
real :: mycol, ksq

j = threadidx%x

call syncthreads()

SUMQEXPV_T(k) = SUMQEXPV_TS
de(j) = de(j) + dethread

!end do

end subroutine fourier_kernel

end module fouriercalc

Regards
Kaustubh

Hi Kaustubh,

Sorry but I don’t have enough information to give you an answer as to why this code would be failing. Can you send a complete reproducing example (along with the missing module files and host driver routine) to PGI Customer Service (trs@pgroup.com)?

What is the exact error message that you are getting? Where in your program does it fail?

Also, you can try adding error handling after your host to device copies and kernel launches to help find the cause.

istat = cudathreadsynchronize()
errCode = cudaGetLastError()
if (errCode .gt. 0) then
       print *, errCode
       stop 'Error! Kernel failed!'
endif
  • Mat