Hi,
Following is my code in CUDA fortran. I am using NVIDIA Tesla S1070 for running my code. Here I am using global memory to transfer my arrays. The code works fine when the device arrays are small, but it fails during the runtime for larger arrays. I don’t think the size of the arrays is big enough to occupy the device memory. I have removed details of calculation to make code shorter.
!
module fouriercalc
Use ewald
Use xyz
Use sp
Use cudafor
!Storing the constant arrays in constant memory of GPUs
implicit none
integer, constant :: nwallsdev
real, constant :: alphadev
real, constant :: rrlcutsqdev
real, constant :: axdev, aydev, azdev
real, constant :: box_xdev, box_ydev, box_zdev
real, constant, dimension(3) :: Xdevnew, Ydevnew, Zdevnew
real, constant, dimension(3) :: Xdevold, Ydevold, Zdevold
real, constant, dimension(3) :: chargedev
integer, constant :: Nionpmdev
integer, constant :: Kxmaxdev
integer, constant :: Kymaxdev
integer, constant :: Kzmaxdev
integer, constant :: Nkvecdev
integer, device, allocatable, dimension(:) :: KXdev, KYdev, KZdev
!complex, device, allocatable, dimension(:,:) :: EXPXdev, EXPYdev, EXPZdev
!complex, device, allocatable, dimension(:,:) :: EXPX_Tdev, EXPY_Tdev, EXPZ_Tdev
complex, device, allocatable, dimension(:) :: SUMQEXPVdev
complex, device, allocatable, dimension(:) :: SUMQEXPV_Tdev
integer, constant :: flagdev2, flagdev1
contains
subroutine fourier_move(mol, icb, de)
Use sp
Use ewald
Use xyz
implicit none
! This routine calculates the change in the Fourier term of the
! Ewald.
integer :: mol
integer :: icb
real :: de
integer :: i, k
integer :: iflag
integer, device :: iflagdev
integer :: kkx, kky, kkz
real :: xii, yii, zii
real :: b
complex :: expv_diff
real, device, allocatable, dimension(:) :: dedev
type(dim3) :: dimGrid, dimBlock
de = 0.0
!dedev = 0.0
if( icb < 0 ) then
iflag = 1
flagdev2 = 1
icb = -icb
SUMQEXPV_T(icb,:) = SUMQEXPV_T(1,:)
else
iflag = 0
flagdev2 = 0
SUMQEXPV_T(icb,:) = SUMQEXPV(:)
end if
EXPX_T(icb,:,0) = (1.0,0.0)
EXPY_T(icb,:,0) = (1.0,0.0)
EXPZ_T(icb,:,0) = (1.0,0.0)
allocate(dedev(1:Nkvec))
dedev = 0.0
!!!
! Assigning the values for device variables
do i = 1, Nionpm
Xdevnew(i) = Xi(mol,i)
Ydevnew(i) = Yi(mol,i)
Zdevnew(i) = Zi(mol,i)
end do
do i = 1, Nkvec
SUMQEXPVdev(i) = SUMQEXPV(i)
SUMQEXPV_Tdev(i) = SUMQEXPV_T(icb,i)
end do
! Allocating the number of threads and invoking the kernel
dimGrid = dim3(1, 1, 1)
dimBlock = dim3(Nkvec, 1, 1)
call fourier_kernel<<<dimGrid,dimBlock>>>( SUMQEXPVdev, &
SUMQEXPV_Tdev, KXdev, KYdev, KZdev, dedev)
! Assigning the output back to the host variable
SUMQEXPV_T(icb,:) = SUMQEXPV_Tdev(:)
de = sum(dedev(1:Nkvec/2))
de = de * 4.0 * pi / volume_e
!write(,) de, Nmol
return
end subroutine fourier_move
!!!
!Starting with the Kernel subroutine
attributes(global) subroutine fourier_kernel( SUMQEXPVdev, &
SUMQEXPV_T, KXdev, KYdev, KZdev, de )
implicit none
integer :: k, i, j
real :: dethread
real :: de(Nkvecdev)
integer :: KXdev(Nkvecdev), KYdev(Nkvecdev), KZdev(Nkvecdev)
!complex :: EXPX(Nionpmdev, 0:kxmaxdev), EXPY(Nionpmdev, -kymaxdev:kymaxdev), EXPZ(Nionpmdev, -kzmaxdev:kzmaxdev)
!complex :: EXPX_T(Nionpmdev, 0:kxmaxdev), EXPY_T(Nionpmdev, -kymaxdev:kymaxdev), EXPZ_T(Nionpmdev, -kzmaxdev:kzmaxdev)
complex :: EXPX, EXPY, EXPZ
complex :: EXPX_T, EXPY_T, EXPZ_T
complex :: SUMQEXPVdev(Nkvecdev)
complex :: SUMQEXPV
complex :: SUMQEXPV_TS
complex :: SUMQEXPV_T(Nkvecdev)
complex :: expv_diff
integer :: kkx, kky, kkz
real :: b
real :: mycol, ksq
j = threadidx%x
call syncthreads()
SUMQEXPV_T(k) = SUMQEXPV_TS
de(j) = de(j) + dethread
!end do
end subroutine fourier_kernel
end module fouriercalc
Regards
Kaustubh