Here the whole code (main one, let me know if you also need ran.f90):
module d3bmod
use, intrinsic :: iso_fortran_env
implicit none
integer, private, parameter :: i4=int32
integer, private, parameter :: i8=int64
integer, private, parameter :: r8=real64
complex, private, parameter :: czero=(0.0_r8,0.0_r8)
!
! Put npart in constant memory since it never changes
!
integer(kind=i4), constant, private :: npart
contains
!
! I prefer to keep variables private and use setter subroutines.
!
subroutine setnpart(npartin)
integer(kind=i4), intent(in) :: npartin
npart=npartin
end subroutine setnpart
!
! zero d3b on gpu
!
attributes (global) subroutine zerod3b_gpu(d3b)
complex(kind=r8), intent(inout) :: d3b(4,4,4,*)
integer(kind=i4) :: i,j,k,is,js,ks,ijk
i=blockIdx%x
j=blockIdx%y
k=blockIdx%z
is=threadIdx%x
js=threadIdx%y
ks=threadIdx%z
if ((i.lt.j).and.(j.lt.k)) then
ijk=i+((j-2)*(j-1)+(k-3)*(k-2)*(k-1)/3)/2
d3b(is,js,ks,ijk)=czero
endif
end subroutine zerod3b_gpu
!
! add to d3b on gpu
!
attributes(global) subroutine addtod3b_gpu(sxz,fij,d3b)
complex(kind=r8), intent(in) :: sxz(4,npart,*),fij(*)
complex(kind=r8), intent(inout) :: d3b(4,4,4,*)
integer(kind=i4) :: i,j,k,is,js,ks,ijk
i=blockIdx%x
j=blockIdx%y
k=blockIdx%z
is=threadIdx%x
js=threadIdx%y
ks=threadIdx%z
if ((i.lt.j).and.(j.lt.k)) then
ijk=i+((j-2)*(j-1)+(k-3)*(k-2)*(k-1)/3)/2
d3b(is,js,ks,ijk)=d3b(is,js,ks,ijk)+fij(1)*( &
sxz(is,i,i)*(sxz(js,j,j)*sxz(ks,k,k)-sxz(js,j,k)*sxz(ks,k,j)) &
+sxz(is,i,j)*(sxz(js,j,k)*sxz(ks,k,i)-sxz(js,j,i)*sxz(ks,k,k)) &
+sxz(is,i,k)*(sxz(js,j,i)*sxz(ks,k,j)-sxz(js,j,j)*sxz(ks,k,i)))
endif
end subroutine addtod3b_gpu
!
! add to d3b on cpu
!
subroutine addtod3b(sxz,fij,npart,d3b)
integer(kind=i4), intent(in) :: npart
complex(kind=r8), intent(in) :: sxz(:,:,:),fij
complex(kind=r8), intent(inout) :: d3b(:,:,:,:)
integer(kind=i4) :: i,j,k,js,ks,ijk
ijk=0
do k=3,npart
do j=2,k-1
do i=1,j-1
ijk=ijk+1
do ks=1,4
do js=1,4
d3b(:,js,ks,ijk)=d3b(:,js,ks,ijk)+fij*( &
sxz(:,i,i)*(sxz(js,j,j)*sxz(ks,k,k) &
-sxz(js,j,k)*sxz(ks,k,j))+sxz(:,i,j) &
*(sxz(js,j,k)*sxz(ks,k,i)-sxz(js,j,i)*sxz(ks,k,k)) &
+sxz(:,i,k)*(sxz(js,j,i)*sxz(ks,k,j) &
-sxz(js,j,j)*sxz(ks,k,i)))
enddo
enddo
enddo
enddo
enddo
end subroutine addtod3b
end module d3bmod
program d3btest
use cudafor
use, intrinsic :: iso_fortran_env
use random
use random2
use d3bmod
implicit none
integer, parameter :: i4=int32
integer, parameter :: i8=int64
integer, parameter :: r8=real64
complex(kind=r8), parameter :: czero=(0.0_r8,0.0_r8)
complex(kind=r8), parameter :: cone=(1.0_r8,0.0_r8)
complex(kind=r8), parameter :: ci=(0.0_r8,1.0_r8)
! pin these for better transfer speed
complex(kind=r8), pinned, allocatable :: sxz(:,:,:),d3b(:,:,:,:),fij(:)
complex(kind=r8), allocatable :: d3b2(:,:,:,:)
complex(kind=r8), device, allocatable :: sxz_d(:,:,:)
complex(kind=r8), device, allocatable :: d3b_d(:,:,:,:)
complex(kind=r8), device, allocatable :: fij_d(:)
real(kind=r8) :: rnc(2)
real :: time0,time1,time2
integer(kind=i8) :: iseed
integer(kind=i4) :: npart,npair,ntrip,i,j,k,ijk,js,ks
integer :: istat
type(dim3) :: blocks, threads
iseed=1717171717
npart=40
npair=(npart*(npart-1))/2
ntrip=(npart*(npart-1)*(npart-2))/6
allocate(sxz(4,npart,npart))
allocate(fij(1))
allocate(d3b(4,4,4,ntrip))
allocate(d3b2(4,4,4,ntrip))
allocate(sxz_d(4,npart,npart))
allocate(fij_d(1))
allocate(d3b_d(4,4,4,ntrip))
call setrn(iseed)
call cpu_time(time0)
sxz=reshape(randn(4*npart**2),[4,npart,npart]) &
+ci*reshape(randn(4*npart**2),[4,npart,npart])
rnc=randn(2)
fij(1)=rnc(1)+ci*rnc(2)
call cpu_time(time1)
write(*,*) 'fij',fij,'cpu time for random junk',time1-time0
!
! set up with 64 threads, I think 2 warps of 32 threads will then have the
! same i<j<k if test result in the gpu code, limiting thread divergence.
!
threads=dim3(4,4,4)
blocks=dim3(npart,npart,npart)
!
! Transfer data to gpu
!
call setnpart(npart)
call cpu_time(time0)
sxz_d=sxz
fij_d=fij
call cpu_time(time1)
write (*,*) 'to device time',time1-time0
istat = cudaDeviceSynchronize()
!
! zero d3b then calculate addition from sxz and fij
!
call cpu_time(time0)
call zerod3b_gpu<<<blocks,threads>>>(d3b_d)
call addtod3b_gpu<<<blocks,threads>>>(sxz_d,fij_d,d3b_d)
istat = cudaDeviceSynchronize()
call cpu_time(time1)
write (*,*) 'kernel time',time1-time0
!
! Transfer result back
!
d3b=d3b_d
call cpu_time(time2)
write (*,*) 'from device time',time2-time1
d3b2=czero
call cpu_time(time0)
call addtod3b(sxz,fij(1),npart,d3b2)
call cpu_time(time1)
write (*,*) 'cpu d3b time',time1-time0
write (*,*) d3b(1,2,3,37),d3b2(1,2,3,37)
write (*,*) 'max difference',maxval(abs(d3b-d3b2))
end program d3btest
Stefano