Hi,
I have been trying to get a simple kernel to multiply two arrays together, element wise. I can successfully perform the operation on a real array. However, when I make the arrays complex the subroutine fails. By redefining subroutine to perform the problem in terms of a loop of 1D row operations I can obtain the correct result. Furthermore, using directives one obtains the correct result
The attached code shows my implementation, and if anyone has suggestions as to where this is going wrong I would be very grateful.
module kernel_def
contains
attributes(global) subroutine multiplication_complex(c_dTemp, c_eTemp,c_fTemp, m,n)
integer, value :: n,m
integer :: ix,iy
complex,device :: c_dTemp(n,m),c_eTemp(n,m),c_fTemp(n,m)
!calculate the number of threads and blocks
ix = (blockIdx%x-1)*blockDim%x + threadIdx%x
iy = (blockIdx%y-1)*blockDim%y + threadIdx%y
c_fTemp(ix,iy+j)=c_dTemp(ix,iy+j)*c_eTemp(ix,iy+j)
end subroutine multiplication_complex
attributes(global) subroutine multiplication_real(r_dTemp, r_eTemp,r_fTemp, m,n)
integer, value :: n,m
integer :: ix,iy
real,device :: r_dTemp(n,m),r_eTemp(n,m),r_fTemp(n,m)
!calculate the number of threads and blocks
ix = (blockIdx%x-1)*blockDim%x + threadIdx%x
iy = (blockIdx%y-1)*blockDim%y + threadIdx%y
r_fTemp(ix,iy)=r_dTemp(ix,iy)*r_eTemp(ix,iy)
end subroutine multiplication_real
end module
program prog
use cudafor
use kernel_def
implicit none
integer :: n = 1024
integer :: tbp = 1024
integer :: i,j
real :: total
real, dimension (n,n) :: a,c
real, device, dimension(n,n) :: r_dTemp,r_eTemp,r_fTemp
complex, device, dimension(n,n) :: c_dTemp,c_eTemp,c_fTemp
complex, dimension(n,n) :: c_a
type(dim3) :: blocksize, gridsize
blocksize = dim3(n/tbp,n/tbp,1)
gridsize=dim3(ceiling(dfloat(n)/dfloat(blocksize%x)),ceiling(dfloat(n)/dfloat(blocksize%y)),1)
c_fTemp = cmplx(0.0,0.0)
c_dtemp = cmplx(25.0,0.0)
c_eTemp = cmplx(60.0,0.0)
r_fTemp = 0.0
r_dtemp = 25.0
r_eTemp = 60.0
call multiplication_complex<<<blocksize,gridsize>>>(c_dTemp,c_eTemp,c_fTemp, N,n)
c_a=c_fTemp
total=sum(c_a*conjg(c_a))
write(*,*) 'complex total ',total
!accelerator directives
!$acc region
c_fTemp=c_dTemp*c_eTemp
!$acc end region
c_a=c_fTemp
total=sum(c_a*conjg(c_a))
write(*,*) 'acc complex total',total
!real implementation
call multiplication_real<<<blocksize,gridsize>>>(r_dTemp,r_eTemp,r_fTemp, N,n)
c_a=c_fTemp
total=sum(c_a)
write(*,*) 'real total ',total
!accelerator directives
!$acc region
r_fTemp=r_dTemp*r_eTemp
!$acc end region
a=r_fTemp
total=sum(a)
write(*,*) 'acc real total ',total
end