Incorrect result with complex arrays in global subroutine

Hi,
I have been trying to get a simple kernel to multiply two arrays together, element-wise. I can successfully perform the operation on a real array. However, when I make the arrays complex, the subroutine fails. By redefining the subroutine to perform the problem as a loop of 1D row operations, I can obtain the correct result. Furthermore, using accelerator directives also produces the correct result.
The attached code shows my implementation, and if anyone has suggestions as to where this is going wrong I would be very grateful.

 module kernel_def
      contains
      
      !> Element-wise product of two complex matrices on the device:
      !> c_fTemp = c_dTemp * c_eTemp, one thread per element.
      !> NOTE(review): "j" used in the array subscripts below is never
      !> declared or assigned in this subroutine; without IMPLICIT NONE it
      !> is implicitly typed and holds an undefined value, so "iy+j" can
      !> index anywhere -- this is a latent bug (see the reply below).
      attributes(global) subroutine multiplication_complex(c_dTemp, c_eTemp,c_fTemp, m,n)
      integer, value :: n,m
      integer :: ix,iy
      complex,device :: c_dTemp(n,m),c_eTemp(n,m),c_fTemp(n,m)

      !calculate the number of threads and blocks
      ! Map this thread to a unique 1-based (ix,iy) element.
      ix = (blockIdx%x-1)*blockDim%x + threadIdx%x
      iy = (blockIdx%y-1)*blockDim%y + threadIdx%y

      ! NOTE(review): no bounds check -- threads beyond (n,m) write out of
      ! range if the launch grid over-covers the arrays.
      c_fTemp(ix,iy+j)=c_dTemp(ix,iy+j)*c_eTemp(ix,iy+j)       
      
      end subroutine multiplication_complex

      !> Element-wise product of two real matrices on the device:
      !> r_fTemp = r_dTemp * r_eTemp, one thread per element.
      attributes(global) subroutine multiplication_real(r_dTemp, r_eTemp,r_fTemp, m,n)
      integer, value :: n,m
      integer :: ix,iy
      real,device :: r_dTemp(n,m),r_eTemp(n,m),r_fTemp(n,m)

      !calculate the number of threads and blocks
      ! Map this thread to a unique 1-based (ix,iy) element.
      ix = (blockIdx%x-1)*blockDim%x + threadIdx%x
      iy = (blockIdx%y-1)*blockDim%y + threadIdx%y

      r_fTemp(ix,iy)=r_dTemp(ix,iy)*r_eTemp(ix,iy)       

      end subroutine multiplication_real
      end module
      
      program prog
      use cudafor
      use kernel_def
      implicit none
      
      ! NOTE(review): n and tbp are used below in array specification
      ! expressions; standard Fortran requires constants (PARAMETER) there,
      ! though some compilers accept this -- TODO confirm with -Mstandard.
      integer :: n = 1024
      integer :: tbp = 1024
      integer :: i,j
      real :: total
      real, dimension (n,n) :: a,c
      real, device, dimension(n,n) :: r_dTemp,r_eTemp,r_fTemp
      complex, device, dimension(n,n) :: c_dTemp,c_eTemp,c_fTemp
      complex, dimension(n,n) :: c_a
	  
      type(dim3) :: blocksize, gridsize
      
      
      ! NOTE(review): with tbp = 1024 this makes blocksize = (1,1,1) and
      ! gridsize = (1024,1024,1); since the launch below passes them as
      ! <<<blocksize,gridsize>>>, the thread BLOCK becomes 1024x1024 =
      ! 1,048,576 threads, far above the per-block hardware limit -- the
      ! kernels fail with "invalid configuration argument".
      blocksize = dim3(n/tbp,n/tbp,1)
      gridsize=dim3(ceiling(dfloat(n)/dfloat(blocksize%x)),ceiling(dfloat(n)/dfloat(blocksize%y)),1)
     
      ! Initialize device arrays via host-to-device assignment.
      c_fTemp = cmplx(0.0,0.0)
      c_dtemp = cmplx(25.0,0.0)
      c_eTemp = cmplx(60.0,0.0)
      r_fTemp = 0.0
      r_dtemp = 25.0
      r_eTemp = 60.0
      
      ! NOTE(review): no error check after the launch, so the failed
      ! configuration goes unnoticed and c_fTemp stays zero.
      call multiplication_complex<<<blocksize,gridsize>>>(c_dTemp,c_eTemp,c_fTemp, N,n)
      c_a=c_fTemp
      total=sum(c_a*conjg(c_a))
      write(*,*) 'complex total ',total
       
      !accelerator directives
      !$acc region
      c_fTemp=c_dTemp*c_eTemp
      !$acc end region
      c_a=c_fTemp
      total=sum(c_a*conjg(c_a))
      write(*,*) 'acc complex total',total

      
      !real implementation
      call multiplication_real<<<blocksize,gridsize>>>(r_dTemp,r_eTemp,r_fTemp, N,n)
      ! NOTE(review): copy-paste bug -- this copies the COMPLEX result,
      ! not r_fTemp, so the "real total" never checks the real kernel.
      c_a=c_fTemp
      total=sum(c_a)
      write(*,*) 'real total ',total
       
      !accelerator directives
      !$acc region
      r_fTemp=r_dTemp*r_eTemp
      !$acc end region
      a=r_fTemp
      total=sum(a)
      write(*,*) 'acc real total ',total

	end

Hi bullion,

Adding a bit of error checking after your kernel calls, shows the error:

 invalid configuration argument                                                                                                  
 complex total     0.000000

The problem is that your block size is 1024 x 1024, which is too many threads. While the exact maximum number of threads varies per device, on my GTX690 it is 1024 (as seen via pgaccelinfo):

Maximum Threads per Block: 1024
Maximum Block Dimensions: 1024, 1024, 64

So while each of the first two dimensions can have up to 1024 threads, the product of the three dimensions cannot be more than 1024.

To fix your code, set “tbp” to something smaller, like 16. Note that you add “j” to the second index in the complex arrays. This looked like a programming error to me, so I removed it.

Here’s the fixed code:

% cat complex.cuf 
module kernel_def
      contains

      !> Element-wise product of two complex matrices on the device:
      !> c_fTemp = c_dTemp * c_eTemp, one thread per element.
      !> Dummies are declared (n,m); the caller in this thread passes
      !> square arrays, so the m/n order is harmless here.
      attributes(global) subroutine multiplication_complex(c_dTemp, c_eTemp,c_fTemp, m,n)
      integer, value :: n,m
      integer :: ix,iy
      complex,device :: c_dTemp(n,m),c_eTemp(n,m),c_fTemp(n,m)

      ! Map this thread to a unique 1-based (ix,iy) element.
      ix = (blockIdx%x-1)*blockDim%x + threadIdx%x
      iy = (blockIdx%y-1)*blockDim%y + threadIdx%y

      ! Bounds guard: skip threads beyond the array extents so the kernel
      ! is safe when the launch grid over-covers the arrays (e.g. when the
      ! extents are not multiples of the block dimensions).
      if (ix <= n .and. iy <= m) then
         c_fTemp(ix,iy) = c_dTemp(ix,iy) * c_eTemp(ix,iy)
      end if

      end subroutine multiplication_complex

      !> Element-wise product of two real matrices on the device:
      !> r_fTemp = r_dTemp * r_eTemp, one thread per element.
      attributes(global) subroutine multiplication_real(r_dTemp, r_eTemp,r_fTemp, m,n)
      integer, value :: n,m
      integer :: ix,iy
      real,device :: r_dTemp(n,m),r_eTemp(n,m),r_fTemp(n,m)

      ! Map this thread to a unique 1-based (ix,iy) element.
      ix = (blockIdx%x-1)*blockDim%x + threadIdx%x
      iy = (blockIdx%y-1)*blockDim%y + threadIdx%y

      ! Bounds guard (same rationale as the complex kernel above).
      if (ix <= n .and. iy <= m) then
         r_fTemp(ix,iy) = r_dTemp(ix,iy) * r_eTemp(ix,iy)
      end if

      end subroutine multiplication_real
      end module kernel_def
     
      program prog
      use cudafor
      use kernel_def
      implicit none

      ! Problem size and tile edge length; PARAMETERs so they are valid
      ! specification expressions for the array declarations below.
      integer, parameter :: n = 1024
      integer, parameter :: tbp = 16
      integer :: i, j, istat
      real :: total
      real, dimension(n,n) :: a, c
      real, device, dimension(n,n) :: r_dTemp, r_eTemp, r_fTemp
      complex, device, dimension(n,n) :: c_dTemp, c_eTemp, c_fTemp
      complex, dimension(n,n) :: c_a

      type(dim3) :: blocksize, gridsize

      ! NOTE(review): the names are swapped relative to CUDA convention --
      ! "blocksize" (n/tbp = 64 per side) is used as the launch GRID and
      ! "gridsize" (16 per side, 256 threads) as the thread BLOCK in the
      ! <<<grid,block>>> configurations below. 64*16 = 1024 covers every
      ! element exactly.
      blocksize = dim3(n/tbp, n/tbp, 1)
      gridsize = dim3(ceiling(dfloat(n)/dfloat(blocksize%x)), &
                      ceiling(dfloat(n)/dfloat(blocksize%y)), 1)

      ! Initialize device arrays via host-to-device assignment.
      c_fTemp = cmplx(0.0, 0.0)
      c_dTemp = cmplx(25.0, 0.0)
      c_eTemp = cmplx(60.0, 0.0)
      r_fTemp = 0.0
      r_dTemp = 25.0
      r_eTemp = 60.0
      print *, blocksize
      print *, gridsize
      ! Complex kernel, followed by an explicit launch-error check.
      call multiplication_complex<<<blocksize,gridsize>>>(c_dTemp, c_eTemp, c_fTemp, n, n)
      istat = cudaGetLastError()
      if (istat .ne. 0) then
       print *, cudaGetErrorString(istat)
      endif
      c_a = c_fTemp
      total = sum(c_a*conjg(c_a))
      write(*,*) 'complex total ', total

#ifdef _ACCEL 
      !accelerator directives
      !$acc region
      c_fTemp = c_dTemp*c_eTemp
      !$acc end region
      c_a = c_fTemp
      total = sum(c_a*conjg(c_a))
      write(*,*) 'acc complex total', total
#endif

      !real implementation
      call multiplication_real<<<blocksize,gridsize>>>(r_dTemp, r_eTemp, r_fTemp, n, n)
      istat = cudaGetLastError()
      if (istat .ne. 0) then
       print *, cudaGetErrorString(istat)
      endif
      ! BUG FIX: previously this copied the COMPLEX result (c_a = c_fTemp),
      ! so the "real total" never actually verified the real kernel's
      ! output; copy r_fTemp into the real host array instead.
      a = r_fTemp
      total = sum(a)
      write(*,*) 'real total ', total

#ifdef _ACCEL 
      !accelerator directives
      !$acc region
      r_fTemp = r_dTemp*r_eTemp
      !$acc end region
      a = r_fTemp
      total = sum(a)
      write(*,*) 'acc real total ', total
#endif

      end program prog
% pgfortran complex.cuf -Mcuda -Mpreprocess -ta=nvidia 
% a.out
           64           64            1
           16           16            1
 complex total    2.3513098E+12
 acc complex total   2.3513098E+12
 real total    1.5756649E+09
 acc real total    1.5756649E+09

Hope this helps,
Mat

Thanks Mat, that worked!
The reduction of the blocksize to 16 (or 32) for the 2D case also explains why my earlier 1D implementation worked, the number of threads was 1024 in that case. I am still a bit confused as to why the real implementation worked with the large number of threads whereas the complex version failed?
Finally, you're correct regarding the coding bug; the extra j was added when I was testing a change, but I forgot to remove the loop index.
Regards

I am still a bit confused as to why the real implementation worked with the large number of threads whereas the complex version failed?

It didn’t work. You were just printing out the OpenACC results twice.

  • Mat