Unable to get correct square root results in kernel

Hi,
I have some trouble using the sqrt() function in the CUDA Fortran kernel. The kernel input is a matrix, calculate the square root of each element in the matrix, but the output matrix is all 0. Below is the kernel code:

module cudaModule
  use cudafor
  contains
  attributes(global) subroutine test_sqrt(jdim,kdim,idim,input,output)
  implicit none
  integer, intent(in) :: jdim,kdim,idim
  real, intent(in) :: input(jdim-1,kdim-1,idim-1)
  real, intent(inout) :: output(jdim-1,kdim-1,idim-1)
  integer gtidx,gtidy,gtidz

  gtidx = blockDim%x * (blockIdx%x - 1) + threadIdx%x
  gtidy = blockDim%y * (blockIdx%y - 1) + threadIdx%y
  gtidz = blockDim%z * (blockIdx%z - 1) + threadIdx%z

  if (gtidy <= idim-1 .and. gtidx <= jdim-1 .and. gtidz <= kdim-1) then
    output(gtidx,gtidz,gtidy)=SQRT(input(gtidx,gtidz,gtidy))
  endif
  end subroutine test_sqrt
end module cudaModule

program testTwoeqn
      use cudafor
      use cudaModule

      implicit none
      integer, parameter :: jdim=17
      integer, parameter :: kdim=25
      integer, parameter :: idim=2
      integer result,i,j,k
      real :: input(jdim-1,kdim-1,idim-1),cpu_output(jdim-1,kdim-1,idim-1), &
        gpu_output(jdim-1,kdim-1,idim-1)

!     device data     
      integer, parameter :: TILE_X = 32
      integer, parameter :: TILE_Y = 32
      integer, parameter :: TILE_Z = 2
      integer, device :: jdim_d, kdim_d, idim_d
      real, device :: input_d(jdim-1,kdim-1,idim-1),output_d(jdim-1,kdim-1,idim-1)
      
      type(dim3) :: grid, tBlock

!     init
      do k=1,kdim-1
        do i=1,idim-1
          do j=1,jdim-1
            input(j,k,i)=j
            cpu_output(j,k,i)=0.0
            gpu_output(j,k,i)=0.0
          enddo
        enddo
      enddo

!     CPU TEST
      do k=1,kdim-1
        do i=1,idim-1
          do j=1,jdim-1
            cpu_output(j,k,i)=sqrt(input(j,k,i))
          enddo
        enddo
      enddo

!     GPU TEST
      jdim_d=jdim
      kdim_d=kdim
      idim_d=idim
      input_d=input 
      output_d=gpu_output
      tBlock = dim3(TILE_X,TILE_Y,TILE_Z)
      grid = dim3(CEILING(REAL(jdim-1)/TILE_X), &
                  CEILING(REAL(kdim-1)/TILE_Y), &
                  CEILING(REAL(idim-1)/TILE_Z))
      call test_sqrt<<<grid, tBlock>>>(jdim_d,kdim_d,idim_d,input_d,output_d)
      gpu_output=output_d
      call compare3D(cpu_output,gpu_output,jdim-1,kdim-1,idim-1,result)
      if (result == 0) then
        print*, "Pass Test!"
      else
        print*, "Test Failed!"
      endif
end program testTwoeqn

Is there anything wrong here?

Thank you for your help.

Hi Zhou Heng,

The main problem here is that you have an illegal schedule so the kernel isn’t getting launched. The maximum number of threads per block is 1024, but you’re trying to use 2048. Adding error checking will show the problem:

       print *, "Block: ", tBlock
       print *, "Grid: ", grid
       call test_sqrt<<<grid, tBlock>>>(jdim,kdim,idim,input_d,output_d)
       istat = cudagetlasterror()
       if (istat .ne. 0) then
          write(*,*) 'Error =' , cudaGetErrorString(istat)
       endif



% a.out
 Block:            32           32            2
 Grid:             1            1            1
 Error =
 invalid configuration argument

Also, it looks like your indexing in the kernel is wrong in that on the host, idim cooresponds to the z dimension and kdim to the y dimension, but in the kernel you have these reversed.

Here’s my changes:

% cat test.f90
module cudaModule
   use cudafor
   contains
   attributes(global) subroutine test_sqrt(jdim,kdim,idim,input,output)
   implicit none
   integer,  value, intent(in) :: jdim,kdim,idim
   real, intent(in) :: input(jdim-1,kdim-1,idim-1)
   real, intent(inout) :: output(jdim-1,kdim-1,idim-1)
   integer gtidx,gtidy,gtidz

   gtidx = blockDim%x * (blockIdx%x - 1) + threadIdx%x
   gtidy = blockDim%y * (blockIdx%y - 1) + threadIdx%y
   gtidz = blockDim%z * (blockIdx%z - 1) + threadIdx%z

   if (gtidz <= idim-1 .and. gtidx <= jdim-1 .and. gtidy <= kdim-1) then
    output(gtidx,gtidy,gtidz)=SQRT(input(gtidx,gtidy,gtidz))
   endif
   end subroutine test_sqrt
 end module cudaModule

 program testTwoeqn
       use cudafor
       use cudaModule

       implicit none
       integer, parameter :: jdim=17
       integer, parameter :: kdim=25
       integer, parameter :: idim=2
       integer result,i,j,k
       real :: input(jdim-1,kdim-1,idim-1),cpu_output(jdim-1,kdim-1,idim-1), &
         gpu_output(jdim-1,kdim-1,idim-1)

 !     device data
       integer, parameter :: TILE_X = 32
       integer, parameter :: TILE_Y = 32
       integer, parameter :: TILE_Z = 1
       integer, device :: jdim_d, kdim_d, idim_d
       real, device :: input_d(jdim-1,kdim-1,idim-1),output_d(jdim-1,kdim-1,idim-1)
       integer :: istat
       type(dim3) :: grid, tBlock

 !     init
       do k=1,kdim-1
         do i=1,idim-1
           do j=1,jdim-1
             input(j,k,i)=j
             cpu_output(j,k,i)=0.0
             gpu_output(j,k,i)=0.0
           enddo
         enddo
       enddo

 !     CPU TEST
       do k=1,kdim-1
         do i=1,idim-1
           do j=1,jdim-1
             cpu_output(j,k,i)=sqrt(input(j,k,i))
           enddo
         enddo
       enddo

 !     GPU TEST
       jdim_d=jdim
       kdim_d=kdim
       idim_d=idim
       input_d=input
       output_d=gpu_output
       tBlock = dim3(TILE_X,TILE_Y,TILE_Z)
       grid = dim3(CEILING(REAL(jdim-1)/TILE_X), &
                   CEILING(REAL(kdim-1)/TILE_Y), &
                   CEILING(REAL(idim-1)/TILE_Z))
       print *, "Block: ", tBlock
       print *, "Grid: ", grid
       call test_sqrt<<<grid, tBlock>>>(jdim,kdim,idim,input_d,output_d)
       istat = cudagetlasterror()
       if (istat .ne. 0) then
          write(*,*) 'Error =' , cudaGetErrorString(istat)
       endif
       gpu_output=output_d
!       call compare3D(cpu_output,gpu_output,jdim-1,kdim-1,idim-1,result)
       result = 0;
       do k=1,kdim-1
         do i=1,idim-1
           do j=1,jdim-1
             if (abs(cpu_output(j,k,i)-gpu_output(j,k,i)) > 0.00000001) then
                result = result + 1
             endif
           enddo
         enddo
       enddo


       if (result == 0) then
         print*, "Pass Test!"
       else
         print*, "Test Failed with result=", result
       endif
 end program testTwoeqn
% pgf90 -Mcuda=cc60 test.f90 ; a.out
 Block:            32           32            1
 Grid:             1            1            1
 Pass Test!

Hope this helps,
Mat

Hi Mat,
Thank you for your reply. It works! The error checking should be add to the code.