Invalid Read in Kernel

Trying to get this very simple example to run. The following code fails with an unspecified launch failure. It looks like the kernel doesn’t get initialized properly. When trying to do device debugging on DDT (which is supposed to work now?) I get ‘Cannot find bounds of current function’ when trying to step into the kernel. It seems to me either n, m or i, j don’t get initialized properly, but I can’t for the better of me figure out why. Emulated mode doesn’t fail. Other programs, much larger than this, running on the same system, compiled with the same settings, don’t fail.

PGI Version 14.2
DDT Version 4.2-PR-36863

Makefile

FFLAGS= -g -Mcuda=cc3x -ta=nvidia,cc3x,keepgpu,keepbin,time -Minfo=accel,inline,ipa -Mneginfo -Minform=inform -I/usr/local/include -r8
LDFLAGS= -g -Mcuda=cc3x -ta=nvidia,cc3x,time -L/usr/local/lib -Minfo=accel,inline -Mneginfo -lpp
%.o: %.F90
	@$(FC) $(FFLAGS) -c $< -o $@

Code

#define CUDA_BLOCKSIZE_X 32
#define CUDA_BLOCKSIZE_Y 32
attributes(global) subroutine stencil(n, m, a, b)
	use cudafor
	implicit none
	integer(4), intent(in) ,value :: n, m
	real(8), intent(in) :: a(n, m)
	real(8), intent(out) :: b(n, m)
	integer(4) :: i, j

	i = (blockidx%x - 1) * blockDim%x + threadidx%x
	j = (blockidx%y - 1) * blockDim%y + threadidx%y
	if (i .GT. 3 .OR. i .LT. 1 .OR. j .GT. 4 .OR. j .LT. 1) then
		return
	end if
	b(i,j) = a(i,j)
end subroutine

subroutine stencil_wrapper(n, m, a, b)
	use cudafor
	implicit none
	integer(4), intent(in) :: n, m
	real(8), intent(in) :: a(n, m)
	real(8) ,device :: a_d(n, m)
	real(8), intent(out) :: b(n, m)
	real(8) ,device :: b_d(n, m)
	type(dim3) :: cugrid, cublock
	integer(4) :: cugridSizeX, cugridSizeY, cugridSizeZ, cuerror
	a_d(:,:) = a(:,:)
	b_d(:,:) = 0

	cugridSizeX = ceiling(real(4) / real(CUDA_BLOCKSIZE_X))
	cugridSizeY = ceiling(real(4) / real(CUDA_BLOCKSIZE_Y))
	cugridSizeZ = 1
	cugrid = dim3(cugridSizeX, cugridSizeY, cugridSizeZ)
	cublock = dim3(CUDA_BLOCKSIZE_X, CUDA_BLOCKSIZE_Y, 1)
	write(0,*) 'calling kernel stencil_wrapper with grid size', cugridSizeX, cugridSizeY
	call stencil <<< cugrid, cublock >>>(n, m, a_d(:,:), b_d(:,:))
	cuerror = cudaThreadSynchronize()
	if(cuerror .NE. cudaSuccess) then
		write(0, *) 'CUDA error in kernel stencil:', cudaGetErrorString(cuerror)
		stop 1
	end if
	b(:,:) = b_d(:,:)
end subroutine

program main
	implicit none
	real(8), dimension(:,:), allocatable :: a, b
	integer(4) :: n, m

	n = 4
	m = 4
	allocate(a(n,m))
	allocate(b(n,m))
	a(:,:) = 1.0d0
	b(:,:) = 0.0d0
	call stencil_wrapper(n, m, a, b)
	write(6,*) b
	deallocate(a)
	deallocate(b)

	stop
end program main

Output

calling kernel stencil_wrapper with grid size 1 1
CUDA error in kernel stencil:
unspecified launch failure
Warning: ieee_inexact is signaling
1

cuda memcheck
Lots of errors of the following form

========= Invalid global read of size 8
========= at 0x00000638 in /home0/usr4/mueller-m-ab/hybrid/my_example_stencil_project/build/gpu/source/example.F90:16:stencil_
========= by thread (2,2,0) in block (0,0,0)
========= Address 0x2709a3c90 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/libcuda.so (cuLaunchKernel + 0x331) [0xcd951]
========= Host Frame:/opt/cuda/5.5/lib64/libcudart.so.5.5 [0xe108]
========= Host Frame:/opt/cuda/5.5/lib64/libcudart.so.5.5 (cudaLaunch + 0x143) [0x2cb53]
========= Host Frame:./test/example/example_gpu [0x35c3]

Hi MuellerM,

CUDA Fortran device kernels are required to have an implicit or explicit interface. Putting you kernel into a module is the earliest way to fix your example.

Hope this helps,
Mat

#define CUDA_BLOCKSIZE_X 32
 #define CUDA_BLOCKSIZE_Y 32

module stencil_mod

contains
 attributes(global) subroutine stencil(n, m, a, b)
    use cudafor
    implicit none
    integer(4), intent(in) ,value :: n, m
    real(8), intent(in) :: a(n, m)
    real(8), intent(out) :: b(n, m)
    integer(4) :: i, j

    i = (blockidx%x - 1) * blockDim%x + threadidx%x
    j = (blockidx%y - 1) * blockDim%y + threadidx%y
    if (i .GT. 3 .OR. i .LT. 1 .OR. j .GT. 4 .OR. j .LT. 1) then
       return
    end if
    b(i,j) = a(i,j)
 end subroutine

 subroutine stencil_wrapper(n, m, a, b)
    use cudafor
    implicit none
    integer(4), intent(in) :: n, m
    real(8), intent(in) :: a(n, m)
    real(8) ,device :: a_d(n, m)
    real(8), intent(out) :: b(n, m)
    real(8) ,device :: b_d(n, m)
    type(dim3) :: cugrid, cublock
    integer(4) :: cugridSizeX, cugridSizeY, cugridSizeZ, cuerror
    a_d(:,:) = a(:,:)
    b_d(:,:) = 0

    cugridSizeX = ceiling(real(4) / real(CUDA_BLOCKSIZE_X))
    cugridSizeY = ceiling(real(4) / real(CUDA_BLOCKSIZE_Y))
    cugridSizeZ = 1
    cugrid = dim3(cugridSizeX, cugridSizeY, cugridSizeZ)
    cublock = dim3(CUDA_BLOCKSIZE_X, CUDA_BLOCKSIZE_Y, 1)
    write(0,*) 'calling kernel stencil_wrapper with grid size', cugridSizeX, &
      cugridSizeY
    call stencil <<< cugrid, cublock >>>(n, m, a_d(:,:), b_d(:,:))
    cuerror = cudaThreadSynchronize()
    if(cuerror .NE. cudaSuccess) then
       write(0, *) 'CUDA error in kernel stencil:', cudaGetErrorString(cuerror)
       stop 1
    end if
    b(:,:) = b_d(:,:)
 end subroutine
end module stencil_mod

 program main
    use stencil_mod
    implicit none
    real(8), dimension(:,:), allocatable :: a, b
    integer(4) :: n, m

    n = 4
    m = 4
    allocate(a(n,m))
    allocate(b(n,m))
    a(:,:) = 1.0d0
    b(:,:) = 0.0d0
    call stencil_wrapper(n, m, a, b)
    write(6,*) b
    deallocate(a)
    deallocate(b)

    stop
 end program main
% pgf90 -Mcuda test.f90 -Mpreprocess ; a.out
 calling kernel stencil_wrapper with grid size            1            1
    1.000000000000000         1.000000000000000         1.000000000000000
    0.000000000000000         1.000000000000000         1.000000000000000
    1.000000000000000         0.000000000000000         1.000000000000000
    1.000000000000000         1.000000000000000         0.000000000000000
    1.000000000000000         1.000000000000000         1.000000000000000
    0.000000000000000
Warning: ieee_inexact is signaling
FORTRAN STOP

Thank you so much, Mat. I thought I’m going crazy about this example. Usually I put everything into a module, but I somehow wasn’t aware that it is mandatory for CUDA Fortran kernels. The funny thing is, it worked for some edge case where I didn’t pass in the domain lenghts, so I didn’t think about the interface being the problem.

May I suggest a compiler warning (or maybe even a runtime error message) for kernels without a valid interface?