Trying to get this very simple example to run. The following code fails with an unspecified launch failure. It looks like the kernel doesn’t get initialized properly. When trying to do device debugging on DDT (which is supposed to work now?) I get ‘Cannot find bounds of current function’ when trying to step into the kernel. It seems to me either n, m or i, j don’t get initialized properly, but I can’t for the better of me figure out why. Emulated mode doesn’t fail. Other programs, much larger than this, running on the same system, compiled with the same settings, don’t fail.
PGI Version 14.2
DDT Version 4.2-PR-36863
Makefile
FFLAGS= -g -Mcuda=cc3x -ta=nvidia,cc3x,keepgpu,keepbin,time -Minfo=accel,inline,ipa -Mneginfo -Minform=inform -I/usr/local/include -r8
LDFLAGS= -g -Mcuda=cc3x -ta=nvidia,cc3x,time -L/usr/local/lib -Minfo=accel,inline -Mneginfo -lpp
%.o: %.F90
@$(FC) $(FFLAGS) -c $< -o $@
Code
#define CUDA_BLOCKSIZE_X 32
#define CUDA_BLOCKSIZE_Y 32
attributes(global) subroutine stencil(n, m, a, b)
use cudafor
implicit none
integer(4), intent(in) ,value :: n, m
real(8), intent(in) :: a(n, m)
real(8), intent(out) :: b(n, m)
integer(4) :: i, j
i = (blockidx%x - 1) * blockDim%x + threadidx%x
j = (blockidx%y - 1) * blockDim%y + threadidx%y
if (i .GT. 3 .OR. i .LT. 1 .OR. j .GT. 4 .OR. j .LT. 1) then
return
end if
b(i,j) = a(i,j)
end subroutine
subroutine stencil_wrapper(n, m, a, b)
use cudafor
implicit none
integer(4), intent(in) :: n, m
real(8), intent(in) :: a(n, m)
real(8) ,device :: a_d(n, m)
real(8), intent(out) :: b(n, m)
real(8) ,device :: b_d(n, m)
type(dim3) :: cugrid, cublock
integer(4) :: cugridSizeX, cugridSizeY, cugridSizeZ, cuerror
a_d(:,:) = a(:,:)
b_d(:,:) = 0
cugridSizeX = ceiling(real(4) / real(CUDA_BLOCKSIZE_X))
cugridSizeY = ceiling(real(4) / real(CUDA_BLOCKSIZE_Y))
cugridSizeZ = 1
cugrid = dim3(cugridSizeX, cugridSizeY, cugridSizeZ)
cublock = dim3(CUDA_BLOCKSIZE_X, CUDA_BLOCKSIZE_Y, 1)
write(0,*) 'calling kernel stencil_wrapper with grid size', cugridSizeX, cugridSizeY
call stencil <<< cugrid, cublock >>>(n, m, a_d(:,:), b_d(:,:))
cuerror = cudaThreadSynchronize()
if(cuerror .NE. cudaSuccess) then
write(0, *) 'CUDA error in kernel stencil:', cudaGetErrorString(cuerror)
stop 1
end if
b(:,:) = b_d(:,:)
end subroutine
program main
implicit none
real(8), dimension(:,:), allocatable :: a, b
integer(4) :: n, m
n = 4
m = 4
allocate(a(n,m))
allocate(b(n,m))
a(:,:) = 1.0d0
b(:,:) = 0.0d0
call stencil_wrapper(n, m, a, b)
write(6,*) b
deallocate(a)
deallocate(b)
stop
end program main
Output
calling kernel stencil_wrapper with grid size 1 1
CUDA error in kernel stencil:
unspecified launch failure
Warning: ieee_inexact is signaling
1
cuda memcheck
Lots of errors of the following form
========= Invalid global read of size 8
========= at 0x00000638 in /home0/usr4/mueller-m-ab/hybrid/my_example_stencil_project/build/gpu/source/example.F90:16:stencil_
========= by thread (2,2,0) in block (0,0,0)
========= Address 0x2709a3c90 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/libcuda.so (cuLaunchKernel + 0x331) [0xcd951]
========= Host Frame:/opt/cuda/5.5/lib64/libcudart.so.5.5 [0xe108]
========= Host Frame:/opt/cuda/5.5/lib64/libcudart.so.5.5 (cudaLaunch + 0x143) [0x2cb53]
========= Host Frame:./test/example/example_gpu [0x35c3]