CUDA fortran FDTD unspecified launch failure

Hello to everybody. I am trying to write a FDTD Fortran version with CUDA.

This is my compiler

pgfortran 11.4-0 64-bit target on x86-64 Linux -tp nehalem

my graphic card

Driver version: 4.0
Runtime version: 3.10

 2 CUDA devices found

 Device Number: 0
   Device name: GeForce GTX 295
   Compute Capability: 1.3
   Number of Multiprocessors: 30
   Number of Cores: 240
   Max Clock Rate (kHz): 1242000
   Warpsize: 32

    Settings
      Compute Mode: Default
      Runtime Limit on Kernels Enabled: No
      ECC Support Enabled: No

    Device Features/Capabilities
      Concurrent Copy and Kernel Execution: Yes
      Concurrent Kernel Executions: No
      Zero-Copy Capable: Yes

    Execution Configuration Limits
      Maximum Grid Dimensions: 65535 x 65535 x 1
      Maximum Block Dimensions: 512 x 512 x 64
      Maximum Threads per Block: 512

    Off-Chip Memory
      Total Global Memory (B): 939327488
      Total Constant Memory (B): 65536
      Maximum Memory Pitch for Copies (B): 2147483647
      Integrated: No

    On-Chip Memory
      Shared Memory per Multiprocessor (B): 16384
      Number of Registers per Multiprocessor: 16384

    PCI attributes
      PCI Device ID: 0
      PCI Bus ID: 6

    Textures
      Texture alignment: 256
      Maximum 1D Texture Size: 8192
      Maximum 2D Texture Size: 65536 x 32768
      Maximum 3D Texture Size: 2048 x 2048 x 2048


 Device Number: 1
   Device name: GeForce GTX 295
   Compute Capability: 1.3
   Number of Multiprocessors: 30
   Number of Cores: 240
   Max Clock Rate (kHz): 1242000
   Warpsize: 32

    Settings
      Compute Mode: Default
      Runtime Limit on Kernels Enabled: Yes
      ECC Support Enabled: No

    Device Features/Capabilities
      Concurrent Copy and Kernel Execution: Yes
      Concurrent Kernel Executions: No
      Zero-Copy Capable: Yes

    Execution Configuration Limits
      Maximum Grid Dimensions: 65535 x 65535 x 1
      Maximum Block Dimensions: 512 x 512 x 64
      Maximum Threads per Block: 512

    Off-Chip Memory
      Total Global Memory (B): 938803200
      Total Constant Memory (B): 65536
      Maximum Memory Pitch for Copies (B): 2147483647
      Integrated: No

    On-Chip Memory
      Shared Memory per Multiprocessor (B): 16384
      Number of Registers per Multiprocessor: 16384

    PCI attributes
      PCI Device ID: 0
      PCI Bus ID: 7

    Textures
      Texture alignment: 256
      Maximum 1D Texture Size: 8192
      Maximum 2D Texture Size: 65536 x 32768
      Maximum 3D Texture Size: 2048 x 2048 x 2048

my compiler option

pgfortran -Mcuda=cc13 -Mpreprocess -O3 fdtd_cuda_fortran.f90

and my code

module cuda_kernels
contains
	attributes (global) subroutine kernel_h(k,num_cells_x,num_cells_y,num_cells_z,Hx,Hy,Hz,Ex,Ey,Ez,Cbdx,Cbdy,Cbdz)
		implicit none
		integer :: idx,idy
		integer,value :: k,num_cells_x,num_cells_y,num_cells_z
		real(kind=8), intent(in), dimension(1:num_cells_x,1:num_cells_y,1:num_cells_z) :: Ex, Ey, Ez
		real(kind=8), intent(inout), dimension(1:num_cells_x,1:num_cells_y,1:num_cells_z) :: Hx, Hy, Hz
		real(kind=8), intent(in), constant, dimension(1:num_cells_x,1:num_cells_y,1:num_cells_z) :: Cbdx,Cbdy,Cbdz
		idx = threadIdx%x + ((blockIdx%x-1) * blockDim%x)
		idy = threadIdx%y + ((blockIdx%y-1) * blockDim%y)
		do while (idx < num_cells_x)
			Hz(idx,idy,k) = Hz(idx,idy,k) + ((Ex(idx,idy+1,k)-Ex(idx,idy,k))*Cbdy(idx,idy,k) + (Ey(idx,idy,k)-Ey(idx+1,idy,k))*Cbdx(idx,idy,k))
			Hx(idx,idy,k) = Hx(idx,idy,k) +	((Ey(idx,idy,k+1)-Ey(idx,idy,k))*Cbdz(idx,idy,k) + (Ez(idx,idy,k)-Ez(idx,idy+1,k))*Cbdy(idx,idy,k))
			Hy(idx,idy,k) = Hy(idx,idy,k) + ((Ez(idx+1,idy,k)-Ez(idx,idy,k))*Cbdx(idx,idy,k) + (Ex(idx,idy,k)-Ex(idx,idy,k+1))*Cbdz(idx,idy,k))
			idx = idx + (blockDim%x * gridDim%x)
			idy = idy + (blockDim%y * gridDim%y)
		end do
	end subroutine kernel_h
	
	attributes (global) subroutine kernel_e(k,num_cells_x,num_cells_y,num_cells_z,Hx,Hy,Hz,Ex,Ey,Ez,Dbdx,Dbdy,Dbdz)
		implicit none
		integer :: idx,idy
		integer, value :: k,num_cells_x,num_cells_y,num_cells_z
		real(kind=8), intent(inout), dimension(1:num_cells_x,1:num_cells_y,1:num_cells_z) :: Ex, Ey, Ez
		real(kind=8), intent(in), dimension(1:num_cells_x,1:num_cells_y,1:num_cells_z) :: Hx, Hy, Hz
		real(kind=8), intent(in), constant, dimension(1:num_cells_x,1:num_cells_y,1:num_cells_z) :: Dbdx,Dbdy,Dbdz
		idx = threadIdx%x + (blockIdx%x * blockDim%x)
		idy = threadIdx%y + (blockIdx%y * blockDim%y)
		do while (idx < num_cells_x)
			Ez(idx,idy,k) = Ez(idx,idy,k) + ((Hy(idx,idy,k)-Hy(idx-1,idy,k))*Dbdx(idx,idy,k) + (Hx(idx,idy-1,k)-Hx(idx,idy,k))*Dbdy(idx,idy,k))
			Ex(idx,idy,k) = Ex(idx,idy,k) + ((Hz(idx,idy,k)-Hz(idx,idy-1,k))*Dbdy(idx,idy,k) + (Hy(idx,idy,k-1)-Hy(idx,idy,k))*Dbdz(idx,idy,k))               
			Ey(idx,idy,k) = Ey(idx,idy,k) + ((Hx(idx,idy,k)-Hx(idx,idy,k-1))*Dbdz(idx,idy,k) + (Hz(idx-1,idy,k)-Hz(idx,idy,k))*Dbdx(idx,idy,k))
			idx = idx + (blockDim%x * gridDim%x)
			idy = idy + (blockDim%y * gridDim%y)
		end do
	end subroutine kernel_e
end module

module sim_utils
contains
	subroutine Update_init(nx,ny,nz,dx,dy,dz,dt,Cbdx,Cbdy,Cbdz,Dbdx,Dbdy,Dbdz)
		implicit none
		
		integer, intent(in) :: nx, ny, nz
		real(kind=8), intent(in) :: dx, dy, dz, dt
		real(kind=8), intent(inout), dimension(1:nx,1:ny,1:nz) :: Cbdx,Cbdy,Cbdz
		real(kind=8), intent(inout), dimension(1:nx,1:ny,1:nz) :: Dbdx,Dbdy,Dbdz
		
		real(kind=8), parameter :: c = 2.9979251e8, pi = 3.14159265358973238e0
		real(kind=8), parameter :: eps0 = 0.8854194E-11, mu0 = 1.25663706E-6

		Cbdx(1:nx,1:ny,1:nz) = dt/(mu0*dx)
		Cbdy(1:nx,1:ny,1:nz) = dt/(mu0*dy)
		Cbdz(1:nx,1:ny,1:nz) = dt/(mu0*dz)

		Dbdx(1:nx,1:ny,1:nz) = dt/(eps0*dx)
		Dbdy(1:nx,1:ny,1:nz) = dt/(eps0*dy)
		Dbdz(1:nx,1:ny,1:nz) = dt/(eps0*dz)
		
	end subroutine Update_init
end module


program main
	use cudafor
	use cuda_kernels
	use sim_utils
	implicit none
	
	!Creo gli eventi per misurare il tempo trascorso
	type(cudaEvent) :: startEvent, stopEvent
	real :: time
	integer :: istat,errCode

	integer, parameter :: rfp = 8

	real(kind=rfp), parameter :: c = 2.9979251e8, pi = 3.14159265358973238e0
	real(kind=rfp), parameter :: eps0 = 0.8854194E-11, mu0 = 1.25663706E-6

	integer :: num_cells_x,num_cells_y,num_cells_z,num_steps,l,k,j
	type(dim3) :: bdim,gdim

	real(kind=rfp) :: dx,dy,dz,dt

	real(kind=rfp), allocatable :: Cbdx(:,:,:), Cbdy(:,:,:), Cbdz(:,:,:)
	real(kind=rfp), allocatable :: Dbdx(:,:,:), Dbdy(:,:,:), Dbdz(:,:,:)
	real(kind=rfp), allocatable :: EX(:,:,:),EY(:,:,:),EZ(:,:,:)
	real(kind=rfp), allocatable :: HX(:,:,:),HY(:,:,:),HZ(:,:,:)

!	Creo le variabili per il device
	real(kind=rfp), allocatable,device :: Cbdx_d(:,:,:), Cbdy_d(:,:,:), Cbdz_d(:,:,:)
	real(kind=rfp), allocatable,device :: Dbdx_d(:,:,:), Dbdy_d(:,:,:), Dbdz_d(:,:,:)
	real(kind=rfp), allocatable,device :: EX_d(:,:,:),EY_d(:,:,:),EZ_d(:,:,:)
	real(kind=rfp), allocatable,device :: HX_d(:,:,:),HY_d(:,:,:),HZ_d(:,:,:)


	write(*,*) ' '
	write(*,*) ' ---------------------------------------------------'
	write(*,*) ' Domain and Iterations Setup '
	write(*,*) ' ---------------------------------------------------'
	write(*,*) ' '
	!
	write(*,*) 'Enter the dimension of matrices'
	write(*,*) 'Number of cells for X'
	read(*,*) num_cells_x
	write(*,*) 'Number of cells for Y'
	read(*,*) num_cells_y
	write(*,*) 'Number of cells for Z'
	read(*,*) num_cells_z

	write(*,*) 'Enter the number of iterations'
	write(*,*) 'Number of iterations'
	read(*,*) num_steps


	!-----------------------------------------------------------------------
	!
	! Just to initialize with something ,,,
	!
	dx = 1.0d0
	dy = 1.0d0
	dz = 1.0d0

	dt = 1.0d0
	!
	!-----------------------------------------------------------------------
	! Allocation and Initialization for Fields and Coefficients
	! 
	if (.not.allocated(Cbdx)) then
		allocate(Cbdx(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if
	if (.not.allocated(Cbdy)) then
		allocate(Cbdy(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if
	if (.not.allocated(Cbdz)) then
		allocate(Cbdz(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if

	if (.not.allocated(Dbdx)) then
		allocate(Dbdx(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if
	if (.not.allocated(Dbdy)) then
		allocate(Dbdy(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if
	if (.not.allocated(Dbdz)) then
		allocate(Dbdz(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if

	if (.not.allocated(EX)) then
		allocate(EX(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if
	if (.not.allocated(EY)) then
		allocate(EY(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if
	if (.not.allocated(EZ)) then
		allocate(EZ(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if

	if (.not.allocated(HX)) then
		allocate(HX(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if
	if (.not.allocated(HY)) then
		allocate(HY(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if
	if (.not.allocated(HZ)) then
		allocate(HZ(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if

!	Allocazione sul device
	if (.not.allocated(Cbdx_d)) then
		allocate(Cbdx_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if
	if (.not.allocated(Cbdy_d)) then
		allocate(Cbdy_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if
	if (.not.allocated(Cbdz_d)) then
		allocate(Cbdz_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if

	if (.not.allocated(Dbdx_d)) then
		allocate(Dbdx_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if
	if (.not.allocated(Dbdy_d)) then
		allocate(Dbdy_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if
	if (.not.allocated(Dbdz_d)) then
		allocate(Dbdz_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if

	if (.not.allocated(EX_d)) then
		allocate(EX_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if
	if (.not.allocated(EY_d)) then
		allocate(EY_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if
	if (.not.allocated(EZ_d)) then
		allocate(EZ_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if

	if (.not.allocated(HX_d)) then
		allocate(HX_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if
	if (.not.allocated(HY_d)) then
		allocate(HY_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if
	if (.not.allocated(HZ_d)) then
		allocate(HZ_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
	end if

	print *, ' Inizializzo le matrici '
	call Update_init(num_cells_x,num_cells_y,num_cells_z,dx,dy,dz,dt,Cbdx,Cbdy,Cbdz,Dbdx,Dbdy,Dbdz)
	print *, 'Matrici inizializzate '

	Ex=5.0d0
	Ey=4.0d0
	Ez=3.0d0

	Hx=2.0d0
	Hy=5.0d0
	Hz=3.0d0
	
	istat = cudaEventCreate(startEvent)
	istat = cudaEventCreate(stopEvent)

!	Carico i dati sul device

	EX_d = EX
	Ey_d = Ey
	Ez_d = Ez
	Hx_d = Hx
	Hy_d = Hy
	Hz_d = Hz
	Cbdx_d = Cbdx
	Cbdy_d = Cbdy
	Cbdz_d = Cbdz
	Dbdx_d = Dbdx
	Dbdy_d = Dbdy
	Dbdz_d = Dbdz

!	Definisco la dimensione di blocchi e griglia	
	bdim=dim3(16,16,1)
	gdim=dim3((num_cells_x+(bdim%x-1))/bdim%x,(num_cells_y+(bdim%y-1))/bdim%y,1)
	
	write(*,*)"Threads per Block: ",bdim%x,"x",bdim%y,"x",bdim%z,"=",bdim%x*bdim%y
	write(*,*)"Griglia in blocchi: ",gdim%x,"x",gdim%y,"=",gdim%x*gdim%y
	write(*,*) ' '
	write(*,*) ' '
	
	istat = cudaEventRecord(startEvent,0)
		
!	Chiamo il kernel
	do l=1,num_steps
		do k=1,num_cells_z
			call kernel_h<<<gdim>>>(k,num_cells_x,num_cells_y,num_cells_z,Hx_d,Hy_d,Hz_d,Ex_d,Ey_d,Ez_d,Cbdx_d,Cbdy_d,Cbdz_d)
			call kernel_e<<<gdim>>>(k,num_cells_x,num_cells_y,num_cells_z,Hx_d,Hy_d,Hz_d,Ex_d,Ey_d,Ez_d,Dbdx_d,Dbdy_d,Dbdz_d)
		end do
	end do
	
	istat = cudathreadsynchronize()
	errCode = cudaGetLastError()
	if (errCode .gt. 0) then
		print *, "Errore nel lancio del kernel"
		print *, errCode
		print *, cudaGetErrorString(errCode)
		stop 'Error! Kernel failed!'
	endif

	istat = cudaEventRecord(stopEvent,0)
	istat = cudaEventSynchronize(stopEvent)
	istat = cudaEventElapsedTime(time,startEvent,stopEvent)
	
!	Copio i risultati sull'host
	Hx = Hx_d
	Hy = Hy_d
	Hz = Hz_d
	Ex = Ex_d
	Ey = Ey_d
	Ez = Ez_d


	print *,"La simulazione e finita"
	write(*,*)"Tempo di esecuzione del kernel (ms): ",time	
	
	istat = cudaEventDestroy(startEvent)
	istat = cudaEventDestroy(stopEvent)
	
end program main

When i set the dimension of E, H, Cb and Db greater than 135 elements, the program crashes and returns

4
unspecified launch failure
Warning: ieee_inexact is signaling

Do you have any idea?
Thank you!

Hi horace1234,

An “unspecified launch failure” usually means that your kernel abnormally executed for some reason. In your case, it looks like your array accesses are going out-of-bounds. For example:

Ey(idx,idy,k) = Ey(idx,idy,k) + ((Hx(idx,idy,k)-Hx(idx,idy,k-1))*Dbdz(idx,idy,k) + (Hz(idx-1,idy,k)-Hz(idx,idy,k))*Dbdx(idx,idy,k))

“Hz(idx-1,idy,k)” will be out of bounds since idx starts at 1. One fix could be to change your lower bound to zero.

Also your initial idx and idy calculation in kernel_e is incorrect since you need to minus one from the blockidx. (see kernel_h). Finally, the “constant” attribute can only be applied to module device data.

Hope this helps,
Mat

Thank you Mat, now it works!
Can i ask you another question: how can i run my code on another machine? Because if i move the binary it says that i miss some library, is there a way to include it?

pgfortran -Mcuda=cc13 -r -Mpreprocess -O3 fdtd_2d_prop.f90

/usr/bin/ld: warning: /opt/pgi/linux86-64/11.4/lib/pgi.ld contains output sections; did you forget -T?
/usr/bin/ld: cannot find -lcudart

Hi horace1234,

Thank you Mat, now it works!

Excellent!

how can i run my code on another machine?

To run your binary on another system that does not have the PGI compilers installed, you either need to install CUDA on this system or copy the dependent shared libraries with your binary and set the environment variable “LD_LIBRARY_PATH” to include the directory containing these libraries.

pgfortran -Mcuda=cc13 -r -Mpreprocess -O3 fdtd_2d_prop.f90

/usr/bin/ld: warning: /opt/pgi/linux86-64/11.4/lib/pgi.ld contains output sections; did you forget -T?
/usr/bin/ld: cannot find -lcudart

This error is a different problem where the PGI supplied CUDA libraries where not installed as part of your PGI installation. You will need to reinstall the compilers and select “yes” when prompted if you wish to install the CUDA Toolkit.

  • Mat