Hello to everybody. I am trying to write a FDTD Fortran version with CUDA.
This is my compiler
pgfortran 11.4-0 64-bit target on x86-64 Linux -tp nehalem
my graphic card
Driver version: 4.0
Runtime version: 3.10
2 CUDA devices found
Device Number: 0
Device name: GeForce GTX 295
Compute Capability: 1.3
Number of Multiprocessors: 30
Number of Cores: 240
Max Clock Rate (kHz): 1242000
Warpsize: 32
Settings
Compute Mode: Default
Runtime Limit on Kernels Enabled: No
ECC Support Enabled: No
Device Features/Capabilities
Concurrent Copy and Kernel Execution: Yes
Concurrent Kernel Executions: No
Zero-Copy Capable: Yes
Execution Configuration Limits
Maximum Grid Dimensions: 65535 x 65535 x 1
Maximum Block Dimensions: 512 x 512 x 64
Maximum Threads per Block: 512
Off-Chip Memory
Total Global Memory (B): 939327488
Total Constant Memory (B): 65536
Maximum Memory Pitch for Copies (B): 2147483647
Integrated: No
On-Chip Memory
Shared Memory per Multiprocessor (B): 16384
Number of Registers per Multiprocessor: 16384
PCI attributes
PCI Device ID: 0
PCI Bus ID: 6
Textures
Texture alignment: 256
Maximum 1D Texture Size: 8192
Maximum 2D Texture Size: 65536 x 32768
Maximum 3D Texture Size: 2048 x 2048 x 2048
Device Number: 1
Device name: GeForce GTX 295
Compute Capability: 1.3
Number of Multiprocessors: 30
Number of Cores: 240
Max Clock Rate (kHz): 1242000
Warpsize: 32
Settings
Compute Mode: Default
Runtime Limit on Kernels Enabled: Yes
ECC Support Enabled: No
Device Features/Capabilities
Concurrent Copy and Kernel Execution: Yes
Concurrent Kernel Executions: No
Zero-Copy Capable: Yes
Execution Configuration Limits
Maximum Grid Dimensions: 65535 x 65535 x 1
Maximum Block Dimensions: 512 x 512 x 64
Maximum Threads per Block: 512
Off-Chip Memory
Total Global Memory (B): 938803200
Total Constant Memory (B): 65536
Maximum Memory Pitch for Copies (B): 2147483647
Integrated: No
On-Chip Memory
Shared Memory per Multiprocessor (B): 16384
Number of Registers per Multiprocessor: 16384
PCI attributes
PCI Device ID: 0
PCI Bus ID: 7
Textures
Texture alignment: 256
Maximum 1D Texture Size: 8192
Maximum 2D Texture Size: 65536 x 32768
Maximum 3D Texture Size: 2048 x 2048 x 2048
my compiler option
pgfortran -Mcuda=cc13 -Mpreprocess -O3 fdtd_cuda_fortran.f90
and my code
module cuda_kernels
contains
attributes (global) subroutine kernel_h(k,num_cells_x,num_cells_y,num_cells_z,Hx,Hy,Hz,Ex,Ey,Ez,Cbdx,Cbdy,Cbdz)
implicit none
integer :: idx,idy
integer,value :: k,num_cells_x,num_cells_y,num_cells_z
real(kind=8), intent(in), dimension(1:num_cells_x,1:num_cells_y,1:num_cells_z) :: Ex, Ey, Ez
real(kind=8), intent(inout), dimension(1:num_cells_x,1:num_cells_y,1:num_cells_z) :: Hx, Hy, Hz
real(kind=8), intent(in), constant, dimension(1:num_cells_x,1:num_cells_y,1:num_cells_z) :: Cbdx,Cbdy,Cbdz
idx = threadIdx%x + ((blockIdx%x-1) * blockDim%x)
idy = threadIdx%y + ((blockIdx%y-1) * blockDim%y)
do while (idx < num_cells_x)
Hz(idx,idy,k) = Hz(idx,idy,k) + ((Ex(idx,idy+1,k)-Ex(idx,idy,k))*Cbdy(idx,idy,k) + (Ey(idx,idy,k)-Ey(idx+1,idy,k))*Cbdx(idx,idy,k))
Hx(idx,idy,k) = Hx(idx,idy,k) + ((Ey(idx,idy,k+1)-Ey(idx,idy,k))*Cbdz(idx,idy,k) + (Ez(idx,idy,k)-Ez(idx,idy+1,k))*Cbdy(idx,idy,k))
Hy(idx,idy,k) = Hy(idx,idy,k) + ((Ez(idx+1,idy,k)-Ez(idx,idy,k))*Cbdx(idx,idy,k) + (Ex(idx,idy,k)-Ex(idx,idy,k+1))*Cbdz(idx,idy,k))
idx = idx + (blockDim%x * gridDim%x)
idy = idy + (blockDim%y * gridDim%y)
end do
end subroutine kernel_h
attributes (global) subroutine kernel_e(k,num_cells_x,num_cells_y,num_cells_z,Hx,Hy,Hz,Ex,Ey,Ez,Dbdx,Dbdy,Dbdz)
implicit none
integer :: idx,idy
integer, value :: k,num_cells_x,num_cells_y,num_cells_z
real(kind=8), intent(inout), dimension(1:num_cells_x,1:num_cells_y,1:num_cells_z) :: Ex, Ey, Ez
real(kind=8), intent(in), dimension(1:num_cells_x,1:num_cells_y,1:num_cells_z) :: Hx, Hy, Hz
real(kind=8), intent(in), constant, dimension(1:num_cells_x,1:num_cells_y,1:num_cells_z) :: Dbdx,Dbdy,Dbdz
idx = threadIdx%x + (blockIdx%x * blockDim%x)
idy = threadIdx%y + (blockIdx%y * blockDim%y)
do while (idx < num_cells_x)
Ez(idx,idy,k) = Ez(idx,idy,k) + ((Hy(idx,idy,k)-Hy(idx-1,idy,k))*Dbdx(idx,idy,k) + (Hx(idx,idy-1,k)-Hx(idx,idy,k))*Dbdy(idx,idy,k))
Ex(idx,idy,k) = Ex(idx,idy,k) + ((Hz(idx,idy,k)-Hz(idx,idy-1,k))*Dbdy(idx,idy,k) + (Hy(idx,idy,k-1)-Hy(idx,idy,k))*Dbdz(idx,idy,k))
Ey(idx,idy,k) = Ey(idx,idy,k) + ((Hx(idx,idy,k)-Hx(idx,idy,k-1))*Dbdz(idx,idy,k) + (Hz(idx-1,idy,k)-Hz(idx,idy,k))*Dbdx(idx,idy,k))
idx = idx + (blockDim%x * gridDim%x)
idy = idy + (blockDim%y * gridDim%y)
end do
end subroutine kernel_e
end module
module sim_utils
contains
subroutine Update_init(nx,ny,nz,dx,dy,dz,dt,Cbdx,Cbdy,Cbdz,Dbdx,Dbdy,Dbdz)
implicit none
integer, intent(in) :: nx, ny, nz
real(kind=8), intent(in) :: dx, dy, dz, dt
real(kind=8), intent(inout), dimension(1:nx,1:ny,1:nz) :: Cbdx,Cbdy,Cbdz
real(kind=8), intent(inout), dimension(1:nx,1:ny,1:nz) :: Dbdx,Dbdy,Dbdz
real(kind=8), parameter :: c = 2.9979251e8, pi = 3.14159265358973238e0
real(kind=8), parameter :: eps0 = 0.8854194E-11, mu0 = 1.25663706E-6
Cbdx(1:nx,1:ny,1:nz) = dt/(mu0*dx)
Cbdy(1:nx,1:ny,1:nz) = dt/(mu0*dy)
Cbdz(1:nx,1:ny,1:nz) = dt/(mu0*dz)
Dbdx(1:nx,1:ny,1:nz) = dt/(eps0*dx)
Dbdy(1:nx,1:ny,1:nz) = dt/(eps0*dy)
Dbdz(1:nx,1:ny,1:nz) = dt/(eps0*dz)
end subroutine Update_init
end module
program main
use cudafor
use cuda_kernels
use sim_utils
implicit none
!Creo gli eventi per misurare il tempo trascorso
type(cudaEvent) :: startEvent, stopEvent
real :: time
integer :: istat,errCode
integer, parameter :: rfp = 8
real(kind=rfp), parameter :: c = 2.9979251e8, pi = 3.14159265358973238e0
real(kind=rfp), parameter :: eps0 = 0.8854194E-11, mu0 = 1.25663706E-6
integer :: num_cells_x,num_cells_y,num_cells_z,num_steps,l,k,j
type(dim3) :: bdim,gdim
real(kind=rfp) :: dx,dy,dz,dt
real(kind=rfp), allocatable :: Cbdx(:,:,:), Cbdy(:,:,:), Cbdz(:,:,:)
real(kind=rfp), allocatable :: Dbdx(:,:,:), Dbdy(:,:,:), Dbdz(:,:,:)
real(kind=rfp), allocatable :: EX(:,:,:),EY(:,:,:),EZ(:,:,:)
real(kind=rfp), allocatable :: HX(:,:,:),HY(:,:,:),HZ(:,:,:)
! Creo le variabili per il device
real(kind=rfp), allocatable,device :: Cbdx_d(:,:,:), Cbdy_d(:,:,:), Cbdz_d(:,:,:)
real(kind=rfp), allocatable,device :: Dbdx_d(:,:,:), Dbdy_d(:,:,:), Dbdz_d(:,:,:)
real(kind=rfp), allocatable,device :: EX_d(:,:,:),EY_d(:,:,:),EZ_d(:,:,:)
real(kind=rfp), allocatable,device :: HX_d(:,:,:),HY_d(:,:,:),HZ_d(:,:,:)
write(*,*) ' '
write(*,*) ' ---------------------------------------------------'
write(*,*) ' Domain and Iterations Setup '
write(*,*) ' ---------------------------------------------------'
write(*,*) ' '
!
write(*,*) 'Enter the dimension of matrices'
write(*,*) 'Number of cells for X'
read(*,*) num_cells_x
write(*,*) 'Number of cells for Y'
read(*,*) num_cells_y
write(*,*) 'Number of cells for Z'
read(*,*) num_cells_z
write(*,*) 'Enter the number of iterations'
write(*,*) 'Number of iterations'
read(*,*) num_steps
!-----------------------------------------------------------------------
!
! Just to initialize with something ,,,
!
dx = 1.0d0
dy = 1.0d0
dz = 1.0d0
dt = 1.0d0
!
!-----------------------------------------------------------------------
! Allocation and Initialization for Fields and Coefficients
!
if (.not.allocated(Cbdx)) then
allocate(Cbdx(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
if (.not.allocated(Cbdy)) then
allocate(Cbdy(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
if (.not.allocated(Cbdz)) then
allocate(Cbdz(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
if (.not.allocated(Dbdx)) then
allocate(Dbdx(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
if (.not.allocated(Dbdy)) then
allocate(Dbdy(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
if (.not.allocated(Dbdz)) then
allocate(Dbdz(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
if (.not.allocated(EX)) then
allocate(EX(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
if (.not.allocated(EY)) then
allocate(EY(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
if (.not.allocated(EZ)) then
allocate(EZ(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
if (.not.allocated(HX)) then
allocate(HX(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
if (.not.allocated(HY)) then
allocate(HY(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
if (.not.allocated(HZ)) then
allocate(HZ(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
! Allocazione sul device
if (.not.allocated(Cbdx_d)) then
allocate(Cbdx_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
if (.not.allocated(Cbdy_d)) then
allocate(Cbdy_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
if (.not.allocated(Cbdz_d)) then
allocate(Cbdz_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
if (.not.allocated(Dbdx_d)) then
allocate(Dbdx_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
if (.not.allocated(Dbdy_d)) then
allocate(Dbdy_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
if (.not.allocated(Dbdz_d)) then
allocate(Dbdz_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
if (.not.allocated(EX_d)) then
allocate(EX_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
if (.not.allocated(EY_d)) then
allocate(EY_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
if (.not.allocated(EZ_d)) then
allocate(EZ_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
if (.not.allocated(HX_d)) then
allocate(HX_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
if (.not.allocated(HY_d)) then
allocate(HY_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
if (.not.allocated(HZ_d)) then
allocate(HZ_d(1:num_cells_x,1:num_cells_y,1:num_cells_z))
end if
print *, ' Inizializzo le matrici '
call Update_init(num_cells_x,num_cells_y,num_cells_z,dx,dy,dz,dt,Cbdx,Cbdy,Cbdz,Dbdx,Dbdy,Dbdz)
print *, 'Matrici inizializzate '
Ex=5.0d0
Ey=4.0d0
Ez=3.0d0
Hx=2.0d0
Hy=5.0d0
Hz=3.0d0
istat = cudaEventCreate(startEvent)
istat = cudaEventCreate(stopEvent)
! Carico i dati sul device
EX_d = EX
Ey_d = Ey
Ez_d = Ez
Hx_d = Hx
Hy_d = Hy
Hz_d = Hz
Cbdx_d = Cbdx
Cbdy_d = Cbdy
Cbdz_d = Cbdz
Dbdx_d = Dbdx
Dbdy_d = Dbdy
Dbdz_d = Dbdz
! Definisco la dimensione di blocchi e griglia
bdim=dim3(16,16,1)
gdim=dim3((num_cells_x+(bdim%x-1))/bdim%x,(num_cells_y+(bdim%y-1))/bdim%y,1)
write(*,*)"Threads per Block: ",bdim%x,"x",bdim%y,"x",bdim%z,"=",bdim%x*bdim%y
write(*,*)"Griglia in blocchi: ",gdim%x,"x",gdim%y,"=",gdim%x*gdim%y
write(*,*) ' '
write(*,*) ' '
istat = cudaEventRecord(startEvent,0)
! Chiamo il kernel
do l=1,num_steps
do k=1,num_cells_z
call kernel_h<<<gdim>>>(k,num_cells_x,num_cells_y,num_cells_z,Hx_d,Hy_d,Hz_d,Ex_d,Ey_d,Ez_d,Cbdx_d,Cbdy_d,Cbdz_d)
call kernel_e<<<gdim>>>(k,num_cells_x,num_cells_y,num_cells_z,Hx_d,Hy_d,Hz_d,Ex_d,Ey_d,Ez_d,Dbdx_d,Dbdy_d,Dbdz_d)
end do
end do
istat = cudathreadsynchronize()
errCode = cudaGetLastError()
if (errCode .gt. 0) then
print *, "Errore nel lancio del kernel"
print *, errCode
print *, cudaGetErrorString(errCode)
stop 'Error! Kernel failed!'
endif
istat = cudaEventRecord(stopEvent,0)
istat = cudaEventSynchronize(stopEvent)
istat = cudaEventElapsedTime(time,startEvent,stopEvent)
! Copio i risultati sull'host
Hx = Hx_d
Hy = Hy_d
Hz = Hz_d
Ex = Ex_d
Ey = Ey_d
Ez = Ez_d
print *,"La simulazione e finita"
write(*,*)"Tempo di esecuzione del kernel (ms): ",time
istat = cudaEventDestroy(startEvent)
istat = cudaEventDestroy(stopEvent)
end program main
When i set the dimension of E, H, Cb and Db greater than 135 elements, the program crashes and returns
4
unspecified launch failure
Warning: ieee_inexact is signaling
Do you have any idea?
Thank you!