Tuan
1
It seems that CUDA Fortran code only works when I declare a 2D/3D thread block using the following approach:
type(dim3) :: dimGrid, dimBlock
dimGrid = dim3( N/16, L/16, 1 )
dimBlock = dim3( 16, 16, 1 )
call mmul_kernel<<<dimGrid,dimBlock>>>( Adev,Bdev,Cdev,N,M,L )
I get a runtime error if I use the C-like declaration:
type(dim3) :: dimGrid, dimBlock
dimGrid%x = N/16
dimGrid%y = L/16
dimGrid%z = 1
dimBlock%x = 16
dimBlock%y = 16
dimBlock%z = 1
call mmul_kernel<<<dimGrid,dimBlock>>>( Adev,Bdev,Cdev,N,M,L )
I think it should be okay to use either approach. Any idea?
Tuan
Hi Tuan,
Something else is going on since both methods work for me. Can you post a reproducer?
Go Ducks!
Mat
Example:
% cat test2.cuf
module testme
  use cudafor
  implicit none
contains
  ! Device kernel: each thread derives a 1-based (ix, iy) index from its
  ! block and thread coordinates and, when that index lies inside the
  ! N-by-L bounds, stores the product ix*iy into A(ix, iy).
  !
  !   A    - array written by the kernel (only elements with ix <= N and
  !          iy <= L are touched)
  !   N, L - upper bounds on the first and second index, passed by value
  attributes(global) subroutine mmul_kernel(A, N, L)
    real, dimension(:,:), intent(inout) :: A
    integer, value :: N, L
    integer :: ix, iy

    ! blockidx is 1-based in CUDA Fortran, hence the (blockidx - 1) offset.
    ix = threadidx%x + blockdim%x * (blockidx%x - 1)
    iy = threadidx%y + blockdim%y * (blockidx%y - 1)

    ! Guard against threads that fall outside the requested extent.
    if (ix <= N .and. iy <= L) then
      A(ix, iy) = ix * iy
    end if
  end subroutine mmul_kernel
end module testme
! Reproducer: launches mmul_kernel with a 2D grid/block whose dim3 values are
! filled in component by component (the "C-like" style), copies the result
! back to the host and prints the first and last elements.
program test
  use cudafor
  use testme
  implicit none

  ! Edge length of the square thread block.
  integer, parameter :: tile = 16

  real, dimension(:,:), allocatable, device :: Adev
  real, dimension(:,:), allocatable :: A
  integer :: N, L, istat
  type(dim3) :: dimGrid, dimBlock

  N = 64
  L = 64
  allocate(Adev(N, L), A(N, L))

  ! Component-wise dim3 assignment. The grid extent is rounded up so that
  ! sizes which are not a multiple of the tile are still fully covered; the
  ! kernel's own bounds check discards the surplus threads.
  dimGrid%x = (N + tile - 1) / tile
  dimGrid%y = (L + tile - 1) / tile
  dimGrid%z = 1
  dimBlock%x = tile
  dimBlock%y = tile
  dimBlock%z = 1

  call mmul_kernel<<<dimGrid,dimBlock>>>( Adev,N,L )

  ! Report a failed launch (e.g. an invalid configuration) instead of
  ! silently printing whatever happens to be in the arrays.
  istat = cudaGetLastError()
  if (istat /= cudaSuccess) then
    print *, 'mmul_kernel launch failed: ', trim(cudaGetErrorString(istat))
    stop 1
  end if

  ! Copy the device array into the host array.
  A = Adev
  print *, A(1,1), A(N,L)

  deallocate(Adev, A)
end program test
% pgf90 test2.cuf -o test2.out -V11.0 -fast
% test2.out
1.000000 4096.000