I have a CUDA Fortran addition code that works correctly with static arrays, but when I switch to allocatable arrays, data transfer between host and device fails. What could be the reason?
thanks very much.
have a nice day.
example .txt (1.9 KB)
!> Compile-time grid constants.
module parameters
  implicit none
  !> Largest index in the first and second dimension.
  integer, parameter :: nx = 2, ny = 2
  !> Total number of points when each dimension runs from 0 to its largest index.
  integer, parameter :: nxny = (nx + 1)*(ny + 1)
end module parameters
!==========================================================================
!==========================================================================
! Device code for the example: a single kernel that adds a scalar to an array.
module simpleOps_m
contains
! Kernel: every thread whose linear index is below nxny adds b to one element
! of a and prints that element before and after the update.
!   a - rank-2 integer array, declared assumed-shape (:,:)
!   b - integer added to each element
attributes(global)subroutine increment(a,b)
use parameters
implicit none
! NOTE(review): an assumed-shape dummy declared (:,:) has lower bound 1 in both
! dimensions regardless of the bounds the caller allocated, while i and j
! computed below start at 0, so a(i,j) is indexed outside the declared bounds.
integer,dimension(:,:)::a
integer::b
! n is declared but never used.
integer::idx,i,j,n,nx_1
! Extent of the first dimension, used as the stride of the linear index.
nx_1=nx+1
! Linear thread index starting at 0 (blockIdx and threadIdx start at 1).
idx=(blockIdx%x-1)*blockDim%x+threadIdx%x-1
! Split the linear index: j is the second subscript, i the first
! (i varies fastest).
j=int(idx/nx_1)
i=mod(idx,nx_1)
! Threads beyond the last element do nothing.
if (idx < nxny) then
print *, 1,i,j,idx,a(i,j),b
a(i,j)=a(i,j)+b
print *, 2,i,j,idx,a(i,j),b
end if
end subroutine increment
end module simpleOps_m
!==========================================================================
!==========================================================================
! Host driver: fills a (0:nx,0:ny) array with 1, copies it and the scalar b=3
! to the device, launches increment, copies both back, and reports
! 'program passed' only if every element of a equals 4.
program main
use cudafor
use parameters
use simpleOps_m
implicit none
! Host array and its device counterpart.
integer,dimension(:,:), allocatable::a
integer, device,dimension(:,:), allocatable::a_d
! tPB is the number of threads per block used for the launch.
integer::i,j,b,status,tPB=256
! Device copy of the scalar b.
integer,device::b_d
! Both arrays are allocated with zero-based bounds.
allocate(a(0:nx,0:ny))
a=1
b=3
allocate(a_d(0:nx,0:ny))
! Print the host values before the kernel runs (tag 1).
do j=0,ny
do i=0,nx
write(*,'(6I6)') 1,i,j,a(i,j),b
end do
end do
!write(*,*) 'a=',a,'b=',b
!a_d=a
!b_d=b
! Explicit host-to-device copies with counts nxny and 1.
! NOTE(review): the count is presumably in elements, not bytes - confirm
! against the CUDA Fortran cudaMemcpy interface.
! status is overwritten by every call and never inspected.
status=cudaMemcpy(a_d,a,nxny,cudaMemcpyHostToDevice)
status=cudaMemcpy(b_d,b,1,cudaMemcpyHostToDevice)
call increment<<<ceiling(real(nxny)/tPB),tPB>>>(a_d,b_d)!ceiling returns the smallest integer not less than its real argument
! Copy the results back to the host.
status=cudaMemcpy(a,a_d,nxny,cudaMemcpyDeviceToHost)
status=cudaMemcpy(b,b_d,1,cudaMemcpyDeviceToHost)
!a=a_d
!b=b_d
! Every element started at 1 and b is 3, so 4 is expected everywhere.
if (any(a /=4)) then
write(*,*) 'program failed'
else
write(*,*) 'program passed'
end if
! Report the launch configuration: number of blocks, threads per block.
write(*,*) ceiling(real(nxny)/tPB),tPB
! Print the host values after the kernel ran (tag 2).
do j=0,ny
do i=0,nx
write(*,'(6I6)') 2,i,j,a(i,j),b
end do
end do
!deallocate(a,a_d)
end program main
!==========================================================================
Hi and welcome!
The problem here is that you’re passing in zero-based arrays, but the kernel assumes they are default one-based arrays: an assumed-shape dummy argument declared as (:,:) does not inherit the caller’s bounds and always starts at one. The fix is to set the lower bounds of “a” in the kernel to zero. Also, since “b” is a scalar, I’d recommend passing it in by value rather than creating a device variable.
example.cuf:
!> Compile-time grid constants.
module parameters
  implicit none
  !> Largest index in the first and second dimension.
  integer, parameter :: nx = 2, ny = 2
  !> Total number of points when each dimension runs from 0 to its largest index.
  integer, parameter :: nxny = (nx + 1)*(ny + 1)
end module parameters
!==========================================================================
!==========================================================================
!> Device code for the example: a single kernel that adds a scalar to an array.
module simpleOps_m
  implicit none
contains
  !> Add b to every element of a, one element per thread, printing each
  !> element before (tag 1) and after (tag 2) the update.
  !>
  !> a : rank-2 integer device array. The dummy is declared with lower
  !>     bounds of 0 because an assumed-shape dummy does not inherit the
  !>     caller's bounds; the caller is expected to allocate it zero-based.
  !> b : increment, passed by value so no device copy of it is needed.
  !>
  !> Threads whose linear index is not below size(a) do nothing, so any
  !> launch configuration with at least size(a) threads is valid.
  attributes(global) subroutine increment(a,b)
    implicit none
    integer, dimension(0:,0:), intent(inout) :: a
    integer, value :: b
    integer :: idx, i, j, n_rows

    ! Take the extents from the array itself instead of module constants so
    ! the guard and the index split stay correct for any shape passed in.
    n_rows = size(a,1)

    ! Linear thread index starting at 0 (blockIdx and threadIdx start at 1).
    idx = (blockIdx%x-1)*blockDim%x + threadIdx%x - 1

    if (idx < size(a)) then
      ! First subscript varies fastest, matching column-major storage.
      ! Computed inside the guard so an empty array never divides by zero.
      j = idx/n_rows
      i = mod(idx,n_rows)
      print *, 1,i,j,idx,a(i,j),b
      a(i,j) = a(i,j) + b
      print *, 2,i,j,idx,a(i,j),b
    end if
  end subroutine increment
end module simpleOps_m
!==========================================================================
!==========================================================================
!> Host driver: fills a zero-based (0:nx,0:ny) array with 1, adds b=3 to every
!> element on the device, and reports 'program passed' only if every element
!> comes back as 4.
program main
  use cudafor
  use parameters, only: nx, ny, nxny
  use simpleOps_m, only: increment
  implicit none
  ! Threads per block for the kernel launch.
  integer, parameter :: tPB = 256
  ! Host array and its device counterpart.
  integer, dimension(:,:), allocatable :: a
  integer, device, dimension(:,:), allocatable :: a_d
  integer :: i, j, b, status, n_blocks

  allocate(a(0:nx,0:ny))
  a = 1
  b = 3
  allocate(a_d(0:nx,0:ny))

  ! Print the host values before the kernel runs (tag 1).
  do j = 0, ny
    do i = 0, nx
      write(*,'(6I6)') 1,i,j,a(i,j),b
    end do
  end do
  !write(*,*) 'a=',a,'b=',b

  ! Host-to-device transfer by array assignment.
  a_d = a

  ! Enough blocks to give every element its own thread.
  n_blocks = ceiling(real(nxny)/tPB)
  call increment<<<n_blocks,tPB>>>(a_d,b)

  ! A kernel launch returns no status, so query it explicitly: first for
  ! launch errors, then for errors raised while the kernel executed.
  status = cudaGetLastError()
  if (status == cudaSuccess) status = cudaDeviceSynchronize()
  if (status /= cudaSuccess) then
    write(*,*) 'kernel failed: ', trim(cudaGetErrorString(status))
    stop 1
  end if

  ! Device-to-host transfer by array assignment.
  a = a_d

  ! Every element started at 1 and b is 3, so 4 is expected everywhere.
  if (any(a /= 4)) then
    write(*,*) 'program failed'
  else
    write(*,*) 'program passed'
  end if

  ! Report the launch configuration: number of blocks, threads per block.
  write(*,*) n_blocks,tPB

  ! Print the host values after the kernel ran (tag 2).
  do j = 0, ny
    do i = 0, nx
      write(*,'(6I6)') 2,i,j,a(i,j),b
    end do
  end do

  deallocate(a_d)
  deallocate(a)
end program main
!==========================================================================
% nvfortran example.cuf ; a.out
1 0 0 1 3
1 1 0 1 3
1 2 0 1 3
1 0 1 1 3
1 1 1 1 3
1 2 1 1 3
1 0 2 1 3
1 1 2 1 3
1 2 2 1 3
1 0 0 0 1 3
1 1 0 1 1 3
1 2 0 2 1 3
1 0 1 3 1 3
1 1 1 4 1 3
1 2 1 5 1 3
1 0 2 6 1 3
1 1 2 7 1 3
1 2 2 8 1 3
2 0 0 0 4 3
2 1 0 1 4 3
2 2 0 2 4 3
2 0 1 3 4 3
2 1 1 4 4 3
2 2 1 5 4 3
2 0 2 6 4 3
2 1 2 7 4 3
2 2 2 8 4 3
program passed
1 256
2 0 0 4 3
2 1 0 4 3
2 2 0 4 3
2 0 1 4 3
2 1 1 4 3
2 2 1 4 3
2 0 2 4 3
2 1 2 4 3
2 2 2 4 3
Hope this helps,
Mat
Hi Mat,
Thank you so much for your help — your reply was extremely helpful! I had been stuck on this issue for several days.
I tried the three cases (0:,0:), (:,:), and (1:,1:), and just as you said, “the assumption in the kernel is that they are default one-based arrays.” That was exactly the key.
I really appreciate your assistance.
By the way, I’m currently learning CUDA Fortran. Would you be able to recommend any learning materials — books, papers, manuals, or anything else you think is useful?
Thanks again, and I hope you have a great day!
CUDA Fortran for Scientists and Engineers is quite good.
Then we have the CUDA Fortran Programmer’s Guide, but it is more of a syntax reference than a source of worked examples like the book.
The Fortran CUDA Interface Guide is also very helpful if you’re calling CUDA Libraries.
There is a new edition of the CUDA Fortran for Scientists and Engineers book:
got it!
thanks mat, you are my hero!
have a nice day~
thank you my friend! it is really helpful!
