Hi,
I have a problem with a simple CUDA Fortran application; the code is below. It works fine when the array dimensions are small (e.g. Nb=2, Ns=3), but when I increase them to the values used in the code below, it fails with the following error:
0: copyover Memcpy <…> FAILED: 30
It looks like the failure happens while the device array is being copied, but I don't understand why. Why does the problem appear only when the array dimensions are large?
Can anyone suggest anything?
Thanks
Bulent
!==============================================!
! Device-resident data and the kernel that updates it.
!
! vd and vdp are allocated by the host with shape (Nb, Ns, Nk, Ni).
! v_kernel reads vdp and writes vd; Ns_dev and Nb_dev hold the extents
! of the second and first dimensions so the kernel can decode its
! linear block index.
module gpu
  use cudafor
  implicit none

  ! Number of threads per block the kernel's shared buffer is sized for.
  ! v_kernel must be launched with exactly this many threads in x, because
  ! the reduction below runs over the whole buffer.
  integer, parameter :: vk_block_size = 768

  ! vd: kernel output, vdp: kernel input (both indexed as (b, s, k, i)).
  real, device, dimension(:,:,:,:), allocatable :: vd, vdp
  ! Device copies of the host-side extents Ns and Nb.
  integer, device :: Ns_dev, Nb_dev

contains

  ! For one (b, s, k, i) element per thread block, computes
  !   vd(b,s,k,i) = max over t of [ log(real(b+s+k+i-t)) + vdp(b,s,k,t) ]
  ! where t = threadidx%x runs over the threads of the block.
  !
  ! Launch layout expected by the index arithmetic below:
  !   grid  x : linear index over (b, s, k), b fastest, 1 .. Nb*Ns*Nk
  !   grid  y : i
  !   block x : t, exactly vk_block_size threads
  !
  ! NOTE(review): the log argument b+s+k+i-t is <= 0 whenever
  ! t >= b+s+k+i, so those threads do not contribute a finite positive-log
  ! term. This is unchanged from the original formula - confirm that it is
  ! intended.
  attributes(global) subroutine v_kernel()
    implicit none
    integer :: i, j, k, s, b, t, rem
    real, shared :: vtemp2(vk_block_size)

    i = blockidx%y
    j = blockidx%x

    ! Decode the 1-based linear index j = b + (s-1)*Nb + (k-1)*Ns*Nb using
    ! integer arithmetic only (no float rounding in the index computation).
    rem = j - 1
    k   = rem / (Ns_dev*Nb_dev) + 1
    rem = rem - (k-1)*(Ns_dev*Nb_dev)
    s   = rem / Nb_dev + 1
    b   = rem - (s-1)*Nb_dev + 1

    t = threadidx%x

    ! Each thread only ever needs its own vdp element, so it is read
    ! straight into the expression instead of being staged in shared memory.
    vtemp2(t) = log(real(b+s+k+i-t)) + vdp(b,s,k,t)

    ! Every thread's candidate must be visible before the reduction.
    call syncthreads()

    ! A single thread reduces and stores. Having every thread run maxval
    ! and write the same element multiplies the work by the block size
    ! without changing the result.
    if (t == 1) vd(b,s,k,i) = maxval(vtemp2)
  end subroutine v_kernel

end module gpu
!==============================================================================!
! Host driver: allocates the device arrays, runs v_kernel n_iter times
! (feeding each result back in as the next input) and reports the elapsed
! CPU time.
program main
  use cudafor
  use gpu
  implicit none

  integer, parameter :: Nb=20, Ns=30, Nk=7, Ni=768
  integer, parameter :: n_iter = 5
  integer :: istat, idevice
  integer :: j
  type(dim3) :: dimGrid, dimBlock
  real :: begin, finish

  idevice = 0
  call check(cudaSetDevice(idevice), 'cudaSetDevice')

  ! One block per (b, s, k, i) element: x enumerates (b, s, k), y is i.
  ! One thread per entry of the 4th dimension being reduced over.
  dimGrid  = dim3(Nb*Ns*Nk, Ni, 1)
  dimBlock = dim3(Ni, 1, 1)

  ! Publish the extents the kernel needs to decode its block index.
  Ns_dev = Ns
  Nb_dev = Nb

  allocate(vd(Nb,Ns,Nk,Ni), vdp(Nb,Ns,Nk,Ni), stat=istat)
  if (istat /= 0) then
    print *, 'device allocation failed, stat=', istat
    error stop
  end if
  vdp = 0

  call cpu_time(begin)
  do j = 1, n_iter
    call v_kernel<<<dimGrid,dimBlock>>>()
    ! Launch-configuration errors are reported immediately ...
    call check(cudaGetLastError(), 'v_kernel launch')
    ! ... while execution errors only show up once the kernel has finished.
    ! Without this check a kernel fault is reported by whichever runtime
    ! call comes next (here the device-to-device copy below).
    call check(cudaDeviceSynchronize(), 'v_kernel execution')
    vdp = vd
  end do
  call check(cudaDeviceSynchronize(), 'final synchronize')
  call cpu_time(finish)

  print *, 'gpu time=', finish-begin, 'seconds'

  deallocate(vd, vdp)

contains

  ! Stops the program with a readable message if a CUDA runtime call
  ! returned anything other than cudaSuccess.
  !   stat : status returned by the runtime call
  !   what : short label identifying the call, used in the message
  subroutine check(stat, what)
    integer, intent(in) :: stat
    character(len=*), intent(in) :: what

    if (stat /= cudaSuccess) then
      print *, what, ' failed (', stat, '): ', trim(cudaGetErrorString(stat))
      error stop
    end if
  end subroutine check

end program main