I’m facing some issue with a cuda fortran test program involving 2 dimensional arrays. The data copy of 2D array from device to host is not working. Please verify the below program and let me know where I’m doing the wrong.
$ pgf90 -Mcuda multi_dim.cuf -o multi_dim.exe
$ ./multi_dim.exe
Data host to device
kernel finish
0: copyout Memcpy (host=0x675b80, dev=0x700200000, size=32) FAILED: 4(unspecified launch failure)
$
attributes(global) subroutine mdimen_kernel(m,n,a)
implicit none
integer i,j
integer, value :: m,n
real :: a(:,:)
i = ( blockIdx %x -1)* blockDim %x + threadIdx %x
! 2 blocks & 4 threads
a( blockIdx %x , threadIdx %x) = i
end subroutine
program testmdim
use cudafor
implicit none
integer m,n,i,j,istat
parameter (m = 2, n = 4)
! Separately two arrays are declared. aout is the array to
! hold back the values from kernel
real :: a(m,n), aout(m,n)
real, device :: a_d(m,n)
! Initialize host array
do i = 1,m
do j = 1,n
a(i,j) = i+j
enddo
enddo
! Data copy from host to device array
a_d = a
write(*,*) 'Data host to device'
call mdimen_kernel<<<2,4>>> (m,n,a_d)
istat=cudaThreadSynchronize()
write(*,*) 'kernel finish'
! Data copy back to host from kernel
aout = a_d
write(*,*) 'Data device to host'
! Printing the kernel output
do i=1,m
do j=1,n
write(*,*) 'aout( ',i,' ',j,' )', aout(i,j)
enddo
enddo
end program