CUDA Fortran problem

What is wrong with my program? I am getting wrong values:

! Kernel as posted in the question: intended to compute c = a + b for 2x2
! integer matrices, one element per thread.
! NOTE(review): this is the version that triggers the reported warning and
! wrong values; it is kept unchanged here.
attributes (global)subroutine matadd(a,b,c)

integer ,dimension(2,2) :: a,b,c
integer i,j
! i is the thread index within the block, j is the block index (x only).
i = threadIdx%x
j = blockIdx%x

	 
	! NOTE(review): a, b and c are declared rank-2 above but are referenced
	! with a single linearised subscript here, which is what the
	! "number of subscripts is less than the rank" warning refers to.
	c(i+j*blockDim%x) = a(i+j*blockDim%x) + b(i+j*blockDim%x)
end subroutine matadd

! Host driver as posted in the question: fills two 2x2 matrices with ones,
! copies them to device arrays, launches matadd and prints the result.
! NOTE(review): matadd is defined outside this program and no interface for
! it is visible here.
program matrix
use cudafor
 integer ,dimension(2,2):: m,n,p
 integer, device,allocatable,dimension(:,:) :: mdev,ndev,pdev
 integer i,j
 type(dim3) :: dimBlock
 allocate (mdev(2,2),ndev(2,2),pdev(2,2))
 ! Initialise both host operands to 1 everywhere.
 do i=1,2
 do j=1,2
 m(i,j)=1
 n(i,j)=1
 end do
 end do

 ! Assignment to a device array copies host data to the device.
 mdev=m
 ndev=n


 ! NOTE(review): dimBlock is declared type(dim3) but is assigned from
 ! dim(1,1) rather than dim3(...), and it is not used in the launch below.
 dimBlock =dim(1,1)
 ! NOTE(review): the launch configuration holds a single value (4);
 ! presumably a grid size with no block size given -- TODO confirm.
 call matadd <<<4>>> (mdev, ndev, pdev)

 ! Copy the result back from the device.
 p=pdev

do i=1,2
do j=1,2
print*,p(i,j)

!print*,n(i,j)

!print*,p(i,j)
end do
end do

! NOTE(review): deallocate is given subscripted references (mdev(2,2), ...)
! rather than the bare allocatable array names.
deallocate (mdev(2,2),ndev(2,2),pdev(2,2))
 end program matrix

I am getting this warning:

The number of subscripts is less than the rank of c,b,a

Hi Kuldeep Gupta,

You have a number of issues here. First, the kernel indexes the arrays with a single subscript even though they are declared with two dimensions. Second, the “Idx” usage is incorrect: in CUDA Fortran threadIdx and blockIdx start at 1, so the global index is (blockIdx%x-1)*blockDim%x + threadIdx%x. Third, global kernel routines need an explicit interface, which they get automatically when placed in a module. I’ve fixed your code (below) and also made it a bit more general.

Hope this helps,
Mat

% cat test.cuf
!> Element-wise integer matrix addition on the GPU (CUDA Fortran).
!>
!> The kernel lives in a module so that callers which `use foo` see an
!> explicit interface for it.
module foo

  implicit none

  !> Extents of the matrices handled by matadd (N rows by M columns).
  integer, parameter :: N=2
  integer, parameter :: M=2
  !> Threads per block along x and along y, for use in the launch configuration.
  integer, parameter :: BLOCK_SIZE=16

contains

  !> Kernel: c(i,j) = a(i,j) + b(i,j), one thread per matrix element.
  !>
  !> a, b : input N x M matrices
  !> c    : output N x M matrix receiving the element-wise sum
  attributes(global) subroutine matadd(a, b, c)
    integer, dimension(N,M), intent(in)  :: a, b
    integer, dimension(N,M), intent(out) :: c
    integer :: i, j

    ! Global 1-based indices: the block index is shifted by one before
    ! scaling by the block extent, then the thread index is added.
    i = (blockIdx%x - 1) * blockDim%x + threadIdx%x
    j = (blockIdx%y - 1) * blockDim%y + threadIdx%y

    ! Threads whose indices fall outside the N x M matrix do nothing.
    if (i <= N .and. j <= M) then
      c(i,j) = a(i,j) + b(i,j)
    end if

  end subroutine matadd

end module foo

!> Host driver: fills two N x M integer matrices with ones, adds them on the
!> GPU with the matadd kernel from module foo, and prints every element of
!> the result together with its indices.
program matrix
  use cudafor
  use foo, only: matadd, N, M, BLOCK_SIZE
  implicit none

  integer, dimension(N,M) :: a, b, c
  integer, device, allocatable, dimension(:,:) :: adev, bdev, cdev
  integer :: i, j, istat
  type(dim3) :: dimBlock, dimGrid

  allocate (adev(N,M), bdev(N,M), cdev(N,M))

  ! Both operands are all ones, so every element of the sum should be 2.
  a = 1
  b = 1

  ! Assignment to a device array copies host data to the device.
  adev = a
  bdev = b

  ! Round the grid extents up so the BLOCK_SIZE x BLOCK_SIZE blocks cover
  ! the whole matrix even when N or M is not a multiple of BLOCK_SIZE.
  dimGrid  = dim3((N+BLOCK_SIZE-1)/BLOCK_SIZE, (M+BLOCK_SIZE-1)/BLOCK_SIZE, 1)
  dimBlock = dim3(BLOCK_SIZE, BLOCK_SIZE, 1)
  call matadd<<<dimGrid,dimBlock>>>(adev, bdev, cdev)

  ! A failed launch does not stop the program by itself; check explicitly
  ! so that a bad configuration is reported instead of printing garbage.
  istat = cudaGetLastError()
  if (istat /= cudaSuccess) then
    print *, 'matadd launch failed: ', trim(cudaGetErrorString(istat))
    stop 1
  end if

  ! Copy the result back to the host.
  c = cdev

  do i = 1, N
    do j = 1, M
      print *, i, j, c(i,j)
    end do
  end do

  deallocate (adev, bdev, cdev)
end program matrix
% pgf90 test.cuf -Mcuda
% a.out
            1            1            2
            1            2            2
            2            1            2
            2            2            2