module simpleOps_m
contains
attributes ( global ) subroutine increment (a, b ,c ,nx,ny,pp)
use cublas_device
implicit none
integer,value :: nx,ny
integer :: a(: ,:),c(:,:),pp(:,:,:,:)
integer :: b(:,:)
integer :: i, j,n(2),m,ll,kk,knode,aa,i2,j2
! double precision, device :: alpha, beta
! type(cublasHandle) :: ch1
! integer ::transa, transb,istat
i = ( blockIdx %x -1)* blockDim %x + threadIdx %x
j = ( blockIdx %y -1)* blockDim %y + threadIdx %y
if(i<=100 .and. j<=10) then
knode=1
do ll=1,5
do kk=1,5
aa=(i-ll)+(j-kk)
if(aa.ge.0) then
c(1,knode)=a(ll,kk)*b(i,j)
c(2,knode)=2*a(ll,kk)*b(i,j)
c(3,knode)=3*a(ll,kk)*b(i,j)
c(4,knode)=4*a(ll,kk)*b(i,j)
c(5,knode)=5*a(ll,kk)*b(i,j)
c(6,knode)=6*a(ll,kk)*b(i,j)
do i2=1,6
pp(i,j,i2,knode)=c(i2,knode)
end do
knode=knode+1
end if
end do
end do
end if
call syncthreads()
end subroutine
! end module
attributes ( global ) subroutine dgemm (ppp ,cc )
use cublas_device
implicit none
integer,device :: pppp(:,:,: ,:), cc(:,:,:,:),ppp(:,:,:,:)
integer,shared :: p1(6,25),p2(6,25)
integer,device :: p3(6,6)
integer :: i, j,n(2),m,ll,kk,knode,aa,i2,j2
double precision, device :: alpha, beta
type(cublasHandle) :: ch1
integer ::transa, transb,istat,tx,ty,idx,idy
tx = threadidx%x
ty = threadidx%y
idx = blockidx%x
idy = blockidx%y
p1(tx,ty) = ppp(idx,idy,tx,ty)
p2(tx,ty) = ppp(idx,idy,tx,ty)
call syncthreads()
istat = cublasCreate_v2(ch1)
alpha = 1.0d0
beta = 0.0d0
transa = cublas_op_n
transb = cublas_op_t
istat = cublasDgemm_v2(ch1, transa, transb, 6, 6, 25, alpha, &
p1, 6, p2, 6, beta, p3, 6)
do i2=1,6
do j2=1,6
cc(i,j,i2,j2)= p3(i2,j2)
end do
end do
istat = cublasDestroy_v2(ch1)
! end do
! end do
call syncthreads()
end subroutine
end module
program incrementTest
use cudafor
use cublas_device
use simpleOps_m
use simpleOps_m1
implicit none
integer , parameter :: nx =100 , ny =10
integer :: a(5 ,5), b(100,10) ,c(6,25),pp(100,10,6,25),cc(100,10,6,6),&
c1(6,25),ppp(100,10,6,25),p1(6,25),p2(6,25),pppp(100,10,6,25)
integer , device :: ppp_d(100,10,6,25),d_p1(6,25),d_p2(6,25)
type ( dim3 ) :: grid , tBlock, grid1 ,tBlock1
integer ::i,j,i2,j2,istat,i3
real*8 ::alpha,beta
integer,device, allocatable ::pp_d(:,:,:,:),c_d(:,:),c1_d(:,:),a_d(:,:),&
b_d(:,:),cc_d(:,:,:,:),pppp_d(:,:,:,:)
allocate(pp_d(100,10,6,25),c_d(6,25),c1_d(6,25),a_d(5,5),&
b_d(100,10),cc_d(100,10,6,6),pppp_d(1000,10,6,26))
a = 2
c = 0
do i=1,100
do j=1,10
b(i,j)=i*j
end do
end do
open(1,file='zzz.dat')
tBlock = dim3 (32 ,32 ,1)
grid = dim3 ( ceiling ( real (nx )/ tBlock %x), &
ceiling ( real (ny )/ tBlock %y), 1)
a_d = a
b_d = b
call increment <<<grid , tBlock >>>(a_d , b_d ,c_d,nx,ny,pp_d)
pp=pp_d
ppp=pp
deallocate(pp_d,a_d,b_d,c_d)
! ! ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
tBlock1 = dim3 (6 ,25 ,1)
grid1 = dim3 ( 100, 10, 1)
ppp_d =ppp
call dgemm <<<grid1 , tBlock1 >>>( ppp_d ,cc_d)
cc = cc_d
do i=1,100
do j=1,10
do i2=1,6
do j2=1,6
if(cc(i,j,i2,j2)>=0.0000001) then
write(1,*) cc(i,j,i2,j2),i2,j2,i,j
end if
end do
end do
end do
end do
end
!!!
This code is my writing .And the compilin is okey,but when I run the result of compilation,my servicer is dead!
(GPU kard is K20 / cuda 5.0/ PGI 13.9)
can you tell me what goes wrong ? And if you give me a correct code I will be much appreciated !