About part of 4-D array multiply

 module simpleOps_m
  contains
   attributes ( global ) subroutine increment (a, b ,c ,nx,ny,pp)
     use cublas_device
    implicit none
    integer,value :: nx,ny
    integer :: a(: ,:),c(:,:),pp(:,:,:,:)
    integer  :: b(:,:)
    integer :: i, j,n(2),m,ll,kk,knode,aa,i2,j2
!    double precision, device :: alpha, beta
!     type(cublasHandle) :: ch1
!     integer ::transa, transb,istat

   i = ( blockIdx %x -1)* blockDim %x + threadIdx %x
   j = ( blockIdx %y -1)* blockDim %y + threadIdx %y
    if(i<=100 .and. j<=10) then
     knode=1
      do ll=1,5
        do kk=1,5
	aa=(i-ll)+(j-kk)
      if(aa.ge.0) then
       c(1,knode)=a(ll,kk)*b(i,j)
       c(2,knode)=2*a(ll,kk)*b(i,j)
       c(3,knode)=3*a(ll,kk)*b(i,j)
       c(4,knode)=4*a(ll,kk)*b(i,j)
       c(5,knode)=5*a(ll,kk)*b(i,j)
       c(6,knode)=6*a(ll,kk)*b(i,j)
       do i2=1,6
       pp(i,j,i2,knode)=c(i2,knode)  
       end do
       knode=knode+1
      end if
      end do
     end do
    end if
    call syncthreads()
    end subroutine
!     end module

   attributes ( global ) subroutine dgemm (ppp ,cc )
     use cublas_device
    implicit none
    integer,device :: pppp(:,:,: ,:), cc(:,:,:,:),ppp(:,:,:,:)
    integer,shared :: p1(6,25),p2(6,25)
     integer,device :: p3(6,6)
    integer :: i, j,n(2),m,ll,kk,knode,aa,i2,j2
    double precision, device :: alpha, beta
     type(cublasHandle) :: ch1
     integer ::transa, transb,istat,tx,ty,idx,idy
     
      tx = threadidx%x
      ty = threadidx%y
      idx = blockidx%x
      idy = blockidx%y 

     p1(tx,ty) = ppp(idx,idy,tx,ty)
     p2(tx,ty) = ppp(idx,idy,tx,ty)

     call syncthreads()

     istat = cublasCreate_v2(ch1)
     alpha = 1.0d0
      beta  = 0.0d0
       transa = cublas_op_n
       transb = cublas_op_t
      istat = cublasDgemm_v2(ch1, transa, transb, 6, 6, 25, alpha, &
       p1, 6, p2, 6, beta, p3, 6)
      
         do i2=1,6
	  do j2=1,6
          cc(i,j,i2,j2)= p3(i2,j2)  
          end do
	  end do
	   
        istat = cublasDestroy_v2(ch1)
  !  end do
  !  end do
    call syncthreads()
  end subroutine
   end module 
    
    
    
  program incrementTest
   use cudafor
   use cublas_device
   use simpleOps_m
   use simpleOps_m1
   implicit none
   integer , parameter :: nx =100 , ny =10
   integer :: a(5 ,5), b(100,10) ,c(6,25),pp(100,10,6,25),cc(100,10,6,6),&
              c1(6,25),ppp(100,10,6,25),p1(6,25),p2(6,25),pppp(100,10,6,25)
   integer , device :: ppp_d(100,10,6,25),d_p1(6,25),d_p2(6,25)
   type ( dim3 ) :: grid , tBlock, grid1 ,tBlock1
   integer ::i,j,i2,j2,istat,i3
   real*8 ::alpha,beta
   integer,device, allocatable ::pp_d(:,:,:,:),c_d(:,:),c1_d(:,:),a_d(:,:),&
                                b_d(:,:),cc_d(:,:,:,:),pppp_d(:,:,:,:)
   allocate(pp_d(100,10,6,25),c_d(6,25),c1_d(6,25),a_d(5,5),&
           b_d(100,10),cc_d(100,10,6,6),pppp_d(1000,10,6,26))
   a = 2
   c = 0
   do i=1,100
     do j=1,10
     b(i,j)=i*j
   end do
   end do

   open(1,file='zzz.dat')
   tBlock = dim3 (32 ,32 ,1)
   grid = dim3 ( ceiling ( real (nx )/ tBlock %x), &
   ceiling ( real (ny )/ tBlock %y), 1)
   a_d = a
   b_d = b
    call increment <<<grid , tBlock >>>(a_d , b_d ,c_d,nx,ny,pp_d)
   pp=pp_d
  
   ppp=pp
   deallocate(pp_d,a_d,b_d,c_d)
! ! ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   
   tBlock1 = dim3 (6 ,25 ,1)
   grid1 = dim3 ( 100, 10, 1)
   
     ppp_d =ppp
 call dgemm <<<grid1 , tBlock1 >>>( ppp_d ,cc_d)
      cc = cc_d
     do i=1,100
       do j=1,10
       do i2=1,6
        do j2=1,6
	if(cc(i,j,i2,j2)>=0.0000001) then
        write(1,*) cc(i,j,i2,j2),i2,j2,i,j
	end if
	end do
	end do
    end do
   end do
       
   end

!!!

This code is my writing .And the compilin is okey,but when I run the result of compilation,my servicer is dead!
(GPU kard is K20 / cuda 5.0/ PGI 13.9)
can you tell me what goes wrong ? And if you give me a correct code I will be much appreciated !

Hi tsariner1986,

I see two initial problems:

         do i2=1,6 
     do j2=1,6 
          cc(i,j,i2,j2)= p3(i2,j2)  
          end do 
     end do

I and j are initialized so you’re using garbage indices. I suspect you meant to use idx and idy

Second, I believe only one thread should be calling the cublas routines, hence this section of code should be guarded by an “if (tx.eq.1.and.ty.eq.1) then”. Though, I’m doubling checking with Brent on this point.

Also, I’m not sure you can pass shared data to cublasDgemm_v2. Again, I’m checking with Brent on this.

  • Mat

Hi Mat:
Thank you for your prompt reply!
First,’ i & j’ should be replaced by ’ idx & idy’, this is my clerical error.
Second, I am not sure yet multi-thread should be calling the cublas routines .So once you and Brent have an fix result and tell me the answer, I would be very grateful !

----tsariner1986

Hi tsariner1986,

Brent confirmed that you can’t pass shared to the cublas routines. They will spawn another kernel which wont have access to this block’s shared memory.

Sorry,
Mat