About part of 4-D array multiply

tsariner1986 · January 6, 2014, 4:12pm

 module simpleOps_m
  contains
   attributes ( global ) subroutine increment (a, b ,c ,nx,ny,pp)
     use cublas_device
    implicit none
    integer,value :: nx,ny
    integer :: a(: ,:),c(:,:),pp(:,:,:,:)
    integer  :: b(:,:)
    integer :: i, j,n(2),m,ll,kk,knode,aa,i2,j2
!    double precision, device :: alpha, beta
!     type(cublasHandle) :: ch1
!     integer ::transa, transb,istat

   i = ( blockIdx %x -1)* blockDim %x + threadIdx %x
   j = ( blockIdx %y -1)* blockDim %y + threadIdx %y
    if(i<=100 .and. j<=10) then
     knode=1
      do ll=1,5
        do kk=1,5
	aa=(i-ll)+(j-kk)
      if(aa.ge.0) then
       c(1,knode)=a(ll,kk)*b(i,j)
       c(2,knode)=2*a(ll,kk)*b(i,j)
       c(3,knode)=3*a(ll,kk)*b(i,j)
       c(4,knode)=4*a(ll,kk)*b(i,j)
       c(5,knode)=5*a(ll,kk)*b(i,j)
       c(6,knode)=6*a(ll,kk)*b(i,j)
       do i2=1,6
       pp(i,j,i2,knode)=c(i2,knode)  
       end do
       knode=knode+1
      end if
      end do
     end do
    end if
    call syncthreads()
    end subroutine
!     end module

   attributes ( global ) subroutine dgemm (ppp ,cc )
     use cublas_device
    implicit none
    integer,device :: pppp(:,:,: ,:), cc(:,:,:,:),ppp(:,:,:,:)
    integer,shared :: p1(6,25),p2(6,25)
     integer,device :: p3(6,6)
    integer :: i, j,n(2),m,ll,kk,knode,aa,i2,j2
    double precision, device :: alpha, beta
     type(cublasHandle) :: ch1
     integer ::transa, transb,istat,tx,ty,idx,idy
     
      tx = threadidx%x
      ty = threadidx%y
      idx = blockidx%x
      idy = blockidx%y 

     p1(tx,ty) = ppp(idx,idy,tx,ty)
     p2(tx,ty) = ppp(idx,idy,tx,ty)

     call syncthreads()

     istat = cublasCreate_v2(ch1)
     alpha = 1.0d0
      beta  = 0.0d0
       transa = cublas_op_n
       transb = cublas_op_t
      istat = cublasDgemm_v2(ch1, transa, transb, 6, 6, 25, alpha, &
       p1, 6, p2, 6, beta, p3, 6)
      
         do i2=1,6
	  do j2=1,6
          cc(i,j,i2,j2)= p3(i2,j2)  
          end do
	  end do
	   
        istat = cublasDestroy_v2(ch1)
  !  end do
  !  end do
    call syncthreads()
  end subroutine
   end module 
    
    
    
  program incrementTest
   use cudafor
   use cublas_device
   use simpleOps_m
   use simpleOps_m1
   implicit none
   integer , parameter :: nx =100 , ny =10
   integer :: a(5 ,5), b(100,10) ,c(6,25),pp(100,10,6,25),cc(100,10,6,6),&
              c1(6,25),ppp(100,10,6,25),p1(6,25),p2(6,25),pppp(100,10,6,25)
   integer , device :: ppp_d(100,10,6,25),d_p1(6,25),d_p2(6,25)
   type ( dim3 ) :: grid , tBlock, grid1 ,tBlock1
   integer ::i,j,i2,j2,istat,i3
   real*8 ::alpha,beta
   integer,device, allocatable ::pp_d(:,:,:,:),c_d(:,:),c1_d(:,:),a_d(:,:),&
                                b_d(:,:),cc_d(:,:,:,:),pppp_d(:,:,:,:)
   allocate(pp_d(100,10,6,25),c_d(6,25),c1_d(6,25),a_d(5,5),&
           b_d(100,10),cc_d(100,10,6,6),pppp_d(1000,10,6,26))
   a = 2
   c = 0
   do i=1,100
     do j=1,10
     b(i,j)=i*j
   end do
   end do

   open(1,file='zzz.dat')
   tBlock = dim3 (32 ,32 ,1)
   grid = dim3 ( ceiling ( real (nx )/ tBlock %x), &
   ceiling ( real (ny )/ tBlock %y), 1)
   a_d = a
   b_d = b
    call increment <<<grid , tBlock >>>(a_d , b_d ,c_d,nx,ny,pp_d)
   pp=pp_d
  
   ppp=pp
   deallocate(pp_d,a_d,b_d,c_d)
! ! ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   
   tBlock1 = dim3 (6 ,25 ,1)
   grid1 = dim3 ( 100, 10, 1)
   
     ppp_d =ppp
 call dgemm <<<grid1 , tBlock1 >>>( ppp_d ,cc_d)
      cc = cc_d
     do i=1,100
       do j=1,10
       do i2=1,6
        do j2=1,6
	if(cc(i,j,i2,j2)>=0.0000001) then
        write(1,*) cc(i,j,i2,j2),i2,j2,i,j
	end if
	end do
	end do
    end do
   end do
       
   end

!!!

This code is my writing .And the compilin is okey,but when I run the result of compilation,my servicer is dead!
(GPU kard is K20 / cuda 5.0/ PGI 13.9)
can you tell me what goes wrong ? And if you give me a correct code I will be much appreciated !

MatColgrove · January 7, 2014, 2:28am

Hi tsariner1986,

I see two initial problems:

         do i2=1,6 
     do j2=1,6 
          cc(i,j,i2,j2)= p3(i2,j2)  
          end do 
     end do

I and j are initialized so you’re using garbage indices. I suspect you meant to use idx and idy

Second, I believe only one thread should be calling the cublas routines, hence this section of code should be guarded by an “if (tx.eq.1.and.ty.eq.1) then”. Though, I’m doubling checking with Brent on this point.

Also, I’m not sure you can pass shared data to cublasDgemm_v2. Again, I’m checking with Brent on this.

Mat

tsariner1986 · January 7, 2014, 5:23am

mkcolg:

Hi tsariner1986,

I see two initial problems:
         do i2=1,6 
     do j2=1,6 
          cc(i,j,i2,j2)= p3(i2,j2)  
          end do 
     end do
I and j are initialized so you’re using garbage indices. I suspect you meant to use idx and idy

Second, I believe only one thread should be calling the cublas routines, hence this section of code should be guarded by an “if (tx.eq.1.and.ty.eq.1) then”. Though, I’m doubling checking with Brent on this point.

Also, I’m not sure you can pass shared data to cublasDgemm_v2. Again, I’m checking with Brent on this.

Mat

Hi Mat:
Thank you for your prompt reply!
First,’ i & j’ should be replaced by ’ idx & idy’, this is my clerical error.
Second, I am not sure yet multi-thread should be calling the cublas routines .So once you and Brent have an fix result and tell me the answer, I would be very grateful !

----tsariner1986

MatColgrove · January 7, 2014, 4:05pm

Hi tsariner1986,

Brent confirmed that you can’t pass shared to the cublas routines. They will spawn another kernel which wont have access to this block’s shared memory.

Sorry,
Mat

Topic		Replies	Views
Is it correct that my Pascal card is calling Maxwell_gemm kernels through cublas? And if so, why is cublas unusably slow for me? CUDA Programming and Performance	6	940	August 23, 2018
0: copyout Memcpy... FAILED: 4(unspecified launch failure) Legacy PGI Compilers	3	5244	May 13, 2013
cuda memory issues Legacy PGI Compilers	13	9511	March 19, 2012
DGEMM parameter number 8 had an illegal value GPU-Accelerated Libraries	7	10060	August 12, 2013
Using CUDA Libraries from CUDA Fortran Device Code Legacy PGI Compilers	6	7434	July 19, 2017
Please help for using cublas Zgemm~ CUDA Programming and Performance	4	1733	July 27, 2015
cublasZgemm() gives false result for large data and potential bug GPU-Accelerated Libraries	6	1149	October 12, 2021
pgfortran hangs on .cuf file with shared memory Legacy PGI Compilers	2	4736	June 11, 2010
limit of computation CUDA Programming and Performance	44	32905	April 8, 2008
Incorrect result with complex arrays in global subroutine Legacy PGI Compilers	3	2505	August 2, 2012

About part of 4-D array multiply

Related topics