Hi Jing,
Having not used cublasDgemmBatched before myself, it was nice to learn something new!
cublasDgemmBatched doesn’t take matrices as arguments, but rather it takes arrays of pointers to matrices as arguments. So you can’t pass A, B, and directly, but rather need to create an array of device pointers to each of the 1000 arrays. I’ve have updated your code (See below) with this change.
However since your arrays are evenly strided, you can instead use cublasDgemmStridedBatched and then pass A, B, and C directly to routine (within a host_data region). I’ve included an example of this below as well. Also, the strided version should perform better as well.
Note for my tests, I’m using a Skylake CPU on Linux with a Tesla V100 and PGI 18.10. Though earlier versions of PGI work as well.
Hope this helps,
Mat
% cat batchedDgemm.f90
program main
use cublas
implicit none
integer :: dim, stat, i, j, k, batch_count
integer(8) :: bytes
real(8),dimension(:,:,:), allocatable :: A, B, C
type(c_devptr), device, dimension(:), allocatable :: dA, dB, dC
real(8) :: alpha, beta
real(8) :: index, sum
type(cublasHandle) :: handle
integer :: sizeof_double
parameter (sizeof_double=8)
!Linear dimension of matrices
dim = 100
! Number of A,B,C matrix sets
batch_count = 1000
! Allocate host storage for A,B,C square matrices
allocate(A(dim,dim,batch_count))
allocate(B(dim,dim,batch_count))
allocate(C(dim,dim,batch_count))
allocate(dA(batch_count))
allocate(dB(batch_count))
allocate(dC(batch_count))
!$acc enter data create(A,B,C)
! Fill A,Bd diagonals with sin(i) data, C diagonal with cos(i)^2
! Matrices are arranged column major
!$acc data present(A,B,C)
!$acc kernels
!$acc loop
do k=1,batch_count
!$acc loop
do j=1,dim
!$acc loop
do i=1,dim
if (i==j) then
index = real(j*dim + i)
a(i,j,k) = k*sin(index)
b(i,j,k) = sin(index)
c(i,j,k) = k*cos(index) * cos(index)
else
a(i,j,k) = 0.0
b(i,j,k) = 0.0
c(i,j,k) = 0.0
endif
enddo ! i
enddo ! j
enddo ! k
!$acc end kernels
! Create cublas instance
stat = cublasCreate(handle)
if(stat /= CUBLAS_STATUS_SUCCESS) write(*,*) 'cublasCreate failed'
! Set matrix coefficients
alpha = 1.0
beta = 1.0
! batched DGEMM: C = alpha*A*B + beta*C
do i=1,1000
dA(i) = C_DEVLOC(A(:,:,i))
dB(i) = C_DEVLOC(B(:,:,i))
dC(i) = C_DEVLOC(C(:,:,i))
enddo
stat=cublasDgemmBatched(handle,CUBLAS_OP_N,CUBLAS_OP_N,dim,dim,dim,alpha,&
dA,dim,dB,dim,beta,dC,dim,batch_count)
if(stat /= CUBLAS_STATUS_SUCCESS) write(*,*) 'cublasDgemm failed:',stat
! Simple sanity test, sum up all elements
sum = 0.0
!$acc kernels
!$acc loop
do k=1,batch_count
!$acc loop
do j=1,dim
do i=1,dim
sum = sum + C(i,j,k)
enddo
enddo
enddo
!$acc end kernels
print *, "Sum is:", sum, "should be: ",dim*(batch_count)*(batch_count+1)/2
!$acc end data
!$acc exit data delete(A,B,C)
deallocate(A)
deallocate(B)
deallocate(C)
deallocate(dA)
deallocate(dB)
deallocate(dC)
stat = cublasDestroy(handle)
end program
% pgf90 -ta=tesla:cc70 -Mcudalib=cublas batchedDgemm.f90 -Mcuda -V18.10
% a.out
Sum is: 50050000.00000000 should be: 50050000
Example of using strided:
% cat batchedStridedDgemm.f90
program main
use cublas
implicit none
integer :: dim, stat, i, j, k, stride, batch_count
integer(8) :: bytes
real(8),dimension(:,:,:), allocatable :: A, B, C
real(8) :: alpha, beta
real(8) :: index, sum
type(cublasHandle) :: handle
integer :: sizeof_double
parameter (sizeof_double=8)
!Linear dimension of matrices
dim = 100
stride = dim*dim
! Number of A,B,C matrix sets
batch_count = 1000
! Allocate host storage for A,B,C square matrices
allocate(A(dim,dim,batch_count))
allocate(B(dim,dim,batch_count))
allocate(C(dim,dim,batch_count))
allocate(dA(batch_count))
allocate(dB(batch_count))
allocate(dC(batch_count))
!$acc enter data create(A,B,C)
! Fill A,Bd diagonals with sin(i) data, C diagonal with cos(i)^2
! Matrices are arranged column major
!$acc data present(A,B,C)
!$acc kernels
!$acc loop
do k=1,batch_count
!$acc loop
do j=1,dim
!$acc loop
do i=1,dim
if (i==j) then
index = real(j*dim + i)
a(i,j,k) = k*sin(index)
b(i,j,k) = sin(index)
c(i,j,k) = k*cos(index) * cos(index)
else
a(i,j,k) = 0.0
b(i,j,k) = 0.0
c(i,j,k) = 0.0
endif
enddo ! i
enddo ! j
enddo ! k
!$acc end kernels
! Create cublas instance
stat = cublasCreate(handle)
if(stat /= CUBLAS_STATUS_SUCCESS) write(*,*) 'cublasCreate failed'
! Set matrix coefficients
alpha = 1.0
beta = 1.0
! batched DGEMM: C = alpha*A*B + beta*C
!$acc host_data use_device(A,B,C)
stat=cublasDgemmStridedBatched(handle,CUBLAS_OP_N,CUBLAS_OP_N,dim,dim,dim,alpha,&
A,dim,stride,B,dim,stride,beta,C,dim,stride,batch_count)
!$acc end host_data
if(stat /= CUBLAS_STATUS_SUCCESS) write(*,*) 'cublasDgemm failed:',stat
! Simple sanity test, sum up all elements
sum = 0.0
!$acc kernels
!$acc loop
do k=1,batch_count
!$acc loop
do j=1,dim
do i=1,dim
sum = sum + C(i,j,k)
enddo
enddo
enddo
!$acc end kernels
print *, "Sum is:", sum, "should be: ",dim*(batch_count)*(batch_count+1)/2
!$acc end data
!$acc exit data delete(A,B,C)
deallocate(A)
deallocate(B)
deallocate(C)
deallocate(dA)
deallocate(dB)
deallocate(dC)
stat = cublasDestroy(handle)
end program
% pgf90 -ta=tesla:cc70 -Mcudalib=cublas batchedStridedDgemm.f90 -Mcuda -V18.10
% a.out
Sum is: 50050000.00000000 should be: 50050000