Hello,
I'm currently facing an issue with the gtsv2 function, which solves tridiagonal systems with multiple right-hand sides. Based on some earlier discussions in the forum, I have created interfaces in a Fortran module and bound them to C (sample code to compare times on the CPU and the GPU is given below).
The code is compiled as follows:
pgf90 -o tdma.exe -ta=tesla:cc60 -Minfo=accel -Mcudalib=cusparse -Mcuda -r8 -fast cusparse_vs_CPU_comparion.F90
As per the CUDA 10.1 manual, gtsv2 requires a user-supplied buffer, whose size is obtained with cusparseDgtsv2_bufferSizeExt; pBuffer is then allocated once.
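In condensed form, the pattern I am using (extracted from the full code below) is:

! query the buffer size once, allocate pBuffer once, then reuse it for every solve
status = cusparseDgtsv2_nopivot_bufferSizeExt(handle1, npts, n, atri, btri, ctri, dtri, npts, bufsize)
allocate(buf(bufsize))
Do calls = 1,5
   status = cusparseDgtsv2_nopivot(handle1, npts, n, atri, btri, ctri, dtri, npts, c_loc(buf))
Enddo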
The manual states that gtsv2 is superior to gtsv because it
a) does not allocate temporary storage internally (the caller supplies the buffer)
b) supports asynchronous execution
c) supports CUDA graph capture
However, apart from asynchronous execution, I can’t see any improvement.
If I call cusparseDgtsv2_nopivot in a loop, I expect gtsv to allocate temporary memory on every call, whereas gtsv2 should reuse the buffer allocated once up front (buf(bufsize)) and therefore be faster.
Instead, I see a similar profile (nvprof GPU-activities summary) with both gtsv and gtsv2. Most of the time is spent in the following kernels:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 70.38% 3.50011s 30 116.67ms 113.15ms 133.53ms void pcrGlobalMemKernel_manyRhs(pcrGlobalMemParams_t)
17.43% 866.71ms 5 173.34ms 173.01ms 173.86ms void pcrLastStageKernel_manyRhs(pcrLastStageParams_t)
11.23% 558.27ms 5 111.65ms 109.36ms 115.99ms void pcrGlobalMemKernelFirstPass_manyRhs(pcrGlobalMemFirstPassParams_t)
Can you please help me spot the mistake? In the above result, I also can't make out why the number of calls for pcrGlobalMemKernel_manyRhs is 30 when I called the gtsv2 function only 5 times.
The final answers from the tridiagonal solver are correct, but the time taken is much longer than the serial code on the CPU. Am I passing pBuffer properly?
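For reference, the alternative I considered (left commented out in the code below) declares pBuffer as a device character array and passes the buffer directly instead of going through c_loc of a c_ptr array. An untested sketch of that variant:

! in the interface:
character(c_char), device :: pBuffer(*)
! in the program:
character(c_char), allocatable, device :: buf(:)
allocate(buf(bufsize))
status = cusparseDgtsv2_nopivot(handle1, npts, n, atri, btri, ctri, dtri, npts, buf)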
Module tridiag
interface
integer (4) function cusparseDgtsv2(handle, m, n, dl, d, du, x, ldb, pBuffer) bind(C,name='cusparseDgtsv2')
use iso_c_binding
use cusparse
implicit none
type(cusparseHandle),value :: handle
integer(4),value :: m, n, ldb
real(c_double),dimension(*), device :: dl, d, du, x
!integer(8),value :: pBuffer
type(c_ptr), value :: pBuffer
end function cusparseDgtsv2
integer(4) function cusparseDgtsv2_bufferSizeExt(handle, m, n, dl, d, du, x, ldb, pBufferSize) bind(C,name='cusparseDgtsv2_bufferSizeExt')
use iso_c_binding
use cusparse
implicit none
type(cusparseHandle),value :: handle
integer(4),value :: m, n, ldb
real(c_double),dimension(*), device :: dl, d, du, x
!integer(c_size_t),value :: pBufferSize
integer(c_size_t) :: pBufferSize   ! returned by reference (no value attribute)
end function cusparseDgtsv2_bufferSizeExt
integer (4) function cusparseDgtsv2_nopivot(handle, m, n, dl, d, du, x, ldb, pBuffer) bind(C,name='cusparseDgtsv2_nopivot')
use iso_c_binding
use cusparse
implicit none
type(cusparseHandle),value :: handle
integer(c_int),value :: m, n, ldb
real(c_double),dimension(*), device :: dl, d, du, x
!integer(8),value :: pBuffer
type(c_ptr), value :: pBuffer
!character(c_char),device :: pBuffer(*)
end function cusparseDgtsv2_nopivot
integer(4) function cusparseDgtsv2_nopivot_bufferSizeExt(handle, m, n, dl, d, du, x, ldb, pBufferSize) bind(C,name='cusparseDgtsv2_nopivot_bufferSizeExt')
use iso_c_binding
use cusparse
implicit none
type(cusparseHandle),value :: handle
integer(4),value :: m, n, ldb
real(c_double),dimension(*), device :: dl, d, du, x
!integer(c_size_t),value :: pBufferSize
integer(c_size_t) :: pBufferSize   ! returned by reference (no value attribute)
end function cusparseDgtsv2_nopivot_bufferSizeExt
end interface
End Module tridiag
!********** cuda_tdma.cuf ************!
PROGRAM TDMA
use iso_C_binding
use cusparse
use openacc
use tridiag
implicit none
! atri - sub-diagonal (means it is the diagonal below the main diagonal)
! btri - the main diagonal
! ctri - sup-diagonal (means it is the diagonal above the main diagonal)
! dtri - right part
! npts - number of equations
integer(4) npts,i,istat,j,iter,sizebuf,ierr, status, pts, calls, count
type(cusparseHandle) :: handle1
integer(4) :: m, n
integer(c_size_t) :: bufsize
!character(c_char),allocatable,device :: buf(:)
type(c_ptr), device,allocatable,target :: buf(:)   ! pBuffer for gtsv2 -- see question above
parameter (npts=192)
parameter (n=192*192*5)
real :: t1,t2
real, dimension(npts) :: atri,btri,ctri
real, dimension(npts*n) :: dtri
!***** TEST VALUES INITIALIZATION *****!
atri = 1.0
atri(1) = 0.0
btri = 2.0
ctri = 1.0
ctri(npts) = 0.0
count = 0
Do i=1,n
Do pts = 1,npts
count = count + 1
dtri(count) = 4.0*i
if(pts.eq.1.or.pts.eq.npts) dtri(count)=3.0*i
Enddo
Enddo
print*, dtri(1:npts)
!***** memcpy() HtoD *****!
!$acc data copyin(atri,btri,ctri,dtri)
!**** cusparse_create and check ****!
istat = cusparseCreate(handle1)
!***** Query the gtsv2 buffer size and allocate pBuffer once ****
call cpu_time(t1)
status = cusparseDgtsv2_nopivot_bufferSizeExt(handle1, npts, n, atri, btri, ctri, dtri, npts, bufsize)
print*, 'bufsize =', bufsize
allocate(buf(bufsize))
call cpu_time(t2)
print*, 'buffer query/alloc time:', t2-t1
!********** Multiple calls to gtsv2 using the same buffersize ******************
Do calls = 1,5
!$acc parallel loop collapse(2)
Do i=1,n
Do pts = 1,npts
count = pts + (i-1)*npts
dtri(count) = 4.0*i
if(pts.eq.1.or.pts.eq.npts) dtri(count)=3.0*i
Enddo
Enddo
!call cpu_time(t1)
!status = cusparseDgtsv_nopivot(handle1,npts,n,atri,btri,ctri,dtri,npts)
!call cpu_time(t2)
!print*,'time gtsv=',t2-t1, status
call cpu_time(t1)
status = cusparseDgtsv2_nopivot(handle1,npts,n,atri,btri,ctri,dtri,npts,c_loc(buf))
call cpu_time(t2)
print*,'time gtsv2=',t2-t1, status
Enddo
!$acc update self(dtri)
print*, dtri(1:npts)
!$acc end data
!**** TDMA on CPU ************
call cpu_time(t1)
Do calls = 1,5
call CPU_TDMA()
Enddo
call cpu_time(t2)
print*,'CPU TIME:', t2-t1
END PROGRAM TDMA
Subroutine CPU_TDMA()
implicit none
! atri - sub-diagonal (means it is the diagonal below the main diagonal)
! btri - the main diagonal
! ctri - sup-diagonal (means it is the diagonal above the main diagonal)
! dtri - right part
! xsol - the answer
! npts - number of equations
integer npts,pts,i,l,m,check,iter, nl, nm
parameter(npts=192)
real,dimension(npts) :: atri,btri,ctri,dtri
real,dimension(npts) :: xsol
real,dimension(npts) :: cp,dp
real dnr,t1,t2
nl = 192*192*5
!***** Same tridiagonal coefficients as the GPU case *****!
atri = 1.0
atri(1) = 0.0
btri = 2.0
ctri = 1.0
ctri(npts) = 0.0
call cpu_time(t1)
!!$acc parallel loop collapse(2) copyin(atri,btri,ctri,dtri) create(xsol,dnr,cp,dp)
do l=1,nl
dtri = 4.0*l
dtri(1) = 3.0*l
dtri(npts) = 3.0*l
! initialize c-prime and d-prime
cp(1) = ctri(1)/btri(1)
dp(1) = dtri(1)/btri(1)
! solve for vectors c-prime and d-prime
!!$acc parallel loop
do pts = 2,npts
dnr = btri(pts)-cp(pts-1)*atri(pts)
cp(pts) = ctri(pts)/dnr
dp(pts) = (dtri(pts)-dp(pts-1)*atri(pts))/dnr
enddo
! initialize x
xsol(npts) = dp(npts)
! solve for x from the vectors c-prime and d-prime
do pts = npts-1, 1, -1
xsol(pts) = dp(pts)-cp(pts)*xsol(pts+1)
end do
enddo
call cpu_time(t2)
!!$acc update self(dtri)
print*,'TIME: ',t2-t1, xsol
check=0
do i=1,npts
if(xsol(i).gt.0.9999.and.xsol(i).lt.1.0001) then
check=1
Endif
Enddo
if(check.eq.0) then
print*,"Final Solution of CPU TDMA Unsuccessfull"
else
print*,"Final Solution of CPU TDMA Successfull"
Endif
END subroutine CPU_TDMA