Segmentation fault : cublasDgetrfBatched and cublasDgetriBatched

Hi,

I’m getting segmentation fault from calling “cublasDgetrfBatched” and “cublasDgetriBatched”. Then I wrote a small code to repeat this error. Would you please take a look?

Thanks.

– Jing

----------------------------------- code start from here ------------------------------

module LU
interface cuda_getrf
subroutine cuda_dgetrf(n, Aarray, lda, ipvt, info, batchCount) bind(C,name=‘cublasDgetrfBatched’)
use cudafor
use iso_c_binding
integer, value :: n
type(c_devptr), device :: Aarray()
integer, value :: lda
integer, device :: ipvt(
)
integer, device :: info()
integer, value :: batchCount
end subroutine cuda_dgetrf
end interface
interface cuda_getri
subroutine cuda_dgetri(n, Aarray, lda, ipvt, Carray, ldc, info, batchCount) bind(C,name=‘cublasDgetriBatched’)
use cudafor
use iso_c_binding
integer, value:: n
type(c_devptr), device :: Aarray(
)
integer, value :: lda
integer, device :: ipvt()
type(c_devptr), device :: Carray(
)
integer, value :: ldc
integer, device :: info(*)
integer, value :: batchCount
end subroutine cuda_dgetri
end interface
end module LU

PROGRAM solvelinear
use cudafor
use LU
implicit none
REAL(8), dimension(3,3) :: A,Ainv,M,LU
INTEGER,dimension(3) :: ipiv
INTEGER :: i

integer, parameter :: batchCount=1, ndim=3
integer, device :: ipiv_d(ndim)
integer, device :: info1_d(batchCount), info2_d(batchCount)
real(8), device :: t_d(ndim,ndim)
real(8), device :: ti_d(ndim,ndim)
type(c_devptr) :: devPtrA(batchCount), devPtrC(batchCount)
type(c_devptr), device :: devPtrA_d(batchCount), devPtrC_d(batchCount)
INTEGER istat, status

A = reshape((/1,-1,3,3,1,1,1,1,1,1/),(/3,3/))

t_d(1:ndim,1:ndim) = A
ti_d(1:ndim,1:ndim) = 0.

devPtrA(batchCount) = c_devloc(t_d(1,1))
devPtrC(batchCount) = c_devloc(ti_d(1,1))

devPtrA_d = devPtrA
devPtrC_d = devPtrC

call cuda_dgetrf(ndim,devPtrA_d,ndim,ipiv_d,info1_d,batchCount)
call cuda_dgetri(ndim,devPtrA_d,ndim,ipiv_d,devPtrC_d,ndim,info2_d,batchCount)
Ainv(1:ndim,1:ndim)=ti_d(1:ndim,1:ndim)

!-- computation of A^-1 * A to check the inverse
M = matmul(Ainv,A)
do i=1,3
print*,M(i,:)
end do
END PROGRAM solvelinear

!.. CORRECT OUTPUT
!.. 1. 0. 0.
!.. 0. 1. 0.
!.. 0. 0. 1.

------------------------------ code end here -------------------------

Hi Jing,

You are missing a cuBLAS handle which is probably the cause of the segv.
https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-getribatched

Is there a reason why you not using the cuBLAS interface modules we provide? Might be easier than trying to write your own interface.
https://www.pgroup.com/resources/docs/19.5/x86/pgi-cuda-interfaces/index.htm#dp-cublasdgetrfbatched

-Mat

Hi Mat,

I wrote my own is because whenever I declare the handle

type(cublasHandle) :: handle

The compile complains about

mpif90 -c -r8 -w -O3 -fast -Mcuda=cuda10.0 mylinearsolve.CUF

PGF90-S-0155-Derived type has not been declared - cublashandle (mylinearsolve.CUF: 377)
0 inform, 0 warnings, 1 severes, 0 fatal for tmatrix
makefile:107: recipe for target ‘mylinearsolve.o’ failed

Do you know why?

Thanks,

– Jing

Hi Jing,

“type(cublasHandle)” is defined in the PGI cuBLAS interface module. Did you add “use cublas” which will include this module?

-Mat

Yes. I did. I’m re-post my code here which include “cublas” and “handle” and gives the complains:

----------------------------------- code start from here ------------------------------

module LU
interface cuda_getrf
subroutine cuda_dgetrf(n, Aarray, lda, ipvt, info, batchCount) bind(C,name=‘cublasDgetrfBatched’)
use cudafor
use iso_c_binding
integer, value :: n
type(c_devptr), device :: Aarray()
integer, value :: lda
integer, device :: ipvt(
)
integer, device :: info()
integer, value :: batchCount
end subroutine cuda_dgetrf
end interface
interface cuda_getri
subroutine cuda_dgetri(n, Aarray, lda, ipvt, Carray, ldc, info, batchCount) bind(C,name=‘cublasDgetriBatched’)
use cudafor
use iso_c_binding
integer, value:: n
type(c_devptr), device :: Aarray(
)
integer, value :: lda
integer, device :: ipvt()
type(c_devptr), device :: Carray(
)
integer, value :: ldc
integer, device :: info(*)
integer, value :: batchCount
end subroutine cuda_dgetri
end interface
end module LU

PROGRAM solvelinear
use cudafor
use cublas
use LU
implicit none
REAL(8), dimension(3,3) :: A,Ainv,M,LU
INTEGER,dimension(3) :: ipiv
INTEGER :: i

integer, parameter :: batchCount=1, ndim=3
integer, device :: ipiv_d(ndim)
integer, device :: info1_d(batchCount), info2_d(batchCount)
real(8), device :: t_d(ndim,ndim)
real(8), device :: ti_d(ndim,ndim)
type(c_devptr) :: devPtrA(batchCount), devPtrC(batchCount)
type(c_devptr), device :: devPtrA_d(batchCount), devPtrC_d(batchCount)
INTEGER istat, status
type(cublasHandle) :: h

A = reshape((/1,-1,3,3,1,1,1,1,1,1/),(/3,3/))

t_d(1:ndim,1:ndim) = A
ti_d(1:ndim,1:ndim) = 0.

devPtrA(batchCount) = c_devloc(t_d(1,1))
devPtrC(batchCount) = c_devloc(ti_d(1,1))

devPtrA_d = devPtrA
devPtrC_d = devPtrC

call cuda_dgetrf(ndim,devPtrA_d,ndim,ipiv_d,info1_d,batchCount)
call cuda_dgetri(ndim,devPtrA_d,ndim,ipiv_d,devPtrC_d,ndim,info2_d,batchCount)
Ainv(1:ndim,1:ndim)=ti_d(1:ndim,1:ndim)

!-- computation of A^-1 * A to check the inverse
M = matmul(Ainv,A)
do i=1,3
print*,M(i,:)
end do
END PROGRAM solvelinear

!.. CORRECT OUTPUT
!.. 1. 0. 0.
!.. 0. 1. 0.
!.. 0. 0. 1.

I wasn’t able to reproduce this error, but did go ahead and fix the segv by passing in a handle. I have not investigated why the results are incorrect.

% cat solve2.cuf
module LU
interface cuda_getrf
subroutine cuda_dgetrf(h,n, Aarray, lda, ipvt, info, batchCount) bind(C,name='cublasDgetrfBatched')
use cudafor
use cublas
use iso_c_binding
type(cublasHandle) :: h
integer, value :: n
type(c_devptr), device :: Aarray(*)
integer, value :: lda
integer, device :: ipvt(*)
integer, device :: info(*)
integer, value :: batchCount
end subroutine cuda_dgetrf
end interface
interface cuda_getri
subroutine cuda_dgetri(h,n, Aarray, lda, ipvt, Carray, ldc, info, batchCount) bind(C,name='cublasDgetriBatched')
use cudafor
use cublas
use iso_c_binding
integer, value:: n
type(cublasHandle) :: h
type(c_devptr), device :: Aarray(*)
integer, value :: lda
integer, device :: ipvt(*)
type(c_devptr), device :: Carray(*)
integer, value :: ldc
integer, device :: info(*)
integer, value :: batchCount
end subroutine cuda_dgetri
end interface
end module LU

PROGRAM solvelinear
use cudafor
use cublas
use LU
implicit none
REAL(8), dimension(3,3) :: A,Ainv,M,LU
INTEGER,dimension(3) :: ipiv
INTEGER :: i

integer, parameter :: batchCount=1, ndim=3
integer, device :: ipiv_d(ndim)
integer, device :: info1_d(batchCount), info2_d(batchCount)
real(8), device :: t_d(ndim,ndim)
real(8), device :: ti_d(ndim,ndim)
type(c_devptr) :: devPtrA(batchCount), devPtrC(batchCount)
type(c_devptr), device :: devPtrA_d(batchCount), devPtrC_d(batchCount)
INTEGER istat, status
type(cublasHandle) :: h

A = reshape((/1,-1,3,3,1,1,1,1,1,1/),(/3,3/))

t_d(1:ndim,1:ndim) = A
ti_d(1:ndim,1:ndim) = 0.

devPtrA(batchCount) = c_devloc(t_d(1,1))
devPtrC(batchCount) = c_devloc(ti_d(1,1))

devPtrA_d = devPtrA
devPtrC_d = devPtrC

istat = cublasGetHandle(h)

call cuda_dgetrf(h,ndim,devPtrA_d,ndim,ipiv_d,info1_d,batchCount)
call cuda_dgetri(h,ndim,devPtrA_d,ndim,ipiv_d,devPtrC_d,ndim,info2_d,batchCount)
Ainv(1:ndim,1:ndim)=ti_d(1:ndim,1:ndim)

!-- computation of A^-1 * A to check the inverse
M = matmul(Ainv,A)
do i=1,3
print*,M(i,:)
end do
END PROGRAM solvelinear

!... CORRECT OUTPUT
!... 1. 0. 0.
!... 0. 1. 0.
!... 0. 0. 1.
% pgfortran solve2.cuf -Mcudalib=cublas; a.out
    0.000000000000000         0.000000000000000         0.000000000000000
    0.000000000000000         0.000000000000000         0.000000000000000
    0.000000000000000         0.000000000000000         0.000000000000000

Here’s the same code using just the cuBLAS interface module we provide

% cat solvelinear.cuf
PROGRAM solvelinear
use cudafor
use cublas
implicit none
integer, parameter :: batchCount=1, ndim=3
INTEGER :: i
REAL(8), dimension(ndim,ndim) :: A,Ainv,M,LU
INTEGER,dimension(ndim) :: ipiv
integer, device :: ipiv_d(ndim)
integer, device :: info1_d(ndim*batchCount), info2_d(ndim*batchCount)
real(8), device :: t_d(ndim,ndim)
real(8), device :: ti_d(ndim,ndim)
type(c_devptr) :: devPtrA(batchCount), devPtrC(batchCount)
type(c_devptr), device :: devPtrA_d(batchCount), devPtrC_d(batchCount)
type(cublasHandle) :: h
INTEGER istat, status

istat = cublasGetHandle(h)

A = reshape((/1,-1,3,3,1,1,1,1,1,1/),(/3,3/))

t_d(1:ndim,1:ndim) = A
ti_d(1:ndim,1:ndim) = 0.

devPtrA = c_devloc(t_d(1,1))
devPtrC = c_devloc(ti_d(1,1))

devPtrA_d = devPtrA
devPtrC_d = devPtrC

istat = cublasDgetrfBatched(h,ndim,devPtrA_d,ndim,ipiv_d,info1_d,batchCount)
istat = cublasDgetriBatched(h,ndim,devPtrA_d,ndim,ipiv_d,devPtrC_d,ndim,info2_d,batchCount)
!Ainv(1:ndim,1:ndim)=ti_d(1:ndim,1:ndim)
Ainv=ti_d

!-- computation of A^-1 * A to check the inverse
M = matmul(Ainv,A)
do i=1,3
print*,M(i,:)
end do
END PROGRAM solvelinear

!... CORRECT OUTPUT
!... 1. 0. 0.
!... 0. 1. 0.
!... 0. 0. 1.
% pgf90 solvelinear.cuf -Mcudalib=cublas ; a.out
    1.000000000000000         0.000000000000000       -5.5511151231257827E-017
    0.000000000000000         1.000000000000000       -5.5511151231257827E-017
  -1.1102230246251565E-016  -1.1102230246251565E-016    1.000000000000000