So I’ve been tasked with adopting CUDA into our main FORTRAN program. I figured the easiest thing to do first is calling CUBLAS instead of the generic BLAS libraries. Or so I thought…
My set-up is like this:
The main program calls xgemm.f90 subroutine to handle the preliminary logistics, e.g. what to call depending on size of the arrays. xgemm.f90 will then call my cxgemm.f90 subroutine containing the ISO_C_BINDING interface to call CUBLAS’ DGEMM and ZGEMM subroutines. This was the setup for the generic BLAS library so it should still work for CUBLAS, right?
module cublas
!Define the INTERFACE to the NVIDIA C cublasZgemm & cublasDgemm
interface cuda_gemm
subroutine cuda_zgemm(cta, ctb, m, n, k,&
alpha, A, lda, B, ldb, beta, c, ldc) bind(C,name='cublasZgemm')
use iso_c_binding
character(1,c_char),value :: cta, ctb
integer(c_int),value :: m,n,k,lda,ldb,ldc
complex(c_double_complex),value :: alpha,beta
complex(c_double_complex), device, dimension(lda,*) :: A
complex(c_double_complex), device, dimension(ldb,*) :: B
complex(c_double_complex), device, dimension(ldc,*) :: C
end subroutine cuda_zgemm
subroutine cuda_dgemm(cta, ctb, m, n, k,&
alpha, A, lda, B, ldb, beta, c, ldc) bind(C,name='cublasDgemm')
use iso_c_binding
character(1,c_char),value :: cta, ctb
integer(c_int),value :: m,n,k,lda,ldb,ldc
real(c_double),value :: alpha,beta
real(c_double), device, dimension(lda,*) :: A
real(c_double), device, dimension(ldb,*) :: B
real(c_double), device, dimension(ldc,*) :: C
end subroutine cuda_dgemm
end interface
end module cublas
!
Subroutine cDGEMM(Str1,Str2,M,N,K,Alpha,A,LDA,&
B,LDB,Beta,C,LDC)
!Wrapper for matrix multiplies which handles both real/complex using GPU
use cublas
Implicit Real*8(A-H,O-Z)
!Host memory
Character*(*):: Str1, Str2
Integer:: M, N, K, LDA, LDB, LDC
Real(kind=8):: Alpha,Beta
Real(kind=8):: A(M,K),B(K,N),C(M,N)
! Real(kind=8),dimension(:,:),allocatable:: A,B,C
!GPU memory
Real(kind=8),device,allocatable,dimension(:,:):: A_d,B_d,C_d
!
!Get the array sizes
Print *, 'Array Sizes:'
Print *, M, N, K
! allocate (A(M,K),B(K,N),C(M,N))
Print *, 'Allocating memory on the device...'
allocate (A_d(M,K),B_d(K,N),C_d(M,N))
print *, 'Successfully allocated!...'
print *, 'Copy data to device...'
A_d= A
B_d= B
C_d= C
Print *, 'Calling cublas_DGEMM...'
Call cuda_dgemm(Str1,Str2,M,N,K,Alpha,A_d,&
LDA,B_d,LDB,Beta,C_d,LDC)
Print *, 'Copying data back to host...'
C=C_d
Print *, 'Unloading GPU...'
deallocate (A_d,B_d,C_d)
Print *, 'Done!'
end subroutine cDGEMM
!
!
Subroutine cZGEMM(Str1,Str2,M,N,K,Alpha,A,LDA,&
B,LDB,Beta,C,LDC)
!Wrapper for matrix multiplies which handles both real/complex using GPU
use cublas
Implicit Real*8(A-H,O-Z)
!Host memory
Character*(*):: Str1, Str2
Integer:: M, N, K, LDA, LDB, LDC
Complex(kind=8):: Alpha,Beta
Complex(kind=8):: A(M,K),B(K,N),C(M,N)
! Complex(kind=8),allocatable,dimension(:,:):: A,B,C
!GPU memory
Complex(kind=8),device,allocatable,dimension(:,:):: A_d,B_d,C_d
!
!Get the array sizes
Print *, 'Array Sizes:'
Print *, M, N, K
! allocate (A(M,K),B(K,N),C(M,N))
Print *, 'Allocating memory on the device...'
allocate (A_d(M,K),B_d(K,N),C_d(M,N))
print *, 'Successfully allocated!...'
print *, 'Copy data to device...'
A_d= A
B_d= B
C_d= C
Print *, 'Calling cublas_ZGEMM...'
Call cuda_zgemm(Str1,Str2,M,N,K,Alpha,A_d,&
LDA,B_d,LDB,Beta,C_d,LDC)
Print *, 'Copying data back to host ...'
C=C_d
Print *, 'Unloading GPU...'
deallocate (A_d,B_d,C_d)
Print *, 'Done!'
end subroutine cZGEMM
I compile with the command
pgf90 -Mcuda=cc20 -Mfree -c cxgemm.f90 -L/usr/local/cuda/lib64 -lcublas
.
Initially, I declared A,B,C as:
Real(kind=8),dimension(:,:),allocatable:: A,B,C
....
allocate (A(M,K),B(K,N),C(M,N))
but then I got a “segmentation violation, address not mapped to object” error. So I changed them to fixed-size arrays. Something tells me this is not a good way to go about it, but I don’t know what else to do. Everything works fine until now. The program now behaves very erratically: certain jobs finished all the way perfectly, some others still finished but I get huge errors, e.g. RelErr=5.90D-01 while Tolerance=1.00D-03) crash halfway though, and then some just crashed halfway through with output like this:
Initiating cublas_DGEMM
Array Sizes:
508 508 32
Allocating memory on the device...
Successfully allocated!...
Copy data to device...
Calling cublas_DGEMM...
Copying data back to host...
Unloading GPU...
Done!
Initiating cublas_DGEMM
Array Sizes:
508 508 32
Allocating memory on the device...
Successfully allocated!...
Copy data to device...
Initiating cublas_DGEMM...
Copying data back to host...
Unloading GPU...
Done!
Initiating cublas_DGEMM
Array Sizes:
476 476 32
Allocating memory on the device...
Successfully allocated!...
Copy data to device...
Calling cublas_DGEMM...
Copying data back to host...
0: copyout Memcpy (host=0x2b7981063b70, dev=0xb00300000, size=1812608) FAILED: 4(unspecified launch failure)
What am I doing wrong? Any thought will be greatly appreciated.