0: copyout Memcpy... FAILED: 4(unspecified launch failure)

So I’ve been tasked with adopting CUDA into our main FORTRAN program. I figured the easiest thing to do first is calling CUBLAS instead of the generic BLAS libraries. Or so I thought…

My set-up is like this:
The main program calls xgemm.f90 subroutine to handle the preliminary logistics, e.g. what to call depending on size of the arrays. xgemm.f90 will then call my cxgemm.f90 subroutine containing the ISO_C_BINDING interface to call CUBLAS’ DGEMM and ZGEMM subroutines. This was the setup for the generic BLAS library so it should still work for CUBLAS, right?

module cublas
!Define the INTERFACE to the NVIDIA C cublasZgemm & cublasDgemm
  interface cuda_gemm
    subroutine cuda_zgemm(cta, ctb, m, n, k,&
      alpha, A, lda, B, ldb, beta, c, ldc) bind(C,name='cublasZgemm')
      use iso_c_binding
      character(1,c_char),value :: cta, ctb
      integer(c_int),value :: m,n,k,lda,ldb,ldc
      complex(c_double_complex),value :: alpha,beta
      complex(c_double_complex), device, dimension(lda,*) :: A
      complex(c_double_complex), device, dimension(ldb,*) :: B
      complex(c_double_complex), device, dimension(ldc,*) :: C
    end subroutine cuda_zgemm    
    subroutine cuda_dgemm(cta, ctb, m, n, k,&
      alpha, A, lda, B, ldb, beta, c, ldc) bind(C,name='cublasDgemm')
      use iso_c_binding
      character(1,c_char),value :: cta, ctb
      integer(c_int),value :: m,n,k,lda,ldb,ldc
      real(c_double),value :: alpha,beta
      real(c_double), device, dimension(lda,*) :: A
      real(c_double), device, dimension(ldb,*) :: B
      real(c_double), device, dimension(ldc,*) :: C
    end subroutine cuda_dgemm
  end interface
end module cublas
!
    Subroutine cDGEMM(Str1,Str2,M,N,K,Alpha,A,LDA,&
      B,LDB,Beta,C,LDC)
!Wrapper for matrix multiplies which handles both real/complex using GPU 
        use cublas
        Implicit Real*8(A-H,O-Z)
!Host memory
        Character*(*):: Str1, Str2
        Integer:: M, N, K, LDA, LDB, LDC
        Real(kind=8):: Alpha,Beta
        Real(kind=8):: A(M,K),B(K,N),C(M,N)
!        Real(kind=8),dimension(:,:),allocatable:: A,B,C
!GPU memory
        Real(kind=8),device,allocatable,dimension(:,:):: A_d,B_d,C_d
!  
!Get the array sizes
          Print *, 'Array Sizes:'
          Print *, M, N, K
!          allocate (A(M,K),B(K,N),C(M,N))
          Print *, 'Allocating memory on the device...'
          allocate (A_d(M,K),B_d(K,N),C_d(M,N))
          print *, 'Successfully allocated!...'
          print *, 'Copy data to device...'
          A_d= A
          B_d= B
          C_d= C
          Print *, 'Calling cublas_DGEMM...'
          Call cuda_dgemm(Str1,Str2,M,N,K,Alpha,A_d,&
            LDA,B_d,LDB,Beta,C_d,LDC)
          Print *, 'Copying data back to host...'
          C=C_d
          Print *, 'Unloading GPU...'
          deallocate (A_d,B_d,C_d)
          Print *, 'Done!'
      end subroutine cDGEMM
!
!
    Subroutine cZGEMM(Str1,Str2,M,N,K,Alpha,A,LDA,&
      B,LDB,Beta,C,LDC)
!Wrapper for matrix multiplies which handles both real/complex using GPU 
        use cublas
        Implicit Real*8(A-H,O-Z)
!Host memory
        Character*(*):: Str1, Str2
        Integer:: M, N, K, LDA, LDB, LDC
        Complex(kind=8):: Alpha,Beta
        Complex(kind=8):: A(M,K),B(K,N),C(M,N)
!        Complex(kind=8),allocatable,dimension(:,:):: A,B,C
!GPU memory
        Complex(kind=8),device,allocatable,dimension(:,:):: A_d,B_d,C_d
!  
!Get the array sizes
          Print *, 'Array Sizes:'
          Print *, M, N, K
!          allocate (A(M,K),B(K,N),C(M,N))
          Print *, 'Allocating memory on the device...'
          allocate (A_d(M,K),B_d(K,N),C_d(M,N))
          print *, 'Successfully allocated!...'
          print *, 'Copy data to device...'
          A_d= A
          B_d= B
          C_d= C
          Print *, 'Calling cublas_ZGEMM...'
          Call cuda_zgemm(Str1,Str2,M,N,K,Alpha,A_d,&
            LDA,B_d,LDB,Beta,C_d,LDC)
          Print *, 'Copying data back to host ...'
          C=C_d
          Print *, 'Unloading GPU...'      
          deallocate (A_d,B_d,C_d)
          Print *, 'Done!'
      end subroutine cZGEMM

I compile with the command

pgf90 -Mcuda=cc20 -Mfree -c cxgemm.f90 -L/usr/local/cuda/lib64 -lcublas

.

Initially, I declared A,B,C as:

Real(kind=8),dimension(:,:),allocatable:: A,B,C
....
allocate (A(M,K),B(K,N),C(M,N))

but then I got a “segmentation violation, address not mapped to object” error. So I changed them to fixed-size arrays. Something tells me this is not a good way to go about it, but I don’t know what else to do. Everything works fine until now. The program now behaves very erratically: certain jobs finished all the way perfectly, some others still finished but I get huge errors, e.g. RelErr=5.90D-01 while Tolerance=1.00D-03) crash halfway though, and then some just crashed halfway through with output like this:

 Initiating cublas_DGEMM
 Array Sizes:
          508          508           32
 Allocating memory on the device...
 Successfully allocated!...
 Copy data to device...
 Calling cublas_DGEMM...
 Copying data back to host...
 Unloading GPU...
 Done!
 Initiating cublas_DGEMM
 Array Sizes:
          508          508           32
 Allocating memory on the device...
 Successfully allocated!...
 Copy data to device...
 Initiating cublas_DGEMM...
 Copying data back to host...
 Unloading GPU...
 Done!
 Initiating cublas_DGEMM
 Array Sizes:
          476          476           32
 Allocating memory on the device...
 Successfully allocated!...
 Copy data to device...
 Calling cublas_DGEMM...
 Copying data back to host...
0: copyout Memcpy (host=0x2b7981063b70, dev=0xb00300000, size=1812608) FAILED: 4(unspecified launch failure)

What am I doing wrong? Any thought will be greatly appreciated.

but then I got a “segmentation violation, address not mapped to object” error. So I changed them to fixed-size arrays. Something tells me this is not a good way to go about it, but I don’t know what else to do.

A, B, and C are arguments so can’t be allocated within the subroutine. Though, they can be declared as assumed-sized arrays. i.e.:

Real(kind=8),dimension(:,:) :: A,B,C



What am I doing wrong?

Without the complete code, it’s hard to tell. Though, I think you’re over complicating things. PGI provides a cublas module that you can use so you don’t need the overhead of your module. Plus, you can then call “dgemm” and “zgemm” directly, and since they are generic interfaces, if you call them with a “device” array, the cublas version is used, or if you call then with a host array, the regular host blas is used.

Below, I modified a CUDA Fortran SDK sgemm example to use both host and cublas versions. (/opt/pgi/linux86-64/2013/cuda/CUDA-Fortran-SDK/cublasTestSgemm.F90)

  • Mat
% cat cublasTestDgemm.F90 

program test_cublasDgemm
use cudafor
use cublas
real*8, device, allocatable, dimension(:,:) :: A_d, B_d, C_d
real*8, allocatable, dimension(:,:) :: A, B, C
real*8, allocatable :: T(:,:), Expct(:)

real*8 :: alpha = 1.0e0
real*8 :: beta  = 0.0e0
real*8 :: t1, t2,t3,tt,gflops
integer :: i, j, k

print *, "Enter N: "
read(5,*) n

allocate(A(n,n), B(n,n), C(n,n), T(n,n), Expct(n))
allocate(A_d(n,n), B_d(n,n), C_d(n,n))

call random_number(T)
do j = 1, n
  Expct(j) = 2.0 * sum(T(:,j))
end do
    
A_d = 2.0
B_d = T
C_d = -9.9
A = 2.0
B = T
C = -9.9

call cpu_time(t1)
call dgemm('n','n', n, n, n, alpha, A, n, B, n, beta, C, n)
call cpu_time(t2)
call dgemm('n','n', n, n, n, alpha, A_d, n, B_d, n, beta, C_d, n)
istat = cudaThreadSynchronize()  ! Only needed for a fair time
call cpu_time(t3)

print *, "Checking results...."

T = C_d

do j = 1, n
  do i = 1, n
    if (abs(T(i,j) - Expct(j)) .gt. 0.012) then
      print *, "Error:  ",i,j,T(i,j), Expct(j)
    endif
    if (abs(C(i,j) - Expct(j)) .gt. 0.012) then
      print *, "Error:  ",i,j,C(i,j), Expct(j)
    endif
  enddo
enddo

gflops = (real(n) * real(n) * real(n) * 4.0) / 1000000000.0
tt = t2 - t1
print *, "Total CPU Time: ",tt
print *, "Total CPU DGEMM gflops: ",gflops/tt
tt = t3 - t2
print *, "Total GPU Time: ",tt
print *, "Total GPU DGEMM gflops: ",gflops/tt
print *, "Done...."

end
% pgfortran -Mcuda=5.0 -o dgemm_gpu cublasTestDgemm.F90 -lcublas -L/opt/pgi/linux86-64/2013/acml/5.3.0/lib -lacml
% dgemm_gpu
 Enter N: 
1024
 Checking results....
 Total CPU Time:    0.2080838680267334     
 Total CPU DGEMM gflops:     20.64055813292666     
 Total GPU Time:    2.0368099212646484E-002
 Total GPU DGEMM gflops:     210.8673533887393     
 Done....

Hi Mat,
Thanks for suggesting the built-in CUBLAS module of the PGI compiler. Your sample code works beautifully!

My issue is that A, B, C are NOT allocatable by the subroutine. The main program has already m’allocated these arrays and pass them on to the subroutines, which will then call DGEMM and ZGEMM. If I used assumed-sized array, I wouldn’t be able to use conforming statements such as “C_d=C” and “C=C_d.” The CPU-only code was using the assume-sized arrays but then I had to change to fix-sized simply for this purpose. Is there an alternative method for transfer of data between host and device?

I don’t think my situation is that unusual, as the xGEMM routines are at the core of any scientific program. They get called all over the place, often times several subroutines nested inside the main program.

If I used assumed-sized array, I wouldn’t be able to use conforming statements such as “C_d=C” and “C=C_d.”

Why not? If you could use them before, there shouldn’t be a reason why you couldn’t use then with CUDA Fortran. My guess is when you moved the routines, you didn’t use them in a module or didn’t define an explicit interface.

% cat gemm.f90 
module mycublas

#ifdef _CUDA
use cudafor
use cublas
#endif

contains

subroutine mydgemm (A, B, C, n, alpha, beta,tt)
implicit none
real*8, intent(in), dimension(:,:) :: A, B
real*8, intent(out), dimension(:,:) :: C
integer :: n
real*8 :: alpha,beta
real*8 :: t1,t2,tt
#ifdef _CUDA
real*8, device, allocatable, dimension(:,:) :: A_d, B_d, C_d
#else
real*8, allocatable, dimension(:,:) :: A_d, B_d, C_d
#endif
allocate(A_d(n,n), B_d(n,n), C_d(n,n))

call cpu_time(t1)
A_d = A
B_d = B
C_d = C

call dgemm('n','n', n, n, n, alpha, A_d, n, B_d, n, beta, C_d, n)

C = C_d
call cpu_time(t2)
tt = t2-t1

deallocate(A_d,B_d,C_d)

end subroutine mydgemm

end module mycublas

program test_cublasDgemm

use mycublas
real*8, allocatable, dimension(:,:) :: A, B, C
real*8, allocatable :: T(:,:), Expct(:)

real*8 :: alpha = 1.0e0
real*8 :: beta  = 0.0e0
real*8 :: tt,gflops
integer :: i, j, k

print *, "Enter N: "
read(5,*) n

allocate(A(n,n), B(n,n), C(n,n), T(n,n), Expct(n))

call random_number(T)
do j = 1, n
  Expct(j) = 2.0 * sum(T(:,j))
end do

A = 2.0
B = T
C = -9.9

call mydgemm(A,B,C,n,alpha,beta,tt)

print *, "Checking results...."

do j = 1, n
  do i = 1, n
    if (abs(C(i,j) - Expct(j)) .gt. 0.012) then
      print *, "Error:  ",i,j,C(i,j), Expct(j)
    endif
  enddo
enddo

gflops = (real(n) * real(n) * real(n) * 4.0) / 1000000000.0
print *, "Total CPU Time: ",tt
print *, "Total CPU DGEMM gflops: ",gflops/tt
print *, "Done...."

end 

!! Target the host
% pgf90 gemm.f90 -Mpreprocess -lblas ; a.out
 Enter N: 
1024
 Checking results....
 Total CPU Time:     1.202882051467896     
 Total CPU DGEMM gflops:     3.570563854775965     
 Done....

!! Target the GPU
% pgf90 gemm.f90 -Mpreprocess -Mcuda -lcublas ; a.out
 Enter N: 
1024
 Checking results....
 Total CPU Time:    4.5428991317749023E-002
 Total CPU DGEMM gflops:     94.54242874311835     
 Done....