Hi,
I’ve been trying to write some simple code that calls a CUBLAS routine from inside a kernel, but I keep getting the following error message when I compile with the command:
pgfortran -Mcuda test_sgemm_from_kernel.cuf -o test_sgemm_from_kernel -lcublas
PGF90-S-0155-Calls from device code to a host subroutine are allowed only in emulation mode - sgemm (test_sgemm_from_kernel.cuf: 25)
0 inform, 0 warnings, 1 severes, 0 fatal for kernel
Any help would be appreciated. My code is appended below.
module kernel_mod
interface
subroutine sgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc) bind(c,name='cublasSgemm')
use iso_c_binding
integer(c_int), value :: m, n, k, lda, ldb, ldc
real(c_float), device, dimension(m,n) :: a, b, c
real(c_float), value :: alpha, beta
character(kind=c_char), value :: transa, transb
end subroutine sgemm
! subroutine cublasinit() bind(c,name='cublasInit')
! end subroutine cublasinit
end interface
contains
attributes(global) subroutine kernel(k,m,n,alpha,a,lda,b,ldb,beta,c,ldc)
real, device :: a(k,m), b(m,n), c(k,m)
real :: alpha, beta
integer :: m,n,k,lda,ldb,ldc
! CUBLAS call
call sgemm('n','n',m,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
end subroutine kernel
end module kernel_mod
program sgemm_mod
use cudafor
use kernel_mod
real, dimension(1000,1000) :: a, b, c
real, device, dimension(1000,1000) :: dA, dB, dC
real :: alpha, beta
real, device :: dAlpha, dBeta
integer :: m,n,k,lda,ldb,ldc
integer, device :: dM,dN,dK,dLda,dLdb,dLdc
! dim3 variables to define the grid and block shapes
type(dim3) :: dimGrid, dimBlock
m = 1000
n = 1000
k = 1000
lda = 1000
ldb = 1000
ldc = 1000
alpha = 1.0
beta = 0.0
! initialize all entries of A, B, and C
a = 2.0e0
b = 1.5e0
c = -9.9e0
! print 25 values od A and B
write(*,*) '25 entries of A:'
write(*,10) (a(1,i), i=1,5)
write(*,10) (a(2,i), i=1,5)
write(*,10) (a(3,i), i=1,5)
write(*,10) (a(4,i), i=1,5)
write(*,10) (a(5,i), i=1,5)
write(*,*) '25 entries of B:'
write(*,10) (b(1,i), i=1,5)
write(*,10) (b(2,i), i=1,5)
write(*,10) (b(3,i), i=1,5)
write(*,10) (b(4,i), i=1,5)
write(*,10) (b(5,i), i=1,5)
!call cublasinit()
! Copy data to device
dA = a
dB = b
dC = c
dAlpha = alpha
dBete = beta
dM = m
dN = n
dK = k
dLda = lda
dLdb = ldb
dLdc = ldc
! Create the grid and block dimensions
dimGrid = dim3( N/16, L/16, 1 )
dimBlock = dim3( 16, 16, 1 )
call kernel<<<dimGrid>>>(dK,dM,dN,dAlpha,dA,dLda,dB,dLdb,dBeta,dC,dLdc)
! Copy results from device
c = dC
! print results in C
write(*,*) '25 entries of C:'
write(*,10) (c(1,i), i=1,5)
write(*,10) (c(2,i), i=1,5)
write(*,10) (c(3,i), i=1,5)
write(*,10) (c(4,i), i=1,5)
write(*,10) (c(5,i), i=1,5)
10 format(F15.5,F15.5,F15.5,F15.5,F15.5)
end