Hi,
When I use cuBLAS Dgemm routine, it gave “NVFORTRAN-S-0155-Could not resolve generic procedure cublasdgemm (nondev.f90: 66)”. I tried two different methods; one of them with the C pointers, C_DEVLOC( ), and the other one without it. Both gave the same error. The codes are as follows:
Program main
! CUDA MODULES
USE openacc
USE cublas
USE cudafor
IMPLICIT NONE
! COUNTERS
INTEGER :: A2,L,L0,M,P1,P2,NUCY
! VARIABLES
INTEGER, parameter :: N = 2000
INTEGER, parameter :: P = 1000
DOUBLE PRECISION, allocatable, dimension(:,:) :: X
DOUBLE PRECISION, allocatable, dimension(:,:) :: Y
DOUBLE PRECISION, allocatable, dimension(:,:) :: Z
! CUDA VARIABLES
TYPE(C_DEVPTR) :: devptr_A, devptr_B, devptr_C
TYPE(cublasHandle) :: handle
INTEGER :: status
DOUBLE PRECISION :: alpha, beta
alpha = 1.0D0
beta = 0.0D0
ALLOCATE(X(2,P))
ALLOCATE(Y(N,P))
ALLOCATE(Z(N,1))
Y = 1.0D0
Z = 0.180D0
X = 0.0D0
P1 = 0
DO L0 = 2,NUCY
L = L0-2
DO M = -L,L
P2 = P1
DO A2 = 2,3
P2 = P2+1
! CUDA BLAS ROUTINE -----------------------------------------------------
status =cublasCreate(handle)
!$ACC data copy(Y,Z(:,P2),X(A2-1,:)) create(devptr_A, devptr_B, devptr_C)
!$ACC host_data use_device(Y,Z(:,P2),X(A2-1,:))
devptr_A = C_DEVLOC(Y)
devptr_B = C_DEVLOC(Z(:,P2))
devptr_C = C_DEVLOC(X(A2-1,:))
!$ACC end host_data
!$ACC update device(devptr_A, devptr_B, devptr_C)
status = cudaDeviceSynchronize()
!$ACC host_data use_device(devptr_A, devptr_B, devptr_C)
status = cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, P, 1, N, alpha, devptr_A, N, devptr_B, N, beta, devptr_C, P)
!$ACC end host_data
status = cudaDeviceSynchronize()
!$ACC end data
status = cublasDestroy(handle)
! -----------------------------------------------------------------------
END DO
END DO ! M
END DO ! L0
END program main
The one without C_DEVLOC( )
Program main
! CUDA MODULES
USE openacc
USE cublas
IMPLICIT NONE
! COUNTERS
INTEGER :: A2,L,L0,M,P1,P2,NUCY
! VARIABLES
INTEGER, parameter :: N = 2000
INTEGER, parameter :: P = 1000
DOUBLE PRECISION, allocatable, dimension(:,:) :: X
DOUBLE PRECISION, allocatable, dimension(:,:) :: Y
DOUBLE PRECISION, allocatable, dimension(:,:) :: Z
! CUDA VARIABLES
TYPE(cublasHandle) :: handle
INTEGER :: status
DOUBLE PRECISION :: alpha, beta
alpha = 1.0D0
beta = 0.0D0
ALLOCATE(X(2,P))
ALLOCATE(Y(N,P))
ALLOCATE(Z(N,1))
Y = 1.0D0
Z = 0.180D0
NUCY = 4
P1 = 0
!$acc data copyin(Y,Z) create(X)
DO L0 = 2,NUCY
L = L0-2
DO M = -L,L
P2 = P1
DO A2 = 2,3
P2 = P2+1
! CUDA BLAS ROUTINE -----------------------------------------------------
status =cublasCreate(handle)
IF(status /= CUBLAS_STATUS_SUCCESS) write(*,*) "cublasCreate failed"
!$ACC host_data use_device(Y,Z(:,P2),X(A2-1,:))
status = cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, P, 1, N, alpha, Y, N, Z(:,P2), N, beta, X(A2-1,:), P)
!$ACC update host(X(A2-1,:))
!$ACC end host_data
IF(status /= CUBLAS_STATUS_SUCCESS) write(*,*) "cublasDgemm failed:", status
status = cublasDestroy(handle)
! -----------------------------------------------------------------------
END DO
END DO ! M
END DO ! L0
!$ACC end data
END program main
Both works with the Makefile as follows:
FC=nvfortran
TIMER=/usr/bin/time
OPT=
NOPT=-fast -Minfo=opt $(OPT)
FCFLAGS = -Mpreprocess -fast -acc=gpu -cuda -Mcudalib=cublas -Mcudalib=cusparse
nondev: nondev.o
$(TIMER) ./nondev.o $(STEPS)
nondev.o: nondev.f90
$(FC) $(FCFLAGS) -o $@ $< $(NOPT) -Minfo=accel -acc
clean:
rm -f *.o *.exe *.s *.mod a.out
This region of the code works fine with Lapack. I could not figure out what the problem is.
Thanks,
Yunus