Hi, all.
I used a simple code from internet and modified it to check interfacing openacc with cublas batche routine in fortran. The code could run, but the result was not correct. Here are the codes. I am appreciated for any suggestion.
cublas_for.cu
#include <stdio.h>
#include "cublas_v2.h"
extern "C" int f_cublasCreate(cublasHandle_t **handle)
{
*handle = (cublasHandle_t*)malloc(sizeof(cublasHandle_t));
return cublasCreate(*handle);
}
extern "C" void f_cublasDestroy(cublasHandle_t *handle)
{
cublasDestroy(*handle);
free(handle);
}
cublas_for_iso.f90
Module cublas_f
use ISO_C_BINDING
enum, BIND(C)
enumerator :: CUBLAS_OP_N, CUBLAS_OP_T, CUBLAS_OP_C
end enum
enum, BIND(C)
enumerator :: CUBLAS_STATUS_SUCCESS,CUBLAS_STATUS_NOT_INITIALIZED,CUBLAS_STATUS_INVALID_VALUE,&
CUBLAS_STATUS_ARCH_MISMATCH,CUBLAS_STATUS_EXECUTION_FAILED
end enum
INTERFACE
integer(C_INT) function cublasCreate(handle_ptr) BIND(C, NAME='f_cublasCreate')
use ISO_C_BINDING
type(C_PTR) :: handle_ptr
end function
subroutine cublasDestroy(handle_ptr) BIND(C, NAME='f_cublasDestroy')
use ISO_C_BINDING
type(C_PTR), value :: handle_ptr
end subroutine
integer(C_INT) function cublasDgemmBatched(handle, transa, transb, m, n, k, alpha, &
A, lda, B, ldb, beta, C, ldc, batch_count) &
BIND(C, NAME='cublasDgemmBatched')
use ISO_C_BINDING
type(C_PTR), value :: handle
integer(C_INT), value :: transa
integer(C_INT), value :: transb
integer(C_INT), value :: m
integer(C_INT), value :: n
integer(C_INT), value :: k
real(C_DOUBLE) :: alpha
REAL(C_DOUBLE) :: A(lda,k,batch_count)
integer(C_INT), value :: lda
real(C_DOUBLE) :: B(ldb,n,batch_count)
integer(C_INT), value :: ldb
real(C_DOUBLE) :: beta
real(C_DOUBLE) :: C(ldc,n,batch_count)
integer(C_INT), value :: ldc
integer(C_INT), value :: batch_count
end function
END INTERFACE
end module cublas_f
cublas-batch.f90
program main
use ISO_C_BINDING
use cublas_f
implicit none
integer :: dim, stat, i, j, k, batch_count
integer(8) :: bytes
real(8),dimension(:,:,:), allocatable :: A, B, C
real(8) :: alpha, beta, index, sum
type(C_PTR) :: handle
integer :: sizeof_double
parameter (sizeof_double=8)
!Linear dimension of matrices
dim = 100
! Number of A,B,C matrix sets
batch_count = 1000
! Allocate host storage for A,B,C square matrices
allocate(A(dim,dim,batch_count))
allocate(B(dim,dim,batch_count))
allocate(C(dim,dim,batch_count))
!$acc enter data create(A,B,C)
! Fill A,B diagonals with sin(i) data, C diagonal with cos(i)^2
! Matrices are arranged column major
!$acc data present(A,B,C)
!$acc kernels
!$acc loop
do k=1,batch_count
!$acc loop
do j=1,dim
!$acc loop
do i=1,dim
if (i==j) then
index = real(j*dim + i)
a(i,j,k) = k*sin(index)
b(i,j,k) = sin(index)
c(i,j,k) = k*cos(index) * cos(index)
else
a(i,j,k) = 0.0
b(i,j,k) = 0.0
c(i,j,k) = 0.0
endif
enddo ! i
enddo ! j
enddo ! k
!$acc end kernels
! Create cublas instance
stat = cublasCreate(handle)
if(stat /= CUBLAS_STATUS_SUCCESS) write(*,*) 'cublasCreate failed'
! Set matrix coefficients
alpha = 1.0
beta = 1.0
! batched DGEMM: C = alpha*A*B + beta*C
!$acc host_data use_device(A,B,C)
stat=cublasDgemmBatched(handle,CUBLAS_OP_N,CUBLAS_OP_N,dim,dim,dim,alpha,&
A,dim,B,dim,beta,C,dim,batch_count)
!$acc end host_data
if(stat /= CUBLAS_STATUS_SUCCESS) write(*,*) 'cublasDgemm failed:',stat
! Simple sanity test, sum up all elements
sum = 0.0
!$acc kernels
!$acc loop
do k=1,batch_count
!$acc loop
do j=1,dim
do i=1,dim
sum = sum + C(i,j,k)
enddo
enddo
enddo
!$acc end kernels
print *, "Sum is:", sum, "should be: ", dim*(batch_count)*(batch_count+1)/2
!$acc end data
!$acc exit data delete(A,B,C)
deallocate(A)
deallocate(B)
deallocate(C)
call cublasDestroy(handle)
end program
I compiled the codes as follows.
nvcc -c cublas_for.cu
pgfortran -c cublas_for_iso.f90
pgfortran -o cublas-batch.out cublas-batch.f90 cublas_for_iso.o cublas_for.cu -acc -ta=nvidia -Minfo -Mcuda
This is the running result.
cublasDgemm failed: 1
Sum is: 24907700.05277213 should be: 50050000
Thanks a lot.