Hello,
I am trying to use matrix multiplication function from cuBLAS in a neural network library (Shark). The Shark is using a modified version of BLAS and I managed to replace one of the functions (gemm in code below) with cublasDgemm:
template<class M, class E1, class E2>
void axpy_prod(
matrix_expression<E1> const& e1,
matrix_expression<E2> const& e2,
matrix_expression<M>& m,
bool init = true,
typename M::value_type alpha = 1.0
) {
SIZE_CHECK(m().size1() == e1().size1());
SIZE_CHECK(m().size2() == e2().size2());
SIZE_CHECK(e1().size2() == e2().size1());
if (init)
m().clear();
//kernels::gemm(e1,e2,m,alpha); // original code
// my code begins here
int n,k,mm,lda,ldb,ldc;
mm=e1().size1();
n=e2().size2();
k=e2().size1();
lda=mm;
ldb=k;
double alph=1.0,bet=0.0;
double* e1_d;
e1_d = (double*) malloc (mm*k*sizeof(double));
const double* e1_Ptr = e1().storage();
for(std::size_t j = 0; j<k; j++)
{
for(std::size_t i=0;i<mm;i++)
{
e1_d[IDX2C(i,j,k)] = e1_Ptr[i*k+j];
}
}
double* e2_d;
e1_d = (double*) malloc (k*n*sizeof(double));
const double* e2_Ptr = e2().storage();
for(std::size_t j = 0; j<n; j++)
{
for(std::size_t i=0;i<k;i++)
{
e2_d[IDX2C(i,j,n)] = e2_Ptr[i*n+j];
}
}
double* m_d;
m_d = (double*) malloc (mm*n*sizeof(double));
double* m_Ptr = m().storage();
for(std::size_t j = 0; j<n; j++)
{
for(std::size_t i=0;i<mm;i++)
{
m_d[IDX2C(i,j,n)] = m_Ptr[i*n+j];
}
}
#pragma acc data copyin(e1_d[0:mm][0:k],e2_d[0:k][0:n],mm,n,k,lda,ldb,alph,bet) copyout(m_d[0:mm][0:n])
{
#pragma acc host_data use_device(e1_d,e2_d,mm,n,k,lda,ldb,alph,bet,m_d)
{
//for(int tmp=0;tmp<1000;tmp++)
//{
cublasHandle_t handle;
cublasDgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,mm,n,k,&alph,e1_d,lda,e2_d,ldb,&bet,m_d,mm);
//}
}
}
for(std::size_t j = 0; j<n; j++)
{
for(std::size_t i=0;i<mm;i++)
{
m_Ptr[i*n+j] = m_d[IDX2C(i,j,n)];
}
}
}
I am using this make file to compile the code:
export BOOST_ROOT=/usr/local/include/
export SHARK_ROOT=/opt/Shark
export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
SOURCE = NN_Shark
CPPFLAGS = -I${BOOST_ROOT} -I${SHARK_ROOT}/include/ -I/usr/local/cuda-7.5/include/
CPPFLAGS += -acc #-DNDEBUG -DBOOST_UBLAS_NDEBUG -g -O3
LDFLAGS += -L/opt/Shark/lib
LDLIBS = -lshark
LDLIBS += -lboost_serialization -lboost_system -lboost_filesystem -lboost_program_options
CC = pgc++
all: Shark_OpenAcc
Shark_OpenAcc: $(SOURCE).o
$(CC) $(SOURCE).o -o Shark_OpenAcc $(CPPFLAGS) $(LDFLAGS) $(LDLIBS)
$(SOURCE).o: $(SOURCE).cpp
$(CC) $(CPPFLAGS) $(LDFLAGS) $(LDLIBS) $(SOURCE).cpp -c
clean:
rm Shark_OpenAcc $(SOURCE).o
And I am getting this error for every function in cublas.h:
"/usr/local/cuda-7.5/include/cublas.h", line 90: error: more than one instance
of overloaded function "cublasGetVersion_v2" has "C" linkage
cublasStatus CUBLASWINAPI cublasGetVersion(int *version);
^
I have two versions (7 &7.5) installed on my system. One under /opt/pgi/linux86-64/2015/cuda/ and the other under /usr/local/cuda-7.5/. I removed the one from pgi and made a symbolic link of version 7.5 in the /opt/pgi/linux86-64/2015/cuda/. Still getting the same errors.
How can I fix this problem? Any help is appreciated.
Thanks,
Ali