Dear all,
I created a small wrapper experiment around CUBLAS. Therefore I picked up the NETLIB BLAS implementation and replaced the dgemm.f file (which implements the double-precision matrix–matrix multiply) with the following C code:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <dlfcn.h>
#ifndef PREFIX
#define PREFIX "/usr/lib/blastest"
#endif
static void *dl_handle;
static void *gemm_handle;
static int initialized = 0;
typedef void (*gemm_call)(char * TA, char *TB, int *m , int *n , int *k, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc);
/*
 * Library constructor: load the GPU DGEMM backend shared object and
 * resolve its gpu_dgemm entry point.  Runs automatically when this
 * library is mapped into a process; aborts if the backend is unusable.
 *
 * Fixes over the original: diagnostics go to stderr instead of stdout,
 * and `initialized` is only set once BOTH dlopen and dlsym succeeded,
 * so the flag never claims readiness for a half-finished load.
 */
__attribute__((constructor)) void __dgemm_init () {
    dlerror(); /* clear any stale error state before dlopen */
    dl_handle = dlopen (PREFIX"/libblastest_dgemm.so", RTLD_NOW|RTLD_GLOBAL|RTLD_DEEPBIND);
    if (dl_handle == NULL) {
        fprintf(stderr, "DLOPEN: %s\n", dlerror());
        abort();
    }
    gemm_handle = dlsym(dl_handle, "gpu_dgemm");
    if (gemm_handle == NULL) {
        fprintf(stderr, "DLSYM: %s\n", dlerror());
        abort();
    }
    /* Mark as initialized only after both steps have succeeded. */
    initialized = 1;
}
/* Library destructor: release the backend handle if it was ever loaded. */
__attribute__((destructor)) void __dgemm_exit(){
    if (!initialized)
        return;
    dlclose(dl_handle);
}
void dgemm_(char * TA, char *TB, int *m , int *n , int *k, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc){
gemm_call call;
if ( initialized == 0 || gemm_handle == NULL) {
printf("Not initialized. \n");
abort();
}
call = (gemm_call) gemm_handle;
call(TA, TB, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
return;
}
Then I wrote the real wrapper as a second shared object, in order to be able to replace it easily without relinking or LD_LIBRARY_PATH tricks. It looks like this:
#ifndef Int
#define Int int
#endif
#include <ctype.h>   /* tolower() — required by acc_blas_to_cublas_operation */
#include <stdio.h>
#include <stdlib.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#ifndef GPU_ALIGN
#define GPU_ALIGN 32
#endif
/* Set to 1 once the cuBLAS handle below has been created (see gpu_dgemm). */
int gemm_initialized = 0;
/* Process-wide cuBLAS handle, created lazily on the first gpu_dgemm call. */
cublasHandle_t gemm_handle;
cublasOperation_t acc_blas_to_cublas_operation(char *trans) {
    /**
     * Convert a BLAS transposition indicator to its cuBLAS equivalent,
     * e.g. "N"/"n" -> CUBLAS_OP_N, "T"/"t" -> CUBLAS_OP_T, "C"/"c" -> CUBLAS_OP_C.
     * Any unknown indicator falls back to CUBLAS_OP_N with a warning on stderr.
     *
     * Fixes over the original: tolower() is given an unsigned char —
     * passing a plain (possibly negative) char is undefined behavior —
     * and the unreachable `break` statements after `return` are gone.
     * (`tolower` also needs <ctype.h>, added to the include block.)
     */
    switch (tolower((unsigned char)*trans)) {
    case 'n':
        return CUBLAS_OP_N;
    case 't':
        return CUBLAS_OP_T;
    case 'c':
        return CUBLAS_OP_C;
    default:
        fprintf(stderr, "Unknown operation '%c'. Using 'N' instead.\n", *trans);
        return CUBLAS_OP_N;
    }
}
/* atexit() hook registered by gpu_dgemm(): tear down the cuBLAS handle. */
void dgemm_exit() {
cublasDestroy(gemm_handle);
}
/* Free the device buffers; cudaFree(NULL) is a documented no-op, so
 * pointers that were never allocated may safely be passed as NULL. */
static void free_device_buffers(double *dA, double *dB, double *dC) {
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
}

void gpu_dgemm(char *transA, char *transB, int * M , int *N , int * K , double *alpha, double *A, int * LDA, double *B, int *LDB, double *beta, double *C , int *LDC){
    /** \brief Matrix multiplication on GPU (Fortran DGEMM calling convention).
     * Device memory is allocated, A, B and C are transferred, the cuBLAS GEMM
     * is invoked, and C = alpha*op(A)*op(B) + beta*C is copied back to the host.
     *
     * Fixes over the original: every failure path now prints a diagnostic
     * (the dC allocation and first cublasSetMatrix paths aborted silently),
     * all diagnostics go to stderr, device memory is freed on every failure
     * path including the final cublasGetMatrix one, and the allocation byte
     * counts are computed in size_t to avoid int overflow for large matrices.
     */
    double *dA = NULL;  /* device copies of A, B, C */
    double *dB = NULL;
    double *dC = NULL;
    int _M = *M;
    int _N = *N;
    int _K = *K;
    int _LDA = *LDA;
    int _LDB = *LDB;
    int _LDC = *LDC;
    /* Round device leading dimensions up to a multiple of GPU_ALIGN so
     * that each matrix column starts on an aligned address. */
    int ldda = ((_M + (GPU_ALIGN-1)) / GPU_ALIGN) * GPU_ALIGN;
    int lddb = ((_K + (GPU_ALIGN-1)) / GPU_ALIGN) * GPU_ALIGN;
    int lddc = ((_M + (GPU_ALIGN-1)) / GPU_ALIGN) * GPU_ALIGN;
    cublasOperation_t cublTransA;
    cublasOperation_t cublTransB;
    cublasStatus_t cublasError = CUBLAS_STATUS_SUCCESS;
    cudaError_t cudaError = cudaSuccess;

    /* Lazy one-time cuBLAS initialization.
     * NOTE(review): not thread-safe — two concurrent first calls could
     * both create a handle; confirm callers are single-threaded. */
    if (gemm_initialized == 0) {
        cudaSetDevice(0);
        if (cublasCreate(&gemm_handle) != CUBLAS_STATUS_SUCCESS) {
            fprintf(stderr, "ACC_BLAS: Failed to create CUBLAS Handle.\n");
            abort();
        }
        atexit(dgemm_exit);
        gemm_initialized = 1;
    }

    /* Convert transposition indicators from Fortran to cuBLAS style. */
    cublTransA = acc_blas_to_cublas_operation(transA);
    cublTransB = acc_blas_to_cublas_operation(transB);

    /* Allocate device memory for the three operands. */
    cudaError = cudaMalloc((void**)&dA, (size_t)ldda * _K * sizeof(double));
    if (cudaError != cudaSuccess) {
        fprintf(stderr, "GPU memory allocation failed.\n");
        abort();
    }
    cudaError = cudaMalloc((void**)&dB, (size_t)lddb * _N * sizeof(double));
    if (cudaError != cudaSuccess) {
        free_device_buffers(dA, dB, dC);
        fprintf(stderr, "GPU memory allocation failed.\n");
        abort();
    }
    cudaError = cudaMalloc((void**)&dC, (size_t)lddc * _N * sizeof(double));
    if (cudaError != cudaSuccess) {
        free_device_buffers(dA, dB, dC);
        fprintf(stderr, "GPU memory allocation failed.\n");
        abort();
    }

    /* Copy the operand matrices host -> device.  cuBLAS is column-major,
     * matching the Fortran caller's layout. */
    cublasError = cublasSetMatrix(_M, _K, sizeof(double), A, _LDA, dA, ldda);
    if (cublasError != CUBLAS_STATUS_SUCCESS) {
        free_device_buffers(dA, dB, dC);
        fprintf(stderr, "Copying to GPU memory failed.\n");
        abort();
    }
    cublasError = cublasSetMatrix(_K, _N, sizeof(double), B, _LDB, dB, lddb);
    if (cublasError != CUBLAS_STATUS_SUCCESS) {
        free_device_buffers(dA, dB, dC);
        fprintf(stderr, "Copying to GPU memory failed.\n");
        abort();
    }
    cublasError = cublasSetMatrix(_M, _N, sizeof(double), C, _LDC, dC, lddc);
    if (cublasError != CUBLAS_STATUS_SUCCESS) {
        free_device_buffers(dA, dB, dC);
        fprintf(stderr, "Copying to GPU memory failed.\n");
        abort();
    }

    /* Matrix multiplication on the device.
     * NOTE(review): CUBLASGEMM is not defined in this file — presumably a
     * project macro expanding to cublasDgemm; confirm against the build. */
    cublasError = CUBLASGEMM(gemm_handle, cublTransA, cublTransB, _M, _N, _K,
                             alpha, dA, ldda, dB, lddb, beta, dC, lddc);
    if (cublasError != CUBLAS_STATUS_SUCCESS) {
        free_device_buffers(dA, dB, dC);
        fprintf(stderr, "Matrix Multiplication failed.\n");
        abort();
    }

    /* Copy the resulting matrix device -> host. */
    cublasError = cublasGetMatrix(_M, _N, sizeof(double), dC, lddc, C, _LDC);
    if (cublasError != CUBLAS_STATUS_SUCCESS) {
        free_device_buffers(dA, dB, dC);
        fprintf(stderr, "Copying to CPU memory failed.\n");
        abort();
    }

    free_device_buffers(dA, dB, dC);
}
And is compiled using “gcc -shared -O2 -o libblastest_dgemm.so -lcublas -lcudart”. I integrated the whole thing into my Ubuntu system using update-alternatives. If I then run programs that use the dgemm call, everything works fine and faster than on the CPU for large data sets. Only one application does not work and runs into a deadlock: if I run GNU Octave, it hangs after displaying its copyright information. Interrupting Octave with GDB, I got the backtrace posted here: http://pastebin.com/nUGRXGbU
I think the problem may be the fact that the libcuda.so from the driver is also loaded via dlopen from the libcudart library. But I do not know how to fix this. Any suggestions on how I can fix this problem while keeping my dlopen approach?
The whole source of the BLAS library is available at: http://www-e.uni-magdeburg.de/makoehle/blas_test.tar.gz The make install target also includes the setup of update-alternatives on debian-like systems.
Used hardware and software:
- Ubuntu 14.04.1 (amd64) with gcc 4.8.2 and glibc 2.19
- CUDA 6.5, Driver 340.29
- GNU Octave 3.8.1
- GeForce GTX 580