Problems with CUBLAS, dlopen and GNU Octave

Dear all,
I created a small wrapper experiment around CUBLAS. To do so, I took the NETLIB BLAS implementation and replaced the dgemm.f file (which implements the double-precision matrix–matrix multiply) with the following C code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <dlfcn.h>
#ifndef PREFIX
#define PREFIX "/usr/lib/blastest"
#endif

static void *dl_handle; 
static void *gemm_handle; 
static int initialized = 0; 

typedef void (*gemm_call)(char * TA, char *TB, int *m , int *n , int *k, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc); 

/**
 * Library constructor: load the real GPU dgemm backend via dlopen() and
 * resolve its "gpu_dgemm" entry point. Aborts on failure, since the wrapper
 * cannot operate without the backend.
 *
 * Fixes vs. the original:
 *  - diagnostics go to stderr (stdout may be redirected/buffered at abort),
 *  - `initialized` is set only after BOTH steps succeeded,
 *  - dlsym() errors are detected with the dlerror() idiom, since a NULL
 *    return from dlsym() is not by itself proof of failure.
 */
__attribute__((constructor)) void __dgemm_init () {
	const char *err;

	dlerror(); /* clear any stale error state */
	dl_handle = dlopen (PREFIX"/libblastest_dgemm.so", RTLD_NOW|RTLD_GLOBAL|RTLD_DEEPBIND);
	if (dl_handle == NULL) {
		fprintf(stderr, "DLOPEN: %s\n", dlerror());
		abort();
	}

	gemm_handle = dlsym(dl_handle, "gpu_dgemm");
	err = dlerror();
	if (gemm_handle == NULL || err != NULL) {
		fprintf(stderr, "DLSYM: %s\n", err ? err : "symbol resolved to NULL");
		abort();
	}

	/* Mark success only after the backend is fully usable. */
	initialized = 1;
}

/* Library destructor: release the dlopen() handle acquired by the constructor.
 * A no-op when the constructor never ran to completion. */
__attribute__((destructor)) void __dgemm_exit(){
	if (initialized != 0) {
		dlclose(dl_handle);
	}
}

/**
 * Fortran-callable dgemm entry point (trailing-underscore name mangling).
 * Forwards all arguments unchanged to the backend's gpu_dgemm resolved by
 * the library constructor. Aborts if the constructor did not run or did
 * not resolve the symbol.
 *
 * Fix: the fatal diagnostic is written to stderr instead of stdout, so it
 * is visible even when stdout is redirected and survives abort() without
 * depending on stdio flushing.
 */
void dgemm_(char * TA, char *TB, int *m , int *n , int *k, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc){ 
	gemm_call call; 

	if ( initialized == 0  || gemm_handle == NULL) {
		fprintf(stderr, "Not initialized. \n");
		abort(); 
	}

	call = (gemm_call) gemm_handle; 
	call(TA, TB, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); 
}

Then I wrote the real wrapper as a second shared object, so that it can be replaced easily without relinking or fiddling with LD_LIBRARY_PATH. It looks like this:

#ifndef Int 
#define Int int
#endif 

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

#ifndef GPU_ALIGN
#define GPU_ALIGN 32
#endif 


/* Set to 1 once the cuBLAS handle has been created (lazy init in gpu_dgemm). */
int gemm_initialized = 0; 
/* Shared cuBLAS handle; created on first gpu_dgemm call, destroyed via atexit(). */
cublasHandle_t gemm_handle; 

cublasOperation_t acc_blas_to_cublas_operation(char *trans) {
	/**
	 * Convert a BLAS transposition indicator to the cuBLAS enum,
	 * e.g. "N"/"n" -> CUBLAS_OP_N. Unknown indicators produce a warning
	 * on stderr and fall back to CUBLAS_OP_N.
	 *
	 * Fix: the original called tolower() without including <ctype.h>
	 * (implicit declaration; also undefined behavior for negative char
	 * values). Both cases are now matched directly in the switch.
	 */
	switch (*trans) {
		case 'n': case 'N':
			return CUBLAS_OP_N;
		case 't': case 'T':
			return CUBLAS_OP_T;
		case 'c': case 'C':
			return CUBLAS_OP_C;
		default:
			fprintf(stderr, "Unknown operation '%c'. Using 'N' instead.\n", *trans);
			return CUBLAS_OP_N;
	}
}

/* atexit() handler: destroy the cuBLAS handle created lazily by gpu_dgemm.
 * Registered only after cublasCreate() succeeded, so the handle is valid here. */
void dgemm_exit() {
	cublasDestroy(gemm_handle); 
}


void gpu_dgemm(char *transA, char *transB, int * M , int *N , int * K , double *alpha, double *A, int * LDA, double *B, int *LDB, double *beta, double *C , int *LDC){
	/** \brief Matrix Multiplication on GPU
	 * Device memory is allocated, matrices are transfered and cublas function is called. C=alpha*A*B+C is transferred back to host.
	 */
	//Device Pointers
	double *dA;
	double *dB;
	double *dC;
	
	int _M = *M;
	int _N = *N;
	int _K = *K; 
	int _LDA = *LDA; 
	int _LDB = *LDB;
	int _LDC = *LDC;
	
	int ldda = ((_M+(GPU_ALIGN-1))/GPU_ALIGN) *GPU_ALIGN ;  
	int lddb = ((_K+(GPU_ALIGN-1))/GPU_ALIGN) *GPU_ALIGN ;
	int lddc = ((_M+(GPU_ALIGN-1))/GPU_ALIGN) *GPU_ALIGN ;

	cublasOperation_t cublTransA; 
	cublasOperation_t cublTransB;
	cublasStatus_t cublasError=0;
	cudaError_t cudaError = cudaSuccess; 

	if ( gemm_initialized == 0 ) {
		cudaSetDevice(0); 
		if ( cublasCreate(&gemm_handle) != CUBLAS_STATUS_SUCCESS ) {
			fprintf(stderr, "ACC_BLAS: Failed to create CUBLAS Handle.\n");
			abort(); 
		}
		atexit(dgemm_exit); 
		gemm_initialized = 1 ; 
	}

	//Convert transposition indicators from fortran to cublas style
	cublTransA=acc_blas_to_cublas_operation(transA);
	cublTransB=acc_blas_to_cublas_operation(transB); 	

	//Allocate device memory
	cudaError=cudaMalloc((void**)&dA, ldda*_K*sizeof(double));
	if (cudaError != cudaSuccess ) 
	{
		printf("GPU memory allocation failed.\n");
		abort(); 
	} 
	cudaError=cudaMalloc((void**)&dB, lddb*_N*sizeof(double));
	if (cudaError != cudaSuccess) 
	{
		cudaFree(dA); 
		printf("GPU memory allocation failed.\n");
		abort(); 
	} 
	cudaError=cudaMalloc((void**)&dC, lddc*_N*sizeof(double));      

	if (cudaError!=cudaSuccess) 
	{
		cudaFree(dA); 
		cudaFree(dB); 
		abort(); 
	} 

	//Copy matrices from host to device
	cublasError=cublasSetMatrix(_M,_K,sizeof(double),A,_LDA,dA,ldda);
	if (cublasError != CUBLAS_STATUS_SUCCESS) 
	{
		cudaFree(dA); 
		cudaFree(dB); 
		cudaFree(dC); 
		abort(); 
	}
	cublasError=cublasSetMatrix(_K,_N,sizeof(double),B,_LDB,dB,lddb);
	if (cublasError != CUBLAS_STATUS_SUCCESS) 
	{
		cudaFree(dA); 
		cudaFree(dB); 
		cudaFree(dC); 
		printf("Copying to GPU memory failed.\n");
		abort(); 
	}
	cublasError=cublasSetMatrix(_M,_N,sizeof(double),C,_LDC,dC,lddc);
	if (cublasError != CUBLAS_STATUS_SUCCESS) 
	{
		cudaFree(dA); 
		cudaFree(dB); 
		cudaFree(dC); 
		printf("Copying to GPU memory failed.\n");
		abort(); 
	}
	
	
	//Matrix multiplication on device
	cublasError=CUBLASGEMM(gemm_handle,cublTransA,cublTransB,_M,_N,_K, (double*) alpha, (double *) dA, ldda, (double *)dB, lddb,(double *)beta, (double *)dC, lddc);

	if (cublasError != CUBLAS_STATUS_SUCCESS) 
	{
		cudaFree(dA); 
		cudaFree(dB); 
		cudaFree(dC); 
		printf("Matrix Multiplication failed.\n");fflush(stdout); 
		abort(); 
	}

	//Copy resulting matrix from device to host
	cublasError=cublasGetMatrix(_M,_N,sizeof(double),dC,lddc,C,_LDC);
	if (cublasError!=CUBLAS_STATUS_SUCCESS) 
	{ 
		printf("Copying to CPU memory failed.\n");
		abort(); 
	}
	cudaFree(dA); 
	cudaFree(dB);
	cudaFree(dC);
	return; 
}

It is compiled using “gcc -shared -O2 -o libblastest_dgemm.so -lcublas -lcudart”. I integrated the whole thing into my Ubuntu system using update-alternatives. If I then run programs that use the dgemm call, everything works fine and faster than on the CPU for large data sets. Only one application does not work: it runs into a deadlock. If I run GNU Octave, it hangs after displaying its copyright information. Interrupting Octave with GDB, I got the backtrace posted here: http://pastebin.com/nUGRXGbU

I think the problem may be the fact that libcuda.so from the driver is itself loaded via dlopen by the libcudart library. But I do not know how to fix this. Any suggestions how I can fix this problem while keeping my dlopen approach?

The whole source of the BLAS library is available at: http://www-e.uni-magdeburg.de/makoehle/blas_test.tar.gz The make install target also includes the setup of update-alternatives on debian-like systems.

Hardware and software used:

  • Ubuntu 14.04.1 (amd64) with gcc 4.8.2 and glibc 2.19
  • CUDA 6.5, Driver 340.29
  • GNU Octave 3.8.1
  • GeForce GTX 580

Not an answer to your question, but you might want to investigate using NVBLAS:

http://docs.nvidia.com/cuda/nvblas/index.html#abstract

rather than re-inventing the wheel. It can be used without relinking and without modifying LD_LIBRARY_PATH by using the LD_PRELOAD env. variable, perhaps on the command line used to invoke your app that uses the blas library.

The problem with NVBLAS for my application is that it supports only one card (without purchasing a license) and that it can only load a single shared-object BLAS implementation, which causes problems when using libmkl_rt.so if you do not want to rely on the Intel threads or if you want to use 8-byte integers. And of course, there must be a reason behind the bug described above.