Hi, I am trying to get nvblas to work with a simple C program. I'm using the Intel MKL dgemm example below, multiplying a 5000x5000 matrix by a 5000x5000 matrix (which should be large enough to be offloaded).
/* C source code is found in dgemm_example.c */
/* Smaller of two values. Each argument is evaluated more than once, so do not pass expressions with side effects. */
#define min(a, b) (((b) > (a)) ? (a) : (b))
#include <stdio.h>
#include <stdlib.h>
#include "mkl.h"
/*
 * Computes C = alpha*A*B + beta*C for row-major double matrices
 * A (m x k), B (k x n) and C (m x n) with cblas_dgemm, then prints the
 * top-left corner (at most 6x6) of each matrix.
 *
 * Returns 0 on success, 1 if any of the three buffers cannot be allocated.
 */
int main(void)
{
    double *A, *B, *C;
    int m, n, k, i, j;
    double alpha, beta;
    size_t a_count, b_count, c_count, idx;

    printf ("\n This example computes real matrix C=alpha*A*B+beta*C using \n"
            " Intel(R) MKL function dgemm, where A, B, and C are matrices and \n"
            " alpha and beta are double precision scalars\n\n");

    m = 5000, k = 5000, n = 5000;
    printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
            " A(%ix%i) and matrix B(%ix%i)\n\n", m, k, k, n);
    alpha = 1.0; beta = 0.0;

    /*
     * Element counts are computed in size_t: the int products m*k, k*n and
     * m*n overflow (undefined behaviour) once they exceed INT_MAX, which
     * happens for dimensions only moderately larger than the ones used here.
     */
    a_count = (size_t)m * (size_t)k;
    b_count = (size_t)k * (size_t)n;
    c_count = (size_t)m * (size_t)n;

    printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
            " performance \n\n");
    A = mkl_malloc( a_count * sizeof *A, 64 );
    B = mkl_malloc( b_count * sizeof *B, 64 );
    C = mkl_malloc( c_count * sizeof *C, 64 );
    if (A == NULL || B == NULL || C == NULL) {
        printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
        /* Release whichever buffers were successfully allocated. */
        mkl_free(A);
        mkl_free(B);
        mkl_free(C);
        return 1;
    }

    printf (" Intializing matrix data \n\n");
    /* A holds 1, 2, 3, ...; B holds -1, -2, -3, ...; C starts at zero. */
    for (idx = 0; idx < a_count; idx++) {
        A[idx] = (double)(idx + 1);
    }
    for (idx = 0; idx < b_count; idx++) {
        /* Negate after the conversion: idx is unsigned, so -idx would wrap. */
        B[idx] = -(double)(idx + 1);
    }
    for (idx = 0; idx < c_count; idx++) {
        C[idx] = 0.0;
    }

    printf (" Computing matrix product using Intel(R) MKL dgemm function via CBLAS interface \n\n");
    /*
     * Row-major, no transposition: leading dimensions are the row lengths
     * (k for A, n for B and C).
     *
     * NOTE(review): NVBLAS is documented as intercepting the standard
     * Level-3 BLAS entry points (e.g. dgemm); whether a call made through
     * the cblas_dgemm wrapper is routed to the GPU should be confirmed
     * against the NVBLAS documentation.
     */
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, k, alpha, A, k, B, n, beta, C, n);
    printf ("\n Computations completed.\n\n");

    printf (" Top left corner of matrix A: \n");
    for (i=0; i<min(m,6); i++) {
        for (j=0; j<min(k,6); j++) {
            printf ("%12.0f", A[j+i*k]);
        }
        printf ("\n");
    }

    printf ("\n Top left corner of matrix B: \n");
    for (i=0; i<min(k,6); i++) {
        for (j=0; j<min(n,6); j++) {
            printf ("%12.0f", B[j+i*n]);
        }
        printf ("\n");
    }

    printf ("\n Top left corner of matrix C: \n");
    for (i=0; i<min(m,6); i++) {
        for (j=0; j<min(n,6); j++) {
            printf ("%12.5G", C[j+i*n]);
        }
        printf ("\n");
    }

    printf ("\n Deallocating memory \n\n");
    mkl_free(A);
    mkl_free(B);
    mkl_free(C);

    printf (" Example completed. \n\n");
    return 0;
}
nvblas.conf file:
# Specify which output log file (default is stderr)
NVBLAS_LOGFILE nvblas.log
# Enable trace log of every intercepted BLAS calls
NVBLAS_TRACE_LOG_ENABLED
#Put here the CPU BLAS fallback Library of your choice
#NVBLAS_CPU_BLAS_LIB libopenblas.so
NVBLAS_CPU_BLAS_LIB /opt/intel/mkl/mkl_2018p4/compilers_and_libraries_2018.5.274/linux/mkl/lib/intel64/libmkl_rt.so
NVBLAS_GPU_LIST ALL0
# Tile Dimension
NVBLAS_TILE_DIM 2048
# Autopin Memory
NVBLAS_AUTOPIN_MEM_ENABLED
I tried both methods of using nvblas. First, linking the executable against it with the following command and then running the result:
$ gcc -L/software/cuda-10/lib64 -lnvblas -L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl dgemm.c -o dgemm
and second, dynamically preloading the library at run time without recompiling:
LD_PRELOAD=libnvblas.so ./dgemm
In both cases, the nvblas.log has the following output:
[NVBLAS] Using devices :0
[NVBLAS] Config parsed
Which indicates that nothing is being offloaded.
Similarly, nvidia-smi shows that the program always allocates around 190MB on the GPU, no matter the matrix size. The GPU-Util percentage never goes above 0%.
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 10347 C ./dgemm 193MiB |
+-----------------------------------------------------------------------------+
I cannot figure out what is going on. I’ve also tried a similar setup using OpenBLAS instead of MKL, as well as much larger matrix sizes (20k x 20k), with no success. I also tried the above on a different machine, with no luck.
CPU: Intel Xeon E5-2603 v2
GPU: TitanXp with
CUDA Driver Version / Runtime Version 10.1 / 10.0,
CUDA Capability Major/Minor version number: 6.1
Can anyone help me figure out what is going wrong?
Thanks