CublasXT hybrid CPU/GPU matrix multiplication

Dear all,
I am writing a matrix multiplication program using cublasXt. Now I want to run it not only on the GPUs but also on the CPU, and I do not understand how to do that.
Here are my questions:
Is it enough to add calls to cublasXtSetCpuRoutine and cublasXtSetCpuRatio to my cublasXt program?
How do I use cublasXtSetCpuRoutine? The cublasXt.h header declares the last input parameter as “void *blasFunctor”, and I don’t know what to pass there.
Finally, is there a function to specify the number of threads the CPU should use?

To perform the computation I am using the cublasXt<t>gemm functions.

Do you want to choose between the two paths (CPU vs. GPU) at compile time or at runtime?

My goal is to be able to specify, at runtime, the number of GPUs to use and the percentage of the workload to process on the CPU.

I think this should be most of what you need. Here is a modified version of the simpleCUBLASXT sample code. It demonstrates putting 10% of the problem on the CPU.

Pretty much. Those two calls (cublasXtSetCpuRoutine and cublasXtSetCpuRatio) were the main additions I made to the cublasXt sample code.

This took some inspection and head scratching on my part. Eventually I concluded that the expected prototype matches a particular kind of Fortran-to-C conversion of the BLAS API, so I lifted my “functor” mostly from here.

No. You would need to account for this in your functor. For example, you could use an OpenMP-parallelized version of the code I provided; a minimal sketch follows. Alternatively, you could hook in a library like MKL, which can handle threading “automatically”.
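
As an illustration (my own sketch, not part of the sample below), an OpenMP-parallelized functor could look like this. It only handles the non-transposed 'N','N' case that the example below exercises, and it assumes column-major storage with the usual BLAS leading dimensions:

#include <omp.h>

// Hypothetical OpenMP CPU routine for cublasXtSetCpuRoutine.
// Only the TRANSA='N', TRANSB='N' path is sketched; a real functor
// should also handle the transpose cases (or forward them to a full BLAS).
void omp_sgemm(char *transa, char *transb, int *m, int *n, int *k,
               float *alpha, float *A, int *lda, float *B, int *ldb,
               float *beta, float *C, int *ldc) {
  #pragma omp parallel for
  for (int j = 0; j < *n; ++j) {      // columns of C
    for (int i = 0; i < *m; ++i) {    // rows of C
      float acc = 0.0f;
      for (int l = 0; l < *k; ++l)
        acc += A[i + l * (*lda)] * B[l + j * (*ldb)];
      C[i + j * (*ldc)] = *alpha * acc + *beta * C[i + j * (*ldc)];
    }
  }
}

You would register it with cublasXtSetCpuRoutine(handle, CUBLASXT_GEMM, CUBLASXT_FLOAT, (void *)omp_sgemm), compile with -fopenmp, and then OMP_NUM_THREADS (or omp_set_num_threads) controls how many CPU threads are used.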

Here is an example.

main.cpp (mostly adapted from the CUDA sample code):

/* Includes, system */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Includes, cuda */
#include <cublasXt.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>

/* Matrix size */
//#define N  (275)
#define N (1024)
// Restricting the max used GPUs as input matrix is not so large
#define MAX_NUM_OF_GPUS 2

/* Fortran-style CPU sgemm, defined in sgemm.cpp */
int sgemm_(char *, char *, int *, int *, int *, float *, float *, int *, float *, int *, float *, float *, int *);
/* Host implementation of a simple version of sgemm */
static void simple_sgemm(int n, float alpha, const float *A, const float *B,
                         float beta, float *C) {
  int i;
  int j;
  int k;

  for (i = 0; i < n; ++i) {
    for (j = 0; j < n; ++j) {
      float prod = 0;

      for (k = 0; k < n; ++k) {
        prod += A[k * n + i] * B[j * n + k];
      }

      C[j * n + i] = alpha * prod + beta * C[j * n + i];
    }
  }
}

void findMultipleBestGPUs(int &num_of_devices, int *device_ids) {
  // Find the best CUDA capable GPU device
  int current_device = 0;

  int device_count;
  checkCudaErrors(cudaGetDeviceCount(&device_count));
  typedef struct gpu_perf_t {
    uint64_t compute_perf;
    int device_id;
  } gpu_perf;

  gpu_perf *gpu_stats = (gpu_perf *)malloc(sizeof(gpu_perf) * device_count);

  cudaDeviceProp deviceProp;
  int devices_prohibited = 0;
  while (current_device < device_count) {
    cudaGetDeviceProperties(&deviceProp, current_device);

    // If this GPU is not running on Compute Mode prohibited,
    // then we can add it to the list
    int sm_per_multiproc;
    if (deviceProp.computeMode != cudaComputeModeProhibited) {
      if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
        sm_per_multiproc = 1;
      } else {
        sm_per_multiproc =
            _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
      }

      gpu_stats[current_device].compute_perf =
          (uint64_t)deviceProp.multiProcessorCount * sm_per_multiproc *
          deviceProp.clockRate;
      gpu_stats[current_device].device_id = current_device;

    } else {
      devices_prohibited++;
    }

    ++current_device;
  }
  if (devices_prohibited == device_count) {
    fprintf(stderr,
            "gpuGetMaxGflopsDeviceId() CUDA error:"
            " all devices have compute mode prohibited.\n");
    exit(EXIT_FAILURE);
  } else {
    gpu_perf temp_elem;
    // Sort the GPUs by highest compute perf.
    for (int i = 0; i < current_device - 1; i++) {
      for (int j = 0; j < current_device - i - 1; j++) {
        if (gpu_stats[j].compute_perf < gpu_stats[j + 1].compute_perf) {
          temp_elem = gpu_stats[j];
          gpu_stats[j] = gpu_stats[j + 1];
          gpu_stats[j + 1] = temp_elem;
        }
      }
    }

    for (int i = 0; i < num_of_devices; i++) {
      device_ids[i] = gpu_stats[i].device_id;
    }
  }
  free(gpu_stats);
}
/* CPU gemm routine passed to cublasXtSetCpuRoutine. Its prototype must match
   the Fortran-style (all-arguments-by-pointer) sgemm interface. It prints the
   arguments it receives and forwards to the reference sgemm_ in sgemm.cpp. */
void my_sgemm(char *transa, char *transb, int *m, int *n, int *k, float *alpha, float *A, int *lda, float *B, int *ldb, float *beta, float *C, int *ldc) {
  printf("transa = %c\n", *transa);
  printf("transb = %c\n", *transb);
  printf("m = %d\n", *m);
  printf("n = %d\n", *n);
  printf("k = %d\n", *k);
  printf("alpha = %f\n", *alpha);
  printf("beta = %f\n", *beta);
  printf("lda = %d\n", *lda);
  printf("ldb = %d\n", *ldb);
  printf("ldc = %d\n", *ldc);
  sgemm_(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
}
/* Main */
int main(int argc, char **argv) {
  cublasStatus_t status;
  float *h_A;
  float *h_B;
  float *h_C;
  float *h_C_ref;
  float *d_A = 0;
  float *d_B = 0;
  float *d_C = 0;
  float alpha = 1.0f;
  float beta = 0.0f;
  int n2 = N * N;
  int i;
  float error_norm;
  float ref_norm;
  float diff;
  cublasXtHandle_t handle;
  int *devices = NULL;

  int num_of_devices = 0;

  checkCudaErrors(cudaGetDeviceCount(&num_of_devices));

  if (num_of_devices > MAX_NUM_OF_GPUS) {
    num_of_devices = MAX_NUM_OF_GPUS;
  }
  devices = (int *)malloc(sizeof(int) * num_of_devices);

  findMultipleBestGPUs(num_of_devices, devices);
  cudaDeviceProp deviceProp;
  printf("Using %d GPUs\n", num_of_devices);
  for (i = 0; i < num_of_devices; i++) {
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devices[i]));
    printf("GPU ID = %d, Name = %s \n", devices[i], deviceProp.name);
  }

  /* Initialize CUBLAS */
  printf("simpleCUBLASXT test running..\n");

  status = cublasXtCreate(&handle);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! CUBLASXT initialization error\n");
    return EXIT_FAILURE;
  }

  /* Select devices for use in CUBLASXT math functions */
  status = cublasXtDeviceSelect(handle, num_of_devices, devices);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! CUBLASXT device selection error\n");
    return EXIT_FAILURE;
  }

  /* Optional: Set a block size for CUBLASXT math functions */
  status = cublasXtSetBlockDim(handle, 64);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! CUBLASXT set block dimension error\n");
    return EXIT_FAILURE;
  }

  /* Allocate host memory for the matrices */
  h_A = (float *)malloc(n2 * sizeof(h_A[0]));

  if (h_A == 0) {
    fprintf(stderr, "!!!! host memory allocation error (A)\n");
    return EXIT_FAILURE;
  }

  h_B = (float *)malloc(n2 * sizeof(h_B[0]));

  if (h_B == 0) {
    fprintf(stderr, "!!!! host memory allocation error (B)\n");
    return EXIT_FAILURE;
  }

  h_C_ref = (float *)malloc(n2 * sizeof(h_C[0]));

  if (h_C_ref == 0) {
    fprintf(stderr, "!!!! host memory allocation error (C_ref)\n");
    return EXIT_FAILURE;
  }

  h_C = (float *)malloc(n2 * sizeof(h_C[0]));

  if (h_C == 0) {
    fprintf(stderr, "!!!! host memory allocation error (C)\n");
    return EXIT_FAILURE;
  }

  /* Fill the matrices with test data */
  for (i = 0; i < n2; i++) {
    h_A[i] = rand() / (float)RAND_MAX;
    h_B[i] = rand() / (float)RAND_MAX;
    h_C[i] = rand() / (float)RAND_MAX;
    h_C_ref[i] = h_C[i];
  }

  /* Performs operation using plain C code */
  simple_sgemm(N, alpha, h_A, h_B, beta, h_C_ref);
  /* Route 10% of the SGEMM work to the CPU */
  status = cublasXtSetCpuRatio(handle, CUBLASXT_GEMM, CUBLASXT_FLOAT, 0.1f);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! set CPU ratio error.\n");
    return EXIT_FAILURE;
  }
  /* Register the CPU routine that cublasXt calls for its share of the work */
  status = cublasXtSetCpuRoutine(handle, CUBLASXT_GEMM, CUBLASXT_FLOAT, (void *)my_sgemm);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! set CPU routine error.\n");
    return EXIT_FAILURE;
  }
  /* Performs operation using cublas */
  status = cublasXtSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, h_A,
                         N, h_B, N, &beta, h_C, N);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel execution error.\n");
    return EXIT_FAILURE;
  }

  /* Check result against reference */
  error_norm = 0;
  ref_norm = 0;

  for (i = 0; i < n2; ++i) {
    diff = h_C_ref[i] - h_C[i];
    error_norm += diff * diff;
    ref_norm += h_C_ref[i] * h_C_ref[i];
  }

  error_norm = (float)sqrt((double)error_norm);
  ref_norm = (float)sqrt((double)ref_norm);

  if (fabs(ref_norm) < 1e-7) {
    fprintf(stderr, "!!!! reference norm is 0\n");
    return EXIT_FAILURE;
  }

  /* Memory clean up */
  free(h_A);
  free(h_B);
  free(h_C);
  free(h_C_ref);

  if (cudaFree(d_A) != cudaSuccess) {
    fprintf(stderr, "!!!! memory free error (A)\n");
    return EXIT_FAILURE;
  }

  if (cudaFree(d_B) != cudaSuccess) {
    fprintf(stderr, "!!!! memory free error (B)\n");
    return EXIT_FAILURE;
  }

  if (cudaFree(d_C) != cudaSuccess) {
    fprintf(stderr, "!!!! memory free error (C)\n");
    return EXIT_FAILURE;
  }

  /* Shutdown */
  status = cublasXtDestroy(handle);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! shutdown error (A)\n");
    return EXIT_FAILURE;
  }

  if (error_norm / ref_norm < 1e-6f) {
    printf("simpleCUBLASXT test passed.\n");
    exit(EXIT_SUCCESS);
  } else {
    printf("simpleCUBLASXT test failed.\n");
    exit(EXIT_FAILURE);
  }
}

sgemm.cpp (mostly adapted from here):

#include <stdio.h>
typedef int integer;
typedef float real;
typedef bool logical;
#define max(a,b) ((a>b)?a:b)
int sgemm_(char *transa, char *transb, integer *m, integer *
        n, integer *k, real *alpha, real *a, integer *lda, real *b, integer *
        ldb, real *beta, real *c, integer *ldc)
{


    /* System generated locals */
    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
            i__3;

    /* Local variables */
    static integer info;
    static logical nota, notb;
    static real temp;
    static integer i, j, l, ncola;
    static integer nrowa, nrowb;


/*  Purpose
    =======

    SGEMM performs one of the matrix-matrix operations

       C := alpha*op( A )*op( B ) + beta*C,

    where op( X ) is one of

       op( X ) = X   or   op( X ) = X',

    alpha and beta are scalars, and A, B and C are matrices, with op( A )
    an m by k matrix, op( B ) a k by n matrix and C an m by n matrix.

    Parameters
    ==========

    TRANSA - CHARACTER*1.
             On entry, TRANSA specifies the form of op( A ) to be used in
             the matrix multiplication as follows:

                TRANSA = 'N' or 'n',  op( A ) = A.

                TRANSA = 'T' or 't',  op( A ) = A'.

    (some comments removed here)

    LDA    - INTEGER.
             On entry, LDA specifies the first dimension of A as declared
             in the calling (sub) program. When TRANSA = 'N' or 'n' then
             LDA must be at least max( 1, m ), otherwise LDA must be at
             least max( 1, k ).
             Unchanged on exit.

    B      - REAL array of DIMENSION ( LDB, kb ), where kb is
             n when TRANSB = 'N' or 'n', and is k otherwise.
             Before entry with TRANSB = 'N' or 'n', the leading k by n
             part of the array B must contain the matrix B, otherwise
             the leading n by k part of the array B must contain the
             matrix B.
             Unchanged on exit.

    LDB    - INTEGER.
             On entry, LDB specifies the first dimension of B as declared
             in the calling (sub) program. When TRANSB = 'N' or 'n' then
             LDB must be at least max( 1, k ), otherwise LDB must be at
             least max( 1, n ).
             Unchanged on exit.

    BETA   - REAL.
             On entry, BETA specifies the scalar beta. When BETA is
             supplied as zero then C need not be set on input.
             Unchanged on exit.

    C      - REAL array of DIMENSION ( LDC, n ).
             Before entry, the leading m by n part of the array C must
             contain the matrix C, except when beta is zero, in which
             case C need not be set on entry.
             On exit, the array C is overwritten by the m by n matrix
             ( alpha*op( A )*op( B ) + beta*C ).

    LDC    - INTEGER.
             On entry, LDC specifies the first dimension of C as declared
             in the calling (sub) program. LDC must be at least
             max( 1, m ).
             Unchanged on exit.

    Level 3 Blas routine.

    -- Written on 8-February-1989.
       Jack Dongarra, Argonne National Laboratory.
       Iain Duff, AERE Harwell.
       Jeremy Du Croz, Numerical Algorithms Group Ltd.
       Sven Hammarling, Numerical Algorithms Group Ltd.

       Set NOTA and NOTB as true if A and B respectively are not
       transposed and set NROWA, NCOLA and NROWB as the number of rows
       and columns of A and the number of rows of B respectively.

       Parameter adjustments
       Function Body */

#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
#define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)]
#define C(I,J) c[(I)-1 + ((J)-1)* ( *ldc)]

    nota = (*transa == 'N');
    notb = (*transb == 'N');
    if (nota) {
        nrowa = *m;
        ncola = *k;
    } else {
        nrowa = *k;
        ncola = *m;
    }
    if (notb) {
        nrowb = *k;
    } else {
        nrowb = *n;
    }

/*     Test the input parameters. */

    info = 0;
    if (! nota && ! (*transa == 'C') && ! (*transa == 'T')) {
        info = 1;
    } else if (! notb && ! (*transb == 'C') && ! (*transb == 'T')) {
        info = 2;
    } else if (*m < 0) {
        info = 3;
    } else if (*n < 0) {
        info = 4;
    } else if (*k < 0) {
        info = 5;
    } else if (*lda < max(1,nrowa)) {
        info = 8;
    } else if (*ldb < max(1,nrowb)) {
        info = 10;
    } else if (*ldc < max(1,*m)) {
        info = 13;
    }
    if (info != 0) {
        printf("SGEMM %d\n", info);
        return 0;
    }

/*     Quick return if possible. */

    if (*m == 0 || *n == 0 || ((*alpha == 0.f || *k == 0) && *beta == 1.f)) {
        return 0;
    }

/*     And if alpha.eq.zero. */

    if (*alpha == 0.f) {
        if (*beta == 0.f) {
            i__1 = *n;
            for (j = 1; j <= *n; ++j) {
                i__2 = *m;
                for (i = 1; i <= *m; ++i) {
                    C(i,j) = 0.f;
/* L10: */
                }
/* L20: */
            }
        } else {
            i__1 = *n;
            for (j = 1; j <= *n; ++j) {
                i__2 = *m;
                for (i = 1; i <= *m; ++i) {
                    C(i,j) = *beta * C(i,j);
/* L30: */
                }
/* L40: */
            }
        }
        return 0;
    }

/*     Start the operations. */

    if (notb) {
        if (nota) {

/*           Form  C := alpha*A*B + beta*C. */

            i__1 = *n;
            for (j = 1; j <= *n; ++j) {
                if (*beta == 0.f) {
                    i__2 = *m;
                    for (i = 1; i <= *m; ++i) {
                        C(i,j) = 0.f;
/* L50: */
                    }
                } else if (*beta != 1.f) {
                    i__2 = *m;
                    for (i = 1; i <= *m; ++i) {
                        C(i,j) = *beta * C(i,j);
/* L60: */
                    }
                }
                i__2 = *k;
                for (l = 1; l <= *k; ++l) {
                    if (B(l,j) != 0.f) {
                        temp = *alpha * B(l,j);
                        i__3 = *m;
                        for (i = 1; i <= *m; ++i) {
                            C(i,j) += temp * A(i,l);
/* L70: */
                        }
                    }
/* L80: */
                }
/* L90: */
            }
        } else {

/*           Form  C := alpha*A'*B + beta*C */

            i__1 = *n;
            for (j = 1; j <= *n; ++j) {
                i__2 = *m;
                for (i = 1; i <= *m; ++i) {
                    temp = 0.f;
                    i__3 = *k;
                    for (l = 1; l <= *k; ++l) {
                        temp += A(l,i) * B(l,j);
/* L100: */
                    }
                    if (*beta == 0.f) {
                        C(i,j) = *alpha * temp;
                    } else {
                        C(i,j) = *alpha * temp + *beta * C(i,j);
                    }
/* L110: */
                }
/* L120: */
            }
        }
    } else {
        if (nota) {

/*           Form  C := alpha*A*B' + beta*C */

            i__1 = *n;
            for (j = 1; j <= *n; ++j) {
                if (*beta == 0.f) {
                    i__2 = *m;
                    for (i = 1; i <= *m; ++i) {
                        C(i,j) = 0.f;
/* L130: */
                    }
                } else if (*beta != 1.f) {
                    i__2 = *m;
                    for (i = 1; i <= *m; ++i) {
                        C(i,j) = *beta * C(i,j);
/* L140: */
                    }
                }
                i__2 = *k;
                for (l = 1; l <= *k; ++l) {
                    if (B(j,l) != 0.f) {
                        temp = *alpha * B(j,l);
                        i__3 = *m;
                        for (i = 1; i <= *m; ++i) {
                            C(i,j) += temp * A(i,l);
/* L150: */
                        }
                    }
/* L160: */
                }
/* L170: */
            }
        } else {

/*           Form  C := alpha*A'*B' + beta*C */

            i__1 = *n;
            for (j = 1; j <= *n; ++j) {
                i__2 = *m;
                for (i = 1; i <= *m; ++i) {
                    temp = 0.f;
                    i__3 = *k;
                    for (l = 1; l <= *k; ++l) {
                        temp += A(l,i) * B(j,l);
/* L180: */
                    }
                    if (*beta == 0.f) {
                        C(i,j) = *alpha * temp;
                    } else {
                        C(i,j) = *alpha * temp + *beta * C(i,j);
                    }
/* L190: */
                }
/* L200: */
            }
        }
    }

    return 0;

/*     End of SGEMM . */

} /* sgemm_ */

compile/run:

$ g++ -I/usr/local/cuda/include -I/usr/local/cuda/samples/common/inc main.cpp sgemm.cpp -o test -L/usr/local/cuda/lib64 -lcublas -lcudart
$ ./test
Using 2 GPUs
GPU ID = 0, Name = NVIDIA GeForce GTX 960
GPU ID = 1, Name = NVIDIA GeForce GT 640
simpleCUBLASXT test running..
transa = N
transb = N
m = 1024
n = 102
k = 1024
alpha = 1.000000
beta = 0.000000
lda = 1024
ldb = 1024
ldc = 1024
simpleCUBLASXT test passed.
$

CUDA 11.3, g++ 8.3.1
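
Note in the output above that the CPU functor was handed an m = 1024, n = 102, k = 1024 block, i.e. roughly 10% of the columns of C, which matches the 0.1 CPU ratio.

To make the GPU count and the CPU fraction runtime inputs, as asked in the original question, one possible sketch of the relevant changes to main() (hypothetical command-line handling of my own, not part of the sample) is:

/* Hypothetical usage: ./test <num_gpus> <cpu_ratio>, e.g. ./test 2 0.1 */
int   requested_gpus = (argc > 1) ? atoi(argv[1]) : 1;
float cpu_ratio      = (argc > 2) ? (float)atof(argv[2]) : 0.0f;

/* Cap the request by what is actually present (num_of_devices comes from
   cudaGetDeviceCount / MAX_NUM_OF_GPUS as in the sample above). */
if (requested_gpus > 0 && requested_gpus < num_of_devices)
  num_of_devices = requested_gpus;

status = cublasXtDeviceSelect(handle, num_of_devices, devices);

if (cpu_ratio > 0.0f) {
  status = cublasXtSetCpuRatio(handle, CUBLASXT_GEMM, CUBLASXT_FLOAT, cpu_ratio);
  status = cublasXtSetCpuRoutine(handle, CUBLASXT_GEMM, CUBLASXT_FLOAT, (void *)my_sgemm);
}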

Just repeating a few of the requirements from the docs:

  • only works for gemm operations
  • only works if all the matrices are in host memory
  • only works for devices of cc3.5 or higher (and this might change in future versions of CUDA)
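
If you want to guard against the last point at runtime, one small sketch (my addition, not in the sample) is to check the compute capability of the selected devices before enabling the CPU offload:

/* Hypothetical guard: only enable the CPU routine if every selected
   device is compute capability 3.5 or higher. */
bool cpu_offload_ok = true;
for (int d = 0; d < num_of_devices; d++) {
  cudaDeviceProp prop;
  checkCudaErrors(cudaGetDeviceProperties(&prop, devices[d]));
  if (prop.major < 3 || (prop.major == 3 && prop.minor < 5))
    cpu_offload_ok = false;
}
/* call cublasXtSetCpuRatio / cublasXtSetCpuRoutine only if cpu_offload_ok */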

You might also be interested in cuBLASMg, available in the Early Access program: CUDA Math Library Early Access Program | NVIDIA Developer

It can provide 2x performance improvements, depending on hardware and use cases.