I think this should be most of what you need. Here is a modified version of the simpleCUBLASXT sample code. It demonstrates putting 10% of the problem on the CPU.
Pretty much. Those were the main additions I made to the cublasXT sample code.
This took some inspection and head scratching on my part. Eventually I concluded that the expected prototype matches a particular kind of fortran->C conversion of the blas API. Therefore I lifted my “functor” mostly from here.
No. You would need to account for this in your functor. For example you could use an OpenMP parallelized version of the code I provided. Alternatively, you could connect to a library like MKL which can handle threading “automatically”.
Here is an example.
main.cpp (mostly adapted from the CUDA sample code):
/* Includes, system */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Includes, cuda */
#include <cublasXt.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
/* Matrix size */
//#define N (275)
#define N (1024)
// Restricting the max used GPUs as input matrix is not so large
#define MAX_NUM_OF_GPUS 2
int sgemm_(char *, char *, int *, int *, int *, float *, float *, int *, float *, int *, float *, float *, int *);
/* Host implementation of a simple version of sgemm */
static void simple_sgemm(int n, float alpha, const float *A, const float *B,
float beta, float *C) {
int i;
int j;
int k;
for (i = 0; i < n; ++i) {
for (j = 0; j < n; ++j) {
float prod = 0;
for (k = 0; k < n; ++k) {
prod += A[k * n + i] * B[j * n + k];
}
C[j * n + i] = alpha * prod + beta * C[j * n + i];
}
}
}
void findMultipleBestGPUs(int &num_of_devices, int *device_ids) {
// Find the best CUDA capable GPU device
int current_device = 0;
int device_count;
checkCudaErrors(cudaGetDeviceCount(&device_count));
typedef struct gpu_perf_t {
uint64_t compute_perf;
int device_id;
} gpu_perf;
gpu_perf *gpu_stats = (gpu_perf *)malloc(sizeof(gpu_perf) * device_count);
cudaDeviceProp deviceProp;
int devices_prohibited = 0;
while (current_device < device_count) {
cudaGetDeviceProperties(&deviceProp, current_device);
// If this GPU is not running on Compute Mode prohibited,
// then we can add it to the list
int sm_per_multiproc;
if (deviceProp.computeMode != cudaComputeModeProhibited) {
if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
sm_per_multiproc = 1;
} else {
sm_per_multiproc =
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
}
gpu_stats[current_device].compute_perf =
(uint64_t)deviceProp.multiProcessorCount * sm_per_multiproc *
deviceProp.clockRate;
gpu_stats[current_device].device_id = current_device;
} else {
devices_prohibited++;
}
++current_device;
}
if (devices_prohibited == device_count) {
fprintf(stderr,
"gpuGetMaxGflopsDeviceId() CUDA error:"
" all devices have compute mode prohibited.\n");
exit(EXIT_FAILURE);
} else {
gpu_perf temp_elem;
// Sort the GPUs by highest compute perf.
for (int i = 0; i < current_device - 1; i++) {
for (int j = 0; j < current_device - i - 1; j++) {
if (gpu_stats[j].compute_perf < gpu_stats[j + 1].compute_perf) {
temp_elem = gpu_stats[j];
gpu_stats[j] = gpu_stats[j + 1];
gpu_stats[j + 1] = temp_elem;
}
}
}
for (int i = 0; i < num_of_devices; i++) {
device_ids[i] = gpu_stats[i].device_id;
}
}
free(gpu_stats);
}
void my_sgemm(char *transa, char *transb, int *m, int *n, int *k, float *alpha, float *A, int *lda, float *B, int *ldb, float *beta, float *C, int *ldc){
printf("transa = %c\n", *transa);
printf("transb = %c\n", *transb);
printf("m = %d\n", *m);
printf("n = %d\n", *n);
printf("k = %d\n", *k);
printf("alpha = %f\n", *alpha);
printf("beta = %f\n", *beta);
printf("lda = %d\n", *lda);
printf("ldb = %d\n", *ldb);
printf("ldc = %d\n", *ldc);
sgemm_(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
}
/* Main */
int main(int argc, char **argv) {
cublasStatus_t status;
float *h_A;
float *h_B;
float *h_C;
float *h_C_ref;
float *d_A = 0;
float *d_B = 0;
float *d_C = 0;
float alpha = 1.0f;
float beta = 0.0f;
int n2 = N * N;
int i;
float error_norm;
float ref_norm;
float diff;
cublasXtHandle_t handle;
int *devices = NULL;
int num_of_devices = 0;
checkCudaErrors(cudaGetDeviceCount(&num_of_devices));
if (num_of_devices > MAX_NUM_OF_GPUS) {
num_of_devices = MAX_NUM_OF_GPUS;
}
devices = (int *)malloc(sizeof(int) * num_of_devices);
findMultipleBestGPUs(num_of_devices, devices);
cudaDeviceProp deviceProp;
printf("Using %d GPUs\n", num_of_devices);
for (i = 0; i < num_of_devices; i++) {
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devices[i]));
printf("GPU ID = %d, Name = %s \n", devices[i], deviceProp.name);
}
/* Initialize CUBLAS */
printf("simpleCUBLASXT test running..\n");
status = cublasXtCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr, "!!!! CUBLASXT initialization error\n");
return EXIT_FAILURE;
}
/* Select devices for use in CUBLASXT math functions */
status = cublasXtDeviceSelect(handle, num_of_devices, devices);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr, "!!!! CUBLASXT device selection error\n");
return EXIT_FAILURE;
}
/* Optional: Set a block size for CUBLASXT math functions */
status = cublasXtSetBlockDim(handle, 64);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr, "!!!! CUBLASXT set block dimension error\n");
return EXIT_FAILURE;
}
/* Allocate host memory for the matrices */
h_A = (float *)malloc(n2 * sizeof(h_A[0]));
if (h_A == 0) {
fprintf(stderr, "!!!! host memory allocation error (A)\n");
return EXIT_FAILURE;
}
h_B = (float *)malloc(n2 * sizeof(h_B[0]));
if (h_B == 0) {
fprintf(stderr, "!!!! host memory allocation error (B)\n");
return EXIT_FAILURE;
}
h_C_ref = (float *)malloc(n2 * sizeof(h_C[0]));
if (h_C_ref == 0) {
fprintf(stderr, "!!!! host memory allocation error (C_ref)\n");
return EXIT_FAILURE;
}
h_C = (float *)malloc(n2 * sizeof(h_C[0]));
if (h_C == 0) {
fprintf(stderr, "!!!! host memory allocation error (C)\n");
return EXIT_FAILURE;
}
/* Fill the matrices with test data */
for (i = 0; i < n2; i++) {
h_A[i] = rand() / (float)RAND_MAX;
h_B[i] = rand() / (float)RAND_MAX;
h_C[i] = rand() / (float)RAND_MAX;
h_C_ref[i] = h_C[i];
}
/* Performs operation using plain C code */
simple_sgemm(N, alpha, h_A, h_B, beta, h_C_ref);
status = cublasXtSetCpuRatio(handle, CUBLASXT_GEMM, CUBLASXT_FLOAT, 0.1f);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr, "!!!! set CPU ratio error.\n");
return EXIT_FAILURE;
}
status = cublasXtSetCpuRoutine(handle, CUBLASXT_GEMM, CUBLASXT_FLOAT, (void *)my_sgemm);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr, "!!!! set CPU routine error.\n");
return EXIT_FAILURE;
}
/* Performs operation using cublas */
status = cublasXtSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, h_A,
N, h_B, N, &beta, h_C, N);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr, "!!!! kernel execution error.\n");
return EXIT_FAILURE;
}
/* Check result against reference */
error_norm = 0;
ref_norm = 0;
for (i = 0; i < n2; ++i) {
diff = h_C_ref[i] - h_C[i];
error_norm += diff * diff;
ref_norm += h_C_ref[i] * h_C_ref[i];
}
error_norm = (float)sqrt((double)error_norm);
ref_norm = (float)sqrt((double)ref_norm);
if (fabs(ref_norm) < 1e-7) {
fprintf(stderr, "!!!! reference norm is 0\n");
return EXIT_FAILURE;
}
/* Memory clean up */
free(h_A);
free(h_B);
free(h_C);
free(h_C_ref);
if (cudaFree(d_A) != cudaSuccess) {
fprintf(stderr, "!!!! memory free error (A)\n");
return EXIT_FAILURE;
}
if (cudaFree(d_B) != cudaSuccess) {
fprintf(stderr, "!!!! memory free error (B)\n");
return EXIT_FAILURE;
}
if (cudaFree(d_C) != cudaSuccess) {
fprintf(stderr, "!!!! memory free error (C)\n");
return EXIT_FAILURE;
}
/* Shutdown */
status = cublasXtDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr, "!!!! shutdown error (A)\n");
return EXIT_FAILURE;
}
if (error_norm / ref_norm < 1e-6f) {
printf("simpleCUBLASXT test passed.\n");
exit(EXIT_SUCCESS);
} else {
printf("simpleCUBLASXT test failed.\n");
exit(EXIT_FAILURE);
}
}
sgemm.cpp (mostly adapted from here):
#include <stdio.h>
typedef int integer;
typedef float real;
typedef bool logical;
#define max(a,b) ((a>b)?a:b)
int sgemm_(char *transa, char *transb, integer *m, integer *
n, integer *k, real *alpha, real *a, integer *lda, real *b, integer *
ldb, real *beta, real *c, integer *ldc)
{
/* System generated locals */
integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
i__3;
/* Local variables */
static integer info;
static logical nota, notb;
static real temp;
static integer i, j, l, ncola;
static integer nrowa, nrowb;
/* Purpose
=======
SGEMM performs one of the matrix-matrix operations
C := alpha*op( A )*op( B ) + beta*C,
where op( X ) is one of
op( X ) = X or op( X ) = X',
alpha and beta are scalars, and A, B and C are matrices, with op( A )
an m by k matrix, op( B ) a k by n matrix and C an m by n matrix.
Parameters
==========
TRANSA - CHARACTER*1.
On entry, TRANSA specifies the form of op( A ) to be used in
the matrix multiplication as follows:
TRANSA = 'N' or 'n', op( A ) = A.
TRANSA = 'T' or 't', op( A ) = A'.
(some comments removed here)
LDA - INTEGER.
On entry, LDA specifies the first dimension of A as declared
in the calling (sub) program. When TRANSA = 'N' or 'n' then
LDA must be at least max( 1, m ), otherwise LDA must be at
least max( 1, k ).
Unchanged on exit.
B - REAL array of DIMENSION ( LDB, kb ), where kb is
n when TRANSB = 'N' or 'n', and is k otherwise.
Before entry with TRANSB = 'N' or 'n', the leading k by n
part of the array B must contain the matrix B, otherwise
the leading n by k part of the array B must contain the
matrix B.
Unchanged on exit.
LDB - INTEGER.
On entry, LDB specifies the first dimension of B as declared
in the calling (sub) program. When TRANSB = 'N' or 'n' then
LDB must be at least max( 1, k ), otherwise LDB must be at
least max( 1, n ).
Unchanged on exit.
BETA - REAL .
On entry, BETA specifies the scalar beta. When BETA is
supplied as zero then C need not be set on input.
Unchanged on exit.
C - REAL array of DIMENSION ( LDC, n ).
Before entry, the leading m by n part of the array C must
contain the matrix C, except when beta is zero, in which
case C need not be set on entry.
On exit, the array C is overwritten by the m by n matrix
( alpha*op( A )*op( B ) + beta*C ).
LDC - INTEGER.
On entry, LDC specifies the first dimension of C as declared
in the calling (sub) program. LDC must be at least
max( 1, m ).
Unchanged on exit.
Level 3 Blas routine.
-- Written on 8-February-1989.
Jack Dongarra, Argonne National Laboratory.
Iain Duff, AERE Harwell.
Jeremy Du Croz, Numerical Algorithms Group Ltd.
Sven Hammarling, Numerical Algorithms Group Ltd.
Set NOTA and NOTB as true if A and B respectively are not
transposed and set NROWA, NCOLA and NROWB as the number of rows
and columns of A and the number of rows of B respectively.
Parameter adjustments
Function Body */
#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
#define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)]
#define C(I,J) c[(I)-1 + ((J)-1)* ( *ldc)]
nota = (*transa == 'N');
notb = (*transb == 'N');
if (nota) {
nrowa = *m;
ncola = *k;
} else {
nrowa = *k;
ncola = *m;
}
if (notb) {
nrowb = *k;
} else {
nrowb = *n;
}
/* Test the input parameters. */
info = 0;
if (! nota && ! (*transa == 'C') && ! (*transa == 'T')) {
info = 1;
} else if (! notb && ! (*transb == 'C') && ! (*transb == 'T')) {
info = 2;
} else if (*m < 0) {
info = 3;
} else if (*n < 0) {
info = 4;
} else if (*k < 0) {
info = 5;
} else if (*lda < max(1,nrowa)) {
info = 8;
} else if (*ldb < max(1,nrowb)) {
info = 10;
} else if (*ldc < max(1,*m)) {
info = 13;
}
if (info != 0) {
printf("SGEMM %d\n", info);
return 0;
}
/* Quick return if possible. */
if (*m == 0 || *n == 0 || (*alpha == 0.f || *k == 0) && *beta == 1.f) {
return 0;
}
/* And if alpha.eq.zero. */
if (*alpha == 0.f) {
if (*beta == 0.f) {
i__1 = *n;
for (j = 1; j <= *n; ++j) {
i__2 = *m;
for (i = 1; i <= *m; ++i) {
C(i,j) = 0.f;
/* L10: */
}
/* L20: */
}
} else {
i__1 = *n;
for (j = 1; j <= *n; ++j) {
i__2 = *m;
for (i = 1; i <= *m; ++i) {
C(i,j) = *beta * C(i,j);
/* L30: */
}
/* L40: */
}
}
return 0;
}
/* Start the operations. */
if (notb) {
if (nota) {
/* Form C := alpha*A*B + beta*C. */
i__1 = *n;
for (j = 1; j <= *n; ++j) {
if (*beta == 0.f) {
i__2 = *m;
for (i = 1; i <= *m; ++i) {
C(i,j) = 0.f;
/* L50: */
}
} else if (*beta != 1.f) {
i__2 = *m;
for (i = 1; i <= *m; ++i) {
C(i,j) = *beta * C(i,j);
/* L60: */
}
}
i__2 = *k;
for (l = 1; l <= *k; ++l) {
if (B(l,j) != 0.f) {
temp = *alpha * B(l,j);
i__3 = *m;
for (i = 1; i <= *m; ++i) {
C(i,j) += temp * A(i,l);
/* L70: */
}
}
/* L80: */
}
/* L90: */
}
} else {
/* Form C := alpha*A'*B + beta*C */
i__1 = *n;
for (j = 1; j <= *n; ++j) {
i__2 = *m;
for (i = 1; i <= *m; ++i) {
temp = 0.f;
i__3 = *k;
for (l = 1; l <= *k; ++l) {
temp += A(l,i) * B(l,j);
/* L100: */
}
if (*beta == 0.f) {
C(i,j) = *alpha * temp;
} else {
C(i,j) = *alpha * temp + *beta * C(i,j);
}
/* L110: */
}
/* L120: */
}
}
} else {
if (nota) {
/* Form C := alpha*A*B' + beta*C */
i__1 = *n;
for (j = 1; j <= *n; ++j) {
if (*beta == 0.f) {
i__2 = *m;
for (i = 1; i <= *m; ++i) {
C(i,j) = 0.f;
/* L130: */
}
} else if (*beta != 1.f) {
i__2 = *m;
for (i = 1; i <= *m; ++i) {
C(i,j) = *beta * C(i,j);
/* L140: */
}
}
i__2 = *k;
for (l = 1; l <= *k; ++l) {
if (B(j,l) != 0.f) {
temp = *alpha * B(j,l);
i__3 = *m;
for (i = 1; i <= *m; ++i) {
C(i,j) += temp * A(i,l);
/* L150: */
}
}
/* L160: */
}
/* L170: */
}
} else {
/* Form C := alpha*A'*B' + beta*C */
i__1 = *n;
for (j = 1; j <= *n; ++j) {
i__2 = *m;
for (i = 1; i <= *m; ++i) {
temp = 0.f;
i__3 = *k;
for (l = 1; l <= *k; ++l) {
temp += A(l,i) * B(j,l);
/* L180: */
}
if (*beta == 0.f) {
C(i,j) = *alpha * temp;
} else {
C(i,j) = *alpha * temp + *beta * C(i,j);
}
/* L190: */
}
/* L200: */
}
}
}
return 0;
/* End of SGEMM . */
} /* sgemm_ */
compile/run:
$ g++ -I/usr/local/cuda/include -I/usr/local/cuda/samples/common/inc main.cpp sgemm.cpp -o test -L/usr/local/cuda/lib64 -lcublas -lcudart
$ ./test
Using 2 GPUs
GPU ID = 0, Name = NVIDIA GeForce GTX 960
GPU ID = 1, Name = NVIDIA GeForce GT 640
simpleCUBLASXT test running..
transa = N
transb = N
m = 1024
n = 102
k = 1024
alpha = 1.000000
beta = 0.000000
lda = 1024
ldb = 1024
ldc = 1024
simpleCUBLASXT test passed.
$
CUDA 11.3, g++ 8.3.1
Just repeating a few of the requirements from the docs:
- only works for gemm operations
- only works if all the matrices are in host memory
- only works for devices of cc3.5 or higher (and this might change in future versions of CUDA)