I’m trying to figure out how to use cublasSgemmBatched. So far I have managed to create a working version that takes an (m x n x k) matrix A from the host, multiplies it with an (n x q x k) matrix B from the host and returns the resulting (m x q x k) matrix C to the host. This is my code (which is heavily inspired by the example found at https://stackoverflow.com/questions/23743384/how-performing-multiple-matrix-multiplications-in-cuda/23743838#23743838):
#include "mex.h"
#include "cublas_v2.h"
// The MEX gateway function.
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
// Get input variables from Matlab (host variables).
const float *A, *B;
A = (const float*)mxGetData(prhs[0]);
B = (const float*)mxGetData(prhs[1]);
// Get dimensions of input variables from Matlab.
size_t m, n, k, q;
const mwSize *Adims, *Bdims;
Adims = mxGetDimensions(prhs[0]);
Bdims = mxGetDimensions(prhs[1]);
m = Adims[0];
n = Adims[1];
k = Adims[2];
q = Bdims[1];
// Create host version of resulting C matrix.
float *C;
mwSize dims[3] = { m, q, k };
plhs[0] = mxCreateNumericArray(3, dims, mxSINGLE_CLASS, mxREAL);
C = (float*)mxGetData(plhs[0]);
// Host arrays that will hold one device pointer per page of A, B and C.
float **devA = (float **)malloc(k * sizeof(float *));
float **devB = (float **)malloc(k * sizeof(float *));
float **devC = (float **)malloc(k * sizeof(float *));
// Size in bytes of one page of A, B and C respectively.
size_t A_size = sizeof(float) * m * n;
size_t B_size = sizeof(float) * n * q;
size_t C_size = sizeof(float) * m * q;
// Allocate space page by page.
for (int i = 0; i < k; i++) {
cudaMalloc((void**)&devA[i], A_size);
cudaMalloc((void**)&devB[i], B_size);
cudaMalloc((void**)&devC[i], C_size);
}
// Cublas handle.
cublasHandle_t h;
cublasCreate(&h);
// Create arrays of pointers that cublasSgemmBatched will use.
const float **d_Aarray, **d_Barray;
float **d_Carray;
cudaMalloc((void**)&d_Aarray, k * sizeof(float *));
cudaMalloc((void**)&d_Barray, k * sizeof(float *));
cudaMalloc((void**)&d_Carray, k * sizeof(float *));
// Copy A, B and C from the host into devA, devB and devC, page by page.
for (int i = 0; i < k; i++) {
cudaMemcpy(devA[i], A + m * n * i, A_size , cudaMemcpyHostToDevice);
cudaMemcpy(devB[i], B + n * q * i, B_size , cudaMemcpyHostToDevice);
cudaMemcpy(devC[i], C + m * q * i, C_size , cudaMemcpyHostToDevice);
}
// Copy the host arrays of device pointers into the device-side pointer arrays.
cudaMemcpy(d_Aarray, devA, k * sizeof(float *), cudaMemcpyHostToDevice);
cudaMemcpy(d_Barray, devB, k * sizeof(float *), cudaMemcpyHostToDevice);
cudaMemcpy(d_Carray, devC, k * sizeof(float *), cudaMemcpyHostToDevice);
// Run cublasSgemmBatched.
const float alpha = 1.0;
const float beta = 0.0;
cublasSgemmBatched(h, // cuBLAS handle.
CUBLAS_OP_N, // Don't transpose A.
CUBLAS_OP_N, // Don't transpose B.
m, // Rows of A and C.
q, // Columns of B and C.
n, // Columns of A (= rows of B).
&alpha, // Scalar alpha.
d_Aarray, // Pointer array to the pages of A.
m, // Leading dimension of A.
d_Barray, // Pointer array to the pages of B.
n, // Leading dimension of B.
&beta, // Scalar beta.
d_Carray, // Pointer array to the pages of C.
m, // Leading dimension of C.
k); // Number of pages to batch.
// Copy the result back to the host, free device memory and destroy the cuBLAS handle.
cublasDestroy(h);
for (int i = 0; i < k; i++) {
cudaMemcpy(C + m * q * i, devC[i], C_size, cudaMemcpyDeviceToHost);
cudaFree(devA[i]);
cudaFree(devB[i]);
cudaFree(devC[i]);
}
cudaFree(d_Aarray);
cudaFree(d_Barray);
cudaFree(d_Carray);
// Free the host-side arrays of device pointers.
free(devA);
free(devB);
free(devC);
}
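(I have left out all error checking above to keep the listing short. If it matters for the answer, I imagine wrapping the CUDA calls with a small helper roughly like the one below; the CUDA_CHECK name is just my own placeholder.)
// Hypothetical helper for reporting CUDA runtime errors back to Matlab.
#define CUDA_CHECK(call) \
do { \
cudaError_t err = (call); \
if (err != cudaSuccess) { \
mexErrMsgIdAndTxt("gemmBatched:cuda", "CUDA error: %s", cudaGetErrorString(err)); \
} \
} while (0)
// Example: CUDA_CHECK(cudaMalloc((void**)&devA[i], A_size));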
However, I would like to use cublasSgemmBatched in a calculation sequence where the matrices A and B are already on the device to begin with, rather than on the host as is the case in my current implementation.
Say, for example, that the matrices A and B were set up at the very start of my function like this:
float *devA, *devB, *devC;
cudaMalloc(&devA, m * n * k * sizeof(float));
cudaMalloc(&devB, n * q * k * sizeof(float));
cudaMalloc(&devC, m * q * k * sizeof(float));
cudaMemcpy(devA, A, m * n * k * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(devB, B, n * q * k * sizeof(float), cudaMemcpyHostToDevice);
How would I then go about creating the pointer arrays d_Aarray, d_Barray and d_Carray that cuBLAS wants, without having to make any additional copies of devA, devB and devC? In other words, what would I have to change in the code from the point where d_Aarray, d_Barray and d_Carray are allocated onwards, so that it works with devA, devB and devC as given above?
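My best guess is that the pointer arrays can simply hold offsets into the contiguous device buffers: build a small host array of pointers of the form devA + m * n * i, copy only that array to the device, and hand it to cublasSgemmBatched. A rough, unverified sketch of what I mean (the names hostAptrs, hostBptrs and hostCptrs are my own):
// Host arrays of device pointers, each pointing at the start of one page.
float **hostAptrs = (float **)malloc(k * sizeof(float *));
float **hostBptrs = (float **)malloc(k * sizeof(float *));
float **hostCptrs = (float **)malloc(k * sizeof(float *));
for (int i = 0; i < k; i++) {
hostAptrs[i] = devA + m * n * i;
hostBptrs[i] = devB + n * q * i;
hostCptrs[i] = devC + m * q * i;
}
// Copy only these small pointer arrays to the device; the matrix data itself is not copied again.
const float **d_Aarray, **d_Barray;
float **d_Carray;
cudaMalloc((void**)&d_Aarray, k * sizeof(float *));
cudaMalloc((void**)&d_Barray, k * sizeof(float *));
cudaMalloc((void**)&d_Carray, k * sizeof(float *));
cudaMemcpy(d_Aarray, hostAptrs, k * sizeof(float *), cudaMemcpyHostToDevice);
cudaMemcpy(d_Barray, hostBptrs, k * sizeof(float *), cudaMemcpyHostToDevice);
cudaMemcpy(d_Carray, hostCptrs, k * sizeof(float *), cudaMemcpyHostToDevice);
free(hostAptrs); free(hostBptrs); free(hostCptrs);
Is something like this valid, or do the individual pages have to come from separate cudaMalloc calls as in my current code?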
The solution is probably trivial to people with a deep understanding of double pointers and arrays of pointers, but since my understanding of these things is currently overwhelmingly shallow, it's far from obvious to me how I should create these pointer arrays.