Hi,

I am developing an matlab function to speed-up solving many small SVDs on the gpu. The size of the problem is about 200x200 but there should be about 100+ of these that I would like to solve in parallel. Since there is a new SVD solver in the cuda 7.0 cusolver I decided to give it a try. The problem is that at the moment it seems like it’s much faster to use the matlab svd (mostlikely MKL based) routine on the CPU than my current cuda solution so I hope you can help me figure out if this comes from my implementation or it is just impossible to get the performance gains I hope for.

First of all my configuration is an 8 CPU Xeon with a Titan Z card and matlab 2013a with cuda 7.0 and driver version 346.46. I found a post on this http://stackoverflow.com/questions/17401765/parallel-implementation-for-multiple-svds-using-cuda. The idea was to use streams to run the solvers in parallel. There seems to still be synchronization because of the C API i.e. the solver needs to finish before a new one is run so I had to multi-thread it and for that the simplest solution was OMP. So the implementation I have is the following:

```
#define CUDA_API_PER_THREAD_DEFAULT_STREAM
#include "mex.h"
#include "matrix.h"
#include "gpu/mxGPUArray.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include <math.h>
#include <assert.h>
#include <omp.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
static const char *_cudaGetErrorEnum(cusolverStatus_t error)
{
switch (error)
{
case CUSOLVER_STATUS_SUCCESS:
return "CUSOLVER_SUCCESS";
case CUSOLVER_STATUS_NOT_INITIALIZED:
return "CUSOLVER_STATUS_NOT_INITIALIZED";
case CUSOLVER_STATUS_ALLOC_FAILED:
return "CUSOLVER_STATUS_ALLOC_FAILED";
case CUSOLVER_STATUS_INVALID_VALUE:
return "CUSOLVER_STATUS_INVALID_VALUE";
case CUSOLVER_STATUS_ARCH_MISMATCH:
return "CUSOLVER_STATUS_ARCH_MISMATCH";
case CUSOLVER_STATUS_EXECUTION_FAILED:
return "CUSOLVER_STATUS_EXECUTION_FAILED";
case CUSOLVER_STATUS_INTERNAL_ERROR:
return "CUSOLVER_STATUS_INTERNAL_ERROR";
case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
}
return "<unknown>";
}
inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
if(CUSOLVER_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSOLVE error in file '%s', line %d\n\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }
// create the solver
int MAX_THREADS=omp_get_max_threads();
cusolverDnHandle_t *solver_handle;
float *work=NULL;
float *d_U=NULL, *d_V=NULL, *d_S=NULL;
cudaStream_t *streams;
int work_size = 0;
int *devInfo;
int Nrows;
int Ncols;
int Nims;
float const *X;
/**
* Replacement for mxGPUCopyFromMxArray
*/
mxGPUArray * mxGPUCopyFromMxArray2(mxArray const * mx) {
if (mxIsGPUArray(mx)) {
mexPrintf("gpu!\n");
mxGPUArray const * tmp = mxGPUCreateFromMxArray(mx);
mxGPUArray * returnValue = mxGPUCopyGPUArray(tmp);
mxGPUDestroyGPUArray(tmp);
return returnValue;
} else {
mexPrintf("cpu!\n");
return const_cast<mxGPUArray *>(mxGPUCreateFromMxArray(mx));
}
}
/*****************************************************************************
* mex function
****************************************************************************/
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
const mwSize* Xdims;
int num_streams=MAX_THREADS;
mxInitGPU();
const mxGPUArray* Xgpu = mxGPUCreateFromMxArray(prhs[0]);
mxClassID classID = mxGPUGetClassID(Xgpu);
if ( classID != mxSINGLE_CLASS ) {
mexErrMsgTxt("Input arrays must be single precision!");
}
Xdims = mxGPUGetDimensions(Xgpu);
mwSize Xndims = mxGPUGetNumberOfDimensions(Xgpu);
if (Xndims == 3) {
Nims = Xdims[2];
} else {
Nims = 1;
}
Nrows = Xdims[0];
Ncols = Xdims[1];
mxGPUArray * out=mxGPUCopyFromMxArray2(prhs[0]);
float * X=(float* ) mxGPUGetData(out);
//X = (float const *) mxGPUGetDataReadOnly(Xgpu);
mexPrintf("%d %d %d\n",Nrows,Ncols,Nims);
gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
// these need to be allocated by matlab to remain after exit
const mwSize Udims[3] = {Nrows,Nrows,Nims};
mxGPUArray* mxd_U = mxGPUCreateGPUArray(3, Udims, mxSINGLE_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES);
d_U = (float *) mxGPUGetData(mxd_U);
const mwSize Vdims[3]= {Ncols,Ncols,Nims};
mxGPUArray* mxd_V = mxGPUCreateGPUArray(3, Vdims, mxSINGLE_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES);
d_V = (float *)mxGPUGetData(mxd_V);
const mwSize Sdims[2]= {Ncols,Nims};
mxGPUArray* mxd_S = mxGPUCreateGPUArray(2, Sdims, mxSINGLE_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES);
d_S = (float *)mxGPUGetData(mxd_S);
// create the streams and allocate the outputs
streams = (cudaStream_t*) malloc(MAX_THREADS*sizeof(cudaStream_t));
solver_handle = (cusolverDnHandle_t*) malloc(MAX_THREADS*sizeof(cusolverDnHandle_t));
for (int i = 0; i < num_streams; i++) {
//gpuErrchk(cudaStreamCreateWithFlags(&streams[i],cudaStreamNonBlocking));
gpuErrchk(cudaStreamCreate(&streams[i]));
// allocate the work space
cusolveSafeCall(cusolverDnCreate(&solver_handle[i]));
cusolveSafeCall(cusolverDnSetStream(solver_handle[i], streams[i]));
}
cusolveSafeCall(cusolverDnSgesvd_bufferSize(solver_handle[0], Nrows, Ncols, &work_size));
gpuErrchk(cudaMalloc(&work, num_streams*work_size * sizeof(float)));
// launch -- this needs to be in a separate loop otherwise it will synch because of the cudaStreamCreate calls
// mexPrintf("max threads %d for %d images\n",MAX_THREADS,Nims);
#pragma omp parallel for
for (int i = 0; i < Nims; i++) {
// launch one worker kernel per stream
int thread_num = omp_get_thread_num();
//mexPrintf("im %d on thread no %d\n",i,thread_num);
cusolveSafeCall(cusolverDnSgesvd(solver_handle[thread_num], 'A', 'A', Nrows, Ncols, &X[Nrows*Ncols*i], Nrows, &d_S[Ncols*i], &d_U[Nrows*Nrows*i], Nrows, &d_V[Ncols*Ncols*i], Ncols, &work[work_size*thread_num], work_size, NULL, devInfo));
//gpuErrchk(cudaStreamSynchronize(streams[thread_num]));
}
mexPrintf("Destroying solver handles!\n");
for (int i = 0; i<num_streams; i++) {
//gpuErrchk(cudaStreamSynchronize(streams[i]));
//cusolveSafeCall(cusolverDnDestroy(solver_handle[i]));
}
//mexPrintf("Releasing resources!\n");
//free(streams);
//free(solver_handle);
// cudaDeviceSynchronize();
plhs[0] = mxGPUCreateMxArrayOnGPU(mxd_U);
plhs[2] = mxGPUCreateMxArrayOnGPU(mxd_V);
plhs[1] = mxGPUCreateMxArrayOnGPU(mxd_S);
return;
}
```

I am aware that there are memory leaks and not all the synchronizations are kosher but even like this the implementation is about 3 times slower than the MKL version from matlab. I’m running 10 threads and a tensor 200x200x10 which is a bit small for what i want but in the right ballpark.

I’m adding the profiler output. There are a lot of very odd things there like all the HtoD which shouldn’t appear at all since the data is on the GPU and never moved from there. The threads are clearly working but there doesn’t seem to be any major performance improvement.

Any ideas ?

Help would be greatly appreciated.