I am testing some of the new dense linear-algebra (cuSOLVER) capabilities in CUDA 7.0, and I am finding the SVD to be extremely slow compared to MKL. For example, the code snippet below loads a 1856-by-1849 complex matrix and performs an SVD. MKL does the SVD in 2.2 s of wall-clock time, whereas cusolverDnCgesvd takes a whopping 41.3 s of wall-clock time. The reduction appears to be correct in both cases. Are any special compiler flags needed to achieve good performance with the new cuSOLVER dense functions?
Platform: Ubuntu 14.04, GeForce GTX 690, driver 346.46, MKL bundled with Intel 15.0, 12 CPU threads.
// Loads a 1856 x 1849 single-precision complex matrix from "./VFull.dat",
// copies it to the GPU and computes its full SVD with cusolverDnCgesvd
// (jobu = 'A', jobvt = 'A').
//
// Returns 0 when the solver reports success and devInfo == 0, and 1 otherwise.
// All host and device buffers and the cuSOLVER handle are released before
// returning.
int main()
{
    int exitCode = 0;

    // Without a valid handle nothing below can work, so bail out early.
    cusolverDnHandle_t cuHandle;
    if (cusolverDnCreate(&cuHandle) != CUSOLVER_STATUS_SUCCESS)
    {
        cout << "Initialization of cuSolver failed." << endl;
        return 1;
    }

    int m = 1856;
    int n = 1849;
    int ldA = m;
    int minMN = min(m, n);

    // Widen to size_t before multiplying so the byte counts cannot overflow
    // int arithmetic if the dimensions are ever increased.
    const size_t elemsA = (size_t)m * (size_t)n;
    const size_t elemsU = (size_t)m * (size_t)m;
    const size_t elemsVH = (size_t)n * (size_t)n;

    complex<float>* h_A = (complex<float>*)malloc(elemsA * sizeof(complex<float>));
    if (h_A == NULL)
    {
        cout << "Host allocation of input matrix failed." << endl;
        cusolverDnDestroy(cuHandle);
        return 1;
    }
    _readMatrix(m, n, h_A, "./VFull.dat");

    // Input matrix A (m x n, leading dimension ldA); overwritten by the solver.
    cuComplex* d_A = NULL;
    gpuErrchk(cudaMalloc(&d_A, elemsA * sizeof(cuComplex)));
    gpuErrchk(cudaMemcpy(d_A, h_A, elemsA * sizeof(cuComplex), cudaMemcpyHostToDevice));

    // Left singular vectors U (m x m).
    int ldUsvd = ldA;
    cuComplex* d_Usvd = NULL;
    gpuErrchk(cudaMalloc(&d_Usvd, elemsU * sizeof(cuComplex)));

    // Right singular vectors, conjugate-transposed, V^H (n x n).
    int ldVsvdH = n;
    cuComplex* d_VsvdH = NULL;
    gpuErrchk(cudaMalloc(&d_VsvdH, elemsVH * sizeof(cuComplex)));

    // Singular values (min(m, n) real numbers).
    float* d_Ssvd = NULL;
    gpuErrchk(cudaMalloc(&d_Ssvd, (size_t)minMN * sizeof(float)));

    // Query the workspace size; running the solver with an invalid size is
    // pointless, so a failure here skips the factorization entirely.
    cusolverStatus_t status;
    int workSize = 0;
    status = cusolverDnCgesvd_bufferSize(cuHandle, m, n, &workSize);
    if (status != CUSOLVER_STATUS_SUCCESS)
    {
        cout << "cusolverDnCgesvd_bufferSize failed." << endl;
        exitCode = 1;
    }

    int* devInfo = NULL;
    cuComplex* work = NULL;
    float* rwork = NULL;

    if (exitCode == 0)
    {
        gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
        gpuErrchk(cudaMalloc(&work, (size_t)workSize * sizeof(cuComplex)));
        // NOTE(review): rwork keeps the original (generous) byte size; confirm
        // the exact required length against the cuSOLVER gesvd documentation.
        gpuErrchk(cudaMalloc(&rwork, (size_t)workSize * sizeof(cuComplex)));

        status = cusolverDnCgesvd(cuHandle, 'A', 'A', m, n, d_A, ldA,
                                  d_Ssvd, d_Usvd, ldUsvd, d_VsvdH, ldVsvdH,
                                  work, workSize, rwork, devInfo);
        switch (status)
        {
        case CUSOLVER_STATUS_SUCCESS:
            cout << "success" << endl;
            break;
        case CUSOLVER_STATUS_NOT_INITIALIZED:
            cout << "Library cuSolver not initialized correctly" << endl;
            break;
        case CUSOLVER_STATUS_INVALID_VALUE:
            cout << "Invalid parameters passed" << endl;
            break;
        case CUSOLVER_STATUS_INTERNAL_ERROR:
            cout << "Internal operation failed" << endl;
            break;
        case CUSOLVER_STATUS_EXECUTION_FAILED:
            cout << "Execution failed" << endl;
            break;
        default:
            // Any other status (e.g. allocation failure, architecture
            // mismatch) must not pass silently.
            cout << "cusolverDnCgesvd failed with status " << (int)status << endl;
            break;
        }
        if (status != CUSOLVER_STATUS_SUCCESS)
            exitCode = 1;

        // Surface any asynchronous execution error from the solver kernels.
        gpuErrchk(cudaDeviceSynchronize());

        if (status == CUSOLVER_STATUS_SUCCESS)
        {
            // The returned status only covers the call itself; devInfo carries
            // the numerical outcome (0 = OK, <0 = bad argument index,
            // >0 = SVD did not fully converge).
            int h_devInfo = 0;
            gpuErrchk(cudaMemcpy(&h_devInfo, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
            if (h_devInfo != 0)
            {
                cout << "cusolverDnCgesvd reported devInfo = " << h_devInfo << endl;
                exitCode = 1;
            }
        }
    }

    // Release everything; cudaFree(NULL) is a no-op, so buffers that were
    // never allocated are safe to pass here.
    gpuErrchk(cudaFree(rwork));
    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(devInfo));
    gpuErrchk(cudaFree(d_Ssvd));
    gpuErrchk(cudaFree(d_VsvdH));
    gpuErrchk(cudaFree(d_Usvd));
    gpuErrchk(cudaFree(d_A));
    free(h_A);
    cusolverDnDestroy(cuHandle);

    return exitCode;
}