With CUDA Toolkit 8.0 it is possible to perform SVD on much larger matrices using “cusolverDnDgesvd” than with CUDA Toolkit 10.1. Testing with a Tesla P40, the results from the code below indicate that for a 40000 x 5000 matrix, the required workspace is:
14.883 GB for 10.1
0.200 GB for 8.0
Sadly, the 10.1 version of “cusolverDnDgesvd” runs something like 20X faster than the 8.0 version so it would be great to use the 10.1 version, but I need to be able to handle matrices that are larger than this – 10.1 won’t solve a 46747 X 3300 matrix because it exhausts available device memory (22 GB).
Any suggestions?
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
int main()
{
cusolverDnHandle_t cusolverH = NULL;
cusolverStatus_t cusolver_status = cusolverDnCreate(&cusolverH);
assert(cusolver_status == CUSOLVER_STATUS_SUCCESS);
int lwork = 0;
cusolver_status = cusolverDnDgesvd_bufferSize(
cusolverH,
40000,
5000,
&lwork);
assert(cusolver_status == CUSOLVER_STATUS_SUCCESS);
if (cusolverH)
cusolverDnDestroy(cusolverH);
cudaDeviceReset();
printf("Workspace: %0.6f GB\n", 1e-9 * lwork * sizeof(double));
printf("Hit <CR> to continue: ");
getchar();
// Tesla P40 with drivers as per installed toolkit
// CUDA Toolkit 10.1 -> Workspace: 14.883487 GB
// CUDA Toolkit 8.0 -> Workspace: 0.200121 GB
return (0);
}