Hello, I am trying to use cuSolver, specifically cusolverDnSgesvd (really, where can I find any documentation?), and I noticed that the results differ a lot from those of LAPACKE_sgesvd.
(Also, I am not sure about the “work”, “work size”, and “rwork” arguments.)
For example:
cuSolver:
S[ 0 ] = 1.43155e+09
S[ 1 ] = 1.06301e+08
S[ 2 ] = 6.08459e+06
S[ 3 ] = 320.892
S[ 4 ] = 255.5
S[ 5 ] = 253.708
S[ 6 ] = 241.768
S[ 7 ] = 240.104
S[ 8 ] = 230.025
S[ 9 ] = 228.298
S[ 10 ] = 225.957
lapack:
S[0] = 1.43155e+09
S[1] = 1.06301e+08
S[2] = 6.08458e+06
S[3] = 1499.65
S[4] = 1473.44
S[5] = 1190.09
S[6] = 1040.91
S[7] = 824.54
S[8] = 819.075
S[9] = 775.057
S[10] = 769.074
code:
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
using namespace std;
int main()
{
int M = 1000;
int N = 1000;
float * A = (float *)malloc( M * N * sizeof(*A) );
for( int i = 0; i < M; i++ )
{
for( int j = 0; j < N; j++ )
{
A[ j * M + i ] = ( i + j ) * ( i + j );
}
}
float * devA;
cudaMalloc( &devA , M * N * sizeof(*devA) );
float * S = (float *)malloc( M * sizeof(*S) );
float * U = (float *)malloc( M * M * sizeof(*U) );
float * V = (float *)malloc( N * N * sizeof(*V) );
int WorkSize = M * M;
int * devInfo;
cudaMalloc( &devInfo, sizeof(*devInfo) );
float * devS;
cudaMalloc( &devS, M * sizeof(*devS) );
float * devU;
cudaMalloc( &devU,M * M * sizeof(*devU) );
float * devV;
cudaMalloc( &devV, N * N * sizeof(*devV) );
cusolverStatus_t cuSolverStatus;
cusolverDnHandle_t cuSolverHandle;
cusolverDnCreate( &cuSolverHandle );
cuSolverStatus = cusolverDnSgesvd_bufferSize( cuSolverHandle, M, N, &WorkSize );
float * Work;
cudaMalloc( &Work, WorkSize * sizeof(*Work) );
float * rwork;
cudaMalloc( &rwork, M * M * sizeof(*rwork) );
cudaMemcpy( devA, A, M * N * sizeof(*A), cudaMemcpyHostToDevice );
cuSolverStatus = cusolverDnSgesvd( cuSolverHandle, 'A', 'A', M, N, devA, M, devS, devU, M, devV, N, Work, WorkSize, NULL, devInfo );
cudaPeekAtLastError();
cudaDeviceSynchronize();
cudaMemcpy( S, devS, M * sizeof(*devS), cudaMemcpyDeviceToHost );
for( int i = 0; i < N; i++ )
cout << "S[ " << i << " ] = " << S[ i ] << endl;
cusolverDnDestroy( cuSolverHandle );
cudaDeviceReset();
return 0;
}