The modified example is here,
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
#define A(row, col) A[(col) * 3 + (row)]
#define S_exact(row, col) S_exact[(col) * 3 + (row)]
void printMatrix(int m, int n, const cuDoubleComplex*A, int lda, const char* name)
{
int row, col;
for(row = 0 ; row < m ; row++){
for(col = 0 ; col < n ; col++){
printf("%s(%d,%d) = % 3.0f%+4.0fi\n", name, row+1, col+1, A(row, col).x, A(row, col).y);
}
}
}
int main(int argc, char*argv[])
{
cusolverDnHandle_t cusolverH = NULL;
cublasHandle_t cublasH = NULL;
cublasStatus_t cublas_status = CUBLAS_STATUS_SUCCESS;
cusolverStatus_t cusolver_status = CUSOLVER_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
cudaError_t cudaStat4 = cudaSuccess;
cudaError_t cudaStat5 = cudaSuccess;
cudaError_t cudaStat6 = cudaSuccess;
int j;
const int m = 3;
const int n = 3;
const int lda = m;
/*
* | 1+0i 1+2i 2 +10i |
* A = | 1+1i 0+3i -5 +14i |
* | 1+1i 0+5i -8 +20i |
*/
cuDoubleComplex A[lda*n];
cuDoubleComplex U[lda*m]; /* m-by-m unitary matrix */
cuDoubleComplex VT[lda*n];/* n-by-n unitary matrix */
cuDoubleComplex S[n]; /* singular value */
cuDoubleComplex S_exact[n];
A(0, 0) = make_cuDoubleComplex(1.0, 0.0);
A(0, 1) = make_cuDoubleComplex(1.0, 2.0);
A(0, 2) = make_cuDoubleComplex(2.0, 10.0);
A(1, 0) = make_cuDoubleComplex(1.0, 1.0);
A(1, 1) = make_cuDoubleComplex(0.0, 3.0);
A(1, 2) = make_cuDoubleComplex(-5.0, 14.0);
A(2, 0) = make_cuDoubleComplex(1.0, 1.0);
A(2, 1) = make_cuDoubleComplex(0.0, 5.0);
A(2, 2) = make_cuDoubleComplex(-8.0, 20.0);
S_exact(0, 1) = make_cuDoubleComplex(-7.21983412, 23.5353263);
S_exact(0, 2) = make_cuDoubleComplex(0.06502936, 0.04481);
S_exact(0, 3) = make_cuDoubleComplex(0.15480475, -0.4905163);
cuDoubleComplex *d_A = NULL;
cuDoubleComplex *d_S = NULL;
cuDoubleComplex *d_U = NULL;
cuDoubleComplex *d_VT = NULL;
int *d_info = NULL;
cuDoubleComplex *d_W = NULL; /* W = S*VT */
void *bufferOnDevice = NULL;
size_t workspaceInBytesOnDevice = 0;
void *bufferOnHost = NULL;
size_t workspaceInBytesOnHost = 0;
int info_gpu = 0;
const cuDoubleComplex h_one = make_cuDoubleComplex(1, 0);
const cuDoubleComplex h_minus_one = make_cuDoubleComplex(-1, 0);
printf("A = (matlab base-1)\n");
printMatrix(m, n, A, lda, "A");
printf("=====\n");
/* step 1: create cusolverDn/cublas handle */
cusolver_status = cusolverDnCreate(&cusolverH);
assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
cublas_status = cublasCreate(&cublasH);
assert(CUBLAS_STATUS_SUCCESS == cublas_status);
/* step 2: copy A and B to device */
cudaStat1 = cudaMalloc ((void**)&d_A , sizeof(cuDoubleComplex)*lda*n);
cudaStat2 = cudaMalloc ((void**)&d_S , sizeof(cuDoubleComplex)*n);
cudaStat3 = cudaMalloc ((void**)&d_U , sizeof(cuDoubleComplex)*lda*m);
cudaStat4 = cudaMalloc ((void**)&d_VT , sizeof(cuDoubleComplex)*lda*n);
cudaStat5 = cudaMalloc ((void**)&d_info, sizeof(int));
cudaStat6 = cudaMalloc ((void**)&d_W , sizeof(cuDoubleComplex)*lda*n);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
assert(cudaSuccess == cudaStat4);
assert(cudaSuccess == cudaStat5);
assert(cudaSuccess == cudaStat6);
cudaStat1 = cudaMemcpy(d_A, A, sizeof(cuDoubleComplex)*lda*n, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
cudaDeviceSynchronize(); /* wait until d_A is ready */
/* step 3: query working space of SVD */
signed char jobu = 'A'; /* all m columns of U */
signed char jobvt = 'A'; /* all n columns of VT */
cusolver_status = cusolverDnXgesvd_bufferSize (
cusolverH,
NULL, /* params */
jobu,
jobvt,
(int64_t)m,
(int64_t)n,
CUDA_C_64F, /* dataTypeA */
d_A,
(int64_t)lda,
CUDA_C_64F, /* dataTypeS */
d_S,
CUDA_C_64F, /* dataTypeU */
d_U,
(int64_t)lda, /* ldu */
CUDA_C_64F, /* dataTypeVT */
d_VT,
(int64_t)lda, /* ldvt */
CUDA_C_64F, /* computeType */
&workspaceInBytesOnDevice,
&workspaceInBytesOnHost);
cudaStat1 = cudaMalloc((void**)&bufferOnDevice , workspaceInBytesOnDevice);
assert(cudaSuccess == cudaStat1);
if (0 < workspaceInBytesOnHost){
bufferOnHost = (void*)malloc(workspaceInBytesOnHost);
assert(NULL != bufferOnHost);
}
/* step 4: compute SVD */
cusolver_status = cusolverDnXgesvd(
cusolverH,
NULL, /* params */
jobu,
jobvt,
(int64_t)m,
(int64_t)n,
CUDA_C_64F, /* dataTypeA */
d_A,
(int64_t)lda,
CUDA_C_64F, /* dataTypeS */
d_S,
CUDA_C_64F, /* dataTypeU */
d_U,
(int64_t)lda, /* ldu */
CUDA_C_64F, /* dataTypeVT */
d_VT,
(int64_t)lda, /* ldvt */
CUDA_C_64F, /* computeType */
bufferOnDevice,
workspaceInBytesOnDevice,
bufferOnHost,
workspaceInBytesOnHost,
d_info);
cudaStat1 = cudaDeviceSynchronize();
//~ assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
//~ assert(cudaSuccess == cudaStat1);
cudaStat1 = cudaMemcpy(U , d_U , sizeof(cuDoubleComplex)*lda*m, cudaMemcpyDeviceToHost);
cudaStat2 = cudaMemcpy(VT, d_VT, sizeof(cuDoubleComplex)*lda*n, cudaMemcpyDeviceToHost);
cudaStat3 = cudaMemcpy(S , d_S , sizeof(cuDoubleComplex)*n , cudaMemcpyDeviceToHost);
cudaStat4 = cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
assert(cudaSuccess == cudaStat4);
cudaDeviceSynchronize(); /* wait until host data is ready */
printf("after gesvd: info_gpu = %d\n", info_gpu);
assert(0 == info_gpu);
printf("=====\n");
printf("S = (matlab base-1)\n");
printMatrix(n, 1, S, lda, "S");
printf("=====\n");
printf("U = (matlab base-1)\n");
printMatrix(m, m, U, lda, "U");
printf("=====\n");
printf("VT = (matlab base-1)\n");
printMatrix(n, n, VT, lda, "VT");
printf("=====\n");
//~ /* step 5: measure error of singular value */
//~ cuDoubleComplex ds_sup = 0;
//~ for(j = 0; j < n; j++){
//~ cuDoubleComplex err = fabs( S[j] - S_exact[j] );
//~ ds_sup = (ds_sup > err)? ds_sup : err;
//~ }
//~ printf("|S - S_exact| = %E \n", ds_sup);
/* step 6: |A - U*S*VT| */
/* W = S*VT */
cublas_status = cublasZdgmm(
cublasH,
CUBLAS_SIDE_LEFT,
n,
n,
d_VT,
lda,
d_S,
1,
d_W,
lda);
assert(CUBLAS_STATUS_SUCCESS == cublas_status);
/* A := -U*W + A */
cudaStat1 = cudaMemcpy(d_A, A, sizeof(cuDoubleComplex)*lda*n, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
cudaDeviceSynchronize(); /* wait until d_A is ready */
cublas_status = cublasZgemm_v2(
cublasH,
CUBLAS_OP_N,
CUBLAS_OP_N,
m, /* number of rows of A */
n, /* number of columns of A */
n, /* number of columns of U */
&h_minus_one,
d_U,
lda,
d_W,
lda,
&h_one,
d_A,
lda);
assert(CUBLAS_STATUS_SUCCESS == cublas_status);
double dR_fro = 0.0;
cublas_status = cublasDznrm2_v2(
cublasH, lda*n, d_A, 1, &dR_fro);
assert(CUBLAS_STATUS_SUCCESS == cublas_status);
printf("|A - U*S*VT| = %E \n", dR_fro);
/* free resources */
if (d_A ) cudaFree(d_A);
if (d_S ) cudaFree(d_S);
if (d_U ) cudaFree(d_U);
if (d_VT ) cudaFree(d_VT);
if (d_info) cudaFree(d_info);
if (d_W ) cudaFree(d_W);
if (bufferOnDevice) cudaFree(bufferOnDevice);
if (bufferOnHost ) free(bufferOnHost);
if (cublasH ) cublasDestroy(cublasH);
if (cusolverH) cusolverDnDestroy(cusolverH);
cudaDeviceReset();
return 0;
}
When i run it, i got,
A = (matlab base-1)
A(1,1) = 1 +0i
A(1,2) = 1 +2i
A(1,3) = 2 +10i
A(2,1) = 1 +1i
A(2,2) = 0 +3i
A(2,3) = -5 +14i
A(3,1) = 1 +1i
A(3,2) = 0 +5i
A(3,3) = -8 +20i
=====
after gesvd: info_gpu = 0
=====
S = (matlab base-1)
S(1,1) = 0 +0i
S(2,1) = 0 +0i
S(3,1) = 0 +0i
=====
U = (matlab base-1)
U(1,1) = 0 +0i
U(1,2) = 0 +0i
U(1,3) = 0 +0i
U(2,1) = 0 +0i
U(2,2) = 0 +0i
U(2,3) = 0 +0i
U(3,1) = 0 +0i
U(3,2) = 0 +0i
U(3,3) = 0 +0i
=====
VT = (matlab base-1)
VT(1,1) = 0 +0i
VT(1,2) = 0 +0i
VT(1,3) = 0 +0i
VT(2,1) = 0 +0i
VT(2,2) = 0 +0i
VT(2,3) = 0 +0i
VT(3,1) = 0 +0i
VT(3,2) = 0 +0i
VT(3,3) = 0 +0i
=====
|A - U*S*VT| = 2.886174E+01