Cusolver SVD for general complex matrix

I want to compute the SVD for genral complex matrix with cusolverDnXgesvd. I refer to svd64_example (cf. cuSOLVER :: CUDA Toolkit Documentation), i replace the double variables by cuDoubleComplex variables, CUDA_R_64F by CUDA_C_64F but it didn’t work.

thanks for your advice.

The modified example is here,

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>

#define A(row, col) A[(col) * 3 + (row)]
#define S_exact(row, col) S_exact[(col) * 3 + (row)]

void printMatrix(int m, int n, const cuDoubleComplex*A, int lda, const char* name)
{
    int row, col;
    for(row = 0 ; row < m ; row++){
        for(col = 0 ; col < n ; col++){
            printf("%s(%d,%d) = % 3.0f%+4.0fi\n", name, row+1, col+1, A(row, col).x, A(row, col).y);
        }
    }
}

int main(int argc, char*argv[])
{
    cusolverDnHandle_t cusolverH = NULL;
    cublasHandle_t cublasH = NULL;
    cublasStatus_t cublas_status = CUBLAS_STATUS_SUCCESS;
    cusolverStatus_t cusolver_status = CUSOLVER_STATUS_SUCCESS;
    cudaError_t cudaStat1 = cudaSuccess;
    cudaError_t cudaStat2 = cudaSuccess;
    cudaError_t cudaStat3 = cudaSuccess;
    cudaError_t cudaStat4 = cudaSuccess;
    cudaError_t cudaStat5 = cudaSuccess;
    cudaError_t cudaStat6 = cudaSuccess;
    int j;
    const int m = 3;
    const int n = 3;
    const int lda = m;
     
/* 
*     | 1+0i  1+2i   2 +10i  |
* A = | 1+1i  0+3i  -5 +14i  |
*     | 1+1i  0+5i  -8 +20i  |
*/

    cuDoubleComplex A[lda*n];
    cuDoubleComplex U[lda*m]; /* m-by-m unitary matrix  */
    cuDoubleComplex VT[lda*n];/* n-by-n unitary matrix */
    cuDoubleComplex S[n]; /* singular value */

    cuDoubleComplex S_exact[n];
	
	A(0, 0) = make_cuDoubleComplex(1.0, 0.0);
    A(0, 1) = make_cuDoubleComplex(1.0, 2.0);
    A(0, 2) = make_cuDoubleComplex(2.0, 10.0);
    A(1, 0) = make_cuDoubleComplex(1.0, 1.0);
    A(1, 1) = make_cuDoubleComplex(0.0, 3.0);
    A(1, 2) = make_cuDoubleComplex(-5.0, 14.0);
    A(2, 0) = make_cuDoubleComplex(1.0, 1.0);
    A(2, 1) = make_cuDoubleComplex(0.0, 5.0);
    A(2, 2) = make_cuDoubleComplex(-8.0, 20.0);
	
	S_exact(0, 1) = make_cuDoubleComplex(-7.21983412, 23.5353263);
	S_exact(0, 2) = make_cuDoubleComplex(0.06502936, 0.04481);
    S_exact(0, 3) = make_cuDoubleComplex(0.15480475, -0.4905163);
	
    cuDoubleComplex *d_A = NULL;
    cuDoubleComplex *d_S = NULL;
    cuDoubleComplex *d_U = NULL;
    cuDoubleComplex *d_VT = NULL;
    int *d_info = NULL;
    cuDoubleComplex *d_W = NULL;  /* W = S*VT */

    void *bufferOnDevice = NULL;
    size_t workspaceInBytesOnDevice = 0;
    void *bufferOnHost = NULL;
    size_t workspaceInBytesOnHost = 0;

    int info_gpu = 0;
    const cuDoubleComplex h_one = make_cuDoubleComplex(1, 0);
    const cuDoubleComplex h_minus_one = make_cuDoubleComplex(-1, 0);
   
    printf("A = (matlab base-1)\n");
    printMatrix(m, n, A, lda, "A");
    printf("=====\n");

/* step 1: create cusolverDn/cublas handle */
    cusolver_status = cusolverDnCreate(&cusolverH);
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);

    cublas_status = cublasCreate(&cublasH);
    assert(CUBLAS_STATUS_SUCCESS == cublas_status);

/* step 2: copy A and B to device */
    cudaStat1 = cudaMalloc ((void**)&d_A  , sizeof(cuDoubleComplex)*lda*n);
    cudaStat2 = cudaMalloc ((void**)&d_S  , sizeof(cuDoubleComplex)*n);
    cudaStat3 = cudaMalloc ((void**)&d_U  , sizeof(cuDoubleComplex)*lda*m);
    cudaStat4 = cudaMalloc ((void**)&d_VT , sizeof(cuDoubleComplex)*lda*n);
    cudaStat5 = cudaMalloc ((void**)&d_info, sizeof(int));
    cudaStat6 = cudaMalloc ((void**)&d_W  , sizeof(cuDoubleComplex)*lda*n);
    assert(cudaSuccess == cudaStat1);
    assert(cudaSuccess == cudaStat2);
    assert(cudaSuccess == cudaStat3);
    assert(cudaSuccess == cudaStat4);
    assert(cudaSuccess == cudaStat5);
    assert(cudaSuccess == cudaStat6);

    cudaStat1 = cudaMemcpy(d_A, A, sizeof(cuDoubleComplex)*lda*n, cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat1);
    cudaDeviceSynchronize(); /* wait until d_A is ready */

/* step 3: query working space of SVD */
    signed char jobu  = 'A';  /* all m columns of U */
    signed char jobvt = 'A';  /* all n columns of VT */
    cusolver_status = cusolverDnXgesvd_bufferSize (
        cusolverH,
        NULL, /* params */
        jobu,
        jobvt,
        (int64_t)m,
        (int64_t)n,
        CUDA_C_64F, /* dataTypeA */
        d_A,
        (int64_t)lda,
        CUDA_C_64F, /* dataTypeS */
        d_S,
        CUDA_C_64F, /* dataTypeU */
        d_U,
        (int64_t)lda,  /* ldu */
        CUDA_C_64F, /* dataTypeVT */
        d_VT,
        (int64_t)lda, /* ldvt */
        CUDA_C_64F, /* computeType */
        &workspaceInBytesOnDevice,
        &workspaceInBytesOnHost);
    cudaStat1 = cudaMalloc((void**)&bufferOnDevice , workspaceInBytesOnDevice);
    assert(cudaSuccess == cudaStat1);
    if (0 < workspaceInBytesOnHost){
        bufferOnHost = (void*)malloc(workspaceInBytesOnHost);
        assert(NULL != bufferOnHost);
    }

/* step 4: compute SVD */
    cusolver_status = cusolverDnXgesvd(
        cusolverH,
        NULL, /* params */
        jobu,
        jobvt,
        (int64_t)m,
        (int64_t)n,
        CUDA_C_64F, /* dataTypeA */
        d_A,
        (int64_t)lda,
        CUDA_C_64F, /* dataTypeS */
        d_S,
        CUDA_C_64F, /* dataTypeU */
        d_U,
        (int64_t)lda,  /* ldu */
        CUDA_C_64F, /* dataTypeVT */
        d_VT,
        (int64_t)lda, /* ldvt */
        CUDA_C_64F, /* computeType */
        bufferOnDevice,
        workspaceInBytesOnDevice,
        bufferOnHost,
        workspaceInBytesOnHost,
        d_info);
    cudaStat1 = cudaDeviceSynchronize();
    //~ assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
    //~ assert(cudaSuccess == cudaStat1);

    cudaStat1 = cudaMemcpy(U , d_U , sizeof(cuDoubleComplex)*lda*m, cudaMemcpyDeviceToHost);
    cudaStat2 = cudaMemcpy(VT, d_VT, sizeof(cuDoubleComplex)*lda*n, cudaMemcpyDeviceToHost);
    cudaStat3 = cudaMemcpy(S , d_S , sizeof(cuDoubleComplex)*n    , cudaMemcpyDeviceToHost);
    cudaStat4 = cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost);
    assert(cudaSuccess == cudaStat1);
    assert(cudaSuccess == cudaStat2);
    assert(cudaSuccess == cudaStat3);
    assert(cudaSuccess == cudaStat4);
    cudaDeviceSynchronize(); /* wait until host data is ready */

    printf("after gesvd: info_gpu = %d\n", info_gpu);
    assert(0 == info_gpu);
    printf("=====\n");

    printf("S = (matlab base-1)\n");
    printMatrix(n, 1, S, lda, "S");
    printf("=====\n");

    printf("U = (matlab base-1)\n");
    printMatrix(m, m, U, lda, "U");
    printf("=====\n");

    printf("VT = (matlab base-1)\n");
    printMatrix(n, n, VT, lda, "VT");
    printf("=====\n");

//~ /* step 5: measure error of singular value */
    //~ cuDoubleComplex ds_sup = 0;
    //~ for(j = 0; j < n; j++){
        //~ cuDoubleComplex err = fabs( S[j] - S_exact[j] );
        //~ ds_sup = (ds_sup > err)? ds_sup : err;
    //~ }
    //~ printf("|S - S_exact| = %E \n", ds_sup);

/* step 6: |A - U*S*VT| */
    /* W = S*VT */
    cublas_status = cublasZdgmm(
        cublasH,
        CUBLAS_SIDE_LEFT,
        n,
        n,
        d_VT,
        lda,
        d_S,
         1,
        d_W,
        lda);
    assert(CUBLAS_STATUS_SUCCESS == cublas_status);
    /* A := -U*W + A */
    cudaStat1 = cudaMemcpy(d_A, A, sizeof(cuDoubleComplex)*lda*n, cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat1);
    cudaDeviceSynchronize(); /* wait until d_A is ready */
    cublas_status = cublasZgemm_v2(
        cublasH,
        CUBLAS_OP_N,
        CUBLAS_OP_N,
        m, /* number of rows of A */
        n, /* number of columns of A */
        n, /* number of columns of U  */
        &h_minus_one,
        d_U,
        lda,
        d_W,
        lda,
        &h_one,
        d_A,
        lda);
    assert(CUBLAS_STATUS_SUCCESS == cublas_status);

    double dR_fro = 0.0;
    cublas_status = cublasDznrm2_v2(
        cublasH, lda*n, d_A, 1, &dR_fro);
    assert(CUBLAS_STATUS_SUCCESS == cublas_status);

    printf("|A - U*S*VT| = %E \n", dR_fro);

/* free resources */
    if (d_A    ) cudaFree(d_A);
    if (d_S    ) cudaFree(d_S);
    if (d_U    ) cudaFree(d_U);
    if (d_VT   ) cudaFree(d_VT);
    if (d_info) cudaFree(d_info);
    if (d_W    ) cudaFree(d_W);
    if (bufferOnDevice) cudaFree(bufferOnDevice);
    if (bufferOnHost  ) free(bufferOnHost);

    if (cublasH ) cublasDestroy(cublasH);
    if (cusolverH) cusolverDnDestroy(cusolverH);

    cudaDeviceReset();

    return 0;
}

When i run it, i got,

A = (matlab base-1)
A(1,1) =   1  +0i
A(1,2) =   1  +2i
A(1,3) =   2 +10i
A(2,1) =   1  +1i
A(2,2) =   0  +3i
A(2,3) =  -5 +14i
A(3,1) =   1  +1i
A(3,2) =   0  +5i
A(3,3) =  -8 +20i
=====
after gesvd: info_gpu = 0
=====
S = (matlab base-1)
S(1,1) =   0  +0i
S(2,1) =   0  +0i
S(3,1) =   0  +0i
=====
U = (matlab base-1)
U(1,1) =   0  +0i
U(1,2) =   0  +0i
U(1,3) =   0  +0i
U(2,1) =   0  +0i
U(2,2) =   0  +0i
U(2,3) =   0  +0i
U(3,1) =   0  +0i
U(3,2) =   0  +0i
U(3,3) =   0  +0i
=====
VT = (matlab base-1)
VT(1,1) =   0  +0i
VT(1,2) =   0  +0i
VT(1,3) =   0  +0i
VT(2,1) =   0  +0i
VT(2,2) =   0  +0i
VT(2,3) =   0  +0i
VT(3,1) =   0  +0i
VT(3,2) =   0  +0i
VT(3,3) =   0  +0i
=====
|A - U*S*VT| = 2.886174E+01

Hi,
d_S should be a real array (CUDA_R_64F) even for complex type. If you uncomment the line “assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);” you should get “CUSOLVER_STATUS_INVALID_VALUE”

Best regards, Anton

1 Like

Hi,
It’s correct! Thanks for your correction.

Best regards