cusparseSpMM fails with complex data

I have tried the spmm_csr example with cuDoubleComplex data as followed and it failed. Is it limitation of the lib or I am doing something wrong?

int main(void) {
    // Host problem definition
    int   A_num_rows      = 4;
    int   A_num_cols      = 4;
    int   A_nnz           = 9;
    int   B_num_rows      = A_num_cols;
    int   B_num_cols      = 3;
    int   ldb             = B_num_rows;
    int   ldc             = A_num_rows;
    int   B_size          = ldb * B_num_cols;
    int   C_size          = ldc * B_num_cols;
    int   hA_csrOffsets[] = { 0, 3, 4, 7, 9 };
    int   hA_columns[]    = { 0, 2, 3, 1, 0, 2, 3, 1, 3 };
    std::vector<cuDoubleComplex> a_vm = { {1.0, 0.}, {2.0,0.}, {3.0,0.}, {4.0,0.}, {5.0,0.0}, {6.0,0.0}, {7.0,0.}, {8.0,0.}, {9.0,0.} };

    std::vector<cuDoubleComplex> b_vm = { {1.0, 0.}, {2.,0.}, {3.,0.}, {4.,0.},
                                        {5.0,0}, {6.0,0.}, {7.0,0.}, {8.0,0.},
                                        {9.0,0}, {10.0,0.}, {11.0,0.}, {12.0,0.}};

    std::vector<cuDoubleComplex> c_vm = { {0.0,0.0}, {0.0,0.0}, {0.0,0.0}, {0.0,0.0},
                                          {0.0,0.0}, {0.0,0.0}, {0.0,0.0}, {0.0,0.0},
                                          {0.0,0.0}, {0.0,0.0}, {0.0,0.0}, {0.0,0.0}};

    float hC_result[]     = { 19.0f,  8.0f,  51.0f,  52.0f,
                              43.0f, 24.0f, 123.0f, 120.0f,
                              67.0f, 40.0f, 195.0f, 188.0f };
    float alpha           = 1.0f;
    float beta            = 0.0f;
    //--------------------------------------------------------------------------
//    std::cout << "A " << std::endl;
//    print_csr(hA_csrOffsets, hA_columns, a_vm, A_num_rows, A_num_cols, A_nnz);
//    std::cout << "\nB" << std::endl;
//    print_dn(b_vm, B_num_rows, B_num_cols);
//    std::cout << "\nC" << std::endl;
//    print_dn(c_vm, A_num_rows, B_num_cols);
    // Device memory management
    int   *dA_csrOffsets, *dA_columns;
//    float *dA_values, *dB, *dC;
    cuDoubleComplex *dA_values, *dB, *dC;
    CHECK_CUDA( cudaMalloc((void**) &dA_csrOffsets,(A_num_rows + 1) * sizeof(int)) )
    CHECK_CUDA( cudaMalloc((void**) &dA_columns, A_nnz * sizeof(int))    )
//    CHECK_CUDA( cudaMalloc((void**) &dA_values,  A_nnz * sizeof(float))  )
    CHECK_CUDA( cudaMalloc((void**) &dA_values,  A_nnz * sizeof(cuDoubleComplex))  )
    CHECK_CUDA( cudaMalloc((void**) &dB,         B_size * sizeof(cuDoubleComplex)) )
//    CHECK_CUDA( cudaMalloc((void**) &dB,         B_size * sizeof(float)) )
    CHECK_CUDA( cudaMalloc((void**) &dC,         C_size * sizeof(cuDoubleComplex)) )
//    CHECK_CUDA( cudaMalloc((void**) &dC,         C_size * sizeof(float)) )

    CHECK_CUDA( cudaMemcpy(dA_csrOffsets, hA_csrOffsets,(A_num_rows + 1) * sizeof(int), cudaMemcpyHostToDevice) )
//    CHECK_CUDA( cudaMemcpy(dA_csrOffsets, hA_csrOffsets,(A_num_rows + 1) * sizeof(int), cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dA_columns, hA_columns, A_nnz * sizeof(int),cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dA_values, a_vm.data(), A_nnz * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice) )
//    CHECK_CUDA( cudaMemcpy(dA_values, hA_values, A_nnz * sizeof(float), cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dB, b_vm.data(), B_size * sizeof(cuDoubleComplex),cudaMemcpyHostToDevice) )
//    CHECK_CUDA( cudaMemcpy(dB, hB, B_size * sizeof(float),cudaMemcpyHostToDevice) )
    CHECK_CUDA( cudaMemcpy(dC, c_vm.data(), C_size * sizeof(cuDoubleComplex),cudaMemcpyHostToDevice) )
//    CHECK_CUDA( cudaMemcpy(dC, hC, C_size * sizeof(float),cudaMemcpyHostToDevice) )
    //--------------------------------------------------------------------------
    // CUSPARSE APIs
    cusparseHandle_t     handle = NULL;
    cusparseSpMatDescr_t matA;
    cusparseDnMatDescr_t matB, matC;
    void*                dBuffer    = NULL;
    size_t               bufferSize = 0;
    CHECK_CUSPARSE( cusparseCreate(&handle) )
    // Create sparse matrix A in CSR format
    CHECK_CUSPARSE( cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_nnz,
                                      dA_csrOffsets, dA_columns, dA_values,
                                      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                      CUSPARSE_INDEX_BASE_ZERO, CUDA_C_64F) )

    CHECK_CUSPARSE( cusparseCreateDnMat(&matB, A_num_cols, B_num_cols, ldb, dB,
                                        CUDA_C_64F, CUSPARSE_ORDER_COL) )

    // Create dense matrix C
    CHECK_CUSPARSE( cusparseCreateDnMat(&matC, A_num_rows, B_num_cols, ldc, dC,
                                        CUDA_C_64F, CUSPARSE_ORDER_COL) )

    // allocate an external buffer if needed
    CHECK_CUSPARSE( cusparseSpMM_bufferSize(
            handle,
            CUSPARSE_OPERATION_NON_TRANSPOSE,
            CUSPARSE_OPERATION_NON_TRANSPOSE,
            &alpha, matA, matB, &beta, matC, CUDA_C_64F,
            CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize) )
    CHECK_CUDA( cudaMalloc(&dBuffer, bufferSize) )

    // execute SpMM
    CHECK_CUSPARSE( cusparseSpMM(handle,
                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 &alpha, matA, matB, &beta, matC, CUDA_C_64F,
                                 CUSPARSE_SPMM_ALG_DEFAULT, dBuffer) )

    // destroy matrix/vector descriptors
    CHECK_CUSPARSE( cusparseDestroySpMat(matA) )
    CHECK_CUSPARSE( cusparseDestroyDnMat(matB) )
    CHECK_CUSPARSE( cusparseDestroyDnMat(matC) )
    CHECK_CUSPARSE( cusparseDestroy(handle) )
    //--------------------------------------------------------------------------
    // device result check
    CHECK_CUDA( cudaMemcpy(c_vm.data(), dC, C_size * sizeof(cuDoubleComplex),cudaMemcpyDeviceToHost) )
    print_dn(c_vm, A_num_rows, B_num_cols);
    int correct = 1;
    for (int i = 0; i < A_num_rows; i++) {
        for (int j = 0; j < B_num_cols; j++) {
            if (c_vm[i + j * ldc].x != hC_result[i + j * ldc]) {
                correct = 0; // direct floating point comparison is not reliable
                break;
            }
        }
    }
    if (correct)
        printf("\nspmm_csr_example test PASSED\n");
    else
        printf("\nspmm_csr_example test FAILED: wrong result\n");
    //--------------------------------------------------------------------------
    // device memory deallocation
    CHECK_CUDA( cudaFree(dBuffer) )
    CHECK_CUDA( cudaFree(dA_csrOffsets) )
    CHECK_CUDA( cudaFree(dA_columns) )
    CHECK_CUDA( cudaFree(dA_values) )
    CHECK_CUDA( cudaFree(dB) )
    CHECK_CUDA( cudaFree(dC) )
    return EXIT_SUCCESS;
}

result:

A 
(1, 0)	(*, *)	(2, 0)	(3, 0)	
(*, *)	(4, 0)	(*, *)	(*, *)	
(5, 0)	(*, *)	(6, 0)	(7, 0)	
(*, *)	(8, 0)	(*, *)	(9, 0)	

B
(1, 0)	(4, 0)	(7, 0)	(10, 0)	
(2, 0)	(5, 0)	(8, 0)	(11, 0)	
(3, 0)	(6, 0)	(9, 0)	(12, 0)	

C
(0, 0)	(0, 0)	(0, 0)	(0, 0)	
(0, 0)	(0, 0)	(0, 0)	(0, 0)	
(0, 0)	(0, 0)	(0, 0)	(0, 0)	

C
(1.00007e-313, 1.31779e-308)	(2.73704e-313, 3.60658e-308)	(6.47416e-313, 8.53094e-308)	(2.10542e-313, 2.77429e-308)	
(4.21084e-314, 5.54858e-309)	(2.26332e-313, 2.98236e-308)	(6.31625e-313, 8.32287e-308)	(1.02639e-312, 1.35247e-307)	
(2.68441e-313, 3.53722e-308)	(1.26325e-313, 1.66457e-308)	(3.52657e-313, 4.64693e-308)	(9.89546e-313, 1.30392e-307)	

spmm_csr_example test FAILED: wrong result

ps
Cuda compilation tools, release 12.1, V12.1.105

Hi stoitchko,
These arrays also must be cuDoubleComplex:

After correcting the above mistakes, the example will pass.

Note that with cuDoubleComplex this direct comparison isn’t allowed, you’ll need to modify it:

Thx!

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.