I tried the cuSPARSE spmm_csr example with cuDoubleComplex data, as shown below, and it fails. Is this a limitation of the library, or am I doing something wrong?
int main(void) {
// Host problem definition
int A_num_rows = 4;
int A_num_cols = 4;
int A_nnz = 9;
int B_num_rows = A_num_cols;
int B_num_cols = 3;
int ldb = B_num_rows;
int ldc = A_num_rows;
int B_size = ldb * B_num_cols;
int C_size = ldc * B_num_cols;
int hA_csrOffsets[] = { 0, 3, 4, 7, 9 };
int hA_columns[] = { 0, 2, 3, 1, 0, 2, 3, 1, 3 };
std::vector<cuDoubleComplex> a_vm = { {1.0, 0.}, {2.0,0.}, {3.0,0.}, {4.0,0.}, {5.0,0.0}, {6.0,0.0}, {7.0,0.}, {8.0,0.}, {9.0,0.} };
std::vector<cuDoubleComplex> b_vm = { {1.0, 0.}, {2.,0.}, {3.,0.}, {4.,0.},
{5.0,0}, {6.0,0.}, {7.0,0.}, {8.0,0.},
{9.0,0}, {10.0,0.}, {11.0,0.}, {12.0,0.}};
std::vector<cuDoubleComplex> c_vm = { {0.0,0.0}, {0.0,0.0}, {0.0,0.0}, {0.0,0.0},
{0.0,0.0}, {0.0,0.0}, {0.0,0.0}, {0.0,0.0},
{0.0,0.0}, {0.0,0.0}, {0.0,0.0}, {0.0,0.0}};
float hC_result[] = { 19.0f, 8.0f, 51.0f, 52.0f,
43.0f, 24.0f, 123.0f, 120.0f,
67.0f, 40.0f, 195.0f, 188.0f };
float alpha = 1.0f;
float beta = 0.0f;
//--------------------------------------------------------------------------
// std::cout << "A " << std::endl;
// print_csr(hA_csrOffsets, hA_columns, a_vm, A_num_rows, A_num_cols, A_nnz);
// std::cout << "\nB" << std::endl;
// print_dn(b_vm, B_num_rows, B_num_cols);
// std::cout << "\nC" << std::endl;
// print_dn(c_vm, A_num_rows, B_num_cols);
// Device memory management
int *dA_csrOffsets, *dA_columns;
// float *dA_values, *dB, *dC;
cuDoubleComplex *dA_values, *dB, *dC;
CHECK_CUDA( cudaMalloc((void**) &dA_csrOffsets,(A_num_rows + 1) * sizeof(int)) )
CHECK_CUDA( cudaMalloc((void**) &dA_columns, A_nnz * sizeof(int)) )
// CHECK_CUDA( cudaMalloc((void**) &dA_values, A_nnz * sizeof(float)) )
CHECK_CUDA( cudaMalloc((void**) &dA_values, A_nnz * sizeof(cuDoubleComplex)) )
CHECK_CUDA( cudaMalloc((void**) &dB, B_size * sizeof(cuDoubleComplex)) )
// CHECK_CUDA( cudaMalloc((void**) &dB, B_size * sizeof(float)) )
CHECK_CUDA( cudaMalloc((void**) &dC, C_size * sizeof(cuDoubleComplex)) )
// CHECK_CUDA( cudaMalloc((void**) &dC, C_size * sizeof(float)) )
CHECK_CUDA( cudaMemcpy(dA_csrOffsets, hA_csrOffsets,(A_num_rows + 1) * sizeof(int), cudaMemcpyHostToDevice) )
// CHECK_CUDA( cudaMemcpy(dA_csrOffsets, hA_csrOffsets,(A_num_rows + 1) * sizeof(int), cudaMemcpyHostToDevice) )
CHECK_CUDA( cudaMemcpy(dA_columns, hA_columns, A_nnz * sizeof(int),cudaMemcpyHostToDevice) )
CHECK_CUDA( cudaMemcpy(dA_values, a_vm.data(), A_nnz * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice) )
// CHECK_CUDA( cudaMemcpy(dA_values, hA_values, A_nnz * sizeof(float), cudaMemcpyHostToDevice) )
CHECK_CUDA( cudaMemcpy(dB, b_vm.data(), B_size * sizeof(cuDoubleComplex),cudaMemcpyHostToDevice) )
// CHECK_CUDA( cudaMemcpy(dB, hB, B_size * sizeof(float),cudaMemcpyHostToDevice) )
CHECK_CUDA( cudaMemcpy(dC, c_vm.data(), C_size * sizeof(cuDoubleComplex),cudaMemcpyHostToDevice) )
// CHECK_CUDA( cudaMemcpy(dC, hC, C_size * sizeof(float),cudaMemcpyHostToDevice) )
//--------------------------------------------------------------------------
// CUSPARSE APIs
cusparseHandle_t handle = NULL;
cusparseSpMatDescr_t matA;
cusparseDnMatDescr_t matB, matC;
void* dBuffer = NULL;
size_t bufferSize = 0;
CHECK_CUSPARSE( cusparseCreate(&handle) )
// Create sparse matrix A in CSR format
CHECK_CUSPARSE( cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_nnz,
dA_csrOffsets, dA_columns, dA_values,
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, CUDA_C_64F) )
CHECK_CUSPARSE( cusparseCreateDnMat(&matB, A_num_cols, B_num_cols, ldb, dB,
CUDA_C_64F, CUSPARSE_ORDER_COL) )
// Create dense matrix C
CHECK_CUSPARSE( cusparseCreateDnMat(&matC, A_num_rows, B_num_cols, ldc, dC,
CUDA_C_64F, CUSPARSE_ORDER_COL) )
// allocate an external buffer if needed
CHECK_CUSPARSE( cusparseSpMM_bufferSize(
handle,
CUSPARSE_OPERATION_NON_TRANSPOSE,
CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, matA, matB, &beta, matC, CUDA_C_64F,
CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize) )
CHECK_CUDA( cudaMalloc(&dBuffer, bufferSize) )
// execute SpMM
CHECK_CUSPARSE( cusparseSpMM(handle,
CUSPARSE_OPERATION_NON_TRANSPOSE,
CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, matA, matB, &beta, matC, CUDA_C_64F,
CUSPARSE_SPMM_ALG_DEFAULT, dBuffer) )
// destroy matrix/vector descriptors
CHECK_CUSPARSE( cusparseDestroySpMat(matA) )
CHECK_CUSPARSE( cusparseDestroyDnMat(matB) )
CHECK_CUSPARSE( cusparseDestroyDnMat(matC) )
CHECK_CUSPARSE( cusparseDestroy(handle) )
//--------------------------------------------------------------------------
// device result check
CHECK_CUDA( cudaMemcpy(c_vm.data(), dC, C_size * sizeof(cuDoubleComplex),cudaMemcpyDeviceToHost) )
print_dn(c_vm, A_num_rows, B_num_cols);
int correct = 1;
for (int i = 0; i < A_num_rows; i++) {
for (int j = 0; j < B_num_cols; j++) {
if (c_vm[i + j * ldc].x != hC_result[i + j * ldc]) {
correct = 0; // direct floating point comparison is not reliable
break;
}
}
}
if (correct)
printf("\nspmm_csr_example test PASSED\n");
else
printf("\nspmm_csr_example test FAILED: wrong result\n");
//--------------------------------------------------------------------------
// device memory deallocation
CHECK_CUDA( cudaFree(dBuffer) )
CHECK_CUDA( cudaFree(dA_csrOffsets) )
CHECK_CUDA( cudaFree(dA_columns) )
CHECK_CUDA( cudaFree(dA_values) )
CHECK_CUDA( cudaFree(dB) )
CHECK_CUDA( cudaFree(dC) )
return EXIT_SUCCESS;
}
result:
A
(1, 0) (*, *) (2, 0) (3, 0)
(*, *) (4, 0) (*, *) (*, *)
(5, 0) (*, *) (6, 0) (7, 0)
(*, *) (8, 0) (*, *) (9, 0)
B
(1, 0) (4, 0) (7, 0) (10, 0)
(2, 0) (5, 0) (8, 0) (11, 0)
(3, 0) (6, 0) (9, 0) (12, 0)
C
(0, 0) (0, 0) (0, 0) (0, 0)
(0, 0) (0, 0) (0, 0) (0, 0)
(0, 0) (0, 0) (0, 0) (0, 0)
C
(1.00007e-313, 1.31779e-308) (2.73704e-313, 3.60658e-308) (6.47416e-313, 8.53094e-308) (2.10542e-313, 2.77429e-308)
(4.21084e-314, 5.54858e-309) (2.26332e-313, 2.98236e-308) (6.31625e-313, 8.32287e-308) (1.02639e-312, 1.35247e-307)
(2.68441e-313, 3.53722e-308) (1.26325e-313, 1.66457e-308) (3.52657e-313, 4.64693e-308) (9.89546e-313, 1.30392e-307)
spmm_csr_example test FAILED: wrong result
P.S. CUDA toolkit version:
Cuda compilation tools, release 12.1, V12.1.105