cusparseSpGEMM_copy doesn't work

Hi, I’m trying to use cusparseSpGEMM routine to do sparse matrix multiplication of A and B resulting in C. Basically, I followed the example from the github. However, while no error was reported it seems that the final copy from temporary memory to sparse matrix C failed. I tried to print out the resulting C values_array on device. Turns out that array is empty, same goes to offset_array and columIndex_array of C. The following is my code. Any idea why it failed? Thanks in advance.

I work with CUDA v12.0 with Visual studio 2022. The card I have is RTXA3000 laptop GPU with 6GB dedicated GPU memory.

Matrix A : nRowsA: 6, nColumnsA: 136553, nNonZerosA: 369057.
Matrix B: nRowsB: 136553, nColumnsB: 136553, nNonZerosB: 9871015

Code follows:

cuSparseSpgemm(
const cusparseHandle_t &cusparseHandle,
const SpMatrix &SpMatrixA,
const SpMatrix &SpMatrixB,
SpMatrix &SpMatrixC)
{
cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseSpGEMMAlg_t alg = CUSPARSE_SPGEMM_ALG3; //CUSPARSE_SPGEMM_DEFAULT

// Allocate alpha and beta on device
ScalarPtr device_scalars, device_alpha, device_beta;
cudaMalloc(&device_scalars, 4 * sizeof(Scalar));
checkCudaError(“cudaMalloc of device_scalars failed!”);
device_alpha = device_scalars++;
device_beta = device_scalars++;
device_scalars = device_alpha;
device_assign(device_alpha, 1.0);
device_assign(device_beta, 2.0);

// Get sparse matrix sizes of A and B
Index64 nRowsA{0}, nColumnsA{0}, nNonZerosA{0};
Index64 nRowsB{0}, nColumnsB{0}, nNonZerosB{0};
cusparseSpMatDescr_t descriptorA = SpMatrixA.getDeviceMatrix();
cusparseSpMatDescr_t descriptorB = SpMatrixB.getDeviceMatrix();

if (descriptorA == nullptr || descriptorB == nullptr)
{
std::cerr << “null A or B\n”;
exit(EXIT_FAILURE);
}

auto cuSparseStatus = cusparseSpMatGetSize(descriptorA, &nRowsA, &nColumnsA, &nNonZerosA);
checkCudaStatus(cuSparseStatus, “Get cuSparseMatrixA size failed!”);
cuSparseStatus = cusparseSpMatGetSize(descriptorB, &nRowsB, &nColumnsB, &nNonZerosB);
checkCudaStatus(cuSparseStatus, “Get cuSparseMatrixB size failed!”);

// Allocate device memory for C
IntPtr device_csrRowOffsetsC{nullptr}, device_csrColumnIndexC{nullptr};
ScalarPtr device_csrValuesC{nullptr};
auto nRowsC = nRowsA;
auto nColumnsC = nColumnsB;
cudaMalloc(reinterpret_cast<VoidPtr *>(&device_csrRowOffsetsC), (nRowsC + 1) * sizeof(int));
checkCudaError(“Allocate device csrRowOffsetsC failed!\n”);

// Create device C descriptor
cusparseSpMatDescr_t descriptorC;
cuSparseStatus = cusparseCreateCsr(
&descriptorC,
nRowsC,
nColumnsC,
0,
device_csrRowOffsetsC,
nullptr,
nullptr,
CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ONE,
CUDA_R_64F);
checkCudaStatus(cuSparseStatus, “Create of cuSPARSE sparseCsr matrix C failed!”);

// SpGEMM descriptor
cusparseSpGEMMDescr_t spgemmDescriptor;
cuSparseStatus = cusparseSpGEMM_createDescr(&spgemmDescriptor);
checkCudaStatus(cuSparseStatus, “Create of cuSPARSE spgemmDescriptor failed!”);

// Allocate buffer
VoidPtr device_buffer1{nullptr}, device_buffer2{nullptr};
size_t bufferSize1{0}, bufferSize2{0};

// Ask bufferSize1 for external memory
cuSparseStatus = cusparseSpGEMM_workEstimation(
cusparseHandle,
opA,
opB,
&device_alpha,
descriptorA,
descriptorB,
&device_beta,
descriptorC,
CUDA_R_64F,
alg,
spgemmDescriptor,
&bufferSize1,
nullptr);
checkCudaStatus(
cuSparseStatus, “cusparseSpGEMM_workEstimation0 for matrix C failed!”);
cudaMalloc(&device_buffer1, bufferSize1);
checkCudaError(“cudaMalloc of device_buffer1 failed!”);

// Inspect the matrices A and B to understand the memory requirement for
// the next step
cuSparseStatus = cusparseSpGEMM_workEstimation(
cusparseHandle,
opA,
opB,
&device_alpha,
descriptorA,
descriptorB,
&device_beta,
descriptorC,
CUDA_R_64F,
alg,
spgemmDescriptor,
&bufferSize1,
device_buffer1);
checkCudaStatus(cuSparseStatus, “cusparseSpGEMM_workEstimation1 for matrix C failed!”);

int64_t num_prods;
float chunk_fraction = 0.2f;
VoidPtr device_buffer3{nullptr};
size_t bufferSize3{0};
cuSparseStatus = cusparseSpGEMM_getNumProducts(spgemmDescriptor, &num_prods);
checkCudaStatus(cuSparseStatus, “cusparseSpGEMM_getNumProducts failed!”);

// Ask bufferSize3 bytes for external memory
cuSparseStatus = cusparseSpGEMM_estimateMemory(
cusparseHandle,
opA,
opB,
&device_alpha,
descriptorA,
descriptorB,
&device_beta,
descriptorC,
CUDA_R_64F,
alg,
spgemmDescriptor,
chunk_fraction,
&bufferSize3,
nullptr,
nullptr);
checkCudaStatus(cuSparseStatus, “cusparseSpGEMM_estimateMemory failed!”);
cudaMalloc(&device_buffer3, bufferSize3);
checkCudaError(“cudaMalloc of device_buffer3 failed!”);

cuSparseStatus = cusparseSpGEMM_estimateMemory(
cusparseHandle,
opA,
opB,
&device_alpha,
descriptorA,
descriptorB,
&device_beta,
descriptorC,
CUDA_R_64F,
alg,
spgemmDescriptor,
chunk_fraction,
&bufferSize3,
device_buffer3,
&bufferSize2);
checkCudaStatus(cuSparseStatus, “cusparseSpGEMM_estimateMemory failed!”);

cudaFree(device_buffer3);
checkCudaError(“Free device_buffer3 failed!\n”);

cudaMalloc(&device_buffer2, bufferSize2);
checkCudaError(“cudaMalloc of device_buffer2 failed!”);

// Compute the intermediate product of A * B
cuSparseStatus = cusparseSpGEMM_compute(
cusparseHandle,
opA,
opB,
&device_alpha,
descriptorA,
descriptorB,
&device_beta,
descriptorC,
CUDA_R_64F,
alg,
spgemmDescriptor,
&bufferSize2,
device_buffer2);
checkCudaStatus(cuSparseStatus, “cusparseSpGEMM compute failed!”);

// Get matrix C size
Index64 nRowsCTmp{0}, nColumnsCTmp{0}, nNonZerosC{0};
cuSparseStatus = cusparseSpMatGetSize(descriptorC, &nRowsCTmp, &nColumnsCTmp, &nNonZerosC);
checkCudaStatus(cuSparseStatus, “Get cuSparseMatrixC size failed!”);

// Allocate matrix C
cudaMalloc(reinterpret_cast<VoidPtr *>(&device_csrColumnIndexC), nNonZerosC * sizeof(int));
checkCudaError(“Allocate device csrColumnIndexC failed!\n”);
cudaMalloc(reinterpret_cast<VoidPtr *>(&device_csrValuesC), nNonZerosC * sizeof(Scalar));
checkCudaError(“Allocate device csrValuesC failed!\n”);

// Update matrix C with the new device pointers
cuSparseStatus = cusparseCsrSetPointers(
descriptorC, device_csrRowOffsetsC, device_csrColumnIndexC, device_csrValuesC);
checkCudaStatus(cuSparseStatus, “cusparseCsrSetPointers failed!”);

// Copy the final products to the matrix C
cuSparseStatus = cusparseSpGEMM_copy(
cusparseHandle,
opA,
opB,
&device_alpha,
descriptorA,
descriptorB,
&device_beta,
descriptorC,
CUDA_R_64F,
alg,
spgemmDescriptor);
checkCudaStatus(cuSparseStatus, “cusparseSpGEMM_copy failed!”);

/* Following simple print function is used.
void device_printScalar(std::string descr, ScalarPtr val)
{
std::cout << descr << “\n”;
printScalar<<<1, 1>>>(val);
cudaDeviceSynchronize();
}
global void printScalar(ScalarPtr val)
{
printf(“%d\n”, val[0]);
}
*/

for (int i = 0; i < 20; i++)
{
device_printScalar(“C device_csrValues”, device_csrValuesC++); // prints out nothing.
}

// Use of descriptorC…


// Free device resources
cuSparseStatus = cusparseSpGEMM_destroyDescr(spgemmDescriptor);
cudaFree(device_buffer1);
cudaFree(device_buffer2);
cudaFree(device_scalars);
}

Problem solved.

It is related to device_alpha and device_beta which are device pointers. In the call of e.g. cusparseSpGEMM_workEstimation(), device_alpha and device_beta should be passed, not &device_alpha and &device_beta. Silly mistake.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.