Bug when trying to transpose a matrix using cuSPARSE

Hi!
Thanks for your attention!
I need to transpose a matrix (stored in CSR format) using cuSPARSE, but I get an “internal error”. I wrote my code with reference to https://stackoverflow.com/questions/57368010/how-to-transpose-a-sparse-matrix-in-cusparse and https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2.

I am running on an NVIDIA GeForce GTX 1080 with cuda_11.1.0 installed, on Windows 10.

My code follows. I am not familiar with this forum and do not know how to format code neatly, so I have simply pasted it here. To reproduce the problem, download the folder from https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/sparse2dense, replace sparse2dense_example.c with my code, and then configure and build with CMake.

#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc.
#include <cusparse.h>         // cusparseSparseToDense
#include <stdio.h>            // printf
#include <stdlib.h>           // EXIT_FAILURE

// Evaluates a CUDA runtime call exactly once. If the result is anything other
// than cudaSuccess, prints the source line and the error description, then
// returns EXIT_FAILURE from the enclosing function. Expands to a brace block,
// so call sites do not need a trailing semicolon.
#define CHECK_CUDA(func)                                                       \
{                                                                              \
    const cudaError_t cuda_rc_ = (func);                                       \
    if (cudaSuccess != cuda_rc_) {                                             \
        printf("CUDA API failed at line %d with error: %s (%d)\n",             \
               __LINE__, cudaGetErrorString(cuda_rc_), cuda_rc_);              \
        return EXIT_FAILURE;                                                   \
    }                                                                          \
}

// Evaluates a cuSPARSE call exactly once. If the result is anything other than
// CUSPARSE_STATUS_SUCCESS, prints the source line and the error description,
// then returns EXIT_FAILURE from the enclosing function. Expands to a brace
// block, so call sites do not need a trailing semicolon.
#define CHECK_CUSPARSE(func)                                                   \
{                                                                              \
    const cusparseStatus_t sparse_rc_ = (func);                                \
    if (CUSPARSE_STATUS_SUCCESS != sparse_rc_) {                               \
        printf("CUSPARSE API failed at line %d with error: %s (%d)\n",         \
               __LINE__, cusparseGetErrorString(sparse_rc_), sparse_rc_);      \
        return EXIT_FAILURE;                                                   \
    }                                                                          \
}

int main(void) {
    // CUSPARSE APIs
    cusparseHandle_t     handle = NULL;
	cusparseStatus_t status = (cusparseCreate(&handle));
	if (status != CUSPARSE_STATUS_SUCCESS) {
		printf("CUSPARSE API failed at line %d with error: %s (%d)\n", __LINE__, cusparseGetErrorString(status), status);
	}
    
	// Initialize matrix A
	// this matrix is the same as https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/sparse2dense/sparse2dense_example.c
	int   num_rows = 5;
	int   num_cols = 4;
	int   nnz = 11;
	int   h_csr_offsets[] = { 0, 3, 4, 7, 9, 11 };
	int   h_csr_columns[] = { 0, 2, 3, 1, 0, 2, 3, 1, 3, 1, 2 };
	float h_csr_values[] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f,
							   7.0f, 8.0f, 9.0f, 10.0f, 11.0f };
	// Device memory management
	int* d_csr_offsets, * d_csr_columns;
	float* d_csr_values;
	CHECK_CUDA(cudaMalloc((void**)&d_csr_offsets, (num_rows + 1) * sizeof(int)))
	CHECK_CUDA(cudaMalloc((void**)&d_csr_columns, nnz * sizeof(int)))
	CHECK_CUDA(cudaMalloc((void**)&d_csr_values, nnz * sizeof(float)))

	CHECK_CUDA(cudaMemcpy(d_csr_offsets, h_csr_offsets,	(num_rows + 1) * sizeof(int), cudaMemcpyHostToDevice))
	CHECK_CUDA(cudaMemcpy(d_csr_columns, h_csr_columns, nnz * sizeof(int), cudaMemcpyHostToDevice))
	CHECK_CUDA(cudaMemcpy(d_csr_values, h_csr_values, nnz * sizeof(float), cudaMemcpyHostToDevice))

	// Memory allocation of transpose A
	int* d_csr_offsets_AT, * d_csr_columns_AT;
	float* d_csr_values_AT;
	//first allocate memory to ATT
	CHECK_CUDA(cudaMalloc((void**)&d_csr_offsets_AT, (num_cols + 1) * sizeof(int)))
	CHECK_CUDA(cudaMalloc((void**)&d_csr_columns_AT, nnz * sizeof(int)))
	CHECK_CUDA(cudaMalloc((void**)&d_csr_values_AT, nnz * sizeof(float)))

	size_t buffer_temp_size;
	cusparseCsr2cscEx2_bufferSize(
		handle, num_rows, num_cols, nnz, h_csr_values, h_csr_offsets, h_csr_columns,
		d_csr_values_AT, d_csr_offsets_AT, d_csr_columns_AT, CUDA_R_32F, CUSPARSE_ACTION_NUMERIC,
		CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, &buffer_temp_size);
	void* buffer_temp = NULL;
	printf("buffer_temp_size is %zd\n", buffer_temp_size);
	CHECK_CUDA(cudaMalloc(&buffer_temp, buffer_temp_size))
	CHECK_CUSPARSE(cusparseCsr2cscEx2(handle, num_rows, num_cols, nnz, h_csr_values, h_csr_offsets, h_csr_columns,
		d_csr_values_AT, d_csr_offsets_AT, d_csr_columns_AT, CUDA_R_32F, CUSPARSE_ACTION_NUMERIC,
		CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, buffer_temp))
}

This question was cross-posted here,

with the answer: you are passing host pointers (e.g. h_csr_values) to a routine that expects device pointers.

change your usages of:

 ... h_csr_values, h_csr_offsets, h_csr_columns, ...

to:

 ... d_csr_values, d_csr_offsets, d_csr_columns, ...
1 Like