Hello!
I tried to use cusparseCsrmvEx(...)
function to do matrix-vector multiplication with different types of input-output vector. It returns “CUSPARSE_STATUS_INVALID_VALUE”, when I try to pass complex (CUDA_C_64F) vector/scalar or even useless buffer-argument.
Maybe I just don’t understand this function correctly.
The code bellow shows my attempts to do it. It tries to d multiplication of double matrix and double vector and put the answer in complex vector. If you change vector d_y to double and pass corresponding types (CUDA_R_64F) to malloc, cusparseCsrmvEx_bufferSize, cusparseCsrmvEx functions, it should work. But it fails on any complex additions.
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusparse.h>
void printMatrix(int m, int n, const double*A, int lda, const char* name)
{
for(int row = 0 ; row < m ; row++){
for(int col = 0 ; col < n ; col++){
double Areg = A[row + col*lda];
printf("%s(%d,%d) = %f\n", name, row+1, col+1, Areg);
}
}
}
int main()
{
cublasHandle_t cublasH = NULL;
cusparseHandle_t cusparseH = NULL;
cudaStream_t stream = NULL;
cusparseMatDescr_t descrA = NULL;
cublasStatus_t cublasStat = CUBLAS_STATUS_SUCCESS;
cusparseStatus_t cusparseStat = CUSPARSE_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
cudaError_t cudaStat4 = cudaSuccess;
cudaError_t cudaStat5 = cudaSuccess;
const int n = 4;
const int nnzA = 9;
/*
* | 1 0 2 3 |
* | 0 4 0 0 |
* A = | 5 0 6 7 |
* | 0 8 0 9 |
*
*/
const int csrRowPtrA[n+1] = { 0, 3, 4, 7, 9 };
const int csrColIndA[nnzA] = {0, 2, 3, 1, 0, 2, 3, 1, 3 };
const double csrValA[nnzA] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 };
const double x0[n] = {1.0, 2.0, 3.0, 4.0 };
int *d_csrRowPtrA = NULL;
int *d_csrColIndA = NULL;
double *d_csrValA = NULL;
double *d_x = NULL;
cuDoubleComplex *d_y = NULL;
double h_one = 1.0;
//cuDoubleComplex h_one;
//h_one.x = 1.0; h_one.y = 0.0;
const double h_zero = 0.0;
/* step 1: create cublas/cusparse handle, bind a stream */
cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
assert(cudaSuccess == cudaStat1);
cublasStat = cublasCreate(&cublasH);
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
cublasStat = cublasSetStream(cublasH, stream);
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
cusparseStat = cusparseCreate(&cusparseH);
assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
cusparseStat = cusparseSetStream(cusparseH, stream);
assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
/* step 2: configuration of matrix A */
cusparseStat = cusparseCreateMatDescr(&descrA);
assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
cusparseSetMatIndexBase(descrA,CUSPARSE_INDEX_BASE_ZERO);
cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL );
/* step 3: copy A and x0 to device */
cudaStat1 = cudaMalloc ((void**)&d_csrRowPtrA, sizeof(int) * (n+1) );
cudaStat2 = cudaMalloc ((void**)&d_csrColIndA, sizeof(int) * nnzA );
cudaStat3 = cudaMalloc ((void**)&d_csrValA , sizeof(double) * nnzA );
cudaStat4 = cudaMalloc ((void**)&d_x , sizeof(double) * n );
cudaStat5 = cudaMalloc ((void**)&d_y , sizeof(cuDoubleComplex) * n );
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
assert(cudaSuccess == cudaStat4);
assert(cudaSuccess == cudaStat5);
cudaStat1 = cudaMemcpy(d_csrRowPtrA, csrRowPtrA, sizeof(int) * (n+1) , cudaMemcpyHostToDevice);
cudaStat2 = cudaMemcpy(d_csrColIndA, csrColIndA, sizeof(int) * nnzA , cudaMemcpyHostToDevice);
cudaStat3 = cudaMemcpy(d_csrValA , csrValA , sizeof(double) * nnzA , cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
/*
* 4.1: initial guess x0
*/
cudaStat1 = cudaMemcpy(d_x, x0, sizeof(double) * n, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
// CREATE BUFFER FOR cusparseCsrmvEx(...)
size_t buffer_size = 0;
cusparseAlgMode_t dummyAlg = CUSPARSE_ALG0;
cusparseCsrmvEx_bufferSize(cusparseH,
dummyAlg,
CUSPARSE_OPERATION_NON_TRANSPOSE,
n,
n,
nnzA,
&h_one, CUDA_R_64F,
descrA,
d_csrValA, CUDA_R_64F,
d_csrRowPtrA,
d_csrColIndA,
d_x, CUDA_R_64F,
&h_zero, CUDA_R_64F,
d_y, CUDA_C_64F,
CUDA_C_64F,
&buffer_size);
void* buffer = NULL;
cudaError_t cudaStat = cudaSuccess;
cudaStat = cudaMalloc ((void**)&buffer, buffer_size);
assert(cudaSuccess == cudaStat);
printf("BUFFER is %E\n", buffer_size );
/*
* 4.3: y = A*x
*/
cusparseStat = cusparseCsrmvEx(cusparseH,
dummyAlg,
CUSPARSE_OPERATION_NON_TRANSPOSE,
n,
n,
nnzA,
&h_one, CUDA_R_64F,
descrA,
d_csrValA, CUDA_R_64F,
d_csrRowPtrA,
d_csrColIndA,
d_x, CUDA_R_64F,
&h_zero, CUDA_R_64F,
d_y, CUDA_C_64F,
CUDA_C_64F,
buffer
);
assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
/* free resources */
if (d_csrRowPtrA ) cudaFree(d_csrRowPtrA);
if (d_csrColIndA ) cudaFree(d_csrColIndA);
if (d_csrValA ) cudaFree(d_csrValA);
if (d_x ) cudaFree(d_x);
if (d_y ) cudaFree(d_y);
if (cublasH ) cublasDestroy(cublasH);
if (cusparseH ) cusparseDestroy(cusparseH);
if (stream ) cudaStreamDestroy(stream);
if (descrA ) cusparseDestroyMatDescr(descrA);
cudaDeviceReset();
printf("EVERYTHIG IS FINE =)\n");
return 0;
}
nvcc cuComplexTest.cu -o cuComplexTest -lcublas -lcusparse && ./cuComplexTest
Thank you for any help!