Hello ,
I want to solve the Ax=B system.
And I want to use cusparseScsrmv function.
I haven’t quite understand how to use it though.
I checked here http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmv.
I tried to do in my code:
int nz = 0 , *I = NULL, *J = NULL;
float *val = NULL;
int *d_col, *d_row;
float *d_val, *d_x,*d_Ax;
float alpha, betaA;
I = (int *)malloc(sizeof(int)*(N+1));
J = (int *)malloc(sizeof(int)*nz);
/* Get handle to the CUSPARSE context */
cusparseHandle_t cusparseHandle = 0;
cusparseStatus_t cusparseStatus;
cusparseStatus = cusparseCreate(&cusparseHandle);
cusparseMatDescr_t descr = 0;
cusparseStatus = cusparseCreateMatDescr(&descr);
cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);
gpuErrchk( cudaMalloc((void **)&d_col, nz*sizeof(int)) );
gpuErrchk( cudaMalloc((void **)&d_row, (n+1)*sizeof(int)) );
gpuErrchk( cudaMalloc((void **)&d_val, nz*sizeof(float)) );
gpuErrchk( cudaMalloc((void **)&d_x, (C[N-1] + R[N-1]) *sizeof(float)) );
gpuErrchk( cudaMalloc((void **)&d_Ax, N *sizeof(float)) );
gpuErrchk( cudaMemcpy(d_col, J, nz*sizeof(int), cudaMemcpyHostToDevice) );
gpuErrchk( cudaMemcpy(d_row, I, (N+ 1)*sizeof(int), cudaMemcpyHostToDevice) );
gpuErrchk( cudaMemcpy(d_val, val, nz*sizeof(float), cudaMemcpyHostToDevice) );
gpuErrchk( cudaMemcpy(d_x, X, (C[N-1] + R[N-1]) *sizeof(float), cudaMemcpyHostToDevice) );
alpha = 1.0;
beta = -1.0;
cusparseScsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, NEl, (C[N-1] + R[N-1]), nz, &alpha, descr, d_val, d_row, d_col, d_x, &beta, d_Ax);
gpuErrchk( cudaMemcpy(X, d_x, (C[N-1] + R[N-1]) *sizeof(float), cudaMemcpyDeviceToHost) );
Is this the right implementation for taking as result the X ?
-
What about beta?Is it right that have it -1 ? When do I have to put it zero?
-
It says in docs:
x <type> vector of n elements if op ( A ) = A
(it’s d_x in my code)
y <type> vector of m elements if op ( A ) = A
(it’s d_Ax in mycode)
but then , what should be the dimensions of d_x and d_Ax ?The one says about column elements (n) and the other about row elements (m) .
When I am going to copy :
gpuErrchk( cudaMemcpy(X, d_x, (C[N-1] + R[N-1]) *sizeof(float), cudaMemcpyDeviceToHost) );
what size should I put?
- What about d_val?How should I use it?
csrValA <type> array of nnz ( = csrRowPtrA(m) - csrRowPtrA(0) ) nonzero elements of matrix A.
I used the example from samples which uses d_val (crsVal) t like in the code above.
Thank you!