# Low speed MatrixVectorMult & Vector Sum on CUDA (x10 slower then CPU)

I tried to implement the part of an iterative method with Matrix-Vector multiplication on CUDA. The pseudocode bellow shows this part. It attempts to multiply input double vector x on double block-matrices (G + C). Every block also multiplies on complex scalar. So the output is complex vector.

``````//variables
int N;
int NFFT;
int NReal;
complex omega,coef;
mult_coef[] = complex[N];

x[] = double[N * NReal];
y[] = complex[N * NFFT];

G[] = SparseMatrix<double>(N,N)[NReal]; //vector of sparse matrices with dim [N x N], can be csr or any other format
C[] = SparseMatrix<double>(N,N)[NReal]; //

//function with MVM and SumVec
func(x, y){
//loop, that should be parallelized
for (int i = 1; i < NFFT; ++i){
omega = GetCoefMult(i); //just some complex coefficient
coef = 1.0;
for (int j = 0; j < NReal; ++j, coef *= mult_coef[i]){
y[i*N : (i+1)*N] = y[i*N : (i+1)*N] + coef * G[j] * x[j*N : (j+1)N];
y[i*N : (i+1)*N] = y[i*N : (i+1)*N] + omega * coef * C[j] * x[j*N : (j+1)N];
}
}
return y;
}
``````

I tried to transfer calculating of y[i*N : (i+1)*N] on GPU. The matrix-arrays G,C are copied to GPU before the algorithm starts. Vector x is copied to GPU and y from GPU on every func call. The code bellow shows this attempt:

``````__global__ void vecAdd(const double *a, cuDoubleComplex *c, double coef_x, double coef_y, int n)
{
// Get our global thread ID

// Make sure we do not go out of bounds
if (id < n){
c[id].x += coef_x * a[id];
c[id].y += coef_y * a[id];
}
}

void CUDA_SOLVER_HANDLE::MatrMultVectPlus(MATRIX_INDEX j, MATRIX_INDEX i, const _complex& coef, int threadNum, MatrName matr){
CUDA_CONTAINER_HANDLE* cudaCont = nullptr;
switch(matr){
case G_MATR:
cudaCont = &G;
break;
case C_MATR:
cudaCont = &C;
break;
default:
assert("WTF?!");
return;
}

int N = (int)cudaCont->cont->GetMatrixHeight();
int NNa = (int)cudaCont->cont->GetItemsCount();

if (NNa == 0) return;

const double h_one = 1.0;
const double h_zero = 0.0;

//d_y = A * d_x
cusparseStatus_t cusparseStat = CUSPARSE_STATUS_SUCCESS;
CUSPARSE_OPERATION_NON_TRANSPOSE,
N,
N,
NNa,
&h_one,
info.descrA,
cudaCont->d_csrValA + j * ItemsCount,
cudaCont->d_csrRowPtrA,
cudaCont->d_csrColIndA,
d_x + j * N,
&h_zero,
d_y + i * N
);
assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);

coef.x, coef.y, N);
}
``````

I also tried to parallelize loop with openMP and use its threads to put tasks on queue of every stream.
So the func becomes:

``````int maxThreadNum = omp_get_max_thread();
int blockSize = 1024 / maxThreadNum;
int gridSize = (N + blockSize - 1)/blockSize;

//function with MVM and SumVec
func(x, y){
cudaHandle.Zero_Y_Vector();
cudaHandle.Copy_X_Vector_ToCUDA(x);

#pragma omp parallel for
for (int i = 1; i < NFFT; ++i){
omega = GetCoefMult(i); //just some complex coefficient
coef = 1.0;
for (int j = 0; j < NReal; ++j, coef *= mult_coef[i]){
}
}