Low speed matrix-vector multiply & vector sum on CUDA (10x slower than CPU)

I tried to implement part of an iterative method with matrix-vector multiplication on CUDA. The pseudocode below shows this part: it multiplies the input double vector x by the double block matrices (G and C), and every block is additionally multiplied by a complex scalar, so the output is a complex vector.

//variables
int N;
int NFFT;
int NReal;
complex omega, coef;
mult_coef[] = complex[NFFT];

x[] = double[N * NReal];
y[] = complex[N * NFFT];

G[] = SparseMatrix<double>(N,N)[NReal]; //vector of sparse matrices with dim [N x N], can be CSR or any other format
C[] = SparseMatrix<double>(N,N)[NReal]; //same layout as G

//function with MVM and SumVec
func(x, y){
	//loop that should be parallelized
	for (int i = 1; i < NFFT; ++i){
		omega = GetCoefMult(i); //just some complex coefficient
		coef = 1.0;
		for (int j = 0; j < NReal; ++j, coef *= mult_coef[i]){
			y[i*N : (i+1)*N] = y[i*N : (i+1)*N] + coef * G[j] * x[j*N : (j+1)*N];
			y[i*N : (i+1)*N] = y[i*N : (i+1)*N] + omega * coef * C[j] * x[j*N : (j+1)*N];
		}
	}
	return y;
}
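
Written out, every func call computes, for each block i = 1, ..., NFFT-1:

y_i = sum_{j=0}^{NReal-1} mult_coef[i]^j * (G[j] + omega_i * C[j]) * x_j

where x_j = x[j*N : (j+1)*N], y_i = y[i*N : (i+1)*N], omega_i = GetCoefMult(i), and coef equals mult_coef[i]^j at inner iteration j.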

I tried to move the computation of y[i*N : (i+1)*N] to the GPU. The matrix arrays G and C are copied to the GPU once, before the algorithm starts; vector x is copied to the GPU, and y back from the GPU, on every call of func. The code below shows this attempt:

__global__ void vecAdd(const double *a, cuDoubleComplex *c, double coef_x, double coef_y, int n)
{
    // Get our global thread ID
    int id = blockIdx.x*blockDim.x+threadIdx.x;
 
    // Make sure we do not go out of bounds
    if (id < n){
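        // accumulate a complex scalar times a real entry: c[id] += (coef_x + i*coef_y) * a[id]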
        c[id].x += coef_x * a[id];
        c[id].y += coef_y * a[id];
    }
}

void CUDA_SOLVER_HANDLE::MatrMultVectPlus(MATRIX_INDEX j, MATRIX_INDEX i, const _complex& coef, int threadNum, MatrName matr){
    CUDA_CONTAINER_HANDLE* cudaCont = nullptr;
    switch(matr){
    case G_MATR:
        cudaCont = &G;
        break;
    case C_MATR:
        cudaCont = &C;
        break;
    default:
        assert(false && "WTF?!"); // unknown matrix type
        return;
    }

    int N = (int)cudaCont->cont->GetMatrixHeight();
    int NNa = (int)cudaCont->cont->GetItemsCount();

    if (NNa == 0) return;

    const double h_one = 1.0;
    const double h_zero = 0.0;

    // d_y = A * d_x
    cusparseStatus_t cusparseStat = CUSPARSE_STATUS_SUCCESS;
    cusparseStat = cusparseDcsrmv(info.cusparseH_Vec[threadNum],
                                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                                    N,
                                    N,
                                    NNa,
                                    &h_one,
                                    info.descrA,
                                    cudaCont->d_csrValA + j * ItemsCount,
                                    cudaCont->d_csrRowPtrA,
                                    cudaCont->d_csrColIndA,
                                    d_x + j * N,
                                    &h_zero,
                                    d_y + i * N
                                    );
    assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);

    vecAdd<<<gridSize,blockSize,0,info.stream_Vec[threadNum]>>>(d_y + i * N, d_yOUT + i * N,
        coef.x, coef.y, N);
}
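
Each call implements one term of the pseudocode sum: MatrMultVectPlus(j, i, coef, threadNum, G_MATR) performs y[i*N : (i+1)*N] += coef * G[j] * x[j*N : (j+1)*N] on the device. The csrmv (with beta = 0) writes the real product G[j] * x_j into the scratch vector d_y, and vecAdd then accumulates the complex-scaled result into d_yOUT.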

I also tried to parallelize the loop with OpenMP and use its threads to enqueue work on a separate CUDA stream per thread. So func becomes:

int maxThreadNum = omp_get_max_threads();
int blockSize = 1024 / maxThreadNum;
int gridSize = (N + blockSize - 1) / blockSize;

//function with MVM and SumVec
func(x, y){
	cudaHandle.Zero_Y_Vector();
	cudaHandle.Copy_X_Vector_ToCUDA(x);
	cudaHandle.cudaDeviceSynchronize();

	#pragma omp parallel for
	for (int i = 1; i < NFFT; ++i){
		int threadNum = omp_get_thread_num();
		complex omega = GetCoefMult(i); //local copies: sharing omega/coef across OpenMP threads would be a data race
		complex coef = 1.0;
		for (int j = 0; j < NReal; ++j, coef *= mult_coef[i]){
			cudaHandle.MatrMultVectPlus(j, i, coef, threadNum, G_MATR);
			cudaHandle.MatrMultVectPlus(j, i, coef*omega, threadNum, C_MATR);
		}
	}

	cudaHandle.cudaDeviceSynchronize();
	cudaHandle.Copy_Y_VectorFromCUDA(y);
}
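
For completeness, the helpers used above are thin wrappers around the CUDA runtime; simplified sketches (assuming device pointers d_x and d_yOUT and the sizes are stored in the handle, error checking omitted) look like this:

// Sketch of the helper methods; d_x, d_yOUT, N, NFFT, NReal are members of the handle.
// _complex and cuDoubleComplex are both pairs of doubles, so the copies are plain memcpys.
void CUDA_SOLVER_HANDLE::Zero_Y_Vector() {
    // clear the complex accumulator before each func call
    cudaMemset(d_yOUT, 0, N * NFFT * sizeof(cuDoubleComplex));
}

void CUDA_SOLVER_HANDLE::Copy_X_Vector_ToCUDA(const double* x) {
    // upload the real input vector on every func call
    cudaMemcpy(d_x, x, N * NReal * sizeof(double), cudaMemcpyHostToDevice);
}

void CUDA_SOLVER_HANDLE::Copy_Y_VectorFromCUDA(_complex* y) {
    // download the complex result on every func call
    cudaMemcpy(y, d_yOUT, N * NFFT * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost);
}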

The results of the operations are correct, but the speed is 10x slower than on the CPU. I have to note that "the best" GPU speed is achieved with one stream... OpenMP without CUDA does increase the speed. I checked different sizes, (N = 100, NFFT = 30, NReal = 128) and (N = 50000, NFFT = 15, NReal = 32), and the CUDA results were really poor in both cases.
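
Note that one func call issues (NFFT - 1) * NReal * 2 cusparseDcsrmv calls plus the same number of vecAdd launches; for (N = 100, NFFT = 30, NReal = 128) that is 29 * 128 * 2 = 7424 SpMV calls per call of func, each working on a vector of only 100 elements.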

Callgrind shows that func takes about 80% of the whole application's work on the CPU. nvprof tells me that the memcpy/memset calls are fast enough (about 0% of the GPU work), and that the cuSPARSE MVM and the sum kernel account for roughly 70% and 30% of the GPU work.

Maybe someone can spot a mistake in my algorithm or give some advice.

My GPU is a GTX 1050 (4 GB), but I also tested on a Quadro P6000 (24 GB). Compiled with "-gencode arch=compute_61,code="sm_61,compute_61". CUDA version 9.1.

Thank you