I tried to implement the matrix-vector multiplication part of an iterative method on CUDA. The pseudocode below shows this part. It multiplies an input double vector x by the double block matrices (G + C). Every block is also multiplied by a complex scalar, so the output is a complex vector.
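Written out (my notation), for every block index i the loop below computes

    y[i*N : (i+1)*N] = sum over j of mult_coef[i]^j * (G[j] + omega_i * C[j]) * x[j*N : (j+1)*N]

where omega_i = GetCoefMult(i), since coef takes the values 1, mult_coef[i], mult_coef[i]^2, … as j advances.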
//variables
int N;
int NFFT;
int NReal;
complex omega, coef;
mult_coef[] = complex[N];
x[] = double[N * NReal];
y[] = complex[N * NFFT];
G[] = SparseMatrix<double>(N,N)[NReal]; // vector of sparse [N x N] matrices; CSR or any other format
C[] = SparseMatrix<double>(N,N)[NReal]; // same layout as G

//function with MVM and SumVec
func(x, y){
    //loop that should be parallelized
    for (int i = 1; i < NFFT; ++i){
        omega = GetCoefMult(i); // just some complex coefficient
        coef = 1.0;             // equals mult_coef[i]^j on step j
        for (int j = 0; j < NReal; ++j, coef *= mult_coef[i]){
            y[i*N : (i+1)*N] = y[i*N : (i+1)*N] + coef * G[j] * x[j*N : (j+1)*N];
            y[i*N : (i+1)*N] = y[i*N : (i+1)*N] + omega * coef * C[j] * x[j*N : (j+1)*N];
        }
    }
    return y;
}
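For reference, this is the same computation in plain C++ with dense stand-ins for the sparse matrices (a minimal sketch of one outer iteration i; the dense row-major layout and the names are mine, the real code uses CSR):

#include <complex>
#include <vector>

// Dense reference for one outer iteration i:
// y[i*N : (i+1)*N] += sum_j mult^j * (G[j] + omega * C[j]) * x[j*N : (j+1)*N]
void accumulate_block(int i, int N, int NReal,
                      const std::vector<std::vector<double>>& G,  // NReal dense N*N matrices, row-major
                      const std::vector<std::vector<double>>& C,
                      const std::vector<double>& x,               // length N * NReal
                      std::vector<std::complex<double>>& y,       // length N * NFFT
                      std::complex<double> omega,                 // GetCoefMult(i)
                      std::complex<double> mult)                  // mult_coef[i]
{
    std::complex<double> coef = 1.0;
    for (int j = 0; j < NReal; ++j, coef *= mult){
        for (int r = 0; r < N; ++r){
            double gx = 0.0, cx = 0.0;
            for (int k = 0; k < N; ++k){
                gx += G[j][r * N + k] * x[j * N + k]; // (G[j] * x_j)[r]
                cx += C[j][r * N + k] * x[j * N + k]; // (C[j] * x_j)[r]
            }
            y[i * N + r] += coef * gx + omega * coef * cx;
        }
    }
}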
I tried to offload the calculation of y[i*N : (i+1)*N] to the GPU. The matrix arrays G and C are copied to the GPU once, before the algorithm starts. Vector x is copied to the GPU, and y back from the GPU, on every call to func. The code below shows this attempt:
__global__ void vecAdd(const double *a, cuDoubleComplex *c, double coef_x, double coef_y, int n)
{
    // Get our global thread ID
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    // Make sure we do not go out of bounds;
    // accumulate c[id] += (coef_x + i*coef_y) * a[id]
    if (id < n){
        c[id].x += coef_x * a[id];
        c[id].y += coef_y * a[id];
    }
}
void CUDA_SOLVER_HANDLE::MatrMultVectPlus(MATRIX_INDEX j, MATRIX_INDEX i, const _complex& coef, int threadNum, MatrName matr){
    CUDA_CONTAINER_HANDLE* cudaCont = nullptr;
    switch(matr){
    case G_MATR:
        cudaCont = &G;
        break;
    case C_MATR:
        cudaCont = &C;
        break;
    default:
        assert(false && "WTF?!");
        return;
    }
    int N = (int)cudaCont->cont->GetMatrixHeight();
    int NNa = (int)cudaCont->cont->GetItemsCount();
    if (NNa == 0) return;
    const double h_one = 1.0;
    const double h_zero = 0.0;
    //d_y = A * d_x (beta == 0, so d_y is overwritten)
    cusparseStatus_t cusparseStat = CUSPARSE_STATUS_SUCCESS;
    cusparseStat = cusparseDcsrmv(info.cusparseH_Vec[threadNum],
                                  CUSPARSE_OPERATION_NON_TRANSPOSE,
                                  N,
                                  N,
                                  NNa,
                                  &h_one,
                                  info.descrA,
                                  cudaCont->d_csrValA + j * ItemsCount,
                                  cudaCont->d_csrRowPtrA,
                                  cudaCont->d_csrColIndA,
                                  d_x + j * N,
                                  &h_zero,
                                  d_y + i * N
                                  );
    assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
    //d_yOUT += coef * d_y (complex scalar times real vector)
    vecAdd<<<gridSize, blockSize, 0, info.stream_Vec[threadNum]>>>(d_y + i * N, d_yOUT + i * N,
                                                                   coef.x, coef.y, N);
}
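Each worker thread has its own cuSPARSE handle and stream (the cusparseH_Vec / stream_Vec arrays above). They are created once up front, roughly like this (a sketch of my initialization; the exact placement is an assumption):

// one handle + one stream per worker thread
for (int t = 0; t < maxThreadNum; ++t){
    cudaStreamCreate(&info.stream_Vec[t]);
    cusparseCreate(&info.cusparseH_Vec[t]);
    // route all cuSPARSE calls issued through this handle to the thread's stream
    cusparseSetStream(info.cusparseH_Vec[t], info.stream_Vec[t]);
}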
I also tried to parallelize the loop with OpenMP and use its threads to enqueue work on each stream's queue.
So func becomes:
int maxThreadNum = omp_get_max_threads();
int blockSize = 1024 / maxThreadNum;
int gridSize = (N + blockSize - 1) / blockSize;
//function with MVM and SumVec
func(x, y){
    cudaHandle.Zero_Y_Vector();
    cudaHandle.Copy_X_Vector_ToCUDA(x);
    cudaHandle.cudaDeviceSynchronize();
    #pragma omp parallel for
    for (int i = 1; i < NFFT; ++i){
        int threadNum = omp_get_thread_num();
        complex omega = GetCoefMult(i); // thread-private, just some complex coefficient
        complex coef = 1.0;             // thread-private
        for (int j = 0; j < NReal; ++j, coef *= mult_coef[i]){
            cudaHandle.MatrMultVectPlus(j, i, coef, threadNum, G_MATR);
            cudaHandle.MatrMultVectPlus(j, i, coef * omega, threadNum, C_MATR);
        }
    }
    cudaHandle.cudaDeviceSynchronize();
    cudaHandle.Copy_Y_VectorFromCUDA(y);
}
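The transfer helpers are thin wrappers over plain CUDA calls, roughly like this (a sketch; the device buffer names d_x / d_yOUT follow the code above, the sizes are inferred from the variable declarations):

// Zero_Y_Vector(): clear the accumulated complex result on the device
cudaMemset(d_yOUT, 0, (size_t)N * NFFT * sizeof(cuDoubleComplex));
// Copy_X_Vector_ToCUDA(x): upload the real input once per func call
cudaMemcpy(d_x, x, (size_t)N * NReal * sizeof(double), cudaMemcpyHostToDevice);
// ... all MatrMultVectPlus launches run here ...
// Copy_Y_VectorFromCUDA(y): download the complex result after the final synchronize
cudaMemcpy(y, d_yOUT, (size_t)N * NFFT * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost);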
The results of the operations are correct, but the speed is 10x slower than on the CPU. I should note that "the best" GPU speed was achieved with a single stream… OpenMP without CUDA does increase speed. I checked different sizes (N = 100, NFFT = 30, NReal = 128), (N = 50k, NFFT = 15, NReal = 32) and the CUDA results were really poor.
Callgrind shows that func takes about 80% of all application work on the CPU. nvprof tells me that the memcpy/memset calls are fast enough (about 0% of the total), and that the cuSPARSE MVM and the sum kernel take about 70% and 30% of the GPU work, respectively.
Maybe someone can find a mistake in my algorithm or give a piece of advice.
My GPU is a GTX 1050 4 GB, but I also tested on a Quadro P6000 24 GB. Compiled with "-gencode arch=compute_61,code="sm_61,compute_61"". CUDA version 9.1.
Thank you