Hello,
I’m using cusparse library to do matrix/vector multiplications. I would like to make use of streams to launch kernels concurrently, but it doesn’t seem to work. Runtimes are about the same with or without streams. The profiler is showing serial kernel execution when streams are enabled. I’m using “Tesla C2075” GPU device which supports concurrent kernel execution.
Here is my code:
int numOfMult = 100;
int numStreams = 8;
int N = 100000; // dimensions of the matrix
int nz = (N-2)*3 + 4; // # of non-zero values
// use Compressed Sparse Row Format (CSR) to represent matrices
thrust::host_vector hCsrRowIdx(N+1);
thrust::host_vector hCsrColIdx(nz, 0);
thrust::host_vector hCsrVal(nz, 0.0);
thrust::host_vector hX(N, 2.0);
thrust::host_vector hManualAx(N, 0.0);
thrust::host_vector hAx(N*numOfMult, 0.0);
thrust::host_vector hStreamVec(numStreams);
for (int i = 0; i < numStreams; ++i)
cudaStreamCreate(&hStreamVec[i]);
// generate matrix
genTridiag(hCsrRowIdx, hCsrColIdx, hCsrVal, N, nz);
// create all the gpu device memory
thrust::device_vector dCsrRowIdx = hCsrRowIdx;
thrust::device_vector dCsrColIdx = hCsrColIdx;
thrust::device_vector dCsrVal = hCsrVal;
thrust::device_vector dX = hX;
thrust::device_vector dAx(N*numOfMult, 0.0);
cusparseHandle_t cusparseHandle = 0;
cusparseStatus_t cusparseStatus = cusparseCreate(&cusparseHandle);
cusparseMatDescr_t descr = 0;
cusparseCreateMatDescr(&descr);
cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);
int offset = 0, streamIdx = 0;
for(int i = 0; i < numOfMult; ++i)
{
offset = i*N;
cusparseStatus = cusparseSetStream(cusparseHandle, hStreamVec[streamIdx]);
cusparseStatus = cusparseDcsrmv(cusparseHandle,
CUSPARSE_OPERATION_NON_TRANSPOSE,
N,
N,
nz,
&alpha,
descr,
thrust::raw_pointer_cast(&dCsrVal[0]),
thrust::raw_pointer_cast(&dCsrRowIdx[0]),
thrust::raw_pointer_cast(&dCsrColIdx[0]),
thrust::raw_pointer_cast(&dX[0]),
&beta,
thrust::raw_pointer_cast(&dAx[offset]));
++streamIdx;
if (streamIdx >= numStreams)
streamIdx = 0;
}
// wait for all streams to finish
cudaError_t cudaStatus = cudaDeviceSynchronize();
Is cusparse library blocking concurrent execution of kernels? Is Implicit Synchronization getting triggered? Any insight into this is much appreciated.