cusparse function are blocked by internal (hidden) cudaFree on all stream...

Dear all,

I use cuSPARSE to compress to the CSR matrix format an array of float in device memory previously computed by a cuda kernel.

To overlap with data transfer I use stream and double buffer (data coming with RDMA)

synchronization is done between kernel launch by cuStreamWaitValue32()

the issue is that cusparseSnnz() block on this cuStreamWaitValue32 even in another stream. As of my understanding it should not…
I use CUSPARSE_POINTER_MODE_DEVICE

Why ???

MWE below: process 3 nnz calculation , 5 nnz if comment removed to wake up the wait

#include <stdio.h>
#include <stdlib.h>
#include <cusparse_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>

int main(int argc, char **argv)
{
int N = 24;
int M = 24;
int row;
float *A, *dA;
int *dNnzPerRow;
float *dCsrValA;
int *dCsrRowPtrA;
int *dCsrColIndA;

int devId=1;
cudaSetDevice(devId);
char name[128];
cuDeviceGetName(name, sizeof(name), devId);
printf(name);

int prop = -1;
cuDeviceGetAttribute (&prop,CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS,devId);
if (prop == 1)
	printf("CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS enable\n");
else {
	printf("Cuda USE_STREAM_MEM_OPS not supported, try moprobe nvidia NVreg_EnableStreamMemOPs=1\n");
	exit(0);
	}

//cudaSetDeviceFlags(cudaDeviceBlockingSync);

printf("memory\n");
//X = (float*)malloc(sizeof(float) * N);
//Y = (float*)malloc(sizeof(float) * M);
A = (float*)malloc(sizeof(float) * M*N);
for(int i=0;i<M*N;i++)
{
	A[i] = i%2;

}

float *dX, *X;
float *dY, *Y;
int *dNNN;
int totalNnz=314;

cudaMalloc((void **)&dNNN, sizeof(int) * 1);
cudaMalloc((void **)&dX, sizeof(float) * N);
cudaMalloc((void **)&dY, sizeof(float) * M);
cudaMalloc((void **)&dA, sizeof(float) * M * N);
cudaMalloc((void **)&dNnzPerRow, sizeof(int) * M);
//
//cudaMemcpy(dX, X, sizeof(float) * N, cudaMemcpyHostToDevice);
//cudaMemcpy(dY, Y, sizeof(float) * M, cudaMemcpyHostToDevice);
cudaMemcpy(dA, A, sizeof(float) * M * N, cudaMemcpyHostToDevice);

printf("cusparse\n");
cusparseHandle_t handle = 0;
cusparseMatDescr_t descr = 0;
cusparseCreate(&handle);
cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_DEVICE);
cusparseCreateMatDescr(&descr);
cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

int statusFlag;
CUdeviceptr d_statusFlag;
cuMemHostRegister((void*)&statusFlag, 4, CU_MEMHOSTALLOC_DEVICEMAP);
cuMemHostGetDevicePointer(&d_statusFlag, (void*) &statusFlag, 0);

cudaStream_t s1,s2;
cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
//cudaStreamCreate(&s1);
//cudaStreamCreate(&s2);

//OK HERE
printf("nnz1\n");
cusparseSetStream(handle,s1);
cusparseSnnz(handle, CUSPARSE_DIRECTION_ROW, M, N, descr, dA,M, dNnzPerRow, dNNN);

//OK HERE
printf("nnz2\n");
cusparseSetStream(handle,s1);
cusparseSnnz(handle, CUSPARSE_DIRECTION_ROW, M, N, descr, dA,M, dNnzPerRow, dNNN);

//code on stream s2 should wait until statusFlag==111
//
cuStreamWaitValue32(s2,d_statusFlag, 111,CU_STREAM_WAIT_VALUE_EQ);

//ISSUE HERE : nnz should be enqueued as the previous occurence in stream s1, but does not return if s2 is still waiting. Why ???
printf("nnz3\n");
cusparseSetStream(handle,s1);
//REMOVE COMMENT ON NEXT LINE SHOW nnz() waiting on s2
//statusFlag=111;
cusparseSnnz(handle, CUSPARSE_DIRECTION_ROW, M, N, descr, dA,M, dNnzPerRow, dNNN);

printf("nnz4\n");
cusparseSetStream(handle,s1);
cusparseSnnz(handle, CUSPARSE_DIRECTION_ROW, M, N, descr, dA,M, dNnzPerRow, dNNN);

printf("nnz5\n");
cusparseSetStream(handle,s1);
cudaMemcpyAsync(&totalNnz, dNNN, sizeof(int) * 1, cudaMemcpyDeviceToHost,s1);

cudaDeviceSynchronize();
printf("totalNnz=%d\n",totalNnz);

printf("...\n");

free(A);
//free(X);
//free(Y);

cudaFree(dX);
cudaFree(dY);
cudaFree(dA);
cudaFree(dNnzPerRow);
cudaFree(dCsrValA);
cudaFree(dCsrRowPtrA);
cudaFree(dCsrColIndA);

cusparseDestroyMatDescr(descr);
cusparseDestroy(handle);

return 0;
}

I think I found the culprit with nvvp:

there is a cudaFree() call inside the cusparse function (Snnz) and cudaFree enforces synchronization…
this is an issue if you want enqueue many call in a stream…

Is there any workaround ? it is a severe limitation

You may wish to file a bug. The instructions are linked here:

https://devtalk.nvidia.com/default/topic/1044668/cuda-programming-and-performance/-how-to-report-a-bug/

For the problem description, it may be sufficient simply to link to this forum posting.

bug reporting done.

is there any known workaround/suggestion for this issue ?

thanks