Performance cusparseScsrsm_solve

Hello @ all,
I’ve got a performance problem. We try to implement Routines for solving linear Systems using the cuSPARSE function cusparseScsrsm_solve(…). We have a System A * AB = B. A and B are known, sparse matrices with 400 x 400 and 400 x 10 Elements. B and AB are stored column major. Both matrices have a sparsity of 0.25. The problem is, that the solving of a part of this system A * ab = b (where are ab and b are columns from AB and B) takes 110seconds (!!!) and creates many elements with the value ‘nan’ and infinity. But i don’t get any error from CUDA or cuSPARSE. Can somebody please help me? Maybe I’ve got an error in my implementation? Or are the matrices bad conditioned? We tried it already with smaller matrices (24 x 24, 24 x 10) and get the result after 300 milliseconds and get plausible results. Below is a example code. I kicked out all error handling to shorten it.

System:
intel Core i7
6 cores with 3.6 GHz
2 Nvidia Geforce gtx 780 Ti (using only one at the moment)
CUDA-Toolkit 5.5

Thank you in advance and greetings
Noran

int
pyMAGMA_MMILU_sparse_float (	magma_int_t rowsMatrixA,
				magma_int_t colsMatrixA,
				magma_index_t *rowPointerA,
				magma_index_t *colIndA,
				float *valA,
				magma_int_t colsMatrixB,
				float *valB,
				float *valAB,
				int num_queues)
{
	printf("Start calculation in pyMAGMA_MMILU_sparse_float");
	int Annz = rowPointerA[rowsMatrixA];

	int* d_rowPointerA;
	int* d_colIndA;
	float* d_valA;
	float* d_valB;
	float* d_valAB;

	const float alpha = 1.0;
	int ldB = colsMatrixA;
	int ldAB = rowsMatrixA;

	cusparseHandle_t cuSparseHandle;
	cusparseMatDescr_t descrA;
	cusparseSolveAnalysisInfo_t info;

	DEBUG_LOG("Allocating Memory");
	cudaMalloc (&d_rowPointerA,(rowsMatrixA + 1) * sizeof(int));
	cudaMalloc (&d_colIndA, Annz * sizeof(int));
	cudaMalloc (&d_valA, Annz * sizeof(float));
	cudaMalloc (&d_valB, colsMatrixA * colsMatrixB * sizeof(float));
	cudaMalloc (&d_valAB, rowsMatrixA * colsMatrixB	* sizeof(float));

	//copy values, initialize d_valAB
	DEBUG_LOG("copy values");
	cudaMemcpy (	(void*) d_rowPointerA,
			(void*) rowPointerA,
			(rowsMatrixA + 1) * sizeof(int),
			cudaMemcpyHostToDevice);
	cudaMemcpy (	(void*) d_colIndA,
			(void*) colIndA,
			Annz * sizeof(int),
			cudaMemcpyHostToDevice);
	cudaMemcpy (	(void*) d_valA,
			(void*) valA,
			Annz * sizeof(float),
			cudaMemcpyHostToDevice);
	cudaMemcpy (	(void*) d_valB,
			(void*) valB,
			colsMatrixA * colsMatrixB
			* sizeof(float),
			cudaMemcpyHostToDevice);
	cudaMemset (	d_valAB,
			0,
			rowsMatrixA * colsMatrixB
			* sizeof(float));

	//cuSparse Context
	DEBUG_LOG("Creating cusparse context");
	cusparseCreate (&cuSparseHandle);
	cusparseSetPointerMode (cuSparseHandle, CUSPARSE_POINTER_MODE_HOST);
	cusparseCreateMatDescr (&descrA);
	cusparseSetMatIndexBase (descrA, CUSPARSE_INDEX_BASE_ZERO);
	cusparseSetMatType (descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
	cusparseCreateSolveAnalysisInfo (&info);

	//run cusparseScsrsm_analysis for setup of info
	DEBUG_LOG("initialize info");
	cusparseScsrsm_analysis (	cuSparseHandle,
					CUSPARSE_OPERATION_NON_TRANSPOSE,
					rowsMatrixA,
					Annz,
					descrA,
					d_valA,
					d_rowPointerA,
					d_colIndA,
					info);

	for (int col = 0; col < colsMatrixB; col++)
	{

		//run cusparseScsrsm_solve solve linear system
		DEBUG_LOG("Call solver");
		cusparseScsrsm_solve (	cuSparseHandle,
					CUSPARSE_OPERATION_NON_TRANSPOSE,
					rowsMatrixA,
					colsMatrixB,
					&alpha,
					descrA,
					d_valA,
					d_rowPointerA,
					d_colIndA,
					info,
					d_valB + col * ldB,
					ldB,
					d_valAB + col * ldB,
					ldAB);
	}

	//fetch result
	DEBUG_LOG("fetch result");
	cudaMemcpy (	valAB,
			d_valAB,
			rowsMatrixA * colsMatrixB * sizeof(float),
			cudaMemcpyDeviceToHost);

	//free GPU memory
	DEBUG_LOG("free GPU memory");
	cudaFree (d_colIndA);
	cudaFree (d_rowPointerA);
	cudaFree (d_valA);
	cudaFree (d_valB);
	cudaFree (d_valAB);
	cusparseDestroySolveAnalysisInfo (info);

	cusparseDestroyMatDescr (descrA);
	cusparseDestroy (cuSparseHandle);

	return error;
}