Keep getting wrong results from cusparseScsrmm ()

I am trying to test sparse matrix multiplciation using cusparseScsrmm().

The code is simple as the following:

#include <stdio.h>
#include <cuda.h>
#include "cusparse_v2.h"
#include "cublas_v2.h"

const int M = 4;
const int N = 3;

void genMatrix(int m, int n, float *A)
{
	for(int i=0; i<m; i++)
		for(int j=0; j<n; j++)
			A[i*n+j] = i*n+j;
}

void printMat(float *A, int m, int n)
{
	for(int i=0; i<m; i++){
		for(int j=0; j<n; j++)
			printf("%2.2f ", A[i*n+j]);
		printf("\n");
	}
	printf("\n");
}

void printVal(float *val, int m)
{
	for(int i=0; i<m; i++)
		printf("%2.2f ", val[i]);
	printf("\n\n");
}

void printIdx(int *idx, int m)
{
	for(int i=0; i<m; i++)
		printf("%2d ", idx[i]);
	printf("\n\n");
}

int main(void)
{
	float *hA = (float*)malloc(sizeof(float)*M*N);
	float *hB = (float*)malloc(sizeof(float)*N*M);
	float *hC = (float*)malloc(sizeof(float)*M*M);

	genMatrix(M, N, hA);
	genMatrix(N, M, hB);

	float *dA, *dB, *dC;
	cudaMalloc((void**)&dA, sizeof(float)*M*N);
	cudaMalloc((void**)&dB, sizeof(float)*N*M);
	cudaMalloc((void**)&dC, sizeof(float)*M*M);

	cudaMemcpy(dA, hA, sizeof(float)*M*N, cudaMemcpyHostToDevice);
	cudaMemcpy(dB, hB, sizeof(float)*N*M, cudaMemcpyHostToDevice);

	float one = 1.0;
	float zero = 0.0;

	cusparseHandle_t cusparseHandle = 0;
	cusparseMatDescr_t descr = 0;

	cusparseCreate(&cusparseHandle);
	cusparseCreateMatDescr(&descr);
	cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
	cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
	cusparseSetPointerMode(cusparseHandle, CUSPARSE_POINTER_MODE_HOST);
	
	int *d_nNonzPerRowA;
	int nNonzA;
	cudaMalloc((void**)&d_nNonzPerRowA, sizeof(int)*M);
	cusparseSnnz(cusparseHandle, CUSPARSE_DIRECTION_ROW, M, N, descr, dA, M, d_nNonzPerRowA, &nNonzA);

	float *d_csrVal_A;
	int *d_csrRowPtr_A, *d_csrColIdx_A;
	float *h_csrVal_A;
	int *h_csrRowPtr_A, *h_csrColIdx_A;
	h_csrVal_A = (float*)malloc(sizeof(float)*nNonzA);
	h_csrRowPtr_A = (int*)malloc(sizeof(int)*(M+1));
	h_csrColIdx_A = (int*)malloc(sizeof(int)*nNonzA);
	cudaMalloc((void**)&d_csrVal_A, sizeof(float)*nNonzA);
	cudaMalloc((void**)&d_csrRowPtr_A, sizeof(int)*(M+1));
	cudaMalloc((void**)&d_csrColIdx_A, sizeof(int)*nNonzA);

	cusparseSdense2csr(cusparseHandle, M, N, descr, dA, M, d_nNonzPerRowA,
						d_csrVal_A, d_csrRowPtr_A, d_csrColIdx_A);

	cudaMemcpy(h_csrVal_A, d_csrVal_A, sizeof(float)*nNonzA, cudaMemcpyDeviceToHost);
	cudaMemcpy(h_csrRowPtr_A, d_csrRowPtr_A, sizeof(int)*nNonzA, cudaMemcpyDeviceToHost);
	cudaMemcpy(h_csrColIdx_A, d_csrColIdx_A, sizeof(int)*(M+1), cudaMemcpyDeviceToHost);

	printf("A:\n");
	printMat(hA, M, N);
	printf("h_csrVal_A:\n");
	printVal(h_csrVal_A, nNonzA);
	printf("h_csrRowPtr_A:\n");
	printIdx(h_csrRowPtr_A, M+1);
	printf("h_csrColIdx_A:\n");
	printIdx(h_csrColIdx_A, nNonzA);

	cusparseScsrmm(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, M, N, N, nNonzA, &one, descr,
					d_csrVal_A, d_csrRowPtr_A, d_csrColIdx_A, dB, N, &zero, dC, M);
	cudaMemcpy(hC, dC, sizeof(float)*M*M, cudaMemcpyDeviceToHost);
	printf("B:\n");
	printMat(hB, N, M);
	printf("C:\n");
	printMat(hC, M, M);

	cublasHandle_t cublasHandle = 0;
	cublasCreate(&cublasHandle);
	
	
	return 0;
}

However, for some reasons I keep getting weird results. The following is the result I got.

A:
0.00 1.00 2.00 
3.00 4.00 5.00 
6.00 7.00 8.00 
9.00 10.00 11.00 

h_csrVal_A:
4.00 8.00 1.00 5.00 9.00 2.00 6.00 10.00 3.00 7.00 11.00 

h_csrRowPtr_A:
33040128  0 28054528  0 -1437690858 

h_csrColIdx_A:
 1  2  0  1  2  0 33040464  0 32987456  0  0 

B:
0.00 1.00 2.00 3.00 
4.00 5.00 6.00 7.00 
8.00 9.00 10.00 11.00 

C:
20.00 23.00 26.00 29.00 
56.00 68.00 80.00 92.00 
92.00 113.00 134.00 155.00 
-nan -nan -nan -nan

I don’t really know why it happens. What is the problem!? Thank you in advance.