cuBLAS non-square matrix multiplication error

I’m trying to multiply two matrices, A (n×k) and B (k×m), but I am getting an error when they are not square. Also, can anyone explain why cuBLAS takes considerably more time than a simple MatrixMul kernel when dealing with small arrays?

/*
 * Computes C = A * B in single precision with cublasSgemm.
 *
 * A is n x k, B is k x m and C is n x m. cuBLAS interprets all three as
 * column-major, which is why the leading dimensions are the row counts
 * (lda = n, ldb = k, ldc = n). A buffer filled in row-major order is read as
 * its transpose; for square inputs that still "works" dimension-wise, for
 * non-square inputs it does not.
 *
 * A, B and C are handed straight to cuBLAS, so they must be device pointers
 * holding at least n*k, k*m and n*m floats respectively.
 * Assumes `handle` uses the default host pointer mode, because alpha and beta
 * are passed as host addresses - TODO confirm if the pointer mode is changed
 * elsewhere.
 *
 * Failures are reported on stderr; the function returns void, so callers
 * cannot observe them programmatically.
 */
void gpu_blas_mmul(const float *A, const float *B, float *C, const int n, const int k, const int m, cublasHandle_t handle) {
	// A zero or negative extent would yield a leading dimension < 1, which
	// cuBLAS rejects; report it here instead of failing silently.
	if (n <= 0 || k <= 0 || m <= 0) {
		fprintf(stderr, "gpu_blas_mmul: dimensions must be positive (n=%d, k=%d, m=%d)\n", n, k, m);
		return;
	}

	// Column-major, unpadded storage: leading dimension == number of rows.
	const int lda = n, ldb = k, ldc = n;

	// C = alpha * A * B + beta * C; beta == 0 means the old contents of C are ignored.
	const float alf = 1.0f;
	const float bet = 0.0f;
	const float *alpha = &alf;
	const float *beta = &bet;

	// Do the actual multiplication
	const cublasStatus_t status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, alpha, A, lda, B, ldb, beta, C, ldc);
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf(stderr, "gpu_blas_mmul: cublasSgemm failed with status %d (n=%d, k=%d, m=%d)\n", (int)status, n, k, m);
	}
}

/* Reports a failed CUDA runtime call on stderr. Returns 1 on failure, 0 on success. */
static int cuda_call_failed(cudaError_t err, const char *what) {
	if (err == cudaSuccess) {
		return 0;
	}
	fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
	return 1;
}

/*
 * Reads the dimensions n, k and m from stdin, allocates A (n x k), B (k x m)
 * and C (n x m) on host and device, multiplies A * B on the GPU through
 * gpu_blas_mmul and copies the product back into h_C.
 *
 * On any failure the buffers allocated so far are released and the program
 * exits with EXIT_FAILURE before the remainder of the program runs.
 */
int main(){
	int n, k, m;
	int nr_rows_A, nr_cols_A, nr_rows_B, nr_cols_B, nr_rows_C, nr_cols_C;
	float *h_A = NULL, *h_B = NULL, *h_C = NULL;
	float *d_A = NULL, *d_B = NULL, *d_C = NULL;
	cublasHandle_t handle;

	printf("Give n : ");
	if (scanf("%d", &n) != 1) {
		fprintf(stderr, "Could not read n\n");
		return EXIT_FAILURE;
	}
	printf("\tk : ");
	if (scanf("%d", &k) != 1) {
		fprintf(stderr, "Could not read k\n");
		return EXIT_FAILURE;
	}
	printf("\tm : ");
	if (scanf("%d", &m) != 1) {
		fprintf(stderr, "Could not read m\n");
		return EXIT_FAILURE;
	}
	if (n <= 0 || k <= 0 || m <= 0) {
		fprintf(stderr, "n, k and m must all be positive\n");
		return EXIT_FAILURE;
	}

	nr_rows_A = nr_rows_C = n;
	nr_cols_A = nr_rows_B = k;
	nr_cols_B = nr_cols_C = m;

	// Widen to size_t before multiplying so large dimensions cannot overflow int.
	const size_t elems_A = (size_t)nr_rows_A * (size_t)nr_cols_A;
	const size_t elems_B = (size_t)nr_rows_B * (size_t)nr_cols_B;
	const size_t bytes_A = elems_A * sizeof(float);
	const size_t bytes_B = elems_B * sizeof(float);
	const size_t bytes_C = (size_t)nr_rows_C * (size_t)nr_cols_C * sizeof(float);

	// Inputs are zero-initialised so the device never receives indeterminate
	// values. Fill h_A and h_B here, in column-major order, before the copies
	// below. h_C is fully overwritten by the copy back from the device.
	h_A = (float*)calloc(elems_A, sizeof(float));
	h_B = (float*)calloc(elems_B, sizeof(float));
	h_C = (float*)malloc(bytes_C);

	int ok = (h_A != NULL && h_B != NULL && h_C != NULL);
	if (!ok) {
		fprintf(stderr, "Host allocation failed\n");
	}

	// Each step runs only if everything before it succeeded.
	ok = ok && !cuda_call_failed(cudaMalloc(&d_A, bytes_A), "cudaMalloc(d_A)");
	ok = ok && !cuda_call_failed(cudaMalloc(&d_B, bytes_B), "cudaMalloc(d_B)");
	ok = ok && !cuda_call_failed(cudaMalloc(&d_C, bytes_C), "cudaMalloc(d_C)");

	// Only the inputs are uploaded: gpu_blas_mmul uses beta == 0, so the
	// previous contents of d_C are never read.
	ok = ok && !cuda_call_failed(cudaMemcpy(d_A, h_A, bytes_A, cudaMemcpyHostToDevice), "cudaMemcpy(h_A -> d_A)");
	ok = ok && !cuda_call_failed(cudaMemcpy(d_B, h_B, bytes_B, cudaMemcpyHostToDevice), "cudaMemcpy(h_B -> d_B)");

	if (ok) {
		// Create a handle for CUBLAS
		// NOTE(review): handle creation is presumably a large part of the
		// cost seen for small matrices - create it once and reuse it when timing.
		const cublasStatus_t created = cublasCreate(&handle);
		if (created != CUBLAS_STATUS_SUCCESS) {
			fprintf(stderr, "cublasCreate failed with status %d\n", (int)created);
			ok = 0;
		} else {
			// Multiply A and B on GPU
			gpu_blas_mmul(d_A, d_B, d_C, n, k, m, handle);

			// Destroy the handle
			cublasDestroy(handle);
		}
	}

	// This blocking copy also surfaces errors from the asynchronous GEMM.
	ok = ok && !cuda_call_failed(cudaMemcpy(h_C, d_C, bytes_C, cudaMemcpyDeviceToHost), "cudaMemcpy(d_C -> h_C)");

	if (!ok) {
		// cudaFree and free both accept NULL, so partial allocation is fine.
		cudaFree(d_A);
		cudaFree(d_B);
		cudaFree(d_C);
		free(h_A);
		free(h_B);
		free(h_C);
		return EXIT_FAILURE;
	}

	// ... remainder of the program was elided in the original post ...
	// NOTE(review): the elided part is expected to release h_A/h_B/h_C and
	// d_A/d_B/d_C - confirm, and add the frees here if it does not.
}