cuBLAS non-square matrix multiplication error

I’m trying to multiply two matrices, A (n × k) and B (k × m), but I am getting an error when they are not square. Also, can anyone explain why cuBLAS takes considerably more time than a simple MatrixMul kernel when dealing with small arrays?

``````
// Computes C = A * B in single precision through cublasSgemm.
//
// Parameters:
//   A, B    input matrices, A is n x k and B is k x m
//   C       output matrix, n x m
//   n, k, m matrix dimensions as described above
//   handle  an already-created cuBLAS handle
//
// cuBLAS interprets every matrix as column-major, so each leading dimension
// below is the row count of that matrix (lda = n, ldb = k, ldc = n).
// NOTE(review): if the caller fills or reads the buffers in row-major order,
// the result will look wrong for non-square shapes even though this call is
// valid; confirm the layout used by the caller.
// NOTE(review): alpha/beta are passed as host addresses, which assumes the
// handle uses the default host pointer mode; confirm if the mode is changed.
//
// The function keeps its void interface; a failing GEMM is reported on stderr
// instead of being silently dropped.
void gpu_blas_mmul(const float *A, const float *B, float *C, const int n, const int k, const int m, cublasHandle_t handle) {
    // Column-major leading dimensions: rows of A, rows of B, rows of C.
    const int lda = n;
    const int ldb = k;
    const int ldc = n;

    // C = alpha * A * B + beta * C with alpha = 1 and beta = 0.
    const float alpha = 1.0f;
    const float beta = 0.0f;

    // Do the actual multiplication and keep the status instead of discarding it.
    const cublasStatus_t status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                              n, m, k,
                                              &alpha, A, lda,
                                              B, ldb,
                                              &beta, C, ldc);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cublasSgemm failed with status %d (n=%d, k=%d, m=%d)\n",
                (int)status, n, k, m);
    }
}

// Driver: reads the dimensions n, k, m from stdin, allocates host and device
// buffers for A (n x k), B (k x m) and C (n x m), copies the host buffers to
// the device, runs gpu_blas_mmul, and copies C back to the host.
// NOTE(review): none of the return values of scanf, malloc, cudaMalloc,
// cudaMemcpy, cublasCreate or cublasDestroy are checked in the code shown, so
// a failure at any step goes unnoticed.
int main(){
int n, k, m;
int nr_rows_A, nr_cols_A, nr_rows_B, nr_cols_B, nr_rows_C, nr_cols_C;
float *h_A, *h_B, *h_C;

// Prompt for the three dimensions.
printf("Give n : ");
scanf("%d", &n);
printf("\tk : ");
scanf("%d", &k);
printf("\tm : ");
scanf("%d", &m);

// A is n x k, B is k x m, C is n x m.
nr_rows_A = nr_rows_C = n;
nr_cols_A = nr_rows_B = k;
nr_cols_B = nr_cols_C = m;

// Device buffers.
// NOTE(review): rows * cols is multiplied as int before being widened by
// sizeof(float); very large dimensions could overflow -- confirm expected sizes.
float *d_A, *d_B, *d_C;
cudaMalloc(&d_A,nr_rows_A * nr_cols_A * sizeof(float));
cudaMalloc(&d_B,nr_rows_B * nr_cols_B * sizeof(float));
cudaMalloc(&d_C,nr_rows_C * nr_cols_C * sizeof(float));

// Host buffers of the same sizes.
h_A = (float*)malloc(nr_rows_A * nr_cols_A*sizeof(float));
h_B = (float*)malloc(nr_rows_B * nr_cols_B*sizeof(float));
h_C = (float*)malloc(nr_rows_C * nr_cols_C * sizeof(float));

// Host -> device copies.
// NOTE(review): nothing visible here writes to h_A or h_B between malloc and
// these copies, so the device receives indeterminate values unless the
// initialization was left out of the listing.
// NOTE(review): the copy of h_C looks unnecessary, since gpu_blas_mmul calls
// cublasSgemm with beta = 0 -- confirm before removing.
cudaMemcpy(d_A,h_A,nr_rows_A * nr_cols_A * sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(d_B,h_B,nr_rows_B * nr_cols_B * sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(d_C,h_C,nr_rows_C * nr_cols_C * sizeof(float),cudaMemcpyHostToDevice);

// Create a handle for CUBLAS
cublasHandle_t handle;
cublasCreate(&handle);

// Multiply A and B on GPU
gpu_blas_mmul(d_A, d_B, d_C, n, k, m, handle);

// Destroy the handle
cublasDestroy(handle);

// Device -> host copy of the n x m result.
// NOTE(review): cuBLAS treats matrices as column-major, so element (i, j) of C
// is presumably at h_C[j * n + i]; verify any code that reads h_C.
cudaMemcpy(h_C,d_C,nr_rows_C * nr_cols_C * sizeof(float),cudaMemcpyDeviceToHost);

// The rest of the program is elided in the original listing; the freeing of
// h_A/h_B/h_C and d_A/d_B/d_C is not visible here.
.
.
.
}
``````