I’m trying to multiply two matrices, A (n×k) and B (k×m), but I am getting an error when they are not square. Also, can anyone explain why cuBLAS takes considerably more time than a simple MatrixMul kernel when dealing with small arrays?
// Computes C = A * B in single precision with cublasSgemm (alpha = 1, beta = 0).
//
// Storage convention: cuBLAS interprets every matrix as COLUMN-MAJOR, so
//   A is n x k with leading dimension n,
//   B is k x m with leading dimension k,
//   C is n x m with leading dimension n.
// With that layout the arguments below are correct for non-square shapes too.
// If the buffers are actually filled/printed in row-major order, the result
// only looks right for square inputs; for row-major data compute C^T = B^T*A^T
// instead, i.e. call
//   cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alf, B, m, A, k, &bet, C, m);
//
// Parameters:
//   A, B    - input matrices (main passes device pointers)
//   C       - output matrix; overwritten, its previous contents are ignored
//             because beta is 0
//   n, k, m - matrix dimensions as described above
//   handle  - an already created cuBLAS handle
//
// Errors: the function returns void, so a failing cublasSgemm is reported on
// stderr instead of being dropped silently.
void gpu_blas_mmul(const float *A, const float *B, float *C, const int n, const int k, const int m, cublasHandle_t handle) {
    // Leading dimension = number of rows of each column-major matrix.
    // cuBLAS requires ld >= max(1, rows), so clamp to 1 for empty matrices.
    const int lda = n > 1 ? n : 1;
    const int ldb = k > 1 ? k : 1;
    const int ldc = n > 1 ? n : 1;

    // Scalars are passed by host address (the original did the same).
    const float alf = 1.0f;
    const float bet = 0.0f;

    // Do the actual multiplication
    const cublasStatus_t status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                              n, m, k,
                                              &alf, A, lda,
                                              B, ldb,
                                              &bet, C, ldc);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cublasSgemm failed with status %d (n=%d, k=%d, m=%d)\n",
                (int)status, n, k, m);
    }
}
// Aborts the program with a readable message when a CUDA runtime call fails.
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Reads the dimensions n, k, m from stdin, allocates A (n x k), B (k x m) and
// C (n x m) on host and device, multiplies A * B on the GPU through
// gpu_blas_mmul and copies the result back into h_C.
// Returns EXIT_FAILURE (or exits) on invalid input or any failed allocation,
// copy, or handle creation.
int main(){
    int n, k, m;
    int nr_rows_A, nr_cols_A, nr_rows_B, nr_cols_B, nr_rows_C, nr_cols_C;
    float *h_A, *h_B, *h_C;

    // Reject unparsable or non-positive sizes up front instead of passing
    // indeterminate values on to the allocators and to cuBLAS.
    printf("Give n : ");
    if (scanf("%d", &n) != 1 || n <= 0) {
        fprintf(stderr, "n must be a positive integer\n");
        return EXIT_FAILURE;
    }
    printf("\tk : ");
    if (scanf("%d", &k) != 1 || k <= 0) {
        fprintf(stderr, "k must be a positive integer\n");
        return EXIT_FAILURE;
    }
    printf("\tm : ");
    if (scanf("%d", &m) != 1 || m <= 0) {
        fprintf(stderr, "m must be a positive integer\n");
        return EXIT_FAILURE;
    }

    nr_rows_A = nr_rows_C = n;
    nr_cols_A = nr_rows_B = k;
    nr_cols_B = nr_cols_C = m;

    // Compute byte counts in size_t so rows * cols cannot overflow int.
    const size_t bytes_A = (size_t)nr_rows_A * (size_t)nr_cols_A * sizeof(float);
    const size_t bytes_B = (size_t)nr_rows_B * (size_t)nr_cols_B * sizeof(float);
    const size_t bytes_C = (size_t)nr_rows_C * (size_t)nr_cols_C * sizeof(float);

    float *d_A, *d_B, *d_C;
    check_cuda(cudaMalloc(&d_A, bytes_A), "cudaMalloc(d_A)");
    check_cuda(cudaMalloc(&d_B, bytes_B), "cudaMalloc(d_B)");
    check_cuda(cudaMalloc(&d_C, bytes_C), "cudaMalloc(d_C)");

    h_A = (float*)malloc(bytes_A);
    h_B = (float*)malloc(bytes_B);
    h_C = (float*)malloc(bytes_C);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    // NOTE(review): nothing visible here fills h_A / h_B before they are
    // uploaded. They must be initialised (in column-major order, which is
    // what gpu_blas_mmul / cuBLAS expects) for the product to be meaningful.
    check_cuda(cudaMemcpy(d_A, h_A, bytes_A, cudaMemcpyHostToDevice), "cudaMemcpy(d_A)");
    check_cuda(cudaMemcpy(d_B, h_B, bytes_B, cudaMemcpyHostToDevice), "cudaMemcpy(d_B)");
    // h_C is not uploaded: gpu_blas_mmul uses beta = 0, so the previous
    // contents of d_C are never read and the GEMM overwrites it entirely.

    // Create a handle for CUBLAS
    // (Handle creation initialises the library and is comparatively costly;
    // create it once and reuse it, and keep it out of any timed region.)
    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cublasCreate failed\n");
        return EXIT_FAILURE;
    }

    // Multiply A and B on GPU
    gpu_blas_mmul(d_A, d_B, d_C, n, k, m, handle);

    // Destroy the handle
    cublasDestroy(handle);

    // Blocking copy; h_C receives the n x m result in column-major order.
    check_cuda(cudaMemcpy(h_C, d_C, bytes_C, cudaMemcpyDeviceToHost), "cudaMemcpy(h_C)");

    // (Remainder elided in the original post. The host and device buffers
    // still have to be released with free() / cudaFree() once they are no
    // longer needed - not visible here.)
.
.
.
}