Hello
I am trying to use cublasGemmBatchedEx to perform matrix multiplication. Here is my code.
#include <iostream>
#include <cublas_v2.h>
#define M 4
#define N 4
#define K 4
//nvcc -lcublas -o matmul_gemmBatchedEx matmul_gemmBatchedEx.cu
void print_matrix(float **A, int rows, int cols, int batch_size) {
for (int i = 0; i < batch_size; i++){
for (int j = 0; j < rows; j++){
for(int k = 0; k < cols; k++){
std::cout << A[i][k * rows + j] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
}
int main(int argc, char* argv[])
{
// Linear dimension of matrices
int batch_size = 2;
float *h_A[batch_size], *h_B[batch_size], *h_C[batch_size];
for (int i = 0; i < batch_size; i++){
h_A[i] = (float*)malloc(M * K * sizeof(float));
h_B[i] = (float*)malloc(K * N * sizeof(float));
h_C[i] = (float*)malloc(M * N * sizeof(float));
}
for (int i = 0; i < batch_size; i++){
for (int j = 0; j < M * K; j++)
h_A[i][j] = j%4;
for (int j = 0; j < K * N; j++)
h_B[i][j] = j%4 + 4;
for (int j = 0; j < M * N; j++)
h_C[i][j] = 0;
}
std::cout << "A =" << std::endl;
print_matrix(h_A, M, K, batch_size);
std::cout << "B =" << std::endl;
print_matrix(h_B, K, N, batch_size);
float *d_A[batch_size], *d_B[batch_size], *d_C[batch_size];
for (int i = 0; i < batch_size; i++){
cudaMalloc(&d_A[i], sizeof(float)* M * K);
cudaMalloc(&d_B[i], sizeof(float)* K * N);
cudaMalloc(&d_C[i], sizeof(float)* M * N);
}
cudaMemcpy(d_A, h_A, sizeof(float)* M * K * batch_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, sizeof(float)* K * N * batch_size, cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasCreate(&handle);
// Set up the matrix dimensions and batch size
int lda = M;
int ldb = K;
int ldc = M;
// Set the alpha and beta parameters for the gemm operation
float alpha = 1.0f;
float beta = 0.0f;
cublasStatus_t status = cublasGemmBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K,
&alpha,
(const void**)d_A, CUDA_R_32F, lda,
(const void**)d_B, CUDA_R_32F, ldb,
&beta,
(void**)d_C, CUDA_R_32F, ldc,
batch_size,
CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
cudaMemcpy(h_C,d_C,sizeof(float) * M * N * batch_size, cudaMemcpyDeviceToHost);
if (status == CUBLAS_STATUS_SUCCESS) {
std::cout << "C =" << std::endl;
print_matrix(h_C, M, N, batch_size);
} else {
std::cout << status << std::endl;
}
// Destroy the handle
cublasDestroy(handle);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaFreeHost(h_A);
cudaFreeHost(h_B);
cudaFreeHost(h_C);
}
The problem is I don’t get a expected result. Result matrix comes full of zeros. Is there any problem with my code?