Using cublasGemmBatchedEx

I am trying to use cublasGemmBatchedEx to perform matrix multiplication. Here is my code.

#include <iostream>
#include <cublas_v2.h>

#define M 4
#define N 4
#define K 4

//nvcc -lcublas -o matmul_gemmBatchedEx
void print_matrix(float **A, int rows, int cols, int batch_size) {
    for (int i = 0; i < batch_size; i++){
        for (int j = 0; j < rows; j++){
            for(int k = 0; k < cols; k++){
                std::cout << A[i][k * rows + j] << " ";
            std::cout << std::endl;
        std::cout << std::endl;

int main(int argc, char* argv[])
    // Linear dimension of matrices
    int batch_size = 2;

    float *h_A[batch_size], *h_B[batch_size], *h_C[batch_size];
    for (int i = 0; i < batch_size; i++){
        h_A[i] = (float*)malloc(M * K * sizeof(float));
        h_B[i] = (float*)malloc(K * N * sizeof(float));
        h_C[i] = (float*)malloc(M * N * sizeof(float));

    for (int i = 0; i < batch_size; i++){
        for (int j = 0; j < M * K; j++)
            h_A[i][j] = j%4;
        for (int j = 0; j < K * N; j++)
            h_B[i][j] = j%4 + 4;
        for (int j = 0; j < M * N; j++)
            h_C[i][j] = 0;

    std::cout << "A =" << std::endl;
    print_matrix(h_A, M, K, batch_size);
    std::cout << "B =" << std::endl;
    print_matrix(h_B, K, N, batch_size);

    float *d_A[batch_size], *d_B[batch_size], *d_C[batch_size];

    for (int i = 0; i < batch_size; i++){
        cudaMalloc(&d_A[i], sizeof(float)* M * K);
        cudaMalloc(&d_B[i], sizeof(float)* K * N);
        cudaMalloc(&d_C[i], sizeof(float)* M * N);

    cudaMemcpy(d_A, h_A, sizeof(float)* M * K * batch_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, sizeof(float)* K * N * batch_size, cudaMemcpyHostToDevice);

    cublasHandle_t handle;

    // Set up the matrix dimensions and batch size
    int lda = M;
    int ldb = K;
    int ldc = M;

    // Set the alpha and beta parameters for the gemm operation
    float alpha = 1.0f;
    float beta = 0.0f;

    cublasStatus_t status = cublasGemmBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, 
                            (const void**)d_A, CUDA_R_32F, lda, 
                            (const void**)d_B, CUDA_R_32F, ldb, 
                            (void**)d_C, CUDA_R_32F, ldc, 
                            CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);

    cudaMemcpy(h_C,d_C,sizeof(float) * M * N * batch_size, cudaMemcpyDeviceToHost);

    if (status == CUBLAS_STATUS_SUCCESS) {
        std::cout << "C =" << std::endl;
        print_matrix(h_C, M, N, batch_size);
    } else {
        std::cout << status << std::endl;
    // Destroy the handle



The problem is I don’t get a expected result. Result matrix comes full of zeros. Is there any problem with my code?

see here.

1 Like

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.