cublasGemmEx failed: 15, ieee_inexact in Fortran and C++

When calling cublasGemmEx with FP32 inputs, FP64 output, and FP64 computation, I am getting a "cublasGemmEx failed: 15" error on an A1000 and an RTX 4060 GPU (both on Fedora Silverblue 41) with both Fortran and C++ minimal code samples. My toolchain is:
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Tue_Oct_29_23:50:19_PDT_2024
Cuda compilation tools, release 12.6, V12.6.85
Build cuda_12.6.r12.6/compiler.35059454_0

nvfortran 25.1-0 64-bit target on x86-64 Linux -tp alderlake

| NVIDIA-SMI 565.77 Driver Version: 565.77 CUDA Version: 12.7 |

A simple cublasSaxpy test passes (as does most of my tensor core application). I have tried the usual steps: reducing the optimisation level, setting num_segments=1, and replacing ishft() with custom, tested bitwise operators, all without success.

Here is the minimal fortran test:

! Minimal reproducer: call cublasGemmEx with FP32 A/B, FP64 C and an FP64
! compute type, then report the returned cuBLAS status by name.
!
! NOTE: cublasGemmEx only accepts the type combinations listed in the cuBLAS
! documentation. FP32 inputs with an FP64 output is not one of them, so this
! call is expected to return status 15 (CUBLAS_STATUS_NOT_SUPPORTED).
program cublas_gemmex_test
    use cudafor
    use cublas_v2
    use iso_c_binding
    implicit none

    ! Problem size: C(m,n) = alpha * A(m,k) * B(k,n) + beta * C(m,n)
    integer, parameter :: m = 1024, n = 1024, k = 1024
    ! Numeric value of CUBLAS_STATUS_NOT_SUPPORTED. Compared by value so the
    ! program does not depend on the cublas_v2 module exporting that name.
    integer, parameter :: status_not_supported = 15
    real(c_float), allocatable, device :: d_a(:,:), d_b(:,:)
    real(c_double), allocatable, device :: d_c(:,:)
    real(c_double), parameter :: alpha = 1.0_c_double, beta = 0.0_c_double
    type(cublasHandle) :: handle
    integer :: stat

    ! Allocate device memory
    allocate(d_a(m,k), d_b(k,n), d_c(m,n))

    ! Initialize with simple values on the *device*
    d_a = 0.001_c_float  ! Use the *same* scaling as your original code
    d_b = 0.001_c_float  ! Same scaling
    d_c = 0.0_c_double

    ! Initialize cuBLAS
    stat = cublasCreate(handle)
    if (stat /= CUBLAS_STATUS_SUCCESS) then
        print *, "CUBLAS initialization failed:", stat
        stop
    end if

    ! Enable Tensor Cores
    stat = cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)
    if (stat /= CUBLAS_STATUS_SUCCESS) then
        print *, "cublasSetMathMode failed:", stat
        stop
    end if

    ! Call cublasGemmEx with FP32 inputs, FP64 output, and FP64 computation
    stat = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, &
                        d_a, CUDA_R_32F, m, &
                        d_b, CUDA_R_32F, k, &
                        beta, d_c, CUDA_R_64F, m, &
                        CUDA_R_64F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)

    if (stat /= CUBLAS_STATUS_SUCCESS) then
        print *, "cublasGemmEx failed:", stat  ! Print the error code
        if (stat == CUBLAS_STATUS_NOT_INITIALIZED) then
            print *, "CUBLAS_STATUS_NOT_INITIALIZED"
        else if (stat == CUBLAS_STATUS_INVALID_VALUE) then
            print *, "CUBLAS_STATUS_INVALID_VALUE"
        else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) then
            print *, "CUBLAS_STATUS_ARCH_MISMATCH"
        else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) then
            print *, "CUBLAS_STATUS_EXECUTION_FAILED"
        else if (stat == status_not_supported) then
            ! Unsupported combination of A/B/C/compute types for this routine
            print *, "CUBLAS_STATUS_NOT_SUPPORTED"
        else
            print *, "Unknown error"
        end if

        stop
    end if

    ! Synchronize and check for CUDA errors
    call cuda_check(cudaDeviceSynchronize(), "CUDA error after cublasGemmEx")

    print *, "cublasGemmEx completed successfully."

    ! Clean up
    deallocate(d_a, d_b, d_c)
    stat = cublasDestroy(handle)

contains

    ! Print `message` followed by the CUDA error string and stop the program
    ! when `err` is non-zero; do nothing otherwise.
    subroutine cuda_check(err, message)
        integer, intent(in) :: err
        character(len=*), intent(in) :: message
        if (err /= 0) then
            print *, message, ": ", cudaGetErrorString(err)
            stop
        endif
    end subroutine cuda_check
end program cublas_gemmex_test

A similar C++ test also failed with code 15, the same error (on both machines):

#include <iostream>
#include <vector>
#include <cublas_v2.h>
#include <cuda_runtime.h>

int main() {
    int m = 1024;
    int n = 1024;
    int k = 1024;

    // Allocate host memory and initialize with some values
    std::vector<float> h_A(m * k, 1.0f);  // FP32 for A
    std::vector<float> h_B(k * n, 2.0f);  // FP32 for B
    std::vector<double> h_C(m * n, 0.0);   // FP64 for C

    // Allocate device memory
    float *d_A;
    float *d_B;
    double *d_C;
    cudaMalloc(&d_A, m * k * sizeof(float));
    cudaMalloc(&d_B, k * n * sizeof(float));
    cudaMalloc(&d_C, m * n * sizeof(double));

    // Copy data to device
    cudaMemcpy(d_A, h_A.data(), m * k * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B.data(), k * n * sizeof(float), cudaMemcpyHostToDevice);

    // Create a cuBLAS handle
    cublasHandle_t handle;
    cublasCreate(&handle);
    cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);

    // Set alpha and beta (FP64)
    double alpha = 1.0;
    double beta = 0.0;

    // Call cublasGemmEx with FP32 inputs, FP64 output, and FP64 computation
    cublasStatus_t stat = cublasGemmEx(handle,
                                        CUBLAS_OP_N, CUBLAS_OP_N,
                                        m, n, k,
                                        &alpha,
                                        d_A, CUDA_R_32F, m, // A, FP32
                                        d_B, CUDA_R_32F, k, // B, FP32
                                        &beta,
                                        d_C, CUDA_R_64F, m, // C, FP64
                                        CUDA_R_64F,         // Compute type: FP64
                                        CUBLAS_GEMM_DEFAULT_TENSOR_OP);

    if (stat != CUBLAS_STATUS_SUCCESS) {
        std::cerr << "cublasGemmEx failed: " << stat << std::endl;

        // Print cublas error message.
        if (stat == CUBLAS_STATUS_NOT_INITIALIZED)
            std::cerr << "CUBLAS_STATUS_NOT_INITIALIZED" << std::endl;
        if (stat == CUBLAS_STATUS_INVALID_VALUE)
          std::cerr << "CUBLAS_STATUS_INVALID_VALUE" << std::endl;
        if (stat == CUBLAS_STATUS_ARCH_MISMATCH)
          std::cerr << "CUBLAS_STATUS_ARCH_MISMATCH" << std::endl;
        if (stat== CUBLAS_STATUS_EXECUTION_FAILED)
          std::cerr << "CUBLAS_STATUS_EXECUTION_FAILED" << std::endl;


    }


    // Synchronize and check for CUDA errors (good practice)
    cudaDeviceSynchronize();
    cudaError_t cuda_err = cudaGetLastError();
     if(cuda_err != cudaSuccess) {
        std::cerr << "CUDA Error: " << cudaGetErrorString(cuda_err) << std::endl;
    }
    else{
       std::cerr<<"Passed!"<<std::endl;
    }
    // Clean up
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cublasDestroy(handle);

    return 0;
}

Many thanks,

Fraser

I think there are two different issues at play:

(1) A cublasStatus_t value of 15 indicates CUBLAS_STATUS_NOT_SUPPORTED. Not sure what would be causing this. Your GPUs are supported by CUDA 12.6. Is the function you are calling supported on the architecture you are trying to execute it on? Add standard CUDA error checking to your code to see whether this is a follow-up error caused by an earlier failure.

(2) ieee_inexact is something specific to Fortran that is reported when you execute a stop command and is a harmless warning in my understanding. The Fortran standard apparently requires the state of the IEEE floating-point status bits to be reported when a stop is issued. Check the CUDA Fortran manual on how to suppress this. Or remove the stop.

cublasGemmEx doesn’t support calculation with 32-bit inputs and a 64-bit output. Refer to the table of supported type combinations in the cuBLAS documentation for cublasGemmEx.

Brilliant! Someday I should learn to read the manual. Thank you so much for your prompt and expert reply.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.