When using cublasGemmEx using FP32 inputs, FP64 output, and FP64 computation in Fortran, I am getting a cublasGemmEx 15 failure on an A1000 and RTX4060 GPU (both on Fedora Silverblue 41) with both Fortran and C++ minimal code samples:
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Tue_Oct_29_23:50:19_PDT_2024
Cuda compilation tools, release 12.6, V12.6.85
Build cuda_12.6.r12.6/compiler.35059454_0
nvfortran 25.1-0 64-bit target on x86-64 Linux -tp alderlake
| NVIDIA-SMI 565.77 Driver Version: 565.77 CUDA Version: 12.7 |
A simple cublasSaxpy test passes (as does most of my tensor core application). I have tried the usual reduction of optimisation, setting num_segments=1, replacing ishft() with custom, tested, bitwise operators,
Here is the minimal fortran test:
program cublas_gemmex_test
use cudafor
use cublas_v2
use iso_c_binding
implicit none
integer, parameter :: m = 1024, n = 1024, k = 1024
real(c_float), allocatable, device :: d_a(:,:), d_b(:,:)
real(c_double), allocatable, device :: d_c(:,:)
real(c_double), parameter :: alpha = 1.0_c_double, beta = 0.0_c_double
type(cublasHandle) :: handle
integer :: stat
! Allocate device memory
allocate(d_a(m,k), d_b(k,n), d_c(m,n))
! Initialize with simple values on the *device*
d_a = 0.001_c_float ! Use the *same* scaling as your original code
d_b = 0.001_c_float ! Same scaling
d_c = 0.0_c_double
! Initialize cuBLAS
stat = cublasCreate(handle)
if (stat /= CUBLAS_STATUS_SUCCESS) then
print *, "CUBLAS initialization failed:", stat
stop
end if
! Enable Tensor Cores
stat = cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)
if (stat /= CUBLAS_STATUS_SUCCESS) then
print *, "cublasSetMathMode failed:", stat
stop
end if
! Call cublasGemmEx with FP32 inputs, FP64 output, and FP64 computation
stat = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, &
d_a, CUDA_R_32F, m, &
d_b, CUDA_R_32F, k, &
beta, d_c, CUDA_R_64F, m, &
CUDA_R_64F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)
if (stat /= CUBLAS_STATUS_SUCCESS) then
print *, "cublasGemmEx failed:", stat ! Print the error code
if (stat == CUBLAS_STATUS_NOT_INITIALIZED) then
print *, "CUBLAS_STATUS_NOT_INITIALIZED"
else if (stat == CUBLAS_STATUS_INVALID_VALUE) then
print *, "CUBLAS_STATUS_INVALID_VALUE"
else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) then
print *, "CUBLAS_STATUS_ARCH_MISMATCH"
else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) then
print *, "CUBLAS_STATUS_EXECUTION_FAILED"
else
print *,"Unknown error"
endif
stop
end if
! Synchronize and check for CUDA errors
stat = cudaDeviceSynchronize()
if (stat /= cudaSuccess) then
print *, "CUDA error after cublasGemmEx: ", cudaGetErrorString(stat)
stop
endif
print *, "cublasGemmEx completed successfully."
! Clean up
deallocate(d_a, d_b, d_c)
stat = cublasDestroy(handle)
contains
subroutine cuda_check(err, message)
integer, intent(in) :: err
character(len=*), intent(in) :: message
if (err /= 0) then
print *, message, ": ", cudaGetErrorString(err)
stop
endif
end subroutine cuda_check
end program cublas_gemmex_test
A similar C++ test also failed with code 15, the same error (on both machines):
#include <iostream>
#include <vector>
#include <cublas_v2.h>
#include <cuda_runtime.h>
int main() {
int m = 1024;
int n = 1024;
int k = 1024;
// Allocate host memory and initialize with some values
std::vector<float> h_A(m * k, 1.0f); // FP32 for A
std::vector<float> h_B(k * n, 2.0f); // FP32 for B
std::vector<double> h_C(m * n, 0.0); // FP64 for C
// Allocate device memory
float *d_A;
float *d_B;
double *d_C;
cudaMalloc(&d_A, m * k * sizeof(float));
cudaMalloc(&d_B, k * n * sizeof(float));
cudaMalloc(&d_C, m * n * sizeof(double));
// Copy data to device
cudaMemcpy(d_A, h_A.data(), m * k * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B.data(), k * n * sizeof(float), cudaMemcpyHostToDevice);
// Create a cuBLAS handle
cublasHandle_t handle;
cublasCreate(&handle);
cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
// Set alpha and beta (FP64)
double alpha = 1.0;
double beta = 0.0;
// Call cublasGemmEx with FP32 inputs, FP64 output, and FP64 computation
cublasStatus_t stat = cublasGemmEx(handle,
CUBLAS_OP_N, CUBLAS_OP_N,
m, n, k,
&alpha,
d_A, CUDA_R_32F, m, // A, FP32
d_B, CUDA_R_32F, k, // B, FP32
&beta,
d_C, CUDA_R_64F, m, // C, FP64
CUDA_R_64F, // Compute type: FP64
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
if (stat != CUBLAS_STATUS_SUCCESS) {
std::cerr << "cublasGemmEx failed: " << stat << std::endl;
// Print cublas error message.
if (stat == CUBLAS_STATUS_NOT_INITIALIZED)
std::cerr << "CUBLAS_STATUS_NOT_INITIALIZED" << std::endl;
if (stat == CUBLAS_STATUS_INVALID_VALUE)
std::cerr << "CUBLAS_STATUS_INVALID_VALUE" << std::endl;
if (stat == CUBLAS_STATUS_ARCH_MISMATCH)
std::cerr << "CUBLAS_STATUS_ARCH_MISMATCH" << std::endl;
if (stat== CUBLAS_STATUS_EXECUTION_FAILED)
std::cerr << "CUBLAS_STATUS_EXECUTION_FAILED" << std::endl;
}
// Synchronize and check for CUDA errors (good practice)
cudaDeviceSynchronize();
cudaError_t cuda_err = cudaGetLastError();
if(cuda_err != cudaSuccess) {
std::cerr << "CUDA Error: " << cudaGetErrorString(cuda_err) << std::endl;
}
else{
std::cerr<<"Passed!"<<std::endl;
}
// Clean up
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cublasDestroy(handle);
return 0;
}
Many thanks,
Fraser