When using `cublasGemmEx`

with CUDA_R_8I for A and B, and CUDA_R_32I for C, I’m getting results that slightly differ from the exact integer results. This sounds very much like the float-casting bug that was supposedly fixed in 12.3.1 (see the CUDA 12.4 Release Notes):

“cublasLtMatmul with A and B types equal CUDA_R_8I, scale and C type equal CUDA_R_32I and compute type equals CUBLAS_COMPUTE_32I{,_PEDANTIC} could erroneously perform casting of the result to float data type and then back to integer which result in accuracy loss (for output greater than 2^24). The issue exists since cuBLAS 11.0.”

However, even after updating to the latest CUDA version 12.4, I still get the inaccurate results. Interestingly, the bug only seems to occur on Hopper (an H100 in my test) — not on an A100 or an RTX 4090. Here is a small demo; it would be interesting to see whether this bug is reproducible for others, or whether my CUDA version update went wrong.

```
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <ctime>
// Basic CPU impl
// Reference GEMM on the CPU, used as a correctness baseline for cublasGemmEx.
//
// Layout matches the cuBLAS call in main() (column-major):
//   A: k x m with leading dimension k, so A[p + row*k] is A^T(row, p)
//   B: k x n with leading dimension k
//   C: m x n with leading dimension m
// Computes C = A^T * B with 32-bit integer accumulation.
void simpleCpuGemm(int m, int n, int k, const int8_t *A, const int8_t *B, int32_t *C)
{
    for (int col = 0; col < n; ++col)
    {
        const int8_t *bCol = B + col * k; // column `col` of B
        for (int row = 0; row < m; ++row)
        {
            const int8_t *aCol = A + row * k; // column `row` of A, i.e. row `row` of A^T
            int32_t acc = 0;
            for (int p = 0; p < k; ++p)
            {
                // int8 * int8 promotes to int; accumulate exactly in 32 bits.
                acc += static_cast<int32_t>(aCol[p]) * static_cast<int32_t>(bCol[p]);
            }
            C[col * m + row] = acc;
        }
    }
}
// Abort with a readable message if a CUDA runtime call failed.
// Without these checks a failed allocation/copy would silently compare garbage.
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        std::cerr << what << " failed: " << cudaGetErrorString(err) << "\n";
        std::exit(EXIT_FAILURE);
    }
}

// Abort with a readable message if a cuBLAS call failed.
static void checkCublas(cublasStatus_t st, const char *what)
{
    if (st != CUBLAS_STATUS_SUCCESS)
    {
        std::cerr << what << " failed: cuBLAS status " << static_cast<int>(st) << "\n";
        std::exit(EXIT_FAILURE);
    }
}

// Repro driver: runs C = A^T * B with int8 inputs / int32 output via
// cublasGemmEx (CUBLAS_COMPUTE_32I_PEDANTIC), then compares element-wise
// against the exact CPU reference.
int main()
{
    cublasHandle_t handle;
    checkCublas(cublasCreate(&handle), "cublasCreate");

    // k = 12800 with inputs up to 126 makes dot products exceed 2^24,
    // the range where a float round-trip of the int32 result loses bits.
    int m = 100, n = 32, k = 12800;
    std::srand(std::time(0));

    std::vector<int8_t> h_A(m * k);
    std::vector<int8_t> h_B(k * n);
    std::vector<int32_t> h_C(m * n);
    std::vector<int32_t> h_C_cpu(m * n);

    // Non-negative int8 values in [0, 126] keep every partial sum positive.
    for (int i = 0; i < m * k; ++i)
    {
        h_A[i] = static_cast<int8_t>(std::rand() % 127);
    }
    for (int i = 0; i < k * n; ++i)
    {
        h_B[i] = static_cast<int8_t>(std::rand() % 127);
    }

    int8_t *A, *B;
    int32_t *C;
    checkCuda(cudaMalloc(&A, m * k * sizeof(int8_t)), "cudaMalloc A");
    checkCuda(cudaMalloc(&B, k * n * sizeof(int8_t)), "cudaMalloc B");
    checkCuda(cudaMalloc(&C, m * n * sizeof(int32_t)), "cudaMalloc C");
    checkCuda(cudaMemcpy(A, h_A.data(), m * k * sizeof(int8_t), cudaMemcpyHostToDevice), "cudaMemcpy A");
    checkCuda(cudaMemcpy(B, h_B.data(), k * n * sizeof(int8_t), cudaMemcpyHostToDevice), "cudaMemcpy B");

    // 32-bit integer scaling factors, as required for CUBLAS_COMPUTE_32I.
    const int32_t alpha = 1;
    const int32_t beta = 0;

    // Matmul using cublasGemmEx: A is k x m (transposed), B is k x n, C is m x n.
    checkCublas(cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                             m, n, k,
                             &alpha,
                             A, CUDA_R_8I, k,
                             B, CUDA_R_8I, k,
                             &beta,
                             C, CUDA_R_32I, m,
                             CUBLAS_COMPUTE_32I_PEDANTIC, CUBLAS_GEMM_DEFAULT),
                "cublasGemmEx");

    // Blocking copy; also synchronizes with the default stream the GEMM ran on.
    checkCuda(cudaMemcpy(h_C.data(), C, m * n * sizeof(int32_t), cudaMemcpyDeviceToHost), "cudaMemcpy C");

    // CPU sanity check (exact int32 reference).
    simpleCpuGemm(m, n, k, h_A.data(), h_B.data(), h_C_cpu.data());

    // Compare the results element-wise; int32 results must match exactly.
    int diffs = 0;
    for (int i = 0; i < m * n; ++i)
    {
        if (h_C_cpu[i] != h_C[i])
        {
            std::cout << h_C_cpu[i] << " " << h_C[i] << " " << i << "\n";
            diffs++;
        }
    }
    std::cout << "The results " << (diffs == 0 ? "MATCH" : "DO NOT MATCH") << ": " << diffs << " out of " << n * m << " values differ\n";

    checkCuda(cudaFree(A), "cudaFree A");
    checkCuda(cudaFree(B), "cudaFree B");
    checkCuda(cudaFree(C), "cudaFree C");
    checkCublas(cublasDestroy(handle), "cublasDestroy");
    return 0;
}
```