CUBLASLT_EPILOGUE_BGRADB may cause an illegal memory access

Hi, I think I have run into a bug in cuBLAS.

Here I use the Apex fused_dense module:

import torch
from apex.fused_dense import FusedDense

# Minimal repro: one forward/backward pass through a 400 -> 1 FusedDense layer
# with a batch of 13824 rows on the GPU.
inputs = torch.rand(
    (13824, 400), device="cuda", dtype=torch.float, requires_grad=True
)
layer = FusedDense(400, 1).to("cuda")

# Shift the uniform samples before feeding them to the layer.
centered = inputs - 0.5
out = layer(centered)
loss = out.sum()

# The illegal memory access is reported once the backward pass has run.
loss.backward()
print(inputs.grad)

and it raises an error: RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

Looking into it further, I found that the bug is in the backward kernel. I copied the plain CUDA code from Apex into a standalone program, then compiled and ran it:

#include <cublasLt.h>
#include <iostream>

int main(void) {
  int m = 400;
  int k = 13824;
  int n = 1;

  float *a;
  float *b;
  float *c;
  float *d;
  void *ws;

  int ws_size = 16 * 1024 * 1024;

  cudaMalloc(&a, m * k * sizeof(float));
  cudaMalloc(&b, k * n * sizeof(float));
  cudaMalloc(&c, m * n * sizeof(float));
  cudaMalloc(&d, n * sizeof(float));
  cudaMalloc(&ws, ws_size);

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cublasLtHandle_t handle;
  cublasLtCreate(&handle);

  cublasLtMatmulDesc_t op;
  cublasLtMatmulDescCreate(&op, CUBLAS_COMPUTE_32F, CUDA_R_32F);

  auto transb = CUBLAS_OP_T;
  cublasLtMatmulDescSetAttribute(op, CUBLASLT_MATMUL_DESC_TRANSB, &transb,
                                 sizeof(CUBLAS_OP_T));

  auto epilogue = CUBLASLT_EPILOGUE_BGRADB;

  cublasLtMatmulDescSetAttribute(op, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue,
                                 sizeof(epilogue));

  cublasLtMatmulDescSetAttribute(op, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &d,
                                 sizeof(d));

  float alpha = 1.0;
  float beta = 0.0;

  cublasLtMatrixLayout_t a_desc;
  cublasLtMatrixLayout_t b_desc;
  cublasLtMatrixLayout_t c_desc;

  cublasLtMatrixLayoutCreate(&a_desc, CUDA_R_32F, m, k, m);
  cublasLtMatrixLayoutCreate(&b_desc, CUDA_R_32F, n, k, n);
  cublasLtMatrixLayoutCreate(&c_desc, CUDA_R_32F, m, n, m);

  cublasLtMatmul(handle, op, &alpha, b, a_desc, a, b_desc, &beta, c, c_desc, c,
                 c_desc, NULL, ws, ws_size, stream);
  std::cout << cudaDeviceSynchronize() << std::endl;
  std::cout << cudaGetLastError() << std::endl;

  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  cublasLtDestroy(handle);
  cudaFree(a);
  cudaFree(b);
  cudaFree(c);
  cudaFree(d);
  cudaFree(ws);

  cublasLtMatmulDescDestroy(op);
  cublasLtMatrixLayoutDestroy(a_desc);
  cublasLtMatrixLayoutDestroy(b_desc);
  cublasLtMatrixLayoutDestroy(c_desc);
  return 0;
}

I compiled and ran it with different versions of nvcc: with CUDA 11.5 it runs successfully, but with CUDA 11.6/11.7 it raises error 700 (illegal memory access).