Hi, I think I have run into a bug in cuBLAS.
Here I use the Apex fused_dense module:
import torch
from apex.fused_dense import FusedDense
# Random (13824, 400) float32 input on the GPU; gradients are tracked so that
# backward() populates .grad on it.
inputs = torch.rand(13824, 400, device="cuda", dtype=torch.float32, requires_grad=True)
# Fused linear layer mapping 400 input features to a single output feature.
layer = FusedDense(400, 1)
layer = layer.to("cuda")
# Shift the uniform [0, 1) samples by 0.5 before the forward pass, then
# reduce the layer output to a scalar loss.
shifted = inputs - 0.5
loss = layer(shifted).sum()
# Run the backward pass and show the gradient with respect to the input.
loss.backward()
print(inputs.grad)
and it raises an error: RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Looking into it further, I found that the problem is in the backward kernel. I copied the underlying CUDA code from Apex into a standalone program, then compiled and ran it:
#include <cublasLt.h>
#include <iostream>
int main(void) {
int m = 400;
int k = 13824;
int n = 1;
float *a;
float *b;
float *c;
float *d;
void *ws;
int ws_size = 16 * 1024 * 1024;
cudaMalloc(&a, m * k * sizeof(float));
cudaMalloc(&b, k * n * sizeof(float));
cudaMalloc(&c, m * n * sizeof(float));
cudaMalloc(&d, n * sizeof(float));
cudaMalloc(&ws, ws_size);
cudaStream_t stream;
cudaStreamCreate(&stream);
cublasLtHandle_t handle;
cublasLtCreate(&handle);
cublasLtMatmulDesc_t op;
cublasLtMatmulDescCreate(&op, CUBLAS_COMPUTE_32F, CUDA_R_32F);
auto transb = CUBLAS_OP_T;
cublasLtMatmulDescSetAttribute(op, CUBLASLT_MATMUL_DESC_TRANSB, &transb,
sizeof(CUBLAS_OP_T));
auto epilogue = CUBLASLT_EPILOGUE_BGRADB;
cublasLtMatmulDescSetAttribute(op, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue,
sizeof(epilogue));
cublasLtMatmulDescSetAttribute(op, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &d,
sizeof(d));
float alpha = 1.0;
float beta = 0.0;
cublasLtMatrixLayout_t a_desc;
cublasLtMatrixLayout_t b_desc;
cublasLtMatrixLayout_t c_desc;
cublasLtMatrixLayoutCreate(&a_desc, CUDA_R_32F, m, k, m);
cublasLtMatrixLayoutCreate(&b_desc, CUDA_R_32F, n, k, n);
cublasLtMatrixLayoutCreate(&c_desc, CUDA_R_32F, m, n, m);
cublasLtMatmul(handle, op, &alpha, b, a_desc, a, b_desc, &beta, c, c_desc, c,
c_desc, NULL, ws, ws_size, stream);
std::cout << cudaDeviceSynchronize() << std::endl;
std::cout << cudaGetLastError() << std::endl;
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);
cublasLtDestroy(handle);
cudaFree(a);
cudaFree(b);
cudaFree(c);
cudaFree(d);
cudaFree(ws);
cublasLtMatmulDescDestroy(op);
cublasLtMatrixLayoutDestroy(a_desc);
cublasLtMatrixLayoutDestroy(b_desc);
cublasLtMatrixLayoutDestroy(c_desc);
return 0;
}
I compiled and ran it with different nvcc versions: with CUDA 11.5 it runs successfully, but with CUDA 11.6/11.7 it reports error 700 (illegal memory access).