cuBLAS bug

Hi, I think I've hit a bug in cuBLAS.

Here I use the Apex fused_dense module:

import torch
from apex.fused_dense import FusedDense
t = torch.rand((13824, 400), device="cuda", dtype=torch.float, requires_grad=True)
dense = FusedDense(400, 1).to("cuda")
out = dense(t - 0.5)
loss = torch.sum(out)
loss.backward()
print(t.grad)

and it raises an error:

RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

Digging in, I found it's a bug in the backward kernel. I copied the underlying CUDA code from Apex and compiled it into a standalone program:

#include <cublasLt.h>
#include <cuda_runtime_api.h>
#include <iostream>

int main(void) {
  int m = 400;
  int k = 13824;
  int n = 1;

  float *a;
  float *b;
  float *c;
  float *d;
  void *ws;

  int ws_size = 16 * 1024 * 1024;

  cudaMalloc(&a, m * k * sizeof(float));
  cudaMalloc(&b, k * n * sizeof(float));
  cudaMalloc(&c, m * n * sizeof(float));
  cudaMalloc(&d, n * sizeof(float));
  cudaMalloc(&ws, ws_size);

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cublasLtHandle_t handle;
  cublasLtCreate(&handle);

  cublasLtMatmulDesc_t op;
  cublasLtMatmulDescCreate(&op, CUBLAS_COMPUTE_32F, CUDA_R_32F);

  auto transb = CUBLAS_OP_T;
  cublasLtMatmulDescSetAttribute(op, CUBLASLT_MATMUL_DESC_TRANSB, &transb,
                                 sizeof(transb));

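  // BGRADB asks cublasLt to also compute a bias gradient by reducing
  // matrix B over the reduction (k) dimension; the result is written to
  // the bias pointer set below.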
  auto epilogue = CUBLASLT_EPILOGUE_BGRADB;

  cublasLtMatmulDescSetAttribute(op, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue,
                                 sizeof(epilogue));

  cublasLtMatmulDescSetAttribute(op, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &d,
                                 sizeof(d));

  float alpha = 1.0;
  float beta = 0.0;

  cublasLtMatrixLayout_t a_desc;
  cublasLtMatrixLayout_t b_desc;
  cublasLtMatrixLayout_t c_desc;

  cublasLtMatrixLayoutCreate(&a_desc, CUDA_R_32F, m, k, m);
  cublasLtMatrixLayoutCreate(&b_desc, CUDA_R_32F, n, k, n);
  cublasLtMatrixLayoutCreate(&c_desc, CUDA_R_32F, m, n, m);

  cublasLtMatmul(handle, op, &alpha, b, a_desc, a, b_desc, &beta, c, c_desc, c,
                 c_desc, NULL, ws, ws_size, stream);
  std::cout << cudaDeviceSynchronize() << std::endl;
  std::cout << cudaGetLastError() << std::endl;

  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  cublasLtDestroy(handle);
  cudaFree(a);
  cudaFree(b);
  cudaFree(c);
  cudaFree(d);
  cudaFree(ws);

  cublasLtMatmulDescDestroy(op);
  cublasLtMatrixLayoutDestroy(a_desc);
  cublasLtMatrixLayoutDestroy(b_desc);
  cublasLtMatrixLayoutDestroy(c_desc);
  return 0;
}
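
(Aside: none of the cublasLt status codes are checked above. When reproducing, it helps to wrap each call so you can see whether a failure is reported synchronously or only by a later runtime call. A minimal sketch; the CHECK_CUBLASLT macro is my own helper, not part of cuBLAS:

#include <cstdlib>
#include <iostream>

#define CHECK_CUBLASLT(expr)                                             \
  do {                                                                   \
    cublasStatus_t status_ = (expr);                                     \
    if (status_ != CUBLAS_STATUS_SUCCESS) {                              \
      std::cerr << #expr << " failed with status " << status_ << "\n";   \
      std::exit(1);                                                      \
    }                                                                    \
  } while (0)

// Usage: CHECK_CUBLASLT(cublasLtMatmulDescCreate(&op, CUBLAS_COMPUTE_32F, CUDA_R_32F));
)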

I compiled and ran it with different CUDA toolkits. With CUDA 11.5 it runs successfully, but with CUDA 11.6/11.7 it raises 700 (an illegal memory access error).

Did you intend to swap the a and b parameters in cublasLtMatmul?

Nope, because I want to use this epilogue to compute the gradients of the matrix multiply.

Why are you doing this?

The docs indicate the expected usage is the A pointer followed by the A descriptor, etc.:

cublasStatus_t cublasLtMatmul(
      cublasLtHandle_t               lightHandle,
      cublasLtMatmulDesc_t           computeDesc,
      const void                    *alpha,
      const void                    *A,
      cublasLtMatrixLayout_t         Adesc,
      const void                    *B,
      cublasLtMatrixLayout_t         Bdesc,
      ...);

Sorry, I found my code was wrong; here is the updated code.

The gemm_bgradb_lt function was copied from Apex.

#include <cuda_runtime_api.h>
#include "cublas_v2.h"
#include <cublasLt.h>
#include "stdio.h"
#include "iostream"

void gemm_bgradb_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa,
    cublasOperation_t transb,
    int m,
    int n,
    int k,
    const float *alpha, /* host pointer */
    float* A,
    int lda,
    float* B,
    int ldb,
    const float *beta, /* host pointer */
    float* C,
    int ldc,
    void *workspace,
    size_t workspaceSize,
    cudaStream_t stream,
    bool use_bias,
    const void* bgrad) {
  printf("m is: %d \n", m); 
  printf("n is: %d \n", n); 
  printf("k is: %d \n", k); 
  printf("lda is: %d \n", lda); 
  printf("ldb is: %d \n", ldb); 
  printf("ldc is: %d \n", ldc); 

  std::cout << cudaDeviceSynchronize() << std::endl;
  std::cout << cudaGetLastError() << std::endl;
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults                             = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb));
  

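  // With use_bias set, the BGRADB epilogue makes cublasLt reduce matrix B
  // (here: dy) over the k dimension and write the resulting bias gradient
  // to bgrad.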
  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, sizeof(bgrad));
    epilogue = CUBLASLT_EPILOGUE_BGRADB;
  } 

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));

  // Create matrix descriptors. Not setting any extra attributes.
  status = cublasLtMatrixLayoutInit(
    &Adesc, CUDA_R_32F, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  
  status = cublasLtMatrixLayoutInit(
    &Bdesc, CUDA_R_32F, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_32F, m, n, ldc);
  

  // Create preference handle; In general, extra attributes can be
  // used here to disable tensor ops or to make sure algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A,B,C are always well aligned (e.g., directly
  // come from cudaMalloc)
  status = cublasLtMatmulPreferenceInit(&preference);
  status = cublasLtMatmulPreferenceSetAttribute(
    &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize));

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(
    ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1, &heuristicResult, &returnedResults);
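  // Note: the heuristic result is requested but not actually used below;
  // passing NULL for the algo lets cublasLt pick a default implementation.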
  status = cublasLtMatmul(ltHandle,
                          &operationDesc,
                          alpha,
                          A,
                          &Adesc,
                          B,
                          &Bdesc,
                          beta,
                          C,
                          &Cdesc,
                          C,
                          &Cdesc,
                          //&heuristicResult.algo,
                          NULL,
                          workspace,
                          workspaceSize,
                          stream);
    std::cout << cudaDeviceSynchronize() << std::endl;
    std::cout << cudaGetLastError() << std::endl;
}

int main(){
    float* dy; 
    float* x; 
    float* wgrad; 
    float* dbias; 

    int batch = 13824; 
    int hidden = 400; 
    int out = 1; 
    cudaMalloc(&x, batch * hidden * sizeof(float));
    cudaMalloc(&wgrad, hidden * out * sizeof(float));
    cudaMalloc(&dy, batch * out * sizeof(float));
    cudaMalloc(&dbias, out * sizeof(float));

    printf("X shape is: %d %d \n", batch, hidden); 
    printf("Wgrad shape is: %d %d \n", hidden, out); 
    printf("dy shape is: %d %d \n", batch, out); 
    printf("dbias shape is: %d \n", out); 

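    // Note: workspace_size is 0, so the preference set in gemm_bgradb_lt
    // restricts cublasLt to algorithms that need no workspace (and the
    // cudaMalloc below allocates nothing).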
    void* workspace{};
    int workspace_size = 0;
    cudaMalloc(&workspace, workspace_size);

    cublasLtHandle_t handle{}; 
    cublasLtCreate(&handle); 
    cudaStream_t cuda_stream{};
    cudaStreamCreate(&cuda_stream);
    float alpha = 1.0; 
    float beta = 0.0; 

    gemm_bgradb_lt(
        handle,
        CUBLAS_OP_N,
        CUBLAS_OP_T,
        hidden,
        out,
        batch,
        &alpha, /* host pointer */
        x,
        hidden,
        dy,
        out,
        &beta, /* host pointer */
        wgrad,
        hidden,
        workspace,
        workspace_size,
        cuda_stream,
        true,
        dbias); 
    std::cout << cudaDeviceSynchronize() << std::endl;
    std::cout << cudaGetLastError() << std::endl;
    cudaFree(x);
    cudaFree(wgrad);
    cudaFree(dy);
    cudaFree(dbias);
    cublasLtDestroy(handle); 
    cudaFree(workspace);
    cudaStreamDestroy(cuda_stream); 

    return 0; 
}

Here is the matmul logic:
X shape is 13824x400
Weight shape is 1x400
bias shape is 1
Out = X matmul Weight (transposed) + bias, giving 13824x1.

Now I use cublasLtMatmul to get wgrad:
dy transposed (1x13824) matmul X (13824x400) gives wgrad (1x400), and the bias gradient is obtained by reducing dy over the batch dimension.
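
To double-check the math, here is a minimal host-side reference (my own sketch, not from Apex; row-major buffers, names are mine) for what wgrad and dbias should contain:

#include <vector>

// CPU reference: wgrad = dy^T * X (out x hidden), dbias = column sums of dy.
void reference_bgradb(const std::vector<float>& X,   // batch x hidden
                      const std::vector<float>& dy,  // batch x out
                      int batch, int hidden, int out,
                      std::vector<float>& wgrad,     // out x hidden
                      std::vector<float>& dbias) {   // out
  wgrad.assign(static_cast<size_t>(out) * hidden, 0.f);
  dbias.assign(out, 0.f);
  for (int b = 0; b < batch; ++b) {
    for (int o = 0; o < out; ++o) {
      dbias[o] += dy[b * out + o];
      for (int h = 0; h < hidden; ++h)
        wgrad[o * hidden + h] += dy[b * out + o] * X[b * hidden + h];
    }
  }
}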

Here is my command:

/usr/local/cuda-11.6/bin/nvcc cublas_bgrad.cu -O3 -o cublas_bgrad -lcublasLt_static

Here is the output (the first two status prints come from gemm_bgradb_lt before the matmul, the next two from right after it, and the last two from main):
X shape is: 13824 400 
Wgrad shape is: 400 1 
dy shape is: 13824 1 
dbias shape is: 1 
m is: 400 
n is: 1 
k is: 13824 
lda is: 400 
ldb is: 1 
ldc is: 400 
0
0
700
700
700
700

With CUDA 11.5, it succeeds.

/usr/local/cuda-11.5/bin/nvcc cublas_bgrad.cu -O3 -o cublas_bgrad -lcublasLt_static

X shape is: 13824 400 
Wgrad shape is: 400 1 
dy shape is: 13824 1 
dbias shape is: 1 
m is: 400 
n is: 1 
k is: 13824 
lda is: 400 
ldb is: 1 
ldc is: 400 
0
0
0
0
0
0

Are you able to check if the issue is resolved with CUDA 11.7?

Yeah, I also checked with CUDA 11.7; it still raises error code 700.

I suggest filing a bug, and be sure to attach your reproducer code.
How to report a bug

Fine, I submitted a bug report (bug ID: 3689038); looking forward to a reply!
