Can't capture stream when using cublasSgemmStridedBatched with batchCount=1

felipe.suchwp6m3 · January 24, 2019, 9:43pm

Hi everyone,

I was eager to try the new CUDA Graph API but ran into an issue with cuBLAS.

Everything works fine until I call cublasSgemmStridedBatched with batchCount = 1.

The following code works fine with batchCount=2 or when batchCount=1 and I don’t capture the stream.

#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <vector>
#include <cublas_v2.h>

int main(int argc, char **argv) {
  cudaStream_t stream1;
  cudaGraph_t graph;
  cublasHandle_t handle;
  cublasStatus_t status = cublasCreate(&handle);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! CUBLAS initialization error\n");
  }

  checkCudaErrors(cudaStreamCreate(&stream1));

  float *a = NULL, *b = NULL, *c = NULL;

  size_t size = 1 * 256 * 32;  // number of elements to reduce
  checkCudaErrors(cudaMalloc(&a, sizeof(float) * size));
  checkCudaErrors(cudaMalloc(&b, sizeof(float) * size));
  checkCudaErrors(cudaMalloc(&c, sizeof(float) * size));
  float alpha = 1.0f;
  float beta = 1.0f;
  const int lda = 256;
  const int ldb = 32;
  const int ldc = 256;
  const int strideA = 8192;
  const int strideB = 1024;
  const int strideC = 8192;
  const int batchCount = 2;
  cublasSetStream(handle, stream1);
  status = cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 256, 32, 32, &alpha, a, lda, strideA, b, ldb, strideB, &beta, c, ldc, strideC, batchCount);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel execution error.\n");
  }
  status = cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 256, 32, 32, &alpha, a, lda, strideA, b, ldb, strideB, &beta, c, ldc, strideC, batchCount);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel execution 2 error.\n");
  }
  checkCudaErrors(cudaStreamBeginCapture(stream1));
  status = cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 256, 32, 32, &alpha, a, lda, strideA, b, ldb, strideB, &beta, c, ldc, strideC, batchCount);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel capture error.\n");
  }
  status = cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 256, 32, 32, &alpha, a, lda, strideA, b, ldb, strideB, &beta, c, ldc, strideC, batchCount);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel capture 2 error.\n");
  }
  checkCudaErrors(cudaStreamEndCapture(stream1, &graph));

  checkCudaErrors(cudaGraphDestroy(graph));
  checkCudaErrors(cudaStreamDestroy(stream1));

  checkCudaErrors(cudaFree(a));
  checkCudaErrors(cudaFree(b));
  checkCudaErrors(cudaFree(c));
  return EXIT_SUCCESS;
}

The interesting thing is that the capture doesn’t fail until the second call, the first call is successfull.

Does anyone have insight on what I could be doing wrong or is this just a bug?

Thanks,
Felipe

felipe.suchwp6m3 · March 18, 2019, 11:04pm

Anyone?

Robert_Crovella · March 19, 2019, 2:58am

Not according to my testing. Run it with cuda-memcheck

I don’t believe this has anything to do with graph usage.

$ cat t420.cu
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <vector>
#include <cublas_v2.h>

int main(int argc, char **argv) {
  cudaStream_t stream1;
#ifdef USE_GRAPH
  cudaGraph_t graph;
#endif
  cublasHandle_t handle;
  cublasStatus_t status = cublasCreate(&handle);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! CUBLAS initialization error\n");
  }

  checkCudaErrors(cudaStreamCreate(&stream1));

  float *a = NULL, *b = NULL, *c = NULL;

  size_t size = 1 * 256 * 32;  // number of elements to reduce
  checkCudaErrors(cudaMalloc(&a, sizeof(float) * size));
  checkCudaErrors(cudaMalloc(&b, sizeof(float) * size));
  checkCudaErrors(cudaMalloc(&c, sizeof(float) * size));
  float alpha = 1.0f;
  float beta = 1.0f;
  const int lda = 256;
  const int ldb = 32;
  const int ldc = 256;
  const int strideA = 8192;
  const int strideB = 1024;
  const int strideC = 8192;
  const int batchCount = 2;
  cublasSetStream(handle, stream1);
  status = cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 256, 32, 32, &alpha, a, lda, strideA, b, ldb, strideB, &beta, c, ldc, strideC, batchCount);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel execution error.\n");
  }
  status = cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 256, 32, 32, &alpha, a, lda, strideA, b, ldb, strideB, &beta, c, ldc, strideC, batchCount);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel execution 2 error.\n");
  }
#ifdef USE_GRAPH
  checkCudaErrors(cudaStreamBeginCapture(stream1));
#endif
  status = cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 256, 32, 32, &alpha, a, lda, strideA, b, ldb, strideB, &beta, c, ldc, strideC, batchCount);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel capture error.\n");
  }
  status = cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 256, 32, 32, &alpha, a, lda, strideA, b, ldb, strideB, &beta, c, ldc, strideC, batchCount);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel capture 2 error.\n");
  }
#ifdef USE_GRAPH
  checkCudaErrors(cudaStreamEndCapture(stream1, &graph));

  checkCudaErrors(cudaGraphDestroy(graph));
#endif
  checkCudaErrors(cudaStreamDestroy(stream1));

  checkCudaErrors(cudaFree(a));
  checkCudaErrors(cudaFree(b));
  checkCudaErrors(cudaFree(c));
  return EXIT_SUCCESS;
}
$ nvcc -arch=sm_60 -o t420 t420.cu -I/usr/local/cuda/samples/common/inc -lcublas -DUSE_GRAPH
$ cuda-memcheck ./t420
========= CUDA-MEMCHECK
========= Invalid __global__ read of size 4
=========     at 0x00002950 in maxwell_sgemm_128x64_nn
=========     by thread (31,0,0) in block (1,0,1)
=========     Address 0x7f2112a2027c is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x24f88d]
=========     Host Frame:/usr/local/cuda/lib64/libcublas.so.10.0 [0x76cc22]
=========     Host Frame:/usr/local/cuda/lib64/libcublas.so.10.0 [0x76ce17]
=========     Host Frame:/usr/local/cuda/lib64/libcublas.so.10.0 [0x7a11d5]
=========     Host Frame:/usr/local/cuda/lib64/libcublas.so.10.0 [0x608825]
=========     Host Frame:/usr/local/cuda/lib64/libcublas.so.10.0 [0x731b71]
=========     Host Frame:/usr/local/cuda/lib64/libcublas.so.10.0 [0x40666d]
=========     Host Frame:/usr/local/cuda/lib64/libcublas.so.10.0 (cublasSgemmStridedBatched + 0x13f) [0x3f69af]
=========     Host Frame:./t420 [0x3831]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21c05]
=========     Host Frame:./t420 [0x3349]
=========
========= Invalid __global__ read of size 4
...

Topic		Replies	Views
Stream capture of cublas gemm CUDA Programming and Performance	8	709	March 31, 2025
[CUDA Graph] cuBLAS routine produces incorrect result after calling cudaStreamBeginCapture CUDA Programming and Performance	2	680	March 2, 2022
Graph Capture of cublasDdot in Device Pointer Mode GPU-Accelerated Libraries cublas	3	417	March 26, 2024
[BUG][DEVICE LAUNCHED GRAPHS] CudaErrorInvalidValue with cuBLAS combined with the cudaGraphInstantiateFlagsDeviceLaunch starting with CUDA 12.1 CUDA Programming and Performance nvbugs	3	771	February 19, 2025
Cublas_status_execution_failed GPU-Accelerated Libraries	2	10692	February 23, 2021
cuBLAS call from kernel in CUDA 10.0 GPU-Accelerated Libraries	9	4861	April 7, 2021
CUDA graph capture mode with cuSolver get CUSOLVER_STATUS_INTERNAL_ERROR CUDA Programming and Performance cuda	2	85	October 9, 2024
Does cublasSgemm executes on default stream ? CUDA Programming and Performance	1	568	February 21, 2019
cuStreamWaitValue32 and cuStreamWriteValue32 blocking issue CUDA Programming and Performance	8	510	April 12, 2024
Stream Concurrency (or lack thereof) on GTX 480 CUDA Programming and Performance	6	2496	July 15, 2010

Can't capture stream when using cublasSgemmStridedBatched with batchCount=1

Related topics