Can't capture stream when using cublasSgemmStridedBatched with batchCount=1

Hi everyone,

I was eager to try the new CUDA Graph API but ran into an issue with cuBLAS.

Everything works fine until I call cublasSgemmStridedBatched with batchCount = 1.

The following code works fine with batchCount=2 or when batchCount=1 and I don’t capture the stream.

#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <vector>
#include <cublas_v2.h>

int main(int argc, char **argv) {
  cudaStream_t stream1;
  cudaGraph_t graph;
  cublasHandle_t handle;
  cublasStatus_t status = cublasCreate(&handle);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! CUBLAS initialization error\n");
  }

  checkCudaErrors(cudaStreamCreate(&stream1));

  float *a = NULL, *b = NULL, *c = NULL;

  size_t size = 1 * 256 * 32;  // number of elements to reduce
  checkCudaErrors(cudaMalloc(&a, sizeof(float) * size));
  checkCudaErrors(cudaMalloc(&b, sizeof(float) * size));
  checkCudaErrors(cudaMalloc(&c, sizeof(float) * size));
  float alpha = 1.0f;
  float beta = 1.0f;
  const int lda = 256;
  const int ldb = 32;
  const int ldc = 256;
  const int strideA = 8192;
  const int strideB = 1024;
  const int strideC = 8192;
  const int batchCount = 2;
  cublasSetStream(handle, stream1);
  status = cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 256, 32, 32, &alpha, a, lda, strideA, b, ldb, strideB, &beta, c, ldc, strideC, batchCount);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel execution error.\n");
  }
  status = cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 256, 32, 32, &alpha, a, lda, strideA, b, ldb, strideB, &beta, c, ldc, strideC, batchCount);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel execution 2 error.\n");
  }
  checkCudaErrors(cudaStreamBeginCapture(stream1));
  status = cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 256, 32, 32, &alpha, a, lda, strideA, b, ldb, strideB, &beta, c, ldc, strideC, batchCount);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel capture error.\n");
  }
  status = cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 256, 32, 32, &alpha, a, lda, strideA, b, ldb, strideB, &beta, c, ldc, strideC, batchCount);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel capture 2 error.\n");
  }
  checkCudaErrors(cudaStreamEndCapture(stream1, &graph));

  checkCudaErrors(cudaGraphDestroy(graph));
  checkCudaErrors(cudaStreamDestroy(stream1));

  checkCudaErrors(cudaFree(a));
  checkCudaErrors(cudaFree(b));
  checkCudaErrors(cudaFree(c));
  return EXIT_SUCCESS;
}

The interesting thing is that the capture doesn’t fail until the second call, the first call is successfull.

Does anyone have insight on what I could be doing wrong or is this just a bug?

Thanks,
Felipe

Anyone?

Not according to my testing. Run it with cuda-memcheck

I don’t believe this has anything to do with graph usage.

$ cat t420.cu
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <vector>
#include <cublas_v2.h>

int main(int argc, char **argv) {
  cudaStream_t stream1;
#ifdef USE_GRAPH
  cudaGraph_t graph;
#endif
  cublasHandle_t handle;
  cublasStatus_t status = cublasCreate(&handle);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! CUBLAS initialization error\n");
  }

  checkCudaErrors(cudaStreamCreate(&stream1));

  float *a = NULL, *b = NULL, *c = NULL;

  size_t size = 1 * 256 * 32;  // number of elements to reduce
  checkCudaErrors(cudaMalloc(&a, sizeof(float) * size));
  checkCudaErrors(cudaMalloc(&b, sizeof(float) * size));
  checkCudaErrors(cudaMalloc(&c, sizeof(float) * size));
  float alpha = 1.0f;
  float beta = 1.0f;
  const int lda = 256;
  const int ldb = 32;
  const int ldc = 256;
  const int strideA = 8192;
  const int strideB = 1024;
  const int strideC = 8192;
  const int batchCount = 2;
  cublasSetStream(handle, stream1);
  status = cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 256, 32, 32, &alpha, a, lda, strideA, b, ldb, strideB, &beta, c, ldc, strideC, batchCount);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel execution error.\n");
  }
  status = cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 256, 32, 32, &alpha, a, lda, strideA, b, ldb, strideB, &beta, c, ldc, strideC, batchCount);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel execution 2 error.\n");
  }
#ifdef USE_GRAPH
  checkCudaErrors(cudaStreamBeginCapture(stream1));
#endif
  status = cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 256, 32, 32, &alpha, a, lda, strideA, b, ldb, strideB, &beta, c, ldc, strideC, batchCount);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel capture error.\n");
  }
  status = cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 256, 32, 32, &alpha, a, lda, strideA, b, ldb, strideB, &beta, c, ldc, strideC, batchCount);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel capture 2 error.\n");
  }
#ifdef USE_GRAPH
  checkCudaErrors(cudaStreamEndCapture(stream1, &graph));

  checkCudaErrors(cudaGraphDestroy(graph));
#endif
  checkCudaErrors(cudaStreamDestroy(stream1));

  checkCudaErrors(cudaFree(a));
  checkCudaErrors(cudaFree(b));
  checkCudaErrors(cudaFree(c));
  return EXIT_SUCCESS;
}
$ nvcc -arch=sm_60 -o t420 t420.cu -I/usr/local/cuda/samples/common/inc -lcublas -DUSE_GRAPH
$ cuda-memcheck ./t420
========= CUDA-MEMCHECK
========= Invalid __global__ read of size 4
=========     at 0x00002950 in maxwell_sgemm_128x64_nn
=========     by thread (31,0,0) in block (1,0,1)
=========     Address 0x7f2112a2027c is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x24f88d]
=========     Host Frame:/usr/local/cuda/lib64/libcublas.so.10.0 [0x76cc22]
=========     Host Frame:/usr/local/cuda/lib64/libcublas.so.10.0 [0x76ce17]
=========     Host Frame:/usr/local/cuda/lib64/libcublas.so.10.0 [0x7a11d5]
=========     Host Frame:/usr/local/cuda/lib64/libcublas.so.10.0 [0x608825]
=========     Host Frame:/usr/local/cuda/lib64/libcublas.so.10.0 [0x731b71]
=========     Host Frame:/usr/local/cuda/lib64/libcublas.so.10.0 [0x40666d]
=========     Host Frame:/usr/local/cuda/lib64/libcublas.so.10.0 (cublasSgemmStridedBatched + 0x13f) [0x3f69af]
=========     Host Frame:./t420 [0x3831]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21c05]
=========     Host Frame:./t420 [0x3349]
=========
========= Invalid __global__ read of size 4
...