Here is a full repro example, loosely based on the TRSM example in the documentation:
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <vector>
// Evaluates a CUDA runtime call exactly once. On any result other than
// cudaSuccess, prints the numeric error code together with the call site
// (file and line) to stdout and aborts the process.
#define CUDA_CHECK(call)                                                           \
    do {                                                                           \
        const cudaError_t cuda_status_ = (call);                                   \
        if (cuda_status_ == cudaSuccess) {                                         \
            break;                                                                 \
        }                                                                          \
        std::printf("CUDA error %d at %s:%d\n", cuda_status_, __FILE__, __LINE__); \
        std::abort();                                                              \
    } while (0)
// Evaluates a cuBLAS call exactly once. On any status other than
// CUBLAS_STATUS_SUCCESS, prints the numeric status together with the call
// site (file and line) to stdout and aborts the process.
#define CUBLAS_CHECK(call)                                                             \
    do {                                                                               \
        const cublasStatus_t cublas_status_ = (call);                                  \
        if (cublas_status_ == CUBLAS_STATUS_SUCCESS) {                                 \
            break;                                                                     \
        }                                                                              \
        std::printf("cublas error %d at %s:%d\n", cublas_status_, __FILE__, __LINE__); \
        std::abort();                                                                  \
    } while (0)
int main(int argc, char *argv[]) {
int m = 256;
if (argc > 1) {
m = atoi(argv[1]);
}
int n = 256;
if (argc > 2) {
n = atoi(argv[2]);
}
cublasHandle_t cublasH = NULL;
const int lda = n;
const int ldb = m;
std::printf("m = %d n = %d lda = %d ldb = %d\n", m, n, lda, ldb);
cublasSideMode_t side = CUBLAS_SIDE_RIGHT;
cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
cublasOperation_t transa = CUBLAS_OP_N;
cublasDiagType_t diag = CUBLAS_DIAG_NON_UNIT;
const double alpha = 1.0;
double *d_A = nullptr;
double *d_B = nullptr;
CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_A), n * lda * sizeof(double)));
CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_B), n * ldb * sizeof(double)));
CUBLAS_CHECK(cublasCreate(&cublasH));
cudaGraph_t main_graph;
CUDA_CHECK(cudaGraphCreate(&main_graph, 0));
cudaStream_t capture_stream = NULL;
CUDA_CHECK(cudaStreamCreate(&capture_stream));
CUDA_CHECK(cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeThreadLocal));
CUBLAS_CHECK(cublasSetStream(cublasH, capture_stream));
CUBLAS_CHECK(cublasSetPointerMode(cublasH, CUBLAS_POINTER_MODE_HOST));
CUBLAS_CHECK(cublasDtrsm(cublasH, side, uplo, transa, diag, m, n, &alpha, d_A, lda, d_B, ldb));
cudaGraph_t cublas_graph;
CUDA_CHECK(cudaStreamEndCapture(capture_stream, &cublas_graph));
cudaGraphNode_t node;
CUDA_CHECK(cudaGraphAddChildGraphNode(&node, main_graph, nullptr, 0, cublas_graph));
CUDA_CHECK(cudaFree(d_A));
CUDA_CHECK(cudaFree(d_B));
CUBLAS_CHECK(cublasDestroy(cublasH));
CUDA_CHECK(cudaDeviceReset());
return EXIT_SUCCESS;
}
When I build that with nvcc 12.3.52 and run it with sizes 256 256, it fails with an error:
$ ./cmd_repro_d 256 256
m = 256 n = 256 lda = 256 ldb = 256
CUDA error 801 at repro.cc:72
With a smaller size, 32 32, it works:
$ ./cmd_repro_d 32 32
m = 32 n = 32 lda = 32 ldb = 32
If I recompile with nvcc 11.8.89, the program works without error for the larger sizes, as well.