[CUDA Graph] cuBLAS routine produces incorrect result after calling cudaStreamBeginCapture

Hello Forum,

I have been trying to create a CUDA Graph by using stream capture method to capture a cublasDgemv call in it. But I am getting wrong result after calling the stream capture routines. Here is my code snippet:

#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <ctime>
#include "cublas_v2.h"

#define cudaErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

static const char *cublasErrChk(cublasStatus_t error)
{
    switch (error)
    {
        case CUBLAS_STATUS_SUCCESS:
            return "CUBLAS_STATUS_SUCCESS";

        case CUBLAS_STATUS_NOT_INITIALIZED:
            return "CUBLAS_STATUS_NOT_INITIALIZED";

        case CUBLAS_STATUS_ALLOC_FAILED:
            return "CUBLAS_STATUS_ALLOC_FAILED";

        case CUBLAS_STATUS_INVALID_VALUE:
            return "CUBLAS_STATUS_INVALID_VALUE";

        case CUBLAS_STATUS_ARCH_MISMATCH:
            return "CUBLAS_STATUS_ARCH_MISMATCH";

        case CUBLAS_STATUS_MAPPING_ERROR:
            return "CUBLAS_STATUS_MAPPING_ERROR";

        case CUBLAS_STATUS_EXECUTION_FAILED:
            return "CUBLAS_STATUS_EXECUTION_FAILED";

        case CUBLAS_STATUS_INTERNAL_ERROR:
            return "CUBLAS_STATUS_INTERNAL_ERROR";
    }

    return "<unknown>";
}

int main() {

    size_t dims = 4;

    double *vec, *mat, *results;

    cudaErrChk( cudaMallocManaged(&vec, dims * sizeof(double)) );
    cudaErrChk( cudaMallocManaged(&mat, dims * dims * sizeof(double)) );
    cudaErrChk( cudaMallocManaged(&results, dims * sizeof(double)) );

    printf("Vector:\n");
    for (int i = 1; i < dims + 1; i++) {
        vec[i] = 0.5 * i;
        printf("%.2lf ", vec[i]);
    }
    printf("\n\nMatrix:\n");

    for (int i = 1; i < dims * dims + 1; i++) {
        mat[i] = 1.0 * i;
        printf("%.2lf ", mat[i]);

        if (i % dims == 0)
            printf("\n");
    }
    printf("\n");

// CUDA graph creation

  cublasHandle_t handle;
  cublasErrChk( cublasCreate(&handle) );
  cudaGraph_t gemvGraph;
  cudaStream_t stream1, streamForGraph;
  cudaErrChk(cudaStreamCreate(&stream1));
  cublasStatus_t stat = cublasSetStream(handle, stream1);
  std::cout << "stat: " << stat << std::endl;
  cudaErrChk(cudaGraphCreate(&gemvGraph, 0));
  cudaErrChk(cudaStreamCreate(&streamForGraph));
  double alpha = 1.f, beta = 1.f;

  cudaErrChk(cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal));
    // multiply mat by vec to get results
  cublasErrChk(
        cublasDgemv(
            handle, CUBLAS_OP_N,
            dims, dims,
            &alpha,
            mat, dims,
            vec, 1,
            &beta,
            results, 1
        )
    );

  cudaErrChk(cudaStreamEndCapture(stream1, &gemvGraph));
  cudaStreamSynchronize(stream1);
  cudaDeviceSynchronize();

    for (int i = 0; i < dims; i++)
        printf("%.2lf ", results[i]);
    printf("\n");

    cudaErrChk( cudaFree(vec) );
    cudaErrChk( cudaFree(mat) );
    cudaErrChk( cudaFree(results) );

    return 0;
}

Here is the expected output:

vec:
0.50 1.00 1.50 2.00

mat:
1.00 2.00 3.00 4.00
5.00 6.00 7.00 8.00
9.00 10.00 11.00 12.00
13.00 14.00 15.00 16.00

results:
28.00 31.00 34.00 37.00

But I am getting the following output:

vec:
0.50 1.00 1.50 2.00

mat:
1.00 2.00 3.00 4.00
5.00 6.00 7.00 8.00
9.00 10.00 11.00 12.00
13.00 14.00 15.00 16.00

results:
0.00 0.00 0.00 0.00

I am wondering about what I have missed here and what might be the reason of getting the wrong result after calling stream capture routines. I would appreciate your help.

The graph creation process starts from line number 77.
Here is the pastebin link: https://pastebin.com/jpFiRSdE
To compile: nvcc -o gemm.x gemm.cu -lcublas
@Robert_Crovella , tagging you in case you get a chance to have a look at it.

Thanks,
Fazlay

You haven’t instantiated or launched the graph. All you did was capture the graph. You might want to study some resources that describe CUDA graphs, or study the sample code that was previously suggested to you. Stream capture, by itself, doesn’t run anything.

From here:

A call to cudaStreamBeginCapture() places a stream in capture mode. When a stream is being captured, work launched into the stream is not enqueued for execution. It is instead appended to an internal graph that is progressively being built up.

1 Like

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.