Hello Forum,
I have been trying to create a CUDA Graph by using stream capture method to capture a cublasDgemv
call in it. But I am getting wrong result after calling the stream capture routines. Here is my code snippet:
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <ctime>
#include "cublas_v2.h"
#define cudaErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
static const char *cublasErrChk(cublasStatus_t error)
{
switch (error)
{
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
}
return "<unknown>";
}
int main() {
size_t dims = 4;
double *vec, *mat, *results;
cudaErrChk( cudaMallocManaged(&vec, dims * sizeof(double)) );
cudaErrChk( cudaMallocManaged(&mat, dims * dims * sizeof(double)) );
cudaErrChk( cudaMallocManaged(&results, dims * sizeof(double)) );
printf("Vector:\n");
for (int i = 1; i < dims + 1; i++) {
vec[i] = 0.5 * i;
printf("%.2lf ", vec[i]);
}
printf("\n\nMatrix:\n");
for (int i = 1; i < dims * dims + 1; i++) {
mat[i] = 1.0 * i;
printf("%.2lf ", mat[i]);
if (i % dims == 0)
printf("\n");
}
printf("\n");
// CUDA graph creation
cublasHandle_t handle;
cublasErrChk( cublasCreate(&handle) );
cudaGraph_t gemvGraph;
cudaStream_t stream1, streamForGraph;
cudaErrChk(cudaStreamCreate(&stream1));
cublasStatus_t stat = cublasSetStream(handle, stream1);
std::cout << "stat: " << stat << std::endl;
cudaErrChk(cudaGraphCreate(&gemvGraph, 0));
cudaErrChk(cudaStreamCreate(&streamForGraph));
double alpha = 1.f, beta = 1.f;
cudaErrChk(cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal));
// multiply mat by vec to get results
cublasErrChk(
cublasDgemv(
handle, CUBLAS_OP_N,
dims, dims,
&alpha,
mat, dims,
vec, 1,
&beta,
results, 1
)
);
cudaErrChk(cudaStreamEndCapture(stream1, &gemvGraph));
cudaStreamSynchronize(stream1);
cudaDeviceSynchronize();
for (int i = 0; i < dims; i++)
printf("%.2lf ", results[i]);
printf("\n");
cudaErrChk( cudaFree(vec) );
cudaErrChk( cudaFree(mat) );
cudaErrChk( cudaFree(results) );
return 0;
}
Here is the expected output:
vec:
0.50 1.00 1.50 2.00
mat:
1.00 2.00 3.00 4.00
5.00 6.00 7.00 8.00
9.00 10.00 11.00 12.00
13.00 14.00 15.00 16.00
results:
28.00 31.00 34.00 37.00
But I am getting the following output:
vec:
0.50 1.00 1.50 2.00
mat:
1.00 2.00 3.00 4.00
5.00 6.00 7.00 8.00
9.00 10.00 11.00 12.00
13.00 14.00 15.00 16.00
results:
0.00 0.00 0.00 0.00
I am wondering about what I have missed here and what might be the reason of getting the wrong result after calling stream capture routines. I would appreciate your help.
The graph creation process starts from line number 77.
Here is the pastebin link: https://pastebin.com/jpFiRSdE
To compile: nvcc -o gemm.x gemm.cu -lcublas
@Robert_Crovella , tagging you in case you get a chance to have a look at it.
Thanks,
Fazlay