I am trying to capture the graph associated with a cuBLAS SGEMM call (CUDA 11.5 on Ampere). I modified the matrixMultiply function of the matrixMulCUBLAS CUDA sample as shown below. When I walk the captured graph's nodes, cudaGraphKernelNodeGetParams fails with cudaErrorInvalidDeviceFunction for the kernel node created by stream capture; see the line tagged FAILURE. What am I doing wrong? (A condensed listing with just the capture and node-inspection path follows the full function.)
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size) {
  cudaDeviceProp deviceProp;
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
  int block_size = 32;
  // set seed for rand()
  srand(2006);
  // allocate host memory for matrices A and B
  unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
  unsigned int mem_size_A = sizeof(float) * size_A;
  float *h_A = (float *)malloc(mem_size_A);
  unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
  unsigned int mem_size_B = sizeof(float) * size_B;
  float *h_B = (float *)malloc(mem_size_B);
  // set seed for rand()
  srand(2006);
  // initialize host memory
  randomInit(h_A, size_A);
  randomInit(h_B, size_B);
  // allocate device memory
  float *d_A, *d_B, *d_C;
  unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
  unsigned int mem_size_C = sizeof(float) * size_C;
  // allocate host memory for the result
  float *h_C = (float *)malloc(mem_size_C);
  float *h_CUBLAS = (float *)malloc(mem_size_C);
  checkCudaErrors(cudaMalloc((void **)&d_A, mem_size_A));
  checkCudaErrors(cudaMalloc((void **)&d_B, mem_size_B));
  checkCudaErrors(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMalloc((void **)&d_C, mem_size_C));
  // setup execution parameters
  dim3 threads(block_size, block_size);
  dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y);
  // create and start timer
  printf("Computing result using CUBLAS...");
  // execute the kernel
  int nIter = 30;
  // CUBLAS version 2.0
  {
    const float alpha = 1.0f;
    const float beta = 0.0f;
    cublasHandle_t handle;
    cudaStream_t stream;
    cudaEvent_t start, stop;
    cudaGraph_t graph;
    cudaGraphExec_t graphExec = NULL;
    cudaKernelNodeParams NodeParams;
    cudaMemsetParams MemsetParams;
    std::vector<cudaGraphNode_t> vnodes;
    checkCudaErrors(cublasCreate(&handle));
    checkCudaErrors(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE));
    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    cublasSetStream(handle, stream);
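    // Begin stream capture: work issued into this stream is recorded into a
    // graph instead of being executed.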
    checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
    // Issue the cuBLAS SGEMM so it is captured into the graph
    checkCudaErrors(cublasSgemm(
        handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA,
        matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA,
        &beta, d_C, matrix_size.uiWB));
    checkCudaErrors(cudaStreamEndCapture(stream, &graph));
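    // Walk the captured graph: get the node count, fetch the node handles, and
    // query each node's parameters according to its type.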
    size_t numNodes;
    checkCudaErrors(cudaGraphGetNodes(graph, NULL, &numNodes));
    vnodes.resize(numNodes);
    checkCudaErrors(cudaGraphGetNodes(graph, vnodes.data(), &numNodes));
    std::vector<cudaGraphNodeType> nodeType(numNodes);
    for (size_t i = 0; i < numNodes; i++) {
      checkCudaErrors(cudaGraphNodeGetType(vnodes[i], &nodeType[i]));
      switch (nodeType[i]) {
        case cudaGraphNodeTypeMemset:
          checkCudaErrors(cudaGraphMemsetNodeGetParams(vnodes[i], &MemsetParams));
          break;
        case cudaGraphNodeTypeKernel:
          checkCudaErrors(cudaGraphKernelNodeGetParams(vnodes[i], &NodeParams));
          // FAILURE: this call returns code=98 (cudaErrorInvalidDeviceFunction)
          break;
      }
    }
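    // First pass: graphExec is still NULL, so instantiate the executable graph;
    // on later passes try to update it in place and re-instantiate if that fails.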
    if (graphExec == NULL) {
      checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
    } else {
      cudaGraphExecUpdateResult updateResult_out;
      checkCudaErrors(
          cudaGraphExecUpdate(graphExec, graph, NULL, &updateResult_out));
      if (updateResult_out != cudaGraphExecUpdateSuccess) {
        checkCudaErrors(cudaGraphExecDestroy(graphExec));
        printf("graph update failed with error - %d\n", updateResult_out);
        checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
      }
    }
    checkCudaErrors(cudaGraphLaunch(graphExec, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));
    // Allocate CUDA events that we'll use for timing
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));
    // Record the start event
    checkCudaErrors(cudaEventRecord(start, NULL));
    for (int j = 0; j < nIter; j++) {
      // note cublas is column primary!
      // need to transpose the order
      checkCudaErrors(cudaGraphLaunch(graphExec, stream));
      checkCudaErrors(cudaStreamSynchronize(stream));
    }
    printf("done.\n");
    // Record the stop event
    checkCudaErrors(cudaEventRecord(stop, NULL));
    // Wait for the stop event to complete
    checkCudaErrors(cudaEventSynchronize(stop));
    float msecTotal = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
    // Compute and print the performance
    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiHC *
                               (double)matrix_size.uiWC *
                               (double)matrix_size.uiHB;
    double gigaFlops =
        (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
           gigaFlops, msecPerMatrixMul, flopsPerMatrixMul);
    // copy result from device to host
    checkCudaErrors(
        cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost));
    // Destroy the handle
    checkCudaErrors(cublasDestroy(handle));
    checkCudaErrors(cudaStreamDestroy(stream));
  }
  // compute reference solution
  printf("Computing result using host CPU...");
  float *reference = (float *)malloc(mem_size_C);
  matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA,
               matrix_size.uiWB);
  printf("done.\n");
  // check result (CUBLAS)
  bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f);
  if (resCUBLAS != true) {
    printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100,
              1.0e-5f);
  }
  printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n",
         (true == resCUBLAS) ? "PASS" : "FAIL");
  printf(
      "\nNOTE: The CUDA Samples are not meant for performance measurements. "
      "Results may vary when GPU Boost is enabled.\n");
  // clean up memory
  free(h_A);
  free(h_B);
  free(h_C);
  free(reference);
  checkCudaErrors(cudaFree(d_A));
  checkCudaErrors(cudaFree(d_B));
  checkCudaErrors(cudaFree(d_C));
  if (resCUBLAS == true) {
    return EXIT_SUCCESS;  // return value = 0
  } else {
    return EXIT_FAILURE;  // return value = 1
  }
}
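
For convenience, here is a condensed version of just the capture and node-inspection path, stripped of the sample's timing and verification code. The 256x256 size, the default (host) pointer mode, and the omitted error checking are simplifications made for this reduced listing; they are not in the full function above.

// Minimal sketch of the capture + node inspection path (simplified from the
// code above; sizes and missing error checks are simplifications).
#include <cstdio>
#include <vector>
#include <cublas_v2.h>
#include <cuda_runtime.h>

int main() {
  const int n = 256;  // arbitrary square size for the reduced example
  float *d_A, *d_B, *d_C;
  cudaMalloc(&d_A, n * n * sizeof(float));
  cudaMalloc(&d_B, n * n * sizeof(float));
  cudaMalloc(&d_C, n * n * sizeof(float));

  cublasHandle_t handle;
  cublasCreate(&handle);
  cudaStream_t stream;
  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
  cublasSetStream(handle, stream);

  const float alpha = 1.0f, beta = 0.0f;  // default host pointer mode here

  // Capture a single SGEMM into a graph.
  cudaGraph_t graph;
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
              &alpha, d_B, n, d_A, n, &beta, d_C, n);
  cudaStreamEndCapture(stream, &graph);

  // Walk the captured nodes and try to read the kernel-node parameters,
  // which is where the full code above reports cudaErrorInvalidDeviceFunction.
  size_t numNodes = 0;
  cudaGraphGetNodes(graph, nullptr, &numNodes);
  std::vector<cudaGraphNode_t> nodes(numNodes);
  cudaGraphGetNodes(graph, nodes.data(), &numNodes);
  for (size_t i = 0; i < numNodes; ++i) {
    cudaGraphNodeType type;
    cudaGraphNodeGetType(nodes[i], &type);
    if (type == cudaGraphNodeTypeKernel) {
      cudaKernelNodeParams p;
      cudaError_t err = cudaGraphKernelNodeGetParams(nodes[i], &p);
      printf("node %zu: cudaGraphKernelNodeGetParams -> %s\n",
             i, cudaGetErrorString(err));
    }
  }

  cudaGraphDestroy(graph);
  cublasDestroy(handle);
  cudaStreamDestroy(stream);
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);
  return 0;
}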