Hi,
I’m trying to create a very simple graph that adds two numbers. Creating the graph and adding the memcpy nodes works fine, but “cudaGraphAddKernelNode” returns code=8 (cudaErrorInvalidDeviceFunction) when I compile with MSVC.
When I use NVCC it works.
The kernel is loaded from a PTX file, so it should also work when the host code is compiled with MSVC.
// Builds a four-node CUDA graph by hand and runs it once:
//   memcpyNode_1 (H2D, input 1) --\
//                                  +--> kernelNode (addKernel) --> memcpyNode_3 (D2H, result)
//   memcpyNode_2 (H2D, input 2) --/
// Each buffer holds a single float. The result is printed to stdout after the
// graph has finished executing on streamForGraph.

// Device (_d) and host (_h) buffers, one float each.
float *inputNode1_d, *inputNode2_d;
float *inputNode1_h, *inputNode2_h;
float *outputNode_d, *outputNode_h;

inputNode1_h = (float*)malloc(sizeof(float));
inputNode2_h = (float*)malloc(sizeof(float));
outputNode_h = (float*)malloc(sizeof(float));
// The host buffers are dereferenced right below, so fail loudly on allocation failure.
if (inputNode1_h == NULL || inputNode2_h == NULL || outputNode_h == NULL) {
    fprintf(stderr, "Host allocation failed\n");
    exit(EXIT_FAILURE);
}

checkCudaErrors(cudaMalloc(&inputNode1_d, sizeof(float)));
checkCudaErrors(cudaMalloc(&inputNode2_d, sizeof(float)));
checkCudaErrors(cudaMalloc(&outputNode_d, sizeof(float)));

*inputNode1_h = 1;
*inputNode2_h = 1;
*outputNode_h = 0;

cudaStream_t streamForGraph;
cudaGraph_t graph;
std::vector<cudaGraphNode_t> nodeDependencies;
cudaGraphNode_t memcpyNode_1, memcpyNode_2, kernelNode, memcpyNode_3;

checkCudaErrors(cudaStreamCreate(&streamForGraph));

cudaKernelNodeParams kernelNodeParams = { 0 };
cudaMemcpy3DParms memcpyParams = { 0 };

checkCudaErrors(cudaGraphCreate(&graph, 0));

// Root node 1: copy the first input from host to device.
memcpyParams.srcArray = NULL;
memcpyParams.srcPos = make_cudaPos(0, 0, 0);
memcpyParams.srcPtr = make_cudaPitchedPtr(inputNode1_h, sizeof(float), 1, 1);
memcpyParams.dstArray = NULL;
memcpyParams.dstPos = make_cudaPos(0, 0, 0);
memcpyParams.dstPtr = make_cudaPitchedPtr(inputNode1_d, sizeof(float), 1, 1);
memcpyParams.extent = make_cudaExtent(sizeof(float), 1, 1);
memcpyParams.kind = cudaMemcpyHostToDevice;
checkCudaErrors(cudaGraphAddMemcpyNode(&memcpyNode_1, graph, NULL, 0, &memcpyParams));

// Root node 2: copy the second input from host to device.
memset(&memcpyParams, 0, sizeof(memcpyParams));
memcpyParams.srcArray = NULL;
memcpyParams.srcPos = make_cudaPos(0, 0, 0);
memcpyParams.srcPtr = make_cudaPitchedPtr(inputNode2_h, sizeof(float), 1, 1);
memcpyParams.dstArray = NULL;
memcpyParams.dstPos = make_cudaPos(0, 0, 0);
memcpyParams.dstPtr = make_cudaPitchedPtr(inputNode2_d, sizeof(float), 1, 1);
memcpyParams.extent = make_cudaExtent(sizeof(float), 1, 1);
memcpyParams.kind = cudaMemcpyHostToDevice;
checkCudaErrors(cudaGraphAddMemcpyNode(&memcpyNode_2, graph, NULL, 0, &memcpyParams));

// The kernel must wait for both uploads.
nodeDependencies.push_back(memcpyNode_1);
nodeDependencies.push_back(memcpyNode_2);

// One entry per kernel parameter; each entry points at the argument value.
void *kernelArgs[3] = { (void*)&inputNode1_d, (void*)&inputNode2_d, (void*)&outputNode_d };

// NOTE(review): cudaGraphAddKernelNode is a runtime-API call and expects `func`
// to be the host-side symbol of a __global__ function that nvcc compiled and
// registered in this module. A function obtained from a PTX file through the
// driver API (cuModuleLoad* / cuModuleGetFunction) is a CUfunction and is not
// known to the runtime, which would explain cudaErrorInvalidDeviceFunction when
// the host code is built with MSVC only. In that case add the node with
// cuGraphAddKernelNode and CUDA_KERNEL_NODE_PARAMS instead -- confirm how
// addKernel is declared.
kernelNodeParams.func = (void*)addKernel; // Here I put kernel loaded from ptx
kernelNodeParams.gridDim = dim3(1, 1, 1);
kernelNodeParams.blockDim = dim3(1, 1, 1);
kernelNodeParams.sharedMemBytes = 0;
kernelNodeParams.kernelParams = (void **)kernelArgs;
kernelNodeParams.extra = NULL;
checkCudaErrors(cudaGraphAddKernelNode(&kernelNode, graph, nodeDependencies.data(), nodeDependencies.size(), &kernelNodeParams));

// The download depends only on the kernel.
nodeDependencies.clear();
nodeDependencies.push_back(kernelNode);

memset(&memcpyParams, 0, sizeof(memcpyParams));
memcpyParams.srcArray = NULL;
memcpyParams.srcPos = make_cudaPos(0, 0, 0);
memcpyParams.srcPtr = make_cudaPitchedPtr(outputNode_d, sizeof(float), 1, 1);
memcpyParams.dstArray = NULL;
memcpyParams.dstPos = make_cudaPos(0, 0, 0);
memcpyParams.dstPtr = make_cudaPitchedPtr(outputNode_h, sizeof(float), 1, 1);
memcpyParams.extent = make_cudaExtent(sizeof(float), 1, 1);
memcpyParams.kind = cudaMemcpyDeviceToHost;
checkCudaErrors(cudaGraphAddMemcpyNode(&memcpyNode_3, graph, nodeDependencies.data(), nodeDependencies.size(), &memcpyParams));

// Passing a NULL node array only queries the node count.
cudaGraphNode_t *nodes = NULL;
size_t numNodes = 0;
checkCudaErrors(cudaGraphGetNodes(graph, nodes, &numNodes));
printf("\nNum of nodes in the graph created manually = %zu\n", numNodes);

cudaGraphExec_t graphExec;
checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
checkCudaErrors(cudaGraphLaunch(graphExec, streamForGraph));
// Wait for the graph to finish before reading outputNode_h on the host.
checkCudaErrors(cudaStreamSynchronize(streamForGraph));

checkCudaErrors(cudaGraphExecDestroy(graphExec));
checkCudaErrors(cudaGraphDestroy(graph));
checkCudaErrors(cudaStreamDestroy(streamForGraph));

std::cout << *outputNode_h << std::endl;

// Device buffers came from cudaMalloc, so release them with cudaFree.
checkCudaErrors(cudaFree(inputNode1_d));
checkCudaErrors(cudaFree(inputNode2_d));
checkCudaErrors(cudaFree(outputNode_d));
// Host buffers came from malloc, so release them with free (not cudaFree).
free(inputNode1_h);
free(inputNode2_h);
free(outputNode_h);