cudaGraphAddKernelNode doesn't work when the host code is compiled with MSVC.

Hi,
I’m trying to create a very simple graph that adds two numbers. Creating the graph and adding the memcpy nodes works fine, but “cudaGraphAddKernelNode” returns code=8 (cudaErrorInvalidDeviceFunction) when I compile with MSVC.

When I compile with NVCC, it works.
The kernel is loaded from a PTX file, so it should also work when the host code is compiled with MSVC.

// Builds a four-node CUDA graph by hand (two host-to-device copies, one
// kernel node, one device-to-host copy), launches it once on a dedicated
// stream, and prints the single float result.
float *inputNode1_d, *inputNode2_d;
float *inputNode1_h, *inputNode2_h;
float *outputNode_d, *outputNode_h;

// Host-side staging buffers (pageable memory from malloc).
inputNode1_h = (float*)malloc(sizeof(float));
inputNode2_h = (float*)malloc(sizeof(float));
outputNode_h = (float*)malloc(sizeof(float));

// Device-side buffers, one float each.
checkCudaErrors(cudaMalloc(&inputNode1_d, sizeof(float)));
checkCudaErrors(cudaMalloc(&inputNode2_d, sizeof(float)));
checkCudaErrors(cudaMalloc(&outputNode_d, sizeof(float)));

*inputNode1_h = 1.0f;
*inputNode2_h = 1.0f;
*outputNode_h = 0.0f;

cudaStream_t streamForGraph;
cudaGraph_t graph;
std::vector<cudaGraphNode_t> nodeDependencies;
cudaGraphNode_t memcpyNode_1, memcpyNode_2, kernelNode, memcpyNode_3;

checkCudaErrors(cudaStreamCreate(&streamForGraph));

cudaKernelNodeParams kernelNodeParams = { 0 };
cudaMemcpy3DParms memcpyParams = { 0 };

checkCudaErrors(cudaGraphCreate(&graph, 0));

// Node 1: copy the first operand host -> device. The copy is described as a
// 3D transfer that is sizeof(float) bytes wide, 1 row high and 1 slice deep.
memcpyParams.srcArray = NULL;
memcpyParams.srcPos = make_cudaPos(0, 0, 0);
memcpyParams.srcPtr = make_cudaPitchedPtr(inputNode1_h, sizeof(float), 1, 1);
memcpyParams.dstArray = NULL;
memcpyParams.dstPos = make_cudaPos(0, 0, 0);
memcpyParams.dstPtr = make_cudaPitchedPtr(inputNode1_d, sizeof(float), 1, 1);
memcpyParams.extent = make_cudaExtent(sizeof(float), 1, 1);
memcpyParams.kind = cudaMemcpyHostToDevice;

// Root node: no dependencies.
checkCudaErrors(cudaGraphAddMemcpyNode(&memcpyNode_1, graph, NULL, 0, &memcpyParams));

// Node 2: copy the second operand host -> device (also a root node).
memset(&memcpyParams, 0, sizeof(memcpyParams));

memcpyParams.srcArray = NULL;
memcpyParams.srcPos = make_cudaPos(0, 0, 0);
memcpyParams.srcPtr = make_cudaPitchedPtr(inputNode2_h, sizeof(float), 1, 1);
memcpyParams.dstArray = NULL;
memcpyParams.dstPos = make_cudaPos(0, 0, 0);
memcpyParams.dstPtr = make_cudaPitchedPtr(inputNode2_d, sizeof(float), 1, 1);
memcpyParams.extent = make_cudaExtent(sizeof(float), 1, 1);
memcpyParams.kind = cudaMemcpyHostToDevice;

checkCudaErrors(cudaGraphAddMemcpyNode(&memcpyNode_2, graph, NULL, 0, &memcpyParams));

// The kernel node must wait for both input copies.
nodeDependencies.push_back(memcpyNode_1);
nodeDependencies.push_back(memcpyNode_2);

// One entry per kernel parameter; each entry points at the argument value.
void *kernelArgs[3] = { (void*)&inputNode1_d, (void*)&inputNode2_d, (void*)&outputNode_d };

// NOTE(review): the runtime-API cudaGraphAddKernelNode expects `func` to be
// the host-side symbol of a __global__ function that was compiled by nvcc and
// registered with the CUDA runtime. If addKernel is instead a CUfunction
// obtained from a PTX module via cuModuleGetFunction (as the original comment
// suggests), the runtime cannot resolve it and reports
// cudaErrorInvalidDeviceFunction. In that case build the node with the driver
// API instead (cuGraphAddKernelNode + CUDA_KERNEL_NODE_PARAMS) -- confirm
// against how addKernel is actually declared.
kernelNodeParams.func = (void*)addKernel; // Here I put kernel loaded from ptx
kernelNodeParams.gridDim = dim3(1, 1, 1);
kernelNodeParams.blockDim = dim3(1, 1, 1);
kernelNodeParams.sharedMemBytes = 0;
kernelNodeParams.kernelParams = (void **)kernelArgs;
kernelNodeParams.extra = NULL;

checkCudaErrors(cudaGraphAddKernelNode(&kernelNode, graph, nodeDependencies.data(), nodeDependencies.size(), &kernelNodeParams));

// The result copy depends only on the kernel node.
nodeDependencies.clear();
nodeDependencies.push_back(kernelNode);

// Node 4: copy the result device -> host.
memset(&memcpyParams, 0, sizeof(memcpyParams));

memcpyParams.srcArray = NULL;
memcpyParams.srcPos = make_cudaPos(0, 0, 0);
memcpyParams.srcPtr = make_cudaPitchedPtr(outputNode_d, sizeof(float), 1, 1);
memcpyParams.dstArray = NULL;
memcpyParams.dstPos = make_cudaPos(0, 0, 0);
memcpyParams.dstPtr = make_cudaPitchedPtr(outputNode_h, sizeof(float), 1, 1);
memcpyParams.extent = make_cudaExtent(sizeof(float), 1, 1);
memcpyParams.kind = cudaMemcpyDeviceToHost;
checkCudaErrors(cudaGraphAddMemcpyNode(&memcpyNode_3, graph, nodeDependencies.data(), nodeDependencies.size(), &memcpyParams));

// Passing a NULL node array only queries how many nodes the graph holds.
cudaGraphNode_t *nodes = NULL;
size_t numNodes = 0;
checkCudaErrors(cudaGraphGetNodes(graph, nodes, &numNodes));
printf("\nNum of nodes in the graph created manually = %zu\n", numNodes);

cudaGraphExec_t graphExec;
checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
checkCudaErrors(cudaGraphLaunch(graphExec, streamForGraph));

// Wait for the graph to finish before reading outputNode_h on the host.
checkCudaErrors(cudaStreamSynchronize(streamForGraph));
checkCudaErrors(cudaGraphExecDestroy(graphExec));
checkCudaErrors(cudaGraphDestroy(graph));
checkCudaErrors(cudaStreamDestroy(streamForGraph));

std::cout << *outputNode_h << std::endl;

// Device buffers came from cudaMalloc, so they go back through cudaFree.
checkCudaErrors(cudaFree(inputNode1_d));
checkCudaErrors(cudaFree(inputNode2_d));
checkCudaErrors(cudaFree(outputNode_d));

// Host buffers came from malloc, so they must be released with free();
// handing a malloc'd pointer to cudaFree is invalid.
free(inputNode1_h);
free(inputNode2_h);
free(outputNode_h);