Why does cudaGraphLaunch not behave as an async api

Found from gdb, the program is blocked in cudaGraphLaunch


#include<stdio.h>
#include <string>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
using namespace std;

#define CUDA_CHECK(err) if(err != cudaSuccess) { printf("line %d, error: %d\n", __LINE__, (int)err);}

__global__ void kernel(int* in)
{
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	in[tid]++;
	while(1){}
}

int main(int argc, char** argv)
{
	cudaGraph_t graph;
	cudaGraphExec_t exec;
	int* h_buffer = (int*)malloc(100 * sizeof(int));
	memset(h_buffer, 0, 100 * sizeof(int));

	CUDA_CHECK(cudaGraphCreate(&graph, 0));
	for (int i = 0; i < 5; i++)
	{
		cudaGraphNode_t allocNode, copy1DFromHostNode, kernelNode, copy1DToHostNode, freeNode;

		cudaMemAllocNodeParams allocParamater;
		memset(&allocParamater, 0, sizeof(cudaMemAllocNodeParams));
		allocParamater.bytesize = 100 * sizeof(int);
		allocParamater.poolProps.allocType = cudaMemAllocationTypePinned;
		allocParamater.poolProps.location.id = 0;
		allocParamater.poolProps.location.type = cudaMemLocationTypeDevice;
		CUDA_CHECK(cudaGraphAddMemAllocNode(&allocNode, graph, NULL, 0, &allocParamater));

		CUDA_CHECK(cudaGraphAddMemcpyNode1D(&copy1DFromHostNode, graph, &allocNode, 1, allocParamater.dptr, h_buffer, 100 * sizeof(int), cudaMemcpyHostToDevice));

		cudaKernelNodeParams kernelParamater = {0};
		void* kernalargs[1] = { (void*)(&allocParamater.dptr)};
		kernelParamater.blockDim = 100;
		kernelParamater.gridDim = 1;
		kernelParamater.func = (void*)kernel;
		kernelParamater.sharedMemBytes = 0;
		kernelParamater.kernelParams = kernalargs;
		CUDA_CHECK(cudaGraphAddKernelNode(&kernelNode, graph, &copy1DFromHostNode, 1, &kernelParamater));

		CUDA_CHECK(cudaGraphAddMemcpyNode1D(&copy1DToHostNode, graph, &kernelNode, 1, h_buffer, allocParamater.dptr, 100 * sizeof(int), cudaMemcpyDeviceToHost));

		CUDA_CHECK(cudaGraphAddMemFreeNode(&freeNode, graph, &copy1DToHostNode, 1, allocParamater.dptr));
	}

	CUDA_CHECK(cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0));
	CUDA_CHECK(cudaGraphLaunch(exec, 0));

	//CUDA_CHECK(cudaStreamSynchronize(0));


	string path = "./debug.dot";
	cudaGraphDebugDotPrint(graph, path.c_str(), cudaGraphDebugDotFlagsVerbose);
	CUDA_CHECK(cudaGraphDestroy(graph));
	CUDA_CHECK(cudaGraphExecDestroy(exec));

	return 0;
}

In the future, please format your code correctly. A possible simple set of steps is:

  1. Edit your post
  2. Select the code
  3. click the </> button at the top of the edit box
  4. Save your changes

You have a memcpy operation D->H that is targeting non-pinned memory (h_buffer). That will be a blocking operation. You may wish to study basic CUDA concurrency principles (e.g. session 7 here)