cudaGraphics Map/Unmap of D3D11 resources is slow

Hello.
I’m adding a CUDA-based post-process to a D3D11 graphics app, where the input and output of CUDA are both D3D11 textures. The idea is to map/process/unmap every frame. However, I found that map/unmap takes a lot of time, almost as much as the processing itself. Is this expected? How can I improve it?

[2024-06-06 13:43:00.178] [logger] [info] interval #0: 1.602848ms cudaGraphicsMapResources
[2024-06-06 13:43:00.178] [logger] [info] interval #1: 0.001792ms cudaGraphicsSubResourceGetMappedArray
[2024-06-06 13:43:00.178] [logger] [info] interval #2: 0.057408ms cudaMemcpy2DFromArrayAsync
[2024-06-06 13:43:00.178] [logger] [info] interval #3: 3.857312ms cudaGraphicsUnmapResources
[2024-06-06 13:43:00.178] [logger] [info] interval #4: 9.328736ms infer
[2024-06-06 13:43:00.178] [logger] [info] interval #5: 1.3624ms cudaGraphicsMapResources
[2024-06-06 13:43:00.178] [logger] [info] interval #6: 0.001728ms cudaGraphicsSubResourceGetMappedArray
[2024-06-06 13:43:00.178] [logger] [info] interval #7: 0.058208ms cudaMemcpy2DToArrayAsync
[2024-06-06 13:43:00.178] [logger] [info] interval #8: 1.648864ms cudaGraphicsUnmapResources

		cudaEvent_t evt[10];
		for (int i = 0; i < 10; i++)
		{
			cudaEventCreate(&evt[i]);
		}

		// model input/output is NHWC
		int ih = gPostSR->getInputSize(1);
		int iw = gPostSR->getInputSize(2);
		int ic = gPostSR->getInputSize(3);
		int oh = gPostSR->getOutputSize(1);
		int ow = gPostSR->getOutputSize(2);
		int oc = gPostSR->getOutputSize(3);
		size_t isize = 1 * ic * ih * iw;
		size_t osize = 1 * oc * oh * ow;

		cudaEventRecord(evt[0], gPostSR->getCudaStream());
		cudaArray_t inArr;
		check(cudaGraphicsMapResources(1, &inputRes, gPostSR->getCudaStream()));
		cudaEventRecord(evt[1], gPostSR->getCudaStream());
		check(cudaGraphicsSubResourceGetMappedArray(&inArr, inputRes, 0, 0));
		cudaEventRecord(evt[2], gPostSR->getCudaStream());

		check(cudaMemcpy2DFromArrayAsync(gPostSR->inputBuffer(), iw * ic, inArr, 0, 0, iw * ic, ih,
			cudaMemcpyDeviceToDevice, gPostSR->getCudaStream()));
		cudaEventRecord(evt[3], gPostSR->getCudaStream());

		check(cudaGraphicsUnmapResources(1, &inputRes, gPostSR->getCudaStream()));
		cudaEventRecord(evt[4], gPostSR->getCudaStream());

		bool ret = gPostSR->infer();
		cudaEventRecord(evt[5], gPostSR->getCudaStream());

		cudaArray_t outArr;
		check(cudaGraphicsMapResources(1, &outputRes, gPostSR->getCudaStream()));
		cudaEventRecord(evt[6], gPostSR->getCudaStream());
		check(cudaGraphicsSubResourceGetMappedArray(&outArr, outputRes, 0, 0));
		cudaEventRecord(evt[7], gPostSR->getCudaStream());

		check(cudaMemcpy2DToArrayAsync(outArr, 0, 0, gPostSR->outputBuffer(), ow * oc, ow * oc, oh,
			cudaMemcpyDeviceToDevice, gPostSR->getCudaStream()));
		cudaEventRecord(evt[8], gPostSR->getCudaStream());

		check(cudaGraphicsUnmapResources(1, &outputRes, gPostSR->getCudaStream()));
		cudaEventRecord(evt[9], gPostSR->getCudaStream());
		for (int i = 0; i <= 9; i++)
			cudaEventSynchronize(evt[i]);

		float times[10];
		for (int i = 0; i < 9; i++)
		{
			cudaEventElapsedTime(&times[i], evt[i], evt[i+1]);
			gLogger->info("interval #{}: {}ms", i, times[i]);
		}
		gLogger->info("===========");

		check(cudaStreamSynchronize(gPostSR->getCudaStream()));

		for (int i = 0; i < 10; i++)
		{
			cudaEventDestroy(evt[i]);
		}