Hello.
I’m adding CUDA-based post-processing to a D3D11 graphics app, where both the input and the output of the CUDA stage are D3D11 textures. The idea is to map, process, and unmap the textures every frame. However, I found that the map/unmap calls take a lot of time, almost as much as the processing itself. Is this expected? How can I improve it?
[2024-06-06 13:43:00.178] [logger] [info] interval #0: 1.602848ms cudaGraphicsMapResources
[2024-06-06 13:43:00.178] [logger] [info] interval #1: 0.001792ms cudaGraphicsSubResourceGetMappedArray
[2024-06-06 13:43:00.178] [logger] [info] interval #2: 0.057408ms cudaMemcpy2DFromArrayAsync
[2024-06-06 13:43:00.178] [logger] [info] interval #3: 3.857312ms cudaGraphicsUnmapResources
[2024-06-06 13:43:00.178] [logger] [info] interval #4: 9.328736ms infer
[2024-06-06 13:43:00.178] [logger] [info] interval #5: 1.3624ms cudaGraphicsMapResources
[2024-06-06 13:43:00.178] [logger] [info] interval #6: 0.001728ms cudaGraphicsSubResourceGetMappedArray
[2024-06-06 13:43:00.178] [logger] [info] interval #7: 0.058208ms cudaMemcpy2DToArrayAsync
[2024-06-06 13:43:00.178] [logger] [info] interval #8: 1.648864ms cudaGraphicsUnmapResources
cudaEvent_t evt[10];
for (int i = 0; i < 10; i++)
{
cudaEventCreate(&evt[i]);
}
// model input/output is NHWC
int ih = gPostSR->getInputSize(1);
int iw = gPostSR->getInputSize(2);
int ic = gPostSR->getInputSize(3);
int oh = gPostSR->getOutputSize(1);
int ow = gPostSR->getOutputSize(2);
int oc = gPostSR->getOutputSize(3);
size_t isize = 1 * ic * ih * iw;
size_t osize = 1 * oc * oh * ow;
cudaEventRecord(evt[0], gPostSR->getCudaStream());
cudaArray_t inArr;
check(cudaGraphicsMapResources(1, &inputRes, gPostSR->getCudaStream()));
cudaEventRecord(evt[1], gPostSR->getCudaStream());
check(cudaGraphicsSubResourceGetMappedArray(&inArr, inputRes, 0, 0));
cudaEventRecord(evt[2], gPostSR->getCudaStream());
check(cudaMemcpy2DFromArrayAsync(gPostSR->inputBuffer(), iw * ic, inArr, 0, 0, iw * ic, ih,
cudaMemcpyDeviceToDevice, gPostSR->getCudaStream()));
cudaEventRecord(evt[3], gPostSR->getCudaStream());
check(cudaGraphicsUnmapResources(1, &inputRes, gPostSR->getCudaStream()));
cudaEventRecord(evt[4], gPostSR->getCudaStream());
bool ret = gPostSR->infer();
cudaEventRecord(evt[5], gPostSR->getCudaStream());
cudaArray_t outArr;
check(cudaGraphicsMapResources(1, &outputRes, gPostSR->getCudaStream()));
cudaEventRecord(evt[6], gPostSR->getCudaStream());
check(cudaGraphicsSubResourceGetMappedArray(&outArr, outputRes, 0, 0));
cudaEventRecord(evt[7], gPostSR->getCudaStream());
check(cudaMemcpy2DToArrayAsync(outArr, 0, 0, gPostSR->outputBuffer(), ow * oc, ow * oc, oh,
cudaMemcpyDeviceToDevice, gPostSR->getCudaStream()));
cudaEventRecord(evt[8], gPostSR->getCudaStream());
check(cudaGraphicsUnmapResources(1, &outputRes, gPostSR->getCudaStream()));
cudaEventRecord(evt[9], gPostSR->getCudaStream());
for (int i = 0; i <= 9; i++)
cudaEventSynchronize(evt[i]);
float times[10];
for (int i = 0; i < 9; i++)
{
cudaEventElapsedTime(×[i], evt[i], evt[i+1]);
gLogger->info("interval #{}: {}ms", i, times[i]);
}
gLogger->info("===========");
check(cudaStreamSynchronize(gPostSR->getCudaStream()));
for (int i = 0; i < 10; i++)
{
cudaEventDestroy(evt[i]);
}