In sampleUffSSD.cpp, the function doInference contains the following code:
// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
CHECK(cudaMemcpyAsync(buffers[inputIndex], inputData, batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
auto t_start = std::chrono::high_resolution_clock::now();
context.execute(batchSize, &buffers[0]);
auto t_end = std::chrono::high_resolution_clock::now();
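How does the code ensure that the copy to buffers[inputIndex] has completed before IExecutionContext::execute runs? The copy is issued asynchronously with cudaMemcpyAsync on `stream`, and the snippet shows no explicit synchronization (for example, cudaStreamSynchronize) between that call and context.execute.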