The code below is from one of the TensorRT samples (SampleUffMNIST), which converts a TensorFlow UFF model into a TensorRT inference engine and runs inference on MNIST images. I am very new to TensorRT and GPUs, so I am finding it difficult to convert the infer() method below to run the inferences in parallel instead of one by one (essentially eliminating the loop — e.g., by using enqueue() with CUDA streams rather than the synchronous execute() call). I could not find any related topics either. Please be kind enough to help me out.
CODE
bool SampleUffMNIST::infer()
{
    // RAII manager that owns matching host/device buffers for every
    // engine binding; sized for the configured batch.
    samplesCommon::BufferManager buffers(mEngine, mParams.batchSize);

    auto context = SampleUniquePtr<nvinfer1::IExecutionContext>(mEngine->createExecutionContext());
    if (!context)
    {
        return false;
    }

    bool allCorrect = true;
    float elapsedMs = 0;

    // Run one synchronous inference per digit image (0 through kDIGITS-1),
    // accumulating only the kernel-execution time.
    for (int digit = 0; digit < kDIGITS; ++digit)
    {
        if (!processInput(buffers, mParams.inputTensorNames[0], digit))
        {
            return false;
        }

        // Stage the preprocessed input on the device.
        buffers.copyInputToDevice();

        // Time the execute() call itself, excluding host<->device copies.
        const auto start = std::chrono::high_resolution_clock::now();
        const bool ok = context->execute(mParams.batchSize, buffers.getDeviceBindings().data());
        const auto stop = std::chrono::high_resolution_clock::now();
        if (!ok)
        {
            return false;
        }
        elapsedMs += std::chrono::duration<float, std::milli>(stop - start).count();

        // Bring the results back to the host and validate the prediction.
        buffers.copyOutputToHost();
        allCorrect &= verifyOutput(buffers, mParams.outputTensorNames[0], digit);
    }

    // Report the mean per-digit execution time.
    elapsedMs /= kDIGITS;
    gLogInfo << "Average over " << kDIGITS << " runs is " << elapsedMs << " ms." << std::endl;
    return allCorrect;
}