How to do Batch Execution on TensorRT

Hi, all

I am trying to do batch execution on this TensorRT sample code.

To do that,
I changed `int batchSize = 1` to `int batchSize = 4` in the code, as follows.

I am not sure that it is enough to make this code do batch execution.
Is there any more code I have to change for batch execution?

(Plus, Is there any way to check whether this program really does batch execution or not?)

Any help would be greatly appreciated.
Thanks in advance.


For reference,
main.cpp in

#include "tools.h"

#include <unistd.h>

#include <atomic>
#include <cassert>
#include <chrono>
#include <condition_variable>
#include <csignal>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <future>
#include <iostream>
#include <iterator>
#include <mutex>
#include <numeric>
#include <stdexcept>
#include <string>
#include <thread>
#include <vector>

using namespace nvinfer1;

class Task {
	Task(const char* filename, int dlaCore, int streamCreateFlag = cudaStreamDefault, int eventCreateFlag = cudaEventDefault);
	void exec_once(int batchSize = 4) {
		const bool success = context->enqueue(batchSize,, stream.get(), nullptr);

	enum class SyncType {
	std::thread repeatWithSync(SyncType syncType, std::function<void(Task&)> threadInit = nullptr);

	static void signalHandler(int s) {
		gStop = true;

	static bool shouldStop() {
		return gStop;

	// called from master thread when repeatWithSync is running
	float reportFPS() {
		const size_t frames = counter->load();
		counter->fetch_sub(frames); //reset counter safely
		auto timeEnd = std::chrono::steady_clock::now();
		const float fps = frames / std::chrono::duration_cast<std::chrono::duration<float>>(timeEnd - timeBeg).count();
		timeBeg = timeEnd;
		return fps;
	static Logger gLogger;
	static std::atomic_bool gStop;

	UniqPtr<IRuntime> runtime = UniqPtr<IRuntime>{ createInferRuntime(gLogger) };
	std::unique_ptr<std::atomic_size_t> counter = std::unique_ptr<std::atomic_size_t>(new std::atomic_size_t(0));
	UniqPtr<ICudaEngine> engine = nullptr;
	UniqPtr<IExecutionContext> context = nullptr;
	std::vector<UniqPtr<char, CuMemDeleter>> bindings;
	std::vector<void*> bindingArray;//same content as bindings
	std::unique_ptr<CUstream_st, StreamDeleter> stream;
	std::unique_ptr<CUevent_st, EventDeleter> event;

	//for use by host thread
	std::chrono::time_point<std::chrono::steady_clock> timeBeg;

Logger Task::gLogger;
std::atomic_bool Task::gStop;

Task::Task(const char* filename, int dlaCore, int streamCreateFlag, int eventCreateFlag)
	: stream{makeCudaStream(streamCreateFlag)}
	, event{makeCudaEvent(eventCreateFlag)}
	if (dlaCore >= 0)
	std::cout << "Load engine from :" << filename << std::endl;
	std::ifstream fin(filename, std::ios::binary);
	std::vector<char> inBuffer((std::istreambuf_iterator<char>(fin)), std::istreambuf_iterator<char>());
	engine.reset(runtime->deserializeCudaEngine(, inBuffer.size(), nullptr));

	const int nbBindings = engine->getNbBindings();
	for (int i = 0; i < nbBindings; i++) {
		const auto dataType = engine->getBindingDataType(i);
		const int elemSize = [&]() -> int {
			switch (dataType) {
			case DataType::kFLOAT: return 4;
			case DataType::kHALF: return 2;
			default: throw std::runtime_error("invalid data type");
		const auto dims = engine->getBindingDimensions(i);
		const int bindingSize = elemSize * std::accumulate(dims.d, &dims.d[dims.nbDims], 1, std::multiplies<int>{});

// Spawn the worker thread: run exec_once() until shouldStop(), waiting for
// each batch to complete according to `syncType`. SyncType::Event with
// cudaEventBlockingSync lets the host thread sleep instead of spin-waiting.
std::thread Task::repeatWithSync(Task::SyncType syncType, std::function<void(Task&)> threadInit) {
	return std::thread([this, syncType, threadInit]() {
		if (threadInit)
			threadInit(*this); // per-thread user setup (runs on the worker)
		timeBeg = std::chrono::steady_clock::now();
		while (!shouldStop()) {
			exec_once();
			switch (syncType) {
			case SyncType::Stream: checkCudaErrors(cudaStreamSynchronize(stream.get())); break;
			case SyncType::Event:
				checkCudaErrors(cudaEventRecord(event.get(), stream.get()));
				checkCudaErrors(cudaEventSynchronize(event.get()));
				break;
			default: throw std::runtime_error("invalid sync type");
			}
		}
	});
}

int main(int argc, char* argv[])
	// Configurations
	const int streamCreateFlags = cudaStreamDefault;
	const int eventCreateFlags = cudaEventBlockingSync;
	auto threadInit = [](Task& task)->void {
		// If you want to do something at the begining of each worker threads, put it here.
		// ...
	const Task::SyncType syncType = Task::SyncType::Event;

	// Configuration is done. Now we start.
	signal(SIGINT, Task::signalHandler);

	std::vector<Task> tasks;
	for (int i = 0; i < argc-1; i++) {
		const int dlaCore = std::stoi(argv[i + 1]);
		const char* filename = dlaCore < 0 ? "gpu.engine" : "dla.engine";
		tasks.emplace_back(filename, dlaCore, streamCreateFlags, eventCreateFlags);

	std::vector<std::thread> workers;
	for (Task& task : tasks)
		workers.emplace_back(task.repeatWithSync(syncType, threadInit));

	while (!Task::shouldStop()) {
		std::cout << "FPS:\t";
		for (auto& task : tasks)
			std::cout << task.reportFPS() << "\t";
		std::cout << std::endl;

	for (auto& thrd : workers)

	return 0;


The sample used a pre-generated TensorRT engine.
Please also make sure you have created the engine with the expected batch size.

For example:

$ /usr/src/tensorrt/bin/trtexec --maxBatch=4 ...


Hi, @AastaLLL

Thanks for the reply.

I used the pre-generated TensorRT engine with --maxBatch=4.

However, even if I increased the batch size from 1 to 8, the average execution time of “context->enqueue(batchSize, bindingArray.data(), stream.get(), nullptr);” didn’t change.

So, I guess there is something more I have to do for batch execution.

Is there anything I should know for batch execution on the above code?
( Is there any more code I have to change for batch execution?)

Or, could you tell me the way how to check if this program really does batch execution?




This sounds correct to us.

If the model doesn’t occupy all the resources, TensorRT will launch N batches parallelly.
The time should be much smaller than launching one batch N times.

You can enable the output dump option for validation.
It’s expected to get N outputs if inferencing with batch size =N.

$ /usr/src/tensorrt/bin/trtexec --dumpOutput ...


This topic was automatically closed 60 days after the last reply. New replies are no longer allowed.