I want to have 5 models work together in a single stream.
Each model's ONNX file is loaded and parsed separately.
An execution context is created for each model and stored in an array.
For the contexts at indices 2, 3, and 4 (0-based), the results are correct for every batch.
But for the contexts at indices 0 and 1, the results are garbage.
TensorRT Version: 8.0.1
GPU Type: 1050ti
Nvidia Driver Version: 470.82.01
CUDA Version: 11.4
CUDNN Version: NA
Operating System + Version: Ubuntu 18.04
Python Version (if applicable): 3.8
TensorFlow Version (if applicable): NA
PyTorch Version (if applicable): NA
Baremetal or Container (if container which image + tag): NA
Relevant Files
model_1.onnx (462.8 KB)
model_2.onnx (462.8 KB)
model_3.onnx (462.9 KB)
Steps To Reproduce
// Table of per-model execution contexts; load_engine() stores the context of
// model `featuresindex` at colorContext[featuresindex].
static IExecutionContext** colorContext;
colorContext = static_cast<IExecutionContext**>(malloc(NumColorModels * sizeof(IExecutionContext*)));
// Device buffers for the colour models. load_engine() uses slot
// 2 * featuresindex for a model's input and 2 * featuresindex + 1 for its output.
static float* buffers_color[10];
// Parses the ONNX model at `filename`, builds a TensorRT engine from it and
// writes the serialized plan to `enginename`.
//
// `format` selects the input layout: "NCHW" for a 4-D image input, or "NF<k>"
// for a 2-D feature input whose width is FeatureSizeList[k].
// `isFP16` is not referenced anywhere in the visible body.
// Returns -1 if the ONNX file cannot be read or the plan file cannot be
// opened, 1 otherwise.
//
// NOTE(review): this excerpt appears to have lost its closing braces and a few
// statements during extraction; the comments below describe only what is visible.
int load_onnx(char *filename, char *enginename, char* format, int isFP16){
// The stream is only used to test that the ONNX file is readable.
std::ifstream file(filename, std::ios::binary);
spdlog::debug("Onnx path: {}", filename);
spdlog::debug("TRT path: {}", enginename);
if (!file.good()) {
std::cerr << "ONNX read " << filename << " error!" << std::endl;
return -1;
IBuilder* builder = createInferBuilder(gLogger);
// NOTE(review): the body of the "NCHW" branch is missing from this excerpt.
if(strcmp(format, "NCHW") == 0){
else if (strncmp(format, "NF", 2) == 0) {
// NOTE(review): presumably has no effect on an explicit-batch network — confirm
// against the TensorRT documentation.
builder->setMaxBatchSize(BatchSize * NumBodypart);
// NOTE(review): the expression is truncated here; presumably it selected
// NetworkDefinitionCreationFlag::kEXPLICIT_BATCH — confirm.
uint32_t flag = 1U <<static_cast<uint32_t>
INetworkDefinition* network = builder->createNetworkV2(flag);
IParser* parser = createParser(*network, gLogger);
// The return value of parseFromFile is ignored; failures are only printed by
// the loop below. The second argument is presumably the parser verbosity — confirm.
parser->parseFromFile(filename, 3);
for (int32_t i = 0; i < parser->getNbErrors(); ++i)
std::cout << parser->getError(i)->desc() << std::endl;
// Optimization profile with kMIN == kOPT == kMAX, i.e. a single fixed input shape.
// NOTE(review): no call that attaches `profile` to `config` (e.g.
// addOptimizationProfile) is visible — confirm it was not lost or forgotten.
IOptimizationProfile* profile = builder->createOptimizationProfile();
if(strcmp(format, "NCHW") == 0){
profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kMIN, Dims4(BatchSize,InputChannel,InputH,InputW));
profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kOPT, Dims4(BatchSize,InputChannel,InputH,InputW));
profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kMAX, Dims4(BatchSize,InputChannel,InputH,InputW));
else if (strncmp(format, "NF", 2) == 0) {
// The digits following "NF" select the entry of FeatureSizeList to use.
int featuresindex = atoi((&format[2])); // Starts from 0
int featuresize = FeatureSizeList[featuresindex];
spdlog::debug(" featuresize {} ", featuresize);
profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kMIN, Dims2(BatchSize * NumBodypart,featuresize));
profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kOPT, Dims2(BatchSize * NumBodypart,featuresize));
profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kMAX, Dims2(BatchSize * NumBodypart,featuresize));
IBuilderConfig* config = builder->createBuilderConfig();
// 1U << 20 bytes = 1 MiB of builder workspace.
config->setMaxWorkspaceSize(1U << 20);
// NOTE(review): the result is dereferenced below without a null check.
IHostMemory* serializedModel = builder->buildSerializedNetwork(*network, *config);
std::ofstream p(enginename, std::ios::binary);
if (!p) {
std::cerr << "TRT engine could not open plan output file" << std::endl;
return -1;
// Write the serialized plan to disk so load_engine() can deserialize it later.
p.write(reinterpret_cast<const char*>(serializedModel->data()), serializedModel->size());
// Release the builder-side objects; they are not released on the early
// `return -1` paths above.
delete parser;
delete network;
delete config;
delete builder;
delete serializedModel;
return 1;
// Loads the serialized TensorRT plan at `filename`, deserializes it, creates an
// execution context and allocates the device buffers for its two bindings.
//
// `format` is "NCHW" for the image model (context stored in `context`, buffers
// in `buffers`) or "NF<k>" for colour model k (context stored in
// colorContext[k], buffers in buffers_color[2*k] and buffers_color[2*k + 1]).
// Returns -1 if the plan file cannot be read, 1 otherwise.
//
// `runtime`, `engine`, `context`, `buffers`, `inputIndex`, `outputIndex` and
// `stream` are not declared in this block — presumably file-level globals, in
// which case every call overwrites `runtime` and `engine`.
//
// NOTE(review): this excerpt appears to have lost its closing braces and a few
// statements during extraction; the comments below describe only what is visible.
int load_engine(char* filename, char* format){
std::ifstream file(filename, std::ios::binary);
if (!file.good()) {
std::cerr << "read " << filename << " error!" << std::endl;
return -1;
char *trtModelStream = nullptr;
size_t size = 0;
// Determine the plan size by seeking to the end of the file and back.
file.seekg(0, file.end);
size = file.tellg();
file.seekg(0, file.beg);
trtModelStream = new char[size];
// NOTE(review): the next line is garbled; presumably a
// `file.read(trtModelStream, size);` call was lost here — confirm.
assert(trtModelStream);, size);
runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
engine = runtime->deserializeCudaEngine(trtModelStream, size);
assert(engine != nullptr);
if(strcmp(format, "NCHW") == 0){
context = engine->createExecutionContext();
assert(context != nullptr);
else if (strncmp(format, "NF", 2) == 0) {
// The digits following "NF" give the slot of this model in colorContext.
int featuresindex = atoi((&format[2])); // Starts from 0
spdlog::debug(" featureindex : {} ", featuresindex);
colorContext[featuresindex] = engine->createExecutionContext();
assert(colorContext[featuresindex] != nullptr);
// The host copy of the plan is released once the engine has been deserialized.
delete[] trtModelStream;
inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
// The code below relies on binding 0 being the input and binding 1 the output.
assert(inputIndex == 0);
assert(outputIndex == 1);
// Create GPU buffers on device
if(strcmp(format, "NCHW") == 0){
CUDA_CHECK(cudaMalloc((void**)&buffers[inputIndex], BatchSize * InputChannel * InputH * InputW * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)&buffers[outputIndex],BatchSize * OutputChannel * InputH * InputW * sizeof(float)));
// NOTE(review): the binding dimensions are set before the optimization
// profile is selected — confirm against the TensorRT documentation that
// selecting the profile afterwards does not invalidate them.
context->setBindingDimensions(inputIndex, Dims4(BatchSize, InputChannel, InputH, InputW));
context->setOptimizationProfileAsync(0, stream);
else if (strncmp(format, "NF", 2) == 0) {
int featuresindex = atoi((&format[2])); // Starts from 0
int featuresize = FeatureSizeList[featuresindex];
spdlog::debug("featureindex {}", featuresindex);
spdlog::debug("featuresize {}", featuresize);
spdlog::debug("BatchSize {}", BatchSize);
spdlog::debug("NumBodypart {}", NumBodypart);
spdlog::debug("NumColorClasses {}", NumColorClasses);
// Model k owns buffers_color[2*k] (input, sized with its own featuresize) and
// buffers_color[2*k + 1] (output); inference must pass exactly this pair to
// colorContext[k].
CUDA_CHECK(cudaMalloc((void**)&buffers_color[2 * featuresindex + 0], BatchSize * NumBodypart * featuresize * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)&buffers_color[2 * featuresindex + 1], BatchSize * NumBodypart * NumColorClasses * sizeof(float)));
// std::cout << " CUDA pointer of buffers color allocated "<< buffers_color[2 * featuresindex + 0] << " "<< buffers_color[2 * featuresindex + 1];
// NOTE(review): same call order as the NCHW branch (dimensions first, profile
// second) — confirm against the TensorRT documentation.
colorContext[featuresindex]->setBindingDimensions(inputIndex, Dims2(BatchSize * NumBodypart, featuresize));
colorContext[featuresindex]->setOptimizationProfileAsync(0, stream);
return 1;
// Works fine
void* colorBuffers1[] = { buffers_color[0], buffers_color[1] } ;
colorContext[2]->enqueue(currentBS * NumBodypart, (void**)colorBuffers1, stream, nullptr);
void* colorBuffers2[] = { buffers_color[2], buffers_color[3] } ;
colorContext[3]->enqueue(currentBS * NumBodypart, (void**)colorBuffers2, stream, nullptr);
void* colorBuffers3[] = { buffers_color[4], buffers_color[5] } ;
colorContext[4]->enqueue(currentBS * NumBodypart, (void**)colorBuffers3, stream, nullptr);
//Issues with results on different batchsizes.
void* colorBuffers4[] = { buffers_color[6], buffers_color[7] } ;
colorContext[0]->enqueue(currentBS * NumBodypart, (void**)colorBuffers4, stream, nullptr);
void* colorBuffers5[] = { buffers_color[8], buffers_color[9] } ;
colorContext[1]->enqueue(currentBS * NumBodypart, (void**)colorBuffers5, stream, nullptr);