Multiple tensorrt engine contexts for different models

Description

I want to have 5 models working together on a single CUDA stream.
Each model's ONNX file is loaded and parsed separately.

Each model's execution context is created and kept in an array.
For the contexts at indices 2, 3, 4 (0-based indexing), the results are fine for every batch,
but for contexts 0 and 1 the results are garbage.
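
Roughly, the per-batch flow is the following (a simplified sketch, not the exact production code; enqueueV2 is shown because the networks are built with kEXPLICIT_BATCH, while the full code under Steps To Reproduce uses enqueue):

// Sketch: all five contexts are driven on the same CUDA stream.
// "contexts" and "bindings" are placeholder names for the arrays used further below.
cudaStream_t stream;
cudaStreamCreate(&stream);

for (int m = 0; m < 5; ++m) {
  // bindings[m] holds the pre-allocated device input/output pointers for model m
  contexts[m]->enqueueV2(bindings[m], stream, nullptr);
}
cudaStreamSynchronize(stream);  // wait for all five models to finish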

Environment

TensorRT Version: 8.0.1
GPU Type: GeForce GTX 1050 Ti
Nvidia Driver Version: 470.82.01
CUDA Version: 11.4
CUDNN Version: NA
Operating System + Version: Ubuntu 18.04
Python Version (if applicable): 3.8
TensorFlow Version (if applicable): NA
PyTorch Version (if applicable): NA
Baremetal or Container (if container which image + tag): NA

Relevant Files

model_1.onnx (462.8 KB)

model_2.onnx (462.8 KB)

model_3.onnx (462.9 KB)

Steps To Reproduce

static IExecutionContext** colorContext;
colorContext = (IExecutionContext**)malloc(sizeof(IExecutionContext*) * NumColorModels );
static float* buffers_color[10];

int load_onnx(char *filename, char *enginename,  char* format, int isFP16){

  std::ifstream file(filename, std::ios::binary);
  spdlog::debug("Onnx path: {}", filename);
  spdlog::debug("TRT path: {}", enginename);
  
  if (!file.good()) {
    std::cerr << "ONNX read " << filename << " error!" << std::endl;
    return -1;
  }
  IBuilder* builder = createInferBuilder(gLogger);
  if(strcmp(format, "NCHW") == 0){
    builder->setMaxBatchSize(BatchSize);
  }
  else if (strncmp(format, "NF", 2) == 0) {
    builder->setMaxBatchSize(BatchSize * NumBodypart);
  }  
  
  uint32_t flag = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);

  INetworkDefinition* network = builder->createNetworkV2(flag);
  IParser*  parser = createParser(*network, gLogger);
  parser->parseFromFile(filename, 3);
  for (int32_t i = 0; i < parser->getNbErrors(); ++i)
  {
    std::cout << parser->getError(i)->desc() << std::endl;
  }

  IOptimizationProfile* profile = builder->createOptimizationProfile();

  if(strcmp(format, "NCHW") == 0){
    profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kMIN, Dims4(BatchSize,InputChannel,InputH,InputW));
    profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kOPT, Dims4(BatchSize,InputChannel,InputH,InputW));
    profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kMAX, Dims4(BatchSize,InputChannel,InputH,InputW));
  }
  else if (strncmp(format, "NF", 2) == 0) {
    int featuresindex = atoi((&format[2])); // Starts from 0
    int featuresize = FeatureSizeList[featuresindex];
    spdlog::debug(" featuresize {} ", featuresize);
    profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kMIN, Dims2(BatchSize * NumBodypart,featuresize));
    profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kOPT, Dims2(BatchSize * NumBodypart,featuresize));
    profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kMAX, Dims2(BatchSize * NumBodypart,featuresize));
  }  

  IBuilderConfig* config = builder->createBuilderConfig();
  config->setMaxWorkspaceSize(1U << 20);
  config->addOptimizationProfile(profile);
  if(isFP16){
    config->setFlag(BuilderFlag::kFP16);
  }
  IHostMemory*  serializedModel = builder->buildSerializedNetwork(*network, *config);
  std::ofstream p(enginename, std::ios::binary);
  if (!p) {
    std::cerr << "TRT engine could not open plan output file" << std::endl;
    return -1;
  }
  p.write(reinterpret_cast<const char*>(serializedModel->data()), serializedModel->size());

  delete parser;
  delete network;
  delete config;
  delete builder;
  delete serializedModel;
  return 1;
}

int load_engine(char* filename, char* format){
  
  std::ifstream file(filename, std::ios::binary);
  if (!file.good()) {
    std::cerr << "read " << filename << " error!" << std::endl;
    return -1;
  }
  char *trtModelStream = nullptr;
  size_t size = 0;
  file.seekg(0, file.end);
  size = file.tellg();
  file.seekg(0, file.beg);
  trtModelStream = new char[size];
  assert(trtModelStream);
  file.read(trtModelStream, size);
  file.close();

  runtime = createInferRuntime(gLogger);
  assert(runtime != nullptr);

  engine = runtime->deserializeCudaEngine(trtModelStream, size);
  assert(engine != nullptr);
  
  if(strcmp(format, "NCHW") == 0){
    context = engine->createExecutionContext();
    assert(context != nullptr);
  }
  else if (strncmp(format, "NF", 2) == 0) {    
    int featuresindex = atoi((&format[2])); // Starts from 0
    spdlog::debug(" featureindex : {} ", featuresindex);    
    colorContext[featuresindex] = engine->createExecutionContext();
    assert(colorContext[featuresindex] != nullptr);
  }
  
  delete[] trtModelStream;
  
  inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
  outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);

  // Assumes each engine has exactly two bindings: binding 0 = input, binding 1 = output.
  assert(inputIndex == 0);
  assert(outputIndex == 1);

  
  // Create GPU buffers on device
  if(strcmp(format, "NCHW") == 0){
    CUDA_CHECK(cudaMalloc((void**)&buffers[inputIndex], BatchSize * InputChannel * InputH * InputW  * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&buffers[outputIndex],BatchSize * OutputChannel * InputH * InputW  * sizeof(float)));
    context->setBindingDimensions(inputIndex, Dims4(BatchSize, InputChannel, InputH, InputW));
    context->setOptimizationProfileAsync(0, stream);
  }
  else if (strncmp(format, "NF", 2) == 0) {
    int featuresindex = atoi((&format[2])); // Starts from 0
    int featuresize = FeatureSizeList[featuresindex];

    spdlog::debug("featureindex {}", featuresindex);
    spdlog::debug("featuresize {}", featuresize);

    spdlog::debug("BatchSize {}", BatchSize);
    spdlog::debug("NumBodypart {}", NumBodypart);
    spdlog::debug("NumColorClasses {}", NumColorClasses);
        
    CUDA_CHECK(cudaMalloc((void**)&buffers_color[2 * featuresindex + 0], BatchSize * NumBodypart * featuresize  * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&buffers_color[2 * featuresindex + 1], BatchSize * NumBodypart * NumColorClasses  * sizeof(float)));

    // std::cout << " CUDA pointer of buffers color allocated "<<  buffers_color[2 * featuresindex + 0] << "  "<< buffers_color[2 * featuresindex + 1];

    colorContext[featuresindex]->setBindingDimensions(inputIndex, Dims2(BatchSize * NumBodypart, featuresize));
    colorContext[featuresindex]->setOptimizationProfileAsync(0, stream);
  }
  
  return 1;
}
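
For reference, the loaders are invoked once per model. The color models pass a format string of the form "NF<index>", where the trailing index selects an entry of FeatureSizeList (illustrative call sites only; the real paths and indices differ):

// Illustrative only: build (or rebuild) each engine, then load it.
load_onnx((char *)"model_1.onnx", (char *)"model_1.trt", (char *)"NF0", 0);
load_engine((char *)"model_1.trt", (char *)"NF0");

load_onnx((char *)"model_2.onnx", (char *)"model_2.trt", (char *)"NF1", 0);
load_engine((char *)"model_2.trt", (char *)"NF1");

load_onnx((char *)"model_3.onnx", (char *)"model_3.trt", (char *)"NF2", 0);
load_engine((char *)"model_3.trt", (char *)"NF2");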


// Works fine
void* colorBuffers1[] = { buffers_color[0], buffers_color[1] } ;
colorContext[2]->enqueue(currentBS * NumBodypart, (void**)colorBuffers1, stream, nullptr);
void* colorBuffers2[] = { buffers_color[2], buffers_color[3] } ;
colorContext[3]->enqueue(currentBS * NumBodypart, (void**)colorBuffers2, stream, nullptr);
void* colorBuffers3[] = { buffers_color[4], buffers_color[5] } ;
colorContext[4]->enqueue(currentBS * NumBodypart, (void**)colorBuffers3, stream, nullptr);

// Issue: the results from these two contexts are garbage across batch sizes.
void* colorBuffers4[] = { buffers_color[6], buffers_color[7] } ;
colorContext[0]->enqueue(currentBS * NumBodypart, (void**)colorBuffers4, stream, nullptr);
void* colorBuffers5[] = { buffers_color[8], buffers_color[9] } ;
colorContext[1]->enqueue(currentBS * NumBodypart, (void**)colorBuffers5, stream, nullptr);

Hi,

The links below might be useful for you:

https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#stream-priorities

https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html
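
For reference, a minimal sketch of creating prioritized streams with the runtime API from the links above (variable names are illustrative):

// Lower numbers mean higher priority; the valid range is device-dependent.
int leastPriority = 0, greatestPriority = 0;
cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);

cudaStream_t highPrioStream, lowPrioStream;
cudaStreamCreateWithPriority(&highPrioStream, cudaStreamNonBlocking, greatestPriority);
cudaStreamCreateWithPriority(&lowPrioStream, cudaStreamNonBlocking, leastPriority);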

For multi-threading/streaming, we suggest using DeepStream or Triton.

For more details, we recommend raising the query on the DeepStream forum, or in the issues section of the Triton Inference Server GitHub repository.

Thanks!

The issue was with the ONNX models: they also contained multiple unnecessary outputs. Once those were removed, the issue was resolved.
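
In case it helps anyone else, the extra graph outputs can be stripped before building the engines. A minimal C++ sketch using the generated ONNX protobuf headers (it assumes the unwanted outputs are simply the extra entries after the first one in graph.output; header and file names are illustrative):

#include <fstream>
#include "onnx/onnx_pb.h"  // generated ONNX protobuf headers

int trim_outputs(const char* in_path, const char* out_path) {
  onnx::ModelProto model;
  std::ifstream in(in_path, std::ios::binary);
  if (!model.ParseFromIstream(&in)) return -1;

  // Keep only the first graph output; drop the unnecessary ones.
  auto* graph = model.mutable_graph();
  if (graph->output_size() > 1) {
    graph->mutable_output()->DeleteSubrange(1, graph->output_size() - 1);
  }

  std::ofstream out(out_path, std::ios::binary);
  return model.SerializeToOstream(&out) ? 1 : -1;
}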
