Multiple tensorrt engine contexts for different models

Description

I want to have 5 models working together on a single CUDA stream.
Each model's ONNX file is loaded and parsed separately.

Each model's execution context is created and kept in an array.
For the contexts at indices 2, 3, 4 (0-based indexing), the results are fine for every batch,
but for contexts 0 and 1 the results are garbage.
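
Roughly, the per-batch flow is the following (a simplified sketch, not the exact production code; enqueueV2 is shown because the networks are built with kEXPLICIT_BATCH, while the full code under Steps To Reproduce uses enqueue):

// Sketch: all five contexts are driven on the same CUDA stream.
// "contexts" and "bindings" are placeholder names for the arrays used further below.
cudaStream_t stream;
cudaStreamCreate(&stream);

for (int m = 0; m < 5; ++m) {
  // bindings[m] holds the pre-allocated device input/output pointers for model m
  contexts[m]->enqueueV2(bindings[m], stream, nullptr);
}
cudaStreamSynchronize(stream);  // wait for all five models to finish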

Environment

TensorRT Version: 8.0.1
GPU Type: GeForce GTX 1050 Ti
Nvidia Driver Version: 470.82.01
CUDA Version: 11.4
CUDNN Version: NA
Operating System + Version: Ubuntu 18.04
Python Version (if applicable): 3.8
TensorFlow Version (if applicable): NA
PyTorch Version (if applicable): NA
Baremetal or Container (if container which image + tag): NA

Relevant Files

model_1.onnx (462.8 KB)

model_2.onnx (462.8 KB)

model_3.onnx (462.9 KB)

Steps To Reproduce

static IExecutionContext** colorContext;
colorContext = (IExecutionContext**)malloc(sizeof(IExecutionContext*) * NumColorModels );
static float* buffers_color[10];

int load_onnx(char *filename, char *enginename,  char* format, int isFP16){

  std::ifstream file(filename, std::ios::binary);
  spdlog::debug("Onnx path: {}", filename);
  spdlog::debug("TRT path: {}", enginename);
  
  if (!file.good()) {
    std::cerr << "ONNX read " << filename << " error!" << std::endl;
    return -1;
  }
  IBuilder* builder = createInferBuilder(gLogger);
  if(strcmp(format, "NCHW") == 0){
    builder->setMaxBatchSize(BatchSize);
  }
  else if (strncmp(format, "NF", 2) == 0) {
    builder->setMaxBatchSize(BatchSize * NumBodypart);
  }  
  
  uint32_t flag = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);

  INetworkDefinition* network = builder->createNetworkV2(flag);
  IParser*  parser = createParser(*network, gLogger);
  parser->parseFromFile(filename, 3);
  for (int32_t i = 0; i < parser->getNbErrors(); ++i)
  {
    std::cout << parser->getError(i)->desc() << std::endl;
  }

  IOptimizationProfile* profile = builder->createOptimizationProfile();

  if(strcmp(format, "NCHW") == 0){
    profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kMIN, Dims4(BatchSize,InputChannel,InputH,InputW));
    profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kOPT, Dims4(BatchSize,InputChannel,InputH,InputW));
    profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kMAX, Dims4(BatchSize,InputChannel,InputH,InputW));
  }
  else if (strncmp(format, "NF", 2) == 0) {
    int featuresindex = atoi((&format[2])); // Starts from 0
    int featuresize = FeatureSizeList[featuresindex];
    spdlog::debug(" featuresize {} ", featuresize);
    profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kMIN, Dims2(BatchSize * NumBodypart,featuresize));
    profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kOPT, Dims2(BatchSize * NumBodypart,featuresize));
    profile->setDimensions(INPUT_BLOB_NAME, OptProfileSelector::kMAX, Dims2(BatchSize * NumBodypart,featuresize));
  }  

  IBuilderConfig* config = builder->createBuilderConfig();
  config->setMaxWorkspaceSize(1U << 20);
  config->addOptimizationProfile(profile);
  if(isFP16){
    config->setFlag(BuilderFlag::kFP16);
  }
  IHostMemory*  serializedModel = builder->buildSerializedNetwork(*network, *config);
  std::ofstream p(enginename, std::ios::binary);
  if (!p) {
    std::cerr << "TRT engine could not open plan output file" << std::endl;
    return -1;
  }
  p.write(reinterpret_cast<const char*>(serializedModel->data()), serializedModel->size());

  delete parser;
  delete network;
  delete config;
  delete builder;
  delete serializedModel;
  return 1;
}

int load_engine(char* filename, char* format){
  
  std::ifstream file(filename, std::ios::binary);
  if (!file.good()) {
    std::cerr << "read " << filename << " error!" << std::endl;
    return -1;
  }
  char *trtModelStream = nullptr;
  size_t size = 0;
  file.seekg(0, file.end);
  size = file.tellg();
  file.seekg(0, file.beg);
  trtModelStream = new char[size];
  assert(trtModelStream);
  file.read(trtModelStream, size);
  file.close();

  runtime = createInferRuntime(gLogger);
  assert(runtime != nullptr);

  engine = runtime->deserializeCudaEngine(trtModelStream, size);
  assert(engine != nullptr);
  
  if(strcmp(format, "NCHW") == 0){
    context = engine->createExecutionContext();
    assert(context != nullptr);
  }
  else if (strncmp(format, "NF", 2) == 0) {    
    int featuresindex = atoi((&format[2])); // Starts from 0
    spdlog::debug(" featureindex : {} ", featuresindex);    
    colorContext[featuresindex] = engine->createExecutionContext();
    assert(colorContext[featuresindex] != nullptr);
  }
  
  delete[] trtModelStream;
  
  inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
  outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);

  // Assumes each engine has exactly two bindings: binding 0 = input, binding 1 = output.
  assert(inputIndex == 0);
  assert(outputIndex == 1);

  
  // Create GPU buffers on device
  if(strcmp(format, "NCHW") == 0){
    CUDA_CHECK(cudaMalloc((void**)&buffers[inputIndex], BatchSize * InputChannel * InputH * InputW  * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&buffers[outputIndex],BatchSize * OutputChannel * InputH * InputW  * sizeof(float)));
    context->setBindingDimensions(inputIndex, Dims4(BatchSize, InputChannel, InputH, InputW));
    context->setOptimizationProfileAsync(0, stream);
  }
  else if (strncmp(format, "NF", 2) == 0) {
    int featuresindex = atoi((&format[2])); // Starts from 0
    int featuresize = FeatureSizeList[featuresindex];

    spdlog::debug("featureindex {}", featuresindex);
    spdlog::debug("featuresize {}", featuresize);

    spdlog::debug("BatchSize {}", BatchSize);
    spdlog::debug("NumBodypart {}", NumBodypart);
    spdlog::debug("NumColorClasses {}", NumColorClasses);
        
    CUDA_CHECK(cudaMalloc((void**)&buffers_color[2 * featuresindex + 0], BatchSize * NumBodypart * featuresize  * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&buffers_color[2 * featuresindex + 1], BatchSize * NumBodypart * NumColorClasses  * sizeof(float)));

    // std::cout << " CUDA pointer of buffers color allocated "<<  buffers_color[2 * featuresindex + 0] << "  "<< buffers_color[2 * featuresindex + 1];

    colorContext[featuresindex]->setBindingDimensions(inputIndex, Dims2(BatchSize * NumBodypart, featuresize));
    colorContext[featuresindex]->setOptimizationProfileAsync(0, stream);
  }
  
  return 1;
}
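
For reference, the loaders are invoked once per model. The color models pass a format string of the form "NF<index>", where the trailing index selects an entry of FeatureSizeList (illustrative call sites only; the real paths and indices differ):

// Illustrative only: build (or rebuild) each engine, then load it.
load_onnx((char *)"model_1.onnx", (char *)"model_1.trt", (char *)"NF0", 0);
load_engine((char *)"model_1.trt", (char *)"NF0");

load_onnx((char *)"model_2.onnx", (char *)"model_2.trt", (char *)"NF1", 0);
load_engine((char *)"model_2.trt", (char *)"NF1");

load_onnx((char *)"model_3.onnx", (char *)"model_3.trt", (char *)"NF2", 0);
load_engine((char *)"model_3.trt", (char *)"NF2");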


// Works fine
void* colorBuffers1[] = { buffers_color[0], buffers_color[1] } ;
colorContext[2]->enqueue(currentBS * NumBodypart, (void**)colorBuffers1, stream, nullptr);
void* colorBuffers2[] = { buffers_color[2], buffers_color[3] } ;
colorContext[3]->enqueue(currentBS * NumBodypart, (void**)colorBuffers2, stream, nullptr);
void* colorBuffers3[] = { buffers_color[4], buffers_color[5] } ;
colorContext[4]->enqueue(currentBS * NumBodypart, (void**)colorBuffers3, stream, nullptr);

// Issue: the results from these two contexts are garbage across batch sizes.
void* colorBuffers4[] = { buffers_color[6], buffers_color[7] } ;
colorContext[0]->enqueue(currentBS * NumBodypart, (void**)colorBuffers4, stream, nullptr);
void* colorBuffers5[] = { buffers_color[8], buffers_color[9] } ;
colorContext[1]->enqueue(currentBS * NumBodypart, (void**)colorBuffers5, stream, nullptr);

Hi,

The links below might be useful for you:

https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#stream-priorities

https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html
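
For reference, a minimal sketch of creating prioritized streams with the runtime API from the links above (variable names are illustrative):

// Lower numbers mean higher priority; the valid range is device-dependent.
int leastPriority = 0, greatestPriority = 0;
cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);

cudaStream_t highPrioStream, lowPrioStream;
cudaStreamCreateWithPriority(&highPrioStream, cudaStreamNonBlocking, greatestPriority);
cudaStreamCreateWithPriority(&lowPrioStream, cudaStreamNonBlocking, leastPriority);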

For multi-threading/streaming, we suggest using DeepStream or Triton.

For more details, we recommend raising the query on the DeepStream forum, or in the issues section of the Triton Inference Server GitHub repository.

Thanks!

The issue was with the ONNX models: they also contained multiple unnecessary outputs. Once those were removed, the issue was resolved.
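
In case it helps anyone else, the extra graph outputs can be stripped before building the engines. A minimal C++ sketch using the generated ONNX protobuf headers (it assumes the unwanted outputs are simply the extra entries after the first one in graph.output; header and file names are illustrative):

#include <fstream>
#include "onnx/onnx_pb.h"  // generated ONNX protobuf headers

int trim_outputs(const char* in_path, const char* out_path) {
  onnx::ModelProto model;
  std::ifstream in(in_path, std::ios::binary);
  if (!model.ParseFromIstream(&in)) return -1;

  // Keep only the first graph output; drop the unnecessary ones.
  auto* graph = model.mutable_graph();
  if (graph->output_size() > 1) {
    graph->mutable_output()->DeleteSubrange(1, graph->output_size() - 1);
  }

  std::ofstream out(out_path, std::ios::binary);
  return model.SerializeToOstream(&out) ? 1 : -1;
}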
