Please provide complete information as applicable to your setup.
• Hardware Platform (Jetson / GPU) → dGPU (AWS T4)
• DeepStream Version → 5.0
• TensorRT Version → 7+
• NVIDIA GPU Driver Version (valid for GPU only) → 440.82
I have created a TensorRT engine using the TensorRT API and was trying to use the engine file to run inference for RetinaFace detection. Below is my config file:
[property]
gpu-id=0
net-scale-factor=0.0039215697906911373
model-engine-file=../../../samples/models/RetinaFace_Detector/retina_r50.engine
# create labels file
# labelfile-path=./labels.txt
batch-size=1
model-color-format=0
## 0=FP32, 1=INT8, 2=FP16 mode
network-mode=2
num-detected-classes=1
interval=0
gie-unique-id=1
is-classifier=0
custom-lib-path=./libdecodeplugin.so
[class-attrs-all]
threshold=0.2
group-threshold=1
## Set eps=0.7 and minBoxes for enable-dbscan=1
eps=0.2
#minBoxes=3
roi-top-offset=0
roi-bottom-offset=0
detected-min-w=0
detected-min-h=0
detected-max-w=1920
detected-max-h=1080
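(Note: net-scale-factor above is 1/255 up to float rounding, i.e. pixel values scaled to [0,1]. Since num-detected-classes=1, the commented-out labels.txt would just be a one-line file for the single class, e.g. the label I would use:
face
)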
And this is the custom plugin that I defined while creating the TensorRT engine file:
#ifndef _DECODE_CU_H
#define _DECODE_CU_H
#include "decode.h"
#include "stdio.h"
namespace nvinfer1
{
DecodePlugin::DecodePlugin()
{
}
DecodePlugin::~DecodePlugin()
{
}
// create the plugin at runtime from a byte stream
DecodePlugin::DecodePlugin(const void* data, size_t length)
{
}
void DecodePlugin::serialize(void* buffer) const
{
}
size_t DecodePlugin::getSerializationSize() const
{
    return 0;
}
int DecodePlugin::initialize()
{
    return 0;
}
Dims DecodePlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)
{
    // Output is a single channel: one float count followed by the flattened
    // Detection structs from all three feature-map scales.
    int totalCount = 0;
    totalCount += decodeplugin::INPUT_H / 8 * decodeplugin::INPUT_W / 8 * 2 * sizeof(decodeplugin::Detection) / sizeof(float);
    totalCount += decodeplugin::INPUT_H / 16 * decodeplugin::INPUT_W / 16 * 2 * sizeof(decodeplugin::Detection) / sizeof(float);
    totalCount += decodeplugin::INPUT_H / 32 * decodeplugin::INPUT_W / 32 * 2 * sizeof(decodeplugin::Detection) / sizeof(float);
    return Dims3(totalCount + 1, 1, 1);
}
// Set plugin namespace
void DecodePlugin::setPluginNamespace(const char* pluginNamespace)
{
    mPluginNamespace = pluginNamespace;
}
const char* DecodePlugin::getPluginNamespace() const
{
    return mPluginNamespace;
}
// Return the DataType of the plugin output at the requested index
DataType DecodePlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const
{
    return DataType::kFLOAT;
}
// Return true if output tensor is broadcast across a batch.
bool DecodePlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const
{
    return false;
}
// Return true if plugin can use input that is broadcast across batch without replication.
bool DecodePlugin::canBroadcastInputAcrossBatch(int inputIndex) const
{
    return false;
}
void DecodePlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput)
{
}
// Attach the plugin object to an execution context and grant the plugin the access to some context resource.
void DecodePlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator)
{
}
// Detach the plugin object from its execution context.
void DecodePlugin::detachFromContext() {}
const char* DecodePlugin::getPluginType() const
{
    return "Decode_TRT";
}
const char* DecodePlugin::getPluginVersion() const
{
    return "1";
}
void DecodePlugin::destroy()
{
    delete this;
}
// Clone the plugin
IPluginV2IOExt* DecodePlugin::clone() const
{
    DecodePlugin *p = new DecodePlugin();
    p->setPluginNamespace(mPluginNamespace);
    return p;
}
__device__ float Logist(float data) { return 1. / (1. + expf(-data)); }
__global__ void CalDetection(const float *input, float *output, int num_elem, int step, int anchor) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= num_elem) return;

    int h = decodeplugin::INPUT_H / step;
    int w = decodeplugin::INPUT_W / step;
    int y = idx / w;
    int x = idx % w;
    // Per-scale input layout: bbox regression (2 anchors x 4 values), then
    // class scores (2 anchors x 2 values), then landmarks (2 anchors x 10).
    const float *bbox_reg = &input[0];
    const float *cls_reg = &input[2 * 4 * num_elem];
    const float *lmk_reg = &input[2 * 4 * num_elem + 2 * 2 * num_elem];

    for (int k = 0; k < 2; ++k) {
        // Softmax over the two class scores; conf2 is the face probability.
        float conf1 = cls_reg[idx + k * num_elem * 2];
        float conf2 = cls_reg[idx + k * num_elem * 2 + num_elem];
        conf2 = expf(conf2) / (expf(conf1) + expf(conf2));
        if (conf2 <= 0.02) continue;

        // output[0] holds the detection count; Detection structs follow it.
        float *res_count = output;
        int count = (int)atomicAdd(res_count, 1);
        char *data = (char *)res_count + sizeof(float) + count * sizeof(decodeplugin::Detection);
        decodeplugin::Detection *det = (decodeplugin::Detection *)(data);

        float prior[4];
        prior[0] = ((float)x + 0.5) / w;
        prior[1] = ((float)y + 0.5) / h;
        prior[2] = (float)anchor * (k + 1) / decodeplugin::INPUT_W;
        prior[3] = (float)anchor * (k + 1) / decodeplugin::INPUT_H;

        // Location: decode center/size offsets against the prior, then
        // convert to corner coordinates in input-image pixels.
        det->bbox[0] = prior[0] + bbox_reg[idx + k * num_elem * 4] * 0.1 * prior[2];
        det->bbox[1] = prior[1] + bbox_reg[idx + k * num_elem * 4 + num_elem] * 0.1 * prior[3];
        det->bbox[2] = prior[2] * expf(bbox_reg[idx + k * num_elem * 4 + num_elem * 2] * 0.2);
        det->bbox[3] = prior[3] * expf(bbox_reg[idx + k * num_elem * 4 + num_elem * 3] * 0.2);
        det->bbox[0] -= det->bbox[2] / 2;
        det->bbox[1] -= det->bbox[3] / 2;
        det->bbox[2] += det->bbox[0];
        det->bbox[3] += det->bbox[1];
        det->bbox[0] *= decodeplugin::INPUT_W;
        det->bbox[1] *= decodeplugin::INPUT_H;
        det->bbox[2] *= decodeplugin::INPUT_W;
        det->bbox[3] *= decodeplugin::INPUT_H;
        det->class_confidence = conf2;
        for (int i = 0; i < 10; i += 2) {
            det->landmark[i] = prior[0] + lmk_reg[idx + k * num_elem * 10 + num_elem * i] * 0.1 * prior[2];
            det->landmark[i + 1] = prior[1] + lmk_reg[idx + k * num_elem * 10 + num_elem * (i + 1)] * 0.1 * prior[3];
            det->landmark[i] *= decodeplugin::INPUT_W;
            det->landmark[i + 1] *= decodeplugin::INPUT_H;
        }
    }
}
void DecodePlugin::forwardGpu(const float *const *inputs, float *output, cudaStream_t stream, int batchSize)
{
    int num_elem = 0;
    int base_step = 8;
    int base_anchor = 16;
    int thread_count;
    // Zero the detection count on the same stream the kernels run on.
    cudaMemsetAsync(output, 0, sizeof(float), stream);
    for (unsigned int i = 0; i < 3; ++i)
    {
        num_elem = decodeplugin::INPUT_H / base_step * decodeplugin::INPUT_W / base_step;
        thread_count = (num_elem < thread_count_) ? num_elem : thread_count_;
        // Launch on the stream TensorRT passed to enqueue(), not the default stream.
        CalDetection<<<(num_elem + thread_count - 1) / thread_count, thread_count, 0, stream>>>
            (inputs[i], output, num_elem, base_step, base_anchor);
        base_step *= 2;
        base_anchor *= 4;
    }
}
int DecodePlugin::enqueue(int batchSize, const void* const *inputs, void** outputs, void* workspace, cudaStream_t stream)
{
    //assert(batchSize == 1);
    //GPU
    //CUDA_CHECK(cudaStreamSynchronize(stream));
    forwardGpu((const float *const *)inputs, (float *)outputs[0], stream, batchSize);
    return 0;
}
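// (Note: batchSize is forwarded to forwardGpu but not used there; the decode
// effectively assumes one image per enqueue, matching batch-size=1 above.)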
PluginFieldCollection DecodePluginCreator::mFC{};
std::vector<PluginField> DecodePluginCreator::mPluginAttributes;
DecodePluginCreator::DecodePluginCreator()
{
    mPluginAttributes.clear();
    mFC.nbFields = mPluginAttributes.size();
    mFC.fields = mPluginAttributes.data();
}
const char* DecodePluginCreator::getPluginName() const
{
    return "Decode_TRT";
}
const char* DecodePluginCreator::getPluginVersion() const
{
    return "1";
}
const PluginFieldCollection* DecodePluginCreator::getFieldNames()
{
    return &mFC;
}
IPluginV2IOExt* DecodePluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc)
{
    DecodePlugin* obj = new DecodePlugin();
    obj->setPluginNamespace(mNamespace.c_str());
    return obj;
}
IPluginV2IOExt* DecodePluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)
{
    // This object will be deleted when the network is destroyed, which will
    // call DecodePlugin::destroy()
    DecodePlugin* obj = new DecodePlugin(serialData, serialLength);
    obj->setPluginNamespace(mNamespace.c_str());
    return obj;
}
} // namespace nvinfer1
This is compiled into the libdecodeplugin.so that I referenced in the config file via custom-lib-path.
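For completeness, a simplified sketch of how the shared library is compiled (the exact include paths and linker flags in my setup may differ):

nvcc -shared -Xcompiler -fPIC decode.cu -o libdecodeplugin.so -lnvinfer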
Deserializing the engine file with this plugin works when I test it in the TensorRT docker container, but it fails when DeepStream deserializes the engine using this libdecodeplugin.so library. I haven't found any resources on this yet.
How do I include the plugin so that I can deserialize the TensorRT engine file successfully in DeepStream?
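My assumption (possibly wrong) is that DecodePluginCreator has to end up in TensorRT's plugin registry at the moment DeepStream loads the library given in custom-lib-path, roughly like the sketch below using the REGISTER_TENSORRT_PLUGIN macro from NvInfer.h. Is that the right mechanism here, and is placing it in the plugin header enough?

// Sketch only: register the creator with the global TensorRT plugin registry
// through a static object, so getPluginCreator("Decode_TRT", "1") can find it
// as soon as libdecodeplugin.so is loaded. Assumes the DecodePlugin /
// DecodePluginCreator declarations shown above.
#include "NvInfer.h"

namespace nvinfer1
{
    // ... DecodePlugin and DecodePluginCreator declarations as above ...
    REGISTER_TENSORRT_PLUGIN(DecodePluginCreator);
}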
And this is the terminal output:
root@73183ab47661:/opt/nvidia/deepstream/deepstream-5.0/sources/apps/deepstream-retinaface-multistream# ./deepstream-retinaface-multistream-app ../../../samples/streams/sample_720p.mp4
(gst-plugin-scanner:26): GStreamer-WARNING **: 07:09:47.376: Failed to load plugin '/usr/lib/x86_64-linux-gnu/gstreamer-1.0/deepstream/libnvdsgst_inferserver.so': libtrtserver.so: cannot open shared object file: No such file or directory
Warn: 'threshold' parameter has been deprecated. Use 'pre-cluster-threshold' instead.
Now playing: ../../../samples/streams/sample_720p.mp4
ERROR: ../nvdsinfer/nvdsinfer_func_utils.cpp:31 [TRT]: INVALID_ARGUMENT: getPluginCreator could not find plugin Decode_TRT version 1
ERROR: ../nvdsinfer/nvdsinfer_func_utils.cpp:31 [TRT]: safeDeserializationUtils.cpp (293) - Serialization Error in load: 0 (Cannot deserialize plugin since corresponding IPluginCreator not found in Plugin Registry)
ERROR: ../nvdsinfer/nvdsinfer_func_utils.cpp:31 [TRT]: INVALID_STATE: std::exception
ERROR: ../nvdsinfer/nvdsinfer_func_utils.cpp:31 [TRT]: INVALID_CONFIG: Deserialize the cuda engine failed.
ERROR: ../nvdsinfer/nvdsinfer_model_builder.cpp:1452 Deserialize engine failed from file: /opt/nvidia/deepstream/deepstream-5.0/sources/apps/deepstream-retinaface-multistream/tensorrt_engines_awsT4/retina_r50.engine
0:00:04.315405593 25 0x5618a017b8d0 WARN nvinfer gstnvinfer.cpp:599:gst_nvinfer_logger:<primary-nvinference-engine> NvDsInferContext[UID 1]: Warning from NvDsInferContextImpl::deserializeEngineAndBackend() <nvdsinfer_context_impl.cpp:1566> [UID = 1]: deserialize engine from file :/opt/nvidia/deepstream/deepstream-5.0/sources/apps/deepstream-retinaface-multistream/tensorrt_engines_awsT4/retina_r50.engine failed
0:00:04.315444719 25 0x5618a017b8d0 WARN nvinfer gstnvinfer.cpp:599:gst_nvinfer_logger:<primary-nvinference-engine> NvDsInferContext[UID 1]: Warning from NvDsInferContextImpl::generateBackendContext() <nvdsinfer_context_impl.cpp:1673> [UID = 1]: deserialize backend context from engine from file :/opt/nvidia/deepstream/deepstream-5.0/sources/apps/deepstream-retinaface-multistream/tensorrt_engines_awsT4/retina_r50.engine failed, try rebuild
0:00:04.315464105 25 0x5618a017b8d0 INFO nvinfer gstnvinfer.cpp:602:gst_nvinfer_logger:<primary-nvinference-engine> NvDsInferContext[UID 1]: Info from NvDsInferContextImpl::buildModel() <nvdsinfer_context_impl.cpp:1591> [UID = 1]: Trying to create engine from model files
ERROR: ../nvdsinfer/nvdsinfer_model_builder.cpp:934 failed to build network since there is no model file matched.
ERROR: ../nvdsinfer/nvdsinfer_model_builder.cpp:872 failed to build network.
0:00:04.315759096 25 0x5618a017b8d0 ERROR nvinfer gstnvinfer.cpp:596:gst_nvinfer_logger:<primary-nvinference-engine> NvDsInferContext[UID 1]: Error in NvDsInferContextImpl::buildModel() <nvdsinfer_context_impl.cpp:1611> [UID = 1]: build engine file failed
0:00:04.315783396 25 0x5618a017b8d0 ERROR nvinfer gstnvinfer.cpp:596:gst_nvinfer_logger:<primary-nvinference-engine> NvDsInferContext[UID 1]: Error in NvDsInferContextImpl::generateBackendContext() <nvdsinfer_context_impl.cpp:1697> [UID = 1]: build backend context failed
0:00:04.315800708 25 0x5618a017b8d0 ERROR nvinfer gstnvinfer.cpp:596:gst_nvinfer_logger:<primary-nvinference-engine> NvDsInferContext[UID 1]: Error in NvDsInferContextImpl::initialize() <nvdsinfer_context_impl.cpp:1024> [UID = 1]: generate backend failed, check config file settings
0:00:04.315966590 25 0x5618a017b8d0 WARN nvinfer gstnvinfer.cpp:781:gst_nvinfer_start:<primary-nvinference-engine> error: Failed to create NvDsInferContext instance
0:00:04.315979631 25 0x5618a017b8d0 WARN nvinfer gstnvinfer.cpp:781:gst_nvinfer_start:<primary-nvinference-engine> error: Config file path: retinaface_pgie_config.txt, NvDsInfer Error: NVDSINFER_CONFIG_FAILED
Running...
ERROR from element primary-nvinference-engine: Failed to create NvDsInferContext instance
Error details: gstnvinfer.cpp(781): gst_nvinfer_start (): /GstPipeline:dstest1-pipeline/GstNvInfer:primary-nvinference-engine:
Config file path: retinaface_pgie_config.txt, NvDsInfer Error: NVDSINFER_CONFIG_FAILED
Returned, stopping playback
Deleting pipeline