Could not open lib: /opt/nvidia/deepstream/deepstream-5.0/sources/apps/myapp/libdecodeplugin.so, error string: undefined symbol: getPluginRegistry

Please provide complete information as applicable to your setup.

• Hardware Platform (Jetson / GPU) -> AWS dGPU (T4)
• DeepStream Version -> SDK 5.0
• TensorRT Version -> 7
• NVIDIA GPU Driver Version (valid for GPU only) -> 440.82

I created custom layers that are compiled into the library libdecodeplugin.so. The code for the custom layers, which uses IPluginCreator to create the plugin, is below:

#include "decode.h"
#include "stdio.h"

namespace nvinfer1
{
    DecodePlugin::DecodePlugin()
    {
    }

    DecodePlugin::~DecodePlugin()
    {
    }

    // create the plugin at runtime from a byte stream
    DecodePlugin::DecodePlugin(const void* data, size_t length)
    {
    }

    void DecodePlugin::serialize(void* buffer) const
    {
    }

    size_t DecodePlugin::getSerializationSize() const
    {  
        return 0;
    }

    int DecodePlugin::initialize()
    { 
        return 0;
    }

    Dims DecodePlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)
    {
        //output the result to channel
        int totalCount = 0;
        totalCount += decodeplugin::INPUT_H / 8 * decodeplugin::INPUT_W / 8 * 2 * sizeof(decodeplugin::Detection) / sizeof(float);
        totalCount += decodeplugin::INPUT_H / 16 * decodeplugin::INPUT_W / 16 * 2 * sizeof(decodeplugin::Detection) / sizeof(float);
        totalCount += decodeplugin::INPUT_H / 32 * decodeplugin::INPUT_W / 32 * 2 * sizeof(decodeplugin::Detection) / sizeof(float);

        return Dims3(totalCount + 1, 1, 1);
    }

    // Set plugin namespace
    void DecodePlugin::setPluginNamespace(const char* pluginNamespace)
    {
        mPluginNamespace = pluginNamespace;
    }

    const char* DecodePlugin::getPluginNamespace() const
    {
        return mPluginNamespace;
    }

    // Return the DataType of the plugin output at the requested index
    DataType DecodePlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const
    {
        return DataType::kFLOAT;
    }

    // Return true if output tensor is broadcast across a batch.
    bool DecodePlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const
    {
        return false;
    }

    // Return true if plugin can use input that is broadcast across batch without replication.
    bool DecodePlugin::canBroadcastInputAcrossBatch(int inputIndex) const
    {
        return false;
    }

    void DecodePlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput)
    {
    }

    // Attach the plugin object to an execution context and grant the plugin the access to some context resource.
    void DecodePlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator)
    {
    }

    // Detach the plugin object from its execution context.
    void DecodePlugin::detachFromContext() {}

    const char* DecodePlugin::getPluginType() const
    {
        return "Decode_TRT";
    }

    const char* DecodePlugin::getPluginVersion() const
    {
        return "1";
    }

    void DecodePlugin::destroy()
    {
        delete this;
    }

    // Clone the plugin
    IPluginV2IOExt* DecodePlugin::clone() const
    {
        DecodePlugin *p = new DecodePlugin();
        p->setPluginNamespace(mPluginNamespace);
        return p;
    }

    __device__ float Logist(float data){ return 1./(1. + expf(-data)); };

    __global__ void CalDetection(const float *input, float *output, int num_elem, int step, int anchor) {
    
        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx >= num_elem) return;

        int h = decodeplugin::INPUT_H / step;
        int w = decodeplugin::INPUT_W / step;
        int y = idx / w;
        int x = idx % w;
        const float *bbox_reg = &input[0];
        const float *cls_reg = &input[2 * 4 * num_elem];
        const float *lmk_reg = &input[2 * 4 * num_elem + 2 * 2 * num_elem];

        for (int k = 0; k < 2; ++k) {
            float conf1 = cls_reg[idx + k * num_elem * 2];
            float conf2 = cls_reg[idx + k * num_elem * 2 + num_elem];
            conf2 = expf(conf2) / (expf(conf1) + expf(conf2));
            if (conf2 <= 0.02) continue;

            float *res_count = output;
            int count = (int)atomicAdd(res_count, 1);
            char* data = (char *)res_count + sizeof(float) + count * sizeof(decodeplugin::Detection);
            decodeplugin::Detection* det = (decodeplugin::Detection*)(data);

            float prior[4];
            prior[0] = ((float)x + 0.5) / w;
            prior[1] = ((float)y + 0.5) / h;
            prior[2] = (float)anchor * (k + 1) / decodeplugin::INPUT_W;
            prior[3] = (float)anchor * (k + 1) / decodeplugin::INPUT_H;

            //Location
            det->bbox[0] = prior[0] + bbox_reg[idx + k * num_elem * 4] * 0.1 * prior[2];
            det->bbox[1] = prior[1] + bbox_reg[idx + k * num_elem * 4 + num_elem] * 0.1 * prior[3];
            det->bbox[2] = prior[2] * expf(bbox_reg[idx + k * num_elem * 4 + num_elem * 2] * 0.2);
            det->bbox[3] = prior[3] * expf(bbox_reg[idx + k * num_elem * 4 + num_elem * 3] * 0.2);
            det->bbox[0] -= det->bbox[2] / 2;
            det->bbox[1] -= det->bbox[3] / 2;
            det->bbox[2] += det->bbox[0];
            det->bbox[3] += det->bbox[1];
            det->bbox[0] *= decodeplugin::INPUT_W;
            det->bbox[1] *= decodeplugin::INPUT_H;
            det->bbox[2] *= decodeplugin::INPUT_W;
            det->bbox[3] *= decodeplugin::INPUT_H;
            det->class_confidence = conf2;
            for (int i = 0; i < 10; i += 2) {
                det->landmark[i] = prior[0] + lmk_reg[idx + k * num_elem * 10 + num_elem * i] * 0.1 * prior[2];
                det->landmark[i+1] = prior[1] + lmk_reg[idx + k * num_elem * 10 + num_elem * (i + 1)] * 0.1 * prior[3];
                det->landmark[i] *= decodeplugin::INPUT_W;
                det->landmark[i+1] *= decodeplugin::INPUT_H;
            }
        }
    }

    void DecodePlugin::forwardGpu(const float *const * inputs, float * output, cudaStream_t stream, int batchSize) 
    {
        int num_elem = 0;
        int base_step = 8;
        int base_anchor = 16;
        int thread_count;
        cudaMemset(output, 0, sizeof(float));
        for (unsigned int i = 0; i < 3; ++i)
        {
            num_elem = decodeplugin::INPUT_H / base_step * decodeplugin::INPUT_W / base_step;
            thread_count = (num_elem < thread_count_) ? num_elem : thread_count_;
            // launch on the stream passed in from enqueue()
            CalDetection<<<(num_elem + thread_count - 1) / thread_count, thread_count, 0, stream>>>
                (inputs[i], output, num_elem, base_step, base_anchor);
            base_step *= 2;
            base_anchor *= 4;
        }
    }

    int DecodePlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream)
    {
        //assert(batchSize == 1);
        //GPU
        //CUDA_CHECK(cudaStreamSynchronize(stream));
        forwardGpu((const float *const *)inputs,(float *)outputs[0],stream,batchSize);

        return 0;
    };

    PluginFieldCollection DecodePluginCreator::mFC{};
    std::vector<PluginField> DecodePluginCreator::mPluginAttributes;

    DecodePluginCreator::DecodePluginCreator()
    {
        mPluginAttributes.clear();

        mFC.nbFields = mPluginAttributes.size();
        mFC.fields = mPluginAttributes.data();
    }

    const char* DecodePluginCreator::getPluginName() const
    {
        return DECODE_PLUGIN_NAME;
    }

    const char* DecodePluginCreator::getPluginVersion() const
    {
        return DECODE_PLUGIN_VERSION;
    }

    const PluginFieldCollection* DecodePluginCreator::getFieldNames()
    {
        return &mFC;
    }

    IPluginV2IOExt* DecodePluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc)
    {
        DecodePlugin* obj = new DecodePlugin();
        obj->setPluginNamespace(mNamespace.c_str());
        return obj;
    }

    IPluginV2IOExt* DecodePluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)
    {
        // This object will be deleted when the network is destroyed, which will
        // call DecodePlugin::destroy()
        DecodePlugin* obj = new DecodePlugin(serialData, serialLength);
        obj->setPluginNamespace(mNamespace.c_str());
        return obj;
    }
    REGISTER_TENSORRT_PLUGIN(DecodePluginCreator);
}

This is saved in a file with a .cu extension. Deserializing the engine file inside the NVIDIA TensorRT container works perfectly fine, but when I use libdecodeplugin.so to deserialize the generated engine file inside the NVIDIA DeepStream container, it does not work. Below is the terminal output:

Warn: 'threshold' parameter has been deprecated. Use 'pre-cluster-threshold' instead.
Now playing: ../../../samples/streams/sample_720p.mp4
ERROR: ../nvdsinfer/nvdsinfer_func_utils.cpp:60 Could not open lib: /opt/nvidia/deepstream/deepstream-5.0/sources/apps/deepstream-retinaface-multistream/libdecodeplugin.so, error string: /opt/nvidia/deepstream/deepstream-5.0/sources/apps/deepstream-retinaface-multistream/libdecodeplugin.so: undefined symbol: getPluginRegistry
0:00:00.986501706   101 0x557a92760ed0 ERROR                nvinfer gstnvinfer.cpp:596:gst_nvinfer_logger:<primary-nvinference-engine> NvDsInferContext[UID 1]: Error in NvDsInferContextImpl::initialize() <nvdsinfer_context_impl.cpp:1015> [UID = 1]: Could not open custom lib: (null)
0:00:00.986549778   101 0x557a92760ed0 WARN                 nvinfer gstnvinfer.cpp:781:gst_nvinfer_start:<primary-nvinference-engine> error: Failed to create NvDsInferContext instance
0:00:00.986563826   101 0x557a92760ed0 WARN                 nvinfer gstnvinfer.cpp:781:gst_nvinfer_start:<primary-nvinference-engine> error: Config file path: retinaface_pgie_config.txt, NvDsInfer Error: NVDSINFER_CUSTOM_LIB_FAILED
Running...
ERROR from element primary-nvinference-engine: Failed to create NvDsInferContext instance
Error details: gstnvinfer.cpp(781): gst_nvinfer_start (): /GstPipeline:dstest1-pipeline/GstNvInfer:primary-nvinference-engine:
Config file path: retinaface_pgie_config.txt, NvDsInfer Error: NVDSINFER_CUSTOM_LIB_FAILED
Returned, stopping playback
Deleting pipeline 

I did not find any good resources for this issue. Thanks in advance!

Which TensorRT container are you using?
Could you share the result of “ldd libdecodeplugin.so” in your TensorRT container?

Using this TensorRT container ->
nvcr.io/nvidia/tensorrt:20.03-py3

@bcao Result of ldd libdecodeplugin.so in TensorRT container:

root@84f80a226000:/home/retinaface_tensorrt/build# ldd libdecodeplugin.so 
    linux-vdso.so.1 (0x00007fff9c9ea000)
    libcudart.so.10.2 => /usr/local/cuda/lib64/libcudart.so.10.2 (0x00007f4fdb949000)
    libstdc++.so.6 => /usr/lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007f4fdb5c0000)
    libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007f4fdb3a8000)
    libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f4fdafb7000)
    /lib64/ld-linux-x86-64.so.2 (0x00007f4fdbdcf000)
    libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f4fdadb3000)
    libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f4fdab94000)
    librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007f4fda98c000)
    libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007f4fda5ee000)

Your ldd output shows that libdecodeplugin.so is not linked against libnvinfer. getPluginRegistry is defined in /usr/lib/x86_64-linux-gnu/libnvinfer.so.7, so I think you should link against that library.
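
As a quick sanity check (illustrative commands; the library paths may differ in your container), you can confirm where the symbol is exported and, after relinking, that the plugin actually depends on libnvinfer:

# the symbol should be exported by libnvinfer
nm -D /usr/lib/x86_64-linux-gnu/libnvinfer.so.7 | grep getPluginRegistry

# after rebuilding with the nvinfer link, it should appear in the plugin's dependencies
ldd libdecodeplugin.so | grep nvinfer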

@y14uc339

Add an extra line to CMakeLists.txt to link your libdecodeplugin.so against nvinfer:

target_link_libraries(decodeplugin nvinfer)
target_link_libraries(retina_50 nvinfer)
target_link_libraries(retina_50 cudart)
target_link_libraries(retina_50 decodeplugin)
target_link_libraries(retina_50 ${OpenCV_LIBRARIES})

You should also update the CMake minimum version to 3.13:

cmake_minimum_required(VERSION 3.13)
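
For reference, a minimal CMakeLists.txt sketch putting these pieces together (the source file names decode.cu and retina_r50.cpp and the include/library paths are assumptions; adjust them to your project layout):

cmake_minimum_required(VERSION 3.13)
project(retinaface LANGUAGES CXX CUDA)

find_package(OpenCV REQUIRED)

# TensorRT and CUDA locations (assumed; adjust for your container)
include_directories(/usr/include/x86_64-linux-gnu /usr/local/cuda/include)
link_directories(/usr/lib/x86_64-linux-gnu /usr/local/cuda/lib64)

# The plugin library must link against nvinfer so that symbols such as
# getPluginRegistry resolve when DeepStream dlopen()s libdecodeplugin.so
add_library(decodeplugin SHARED decode.cu)
target_link_libraries(decodeplugin nvinfer cudart)

# Sample app that builds and deserializes the engine
add_executable(retina_50 retina_r50.cpp)
target_link_libraries(retina_50 nvinfer cudart decodeplugin ${OpenCV_LIBRARIES})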

Okay I’ll try this right away!