Using half precision engine for Deepstream inference

Hello,

I successfully managed to run a custom detector engine in DeepStream 4.0.1 in full precision mode.

Now I am trying to have it working in half precision but the output is not as expected.

The engine has been built with TensorRT 5 C++ API.

My doubt is:
If my engine expects a half-precision input, does DeepStream provide an already converted FP16 data buffer (I set network-mode=2 in DeepStream), or am I supposed to do the conversion inside the engine?

I noticed that when engines are created with tensorrt parser from a tensorflow pb file, instead of being built with the API, this problem does not show up and everything seems to work.

Thanks,

f

Hi,

Have you updated the network-mode flag?

Ex. config_infer_primary_ssd.txt

[property]
...
## 0=FP32, 1=INT8, 2=FP16 mode
network-mode=2

Thanks.

Hi,
thanks for replying, and yes, I set network-mode=2, as I wrote on the original post.

Does this ensure that the engine receives a half-precision buffer, or am I still in charge of the conversion?

thanks again,

f

Hi,

The conversion is automatically applied when the flag is set to FP16.

May I know if there is any plugin TensorRT layer inside your model?
If no, could you share your model with us so we can reproduce on our side?

Thanks.

Hi,
thanks for replying.
We indeed use a couple of custom plugins inside our model.

I think it is also interesting to show how we actually use the generated engine outside DeepStream.
The piece of code is shown below. Basically, we manually cast the data buffer (originally FP32) to FP16 before the engine runs, and back to FP32 afterwards. As I understand it, DeepStream takes care of this conversion itself and simply enqueues data to the engine, so the whole pipeline should be equivalent.

// Runs one inference pass: copies `input` (host, FP32) to the GPU, optionally
// converts it to FP16, enqueues the engine, converts the outputs back to FP32,
// and copies them into the caller-provided host buffers.
//
// Relies on file-scope state not visible in this excerpt — presumably:
// `buffers` (FP32 device bindings), `buffers_h` (FP16 device bindings),
// `params` (dims/precision config), `stream`, the netInputIndex/netOutputIndex*
// binding indices, and the CHECK error macro. TODO confirm their definitions.
void doInference(IExecutionContext& netContext, float* input, float* outputboxes, float* outputscores, int32_t* outputidx, int batchSize)
{
	// DMA the input to the GPU (async on `stream`; completion guaranteed by the
	// cudaStreamSynchronize at the end of this function)
    CHECK(cudaMemcpyAsync(buffers[netInputIndex], input, batchSize * params.INPUT_H * params.INPUT_W * params.INPUT_C * sizeof(float), cudaMemcpyHostToDevice, stream));

    // cast to half precision if needed (FP32 buffer -> separate FP16 buffer set)
    if (params.DATATYPE == DataType::kHALF)
        cudaCastFloatToHalf_device((float*)buffers[netInputIndex], (__half*)buffers_h[netInputIndex], batchSize * params.INPUT_H * params.INPUT_W * params.INPUT_C, stream);

    // feed data to the engine: FP16 engines consume the buffers_h bindings,
    // FP32 engines consume buffers directly
    if (params.DATATYPE == DataType::kHALF)
    	netContext.enqueue(batchSize, buffers_h, stream, nullptr);
    else
        netContext.enqueue(batchSize, buffers, stream, nullptr);

    // cast back to full precision if needed, so the device-to-host copies below
    // can always read FP32 data out of `buffers`
    if (params.DATATYPE == DataType::kHALF)
    {
        cudaCastHalfToFloat_device((__half*)buffers_h[netOutputIndexBoxes], (float*)buffers[netOutputIndexBoxes], batchSize * params.OUTPUT_W * params.OUTPUT_H * params.NUM_CLASSES * 4, stream);
        cudaCastHalfToFloat_device((__half*)buffers_h[netOutputIndexScores], (float*)buffers[netOutputIndexScores], batchSize * params.OUTPUT_W * params.OUTPUT_H * params.NUM_CLASSES, stream);
        // NOTE(review): this permanently repoints buffers[netOutputIndexIdx] at the
        // FP16-side buffer (int32 data needs no cast); after the first FP16 call the
        // original buffers[] entry is no longer reachable here — verify this is intended.
        buffers[netOutputIndexIdx] = (int32_t*)buffers_h[netOutputIndexIdx];
    }


    // DMA back to CPU
    CHECK(cudaMemcpyAsync(outputboxes, (float*)buffers[netOutputIndexBoxes], batchSize * params.OUTPUT_W * params.OUTPUT_H * params.NUM_CLASSES * 4 * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(outputscores, (float*)buffers[netOutputIndexScores], batchSize * params.OUTPUT_W * params.OUTPUT_H * params.NUM_CLASSES * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(outputidx, (int32_t*)buffers[netOutputIndexIdx], batchSize * params.N_MAX_BBOXES_PER_CLASS * params.NUM_CLASSES * sizeof(int32_t), cudaMemcpyDeviceToHost, stream));

    // synchro tasks — blocks until all async work above has finished
    // NOTE(review): return value unchecked; an error from any of the async
    // operations above would be silently dropped here.
	cudaStreamSynchronize(stream);
}

I also share below the implementation of the two custom plugins we use.
Hope that helps to understand the problem.
Thanks again,

f

#include <string.h> 
#include <iostream>
#include <cassert>
#include <vector>
#include <functional>
#include <numeric>
#include <algorithm>
#include "NvInferPlugin.h"
#include "cuda_runtime_api.h"


// CUDA kernel launcher (defined elsewhere, presumably in a .cu file): runs the
// post-processing kernel over the score/box/pooled-score maps and writes the
// decoded boxes and filtered scores. All pointers are device pointers; work is
// enqueued on `stream`.
void postProMapsLauncher(const int batch_size, const int *map_size, const float thr,
                        const float *data_in_l, const float *data_in_b, const float *data_in_l_pool, float *data_out_b, float *data_out_s, cudaStream_t stream);

// CUDA kernel launcher (defined elsewhere): bilinear resize of a CHW tensor
// from input_size to output_size on `stream`. All pointers are device pointers.
void resizeBilinear(float resize_factor_h, float resize_factor_w, float* input, float* output, int* input_size, int* output_size, int batch_size, cudaStream_t stream);


using namespace nvinfer1;

class PostproLayer : public IPluginV2
{
public:
    PostproLayer(const float score_threshold, const int map_height, const int map_width, const int map_depth)
    {
        scoreThreshold = score_threshold;
        mapHeight = map_height;
        mapWidth = map_width;
        mapDepth = map_depth;
    }

    PostproLayer(const void* data, size_t length)
    {
        const char* d = static_cast<const char*>(data);
        scoreThreshold = read<float>(d);
        mapHeight = read<int>(d);
        mapWidth = read<int>(d);
        mapDepth = read<int>(d);
    }

    // It makes no sense to construct UffPoolPluginV2 without arguments.
    PostproLayer() = delete;

    virtual ~PostproLayer() {}

    int getNbOutputs() const override
    {
        return 2;
    }

    Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
    {
        assert(nbInputDims == 3);
        assert(inputs[0].nbDims == 3);
        assert(inputs[1].nbDims == 3);
        assert(inputs[2].nbDims == 3);

        if (index == 0)
            return DimsCHW(mapDepth, mapHeight * mapWidth, 4);

        if (index == 1)
            return DimsCHW(mapDepth, 1, mapHeight * mapWidth);

        return DimsCHW(mapDepth, mapHeight * mapWidth, 4);
    }

    int initialize() override { return 0; }

    void terminate() override { ; }

    size_t getWorkspaceSize(int maxBatchSize) const override { return 0; }

    int enqueue(int batch_size, const void*const *inputs, void** outputs, void*, cudaStream_t stream) override
    {
        int map_size[] {mapHeight, mapWidth, mapDepth};
        float thr = scoreThreshold;
        float *data_in_l = (float*)inputs[0];
        float *data_in_b = (float*)inputs[1];
        float *data_in_l_pool = (float*)inputs[2];
        float *data_out_b = (float*)outputs[0];
        float *data_out_s = (float*)outputs[1];

        postProMapsLauncher(batch_size, map_size, thr, data_in_l, data_in_b, data_in_l_pool, data_out_b, data_out_s, stream);

        return 0;
    }

    size_t getSerializationSize() const { return sizeof(float) + 3 * sizeof(int); }

    void serialize(void* buffer) const
    {
        char *d = reinterpret_cast<char*>(buffer);
        write(d, scoreThreshold);
        write(d, mapHeight);
        write(d, mapWidth);
        write(d, mapDepth);
    }

    void configureWithFormat(const Dims* inputs, int nbInputs, const Dims* outputDims, int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override
    {
        assert(nbOutputs == 2);
        assert(inputs[0].nbDims == 3);
        assert(inputs[1].nbDims == 3);
        assert(inputs[2].nbDims == 3);
        for (int i = 0; i < nbInputs; ++i)
        {
            assert(inputs[i].d[1] == mapHeight);
            assert(inputs[i].d[2] == mapWidth);
        }
    }

    bool supportsFormat(DataType type, PluginFormat format) const override { return (type == DataType::kFLOAT && format == PluginFormat::kNCHW); }

    const char* getPluginType() const override { return "PostProcessing_TRT"; }

    const char* getPluginVersion() const override { return "1"; }

    void destroy() override { delete this; }

    IPluginV2* clone() const { return new PostproLayer(scoreThreshold, mapHeight, mapWidth, mapDepth); }

    void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; }

    const char* getPluginNamespace() const override { return mNamespace.c_str(); }

private:
    template <typename T>
    void write(char*& buffer, const T& val) const
    {
        *reinterpret_cast<T*>(buffer) = val;
        buffer += sizeof(T);
    }

    template <typename T>
    T read(const char*& buffer)
    {
        T val = *reinterpret_cast<const T*>(buffer);
        buffer += sizeof(T);
        return val;
    }

    float scoreThreshold;
    int mapHeight;    
    int mapWidth;
    int mapDepth;
    std::string mNamespace;
};

// Plugin identity strings; must match PostproLayer::getPluginType()/Version()
// so the registry can pair the creator with serialized engines.
namespace
{
const char* POSTPROLAYER_PLUGIN_VERSION{"1"};
const char* POSTPROLAYER_PLUGIN_NAME{"PostProcessing_TRT"};
} // namespace


class PostproLayerPluginCreator : public IPluginCreator
{
public:

    PostproLayerPluginCreator()
    {
        mPluginAttributes.emplace_back(PluginField("score_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
        mPluginAttributes.emplace_back(PluginField("map_height", nullptr, PluginFieldType::kINT32, 1));
        mPluginAttributes.emplace_back(PluginField("map_width", nullptr, PluginFieldType::kINT32, 1));
        mPluginAttributes.emplace_back(PluginField("map_depth", nullptr, PluginFieldType::kINT32, 1));

        mFC.nbFields = mPluginAttributes.size();
        mFC.fields = mPluginAttributes.data();
    }

    ~PostproLayerPluginCreator() {}

    const char* getPluginName() const override { return POSTPROLAYER_PLUGIN_NAME; }

    const char* getPluginVersion() const override { return POSTPROLAYER_PLUGIN_VERSION; }

    const PluginFieldCollection* getFieldNames() override { return &mFC; }

    IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) override
    {
        const PluginField* fields = fc->fields;

        for (int i = 0; i < fc->nbFields; ++i)
        {
            const char* attrName = fields[i].name;
            if (!strcmp(attrName, "score_threshold"))
            {
                assert(fields[i].type == PluginFieldType::kFLOAT32);
                scoreThreshold = *(static_cast<const float*>(fields[i].data));
            }
            if (!strcmp(attrName, "map_height"))
            {
                assert(fields[i].type == PluginFieldType::kINT32);
                mapHeight = *(static_cast<const int*>(fields[i].data));
            }
            if (!strcmp(attrName, "map_width"))
            {
                assert(fields[i].type == PluginFieldType::kINT32);
                mapWidth = *(static_cast<const int*>(fields[i].data));
            }
            if (!strcmp(attrName, "map_depth"))
            {
                assert(fields[i].type == PluginFieldType::kINT32);
                mapDepth = *(static_cast<const int*>(fields[i].data));
            }
        }

        return new PostproLayer(scoreThreshold, mapHeight, mapWidth, mapDepth);
    }

    IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override
    {
        auto plugin = new PostproLayer(serialData, serialLength);
        mPluginName = name;
        return plugin;
    }

    void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; }

    const char* getPluginNamespace() const override { return mNamespace.c_str(); }

private:
    float scoreThreshold;
    int mapHeight;    
    int mapWidth;
    int mapDepth;
    std::string mNamespace;
    std::string mPluginName;
    std::vector<PluginField> mPluginAttributes;
    PluginFieldCollection mFC;
};






class ResizeLayer : public IPluginV2
{
public:
    ResizeLayer(int in_dims_c, int in_dims_h, int in_dims_w) 
    {
        dimsDataInC = in_dims_c;
        dimsDataInH = in_dims_h;        
        dimsDataInW = in_dims_w;
    }

    ResizeLayer(const void* data, size_t length)
    {
        const char* d = static_cast<const char*>(data);
        dimsDataInC = read<int>(d);
        dimsDataInH = read<int>(d);        
        dimsDataInW = read<int>(d);
    }

    // It makes no sense to construct UffPoolPluginV2 without arguments.
    ResizeLayer() = delete;

    virtual ~ResizeLayer() {}

    int getNbOutputs() const override { return 1; }

    Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
    {
        assert(nbInputDims == 1);
        assert(index == 0);
        assert(inputs[index].nbDims == 3);

        int output_c = inputs[index].d[0];
        int output_h = 128;//static_cast<const int>(inputs[index].d[1] * resizeFactorH);
        int output_w = 192;//static_cast<const int>(inputs[index].d[2] * resizeFactorW);

        return DimsCHW(output_c, output_h, output_w);
    }

    int initialize() override { return 0; }

    void terminate() override { ; }

    size_t getWorkspaceSize(int maxBatchSize) const override { return 0; }

    int enqueue(int batchSize, const void*const *inputs, void** outputs, void*, cudaStream_t stream)
    {
        int input_size[] {dimsDataInC, dimsDataInH, dimsDataInW};
        int output_size[] {dimsDataInC, 128, 192};
        float *d_input = (float*)inputs[0];
        float *d_output = (float*)outputs[0];
        float resize_factor_h = float(output_size[1]) / float(dimsDataInH);
        float resize_factor_w = float(output_size[2]) / float(dimsDataInW);

        resizeBilinear(resize_factor_h, resize_factor_w, d_input, d_output, input_size, output_size, batchSize, stream);

        return 0;
    }

    size_t getSerializationSize() const
    {
        return  3 * sizeof(int);
    }

    void serialize(void* buffer) const
    {
        char *d = reinterpret_cast<char*>(buffer);
        write(d, dimsDataInC);
        write(d, dimsDataInH);
        write(d, dimsDataInW);
    }

    void configureWithFormat(const Dims* inputs, int nbInputs, const Dims* outputDims, int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override
    {
        assert(nbOutputs == 1);
        assert(inputs[0].nbDims == 3);
    }

    bool supportsFormat(DataType type, PluginFormat format) const override { return (type == DataType::kFLOAT && format == PluginFormat::kNCHW); }

    const char* getPluginType() const override { return "Resize_TRT"; }

    const char* getPluginVersion() const override { return "1"; }

    void destroy() override { delete this; }

    IPluginV2* clone() const { return new ResizeLayer(dimsDataInC, dimsDataInH, dimsDataInW) ; }

    void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; }

    const char* getPluginNamespace() const override { return mNamespace.c_str(); }

private:
    template <typename T>
    void write(char*& buffer, const T& val) const
    {
        *reinterpret_cast<T*>(buffer) = val;
        buffer += sizeof(T);
    }

    template <typename T>
    T read(const char*& buffer)
    {
        T val = *reinterpret_cast<const T*>(buffer);
        buffer += sizeof(T);
        return val;
    }

    int dimsDataInC, dimsDataInH, dimsDataInW;
    std::string mNamespace;
};

// Plugin identity strings; must match ResizeLayer::getPluginType()/Version()
// so the registry can pair the creator with serialized engines.
namespace
{
const char* RESIZELAYER_PLUGIN_VERSION{"1"};
const char* RESIZELAYER_PLUGIN_NAME{"Resize_TRT"};
} // namespace


// Factory for ResizeLayer: builds plugins from PluginField attributes at
// network-definition time and from raw bytes at engine-deserialization time.
class ResizeLayerPluginCreator : public IPluginCreator
{
public:
    ResizeLayerPluginCreator()
    {
        // Attribute schema advertised to parsers via getFieldNames().
        mPluginAttributes.emplace_back(PluginField("dims_data_in_0", nullptr, PluginFieldType::kINT32, 1));
        mPluginAttributes.emplace_back(PluginField("dims_data_in_1", nullptr, PluginFieldType::kINT32, 1));
        mPluginAttributes.emplace_back(PluginField("dims_data_in_2", nullptr, PluginFieldType::kINT32, 1));

        mFC.nbFields = mPluginAttributes.size();
        mFC.fields = mPluginAttributes.data();
    }

    ~ResizeLayerPluginCreator() {}

    const char* getPluginName() const override { return RESIZELAYER_PLUGIN_NAME; }

    const char* getPluginVersion() const override { return RESIZELAYER_PLUGIN_VERSION; }

    const PluginFieldCollection* getFieldNames() override { return &mFC; }

    // Creates a plugin from parsed attributes. Attributes not present in `fc`
    // keep their zero defaults (previously they were read uninitialized).
    IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) override
    {
        mPluginName = name;
        const PluginField* fields = fc->fields;

        for (int i = 0; i < fc->nbFields; ++i)
        {
            const char* attrName = fields[i].name;
            if (!strcmp(attrName, "dims_data_in_0"))
            {
                assert(fields[i].type == PluginFieldType::kINT32);
                dimsDataInC = *(static_cast<const int*>(fields[i].data));
            }
            if (!strcmp(attrName, "dims_data_in_1"))
            {
                assert(fields[i].type == PluginFieldType::kINT32);
                dimsDataInH = *(static_cast<const int*>(fields[i].data));
            }
            if (!strcmp(attrName, "dims_data_in_2"))
            {
                assert(fields[i].type == PluginFieldType::kINT32);
                dimsDataInW = *(static_cast<const int*>(fields[i].data));
            }
        }

        return new ResizeLayer(dimsDataInC, dimsDataInH, dimsDataInW);
    }

    // Recreates a plugin from the bytes written by ResizeLayer::serialize().
    IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override
    {
        auto plugin = new ResizeLayer(serialData, serialLength);
        mPluginName = name;
        return plugin;
    }

    void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; }

    const char* getPluginNamespace() const override { return mNamespace.c_str(); }

private:
    // Zero-initialized so createPlugin() never reads indeterminate values
    // when an attribute is missing from the field collection.
    int dimsDataInC{0}, dimsDataInH{0}, dimsDataInW{0};
    std::string mNamespace;
    std::string mPluginName;
    std::vector<PluginField> mPluginAttributes;
    PluginFieldCollection mFC;
};


// Static registration with the TensorRT plugin registry so deserialized
// engines can find these creators by plugin type/version at load time.
REGISTER_TENSORRT_PLUGIN(PostproLayerPluginCreator);
REGISTER_TENSORRT_PLUGIN(ResizeLayerPluginCreator);

Hi,

Do you implement cudaCastHalfToFloat_device and cudaCastFloatToHalf_device on your own?

If yes, it is recommended to use the __float2half and __half2float conversion functions from the CUDA toolkit instead:
[url]CUDA Math API :: CUDA Toolkit Documentation[/url]

Since not only the input but also the weights need to be converted into FP16 precision,
it would be better to use the same conversion approach to get a consistent inference result.

Thanks.