Hi,
thanks for replying.
We indeed use a couple of custom plugins inside our model.
I think it is also interesting to show how we actually use the generated engine outside DeepStream.
The piece of code is shown below. Basically, we manually cast the data buffer (originally FP32) to FP16 before the engine, and back to FP32 after it. I understand that DeepStream takes care of this process itself and simply enqueues data to the engine, so the whole pipeline should be equivalent.
// Runs a single inference pass on the TensorRT engine.
//
// Steps: (1) async H2D copy of the FP32 input, (2) optional FP32->FP16 cast
// when the engine was built in half precision, (3) enqueue, (4) optional
// FP16->FP32 cast of the box/score outputs, (5) async D2H copies of the three
// outputs, (6) stream synchronization.
//
// Relies on file-scope state not visible in this excerpt: buffers / buffers_h
// (device binding arrays), params, stream, and the net*Index binding indices.
// NOTE(review): assumes those device buffers were allocated large enough for
// `batchSize` — confirm against the allocation code.
void doInference(IExecutionContext& netContext, float* input, float* outputboxes, float* outputscores, int32_t* outputidx, int batchSize)
{
    // Hoist the element counts so every copy/cast uses the same expression.
    const size_t inputCount = (size_t)batchSize * params.INPUT_H * params.INPUT_W * params.INPUT_C;
    const size_t boxCount = (size_t)batchSize * params.OUTPUT_W * params.OUTPUT_H * params.NUM_CLASSES * 4;
    const size_t scoreCount = (size_t)batchSize * params.OUTPUT_W * params.OUTPUT_H * params.NUM_CLASSES;
    const size_t idxCount = (size_t)batchSize * params.N_MAX_BBOXES_PER_CLASS * params.NUM_CLASSES;
    const bool useHalf = (params.DATATYPE == DataType::kHALF);

    // DMA the input to the GPU (async on `stream`; the host pointer should be
    // pinned for the copy to actually overlap with compute).
    CHECK(cudaMemcpyAsync(buffers[netInputIndex], input, inputCount * sizeof(float), cudaMemcpyHostToDevice, stream));

    // Cast to half precision if the engine expects FP16 bindings.
    if (useHalf)
        cudaCastFloatToHalf_device((float*)buffers[netInputIndex], (__half*)buffers_h[netInputIndex], inputCount, stream);

    // Feed data to the engine. enqueue() returns false on failure — the
    // original code silently ignored that return value.
    bool enqueued;
    if (useHalf)
        enqueued = netContext.enqueue(batchSize, buffers_h, stream, nullptr);
    else
        enqueued = netContext.enqueue(batchSize, buffers, stream, nullptr);
    if (!enqueued)
        std::cerr << "doInference: IExecutionContext::enqueue failed" << std::endl;

    // Cast back to full precision if needed. The index output is already
    // int32, so it is only re-pointed at the FP16-side binding, not converted.
    if (useHalf)
    {
        cudaCastHalfToFloat_device((__half*)buffers_h[netOutputIndexBoxes], (float*)buffers[netOutputIndexBoxes], boxCount, stream);
        cudaCastHalfToFloat_device((__half*)buffers_h[netOutputIndexScores], (float*)buffers[netOutputIndexScores], scoreCount, stream);
        buffers[netOutputIndexIdx] = (int32_t*)buffers_h[netOutputIndexIdx];
    }

    // DMA the three outputs back to the CPU.
    CHECK(cudaMemcpyAsync(outputboxes, (float*)buffers[netOutputIndexBoxes], boxCount * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(outputscores, (float*)buffers[netOutputIndexScores], scoreCount * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(outputidx, (int32_t*)buffers[netOutputIndexIdx], idxCount * sizeof(int32_t), cudaMemcpyDeviceToHost, stream));

    // Wait for all queued work to finish before the host buffers are read.
    // The original left this return code unchecked, unlike every other call.
    CHECK(cudaStreamSynchronize(stream));
}
I also share below the implementation of the two custom plugins we use.
Hope that helps to understand the problem.
Thanks again,
f
#include <string.h>
#include <iostream>
#include <cassert>
#include <vector>
#include <functional>
#include <numeric>
#include <algorithm>
#include "NvInferPlugin.h"
#include "cuda_runtime_api.h"
// Launchers for the CUDA kernels backing the plugins below (implemented in a
// separate .cu file). Both enqueue asynchronous work on `stream`.
// postProMapsLauncher: converts the raw logit/box/pooled-logit maps into the
// plugin's box and score output tensors, applying the score threshold `thr`.
void postProMapsLauncher(const int batch_size, const int *map_size, const float thr,
const float *data_in_l, const float *data_in_b, const float *data_in_l_pool, float *data_out_b, float *data_out_s, cudaStream_t stream);
// resizeBilinear: bilinear resize of a CHW batch from input_size to output_size.
void resizeBilinear(float resize_factor_h, float resize_factor_w, float* input, float* output, int* input_size, int* output_size, int batch_size, cudaStream_t stream);
using namespace nvinfer1;
class PostproLayer : public IPluginV2
{
public:
PostproLayer(const float score_threshold, const int map_height, const int map_width, const int map_depth)
{
scoreThreshold = score_threshold;
mapHeight = map_height;
mapWidth = map_width;
mapDepth = map_depth;
}
PostproLayer(const void* data, size_t length)
{
const char* d = static_cast<const char*>(data);
scoreThreshold = read<float>(d);
mapHeight = read<int>(d);
mapWidth = read<int>(d);
mapDepth = read<int>(d);
}
// It makes no sense to construct UffPoolPluginV2 without arguments.
PostproLayer() = delete;
virtual ~PostproLayer() {}
int getNbOutputs() const override
{
return 2;
}
Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
{
assert(nbInputDims == 3);
assert(inputs[0].nbDims == 3);
assert(inputs[1].nbDims == 3);
assert(inputs[2].nbDims == 3);
if (index == 0)
return DimsCHW(mapDepth, mapHeight * mapWidth, 4);
if (index == 1)
return DimsCHW(mapDepth, 1, mapHeight * mapWidth);
return DimsCHW(mapDepth, mapHeight * mapWidth, 4);
}
int initialize() override { return 0; }
void terminate() override { ; }
size_t getWorkspaceSize(int maxBatchSize) const override { return 0; }
int enqueue(int batch_size, const void*const *inputs, void** outputs, void*, cudaStream_t stream) override
{
int map_size[] {mapHeight, mapWidth, mapDepth};
float thr = scoreThreshold;
float *data_in_l = (float*)inputs[0];
float *data_in_b = (float*)inputs[1];
float *data_in_l_pool = (float*)inputs[2];
float *data_out_b = (float*)outputs[0];
float *data_out_s = (float*)outputs[1];
postProMapsLauncher(batch_size, map_size, thr, data_in_l, data_in_b, data_in_l_pool, data_out_b, data_out_s, stream);
return 0;
}
size_t getSerializationSize() const { return sizeof(float) + 3 * sizeof(int); }
void serialize(void* buffer) const
{
char *d = reinterpret_cast<char*>(buffer);
write(d, scoreThreshold);
write(d, mapHeight);
write(d, mapWidth);
write(d, mapDepth);
}
void configureWithFormat(const Dims* inputs, int nbInputs, const Dims* outputDims, int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override
{
assert(nbOutputs == 2);
assert(inputs[0].nbDims == 3);
assert(inputs[1].nbDims == 3);
assert(inputs[2].nbDims == 3);
for (int i = 0; i < nbInputs; ++i)
{
assert(inputs[i].d[1] == mapHeight);
assert(inputs[i].d[2] == mapWidth);
}
}
bool supportsFormat(DataType type, PluginFormat format) const override { return (type == DataType::kFLOAT && format == PluginFormat::kNCHW); }
const char* getPluginType() const override { return "PostProcessing_TRT"; }
const char* getPluginVersion() const override { return "1"; }
void destroy() override { delete this; }
IPluginV2* clone() const { return new PostproLayer(scoreThreshold, mapHeight, mapWidth, mapDepth); }
void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; }
const char* getPluginNamespace() const override { return mNamespace.c_str(); }
private:
template <typename T>
void write(char*& buffer, const T& val) const
{
*reinterpret_cast<T*>(buffer) = val;
buffer += sizeof(T);
}
template <typename T>
T read(const char*& buffer)
{
T val = *reinterpret_cast<const T*>(buffer);
buffer += sizeof(T);
return val;
}
float scoreThreshold;
int mapHeight;
int mapWidth;
int mapDepth;
std::string mNamespace;
};
namespace
{
// Registry name/version for PostproLayer; must match the values returned by
// PostproLayer::getPluginType() / getPluginVersion().
const char* POSTPROLAYER_PLUGIN_VERSION{"1"};
const char* POSTPROLAYER_PLUGIN_NAME{"PostProcessing_TRT"};
} // namespace
class PostproLayerPluginCreator : public IPluginCreator
{
public:
PostproLayerPluginCreator()
{
mPluginAttributes.emplace_back(PluginField("score_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
mPluginAttributes.emplace_back(PluginField("map_height", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("map_width", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("map_depth", nullptr, PluginFieldType::kINT32, 1));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
~PostproLayerPluginCreator() {}
const char* getPluginName() const override { return POSTPROLAYER_PLUGIN_NAME; }
const char* getPluginVersion() const override { return POSTPROLAYER_PLUGIN_VERSION; }
const PluginFieldCollection* getFieldNames() override { return &mFC; }
IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) override
{
const PluginField* fields = fc->fields;
for (int i = 0; i < fc->nbFields; ++i)
{
const char* attrName = fields[i].name;
if (!strcmp(attrName, "score_threshold"))
{
assert(fields[i].type == PluginFieldType::kFLOAT32);
scoreThreshold = *(static_cast<const float*>(fields[i].data));
}
if (!strcmp(attrName, "map_height"))
{
assert(fields[i].type == PluginFieldType::kINT32);
mapHeight = *(static_cast<const int*>(fields[i].data));
}
if (!strcmp(attrName, "map_width"))
{
assert(fields[i].type == PluginFieldType::kINT32);
mapWidth = *(static_cast<const int*>(fields[i].data));
}
if (!strcmp(attrName, "map_depth"))
{
assert(fields[i].type == PluginFieldType::kINT32);
mapDepth = *(static_cast<const int*>(fields[i].data));
}
}
return new PostproLayer(scoreThreshold, mapHeight, mapWidth, mapDepth);
}
IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override
{
auto plugin = new PostproLayer(serialData, serialLength);
mPluginName = name;
return plugin;
}
void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; }
const char* getPluginNamespace() const override { return mNamespace.c_str(); }
private:
float scoreThreshold;
int mapHeight;
int mapWidth;
int mapDepth;
std::string mNamespace;
std::string mPluginName;
std::vector<PluginField> mPluginAttributes;
PluginFieldCollection mFC;
};
class ResizeLayer : public IPluginV2
{
public:
ResizeLayer(int in_dims_c, int in_dims_h, int in_dims_w)
{
dimsDataInC = in_dims_c;
dimsDataInH = in_dims_h;
dimsDataInW = in_dims_w;
}
ResizeLayer(const void* data, size_t length)
{
const char* d = static_cast<const char*>(data);
dimsDataInC = read<int>(d);
dimsDataInH = read<int>(d);
dimsDataInW = read<int>(d);
}
// It makes no sense to construct UffPoolPluginV2 without arguments.
ResizeLayer() = delete;
virtual ~ResizeLayer() {}
int getNbOutputs() const override { return 1; }
Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
{
assert(nbInputDims == 1);
assert(index == 0);
assert(inputs[index].nbDims == 3);
int output_c = inputs[index].d[0];
int output_h = 128;//static_cast<const int>(inputs[index].d[1] * resizeFactorH);
int output_w = 192;//static_cast<const int>(inputs[index].d[2] * resizeFactorW);
return DimsCHW(output_c, output_h, output_w);
}
int initialize() override { return 0; }
void terminate() override { ; }
size_t getWorkspaceSize(int maxBatchSize) const override { return 0; }
int enqueue(int batchSize, const void*const *inputs, void** outputs, void*, cudaStream_t stream)
{
int input_size[] {dimsDataInC, dimsDataInH, dimsDataInW};
int output_size[] {dimsDataInC, 128, 192};
float *d_input = (float*)inputs[0];
float *d_output = (float*)outputs[0];
float resize_factor_h = float(output_size[1]) / float(dimsDataInH);
float resize_factor_w = float(output_size[2]) / float(dimsDataInW);
resizeBilinear(resize_factor_h, resize_factor_w, d_input, d_output, input_size, output_size, batchSize, stream);
return 0;
}
size_t getSerializationSize() const
{
return 3 * sizeof(int);
}
void serialize(void* buffer) const
{
char *d = reinterpret_cast<char*>(buffer);
write(d, dimsDataInC);
write(d, dimsDataInH);
write(d, dimsDataInW);
}
void configureWithFormat(const Dims* inputs, int nbInputs, const Dims* outputDims, int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override
{
assert(nbOutputs == 1);
assert(inputs[0].nbDims == 3);
}
bool supportsFormat(DataType type, PluginFormat format) const override { return (type == DataType::kFLOAT && format == PluginFormat::kNCHW); }
const char* getPluginType() const override { return "Resize_TRT"; }
const char* getPluginVersion() const override { return "1"; }
void destroy() override { delete this; }
IPluginV2* clone() const { return new ResizeLayer(dimsDataInC, dimsDataInH, dimsDataInW) ; }
void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; }
const char* getPluginNamespace() const override { return mNamespace.c_str(); }
private:
template <typename T>
void write(char*& buffer, const T& val) const
{
*reinterpret_cast<T*>(buffer) = val;
buffer += sizeof(T);
}
template <typename T>
T read(const char*& buffer)
{
T val = *reinterpret_cast<const T*>(buffer);
buffer += sizeof(T);
return val;
}
int dimsDataInC, dimsDataInH, dimsDataInW;
std::string mNamespace;
};
namespace
{
// Registry name/version for ResizeLayer; must match the values returned by
// ResizeLayer::getPluginType() / getPluginVersion().
const char* RESIZELAYER_PLUGIN_VERSION{"1"};
const char* RESIZELAYER_PLUGIN_NAME{"Resize_TRT"};
} // namespace
// IPluginCreator for ResizeLayer ("Resize_TRT" / version "1"); registered
// with TensorRT via REGISTER_TENSORRT_PLUGIN below.
class ResizeLayerPluginCreator : public IPluginCreator
{
public:
    ResizeLayerPluginCreator()
    {
        mPluginAttributes.emplace_back(PluginField("dims_data_in_0", nullptr, PluginFieldType::kINT32, 1));
        mPluginAttributes.emplace_back(PluginField("dims_data_in_1", nullptr, PluginFieldType::kINT32, 1));
        mPluginAttributes.emplace_back(PluginField("dims_data_in_2", nullptr, PluginFieldType::kINT32, 1));
        mFC.nbFields = static_cast<int>(mPluginAttributes.size());
        mFC.fields = mPluginAttributes.data();
    }

    ~ResizeLayerPluginCreator() {}

    const char* getPluginName() const override { return RESIZELAYER_PLUGIN_NAME; }
    const char* getPluginVersion() const override { return RESIZELAYER_PLUGIN_VERSION; }
    const PluginFieldCollection* getFieldNames() override { return &mFC; }

    // Builds a ResizeLayer from the parsed plugin fields. Field names are
    // mutually exclusive, so the matches are chained with else-if.
    IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) override
    {
        const PluginField* fields = fc->fields;
        for (int i = 0; i < fc->nbFields; ++i)
        {
            const char* attrName = fields[i].name;
            if (!strcmp(attrName, "dims_data_in_0"))
            {
                assert(fields[i].type == PluginFieldType::kINT32);
                dimsDataInC = *(static_cast<const int*>(fields[i].data));
            }
            else if (!strcmp(attrName, "dims_data_in_1"))
            {
                assert(fields[i].type == PluginFieldType::kINT32);
                dimsDataInH = *(static_cast<const int*>(fields[i].data));
            }
            else if (!strcmp(attrName, "dims_data_in_2"))
            {
                assert(fields[i].type == PluginFieldType::kINT32);
                dimsDataInW = *(static_cast<const int*>(fields[i].data));
            }
        }
        mPluginName = name;
        return new ResizeLayer(dimsDataInC, dimsDataInH, dimsDataInW);
    }

    IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override
    {
        auto plugin = new ResizeLayer(serialData, serialLength);
        mPluginName = name;
        return plugin;
    }

    void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; }
    const char* getPluginNamespace() const override { return mNamespace.c_str(); }

private:
    // Default-initialized so a field collection with missing entries does not
    // hand uninitialized (indeterminate) values to the plugin — previously UB.
    int dimsDataInC{0}, dimsDataInH{0}, dimsDataInW{0};
    std::string mNamespace;
    std::string mPluginName;
    std::vector<PluginField> mPluginAttributes;
    PluginFieldCollection mFC;
};
// Register both creators with TensorRT's global plugin registry at static
// initialization time, so engine deserialization can resolve the plugins by
// their name/version pairs.
REGISTER_TENSORRT_PLUGIN(PostproLayerPluginCreator);
REGISTER_TENSORRT_PLUGIN(ResizeLayerPluginCreator);