I implemented a custom layer in C++ and CUDA by inheriting from IPluginV2 and writing an IPluginCreator. Since I generate my engine in Python, I used pybind11 to bind the constructors of the layer and the creator. In Python, I use network.add_plugin_v2() to add the layers to my network. I replace the original TensorFlow split node with this graphsurgeon configuration:
import graphsurgeon as gs

split10 = gs.create_plugin_node("Split_10",
                                inputChannels=128,
                                inputHeight=60,
                                inputWidth=92,
                                axis=1,
                                num_of_split=4)

namespace_plugin_map = {
    "FeatureExtractor/detnat_59/fpn/top_down/smoothing_1/BA_GConv2d_3_3/split": split10,
    "FeatureExtractor/detnat_59/fpn/top_down/smoothing_1/BA_GConv2d_3_3/split:1": split10,
    "FeatureExtractor/detnat_59/fpn/top_down/smoothing_1/BA_GConv2d_3_3/split:2": split10,
    "FeatureExtractor/detnat_59/fpn/top_down/smoothing_1/BA_GConv2d_3_3/split:3": split10,
}

def preprocess(dynamic_graph):
    # Collapse the split namespaces into the single plugin node
    dynamic_graph.collapse_namespaces(namespace_plugin_map)
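For context, this is roughly how the plugin is added on the Python side (a simplified sketch; trt_split and SplitPlugin are stand-in names for my actual pybind11 module and bound class):

import tensorrt as trt
import trt_split  # hypothetical name for the pybind11 module binding the ctors

# `network` is the INetworkDefinition being built; `prev_tensor` is the
# ITensor that feeds the split.
split_plugin = trt_split.SplitPlugin(128, 60, 92, 1, 4)  # C, H, W, axis, num_of_split
layer = network.add_plugin_v2([prev_tensor], split_plugin)
outputs = [layer.get_output(i) for i in range(layer.num_outputs)]  # four split tensors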
The split then has four outputs. The plugin code is as follows:
// Return the number of split tensors.
int getNbOutputs() const override { return mParams.num_of_split; }

// Return the dimensions of each output tensor.
nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override
{
    assert(inputs[0].nbDims == 3);
    assert(mParams.inputChannels == inputs[0].d[0] &&
           mParams.inputHeight == inputs[0].d[1] &&
           mParams.inputWidth == inputs[0].d[2]);
    assert(mParams.axis == 1 || mParams.axis == 2 || mParams.axis == 3);
    UNUSED(index);
    UNUSED(nbDims);
    if (mParams.axis == 1) {
        // e.g. 128 channels / 4 splits -> each output is 32 x 60 x 92
        assert(mParams.inputChannels % mParams.num_of_split == 0);
        return nvinfer1::DimsCHW(mParams.inputChannels / mParams.num_of_split, mParams.inputHeight, mParams.inputWidth);
    } else if (mParams.axis == 2) {
        assert(mParams.inputHeight % mParams.num_of_split == 0);
        return nvinfer1::DimsCHW(mParams.inputChannels, mParams.inputHeight / mParams.num_of_split, mParams.inputWidth);
    } else if (mParams.axis == 3) {
        assert(mParams.inputWidth % mParams.num_of_split == 0);
        return nvinfer1::DimsCHW(mParams.inputChannels, mParams.inputHeight, mParams.inputWidth / mParams.num_of_split);
    } else {
        assert(false);
    }
    return nvinfer1::Dims{};  // unreachable; avoids falling off a non-void function
}
int enqueue(int batchSize, const void* const* inputs, void** outputs, void*, cudaStream_t stream) override
{
    UNUSED(stream);  // copies below are synchronous; the stream is not used
    int i, j;
    int split_channel;
    float* output;
    long copy_length, total_length;
    const float* input = reinterpret_cast<const float*>(inputs[0]);
    total_length = mParams.inputWidth * mParams.inputHeight * mParams.inputChannels;
    if (!mParams.isFp16) {
        if (mParams.axis == 1) {
            // Split along the channel axis: output i receives a contiguous
            // block of split_channel planes from each batch element.
            split_channel = mParams.inputChannels / mParams.num_of_split;
            copy_length = mParams.inputHeight * mParams.inputWidth * split_channel;
            for (j = 0; j < batchSize; j++) {
                for (i = 0; i < mParams.num_of_split; i++) {
                    output = reinterpret_cast<float*>(outputs[i]);
                    CHECK(cudaMemcpy((void*)(output + j * copy_length),
                                     (const void*)(input + i * split_channel * mParams.inputWidth * mParams.inputHeight + j * total_length),
                                     copy_length * sizeof(float),
                                     cudaMemcpyDeviceToDevice));
                }
            }
        } else {
            // TODO: axis 2 / 3
            assert(false);
        }
    } else {
        // TODO: FP16 path
        assert(false);
    }
    return 0;
}
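To make the expected result concrete, this is the host-side reference I compare the plugin outputs against (a minimal NumPy sketch; note that the plugin's axis=1 maps to axis 0 of the 3-D CHW array, since there is no explicit batch dimension):

import numpy as np

x = np.arange(128 * 60 * 92, dtype=np.float32).reshape(128, 60, 92)
expected = np.split(x, 4, axis=0)  # four chunks of 32 channels each
assert all(part.shape == (32, 60, 92) for part in expected)
# output[i] should equal expected[i], i.e. channels 32*i .. 32*(i+1)-1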
These four outputs are the inputs to the following nodes, but when I test, every output holds the same values as output[0], instead of the distinct output[0], output[1], output[2], output[3].
Please help me, thanks.