Hello,
I’m trying to implement a network with the C++ API. In a particular section of the network, I need to link a TopK layer (specifically its index output) to a custom plugin that I implemented;
the engine fails to build with the error that follows
./builder/cudnnBuilder2.cpp:1743: virtual std::vector<nvinfer1::query::RequirementsCombination> nvinfer1::builder::EngineTacticSupply::getSupportedFormats(const nvinfer1::builder::Node&): Assertion `!formats.empty()' failed.
To me this seems to be caused by the fact that the index output (second output) of TopK comes as int32. My plugin is written to handle int32 and expects its input to be of that type. Is there any reason why a custom IPluginExt should not work with an int32 input?
I use TensorRT 6.0.1.5, with cuda 10.1.
I also attach the implementation of my plugin
// Host-side launcher for the int32 -> float cast kernel (presumably defined in a .cu
// translation unit alongside castInt32ToFloatKernel — confirm).
// d_data_in / d_data_out are device pointers; num_elements is the total element count
// to convert; the work is enqueued asynchronously on `stream`.
void cudaCastInt32ToFloat_device(const int32_t *d_data_in, float *d_data_out, const int num_elements, cudaStream_t stream);
// Construct the plugin from its fixed output dimensions (channels, height, width).
// These are also the assumed input dimensions; see getOutputDimensions().
CastInt32ToFloatLayer::CastInt32ToFloatLayer(int c, int h, int w)
    : numChannels(c)
    , height(h)
    , width(w)
{
}
// Deserializing constructor: restores the layer from the buffer written by serialize().
// Wire layout: three consecutive ints — numChannels, height, width.
// NOTE(review): mDataType is not part of the serialized state, so a deserialized
// plugin relies on configureWithFormat() being called again — confirm this holds
// for your engine-loading path.
CastInt32ToFloatLayer::CastInt32ToFloatLayer(const void* buf, size_t size)
{
    assert(size == 3 * sizeof(int));
    // Redundant `static_cast<const int>` value casts removed: casting a prvalue
    // to a const-qualified scalar type has no effect.
    const int* fields = static_cast<const int*>(buf);
    numChannels = fields[0];
    height = fields[1];
    width = fields[2];
}
// Create a functionally identical copy of this plugin (TensorRT duplicates
// plugins while building the engine).
// Fix: the original only forwarded the dimensions, silently dropping mDataType
// (assigned in configureWithFormat); a clone made after configuration therefore
// forgot the negotiated type. Copy it over explicitly.
IPluginExt* CastInt32ToFloatLayer::clone()
{
    auto* copy = new CastInt32ToFloatLayer(numChannels, height, width);
    copy->mDataType = mDataType;
    return copy;
}
// Tell the builder which (type, format) combinations this plugin can execute.
// Fix: the original ignored `type` entirely, claiming support for kHALF/kINT8
// which the cast kernel cannot handle. Restrict to the types actually usable.
// NOTE(review): in TensorRT 6 the IPluginExt interface is never offered kINT32
// I/O by the builder, which is the likely cause of the empty-formats assertion
// when wired to TopK's index output; migrating to IPluginV2Ext and overriding
// getOutputDataType() is the documented way to consume INT32 tensors — confirm
// against the TensorRT 6 plugin documentation.
bool CastInt32ToFloatLayer::supportsFormat(DataType type, PluginFormat format) const
{
    return format == PluginFormat::kNCHW
        && (type == DataType::kINT32 || type == DataType::kFLOAT);
}
// Called by the builder once it has selected a (type, format) combination.
// Only the data type is recorded; the dimension arguments are ignored because
// this layer's shape is fixed at construction time (numChannels x height x width).
// NOTE(review): mDataType is not written by serialize(), so deserialized engines
// depend on this method being invoked again — verify for your load path.
void CastInt32ToFloatLayer::configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize)
{
assert(format == PluginFormat::kNCHW);  // the only format advertised by supportsFormat()
mDataType = type;
}
// Report the output shape for output `index`, given the input shapes.
// The layer has a single input and a single output, and the output shape is
// the CHW configured at construction (the cast preserves element count).
// Robustness: also assert index == 0 — the original ignored `index`, so a
// mis-wired multi-output query would have silently returned the same dims.
Dims CastInt32ToFloatLayer::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)
{
    assert(index == 0);          // single-output plugin
    assert(nbInputDims == 1);    // single-input plugin
    assert(inputs[0].nbDims == 3);
    return DimsCHW(numChannels, height, width);
}
// Launch the int32 -> float cast over the whole batch on `stream`.
// Returns 0 (success); the kernel launch itself is asynchronous, so launch
// errors surface later via the CUDA runtime, not here.
// Fix: replaced C-style casts with static_cast; the input cast also dropped
// const for no reason — the launcher takes `const int32_t*`, so keep it const.
int CastInt32ToFloatLayer::enqueue(int batch_size, const void* const* inputs, void** outputs, void*, cudaStream_t stream)
{
    const int32_t* data_in = static_cast<const int32_t*>(inputs[0]);
    float* data_out = static_cast<float*>(outputs[0]);
    cudaCastInt32ToFloat_device(data_in, data_out, batch_size * numChannels * height * width, stream);
    return 0;
}
// Number of bytes serialize() will write: three ints (numChannels, height, width).
// Must stay in sync with serialize() and the deserializing constructor.
size_t CastInt32ToFloatLayer::getSerializationSize()
{
    const size_t fieldCount = 3;  // numChannels, height, width
    return fieldCount * sizeof(int);
}
// Write the plugin state into `buffer` as three consecutive ints:
// numChannels, height, width. The buffer is guaranteed by TensorRT to be at
// least getSerializationSize() bytes.
// NOTE(review): mDataType (set in configureWithFormat) is intentionally not
// written here — confirm that is the desired behavior.
void CastInt32ToFloatLayer::serialize(void* buffer)
{
    int* cursor = static_cast<int*>(buffer);
    *cursor++ = numChannels;
    *cursor++ = height;
    *cursor++ = width;
}
The enqueue method ends up calling the following CUDA kernel:
// Element-wise cast: data_out[i] = static_cast<float>(data_in[i]), one thread
// per element.
// NOTE(review): there is no bounds guard — every launched thread reads and
// writes, so the launch configuration must satisfy
// gridDim.x * blockDim.x == num_elements exactly, or the kernel accesses out
// of bounds. Since the launcher takes num_elements, consider passing it through
// and guarding with `if (index < num_elements)` — confirm against the launch
// code in the .cu file.
__global__ void castInt32ToFloatKernel(const int32_t *data_in, float *data_out)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
data_out[index] = static_cast<float>( data_in[index] ) ;
}
Is there a way to work around this?
Thanks in advance,
f