[TensorRT] ERROR: [ElementWise]: elementwise inputs must not be Int32

Linux distro and version : Ubuntu 18.04.1 LTS
GPU type : Tesla P4
nvidia driver version : 410.79
CUDA version : 10.0.130
CUDNN version : 7.4.2
Python version : 3.6
TensorRT version :

Describe the problem:

I use the second output of the TopK layer as the input to an elementwise layer.
Building the engine then fails with the following errors.

[TensorRT] ERROR: (Unnamed Layer* 1) [ElementWise]: elementwise inputs must not be Int32
[TensorRT] ERROR: Could not compute dimensions for (Unnamed ITensor* 3), because the network is not valid

I have tried to cast the output to float using set_output_type(),
but the same error is still reported.

Can anyone help me solve this problem? Thanks.

Here is the code.

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit

import numpy as np

# Module-wide TensorRT logger created at WARNING severity; build_engine() passes it to trt.Builder.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run one asynchronous inference pass and return the host-side outputs.

    Args:
        context: Execution context; its ``execute_async`` is invoked with the
            batch size, the bindings and the stream handle.
        bindings: Passed through unchanged to ``context.execute_async``.
        inputs: Iterable of objects exposing ``host`` and ``device`` buffers
            that are copied host -> device before execution.
        outputs: Iterable of objects exposing ``host`` and ``device`` buffers
            that are copied device -> host after execution.
        stream: Stream object exposing ``handle`` and ``synchronize()``; all
            copies and the execution are enqueued on it.
        batch_size: Batch size forwarded to ``execute_async`` (default 1).

    Returns:
        A list with the ``host`` buffer of every entry in ``outputs``.
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Synchronize the stream: everything above was only enqueued, so the host
    # buffers must not be handed to the caller until the work has completed.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    """Pairs a host buffer with its matching device buffer."""

    def __init__(self, host_mem, device_mem):
        # Stored as-is; this class performs no allocation or copying itself.
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        pieces = ["Host:\n", str(self.host), "\nDevice:\n", str(self.device)]
        return "".join(pieces)

    def __repr__(self):
        # repr() intentionally mirrors the human-readable form.
        rendered = self.__str__()
        return rendered

# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
    """Allocate a host/device buffer pair for every binding of ``engine``.

    For each binding the element count is the volume of its shape multiplied
    by ``engine.max_batch_size``; a page-locked host array of that size and a
    device allocation of the same byte size are created.

    Args:
        engine: Iterable of bindings that also provides ``get_binding_shape``,
            ``get_binding_dtype``, ``binding_is_input`` and ``max_batch_size``.

    Returns:
        A tuple ``(inputs, outputs, bindings, stream)`` where ``inputs`` and
        ``outputs`` are lists of ``HostDeviceMem``, ``bindings`` holds the
        integer device address of every binding in iteration order, and
        ``stream`` is a newly created ``cuda.Stream``.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings (inputs AND outputs:
        # execution needs one address per binding).
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

# Adds the repro layers to `network`: a float32 input named 'data' with the given
# shape, a TopK layer (MIN, k=3, axes=2) over it, and an elementwise layer fed
# twice with TopK output index 1.
def populate_network(network, shape):
    k = 3
    axes = 2

    input_tensor = network.add_input(name='data', dtype=trt.float32, shape=shape)
    topk_node = network.add_topk(input=input_tensor, op=trt.TopKOperation.MIN, k=k, axes=axes)
    # Attempted workaround from the post: request FLOAT precision and a FLOAT type
    # for output 1. The thread reports the Int32 error is still raised.
    topk_node.precision = trt.DataType.FLOAT
    topk_node.set_output_type(1, trt.DataType.FLOAT)

    # NOTE(review): the call below is cut off in this paste (remaining arguments and
    # the closing parenthesis are missing) -- recover the original before running.
    add_node = network.add_elementwise(input1=topk_node.get_output(1), input2=topk_node.get_output(1),


def build_engine(shape):
    """Build an engine for the network created by ``populate_network``.

    A builder (using ``TRT_LOGGER``) and a network are created as nested
    context managers, the builder workspace limit is set to ``1 << 30``, the
    network is populated for ``shape`` and the result of
    ``builder.build_cuda_engine`` is returned.
    """
    # See the introductory samples for TensorRT basics.
    workspace_limit = 1 << 30
    with trt.Builder(TRT_LOGGER) as builder:
        with builder.create_network() as network:
            builder.max_workspace_size = workspace_limit
            # Fill the empty network with the layers under test.
            populate_network(network, shape)
            # Hand the finished engine back to the caller.
            engine = builder.build_cuda_engine(network)
            return engine

def main():
    """Build the engine for a fixed 2x3 float32 sample, run it once and print the result."""
    sample_rows = [[0.3, 0.2, 0.4],
                   [0.1, 0.3, 0.2]]
    data = np.array(sample_rows, np.float32)

    engine_cm = build_engine(data.shape)
    with engine_cm as engine:
        inputs, outputs, bindings, stream = allocate_buffers(engine)
        context_cm = engine.create_execution_context()
        with context_cm as context:
            # Stage the flattened sample in the first input's host buffer.
            np.copyto(inputs[0].host, data.ravel())
            results = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
            # Exactly one output is expected; unpacking fails otherwise.
            [output] = results
            # Binding index 1 supplies the shape used to un-flatten the result.
            output = np.reshape(output, engine.get_binding_shape(1))
            print("Prediction: \n" + str(output))

if __name__ == '__main__':
    # Run the repro only when executed as a script, not when imported.
    main()


We are triaging this issue and will keep you updated.


Unfortunately, TRT currently doesn’t support int32 input tensors for elementwise operations. We are working on adding this feature in a future release.

As a workaround, you can add a plugin layer in between topK layer and elementwise layer to convert int32 to float data type.

Can’t set_output_type() be used to do this conversion?

Per engineering:

Currently, set_output_type(index, tensorrt.DataType) can’t force a change of the data type (Int32) of the TopK layer’s second output tensor. The second output tensor of the TopK layer is used for indexing.

I’ve tried your suggestion of writing a plugin for the conversion, but TensorRT still treats the tensor as type Int32.

Code of plugin:

#include "TypeConversionPlugin.hpp"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstring>
#include <iostream>

	const char* TYPECONVERSION_PLUGIN_LAYER{ "TypeConversion" };
} // namespace

// Helper function for serializing plugin.
// Copies the object representation of `val` to the position `buffer` points at
// and advances `buffer` by sizeof(T). The caller must guarantee at least
// sizeof(T) writable bytes. memcpy is used instead of a reinterpret_cast store
// so the write is valid at any (possibly unaligned) offset in the byte stream.
template <typename T>
void writeToBuffer(char*& buffer, const T& val)
{
    std::memcpy(buffer, &val, sizeof(T));
    buffer += sizeof(T);
}
// Helper function for deserializing plugin.
// Reads one T from the position `buffer` points at, advances `buffer` by
// sizeof(T) and returns the value. The caller must guarantee at least sizeof(T)
// readable bytes. memcpy is used instead of a reinterpret_cast load so the read
// is valid at any (possibly unaligned) offset in the byte stream.
template <typename T>
T readFromBuffer(const char*& buffer)
{
    T val;
    std::memcpy(&val, buffer, sizeof(T));
    buffer += sizeof(T);
    return val;
}
// Definitions of the creator's static members: m_PFC is the field collection
// handed out by getFieldNames(), and m_pluginAttributes is the vector whose
// storage m_PFC.fields is pointed at.
PluginFieldCollection TypeConversionPluginCreator::m_PFC{};
std::vector<PluginField> TypeConversionPluginCreator::m_pluginAttributes;

// Constructor taking a serialized buffer.
// NOTE(review): no body is visible in this paste, so nothing is shown being read from serial_buf.
TypeConversionPlugin::TypeConversionPlugin(const void* serial_buf)

// Constructor that copies m_dims from an existing plugin; clone() calls it with `this`.
// NOTE(review): enclosing braces are missing in this paste.
TypeConversionPlugin::TypeConversionPlugin(const TypeConversionPlugin *plugin)
	m_dims = plugin->m_dims;

// Constructor taking a PluginFieldCollection; scans the fields for one named "dims"
// and views its data as an int array.
// NOTE(review): braces and the body of the innermost loop are missing in this paste,
// so how the values reach m_dims is not visible. The inner loop index also shadows the
// outer `i` and compares size_t against the int `length`.
TypeConversionPlugin::TypeConversionPlugin(const PluginFieldCollection *fc)
	auto fields  = fc->fields;

		for (int i=0; i < fc->nbFields; i++)
		const void *tmpData = fields[i].data;
		int length = fields[i].length;
		if (!strcmp(fields[i].name, "dims"))
			const int* m_channels = reinterpret_cast<const int*>(tmpData);
			for (size_t i = 0; i < length; i++)

// Plugin type accessor.
// NOTE(review): body missing in this paste -- presumably returns TYPECONVERSION_PLUGIN_LAYER; confirm.
const char* TypeConversionPlugin::getPluginType() const

// Plugin version accessor.
// NOTE(review): body missing in this paste; the version string is not visible.
const char* TypeConversionPlugin::getPluginVersion() const

// The plugin reports exactly one output tensor.
int TypeConversionPlugin::getNbOutputs() const
	return 1;

// Builds the output dimensions from m_dims: Dims2 when it has two entries, Dims3 when
// it has three. The `inputs` argument is not consulted.
// NOTE(review): no return is visible for any other m_dims size, and braces are missing
// in this paste.
nvinfer1::Dims TypeConversionPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims)
	if(m_dims.size() == 2)
	return nvinfer1::Dims2(m_dims[0], m_dims[1]);
	if(m_dims.size() == 3)
	return nvinfer1::Dims3(m_dims[0], m_dims[1], m_dims[2]);

// No set-up work is performed; always returns 0.
int TypeConversionPlugin::initialize()
	return 0;

// NOTE(review): the bodies of destroy() and terminate() are missing in this paste.
void TypeConversionPlugin::destroy()
void TypeConversionPlugin::terminate()

// Returns a value-initialized size_t, i.e. 0: no workspace is requested.
size_t TypeConversionPlugin::getWorkspaceSize(int maxBatchSize ) const
	return size_t();

// Returns a value-initialized size_t, i.e. 0 bytes of serialized state.
// NOTE(review): m_dims is therefore not accounted for in the serialized size -- confirm intended.
size_t TypeConversionPlugin::getSerializationSize() const
	return size_t();

// NOTE(review): body missing in this paste; nothing is shown being written to buffer.
void TypeConversionPlugin::serialize(void* buffer ) const

// Accepts only the combination of kINT32 data and kNCHW format.
bool TypeConversionPlugin::supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const

 return (type == DataType::kINT32 && format == PluginFormat::kNCHW);

// Only prints which data type the plugin was configured with; no state is stored.
// NOTE(review): braces, any `break` statements and the `default:` label are missing
// in this paste, so the exact switch structure cannot be confirmed.
void TypeConversionPlugin::configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims, int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize)
	switch (type)
	case DataType::kFLOAT:
		std::cout <<"kFLOAT data" << std::endl;
	case DataType::kINT32:
		std::cout <<"kINT32 data" << std::endl;
		std::cout <<"default data" << std::endl;

// Stores the given pointer in m_pluginNamespace (the pointer itself is assigned).
void TypeConversionPlugin::setPluginNamespace(const char* pluginNamespace)
	m_pluginNamespace = pluginNamespace;

// Returns the value previously stored by setPluginNamespace().
const char* TypeConversionPlugin::getPluginNamespace() const
	return m_pluginNamespace;

// Always reports kFLOAT as the output type, regardless of index or input types.
nvinfer1::DataType TypeConversionPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs ) const
	return nvinfer1::DataType::kFLOAT;

// Returns a new heap-allocated plugin constructed from `this`; that constructor copies m_dims.
nvinfer1::IPluginV2* TypeConversionPlugin::clone() const
	TypeConversionPlugin* plg = new TypeConversionPlugin(this);
	return plg;

// Rejects every format combination: unconditionally returns false.
// NOTE(review): confirm whether the builder calls this for the plugin interface in use.
bool TypeConversionPlugin::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, int nbOutputs ) const
	return false;
#pragma region TypeConversionPluginCreator

// NOTE(review): the signature line of this block is missing in this paste; it appears
// to be the creator's constructor. It registers one kINT32 field named "dims", points
// m_PFC at the attribute vector and resets the namespace to "".
{	m_pluginAttributes.emplace_back(PluginField("dims", nullptr, PluginFieldType::kINT32, 1));
    m_PFC.nbFields = m_pluginAttributes.size();
    m_PFC.fields = m_pluginAttributes.data();

	m_Namespace = "";

// NOTE(review): the bodies of getPluginName() and getPluginVersion() are missing in this paste.
const char* TypeConversionPluginCreator::getPluginName() const 
const char* TypeConversionPluginCreator::getPluginVersion() const 

// Returns the address of the static field collection m_PFC.
const nvinfer1::PluginFieldCollection* TypeConversionPluginCreator::getFieldNames()
	return &m_PFC;

// Creates a new heap-allocated plugin from the field collection; `name` is not used.
nvinfer1::IPluginV2* TypeConversionPluginCreator::createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc)
	TypeConversionPlugin* plg = new TypeConversionPlugin(fc);
	return plg;

// Creates a new heap-allocated plugin from serialized data; `name` and `serialLength` are not used.
nvinfer1::IPluginV2* TypeConversionPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)
	return new TypeConversionPlugin(serialData);

// Stores the given namespace in m_Namespace.
void TypeConversionPluginCreator::setPluginNamespace(const char* pluginNamespace)
	m_Namespace = pluginNamespace;

// Returns m_Namespace.
const char* TypeConversionPluginCreator::getPluginNamespace() const
	return m_Namespace;
#pragma endregion

// CUDA kernel: converts the first n int32 values of `in` to float and stores
// them in `out`. Each thread handles the element at its global index; threads
// whose index is >= n do nothing.
__global__ void convertVals(int * in, float *out, int n)
{
	int index = blockIdx.x * blockDim.x + threadIdx.x;
	if (index < n)
	{
		// Numeric conversion (3 -> 3.0f). __int_as_float() would instead
		// reinterpret the integer's bit pattern as a float, turning small
		// indices into denormal garbage.
		out[index] = static_cast<float>(in[index]);
	}
}

int TypeConversionPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream)
	const void* inputData = inputs[0];
	const void* outputData = outputs[0];
	int n = 1;
	for (size_t i = 0; i < m_dims.size(); i++)
		n *= m_dims[i];

	return 0;

Logger output:

Verbose: Deserialize required 1913954 microseconds.
Error: Output tensor wh of type Float produced from output of incompatible type Int32
Error: Could not compute dimensions for wh, because the network is not valid

What is wrong?

Did you solve this problem?

Write a custom TopK plugin.