Linux distro and version: Ubuntu 18.04.1 LTS
GPU type: Tesla P4
NVIDIA driver version: 410.79
CUDA version: 10.0.130
cuDNN version: 7.4.2
Python version: 3.6
TensorRT version: 5.0.2.6
Describe the problem:
I use the second output of a TopK layer (the Int32 indices tensor) as an input to an elementwise layer.
Building the engine then fails with the following errors:
[TensorRT] ERROR: (Unnamed Layer* 1) [ElementWise]: elementwise inputs must not be Int32
[TensorRT] ERROR: Could not compute dimensions for (Unnamed ITensor* 3), because the network is not valid
I have tried to cast that output to float using set_output_type(), but the same error is still raised.
Can anyone help me solve this problem? Thanks.
(For now I can only work around it on the host; a sketch follows the repro code below.)
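One idea I have not been able to verify is inserting an identity layer between the TopK indices and the elementwise layer to perform the cast. A minimal sketch, assuming network.add_identity() is available in this release and that an identity layer may convert Int32 to FLOAT (I could not confirm either on 5.0.2.6):

# Sketch only: add_identity() availability and Int32 -> FLOAT conversion
# through an identity layer are assumptions, not verified on 5.0.2.6.
cast_node = network.add_identity(topk_node.get_output(1))
cast_node.set_output_type(0, trt.DataType.FLOAT)
add_node = network.add_elementwise(input1=cast_node.get_output(0),
                                   input2=cast_node.get_output(0),
                                   op=trt.ElementWiseOperation.SUM)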
Here is the full code to reproduce the error.
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
# Transfer input data to the GPU.
[cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
# Run inference.
context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
# Transfer predictions back from the GPU.
[cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
# Synchronize the stream
stream.synchronize()
# Return only the host outputs.
return [out.host for out in outputs]
# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
def __init__(self, host_mem, device_mem):
self.host = host_mem
self.device = device_mem
def __str__(self):
return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
def __repr__(self):
return self.__str__()
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
inputs = []
outputs = []
bindings = []
stream = cuda.Stream()
for binding in engine:
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
dtype = trt.nptype(engine.get_binding_dtype(binding))
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype)
device_mem = cuda.mem_alloc(host_mem.nbytes)
# Append the device buffer to device bindings.
bindings.append(int(device_mem))
# Append to the appropriate list.
if engine.binding_is_input(binding):
inputs.append(HostDeviceMem(host_mem, device_mem))
else:
outputs.append(HostDeviceMem(host_mem, device_mem))
return inputs, outputs, bindings, stream
def populate_network(network, shape):
    k = 3
    axes = 2  # Bit mask: 2 == 1 << 1 selects the second (length-3) axis.
    input_tensor = network.add_input(name='data', dtype=trt.float32, shape=shape)
    topk_node = network.add_topk(input=input_tensor, op=trt.TopKOperation.MIN, k=k, axes=axes)
    topk_node.precision = trt.DataType.FLOAT
    # Output 1 of TopK is the Int32 indices tensor; try to force it to FLOAT.
    topk_node.set_output_type(1, trt.DataType.FLOAT)
    # Feeding the indices into an elementwise SUM is what triggers the error.
    add_node = network.add_elementwise(input1=topk_node.get_output(1), input2=topk_node.get_output(1),
                                       op=trt.ElementWiseOperation.SUM)
    network.mark_output(add_node.get_output(0))
def build_engine(shape):
# For more information on TRT basics, refer to the introductory samples.
with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network:
builder.max_workspace_size = 1 << 30
        # Populate the network with the TopK and elementwise layers.
populate_network(network, shape)
# Build and return an engine.
return builder.build_cuda_engine(network)
def main():
x = [[0.3, 0.2, 0.4],
[0.1, 0.3, 0.2]]
data = np.array(x, np.float32)
with build_engine(data.shape) as engine:
inputs, outputs, bindings, stream = allocate_buffers(engine)
with engine.create_execution_context() as context:
np.copyto(inputs[0].host, data.ravel())
[output] = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
output_shape = engine.get_binding_shape(1)
output = np.reshape(output, output_shape)
print("Prediction: \n" + str(output))
if __name__ == '__main__':
main()
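For now, the workaround I am falling back to is to keep the addition out of the engine entirely: mark the Int32 indices tensor itself as the network output and do the SUM on the host with numpy. A sketch of the changed parts (populate_network_host_sum is a hypothetical variant of populate_network above; it assumes an Int32 tensor may be marked as a network output):

def populate_network_host_sum(network, shape):
    # Same TopK setup as above, but expose the Int32 indices directly
    # instead of feeding them into an elementwise layer.
    input_tensor = network.add_input(name='data', dtype=trt.float32, shape=shape)
    topk_node = network.add_topk(input=input_tensor, op=trt.TopKOperation.MIN, k=3, axes=2)
    network.mark_output(topk_node.get_output(1))

# After do_inference(), the output buffer holds the Int32 indices, and the
# elementwise SUM becomes plain host-side numpy:
# indices = np.reshape(output, engine.get_binding_shape(1)).astype(np.float32)
# result = indices + indices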