Triton inference server with SSD : interpreting responses

I am running a Triton server serving a SSD model plan generated by the TAO toolkit

Serving works fine, but I am trying to parse the results. Here is a sample request and response.

image_path = 'test_images/frame_002541.png'  # path of the image to read

## pre-process the image

# cv2.imread returns the image in HWC layout with BGR channel order
img = cv2.imread(image_path)
offsets  = np.array([103.939,116.779,123.68]) # BGR color offsets
infer_dims = [3, 736, 1280]  # model input dims, in [channels, height, width] order

# convert the image to float32 and subtract the per-channel offsets
img = img.astype(np.float32)
img -= offsets

# rescale the image to infer_dims; cv2.resize takes the target size as (width, height)
img = cv2.resize(img, (infer_dims[2], infer_dims[1]))

# show image shape (still HWC at this point)
print('Image shape: ', img.shape)

# convert the image from HWC to CHW format
img = img.transpose([2, 0, 1])

The Triton server is configured to accept CHW images of size 736 x 1280 (input dims [3, 736, 1280]).

Now we make a connection (I have the ports mapped, so please don't worry about the strange port numbers).

# Set up a connection with the Triton Inference Server.
triton_client = httpclient.InferenceServerClient(url="localhost:9100", verbose=True, concurrency=1, insecure=True)

# Get the model metadata (tensor names, datatypes and shapes).
# Each lookup has its own try/except so a failure is reported separately.
try:
    model_metadata = triton_client.get_model_metadata(model_name="my-model-v2", model_version="1")
except InferenceServerException as e:
    print("Failed to retrieve the metadata: {}".format(e))

# Get the model config and print it.
try:
    model_config = triton_client.get_model_config(model_name="my-model-v2", model_version="1")
    print("Model config:\n{}".format(model_config))
except InferenceServerException as e:
    print("Failed to retrieve the config: {}".format(e))

get metadata and config

# Summarize the tensors reported by the model metadata, with their counts.
print(f"Model inputs:{model_metadata['inputs']} | LEN {len(model_metadata['inputs'])}")
print(f"Model outputs:{model_metadata['outputs']} | LEN {len(model_metadata['outputs'])} \n")

# Same summary from the model config, which uses the keys 'input' / 'output'.
print(f"model config inputs: {model_config['input']} | LEN {len(model_config['input'])}")
print(f"model config outputs: {model_config['output']} | LEN {len(model_config['output'])} \n")


Model inputs:[{'name': 'Input', 'datatype': 'FP32', 'shape': [3, 736, 1280]}] | LEN 1
Model outputs:[{'name': 'NMS', 'datatype': 'FP32', 'shape': [1, 1, 10, 7]}, {'name': 'NMS_1', 'datatype': 'FP32', 'shape': [1, 1, 1, 1]}] | LEN 2 

model config inputs: [{'name': 'Input', 'data_type': 'TYPE_FP32', 'format': 'FORMAT_NONE', 'dims': [3, 736, 1280], 'reshape': {'shape': [1, 3, 736, 1280]}, 'is_shape_tensor': False, 'allow_ragged_batch': False, 'optional': False}] | LEN 1
model config outputs: [{'name': 'NMS', 'data_type': 'TYPE_FP32', 'dims': [1, 1, 10, 7], 'reshape': {'shape': [1, 1, 10, 7]}, 'label_filename': '', 'is_shape_tensor': False}, {'name': 'NMS_1', 'data_type': 'TYPE_FP32', 'dims': [1, 1, 1, 1], 'reshape': {'shape': [1, 1, 1, 1]}, 'label_filename': '', 'is_shape_tensor': False}] | LEN 2 

setting up input and output

# Describe the model's input tensor and attach the image data to it.
# NOTE(review): image_data is assumed to be the preprocessed float32 CHW
# array produced by the preprocessing step -- TODO confirm.
test_input = httpclient.InferInput("Input", image_data.shape, datatype="FP32")
# The request needs the tensor payload attached, not just the name and shape.
test_input.set_data_from_numpy(image_data)

# Request both output tensors reported by the model metadata.
output_0 = httpclient.InferRequestedOutput("NMS")
output_1 = httpclient.InferRequestedOutput("NMS_1")

making the inference request

    results = triton_client.infer(model_name="my-model-v2", 
                                  outputs=[output_0, output_1]


except InferenceServerException as e:
    print("inference failed: {}".format(e))

print response



{'id': '1', 'model_name': 'nozzlenet-v2', 'model_version': '1', 'outputs': [{'name': 'NMS', 'datatype': 'BYTES', 'shape': [10], 'parameters': {'binary_data_size': 144}}, {'name': 'NMS_1', 'datatype': 'BYTES', 'shape': [1], 'parameters': {'binary_data_size': 14}}]}

After this point I'm not sure how to decode this binary data into the right structure. (Or is there a way to get something like a JSON/dictionary response rather than binary to begin with, so I can see how the data is packaged and deal with the binary data later?)

I tried, in a crude way, to see how the items in the results are stored and then rearrange them, but the elements do not seem to have a fixed size.

I expected to see a 10x7 arrangement, where picking a row (one of 10) gives 7 sub-elements. (This could still be the case, but I may be missing something or lacking understanding.)

my crude method is described below: I tried working back from sizes by doing the steps below

# Fetch the "NMS" output tensor from the response as a numpy array.
output = results.as_numpy("NMS")
# print output data type
print(f"Output data type: {output.dtype.type}")
# NOTE: despite the label, this prints len(output) (size of the first axis), not output.shape
print(f"Output shape: {len(output)}")


Output data type: <class 'numpy.object_'>
Output shape: 10

Then I looked at the individual items to see if they are arranged in a consistent way:

# Report the in-memory size of every element of `output`.
# Elements are read from the array itself rather than from separately named
# output_N variables, so this works for any number of elements.
# sys.getsizeof is shallow and includes the Python object header, so each
# figure is larger than the element's payload alone.
for index, item in enumerate(output):
    print(f"Output {index} size: {sys.getsizeof(item)}")

# get the size of the output array object itself
output_size = sys.getsizeof(output)
print(f"Output size: {output_size}")

I got

Output 0 size: 43
Output 1 size: 43
Output 2 size: 44
Output 3 size: 43
Output 4 size: 44
Output 5 size: 43
Output 6 size: 44
Output 7 size: 44
Output 8 size: 43
Output 9 size: 43
Output size: 112

Can you help me make sense of my output? It should be easy, as I know that my NMS output is [1, 1, 10, 7] and the output list is 10 items long, so I reckon there should be 7 elements in each list item (but the 43- and 44-byte sizes throw me off).

Can you please help me or point me to the right documentation?


Suggest you to refer to Retinanet in GitHub - NVIDIA-AI-IOT/tao-toolkit-triton-apps: Sample app code for deploying TAO Toolkit trained models to Triton.
To dump response for Retinanet, we can add some code after

For example,

515         response = responses[processed_request]
            print("response.get_output('NMS') is {}".format(response.get_output('NMS')))
            print("response._result is {}".format(response._result))
            print("response.as_numpy for NMS is {}".format(response.as_numpy('NMS')))
            print("response.as_numpy for NMS_1 is {}".format(response.as_numpy('NMS_1')))
            print("response is {}".format(response.get_response()))
            print("response's output is {}".format(response.get_response()["outputs"]))

Thanks I will check that.

I easily got the basic functionality working by reading and playing with the TAO evaluation code for SSD.

The do_inference function is super straightforward and returns a list of 6 elements with class ids, confidence and bbox coordinates; the coordinates can be easily extracted from e.g. [class_id, confidence, upper_left_x, upper_left_y, lower_right_x, lower_right_y].

def do_inference(context, bindings, inputs,
                 outputs, stream, batch_size=1,
                 execute_v2=False, return_raw=False):
    """Generalization for multiple inputs/outputs.

    inputs and outputs are expected to be lists of HostDeviceMem objects,
    i.e. objects exposing a ``host`` buffer and a ``device`` buffer.

    Args:
        context: execution context providing ``execute_async`` and
            ``execute_async_v2``.
        bindings: device bindings, passed through to the execute call.
        inputs: input buffers copied host -> device before execution.
        outputs: output buffers copied device -> host after execution.
        stream: stream used for the copies and the execution; must expose
            ``handle`` and ``synchronize()``.
        batch_size: batch size given to ``execute_async``; not used when
            ``execute_v2`` is true.
        execute_v2: call ``execute_async_v2`` instead of ``execute_async``.
        return_raw: return the ``outputs`` list itself instead of only the
            host buffers.

    Returns:
        ``outputs`` unchanged when ``return_raw`` is true, otherwise a list
        with the ``host`` buffer of every output.
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference; exactly one of the two execute variants is called.
    if execute_v2:
        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    else:
        context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Synchronize the stream so the asynchronous copies have finished
    # before the host buffers are handed back.
    stream.synchronize()

    if return_raw:
        return outputs

    # Return only the host outputs.
    return [out.host for out in outputs]

I will check your link and update. (I think a message-broker serverless application will be more convenient in my case than the direct TensorRT method I'm using now, because I can easily version the models without having to re-create the function.)

Thanks a lot for getting back to me!