Python PaddingLayer works incorrectly

Ubuntu 18.04
rtx 6000
driver 410.73
CUDA 10
CUDNN 7.4
Python 3.6
TensorRT 5.0.2.6

I tried to test a simple engine containing only a padding layer (pre_padding=(1, 1), post_padding=(1, 1)), with a numpy.ones((1, 1, 128, 128)) array as input. The result had the correct shape, (1, 1, 130, 130), but the element values were wrong.

import numpy as np
import torch
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # creates the CUDA context

def build_padding_test_engine(network, shape=(128, 128)):
    # FP16 input in CHW layout (the batch dimension is implicit).
    input_x = network.add_input(name='input', dtype=trt.float16, shape=(1, shape[0], shape[1]))
    # Zero-pad one pixel on each side: (1, H, W) -> (1, H + 2, W + 2).
    pad = network.add_padding(
        input_x,
        pre_padding=(1, 1),
        post_padding=(1, 1)
    )

    network.mark_output(tensor=pad.get_output(0))

    print("PADDING", input_x.shape, pad.get_output(0).shape)

def infer(context, input_img, output_size, batch_size):
    input_img = input_img.numpy().astype(np.float16)
    output = np.empty(output_size, dtype=np.float16)

    # Allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.nbytes)
    d_output = cuda.mem_alloc(batch_size * output.nbytes)

    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()

    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute model
    context.execute_async(batch_size, bindings, stream.handle)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)

    stream.synchronize()

    # Return predictions
    return output


def test_engine(runtime, fn, input):
    with open(fn, "rb") as f:
        engine = runtime.deserialize_cuda_engine(f.read())

    context = engine.create_execution_context()
    
    output_size = 0
    for binding in engine:
        if engine.binding_is_input(binding):
            continue
        output_size = trt.volume(engine.get_binding_shape(binding))

    output = infer(context, input.cpu(), output_size, 1)

    return output[:output_size]
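
For reference, 'trt_padding.engine' comes from the standard builder flow. Below is a minimal sketch of that step (assuming the TensorRT 5.x implicit-batch builder API with fp16_mode enabled; not necessarily the exact code I ran):

def build_engine(fn='trt_padding.engine'):
    logger = trt.Logger(trt.Logger.WARNING)
    with trt.Builder(logger) as builder, builder.create_network() as network:
        builder.max_batch_size = 1
        builder.max_workspace_size = 1 << 20
        builder.fp16_mode = True  # allow FP16 kernels
        build_padding_test_engine(network)
        # Build and serialize the engine to disk.
        with builder.build_cuda_engine(network) as engine:
            with open(fn, 'wb') as f:
                f.write(engine.serialize())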


TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(TRT_LOGGER)

# testing padding layer
input = torch.ones(1, 1, 128, 128).half()
output = test_engine(runtime, 'trt_padding.engine', input)

result = torch.Tensor(np.reshape(output, (1, 1, 130, 130)))

print(result)

The result should be ones in the central 128x128 region with zeros on the one-pixel border, but I got the following:

tensor([[[[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 1.8750, 0.0000, 1.8750],
          ...,
          [0.0000, 1.8750, 0.0000,  ..., 1.8750, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 1.8750, 0.0000, 1.8750],
          [0.0000, 1.8750, 0.0000,  ..., 1.8750, 0.0000, 0.0000]]]])

Any suggestions?

Regards,

I've figured out the reason: the data type of the IPaddingLayer output is FP32, even though the input is definitely FP16. My infer() reads the output buffer as FP16, so each FP32 1.0 (bytes 00 00 80 3F, little-endian) is reinterpreted as the two halves 0x0000 = 0.0 and 0x3F80 = 1.875, which is exactly the pattern above.
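
The 0 / 1.875 pattern is precisely what an FP32 buffer of ones looks like when reinterpreted as FP16 on a little-endian machine; a quick numpy check (illustration only):

import numpy as np

# Each 1.0f is the bytes 00 00 80 3F; viewed as FP16 these split into
# 0x0000 -> 0.0 and 0x3F80 -> 1.875.
print(np.ones(4, dtype=np.float32).view(np.float16))
# [0.    1.875 0.    1.875 0.    1.875 0.    1.875]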

According to the TensorRT layer precision support matrix,
https://docs.nvidia.com/deeplearning/sdk/tensorrt-support-matrix/index.html#layers-precision-matrix

the Padding layer should support FP16. Any solutions?
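
As a workaround until that's fixed, the host buffer can be allocated with the dtype the engine actually reports for the output binding instead of hard-coding np.float16. A sketch, assuming the TensorRT 5.x Python API:

# Query the output binding's dtype (DataType.FLOAT here) and size,
# then allocate the host buffer accordingly.
for binding in engine:
    if not engine.binding_is_input(binding):
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))
        output_size = trt.volume(engine.get_binding_shape(binding))

output = np.empty(output_size, dtype=np_dtype)
# ... transfer, execute, and copy back as in infer(), then cast if needed:
result = output.astype(np.float16)

This gives correct values, but it still doesn't explain why the padding layer runs in FP32 when fp16_mode is enabled.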