IResizeLayer gives unexpected results

Description

Hi, I am trying to use IResizeLayer to perform an interpolate (resize) op.
In NEAREST mode, the layer gives me some unexpected results.

The input is [1, 0, 1, 0, …] with shape [1, 1, 1, 33], and the output shape is [1, 1, 1, 66] (double the width).
I expect the result to be [1, 1, 0, 0, 1, 1, 0, 0, …], but the actual result is [1, 1, 1, 0, 0, 1, 1, …]. There is an extra 1 at the beginning.
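
For comparison, a plain PyTorch nearest-neighbor upsample of the same pattern gives the result I expect (this is only a sanity check, not part of the TensorRT repro):

import torch
import torch.nn.functional as F

x = torch.zeros(1, 1, 1, 33)
x[..., ::2] = 1                                   # input pattern [1, 0, 1, 0, ...]
y = F.interpolate(x, scale_factor=(1, 2), mode='nearest')
print(y.view(-1)[:8])                             # tensor([1., 1., 0., 0., 1., 1., 0., 0.])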

Environment

TensorRT Version: 7.1.3.4
GPU Type: 2070 Super
Nvidia Driver Version: 450.80.02
CUDA Version: 10.2
CUDNN Version: 8.0.4
Operating System + Version: Ubuntu 18.04
Python Version (if applicable): 3.7
TensorFlow Version (if applicable):
PyTorch Version (if applicable): 1.7.0
Baremetal or Container (if container which image + tag):

Steps To Reproduce

import tensorrt as trt
import torch
import numpy as np


def main():

    input_size = [1, 1, 1, 33]

    print("create trt model")
    log_level = trt.Logger.ERROR
    logger = trt.Logger(log_level)
    builder = trt.Builder(logger)

    ## build network
    EXPLICIT_BATCH = 1 << (int)(
        trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(EXPLICIT_BATCH)
    input_name = 'input'
    output_name = 'output'
    input_trt = network.add_input(name=input_name,
                                  shape=input_size,
                                  dtype=trt.float32)

    layer = network.add_resize(input_trt)
    layer.shape = tuple(input_size[:3] + [input_size[3] * 2])
    layer.resize_mode = trt.ResizeMode.NEAREST

    output = layer.get_output(0)
    output.name = output_name
    network.mark_output(output)

    ## builder config
    max_workspace_size = 1 << 30
    fp16_mode = False

    builder.max_workspace_size = max_workspace_size
    builder.fp16_mode = fp16_mode

    config = builder.create_builder_config()
    config.max_workspace_size = max_workspace_size
    profile = builder.create_optimization_profile()

    # set shape
    input_shape = input_size
    profile.set_shape(input_name, input_shape, input_shape, input_shape)
    config.add_optimization_profile(profile)
    if fp16_mode:
        config.set_flag(trt.BuilderFlag.FP16)

    # build engine
    engine = builder.build_engine(network, config)
    context = engine.create_execution_context()

    print("inference")
    input_torch = torch.zeros(input_size, dtype=torch.float32).cuda().contiguous()
    input_torch[:,:,:,::2] = 1

    bindings = [None] * 2

    # set input
    idx = engine.get_binding_index(input_name)
    context.set_binding_shape(idx, tuple(input_torch.shape))
    bindings[idx] = input_torch.data_ptr()

    # set output
    idx = engine.get_binding_index(output_name)
    shape = tuple(context.get_binding_shape(idx))
    output_torch = torch.empty(shape, dtype=torch.float32).cuda()
    bindings[idx] = output_torch.data_ptr()

    context.execute_async_v2(bindings, torch.cuda.current_stream().cuda_stream)
    # make sure the enqueued inference has finished before reading the output
    torch.cuda.current_stream().synchronize()

    print("input:")
    print(input_torch.view(-1)[:20])

    print("output:")
    print(output_torch.view(-1)[:20])


if __name__ == "__main__":
    main()

Only tensors whose width is 33, 37, 41, 47, 55 (or an integer multiple of these) produce this result.
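
As a reference point, the standard nearest-neighbor index mapping I would expect, src = floor(dst * in_w / out_w), produces the [1, 1, 0, 0, …] pattern for every one of these widths, so the widths themselves don't look special under that formula (this is only my assumption of the usual mapping, not necessarily what TensorRT computes internally):

# Reference mapping sketch; nearest_upsample_2x is my own helper, not a TensorRT API.
def nearest_upsample_2x(values):
    in_w = len(values)
    out_w = in_w * 2
    return [values[int(dst * in_w / out_w)] for dst in range(out_w)]

for w in (33, 37, 41, 47, 55):
    src = [1, 0] * (w // 2) + [1] * (w % 2)   # [1, 0, 1, 0, ...] of width w
    print(w, nearest_upsample_2x(src)[:8])    # always [1, 1, 0, 0, 1, 1, 0, 0]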

Why does this happen, and how can I fix it?
Thanks.