PeopleNet: coverage output is always zero

Hi. I’m trying to run inference with the pruned PeopleNet model using TensorRT, but I always get a zero coverage output. So I downloaded the PeopleNet .etlt model using the command from this topic: How to run tlt-converter

After that I converted the .etlt file to a TensorRT engine with the following command:

./tlt-converter /home/bronstein/tlt-experiments/resnet34_peoplenet_pruned.etlt -k tlt_encode -o output_cov/Sigmoid,output_bbox/BiasAdd -d 3,544,960 -i nchw -e /home/bronstein/tlt-experiments/engine/peoplenet.engine -m 1 -t fp16
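
For reference, here is a small sanity-check sketch that lists the engine bindings (index, name, shape, and whether each is an input). It assumes the engine path from the command above and uses the same pre-8.5 TensorRT binding API as my inference code below:

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.INFO)

# Deserialize the engine and print every binding so the order of the two
# output tensors (output_cov/Sigmoid, output_bbox/BiasAdd) can be confirmed.
with open('/home/bronstein/tlt-experiments/engine/peoplenet.engine', 'rb') as f:
    engine = trt.Runtime(TRT_LOGGER).deserialize_cuda_engine(f.read())

for i in range(engine.num_bindings):
    print(i, engine.get_binding_name(i),
          engine.get_binding_shape(i),
          engine.binding_is_input(i))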

Then I tried to run inference with this model on an image from the PeopleNet main page (input_11ft45deg_000070.jpg in the code below).

Here is the code I used:

import numpy as np
import cv2
import time

import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda


TRT_LOGGER = trt.Logger(trt.Logger.INFO)
runtime = trt.Runtime(TRT_LOGGER)

host_inputs  = []
cuda_inputs  = []
host_outputs = []
cuda_outputs = []
bindings = []


def Inference(engine):

    # Load the test image and preprocess it: resize to the network input
    # resolution (960x544), convert HWC -> CHW and scale pixels to [0, 1].
    im = cv2.imread('input_11ft45deg_000070.jpg')
    # im = cv2.resize(im, (640, 640))
    im = cv2.resize(im, (960, 544))
    im = np.asarray(im).astype(np.float32)
    im = im.transpose(2, 0, 1) / 255
    print(np.shape(host_inputs[0]), np.shape(im))
    # Flatten the image into the page-locked input buffer.
    np.copyto(host_inputs[0], im.ravel())
    stream = cuda.Stream()
    context = engine.create_execution_context()

    start_time = time.time()
    # Copy the input to the device, run inference and copy the first
    # output binding back to the host.
    cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
    context.execute_async(bindings=bindings, stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
    stream.synchronize()
    print("execute time: " + str(time.time() - start_time))

    # Print what I expect to be the coverage output (output_cov/Sigmoid).
    print(host_outputs[1], np.max(host_outputs[1]))


def PrepareEngine():
    # deserialize engine
    with open('peoplenet.engine', 'rb') as f:
        buf = f.read()
    engine = runtime.deserialize_cuda_engine(buf)

    # Allocate a page-locked host buffer and a device buffer for every binding.
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        host_mem = cuda.pagelocked_empty(shape=[size], dtype=np.float32)
        cuda_mem = cuda.mem_alloc(host_mem.nbytes)

        bindings.append(int(cuda_mem))

        if engine.binding_is_input(binding):
            print(engine.get_binding_shape(binding))
            host_inputs.append(host_mem)
            cuda_inputs.append(cuda_mem)
        else:
            print(engine.get_binding_shape(binding))
            host_outputs.append(host_mem)
            cuda_outputs.append(cuda_mem)

    return engine


if __name__ == "__main__":

    engine = PrepareEngine()
    Inference(engine)

And np.max(host_outputs[1]) always gives me 0.0. What am I doing wrong?
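
For completeness, here is how I would interpret the two flat output buffers, assuming host_outputs[1] is output_cov/Sigmoid and host_outputs[0] is output_bbox/BiasAdd, and assuming the usual DetectNet_v2 layout (3 classes, 16x downsampling, so a 34x60 grid for the 544x960 input). This is only a sketch of what I expect, not verified output:

import numpy as np

# Hypothetical post-processing sketch, meant to run right after
# Inference(engine) in the script above. It reshapes the flat buffers into
# (channels, grid_h, grid_w) and prints the per-class peak coverage.
def summarize_outputs(cov_flat, bbox_flat, classes=3, grid_h=34, grid_w=60):
    cov = np.asarray(cov_flat).reshape(classes, grid_h, grid_w)
    bbox = np.asarray(bbox_flat).reshape(classes * 4, grid_h, grid_w)
    for cls in range(classes):
        print('class', cls, 'max coverage:', float(cov[cls].max()))
    return cov, bbox

# summarize_outputs(host_outputs[1], host_outputs[0])  # assumed order: cov, bbox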