import tensorrt as trt
import cv2
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
import argparse

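##
## The TensorRT engine loaded by this script (peoplenet.engine) can be
## generated from the pruned PeopleNet .etlt model with tlt-converter:
##
##   tlt-converter -k tlt_encode -o output_bbox/BiasAdd,output_cov/Sigmoid \
##     -d 3,554,960 -c resnet34_peoplenet_pruned.etlt \
##     -e peoplenet.engine
##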

class TrtDetectNet():
    """Thin wrapper that runs a serialized TensorRT engine on single images.

    The wrapper deserializes the engine file, allocates one page-locked host
    buffer and one device buffer per engine binding, and exposes
    ``runInference`` to preprocess an image, execute the engine and decode
    the first output buffer into boxes, confidences and class ids.

    Note: ``input_shape`` is stored on the instance but preprocessing uses
    the ``shape`` argument of ``_preprocess`` (default ``(554, 960)``).
    """

    def __init__(self, model, input_shape, cuda_ctx=None):
        """Load the engine and allocate the inference buffers.

        Args:
            model: Path of the serialized TensorRT engine file.
            input_shape: Network input size; stored as ``self.input_shape``.
            cuda_ctx: Optional CUDA context. When given it is pushed for the
                duration of the setup and popped again afterwards.
        """
        self.model = model
        self.input_shape = input_shape
        self.cuda_ctx = cuda_ctx
        # Define every attribute up front so that __del__ and runInference
        # never trip over a missing attribute after a failed setup.
        self.engine = None
        self.context = None
        self.stream = None
        self.cpu_in, self.cpu_out = [], []
        self.gpu_in, self.gpu_out = [], []
        self.bindings = []

        if self.cuda_ctx:
            self.cuda_ctx.push()
        try:
            self.logger = trt.Logger(trt.Logger.INFO)
            self._load_plugins()
            self.engine = self._load_engine()
            if not self.engine:
                print("No engine")
                return
            print(self.engine.name)

            try:
                self.context = self.engine.create_execution_context()
                self.stream = cuda.Stream()
                self.cpu_in, self.cpu_out, self.gpu_in, self.gpu_out, self.bindings = self._allocate_buffers()
            except Exception as e:
                # Best effort: report the problem; runInference refuses to
                # run on an instance whose setup did not complete.
                print('Error occured ', e)
        finally:
            # Always balance the push above, including on early return or
            # when loading the plugins/engine raises.
            if self.cuda_ctx:
                self.cuda_ctx.pop()

    def __del__(self):
        """Drop the references to the device buffers and the stream."""
        # Same release order as before (inputs, stream, outputs), but
        # tolerant of attributes that were never created.
        for name in ('gpu_in', 'stream', 'gpu_out'):
            self.__dict__.pop(name, None)

    def _load_plugins(self):
        """Initialise the TensorRT plugin registry.

        For TensorRT major versions below 7 the ``libflattenconcat.so``
        plugin library is loaded first.
        """
        # Compare the major version numerically: a string comparison would
        # treat "10" as smaller than "7".
        major_version = int(trt.__version__.split('.')[0])
        if major_version < 7:
            # XXX
            import ctypes  # only needed on this legacy path
            ctypes.CDLL("libflattenconcat.so")
        trt.init_libnvinfer_plugins(self.logger, '')

    def _load_engine(self):
        """Read ``self.model`` from disk and deserialize it into an engine.

        Returns:
            The object returned by ``Runtime.deserialize_cuda_engine``; the
            constructor treats a falsy result as "no engine".
        """
        TRT_engine = self.model
        with open(TRT_engine, 'rb') as f, trt.Runtime(self.logger) as runtime:
            print("loading engine from []", TRT_engine)
            return runtime.deserialize_cuda_engine(f.read())

    def _allocate_buffers(self):
        """Allocate host and device memory for every engine binding.

        Each binding gets a page-locked ``float32`` host array sized
        ``volume(binding shape) * max_batch_size`` and a device allocation of
        the same byte size.

        Returns:
            Tuple ``(cpu_in, cpu_out, gpu_in, gpu_out, bindings)`` where the
            first four are lists split by input/output binding and
            ``bindings`` holds the device addresses in binding order.
        """
        cpu_in, cpu_out, gpu_in, gpu_out, bindings = \
            [], [], [], [], []
        for binding in self.engine:
            size = trt.volume(self.engine.get_binding_shape(binding)) * \
                self.engine.max_batch_size
            cpu_mem = cuda.pagelocked_empty(size, np.float32)
            gpu_mem = cuda.mem_alloc(cpu_mem.nbytes)
            bindings.append(int(gpu_mem))
            if self.engine.binding_is_input(binding):
                cpu_in.append(cpu_mem)
                gpu_in.append(gpu_mem)
            else:
                cpu_out.append(cpu_mem)
                gpu_out.append(gpu_mem)
        return cpu_in, cpu_out, gpu_in, gpu_out, bindings

    def _preprocess(self, image, shape=(554, 960)):
        """Convert a BGR image into the network input tensor.

        Args:
            image: Image array in BGR channel order (it is converted with
                ``cv2.COLOR_BGR2RGB``).
            shape: Target size as ``(height, width)``.

        Returns:
            ``float32`` array in channel-first order, ``(3, height, width)``
            for a 3-channel input, with values mapped by ``x * 2/255 - 1``
            (i.e. [-1, 1] for 8-bit input).
        """
        target_h, target_w = shape
        # cv2.resize takes dsize as (width, height); passing (height, width)
        # would produce a transposed-size image with the same element count,
        # silently feeding a scrambled tensor to the engine.
        image = cv2.resize(image, (target_w, target_h))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = image.transpose((2, 0, 1)).astype(np.float32)
        image *= (2.0/255.0)
        image -= 1.0
        return image

    def _postprocess(self, image, output, conf_thresh=0.5, output_layout=7):
        """Decode a flat output buffer into detections.

        ``output`` is read as consecutive records of ``output_layout``
        values. Within a record, index 1 is the class id, index 2 the
        confidence and indices 3..6 are ``x1, y1, x2, y2``, which are scaled
        by the image width/height (so presumably normalised to [0, 1]).

        NOTE(review): this record layout looks like an SSD-style detection
        output; confirm that it matches the tensors the engine produces.

        Args:
            image: The original image; only its height and width are used.
            output: Flat sequence of floats.
            conf_thresh: Records with a lower confidence are skipped.
            output_layout: Number of values per record.

        Returns:
            Tuple ``(bboxes, confs, classes)`` of equally long lists; boxes
            are ``(x1, y1, x2, y2)`` integer pixel tuples.
        """
        # Only height/width are needed, so grayscale (2-D) images work too.
        height, width = image.shape[:2]
        bboxes, confs, classes = [], [], []
        # Stop before a trailing partial record so that idx+6 never runs
        # past the end of the buffer.
        last_start = len(output) - output_layout + 1
        for idx in range(0, last_start, output_layout):
            conf = float(output[idx+2])
            if conf < conf_thresh:
                continue
            # get bounding box in pixel coordinates
            x1 = int(output[idx+3] * width)
            y1 = int(output[idx+4] * height)
            x2 = int(output[idx+5] * width)
            y2 = int(output[idx+6] * height)
            # get detection class
            objClass = int(output[idx+1])
            bboxes.append((x1, y1, x2, y2))
            classes.append(objClass)
            confs.append(conf)
            print("Class: ", objClass)
            print("Conf: ", conf)
            print("BBox: ", x1, y1, x2, y2)
        return bboxes, confs, classes

    def runInference(self, image, conf_thresh=0.5):
        """Run the engine on one image and return the decoded detections.

        Args:
            image: BGR image array.
            conf_thresh: Minimum confidence passed on to ``_postprocess``.

        Returns:
            Tuple ``(bboxes, confs, classes)`` as produced by
            ``_postprocess`` from the first output buffer.

        Raises:
            RuntimeError: If the engine, execution context or buffers were
                not set up successfully by the constructor.
        """
        if self.context is None or not self.cpu_in or not self.cpu_out:
            raise RuntimeError("TensorRT engine is not initialised")

        data = self._preprocess(image)
        np.copyto(self.cpu_in[0], data.ravel())

        if self.cuda_ctx:
            self.cuda_ctx.push()
        try:
            cuda.memcpy_htod_async(self.gpu_in[0], self.cpu_in[0], self.stream)
            self.context.execute_async(batch_size=1, bindings=self.bindings, stream_handle=self.stream.handle)
            # Copy back every output binding instead of assuming exactly two.
            for host_mem, device_mem in zip(self.cpu_out, self.gpu_out):
                cuda.memcpy_dtoh_async(host_mem, device_mem, self.stream)

            self.stream.synchronize()
        finally:
            # Keep the context stack balanced even if a CUDA call raises.
            if self.cuda_ctx:
                self.cuda_ctx.pop()

        output = self.cpu_out[0]
        return self._postprocess(image, output, conf_thresh)


def parse_args():
    """Parse the command-line options of the demo script.

    Both options are required: without them ``None`` would be handed to the
    image reader/writer and fail later with a less helpful error.

    Returns:
        argparse.Namespace with the attributes ``input`` (path of the image
        to read) and ``output`` (path of the image to write).
    """
    desc = ("Run peoplenet on the given image")
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='path of the image to run detection on')
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='path the annotated image is written to')
    return parser.parse_args()


def draw_boundingbox(image, boxes, confs, classes):
    """Draw every detection box onto ``image`` and return the image.

    Args:
        image: Image array; the rectangles are drawn on it in place.
        boxes: Iterable of ``(x_min, y_min, x_max, y_max)`` pixel tuples.
        confs: Iterable of confidences, parallel to ``boxes`` (not drawn).
        classes: Iterable of class ids, parallel to ``boxes`` (printed only).

    Returns:
        The same ``image`` object, also when there are no detections.
    """
    for bb, _cf, cl in zip(boxes, confs, classes):
        cl = int(cl)
        x_min, y_min, x_max, y_max = bb[0], bb[1], bb[2], bb[3]
        print(cl, x_min, y_min, x_max, y_max)
        cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (255, 0, 0), 2)
    # Return after the loop: returning inside it drew only the first box
    # and yielded None for an empty detection list.
    return image


def main():
    """Run detection on the input image and write the annotated result.

    Reads the image named by ``--input``, runs it through the
    ``peoplenet.engine`` TensorRT engine, draws the detected boxes and
    writes the image to ``--output``.

    Raises:
        SystemExit: If the input image cannot be read or the output image
            cannot be written.
    """
    args = parse_args()
    in_file = args.input
    out_file = args.output
    trt_engine = TrtDetectNet('peoplenet.engine', (554, 960))

    image = cv2.imread(in_file)
    # cv2.imread signals an unreadable/missing file by returning None.
    if image is None:
        raise SystemExit("Could not read image: {}".format(in_file))

    boxes, confs, classes = trt_engine.runInference(image)
    out = draw_boundingbox(image, boxes, confs, classes)
    # Boxes are drawn on `image` in place, so fall back to it if the
    # drawing helper returned nothing.
    if out is None:
        out = image
    if not cv2.imwrite(out_file, out):
        raise SystemExit("Could not write image: {}".format(out_file))

# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()