Jetson TX2 frame rate with face detection and 5-point facial landmarks

Hi guys, I’m a graduate student in Industrial Design and I’m currently using the Jetson TX2 to integrate face detection and facial landmark detection in one of my prototypes. I have it working right now using the developer kit camera that comes with the TX2. However, my problem is that I can only reach a frame rate of about 4 fps, while on my laptop (32 GB RAM, 4 GB GPU; I know, not a fair comparison) I get (what feels like) real-time detection. I’m using Python, OpenCV, dlib and some more packages to get it all working. However, I’m not a die-hard programmer and therefore I have no idea whether I am using the Jetson TX2 to its fullest potential. So my question is: how can I check this? Are there ways to get this code working faster? Any advice in general? You can find my code below:

# the needed libraries/packages
from imutils.video import WebcamVideoStream
from imutils import face_utils
from imutils.video import FPS
import numpy as np
import argparse
import imutils
import time
import dlib
import cv2


# Command-line options. Every option has a default, so the script can be
# started without any arguments; the parsed values end up in the `args` dict.
ap = argparse.ArgumentParser()
ap.add_argument(
    "-s", "--shape-predictor",
    default="./shape_predictor_5_face_landmarks.dat",
    help="path to facial landmark predictor",
)
ap.add_argument(
    "-p", "--prototxt",
    default="./deploy.prototxt.txt",
    help="path to Caffe 'deploy' prototxt file",
)
ap.add_argument(
    "-m", "--model",
    default="./res10_300x300_ssd_iter_140000.caffemodel",
    help="path to Caffe pre-trained model",
)
ap.add_argument(
    "-c", "--confidence",
    type=float,
    default=0.2,
    help="minimum probability to filter weak detections",
)
ap.add_argument(
    "-b", "--blurtreshold",
    type=float,
    default=100.0,
    help="The sharpnes treshold",
)
ap.add_argument(
    "-t", "--sizetreshold",
    type=float,
    default=0.4,
    help="The amount of pixels the face takes in percentage of total video resolution",
)
ap.add_argument(
    "-w", "--width",
    type=int,
    default=1920,
    help="width of video input",
)
ap.add_argument(
    "-l", "--height",
    type=int,
    default=1080,
    help="height of video input",
)
# vars() turns the Namespace into a plain dict keyed by option name.
args = vars(ap.parse_args())

# Load both models once, before the capture loop starts.
print("[INFO] loading models...")

# Face detector: Caffe network run through OpenCV's DNN module.
net = cv2.dnn.readNetFromCaffe(
    args["prototxt"],
    args["model"],
)

# Facial landmark predictor from dlib.
predictor = dlib.shape_predictor(
    args["shape_predictor"],
)

#starting up the camera
# GStreamer pipeline: nvcamerasrc delivers 2592x1458 I420 frames at 30/1,
# nvvidconv rescales them to the requested --width/--height as BGRx, and
# videoconvert hands them to the appsink that OpenCV reads from.
gst_str = ("nvcamerasrc ! "
               "video/x-raw(memory:NVMM), width=(int)2592, height=(int)1458, format=(string)I420, framerate=(fraction)30/1 ! "
               "nvvidconv ! video/x-raw, width=(int){}, height=(int){}, format=(string)BGRx ! "
               "videoconvert ! appsink").format(args["width"], args["height"])
vs = cv2.VideoCapture(gst_str,cv2.CAP_GSTREAMER)

# Fail early with a clear message instead of crashing later on a None frame.
if not vs.isOpened():
    raise RuntimeError("[ERROR] could not open camera with GStreamer pipeline: " + gst_str)

# Give the camera a moment to warm up ...
time.sleep(0.5)

# ... and only then start the FPS timer, so the warm-up pause is not counted
# against the measured frame rate.
fps = FPS().start()

def variance_of_laplacian(image):
    """Return the variance of the Laplacian of ``image`` as a sharpness score.

    The score is computed on the image that is passed in (not on the global
    frame), so callers can measure the sharpness of a cropped region.

    Args:
        image: image array to measure; may be a cropped region of a frame.

    Returns:
        The variance of ``cv2.Laplacian(image, cv2.CV_64F)``, or ``0.0`` when
        ``image`` is ``None`` or contains no pixels.
    """
    # An empty crop (e.g. a region that fell outside the frame) has no pixels
    # to measure; report zero instead of letting OpenCV raise.
    if image is None or image.size == 0:
        return 0.0
    return cv2.Laplacian(image, cv2.CV_64F).var()

def create_eye_rect(x1,y1,x2,y2,name):
    """Measure and print the sharpness of one eye region, then outline it.

    The region spans the two given eye points horizontally and is padded
    vertically by 1/16 of the current face-box height. Reads the module-level
    ``frame``, ``startY`` and ``endY`` set by the main loop, and draws a green
    rectangle onto ``frame``.

    Args:
        x1, y1: integer pixel coordinates of the first eye point.
        x2, y2: integer pixel coordinates of the second eye point.
        name: label printed in front of the sharpness value.
    """
    faceHeight = endY - startY
    eyeHeight = faceHeight / 8
    pad = int(eyeHeight / 2)

    # The two points can arrive in either order; normalise them so the slice
    # below is never empty just because x1 > x2 or y1 > y2.
    left, right = min(x1, x2), max(x1, x2)
    top, bottom = min(y1, y2) - pad, max(y1, y2) + pad

    # Clamp to the frame: negative slice indices would silently wrap around
    # and select the wrong pixels.
    frame_h, frame_w = frame.shape[:2]
    row_lo = max(0, min(top, frame_h))
    row_hi = max(0, min(bottom, frame_h))
    col_lo = max(0, min(left, frame_w))
    col_hi = max(0, min(right, frame_w))
    roi = frame[row_lo:row_hi, col_lo:col_hi]

    # Measure BEFORE drawing: the ROI is a view into `frame`, so a rectangle
    # drawn first would add its own edges to the measurement.
    sharpnessFactor = variance_of_laplacian(roi)

    cv2.rectangle(frame, (left, top), (right, bottom), (0, 255, 0), 2)
    print(name, "has variance of laplacian of:", sharpnessFactor)


def find_center_of_eye(eye,name):
    """Mark the midpoint between the two points of ``eye`` on the frame.

    Also forwards both points to ``create_eye_rect`` (before the midpoint is
    drawn) so the eye region is measured and outlined.

    Args:
        eye: indexable holding two (x, y) points, at positions 0 and 1.
        name: label passed through to ``create_eye_rect``.

    Returns:
        The result of ``cv2.circle`` drawing a filled magenta dot of radius 4
        at the midpoint on the module-level ``frame``.
    """
    (ax, ay), (bx, by) = eye[0], eye[1]

    # Midpoint of the segment between the two points, truncated to whole pixels.
    mid_x = int(ax + ((bx - ax) / 2))
    mid_y = int(ay + ((by - ay) / 2))

    create_eye_rect(int(ax), int(ay), int(bx), int(by), name)

    centre = (mid_x, mid_y)
    return cv2.circle(frame, centre, 4, (255, 0, 255), -1)


# loop over the frames from the video stream
# Per frame: detect faces with the DNN, and for every face that is tall
# enough relative to the frame, run the landmark predictor and measure the
# sharpness of both eye regions. Press `q` in the window to quit.
try:
    while True:
        ret_val, frame = vs.read()

        # A failed grab returns no usable frame; stop cleanly instead of
        # crashing inside the resize below.
        if not ret_val or frame is None:
            print("[INFO] no frame received from camera, stopping")
            break

        frame = imutils.resize(frame,width=400)
        # Taken before anything is drawn on `frame`, so the landmark predictor
        # works on an unannotated image.
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        (h, w) = frame.shape[:2]
        blob = cv2.dnn.blobFromImage(cv2.resize(frame, (300, 300)), 1.0,(300, 300), (104.0, 177.0, 123.0))
        net.setInput(blob)
        detections = net.forward()

        for i in range(detections.shape[2]):
            confidence = detections[0, 0, i, 2]

            if confidence < args["confidence"]:
                continue

            # Scale the box by the frame width/height to get pixel coordinates.
            box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
            # .tolist() yields plain Python ints for the drawing/dlib calls.
            # startY/endY stay module-level names: create_eye_rect reads them.
            (startX, startY, endX, endY) = box.astype("int").tolist()
            dlib_rect = dlib.rectangle(startX, startY, endX, endY)
            text1 = "DNN confidence = {:.2f}%".format(confidence * 100)
            cv2.rectangle(frame, (startX, startY), (endX, endY),(0, 0, 255), 2)
            cv2.rectangle(frame, (0, 0), (680, 30),(255, 255, 255),-1)
            cv2.putText(frame, text1, (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.45, (0, 0, 0), 1)

            # Only analyse faces whose height is a large enough fraction of the
            # frame height. float() keeps this a true division everywhere.
            if (float(endY - startY) / h) > args["sizetreshold"]:
                shape = predictor(gray,dlib_rect)
                shape = face_utils.shape_to_np(shape)
                eye_one = shape[0:2]
                eye_two = shape[2:4]
                # Measure the eyes before drawing the landmark dots: the
                # helpers read their pixels from `frame`, and dots drawn first
                # would end up inside the measured regions.
                find_center_of_eye(eye_one,"sharpness eye one")
                find_center_of_eye(eye_two,"sharpness eye two")
                for (x,y) in shape:
                    cv2.circle(frame,(int(x),int(y)),2,(255,255,255),1)

        cv2.imshow("Frame", frame)
        key = cv2.waitKey(1) & 0xFF
        fps.update()

        # if the `q` key was pressed, break from the loop
        if key == ord("q"):
            break

    fps.stop()
    print("[INFO] approx.FPS: {:.2f}".format(fps.fps()))
finally:
    # Always free the window and the camera, even if the loop raised.
    cv2.destroyAllWindows()
    # cv2.VideoCapture is closed with release(); it has no stop() method.
    vs.release()

Hi j.g.a.m.cox, using NVIDIA TensorRT you should get faster performance. It’s installed by JetPack. This tutorial uses it and includes a face detection model; I get up to 12 FPS with it: https://github.com/dusty-nv/jetson-inference#running-the-live-camera-detection-demo-on-jetson