Help with generating a gaze vector from the GazeNet model's output

Hello everyone,

I’m building a multi-input gaze estimation pipeline using NVIDIA’s GazeNet. The model produces five outputs in the form:

[x, y, z, theta, phi]

Currently I am using deployable_v1.0.

I’m not sure whether those angles are in degrees or radians, and I’m struggling to draw the correct gaze arrow on the original image. Alternatively, could the issue be in how I preprocess and supply the inputs from the original image to the model?

Below is an overview of my workflow and relevant code snippets:

Preprocessing Script (extract-multi-inputsv1.py)

  • Uses MediaPipe to detect the face bounding box and the landmarks for the left/right eyes.
  • Saves four inputs:
    • Face: 224×224 grayscale
    • Left Eye: 224×224 grayscale
    • Right Eye: 224×224 grayscale
    • FaceGrid: A 25×25 binary map (flattened to 625×1) indicating face location.
import cv2
import mediapipe as mp
import numpy as np
import os
from tqdm import tqdm


mp_face_detection = mp.solutions.face_detection
mp_face_mesh = mp.solutions.face_mesh
face_detection = mp_face_detection.FaceDetection(min_detection_confidence=0.7)

os.makedirs("processed/face", exist_ok=True)
os.makedirs("processed/left_eye", exist_ok=True)
os.makedirs("processed/right_eye", exist_ok=True)
os.makedirs("processed/facegrid", exist_ok=True)


def generate_facegrid(face_bbox, img_shape):
    H, W = img_shape[:2]
    grid_size = 25
    facegrid = np.zeros((grid_size * grid_size, 1), dtype=np.uint8)  # Shape: (625, 1)
    
    x, y, w, h = face_bbox
    x_grid = int((x / W) * grid_size)
    y_grid = int((y / H) * grid_size)
    w_grid = int((w / W) * grid_size)
    h_grid = int((h / H) * grid_size)

    for i in range(y_grid, min(y_grid + h_grid, grid_size)):
        for j in range(x_grid, min(x_grid + w_grid, grid_size)):
            index = i * grid_size + j  # flat 2D index to 1D
            facegrid[index] = 1
    return facegrid


LEFT_EYE_LANDMARKS = [362, 382, 381, 380, 374, 373, 390, 249, 263, 466, 388, 387, 386, 385, 384, 398]
RIGHT_EYE_LANDMARKS = [33, 7, 163, 144, 145, 153, 154, 155, 133, 173, 157, 158, 159, 160, 161, 246]

input_folder = r"front-cam-images"
output_folder = "processed"

for filename in tqdm(os.listdir(input_folder)):
    if not filename.lower().endswith(('.jpg', '.png', '.jpeg')):  # case-insensitive extension check
        continue
    
    img_path = os.path.join(input_folder, filename)
    image = cv2.imread(img_path)
    if image is None:
        continue
        
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    H, W = image.shape[:2]


    results = face_detection.process(image_rgb)
    if not results.detections:
        continue

    for detection in results.detections:
        bboxC = detection.location_data.relative_bounding_box
        x, y, w, h = int(bboxC.xmin * W), int(bboxC.ymin * H), int(bboxC.width * W), int(bboxC.height * H)
        x, y = max(x, 0), max(y, 0)  # clamp: the relative bbox can extend slightly past the frame
        face = image[y:y+h, x:x+w]
        if face.size == 0:
            continue
        face_gray = cv2.cvtColor(face, cv2.COLOR_BGR2GRAY)
        face_resized = cv2.resize(face_gray, (224, 224))


        with mp_face_mesh.FaceMesh(static_image_mode=True, max_num_faces=1, refine_landmarks=True) as face_mesh:
            mesh_results = face_mesh.process(image_rgb)
            if not mesh_results.multi_face_landmarks:
                continue

            for face_landmarks in mesh_results.multi_face_landmarks:
                # extract eye regions
                left_eye_points = [(int(face_landmarks.landmark[i].x * W), int(face_landmarks.landmark[i].y * H)) 
                                   for i in LEFT_EYE_LANDMARKS]
                right_eye_points = [(int(face_landmarks.landmark[i].x * W), int(face_landmarks.landmark[i].y * H)) 
                                    for i in RIGHT_EYE_LANDMARKS]
                
                margin = 10
                left_x_min = max(0, min(p[0] for p in left_eye_points) - margin)
                left_y_min = max(0, min(p[1] for p in left_eye_points) - margin)
                left_x_max = min(W, max(p[0] for p in left_eye_points) + margin)
                left_y_max = min(H, max(p[1] for p in left_eye_points) + margin)
                
                right_x_min = max(0, min(p[0] for p in right_eye_points) - margin)
                right_y_min = max(0, min(p[1] for p in right_eye_points) - margin)
                right_x_max = min(W, max(p[0] for p in right_eye_points) + margin)
                right_y_max = min(H, max(p[1] for p in right_eye_points) + margin)
                
                left_eye = image[left_y_min:left_y_max, left_x_min:left_x_max]
                right_eye = image[right_y_min:right_y_max, right_x_min:right_x_max]
                
                if left_eye.size == 0 or right_eye.size == 0:
                    continue
                
                left_eye_gray = cv2.cvtColor(left_eye, cv2.COLOR_BGR2GRAY)
                right_eye_gray = cv2.cvtColor(right_eye, cv2.COLOR_BGR2GRAY)
                left_eye_resized = cv2.resize(left_eye_gray, (224, 224))
                right_eye_resized = cv2.resize(right_eye_gray, (224, 224))

                # generate facegrid
                facegrid = generate_facegrid((x, y, w, h), image.shape)

                
                cv2.imwrite(f"{output_folder}/face/{filename}", face_resized)
                cv2.imwrite(f"{output_folder}/left_eye/{filename}", left_eye_resized)
                cv2.imwrite(f"{output_folder}/right_eye/{filename}", right_eye_resized)
                np.save(f"{output_folder}/facegrid/{filename.split('.')[0]}.npy", facegrid)
                break
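
To sanity-check the saved inputs, one of the facegrids can be reloaded to confirm its shape and how many cells were marked (just a small verification sketch; the filename is one of my processed images):

import numpy as np

# Example filename; any image processed by the script above works the same way.
fg = np.load("processed/facegrid/looking-up.npy")
print(fg.shape)                         # expected: (625, 1)
print(int(fg.sum()), "cells set to 1")  # grid cells covered by the face
print(fg.reshape(25, 25))               # view as the original 25x25 grid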


The processed images produced by the script above (the left eye, right eye, and the 224×224 grayscale face crops) are attached.
Inference Script (run_inference.py)

  • Loads the preprocessed images and facegrid.
  • Normalizes them to [0,1] and adds batch/channel dimensions.
  • Feeds them into a TensorRT engine (out.engine).
  • Prints the five-element output for each sample.
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import cv2
import os


def load_inputs(filename):
    base_path = "/workspace/gaze/processed" 
    
    face_path = os.path.join(base_path, "face", filename)
    left_eye_path = os.path.join(base_path, "left_eye", filename)
    right_eye_path = os.path.join(base_path, "right_eye", filename)
    facegrid_path = os.path.join(base_path, "facegrid", filename.split('.')[0] + ".npy")
    
    face = cv2.imread(face_path, cv2.IMREAD_GRAYSCALE)
    left_eye = cv2.imread(left_eye_path, cv2.IMREAD_GRAYSCALE)
    right_eye = cv2.imread(right_eye_path, cv2.IMREAD_GRAYSCALE)
    facegrid = np.load(facegrid_path)
    
    if face is None or left_eye is None or right_eye is None or facegrid is None:
        raise ValueError(f"Failed to load inputs for {filename}")
    
    if facegrid.shape != (625, 1):
        raise ValueError(f"Facegrid shape {facegrid.shape} does not match expected (625, 1)")
    

    face = face.astype(np.float32) / 255.0
    left_eye = left_eye.astype(np.float32) / 255.0
    right_eye = right_eye.astype(np.float32) / 255.0
    facegrid = facegrid.astype(np.float32)
    

    face = face[np.newaxis, np.newaxis, :, :]        #(1, 1, 224, 224)
    left_eye = left_eye[np.newaxis, np.newaxis, :, :]  #(1, 1, 224, 224)
    right_eye = right_eye[np.newaxis, np.newaxis, :, :] #(1, 1, 224, 224)
    facegrid = facegrid[np.newaxis, np.newaxis, :, :]    #(1, 1, 625, 1)
    
    return left_eye, right_eye, face, facegrid


engine_path = "out.engine"
with open(engine_path, "rb") as f:
    runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
    engine = runtime.deserialize_cuda_engine(f.read())


context = engine.create_execution_context()


context.set_binding_shape(0, (1, 1, 224, 224))  #input_right_images:0
context.set_binding_shape(1, (1, 1, 224, 224))  #input_left_images:0
context.set_binding_shape(2, (1, 1, 625, 1))    #input_facegrid:0
context.set_binding_shape(3, (1, 1, 224, 224))  #input_face_images:0


for i in range(engine.num_bindings):
    if engine.binding_is_input(i):
        print(f"Input Binding {i}: {engine.get_binding_name(i)}, Shape: {context.get_binding_shape(i)}")
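
# NOTE: the buffer allocation order below assumes the engine bindings are
# (right eye, left eye, facegrid, face); the binding names printed above
# should confirm whether that assumption actually matches the engine.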


inputs = [
    cuda.mem_alloc(1 * 1 * 224 * 224 * 4),  #right_eye
    cuda.mem_alloc(1 * 1 * 224 * 224 * 4),  #left_eye
    cuda.mem_alloc(1 * 1 * 625 * 1 * 4),    #facegrid
    cuda.mem_alloc(1 * 1 * 224 * 224 * 4)   #face
]


output_idx = [i for i in range(engine.num_bindings) if not engine.binding_is_input(i)][0]
output_shape = context.get_binding_shape(output_idx)
if output_shape[0] < 0:  # Handle dynamic batch size
    output_shape = (1, *output_shape[1:])  # Assume batch size 1
output_host = np.zeros(output_shape, dtype=np.float32)
d_output = cuda.mem_alloc(output_host.nbytes)

bindings = [int(d) for d in inputs] + [0] * (engine.num_bindings - len(inputs))
bindings[output_idx] = int(d_output)


filename = "looking-up.jpg"  
try:
    left_eye, right_eye, face, facegrid = load_inputs(filename)
except ValueError as e:
    print(f"Error: {e}")
    exit(1)


cuda.memcpy_htod(inputs[0], right_eye)  
cuda.memcpy_htod(inputs[1], left_eye)  
cuda.memcpy_htod(inputs[2], facegrid)  
cuda.memcpy_htod(inputs[3], face) 

# run inference
context.execute_v2(bindings)


cuda.memcpy_dtoh(output_host, d_output)


print(f"Gaze prediction for {filename}: {output_host}")
print(f"Output shape: {output_shape}")

Current Output
For a test image where the person is looking up, I get:

output = [ 1.4023830e+02 -1.2399844e+02 -3.4104328e+01 1.2113892e-01 -6.9884084e-02 ]
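
Since I’m not sure whether theta and phi are in radians or degrees, here is a small comparison sketch (a debugging aid only; it applies the same angle-to-vector formula as my drawing script below, and nothing in it is taken from the GazeNet docs):

import math

theta, phi = 1.2113892e-01, -6.9884084e-02  # values from the output above

for label, t, p in [("treated as radians", theta, phi),
                    ("treated as degrees", math.radians(theta), math.radians(phi))]:
    dir_x = math.cos(p) * math.sin(t)
    dir_y = math.sin(p)
    print(f"{label}: dir = ({dir_x:.4f}, {dir_y:.4f})")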

Drawing the Gaze Vector (check-output.py)

I currently take theta = output[3] and phi = output[4], then do the following:

import cv2
import mediapipe as mp
import numpy as np
import os
import math


mp_face_detection = mp.solutions.face_detection
face_detection = mp_face_detection.FaceDetection(min_detection_confidence=0.7)


def draw_gaze(img, face_center, gaze_vector, scale=100, color=(0, 255, 0)):
    # Draws an arrow from the face center indicating the gaze direction.
    print("Gaze Vector:", gaze_vector)           # Debug print
    end_x = int(face_center[0] + gaze_vector[0] * scale)
    end_y = int(face_center[1] + gaze_vector[1] * scale)
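    # note: in image coordinates y grows downward, so a positive gaze_vector[1]
    # moves the arrow tip toward the bottom of the frame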
    print("Endpoint:", (end_x, end_y))           # Debug print
    cv2.arrowedLine(img, face_center, (end_x, end_y), color, 2, tipLength=0.3)


def estimate_gaze():
    """Returns the 2D gaze direction computed from the logged model output."""
    gaze_output = np.array([1.4023830e+02, -1.2399844e+02, -3.4104328e+01,
                            1.2113892e-01, -6.9884084e-02])
    theta = gaze_output[3]
    phi = gaze_output[4]
    # Convert the angles to a 2D direction vector (math.cos/math.sin expect radians)
    dir_x = math.cos(phi) * math.sin(theta)
    dir_y = math.sin(phi)
    return (dir_x, dir_y)


def process_frame(image_path):
    img = cv2.imread(image_path)
    h, w, _ = img.shape


    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    results = face_detection.process(img_rgb)

    if results.detections:
        for detection in results.detections:
            bbox = detection.location_data.relative_bounding_box
            x, y, w_box, h_box = int(bbox.xmin * w), int(bbox.ymin * h), \
                                 int(bbox.width * w), int(bbox.height * h)
            face_center = (x + w_box // 2, y + h_box // 2)

            gaze_vector = estimate_gaze()

            cv2.rectangle(img, (x, y), (x + w_box, y + h_box), (255, 0, 0), 2)


            draw_gaze(img, face_center, gaze_vector)


    cv2.imshow("Gaze Estimation", img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()


process_frame(r"front-cam-images\looking-up.jpg")  

The key conversion I rely on is:

dir_x = math.cos(phi) * math.sin(theta)
dir_y = math.sin(phi)

But the arrow drawn from the face center doesn’t match the direction the person is actually looking, and I’m not sure what’s wrong.

Can anyone help me figure out what’s going wrong? Thanks in advance!