Hello everyone,
I’m building a multi-input gaze estimation pipeline using NVIDIA’s GazeNet. The model produces five outputs in the form:
[x, y, z, theta, phi]
I am currently using deployable_v1.0.
I’m not sure whether those angles are in degrees or radians, and I’m struggling to draw a correct gaze arrow on the original image. Or is the issue in how I process and supply the inputs from the original image to the model?
Below is an overview of my workflow and relevant code snippets:
Preprocessing Script (extract-multi-inputsv1.py)
- Uses Mediapipe to detect the face bounding box and landmarks for left/right eyes.
- Saves four inputs:
  - Face: 224×224 grayscale
  - Left Eye: 224×224 grayscale
  - Right Eye: 224×224 grayscale
  - FaceGrid: a 25×25 binary map (flattened to 625×1) indicating where the face is in the frame.
import cv2
import mediapipe as mp
import numpy as np
import os
from tqdm import tqdm
mp_face_detection = mp.solutions.face_detection
mp_face_mesh = mp.solutions.face_mesh
face_detection = mp_face_detection.FaceDetection(min_detection_confidence=0.7)
os.makedirs("processed/face", exist_ok=True)
os.makedirs("processed/left_eye", exist_ok=True)
os.makedirs("processed/right_eye", exist_ok=True)
os.makedirs("processed/facegrid", exist_ok=True)
def generate_facegrid(face_bbox, img_shape):
    H, W = img_shape[:2]
    grid_size = 25
    facegrid = np.zeros((grid_size * grid_size, 1), dtype=np.uint8)  # shape: (625, 1)
    x, y, w, h = face_bbox
    x_grid = int((x / W) * grid_size)
    y_grid = int((y / H) * grid_size)
    w_grid = int((w / W) * grid_size)
    h_grid = int((h / H) * grid_size)
    for i in range(y_grid, min(y_grid + h_grid, grid_size)):
        for j in range(x_grid, min(x_grid + w_grid, grid_size)):
            index = i * grid_size + j  # flatten 2D (row, col) to 1D index
            facegrid[index] = 1
    return facegrid
LEFT_EYE_LANDMARKS = [362, 382, 381, 380, 374, 373, 390, 249, 263, 466, 388, 387, 386, 385, 384, 398]
RIGHT_EYE_LANDMARKS = [33, 7, 163, 144, 145, 153, 154, 155, 133, 173, 157, 158, 159, 160, 161, 246]
input_folder = r"front-cam-images"
output_folder = "processed"
for filename in tqdm(os.listdir(input_folder)):
    if not filename.endswith(('.jpg', '.png', '.jpeg')):
        continue
    img_path = os.path.join(input_folder, filename)
    image = cv2.imread(img_path)
    if image is None:
        continue
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    H, W = image.shape[:2]
    results = face_detection.process(image_rgb)
    if not results.detections:
        continue
    for detection in results.detections:
        bboxC = detection.location_data.relative_bounding_box
        x, y, w, h = int(bboxC.xmin * W), int(bboxC.ymin * H), int(bboxC.width * W), int(bboxC.height * H)
        face = image[y:y+h, x:x+w]
        face_gray = cv2.cvtColor(face, cv2.COLOR_BGR2GRAY)
        face_resized = cv2.resize(face_gray, (224, 224))
        with mp_face_mesh.FaceMesh(static_image_mode=True, max_num_faces=1, refine_landmarks=True) as face_mesh:
            mesh_results = face_mesh.process(image_rgb)
            if not mesh_results.multi_face_landmarks:
                continue
            for face_landmarks in mesh_results.multi_face_landmarks:
                # extract eye regions from the face mesh landmarks
                left_eye_points = [(int(face_landmarks.landmark[i].x * W), int(face_landmarks.landmark[i].y * H))
                                   for i in LEFT_EYE_LANDMARKS]
                right_eye_points = [(int(face_landmarks.landmark[i].x * W), int(face_landmarks.landmark[i].y * H))
                                    for i in RIGHT_EYE_LANDMARKS]
                margin = 10
                left_x_min = max(0, min(p[0] for p in left_eye_points) - margin)
                left_y_min = max(0, min(p[1] for p in left_eye_points) - margin)
                left_x_max = min(W, max(p[0] for p in left_eye_points) + margin)
                left_y_max = min(H, max(p[1] for p in left_eye_points) + margin)
                right_x_min = max(0, min(p[0] for p in right_eye_points) - margin)
                right_y_min = max(0, min(p[1] for p in right_eye_points) - margin)
                right_x_max = min(W, max(p[0] for p in right_eye_points) + margin)
                right_y_max = min(H, max(p[1] for p in right_eye_points) + margin)
                left_eye = image[left_y_min:left_y_max, left_x_min:left_x_max]
                right_eye = image[right_y_min:right_y_max, right_x_min:right_x_max]
                if left_eye.size == 0 or right_eye.size == 0:
                    continue
                left_eye_gray = cv2.cvtColor(left_eye, cv2.COLOR_BGR2GRAY)
                right_eye_gray = cv2.cvtColor(right_eye, cv2.COLOR_BGR2GRAY)
                left_eye_resized = cv2.resize(left_eye_gray, (224, 224))
                right_eye_resized = cv2.resize(right_eye_gray, (224, 224))
                # generate the 625x1 facegrid from the face bounding box
                facegrid = generate_facegrid((x, y, w, h), image.shape)
                cv2.imwrite(f"{output_folder}/face/{filename}", face_resized)
                cv2.imwrite(f"{output_folder}/left_eye/{filename}", left_eye_resized)
                cv2.imwrite(f"{output_folder}/right_eye/{filename}", right_eye_resized)
                np.save(f"{output_folder}/facegrid/{filename.split('.')[0]}.npy", facegrid)
        break  # only process the first detected face per image
The processed outputs from this script (the left-eye crop, the right-eye crop, and the 224×224 grayscale face) are shown in the attached images.
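The facegrid is harder to eyeball as an image, so here is a quick sanity-check snippet (not part of the pipeline) that reshapes a saved 625×1 array back to 25×25, using one of the files saved above:

import numpy as np

# Quick sanity check: reshape the saved 625x1 facegrid back to 25x25 and print it.
# The block of 1s should roughly cover the part of the frame where the face is.
fg = np.load("processed/facegrid/looking-up.npy")  # shape (625, 1)
grid = fg.reshape(25, 25)                          # row-major, matching index = i * 25 + j
print(grid)
print("cells set:", int(grid.sum()), "of", grid.size)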
Inference Script (run_inference.py)
- Loads the preprocessed images and facegrid.
- Normalizes them to [0,1] and adds batch/channel dimensions.
- Feeds them into a TensorRT engine (out.engine).
- Prints the five-element output for each sample.
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import cv2
import os
def load_inputs(filename):
    base_path = "/workspace/gaze/processed"
    face_path = os.path.join(base_path, "face", filename)
    left_eye_path = os.path.join(base_path, "left_eye", filename)
    right_eye_path = os.path.join(base_path, "right_eye", filename)
    facegrid_path = os.path.join(base_path, "facegrid", filename.split('.')[0] + ".npy")
    face = cv2.imread(face_path, cv2.IMREAD_GRAYSCALE)
    left_eye = cv2.imread(left_eye_path, cv2.IMREAD_GRAYSCALE)
    right_eye = cv2.imread(right_eye_path, cv2.IMREAD_GRAYSCALE)
    facegrid = np.load(facegrid_path)
    if face is None or left_eye is None or right_eye is None or facegrid is None:
        raise ValueError(f"Failed to load inputs for {filename}")
    if facegrid.shape != (625, 1):
        raise ValueError(f"Facegrid shape {facegrid.shape} does not match expected (625, 1)")
    face = face.astype(np.float32) / 255.0
    left_eye = left_eye.astype(np.float32) / 255.0
    right_eye = right_eye.astype(np.float32) / 255.0
    facegrid = facegrid.astype(np.float32)
    face = face[np.newaxis, np.newaxis, :, :]            # (1, 1, 224, 224)
    left_eye = left_eye[np.newaxis, np.newaxis, :, :]    # (1, 1, 224, 224)
    right_eye = right_eye[np.newaxis, np.newaxis, :, :]  # (1, 1, 224, 224)
    facegrid = facegrid[np.newaxis, np.newaxis, :, :]    # (1, 1, 625, 1)
    return left_eye, right_eye, face, facegrid
engine_path = "out.engine"
with open(engine_path, "rb") as f:
    runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
    engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()
context.set_binding_shape(0, (1, 1, 224, 224))  # input_right_images:0
context.set_binding_shape(1, (1, 1, 224, 224))  # input_left_images:0
context.set_binding_shape(2, (1, 1, 625, 1))    # input_facegrid:0
context.set_binding_shape(3, (1, 1, 224, 224))  # input_face_images:0
for i in range(engine.num_bindings):
    if engine.binding_is_input(i):
        print(f"Input Binding {i}: {engine.get_binding_name(i)}, Shape: {context.get_binding_shape(i)}")
inputs = [
cuda.mem_alloc(1 * 1 * 224 * 224 * 4), #right_eye
cuda.mem_alloc(1 * 1 * 224 * 224 * 4), #left_eye
cuda.mem_alloc(1 * 1 * 625 * 1 * 4), #facegrid
cuda.mem_alloc(1 * 1 * 224 * 224 * 4) #face
]
output_idx = [i for i in range(engine.num_bindings) if not engine.binding_is_input(i)][0]
output_shape = context.get_binding_shape(output_idx)
if output_shape[0] < 0:  # handle dynamic batch size
    output_shape = (1, *output_shape[1:])  # assume batch size 1
output_host = np.zeros(output_shape, dtype=np.float32)
d_output = cuda.mem_alloc(output_host.nbytes)
bindings = [int(d) for d in inputs] + [0] * (engine.num_bindings - len(inputs))
bindings[output_idx] = int(d_output)
filename = "looking-up.jpg"
try:
    left_eye, right_eye, face, facegrid = load_inputs(filename)
except ValueError as e:
    print(f"Error: {e}")
    exit(1)
cuda.memcpy_htod(inputs[0], right_eye)
cuda.memcpy_htod(inputs[1], left_eye)
cuda.memcpy_htod(inputs[2], facegrid)
cuda.memcpy_htod(inputs[3], face)
# run inference
context.execute_v2(bindings)
cuda.memcpy_dtoh(output_host, d_output)
print(f"Gaze prediction for {filename}: {output_host}")
print(f"Output shape: {output_shape}")
Current Output
For a test image where a man is looking up, I get:
output = [ 1.4023830e+02 -1.2399844e+02 -3.4104328e+01 1.2113892e-01 -6.9884084e-02 ]
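For what it's worth, if those last two values were already in radians they would amount to only a few degrees:

import numpy as np
print(np.degrees([1.2113892e-01, -6.9884084e-02]))  # approximately [ 6.94, -4.00 ] degrees

So part of my confusion is whether the model really returns radians here, or something else entirely.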
Drawing the Gaze Vector (check-output.py)
I currently take theta = output[3] and phi = output[4], then do:
import cv2
import mediapipe as mp
import numpy as np
import os
import math
mp_face_detection = mp.solutions.face_detection
face_detection = mp_face_detection.FaceDetection(min_detection_confidence=0.7)
def draw_gaze(img, face_center, gaze_vector, scale=100, color=(0, 255, 0)):
    # draws an arrow from the face center indicating gaze direction
    print("Gaze Vector:", gaze_vector)  # debug print
    end_x = int(face_center[0] + gaze_vector[0] * scale)
    end_y = int(face_center[1] + gaze_vector[1] * scale)
    print("Endpoint:", (end_x, end_y))  # debug print
    cv2.arrowedLine(img, face_center, (end_x, end_y), color, 2, tipLength=0.3)
def estimate_gaze():
    """Returns a 2D gaze vector from the model output above (hard-coded for this test image)."""
    gaze_output = np.array([1.4023830e+02, -1.2399844e+02, -3.4104328e+01, 1.2113892e-01,
                            -6.9884084e-02])
    theta = gaze_output[3]
    phi = gaze_output[4]
    # convert the angles to a 2D direction vector
    dir_x = math.cos(phi) * math.sin(theta)
    dir_y = math.sin(phi)
    return (dir_x, dir_y)
def process_frame(image_path):
    img = cv2.imread(image_path)
    h, w, _ = img.shape
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    results = face_detection.process(img_rgb)
    if results.detections:
        for detection in results.detections:
            bbox = detection.location_data.relative_bounding_box
            x, y, w_box, h_box = int(bbox.xmin * w), int(bbox.ymin * h), \
                                 int(bbox.width * w), int(bbox.height * h)
            face_center = (x + w_box // 2, y + h_box // 2)
            gaze_vector = estimate_gaze()
            cv2.rectangle(img, (x, y), (x + w_box, y + h_box), (255, 0, 0), 2)
            draw_gaze(img, face_center, gaze_vector)
    cv2.imshow("Gaze Estimation", img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
process_frame(r"front-cam-images\looking-up.jpg")
So the 2D direction I draw comes from dir_x = math.cos(phi) * math.sin(theta) and dir_y = math.sin(phi). But the arrow drawn from the face center doesn't match the direction the person is actually looking, and I can't tell whether the problem is the angle units/convention or the way I prepare the inputs.
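For reference, the kind of conversion I suspect might be needed instead is the pitch/yaw-to-image-plane mapping I've seen in other gaze estimation code (a sketch only: it assumes theta is pitch and phi is yaw, both in radians, and that image y grows downward; I haven't been able to confirm that this is GazeNet's convention):

import math

def angles_to_arrow(theta, phi, assume_degrees=False):
    # Sketch (convention NOT confirmed for GazeNet): theta = pitch, phi = yaw.
    if assume_degrees:
        theta, phi = math.radians(theta), math.radians(phi)
    dx = -math.cos(theta) * math.sin(phi)  # yaw projected onto the image x axis
    dy = -math.sin(theta)                  # image y grows downward, so looking up => negative dy
    return dx, dy

Swapping theta and phi or flipping the signs changes the arrow completely, so knowing the exact units and axis convention would settle it.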
Can anyone help me figure out what I'm missing?