Thank you Dusty, that worked. You’re a huge help, yet again :)
For anyone playing along at home, here’s the code that I implemented.
It works with a single object class: it renders the input video, and when there is a detection it renders the cropped bounding-box video next to the input video.
(I am a rookie at python, so please don’t judge too harshly if I’ve made mistakes. I’m happy to take feedback)
#!/usr/bin/env python3
#
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
import argparse
import datetime
import math
import sys
import os
from jetson_inference import detectNet
from jetson_utils import (videoSource, videoOutput, logUsage, saveImage,
                          cudaAllocMapped, cudaCrop, cudaDeviceSynchronize,
                          cudaOverlay, cudaResize)
# parse the command line
parser = argparse.ArgumentParser(description="Locate objects in a live camera stream using an object detection DNN.",
                                 formatter_class=argparse.RawTextHelpFormatter,
                                 epilog=detectNet.Usage() + videoSource.Usage() + videoOutput.Usage() + logUsage())
parser.add_argument("input_URI", type=str, default="", nargs='?', help="URI of the input stream")
parser.add_argument("output_URI", type=str, default="", nargs='?', help="URI of the output stream")
parser.add_argument("--network", type=str, default="ssd-mobilenet-v2", help="pre-trained model to load (see below for options)")
parser.add_argument("--overlay", type=str, default="box,labels,conf", help="detection overlay flags (e.g. --overlay=box,labels,conf)\nvalid combinations are: 'box', 'labels', 'conf', 'none'")
parser.add_argument("--threshold", type=float, default=0.5, help="minimum detection threshold to use")
parser.add_argument("--snapshots", type=str, default="images/test/detections", help="output directory of detection snapshots")
parser.add_argument("--timestamp", type=str, default="%Y%m%d-%H%M%S-%f", help="timestamp format used in snapshot filenames")
try:
    args = parser.parse_known_args()[0]
except:
    print("")
    parser.print_help()
    sys.exit(0)
# create video output object
output = videoOutput("cool_detection_video.mp4")
# load the object detection network
net = detectNet(argv=["--model=/home/<your-jetson>/jetson-inference/python/training/detection/ssd/models/<your-model>/ssd-mobilenet.onnx",
                      "--labels=/home/<your-jetson>/jetson-inference/python/training/detection/ssd/models/<your-model>/labels.txt",
                      "--input_blob=input_0", "--output_cvg=scores", "--output_bbox=boxes",
                      "--threshold=0.2"])
# create video source
input = videoSource("/dev/video0", argv=["--input-height=720", "--input-width=1280"])
# process frames until the user exits
while True:
    # capture the next image
    img = input.Capture()

    # detect objects in the image
    detections = net.Detect(img, overlay='none')

    # this is used to determine if an object was detected
    boundBox = None

    for detection in detections:
        roi = (int(detection.Left), int(detection.Top), int(detection.Right), int(detection.Bottom))
        boundBox = cudaAllocMapped(width=roi[2]-roi[0], height=roi[3]-roi[1], format=img.format)
        cudaCrop(img, boundBox, roi)
        cudaDeviceSynchronize()
        break   # single detection only, as I only have 1 object class

    # render the image
    if boundBox:
        # getting aspect ratio of boundBox
        ratio = detection.Height / detection.Width

        # resizing the bounding box video to 500 width, maintaining the same aspect ratio
        boundBoxResized = cudaAllocMapped(width=500, height=int(500*ratio), format=img.format)
        cudaResize(boundBox, boundBoxResized)
        cudaDeviceSynchronize()

        # combining input video with bounding box video
        imgOutput = cudaAllocMapped(width=img.width + 500, height=img.height, format=img.format)
        cudaOverlay(img, imgOutput, 0, 0)
        cudaOverlay(boundBoxResized, imgOutput, img.width, 0)
        cudaDeviceSynchronize()
        output.Render(imgOutput)
        del boundBox
    else:
        imgOutput = cudaAllocMapped(width=img.width + 500, height=img.height, format=img.format)
        cudaOverlay(img, imgOutput, 0, 0)
        cudaDeviceSynchronize()
        output.Render(imgOutput)

    # update the title bar
    output.SetStatus("{:s} | Network {:.0f} FPS".format(args.network, net.GetNetworkFPS()))

    # exit on input/output EOS
    if not input.IsStreaming() or not output.IsStreaming():
        break
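One usage note in case it trips anyone up: the input is hard-coded to /dev/video0 and the output is hard-coded to cool_detection_video.mp4, so the positional input_URI/output_URI arguments aren't actually used (only --network shows up, in the status bar text). If you'd rather pass the streams on the command line, a minimal tweak (untested sketch, using the same argparse variables as above) would be to swap the two hard-coded lines for:

# use the command-line arguments instead of hard-coded streams
output = videoOutput(args.output_URI, argv=sys.argv)
input = videoSource(args.input_URI, argv=sys.argv)

and run it with something like (the script name is just an example, use whatever you saved it as):

python3 my-detection-sidebyside.py /dev/video0 cool_detection_video.mp4 --input-width=1280 --input-height=720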