Hi,
I am currently capturing video frames from a .mp4 file and applying transformations using vpi - CUDA backend and CuPy User-defined kernel before rendering it using jetson-utils VideoSource and VideoOutput in jetson-utils. I am reading the .mp4 file in rgb8 format, converting it into uint8 format for image transformations and rendering it using the same rgb8 format. Here is the code -
import cupy as cp
from exception_message import ExceptionMessage
import numpy
from jetson_utils import videoSource, videoOutput, cudaImage
from extract_system_calibration import EXTRACT_SYSTEM_CALIBRATION
import vpi
kernel_mask_image_renderer = cp.RawKernel(r'''
extern "C" __global__
void elementwise_matrix_multiplication_mask_image(unsigned char *maskImg, unsigned char *fullImg, unsigned char *mask, int width, int height, int dim)
{
const int row = blockIdx.y*blockDim.y + threadIdx.y;
const int col = blockIdx.x*blockDim.x + threadIdx.x;
const int dep = blockIdx.z*blockDim.z + threadIdx.z;
unsigned char op_val;
if(row<height && col<width && dep<dim)
{
op_val = fullImg[row*width+col+dep*width*height] * mask[row*width+col+dep*width*height];
maskImg[row*width+col+dep*width*height] = op_val;
}
}
''','elementwise_matrix_multiplication_mask_image')
excpt_msg = ExceptionMessage(Status=False, Message=None)
roi_mask = cupy.asarray(__ROI_mask_front__).astype(numpy.uint8)
final_image = cupy.asarray(numpy.zeros((720,1280,3), dtype=numpy.uint8))
param_videoOutput = []
param_videoOutput.append("--width=" + str(1280))
param_videoOutput.append(f"--height=" + str(720))
output_stream = videoOutput('display://0', argv=param_videoOutput)
cap_stream = videoSource('file:///video.mp4')
while True:
try:
frame = cap_stream.Capture(format='rgb8')
with vpi.Backend.CUDA:
distortion_corrected = vpi.asimage(frame )\
.remap(warpmap_distortion_correction, interp=vpi.Interp.LINEAR)\
with distortion_corrected.rlock_cuda() as cudaBuffer:
image_pointer = cudaBuffer.__cuda_array_interface__['data'][0]
mem = cupy.cuda.UnownedMemory(image_pointer, 2764800, owner=None)
memptr = cupy.cuda.MemoryPointer(mem, offset=0)
distortion_corrected_cpArr = cupy.ndarray((720, 1280, 3), dtype=cupy.uint8, memptr=memptr, strides = (3840, 3, 1))
kernel_mask_image_renderer((43,65),(17,20,3),\
(final_image,roi_mask,distortion_corrected_cpArr,1280,720,3))
final_image_ptr = final_image.__cuda_array_interface__['data'][0]
masked_cudaImg = cudaImage(ptr = final_image_ptr, width=1280, height=720, format='rgb8')
output_stream.Render(masked_cudaImg)
if not cap_stream.IsStreaming() or not output_stream.IsStreaming():
continue
except Exception as exception:
excpt_msg['Status'] = True
excpt_msg['Message'] = str(exception)
The output video frames look like this -
However, when I do not use CuPy’s user defined kernel and just copy the data to a CuPy ndarray before rendering, I don’t see any color format issues. The code is -
import cupy as cp
from exception_message import ExceptionMessage
import numpy
from jetson_utils import videoSource, videoOutput, cudaImage
from extract_system_calibration import EXTRACT_SYSTEM_CALIBRATION
import vpi
kernel_mask_image_renderer = cp.RawKernel(r'''
extern "C" __global__
void elementwise_matrix_multiplication_mask_image(unsigned char *maskImg, unsigned char *fullImg, unsigned char *mask, int width, int height, int dim)
{
const int row = blockIdx.y*blockDim.y + threadIdx.y;
const int col = blockIdx.x*blockDim.x + threadIdx.x;
const int dep = blockIdx.z*blockDim.z + threadIdx.z;
unsigned char op_val;
if(row<height && col<width && dep<dim)
{
op_val = fullImg[row*width+col+dep*width*height] * mask[row*width+col+dep*width*height];
maskImg[row*width+col+dep*width*height] = op_val;
}
}
''','elementwise_matrix_multiplication_mask_image')
excpt_msg = ExceptionMessage(Status=False, Message=None)
roi_mask = cupy.asarray(__ROI_mask_front__).astype(numpy.uint8)
final_image = cupy.asarray(numpy.zeros((720,1280,3), dtype=numpy.uint8))
param_videoOutput = []
param_videoOutput.append("--width=" + str(1280))
param_videoOutput.append(f"--height=" + str(720))
output_stream = videoOutput('display://0', argv=param_videoOutput)
cap_stream = videoSource('file:///video.mp4')
while True:
try:
frame = cap_stream.Capture(format='rgb8')
with vpi.Backend.CUDA:
distortion_corrected = vpi.asimage(frame )\
.remap(warpmap_distortion_correction, interp=vpi.Interp.LINEAR)\
with distortion_corrected.rlock_cuda() as cudaBuffer:
image_pointer = cudaBuffer.__cuda_array_interface__['data'][0]
mem = cupy.cuda.UnownedMemory(image_pointer, 2764800, owner=None)
memptr = cupy.cuda.MemoryPointer(mem, offset=0)
distortion_corrected_cpArr = cupy.ndarray((720, 1280, 3), dtype=cupy.uint8, memptr=memptr, strides = (3840, 3, 1))
final_image_ptr = distortion_corrected_cpArr.__cuda_array_interface__['data'][0]
masked_cudaImg = cudaImage(ptr = final_image_ptr, width=1280, height=720, format='rgb8')
output_stream.Render(masked_cudaImg)
if not cap_stream.IsStreaming() or not output_stream.IsStreaming():
continue
except Exception as exception:
excpt_msg['Status'] = True
excpt_msg['Message'] = str(exception)
I tested the kernel separately (using the ‘output - (a*b) = 0’ approach), and it works as expected. Kindly advice.
Thanks