Optimal way to copy data between a VPI pipeline and CUDA Kernel in PyCUDA

Hi,

I am trying to use a VPI image in a CUDA kernel. Currently, I have to convert the image to a numpy.float32 array first. Is there a way to achieve zero memory copies in this pipeline, without copying the data to the host and converting it to NumPy format?

Here is the code -

kernel_mask_image_renderer = SourceModule("""
__global__ void elementwise_matrix_multiplication_mask_image(float *maskImg, float *fullImg, float *mask, int width, int height, int dim)
{
    // One thread per output element: maskImg = fullImg * mask.
    const int row = blockIdx.y*blockDim.y + threadIdx.y;
    const int col = blockIdx.x*blockDim.x + threadIdx.x;
    const int dep = blockIdx.z*blockDim.z + threadIdx.z;

    float op_val;
    if(row<height && col<width && dep<dim)
    {
        op_val = fullImg[row*width+col+dep*width*height] * mask[row*width+col+dep*width*height];
        maskImg[row*width+col+dep*width*height] = op_val;
    }

}
""")


cap_front = cv2.VideoCapture("filesrc location=schoolbus_video/front_video.mp4 ! qtdemux ! queue ! h264parse ! nvv4l2decoder ! nvvidconv ! video/x-raw,format=BGRx ! queue ! videoconvert ! queue ! video/x-raw, format=BGR ! appsink  ", cv2.CAP_GSTREAMER)
kernel_object_mask_image = kernel_mask_image_renderer.get_function('elementwise_matrix_multiplication_mask_image')

while True:
    ret_front, frame_front = cap_front.read()

    with vpi.Backend.CUDA:
        distortion_corrected_front = vpi.asimage(frame_front)\
                                                    .convert(vpi.Format.NV12_ER)\
                                                    .remap(warpmap_distortion_correction, interp=vpi.Interp.LINEAR)\
                                                    .convert(vpi.Format.RGB8)
        
        roi_image_front_input = distortion_corrected_front.cpu().astype(numpy.float32)

        kernel_object_mask_image(drv.Out(roi_image_front), drv.In(roi_image_front_input), drv.In(ROI_mask_front),
                                         randArrWidth, randArrHeight, randArrDim,
                                         block=(20,17,3), grid=(65,43))

I wanted to know if there is a way to avoid converting distortion_corrected_front into a numpy array before using it in a CUDA function.

Thanks

Hi,

VPI can convert the image from RGB8 to float32 directly.
Please check the document below:

https://docs.nvidia.com/vpi/algo_imageconv.html

Thanks.
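
A minimal sketch of such a direct conversion, reusing the pipeline shown earlier in this thread (the F32 format and CUDA backend are taken from the code later in the thread; whether your VPI version supports this exact conversion path should be checked against the linked document):

with vpi.Backend.CUDA:
    # Remap in NV12, then convert straight to 32-bit float on the GPU,
    # instead of converting to RGB8 and calling numpy.astype on the host.
    distortion_corrected_front = vpi.asimage(frame_front)\
                                    .convert(vpi.Format.NV12_ER)\
                                    .remap(warpmap_distortion_correction, interp=vpi.Interp.LINEAR)\
                                    .convert(vpi.Format.F32)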

Thanks for the reply @AastaLLL
I tried using the following code instead -

from jetson_utils import videoSource, videoOutput
import pycuda.driver as drv
import pycuda.autoprimaryctx
from pycuda.compiler import SourceModule
from extract_system_calibration import EXTRACT_SYSTEM_CALIBRATION
from exception_message import ExceptionMessage
import numpy
import cv2
import vpi


kernel_mask_image_renderer = SourceModule("""
__global__ void elementwise_matrix_multiplication_mask_image(float *maskImg, float *fullImg, float *mask, int width, int height, int dim)
{
    const int row = blockIdx.y*blockDim.y + threadIdx.y;
    const int col = blockIdx.x*blockDim.x + threadIdx.x;
    const int dep = blockIdx.z*blockDim.z + threadIdx.z;

    float op_val;
    if(row<height && col<width && dep<dim)
    {
        op_val = fullImg[row*width+col+dep*width*height] * mask[row*width+col+dep*width*height];
        maskImg[row*width+col+dep*width*height] = op_val;
    }

}
""")

def initialize_kernel_parameters():

    global kernel_object_mask_image, randArrWidth, randArrHeight, randArrDim
    try:
        excpt_message = ExceptionMessage(Status=False, Message=None)
        dummy_image = numpy.random.randint(low = 1, high = 255, size = (720,1280,3)).astype(numpy.float32)
        randArrWidth = numpy.int32(dummy_image.shape[0])
        randArrHeight = numpy.int32(dummy_image.shape[1])
        randArrDim = numpy.int32(dummy_image.shape[2])
        kernel_object_mask_image = kernel_mask_image_renderer.get_function('elementwise_matrix_multiplication_mask_image')
    except Exception as exception:
        excpt_message['Status'] = True
        excpt_message['Message'] = exception
    finally:
        return excpt_message

if __name__ == '__main__':
    try:
        final_image = numpy.zeros([720,1280,3], dtype = numpy.float32)
        excpt_init_kernel_params = initialize_kernel_parameters()
        if excpt_init_kernel_params['Status'] is False:
            param_videoOutput = []
            param_videoOutput.append("--width=" + str(1280))
            param_videoOutput.append(f"--height=" + str(720))

            input = videoSource('file://front_video.mp4')

            output = videoOutput('display://0', argv=param_videoOutput)

            while True:
                image = input.Capture(format='rgb8')
                if image is None:
                    continue
                
                with vpi.Backend.CUDA:
                    distortion_corrected_front = vpi.asimage(image)\
                                                    .convert(vpi.Format.NV12_ER)\
                                                    .remap(warpmap_distortion_correction, interp=vpi.Interp.LINEAR)\
                                                    .convert(vpi.Format.F32)       
                # print(ROI_mask_front.shape)
                # print(final_image.shape)
                kernel_object_mask_image(drv.Out(final_image), drv.In(distortion_corrected_front), drv.In(ROI_mask_front), randArrWidth, randArrHeight, randArrDim,\
                                        block=(17,20,3), grid=(43,65))

                output.Render(image)

                if not input.IsStreaming() or not output.IsStreaming():
                    break
        else:
            raise Exception(excpt_init_kernel_params['Message'])
    except Exception as exception:
        print(str(exception))

However, I get the error -

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/pycuda/driver.py", line 131, in get_device_alloc
    self.dev_alloc = mem_alloc_like(self.array)
  File "/usr/local/lib/python3.8/dist-packages/pycuda/driver.py", line 802, in mem_alloc_like
    return mem_alloc(ary.nbytes)
AttributeError: 'vpi.Image' object has no attribute 'nbytes'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "test_script2.py", line 185, in <module>
    kernel_object_mask_image(drv.Out(final_image), drv.In(distortion_corrected_front), drv.In(ROI_mask_front), randArrWidth, randArrHeight, randArrDim,\
  File "/usr/local/lib/python3.8/dist-packages/pycuda/driver.py", line 482, in function_call
    handlers, arg_buf = _build_arg_buf(args)
  File "/usr/local/lib/python3.8/dist-packages/pycuda/driver.py", line 204, in _build_arg_buf
    arg_data.append(int(arg.get_device_alloc()))
  File "/usr/local/lib/python3.8/dist-packages/pycuda/driver.py", line 133, in get_device_alloc
    raise TypeError(
TypeError: could not determine array length of '<class 'vpi.Image'>': unsupported array type or not an array

Note that ROI_mask_front is a NumPy float32 array with dimensions (720, 1280, 3), the same as the input video resolution.

Do I need to perform any other conversion on distortion_corrected_front apart from converting it into float32?

Also copying @dusty_nv, as I am using the jetson_utils project for video streaming.

Thanks
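
For context on the error: PyCUDA's drv.In / drv.Out argument handlers wrap host NumPy arrays and perform explicit host/device copies. A simplified sketch of what drv.In(x) effectively does, based on the traceback above (x stands in for the wrapped object):

x = distortion_corrected_front        # a vpi.Image handle, not a host array
dev = drv.mem_alloc(x.nbytes)         # fails here: vpi.Image has no .nbytes
drv.memcpy_htod(dev, x)               # and this would be a host-to-device copy anyway

So a vpi.Image cannot be passed through drv.In; the handler expects something that behaves like a NumPy array, and even when it succeeds it is not zero-copy.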

@AastaLLL I have also tried using a CudaBuffer, as mentioned in the documentation (vpi.CudaBuffer — VPI Python API Reference 2.3).

The script is identical to the one above; the only change is that the kernel launch now reads the image through rlock_cuda():

with distortion_corrected_front.rlock_cuda() as cuda_buffer:
    kernel_object_mask_image(drv.Out(final_image), drv.In(cuda_buffer), drv.In(ROI_mask_front),
                             randArrWidth, randArrHeight, randArrDim,
                             block=(17,20,3), grid=(43,65))

The dtype of the CUDA buffer is float32.

However, I get the following errors -

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/pycuda/driver.py", line 131, in get_device_alloc
    self.dev_alloc = mem_alloc_like(self.array)
  File "/usr/local/lib/python3.8/dist-packages/pycuda/driver.py", line 802, in mem_alloc_like
    return mem_alloc(ary.nbytes)
AttributeError: 'vpi.CudaBuffer' object has no attribute 'nbytes'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "test_script2.py", line 191, in <module>
    kernel_object_mask_image(drv.Out(final_image), drv.In(cuda_buffer), drv.In(ROI_mask_front), randArrWidth, randArrHeight, randArrDim,\
  File "/usr/local/lib/python3.8/dist-packages/pycuda/driver.py", line 482, in function_call
    handlers, arg_buf = _build_arg_buf(args)
  File "/usr/local/lib/python3.8/dist-packages/pycuda/driver.py", line 204, in _build_arg_buf
    arg_data.append(int(arg.get_device_alloc()))
  File "/usr/local/lib/python3.8/dist-packages/pycuda/driver.py", line 133, in get_device_alloc
    raise TypeError(
TypeError: could not determine array length of '<class 'vpi.CudaBuffer'>': unsupported array type or not an array

How can I use the CUDA buffer in a PyCUDA context? Does PyCUDA's SourceModule only accept NumPy and PyTorch data types, since nbytes is an attribute commonly used in NumPy and PyTorch?

Thanks
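
One possible zero-copy route, sketched under these assumptions: vpi.CudaBuffer exposes __cuda_array_interface__ (an assumption; check the vpi.CudaBuffer page linked above), the locked buffer is densely packed (check the interface's 'strides' entry for pitch padding), and the mask and output are kept resident on the device so that drv.In / drv.Out are not needed for them either:

import numpy as np
import pycuda.driver as drv

# Allocate device buffers once, outside the capture loop.
final_image_gpu = drv.mem_alloc(final_image.nbytes)
mask_gpu = drv.mem_alloc(ROI_mask_front.nbytes)
drv.memcpy_htod(mask_gpu, ROI_mask_front)          # upload the mask a single time

with distortion_corrected_front.rlock_cuda() as cuda_buffer:
    # Raw device pointer from the CUDA array interface: no host round trip.
    cai = cuda_buffer.__cuda_array_interface__
    vpi_ptr = np.uintp(cai['data'][0])
    kernel_object_mask_image(final_image_gpu, vpi_ptr, mask_gpu,
                             randArrWidth, randArrHeight, randArrDim,
                             block=(17, 20, 3), grid=(43, 65))

# Download the result only if the host actually needs it (e.g. for display).
drv.memcpy_dtoh(final_image, final_image_gpu)

PyCUDA accepts DeviceAllocation objects and pointer-sized integers (numpy.uintp) directly as kernel arguments, so no implicit allocation or copy happens for the image itself.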

Hi,

Have you tried to convert the output to a NumPy array and pass it to the display?
Thanks.

Hi @AastaLLL ,

I am trying to achieve zero-copy memory. I believe that if I convert to a NumPy array, the output will only be accessible in CPU memory, and I would need to copy it again to mapped memory for it to be accessible to the GPU. Is there a way I can use __cuda_array_interface__ or any other technique to access the output from VPI directly in PyCUDA, without any memory copies?

Thanks
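
If an array-like handle is more convenient than a raw pointer, the same device memory can also be wrapped in a pycuda.gpuarray.GPUArray without copying. This is a sketch under the same assumptions as above (shape, dtype, and strides are read from the buffer's __cuda_array_interface__ dictionary):

import numpy as np
import pycuda.gpuarray as gpuarray

with distortion_corrected_front.rlock_cuda() as cuda_buffer:
    cai = cuda_buffer.__cuda_array_interface__
    # A GPUArray view over VPI's device memory: no allocation, no copy.
    vpi_view = gpuarray.GPUArray(cai['shape'], np.dtype(cai['typestr']),
                                 gpudata=cai['data'][0],
                                 strides=cai.get('strides'))
    # vpi_view can be passed to PyCUDA kernels like any other GPUArray;
    # vpi_view.get() downloads to a NumPy array only when the host needs it.

Note that this only changes how the VPI image is presented to PyCUDA; the kernel output still has to be downloaded (or rendered from device memory) at some point if it is shown on screen.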

Hi,

Is C++ an option for you?
We have some examples of zero-copy in C++.
However, this is much more limited in Python with PyCUDA.

Thanks.
