Hi,
I am trying to use a VPI image in a CUDA kernel. Currently, I have to convert the image to a numpy.float32 array first. Is there a way to make this pipeline zero-copy, without having to copy the data to the host and convert it to a NumPy array?
Here is the code -
kernel_mask_image_renderer = SourceModule("""
// Element-wise product of two float volumes.
//
// For every element of a width x height x dim volume this stores
// maskImg[i] = fullImg[i] * mask[i]. All three buffers are addressed with
// the same offset: row*width + col + dep*width*height.
//
// Launch layout: the x dimension covers columns, y covers rows and z covers
// the depth index. Threads that land outside the volume do nothing, so the
// launch may overshoot the volume in any dimension.
__global__ void elementwise_matrix_multiplication_mask_image(float *maskImg, float *fullImg, float *mask, int width, int height, int dim)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int dep = blockIdx.z * blockDim.z + threadIdx.z;

    // Threads outside the volume have no element to process.
    if (row >= height || col >= width || dep >= dim)
    {
        return;
    }

    // Single offset shared by the two inputs and the output.
    const int offset = row * width + col + dep * width * height;
    maskImg[offset] = fullImg[offset] * mask[offset];
}
""")
# Open the front-camera video through a GStreamer pipeline that ends in an
# appsink delivering BGR frames to OpenCV.
cap_front = cv2.VideoCapture("filesrc location=schoolbus_video/front_video.mp4 ! qtdemux ! queue ! h264parse ! nvv4l2decoder ! nvvidconv ! video/x-raw,format=BGRx ! queue ! videoconvert ! queue ! video/x-raw, format=BGR ! appsink ", cv2.CAP_GSTREAMER)
# Handle to the compiled kernel, looked up by name in the SourceModule.
kernel_object_mask_image = kernel_mask_image_renderer.get_function('elementwise_matrix_multiplication_mask_image')
try:
    while True:
        ret_front, frame_front = cap_front.read()
        # read() reports failure when no frame could be grabbed (for example
        # at the end of the file); stop instead of handing an invalid frame
        # to VPI.
        if not ret_front:
            break
        # Lens-distortion correction on the VPI CUDA backend:
        # BGR frame -> NV12_ER -> remap with the precomputed warp map -> RGB8.
        with vpi.Backend.CUDA:
            distortion_corrected_front = (
                vpi.asimage(frame_front)
                .convert(vpi.Format.NV12_ER)
                .remap(warpmap_distortion_correction, interp=vpi.Interp.LINEAR)
                .convert(vpi.Format.RGB8)
            )
        # Host round trip: the kernel takes float* buffers, so the VPI result is
        # pulled to the CPU and converted to float32 before being passed back in.
        roi_image_front_input = distortion_corrected_front.cpu().astype(numpy.float32)
        # drv.In / drv.Out wrap host arrays as kernel input / output arguments.
        # NOTE(review): block and grid sizes are hard-coded; presumably they are
        # chosen to cover randArrWidth x randArrHeight x randArrDim - confirm.
        kernel_object_mask_image(drv.Out(roi_image_front), drv.In(roi_image_front_input), drv.In(ROI_mask_front),
                                 randArrWidth, randArrHeight, randArrDim,
                                 block=(20,17,3), grid=(65,43))
finally:
    # Free the capture (and its GStreamer pipeline) on normal end of stream
    # and on any error raised inside the loop.
    cap_front.release()
I would like to know whether there is a way to avoid converting distortion_corrected_front
into a NumPy array before using it in a CUDA kernel.
Thanks