Please provide complete information as applicable to your setup.
• Hardware Platform: Jetson & dGPU
• DeepStream Version: 6.4
• JetPack Version: 6.2
• NVIDIA GPU Driver Version: 561.17
**• Issue: How do I get VPI to work with NvBufSurface on both Jetson and dGPU?**
**• How to reproduce the issue? We have based a DeepStream GStreamer plugin on gst-dsexample (/opt/nvidia/deepstream/deepstream-6.4/sources/gst-plugins/gst-dsexample), and instead of using OpenCV we would like to use VPI so the image stays in GPU/iGPU memory for our operations.**
**• Requirement details: Replace OpenCV with VPI so that image operations run on data in GPU memory instead of bringing it down to the CPU.**
Currently we use the following method to convert an NvBufSurface into a cv::Mat, which brings the image down from GPU memory to the CPU:
```cpp
static GstFlowReturn get_converted_mat(GstPip *pip, NvBufSurface *input_buf, gint idx,
                                       NvOSD_RectParams *crop_rect_params, gdouble &ratio,
                                       gint input_width, gint input_height) {
  NvBufSurfTransform_Error err;
  NvBufSurfTransformConfigParams transform_config_params;
  NvBufSurfTransformParams transform_params;
  NvBufSurfTransformRect src_rect;
  NvBufSurfTransformRect dst_rect;
  NvBufSurface ip_surf;
  cv::Mat in_mat;
  ip_surf = *input_buf;
  ip_surf.numFilled = ip_surf.batchSize = 1;
  ip_surf.surfaceList = &(input_buf->surfaceList[idx]);

  gint src_left = GST_ROUND_UP_2((unsigned int)crop_rect_params->left);
  gint src_top = GST_ROUND_UP_2((unsigned int)crop_rect_params->top);
  gint src_width = GST_ROUND_DOWN_2((unsigned int)crop_rect_params->width);
  gint src_height = GST_ROUND_DOWN_2((unsigned int)crop_rect_params->height);

  /* Calculate the extra padding for the object (25% of width and height) */
  gint padding_width = src_width * 0.25;
  gint padding_height = src_height * 0.25;

  /* Adjust left, top, width, and height while ensuring they remain within the
   * frame bounds */
  src_left = std::max(0, src_left - padding_width);
  src_top = std::max(0, src_top - padding_height);
  src_width = std::min(input_width - src_left, src_width + 2 * padding_width);
  src_height = std::min(input_height - src_top, src_height + 2 * padding_height);

  /* Maintain aspect ratio */
  double hdest = pip->processing_width * src_height / (double)src_width;
  double wdest = pip->processing_height * src_width / (double)src_height;
  guint dest_width, dest_height;
  if (hdest <= pip->processing_height) {
    dest_width = pip->processing_width;
    dest_height = hdest;
  } else {
    dest_width = wdest;
    dest_height = pip->processing_height;
  }

  /* Configure transform session parameters for the transformation */
  transform_config_params.compute_mode = NvBufSurfTransformCompute_Default;
  transform_config_params.gpu_id = pip->gpu_id;
  transform_config_params.cuda_stream = pip->cuda_stream;

  /* Set the transform session parameters for the conversions executed in this
   * thread. */
  err = NvBufSurfTransformSetSessionParams(&transform_config_params);
  if (err != NvBufSurfTransformError_Success) {
    NVGSTDS_ERR_MSG_V("NvBufSurfTransformSetSessionParams failed with error %d", err);
    goto error;
  }

  /* Calculate scaling ratio while maintaining aspect ratio */
  ratio = MIN(1.0 * dest_width / src_width, 1.0 * dest_height / src_height);
  if ((crop_rect_params->width == 0) || (crop_rect_params->height == 0)) {
    NVGSTDS_ERR_MSG_V("Crop rect params dimensions are zero");
    goto error;
  }

#ifdef __aarch64__
  if (ratio <= 1.0 / 16 || ratio >= 16.0) {
    /* Currently cannot scale by ratio > 16 or < 1/16 for Jetson */
    NVGSTDS_ERR_MSG_V("Cannot scale by ratio > 16 or < 1/16 for Jetson.");
    goto error;
  }
#endif

  /* Set the transform ROIs for source and destination */
  src_rect = {(guint)src_top, (guint)src_left, (guint)src_width, (guint)src_height};
  dst_rect = {0, 0, (guint)dest_width, (guint)dest_height};

  /* Set the transform parameters */
  transform_params.src_rect = &src_rect;
  transform_params.dst_rect = &dst_rect;
  transform_params.transform_flag =
      NVBUFSURF_TRANSFORM_FILTER | NVBUFSURF_TRANSFORM_CROP_SRC | NVBUFSURF_TRANSFORM_CROP_DST;
  transform_params.transform_filter = NvBufSurfTransformInter_Default;

  /* Memset the memory */
  NvBufSurfaceMemSet(pip->inter_buf, 0, 0, 0);

  /* Transformation: scaling + format conversion, if any. */
  err = NvBufSurfTransform(&ip_surf, pip->inter_buf, &transform_params);
  if (err != NvBufSurfTransformError_Success) {
    NVGSTDS_ERR_MSG_V("NvBufSurfTransform failed with error %d while converting buffer", err);
    goto error;
  }

  /* Map the buffer so that it can be accessed by the CPU */
  if (NvBufSurfaceMap(pip->inter_buf, 0, 0, NVBUF_MAP_READ) != 0) {
    goto error;
  }
  if (pip->inter_buf->memType == NVBUF_MEM_SURFACE_ARRAY) {
    /* Cache the mapped data for CPU access */
    NvBufSurfaceSyncForCpu(pip->inter_buf, 0, 0);
  }

  /* Use OpenCV to remove padding and convert RGBA to BGR. Can be skipped if
   * the algorithm can handle padded RGBA data. */
  in_mat = cv::Mat(pip->processing_height, pip->processing_width, CV_8UC4,
                   pip->inter_buf->surfaceList[0].mappedAddr.addr[0],
                   pip->inter_buf->surfaceList[0].pitch);
#if (CV_MAJOR_VERSION >= 4)
  cv::cvtColor(in_mat, *pip->cvmat, cv::COLOR_RGBA2BGR);
#else
  cv::cvtColor(in_mat, *pip->cvmat, CV_RGBA2BGR);
#endif

  if (NvBufSurfaceUnMap(pip->inter_buf, 0, 0)) {
    goto error;
  }

  if (pip->is_integrated) {
#ifdef __aarch64__
    /* To use the converted buffer in CUDA, create an EGLImage and then use
     * CUDA-EGL interop APIs */
    if (USE_EGLIMAGE) {
      if (NvBufSurfaceMapEglImage(pip->inter_buf, 0) != 0) {
        goto error;
      }
      /* pip->inter_buf->surfaceList[0].mappedAddr.eglImage
       * Use interop APIs cuGraphicsEGLRegisterImage and
       * cuGraphicsResourceGetMappedEglFrame to access the buffer in CUDA
       */
      /* Destroy the EGLImage */
      NvBufSurfaceUnMapEglImage(pip->inter_buf, 0);
    }
#endif
  }

  /* We will first convert only the Region of Interest (the entire frame or
   * the object bounding box) to RGB and then scale the converted RGB frame to
   * processing resolution. */
  return GST_FLOW_OK;

error:
  return GST_FLOW_ERROR;
}
```
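For completeness, the CUDA-EGL interop step that the comment block above refers to would look roughly like this on Jetson. This is a minimal untested sketch: `cuGraphicsEGLRegisterImage`, `cuGraphicsResourceGetMappedEglFrame`, and `cuGraphicsUnregisterResource` are the actual CUDA driver API calls named in the dsexample comment, but the helper name and error handling below are our own placeholder:

```cpp
#include <cuda.h>
#include <cudaEGL.h>

/* Hypothetical helper: take the EGLImage produced by NvBufSurfaceMapEglImage()
 * and expose its plane as a CUDA device pointer via CUDA-EGL interop. */
static bool access_egl_image_in_cuda(EGLImageKHR egl_image) {
  CUgraphicsResource resource = NULL;
  CUeglFrame egl_frame;

  /* Register the EGLImage with CUDA */
  if (cuGraphicsEGLRegisterImage(&resource, egl_image,
                                 CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE) != CUDA_SUCCESS) {
    return false;
  }

  /* Retrieve the frame; for a pitch-linear RGBA surface,
   * egl_frame.frame.pPitch[0] is the device pointer to the first plane. */
  if (cuGraphicsResourceGetMappedEglFrame(&egl_frame, resource, 0, 0) != CUDA_SUCCESS) {
    cuGraphicsUnregisterResource(resource);
    return false;
  }

  /* ... launch CUDA kernels on egl_frame.frame.pPitch[0] here ... */

  cuGraphicsUnregisterResource(resource);
  return true;
}
```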
This is called in our gst_pip_transform_ip method:
```cpp
static GstFlowReturn gst_pip_transform_ip(GstBaseTransform *btrans, GstBuffer *inbuf) {
  GstPip *pip = GST_PIP(btrans);
  NvDsBatchMeta *batch_meta = NULL;
  NvDsObjectMeta *obj_meta = NULL;
  batch_meta = gst_buffer_get_nvds_batch_meta(inbuf);
  if (batch_meta == NULL) {
    NVGSTDS_ERR_MSG_V("NvDsBatchMeta not found for input buffer");
    return GST_FLOW_ERROR;
  }

  GstMapInfo in_map_info;
  GstFlowReturn flow_ret = GST_FLOW_ERROR;
  gdouble scale_ratio = 1.0;
  PipOutput *output;
  NvBufSurface *surface = NULL;
  NvDsFrameMeta *frame_meta = NULL;
  NvDsMetaList *l_frame = NULL;
  guint i = 0;
  cv::Mat in_mat;
  NvDsMetaList *l_obj = NULL;

  CHECK_CUDA_STATUS(cudaSetDevice(pip->gpu_id), "Unable to set cuda device");

  memset(&in_map_info, 0, sizeof(in_map_info));
  if (!gst_buffer_map(inbuf, &in_map_info, GST_MAP_READ)) {
    NVGSTDS_ERR_MSG_V("Failed to map gst buffer.");
    goto error;
  }

  nvds_set_input_system_timestamp(inbuf, GST_ELEMENT_NAME(pip));
  surface = (NvBufSurface *)in_map_info.data;
  if (CHECK_NVDS_MEMORY_AND_GPUID(pip, surface))
    goto error;

  if (!pip->is_integrated) {
    if (!(surface->memType == NVBUF_MEM_CUDA_UNIFIED ||
          surface->memType == NVBUF_MEM_CUDA_PINNED)) {
      GST_ELEMENT_ERROR(pip, STREAM, FAILED,
                        ("%s:need NVBUF_MEM_CUDA_UNIFIED or NVBUF_MEM_CUDA_PINNED memory "
                         "for opencv digital enhancement", __func__), (NULL));
      return GST_FLOW_ERROR;
    }
  }

  for (l_frame = batch_meta->frame_meta_list; l_frame != NULL; l_frame = l_frame->next) {
    frame_meta = (NvDsFrameMeta *)(l_frame->data);
    if (surface->surfaceList[frame_meta->batch_id].mappedAddr.addr[0] == NULL) {
      if (NvBufSurfaceMap(surface, frame_meta->batch_id, 0, NVBUF_MAP_READ_WRITE) != 0) {
        NVGSTDS_ERR_MSG_V("Buffer map to be accessed by CPU failed");
        return GST_FLOW_ERROR;
      }
    }
    /* Cache the mapped data for CPU access */
    if (surface->memType == NVBUF_MEM_SURFACE_ARRAY) {
      NvBufSurfaceSyncForCpu(surface, frame_meta->batch_id, 0);
    }

    in_mat = cv::Mat(surface->surfaceList[frame_meta->batch_id].planeParams.height[0],
                     surface->surfaceList[frame_meta->batch_id].planeParams.width[0], CV_8UC4,
                     surface->surfaceList[frame_meta->batch_id].mappedAddr.addr[0],
                     surface->surfaceList[frame_meta->batch_id].planeParams.pitch[0]);

    obj_meta = find_central_object_with_highest_confidence(pip, batch_meta);
    if (obj_meta != NULL) {
      /* Crop and scale the object */
      if (get_converted_mat(pip, surface, frame_meta->batch_id, &obj_meta->rect_params,
                            scale_ratio, pip->video_info.width,
                            pip->video_info.height) != GST_FLOW_OK) {
        /* Error in conversion, skip processing on this object. */
        continue;
      }
      if (digitally_enhance_object(pip, surface, frame_meta, in_mat) != GST_FLOW_OK) {
        NVGSTDS_ERR_MSG_V("Unable to digitally enhance object");
        return GST_FLOW_ERROR;
      }
    } else if (!pip->cached_frames[frame_meta->source_id].empty()) {
      auto now = std::chrono::steady_clock::now();
      auto last_update = pip->cached_timestamps[frame_meta->source_id];
      auto duration =
          std::chrono::duration_cast<std::chrono::milliseconds>(now - last_update).count();
      if (duration < pip->cached_duration) {
        int pipWidth, pipHeight, pipPosX, pipPosY;
        if (calculate_object_region(surface, frame_meta, pipWidth, pipHeight, pipPosX,
                                    pipPosY) != GST_FLOW_OK) {
          NVGSTDS_ERR_MSG_V("Unable to display cached object");
          return GST_FLOW_ERROR;
        }
        cv::Rect pipRect(pipPosX, pipPosY, pipWidth, pipHeight);
        pip->cached_frames[frame_meta->source_id].copyTo(in_mat(pipRect));
      }
    }

    /* Sync the mapped data back for device access */
    if (surface->memType == NVBUF_MEM_SURFACE_ARRAY) {
      NvBufSurfaceSyncForDevice(surface, frame_meta->batch_id, 0);
    }
  } // For loop end

  flow_ret = GST_FLOW_OK;

error:
  nvds_set_output_system_timestamp(inbuf, GST_ELEMENT_NAME(pip));
  gst_buffer_unmap(inbuf, &in_map_info);
  return flow_ret;
}
```
VPI seems to be straightforward on Jetson; it looks like you can wrap the NvBufSurface directly like so:
```cpp
NvBufSurfaceMapParams buffer_params;
NvBufSurfaceGetMapParams(surface, 0, &buffer_params);

VPIImageData img_data;
memset(&img_data, 0, sizeof(img_data));
img_data.bufferType = VPI_IMAGE_BUFFER_NVBUFFER;
img_data.buffer.fd = buffer_params.fd;

VPIImage img_vpi = nullptr;
VPIStatus err_vpi = vpiImageCreateWrapper(&img_data, nullptr, VPI_BACKEND_CUDA, &img_vpi);
```
As stated here, it looks like this is only valid on Jetson, not on dGPU.
How can this be done on the dGPU?
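In case it helps frame an answer: our current untested guess for dGPU is to wrap the surface's CUDA device memory as a pitch-linear VPI image instead of an NvBuffer fd, roughly as below. The field names follow our reading of the VPIImageData pitch-linear layout, and the snippet assumes a single-plane RGBA surface at batch index 0. Is this the right approach, or is there a recommended way?

```cpp
/* Untested sketch for dGPU: wrap an RGBA NvBufSurface (allocated as CUDA
 * device/unified memory) as a CUDA pitch-linear VPI image. Assumes batch
 * index 0 and a single interleaved RGBA plane. */
NvBufSurfaceParams *p = &surface->surfaceList[0];

VPIImageData img_data;
memset(&img_data, 0, sizeof(img_data));
img_data.bufferType = VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR;
img_data.buffer.pitch.format = VPI_IMAGE_FORMAT_RGBA8;
img_data.buffer.pitch.numPlanes = 1;
img_data.buffer.pitch.planes[0].pixelType = VPI_PIXEL_TYPE_4U8;
img_data.buffer.pitch.planes[0].width = (int32_t)p->width;
img_data.buffer.pitch.planes[0].height = (int32_t)p->height;
img_data.buffer.pitch.planes[0].pitchBytes = (int32_t)p->pitch;
img_data.buffer.pitch.planes[0].data = p->dataPtr;

VPIImage img_vpi = nullptr;
VPIStatus err_vpi = vpiImageCreateWrapper(&img_data, nullptr, VPI_BACKEND_CUDA, &img_vpi);
```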