hello,
i am using nvidia jetson orin nx for robotics applications.
my JetPack version is 5.1.2 with full packages like CUDA-11.4 and DeepStream-6.3
also TensforRT 8.5.2.2 with Cudnn 8.6
i am trying to create custom gstreamer pipeline to implement zero-copy techniques due to reduction of latency between gpu-cpu transfer and found DeepStream could ease this way with NvBufSurface header.
in my case i have camera and run below pipeline to apply my custom gstreamer plugin on frames. i used dsexample code and wanna apply my custom cuda kernel on frames.
my gstreamer pipeline : gst-launch-1.0 nvv4l2camerasrc ! ‘video/x-raw(memory:NVMM), format=UYVY, width=1920, height=1080’ ! nvvideoconvert copy-hw=2 nvbuf-memory-type=4 ! ‘video/x-raw(memory:NVMM), format=RGBA’ ! queue ! mux.sink_0 nvstreammux name=mux batch-size=1 width=1920 height=1080 live-source=1 nvbuf-memory-type=4 ! dsexample ! nvvideoconvert copy-hw=2 nvbuf-memory-type=4 ! xvimagesink sync=false
but in dsexample plugin i dont understand how can i access to camera frames to push them into cuda kernel, my simple cuda kernel that i tested with cudaMallocPitch() and worked wrote like this:
#include <cuda_runtime.h>
#include <stdint.h>
#include <iostream>
extern "C" void launch_bgr2gray_rgba(
uint8_t* rgba_ptr, int width, int height, int pitch, cudaStream_t stream);
__global__ void rgba_to_gray_rgba_kernel(uint8_t* rgba, int width, int height, int pitch) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= width || y >= height) return;
uint8_t* pixel = rgba + y * pitch + x * 4;
uint8_t r = pixel[0];
uint8_t g = pixel[1];
uint8_t b = pixel[2];
uint8_t gray = static_cast<uint8_t>(0.299f * r + 0.587f * g + 0.114f * b);
pixel[0] = gray;
pixel[1] = gray;
pixel[2] = gray;
// alpha stays the same
}
extern "C" void launch_bgr2gray_rgba(
uint8_t* rgba_ptr, int width, int height, int pitch, cudaStream_t stream) {
dim3 block(16, 16);
dim3 grid((width + block.x - 1) / block.x,
(height + block.y - 1) / block.y);
rgba_to_gray_rgba_kernel<<<grid, block, 0, stream>>>(rgba_ptr, width, height, pitch);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDA kernel launch failed: %s\n", cudaGetErrorString(err));
}
}
using cudaMallocPitch() have time cast and i wanna run this cuda kernel with zero-copy teqnique. now i need help to how can i access rgba_ptr in dsexample code for run custom cuda kernel.
static GstFlowReturn
gst_dsexample_transform_ip (GstBaseTransform * btrans, GstBuffer * inbuf)
{
GstDsExample *dsexample = GST_DSEXAMPLE (btrans);
GstMapInfo in_map_info;
GstFlowReturn flow_ret = GST_FLOW_ERROR;
gdouble scale_ratio = 1.0;
DsExampleOutput *output;
NvBufSurface *surface = NULL;
NvDsBatchMeta *batch_meta = NULL;
NvDsFrameMeta *frame_meta = NULL;
NvDsMetaList * l_frame = NULL;
guint i = 0;
dsexample->frame_num++;
CHECK_CUDA_STATUS (cudaSetDevice (dsexample->gpu_id),
"Unable to set cuda device");
memset (&in_map_info, 0, sizeof (in_map_info));
if (!gst_buffer_map (inbuf, &in_map_info, GST_MAP_READ)) {
g_print ("Error: Failed to map gst buffer\n");
goto error;
}
nvds_set_input_system_timestamp (inbuf, GST_ELEMENT_NAME (dsexample));
surface = (NvBufSurface *) in_map_info.data;
GST_DEBUG_OBJECT (dsexample,
"Processing Frame %" G_GUINT64_FORMAT " Surface %p\n",
dsexample->frame_num, surface);
if (CHECK_NVDS_MEMORY_AND_GPUID (dsexample, surface))
goto error;
batch_meta = gst_buffer_get_nvds_batch_meta (inbuf);
if (batch_meta == nullptr) {
GST_ELEMENT_ERROR (dsexample, STREAM, FAILED,
("NvDsBatchMeta not found for input buffer."), (NULL));
return GST_FLOW_ERROR;
}
GST_DEBUG("Memory type: %d (4=NVBUF_MEM_SURFACE_ARRAY)", surface->memType);
if (surface->memType != NVBUF_MEM_SURFACE_ARRAY) {
GST_WARNING("Unexpected memory type - may need different handling");
}
if (dsexample->process_full_frame) {
for (l_frame = batch_meta->frame_meta_list; l_frame != NULL;
l_frame = l_frame->next)
{
frame_meta = (NvDsFrameMeta *) (l_frame->data);
int batch_id = frame_meta->batch_id;
///////////////////////////// naji ///////////////////////////////
NvBufSurfaceParams *params = &surface->surfaceList[batch_id];
if (NvBufSurfaceMap(surface, batch_id, 0, NVBUF_MAP_READ_WRITE) != 0) {
GST_ERROR("Failed to map surface for batch %d", batch_id);
goto error;
}
NvBufSurfaceSyncForCpu(surface, batch_id, 0);
uint8_t* device_ptr = (uint8_t*)surface->surfaceList[0].dataPtr;
int width = params->width;
int height = params->height;
int pitch = params->pitch;
std::cout<<"width = "<<width<<"height = "<<height<<"pitch = "<<pitch<<std::endl;
launch_bgr2gray_rgba(device_ptr, width, height, pitch, 0); // stream 0 for now
NvBufSurfaceSyncForDevice(surface, batch_id, 0);
NvBufSurfaceUnMap(surface, batch_id, 0);
///////////////////////////// naji ///////////////////////////////
NvOSD_RectParams rect_params;
/* Scale the entire frame to processing resolution */
rect_params.left = 0;
rect_params.top = 0;
rect_params.width = dsexample->video_info.width;
rect_params.height = dsexample->video_info.height;
/* Scale and convert the frame */
if (get_converted_mat (dsexample, surface, i, &rect_params,
scale_ratio, dsexample->video_info.width,
dsexample->video_info.height) != GST_FLOW_OK) {
goto error;
}
/* Process to get the output */
#ifdef WITH_OPENCV
output =
DsExampleProcess (dsexample->dsexamplelib_ctx,
dsexample->cvmat->data);
#else
output =
DsExampleProcess (dsexample->dsexamplelib_ctx,
(unsigned char *)dsexample->inter_buf->surfaceList[0].mappedAddr.addr[0]);
#endif
/* Attach the metadata for the full frame */
attach_metadata_full_frame (dsexample, frame_meta, scale_ratio, output, i);
i++;
free (output);
}
} else {
/* Using object crops as input to the algorithm. The objects are detected by
* the primary detector */
NvDsMetaList * l_obj = NULL;
NvDsObjectMeta *obj_meta = NULL;
if(!dsexample->is_integrated) {
if (dsexample->blur_objects) {
if (!(surface->memType == NVBUF_MEM_CUDA_UNIFIED || surface->memType == NVBUF_MEM_CUDA_PINNED)){
GST_ELEMENT_ERROR (dsexample, STREAM, FAILED,
("%s:need NVBUF_MEM_CUDA_UNIFIED or NVBUF_MEM_CUDA_PINNED memory for opencv blurring",__func__), (NULL));
return GST_FLOW_ERROR;
}
}
}
for (l_frame = batch_meta->frame_meta_list; l_frame != NULL;
l_frame = l_frame->next)
{
frame_meta = (NvDsFrameMeta *) (l_frame->data);
#ifdef WITH_OPENCV
cv::Mat in_mat;
if (dsexample->blur_objects) {
/* Map the buffer so that it can be accessed by CPU */
if (surface->surfaceList[frame_meta->batch_id].mappedAddr.addr[0] == NULL){
if (NvBufSurfaceMap (surface, frame_meta->batch_id, 0, NVBUF_MAP_READ_WRITE) != 0){
GST_ELEMENT_ERROR (dsexample, STREAM, FAILED,
("%s:buffer map to be accessed by CPU failed", __func__), (NULL));
return GST_FLOW_ERROR;
}
}
/* Cache the mapped data for CPU access */
if(dsexample->inter_buf->memType == NVBUF_MEM_SURFACE_ARRAY)
NvBufSurfaceSyncForCpu (surface, frame_meta->batch_id, 0);
in_mat =
cv::Mat (surface->surfaceList[frame_meta->batch_id].planeParams.height[0],
surface->surfaceList[frame_meta->batch_id].planeParams.width[0], CV_8UC4,
surface->surfaceList[frame_meta->batch_id].mappedAddr.addr[0],
surface->surfaceList[frame_meta->batch_id].planeParams.pitch[0]);
}
#endif
for (l_obj = frame_meta->obj_meta_list; l_obj != NULL;
l_obj = l_obj->next)
{
obj_meta = (NvDsObjectMeta *) (l_obj->data);
if (dsexample->blur_objects) {
/* gaussian blur the detected objects using opencv */
#ifdef WITH_OPENCV
if (blur_objects (dsexample, frame_meta->batch_id,
&obj_meta->rect_params, in_mat) != GST_FLOW_OK) {
/* Error in blurring, skip processing on object. */
GST_ELEMENT_ERROR (dsexample, STREAM, FAILED,
("blurring the object failed"), (NULL));
if (NvBufSurfaceUnMap (surface, frame_meta->batch_id, 0)){
GST_ELEMENT_ERROR (dsexample, STREAM, FAILED,
("%s:buffer unmap to be accessed by CPU failed", __func__), (NULL));
}
return GST_FLOW_ERROR;
}
continue;
#else
GST_ELEMENT_ERROR (dsexample, STREAM, FAILED,
("OpenCV has been deprecated, hence object blurring will not work."
"Enable OpenCV compilation in gst-dsexample Makefile by setting 'WITH_OPENCV:=1"), (NULL));
return GST_FLOW_ERROR;
#endif
}
/* Should not process on objects smaller than MIN_INPUT_OBJECT_WIDTH x MIN_INPUT_OBJECT_HEIGHT
* since it will cause hardware scaling issues. */
if (obj_meta->rect_params.width < MIN_INPUT_OBJECT_WIDTH ||
obj_meta->rect_params.height < MIN_INPUT_OBJECT_HEIGHT)
continue;
/* Crop and scale the object */
if (get_converted_mat (dsexample,
surface, frame_meta->batch_id, &obj_meta->rect_params,
scale_ratio, dsexample->video_info.width,
dsexample->video_info.height) != GST_FLOW_OK) {
/* Error in conversion, skip processing on object. */
continue;
}
#ifdef WITH_OPENCV
/* Process the object crop to obtain label */
output = DsExampleProcess (dsexample->dsexamplelib_ctx,
dsexample->cvmat->data);
#else
/* Process the object crop to obtain label */
output = DsExampleProcess (dsexample->dsexamplelib_ctx,
(unsigned char *)dsexample->inter_buf->surfaceList[0].mappedAddr.addr[0]);
#endif
/* Attach labels for the object */
attach_metadata_object (dsexample, obj_meta, output);
free (output);
}
if (dsexample->blur_objects) {
/* Cache the mapped data for device access */
if(dsexample->inter_buf->memType == NVBUF_MEM_SURFACE_ARRAY)
NvBufSurfaceSyncForDevice (surface, frame_meta->batch_id, 0);
#ifdef WITH_OPENCV
#ifdef DSEXAMPLE_DEBUG
/* Use openCV to remove padding and convert RGBA to BGR. Can be skipped if
* algorithm can handle padded RGBA data. */
#if (CV_MAJOR_VERSION >= 4)
cv::cvtColor (in_mat, *dsexample->cvmat, cv::COLOR_RGBA2BGR);
#else
cv::cvtColor (in_mat, *dsexample->cvmat, CV_RGBA2BGR);
#endif
/* used to dump the converted mat to files for debug */
static guint cnt = 0;
cv::imwrite("out_" + std::to_string (cnt) + ".jpeg", *dsexample->cvmat);
cnt++;
#endif
#endif
}
}
}
flow_ret = GST_FLOW_OK;
error:
nvds_set_output_system_timestamp (inbuf, GST_ELEMENT_NAME (dsexample));
gst_buffer_unmap (inbuf, &in_map_info);
return flow_ret;
}
Thanks …