Please provide complete information as applicable to your setup.
• Hardware Platform (Jetson / GPU): Laptop RTX 4070
• DeepStream Version: 7.1
• TensorRT Version: 10.3 using ngc deepstream container
• NVIDIA GPU Driver Version (valid for GPU only): 570
• Issue Type( questions, new requirements, bugs): Question
• How to reproduce the issue ?
I am preparing tensors for secondary inference, using masks from a previous instance segmentation to remove the background of detected objects. This is done using two self-written CUDA kernels: one scales the mask to the correct dimensions so it can be applied to unit->converted_frame_buf; the mask is then applied by an edited version of the conversion kernels provided in nvdspreprocess_conversion.cu, which takes the mask value into account for each pixel. The data is passed exactly the same way as in the example library and is also converted exactly the same way.
Here is my question:
Is the CustomAsyncTransformation, as in the example library, always needed? As far as I understand, it creates a tensor from the GstBuffer data structure and needs to be implemented in every custom preprocess library.
Second: how should nvdspreprocess be configured for my pipeline
src->nvstreammux->pgie->nvdspreprocess->sgie
so that everything matches: expected batch size, format, using the meta-data tensor as input, etc.? Using the examples did not give me a good understanding; some say that if nvdspreprocess is present, the sgie has to be configured for primary inference when input-from-meta-data=1 is set. How do batch sizes need to be specified? My suspicion is that my sgie expects a full batch and exits with a segfault when that batch size is not given as input.
backtrace of segfault:
Thread 23 "python3" received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x7fff9adde640 (LWP 908439)]
0x00007ffff7ce93fe in free () from /usr/lib/x86_64-linux-gnu/libc.so.6
(gdb) bt full
#0 0x00007ffff7ce93fe in free () at /usr/lib/x86_64-linux-gnu/libc.so.6
#1 0x00007ffff716d87f in release_obj_meta () at /opt/nvidia/deepstream/deepstream/lib/libnvds_meta.so
#2 0x00007ffff716d65b in nvds_clear_meta_list () at /opt/nvidia/deepstream/deepstream/lib/libnvds_meta.so
#3 0x00007ffff716d6f5 in release_frame_meta () at /opt/nvidia/deepstream/deepstream/lib/libnvds_meta.so
#4 0x00007ffff716cffd in nvds_destroy_meta_pool () at /opt/nvidia/deepstream/deepstream/lib/libnvds_meta.so
#5 0x00007ffff716bda5 in nvds_destroy_batch_meta () at /opt/nvidia/deepstream/deepstream/lib/libnvds_meta.so
#6 0x00007ffff6f38139 in () at /usr/lib/x86_64-linux-gnu/libgstreamer-1.0.so.0
#7 0x00007ffff51e1acf in () at /usr/lib/x86_64-linux-gnu/libgstbase-1.0.so.0
#8 0x00007ffff51e114c in () at /usr/lib/x86_64-linux-gnu/libgstbase-1.0.so.0
#9 0x00007ffff6f7986d in () at /usr/lib/x86_64-linux-gnu/libgstreamer-1.0.so.0
#10 0x00007ffff6f7ce09 in () at /usr/lib/x86_64-linux-gnu/libgstreamer-1.0.so.0
#11 0x00007ffff6f7d22e in gst_pad_push () at /usr/lib/x86_64-linux-gnu/libgstreamer-1.0.so.0
#12 0x00007fffeb3cf8d3 in () at /usr/lib/x86_64-linux-gnu/gstreamer-1.0/deepstream/libnvdsgst_infer.so
#13 0x00007ffff773bac1 in g_thread_proxy (data=0x555559540550) at ../glib/gthread.c:831
thread = 0x555559540550
__func__ = "g_thread_proxy"
#14 0x00007ffff7cd8ac3 in () at /usr/lib/x86_64-linux-gnu/libc.so.6
#15 0x00007ffff7d6a850 in () at /usr/lib/x86_64-linux-gnu/libc.so.6
This is my tensorpreparation function:
NvDsPreProcessStatus CustomTensorPreparation(CustomCtx *ctx, NvDsPreProcessBatch *batch, NvDsPreProcessCustomBuf *&buf, CustomTensorParams &tensorParam, NvDsPreProcessAcquirer *acquirer)
{
    // Entry trace: unit count of this batch and the requested color format.
    std::cout << "=== CustomTensorPreparation STARTED ===" << std::endl;
    std::cout << "Batch size: " << (batch ? batch->units.size() : 0) << std::endl;
    std::cout << "TensorParam network color format: " << tensorParam.params.network_color_format << std::endl;

    // Pull a destination tensor buffer out of the plugin's pool.
    buf = acquirer->acquire();
    if (!buf || !buf->memory_ptr)
    {
        std::cerr<< "ERROR: Failed to acquire buffer from tensor pool or null memory pointer"<<std::endl;
        return NVDSPREPROCESS_RESOURCE_ERROR;
    }
    std::cout << "Buffer acquired successfully" << std::endl;

    // From here on, any failure must return the pooled buffer, otherwise
    // the tensor pool drains over time.
    auto reportAndRelease = [&](const char *stage, NvDsPreProcessStatus code)
    {
        std::cerr<< "ERROR: CustomTensorPreparation: "<<stage<<" failed error code "<<code<<std::endl;
        acquirer->release(buf);
    };

    // Launch the mask/cutout work for every unit of the batch.
    std::cout << "Calling cutout_objects..." << std::endl;
    NvDsPreProcessStatus rc = ctx->cutout_impl->cutout_objects(batch, buf->memory_ptr, tensorParam);
    std::cout << "cutout_objects returned status: " << rc << std::endl;
    if (rc != NVDSPREPROCESS_SUCCESS)
    {
        reportAndRelease("cutout_objects", rc);
        return rc;
    }

    // Wait for all asynchronous CUDA work issued above to finish before the
    // tensor is handed downstream.
    std::cout << "Syncing CUDA stream..." << std::endl;
    rc = ctx->cutout_impl->syncStream();
    std::cout << "syncStream returned status: " << rc << std::endl;
    if (rc != NVDSPREPROCESS_SUCCESS)
    {
        reportAndRelease("syncStream", rc);
        return rc;
    }

    // Patch the leading (batch) dimension to the number of units actually
    // present, so downstream inference sees the true batch size.
    tensorParam.params.network_input_shape[0] = (int)batch->units.size();
    std::cout << "=== CustomTensorPreparation COMPLETED SUCCESSFULLY ===" << std::endl;
    return rc;
}
The cutout_objects function:
NvDsPreProcessStatus CustomObjectCutoutImpl::cutout_objects(
    NvDsPreProcessBatch *batch, void *&devBuf, CustomTensorParams &tensorParam)
{
    // For every unit in the batch: upload the object's instance mask, scale
    // and place it at network resolution (TransformMask), then apply it while
    // converting the RGBA frame crop into this unit's slot of the batched
    // output tensor (ApplyMaskAndConvert_C4ToL3Half). All device work is
    // issued on m_PreProcessStream and synchronized once at the end.
    if (!batch || !devBuf)
    {
        std::cerr << "Invalid input parameters" << std::endl;
        return NVDSPREPROCESS_CUSTOM_LIB_FAILED;
    }

    unsigned int batch_size = batch->units.size();
    if (batch_size > m_BatchSize)
    {
        std::cerr << "Batch size exceeds allocated resources" << std::endl;
        return NVDSPREPROCESS_CUSTOM_LIB_FAILED;
    }

    // Validate loop-invariant resources ONCE and BEFORE any pointer
    // arithmetic. BUG FIX: the original computed `base + i * stride` first and
    // null-checked the sum afterwards; pointer arithmetic on a null pointer is
    // undefined behavior, and for i > 0 the sum is non-null, so the check
    // silently passed on a dead allocation.
    float *mask_base = m_Mask ? m_Mask->ptr<float>() : nullptr;
    float *scaled_base = m_ScaledMask ? m_ScaledMask->ptr<float>() : nullptr;
    if (!mask_base || !scaled_base || !m_PreProcessStream || !m_PreProcessStream->ptr())
    {
        std::cerr << "ERROR: Invalid CUDA resources" << std::endl;
        return NVDSPREPROCESS_CUDA_ERROR;
    }

    // Per-unit strides: mask staging buffer, scaled-mask buffer, and the
    // byte stride of one unit inside the destination tensor.
    const std::size_t mask_elems = (std::size_t)m_MaskWidth * m_MaskHeight;
    const std::size_t net_elems = (std::size_t)m_NetworkSize.width * m_NetworkSize.height;
    const std::size_t unit_bytes = (std::size_t)m_NetworkSize.channels * net_elems
        * bytesPerElement(tensorParam.params.data_type);

    cudaError_t err = cudaSuccess;
    for (unsigned int i = 0; i < batch_size; i++)
    {
        NvDsPreProcessUnit *unit = &batch->units[i];
        // Units without attached object meta carry no mask -> skip them.
        if (!unit->roi_meta.object_meta)
        {
            std::cerr << "Invalid unit or object metadata" << std::endl;
            continue;
        }

        NvDsRoiMeta *roi_meta = &unit->roi_meta;
        NvOSD_RectParams *roi = &roi_meta->roi;
        NvOSD_MaskParams *mask_params = &roi_meta->object_meta->mask_params;

        if (!mask_params->data)
        {
            std::cerr<< "ERROR: Invalid mask_params or mask data"<<std::endl;
            return NVDSPREPROCESS_CUDA_ERROR;
        }
        if (mask_params->width != m_MaskWidth || mask_params->height != m_MaskHeight)
        {
            std::cerr<< "ERROR: Invalid mask dimensions"<<std::endl;
            return NVDSPREPROCESS_CUDA_ERROR;
        }

        // Upload this unit's mask into its slot of the device staging buffer.
        // NOTE(review): mask_params->data is pageable host memory owned by the
        // object meta; if that meta can be released before syncStream(), this
        // is a use-after-free that corrupts the heap and would match a later
        // crash inside free()/release_obj_meta -- confirm the meta outlives
        // the stream, or copy the mask into an owned staging area first.
        float *d_mask_data = mask_base + (std::size_t)i * mask_elems;
        err = cudaMemcpyAsync(d_mask_data, mask_params->data, mask_elems * sizeof(float),
                              cudaMemcpyHostToDevice, *m_PreProcessStream);
        if (err != cudaSuccess)
        {
            std::cerr<< "ERROR: CustomObjectCutoutImpl: cutout_objects: Failed to copy mask data to device: "<<cudaGetErrorString(err)<<std::endl;
            return NVDSPREPROCESS_CUDA_ERROR;
        }

        float *d_scaled_mask_data = scaled_base + (std::size_t)i * net_elems;

        // Geometry: the mask must land on the same sub-rectangle of the
        // network input that the scaler placed the frame crop into, so reuse
        // the scale ratios and letterbox offsets recorded in roi_meta.
        unsigned int out_width = m_NetworkSize.width;
        unsigned int out_height = m_NetworkSize.height;
        unsigned int roi_out_width = (unsigned int)((float)roi->width * roi_meta->scale_ratio_x);
        unsigned int roi_out_height = (unsigned int)((float)roi->height * roi_meta->scale_ratio_y);
        if (roi_out_width > out_width)
            roi_out_width = out_width;
        if (roi_out_height > out_height)
            roi_out_height = out_height;
        // Guard the divisions below: a degenerate ROI would otherwise yield
        // inf scale ratios and feed garbage coordinates to the kernel.
        if (roi_out_width == 0 || roi_out_height == 0)
        {
            std::cerr << "ERROR: Degenerate ROI output size, skipping unit " << i << std::endl;
            continue;
        }
        unsigned int roi_out_left = roi_meta->offset_left;
        unsigned int roi_out_top = roi_meta->offset_top;
        // Ratio from the mask's native resolution to the on-tensor ROI size.
        float scale_ratio_x = (float)mask_params->width / (float)roi_out_width;
        float scale_ratio_y = (float)mask_params->height / (float)roi_out_height;

        err = TransformMask(
            d_scaled_mask_data,
            d_mask_data,
            mask_params->width,
            mask_params->height,
            out_width,
            out_height,
            roi_out_left,
            roi_out_top,
            roi_out_width,
            roi_out_height,
            scale_ratio_x,
            scale_ratio_y,
            m_PreProcessStream->ptr());
        if (err != cudaSuccess)
        {
            std::cerr<< "TransformMask failed with err "<<(int)err<<" : "<<cudaGetErrorName(err)<<std::endl;
            return NVDSPREPROCESS_CUDA_ERROR;
        }

        // Destination slot for this unit inside the batched output tensor.
        void *outPtr = (void *)((uint8_t *)devBuf + (std::size_t)i * unit_bytes);
        err = ApplyMaskAndConvert_C4ToL3Half((half *)outPtr,
                                             (unsigned char *)unit->converted_frame_ptr,
                                             d_scaled_mask_data, out_width, out_height,
                                             batch->pitch, *m_PreProcessStream);
        if (err != cudaSuccess)
        {
            std::cerr<< "ApplyMaskToConvertedBuffer failed with err "<<(int)err<<" : "<<cudaGetErrorName(err)<<std::endl;
            return NVDSPREPROCESS_CUDA_ERROR;
        }
    }

    // Single synchronization point. The original synchronized inside the loop,
    // which serialized every unit and defeated the async stream; all ops above
    // are ordered on m_PreProcessStream, so one sync suffices and also
    // surfaces any deferred kernel-execution error.
    err = cudaStreamSynchronize(*m_PreProcessStream);
    if (err != cudaSuccess)
    {
        std::cerr<< "cudaStreamSynchronize failed with err "<<(int)err<<" : "<<cudaGetErrorName(err)<<std::endl;
        return NVDSPREPROCESS_CUDA_ERROR;
    }
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        std::cerr << "CUDA error during processing" << std::endl;
        return NVDSPREPROCESS_CUDA_ERROR;
    }

    return NVDSPREPROCESS_SUCCESS;
}
I already tested this function by using memcpy to save the results in devBuf to PPM files; the images are perfectly cut out and the masks are correctly applied. Over multiple iterations the function also executes correctly — I can't see the results from secondary inference, but the function keeps running correctly for some time until the segfault happens.
Also, for some reason, converted_frame_buf is in RGBA even though this format is not specified anywhere in my configs.
config_preprocess_secondary.txt (2.8 KB)
config_infer_primary_yoloV8_seg.txt (2.0 KB)
config_infer_secondary_yoloV8_seg.txt (2.1 KB)