Access all pixel values from each face bounding box

Description

Need to retrieve frame region data in a bounding box

Environment

DeepStream Version: 6.3
GPU Type: GeForce RTX 3050
Nvidia Driver Version: 530.41.03
CUDA Version: 12.1
Operating System + Version: Ubuntu 20
Python Version (if applicable): Python 3.8

I am using YOLOv8 to detect faces in a video through DeepStream 6.3. Now I want to retrieve the frame data inside each bounding box.

You can get the whole frame data from the NvBufSurface, and you already have the bbox. You can then extract the region you need from the whole image data based on the bbox coordinates.

The following is my code snippet. I am able to get the bounding box coordinates, but I need the pixel values inside the bounding box region.

def extract_face_region(frame_meta, obj_meta, buffer):
    """Return a copy of the frame pixels inside one object's bounding box.

    The crop is taken from the frame described by ``frame_meta`` using the
    rectangle stored in ``obj_meta.rect_params``. Only objects whose
    ``class_id`` is 0 are processed.

    Args:
        frame_meta: Frame metadata of the frame that contains the object;
            its ``batch_id`` selects the surface inside the batched buffer.
        obj_meta: Object metadata of the detection to crop.
        buffer: The Gst.Buffer received by the pad probe.

    Returns:
        A NumPy array holding the cropped region (presumably
        ``(height, width, 4)`` uint8 RGBA -- confirm against the pipeline
        caps), or ``None`` when the object is not class 0 or its box lies
        completely outside the frame.
    """
    if obj_meta.class_id != 0:  # Adjust the class_id as needed
        return None

    # The pyds buffer helpers are called with hash(gst_buffer), not with the
    # Gst.Buffer object itself.
    n_frame = pyds.get_nvds_buf_surface(hash(buffer), frame_meta.batch_id)

    # Copy the mapped surface into ordinary host memory so the crop stays
    # valid after the probe returns.
    frame_array = np.array(n_frame, copy=True, order='C')
    frame_height, frame_width = frame_array.shape[:2]

    # Cast the rect_params values to int before slicing; the casts are
    # done per field so the box is truncated the same way on each edge.
    rect = obj_meta.rect_params
    x, y = int(rect.left), int(rect.top)
    width, height = int(rect.width), int(rect.height)

    # Clamp the box to the frame: a negative start index would otherwise be
    # interpreted by NumPy as "count from the end".
    x0 = min(max(x, 0), frame_width)
    y0 = min(max(y, 0), frame_height)
    x1 = min(max(x + width, 0), frame_width)
    y1 = min(max(y + height, 0), frame_height)

    # Extract the face region directly from RGBA format
    face_region = frame_array[y0:y1, x0:x1, :]
    if face_region.size == 0:
        return None
    return face_region

Also, I tried using ‘pyds.nvbufsurface’ to retrieve the values inside the bounding box, but got the error: **module ‘pyds’ has no attribute ‘nvbfsurface’**

You can refer to our demo deepstream_imagedata-multistream_redaction.py to get the frame array. It’s similar to your code snippet. Then you need to extract the data from the frame array that corresponds to the bbox coordinates.

I have updated the code as follows:

def extract_face_region(frame_meta, obj_meta, buffer):
    """Crop one detected face from its frame and print its dominant emotion.

    The crop is taken from the frame described by ``frame_meta`` using the
    rectangle in ``obj_meta.rect_params`` and is passed to
    ``detect_emotion``. Only objects whose ``class_id`` is 0 are processed.

    Args:
        frame_meta: Frame metadata of the frame that contains the object;
            its ``batch_id`` selects the surface inside the batched buffer.
        obj_meta: Object metadata of the detection to crop.
        buffer: The Gst.Buffer received by the pad probe.

    Returns:
        The frame array returned by ``pyds.get_nvds_buf_surface`` (the whole
        frame, not the crop), or ``None`` when the object is not class 0.
    """
    if obj_meta.class_id != 0:  # Adjust the class_id as needed
        return None

    # The pyds buffer helpers are called with hash(gst_buffer), not with the
    # Gst.Buffer object itself.
    n_frame = pyds.get_nvds_buf_surface(hash(buffer), frame_meta.batch_id)

    # Work on a private C-contiguous copy instead of indexing the mapped
    # surface directly.
    frame_copy = np.array(n_frame, copy=True, order='C')
    frame_height, frame_width = frame_copy.shape[:2]

    rect = obj_meta.rect_params
    x, y = int(rect.left), int(rect.top)
    width, height = int(rect.width), int(rect.height)

    # Clamp the box to the frame: a negative start index would otherwise be
    # interpreted by NumPy as "count from the end".
    x0 = min(max(x, 0), frame_width)
    y0 = min(max(y, 0), frame_height)
    x1 = min(max(x + width, 0), frame_width)
    y1 = min(max(y + height, 0), frame_height)

    # Extract the face region directly from RGBA format
    face_region = frame_copy[y0:y1, x0:x1, :]

    if face_region.size:
        emotions = detect_emotion(face_region)
        # detect_emotion returns False on failure; only rank real scores.
        if emotions:
            label = max(emotions, key=emotions.get)
            print("label : ", label)

    return n_frame

def detect_emotion(image):
    """Score a single face crop with the emotion model.

    Args:
        image: Array accepted by ``Image.fromarray``; an RGBA crop is
            reduced to RGB before normalisation.

    Returns:
        A dict mapping each entry of ``emotions_labels`` to its score
        multiplied by 100 and rounded to two decimals, or ``False`` when the
        image cannot be preprocessed or the model output cannot be read.
    """
    # Normalise and transform images
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    try:
        # The normalisation statistics are 3-channel, so drop any alpha
        # channel before converting to a tensor.
        img = Image.fromarray(image).convert("RGB")
        # Add the batch dimension the model is fed with.
        batch = torch.stack([preprocess(img)])
    except Exception as e:
        print(f"Error processing image: {e}")
        # Nothing to feed the model; bail out instead of using an unbound
        # result further down.
        return False

    with torch.no_grad():
        # Feed through the model
        y = emo_model(batch)

    # Add emotion to result (the batch holds exactly one image).
    try:
        scores = y[0]
        emots = {
            emot: round(100 * scores[key].item(), 2)
            for key, emot in enumerate(emotions_labels)
        }
    except Exception:
        # Best effort, as before: an unexpected output shape yields False.
        return False

    print(emots)
    return emots

I am getting a Segmentation fault (core dumped) when I try to access ‘n_frame’. Even when I just try to print ‘n_frame’, I get a segmentation fault.

Could you try converting it to a NumPy array first?

# Copy n_frame into a new C-ordered NumPy array before indexing or printing it.
frame_copy = np.array(n_frame, copy=True, order='C')

Thank you, I got the solution.