C++ collects video with much lower CPU usage than Python, is this reasonable?

I collect video using a USB camera connected to the Xavier NX. I ran experiments with both V4L2 in C++ and pyrealsense2 in Python. At the same FPS (e.g. 60), the C++ program takes only about 1% CPU with lower latency, while the Python program takes about 20% CPU with higher latency. The C++ program also does not appear to use the GPU, while the Python program does, and its GPU usage increases with the FPS. Here is the main function of the C++ code.


#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/videodev2.h>

#include <chrono>
#include <cstdio>
#include <cstring>
#include <iostream>

// Helpers such as set_fps(), yuyv_to_rgb() and save_jpeg() are defined elsewhere
// in the program and omitted here.

int main() {
    int fd = open("/dev/video4", O_RDWR);

    if (fd == -1) {
        perror("Opening camera device failed");
        return 1;
    }

    unsigned int fps = 30;
    if (set_fps(fd, fps) == 0) {
        printf("FPS set to %u\n", fps);
    }

    struct v4l2_format format;
    memset(&format, 0, sizeof(format));
    format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
    format.fmt.pix.width = 640;
    format.fmt.pix.height = 480;
    format.fmt.pix.pixelformat = V4L2_PIX_FMT_YUYV;
    format.fmt.pix.field = V4L2_FIELD_INTERLACED;

    if (ioctl(fd, VIDIOC_S_FMT, &format) == -1) {
        perror("Setting camera format failed");
        close(fd);
        return 1;
    }

    struct v4l2_streamparm streamparm;
    memset(&streamparm, 0, sizeof(streamparm));
    streamparm.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;

    if (ioctl(fd, VIDIOC_G_PARM, &streamparm) == -1) {
        perror("Getting current parameters failed");
        close(fd);
        return 1;
    }

    streamparm.parm.capture.timeperframe.numerator = 1;
    streamparm.parm.capture.timeperframe.denominator = 60;  // request 60 FPS (note: set_fps() above was called with 30)

    if (ioctl(fd, VIDIOC_S_PARM, &streamparm) == -1) {
        perror("Setting frame rate failed");
        close(fd);
        return 1;
    }

    std::cout << "Frame rate set to: " 
              << streamparm.parm.capture.timeperframe.denominator << " FPS" << std::endl;


    struct v4l2_requestbuffers requestBuffers;
    memset(&requestBuffers, 0, sizeof(requestBuffers));
    requestBuffers.count = 1;
    requestBuffers.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
    requestBuffers.memory = V4L2_MEMORY_MMAP;

    if (ioctl(fd, VIDIOC_REQBUFS, &requestBuffers) == -1) {
        perror("Requesting buffer failed");
        close(fd);
        return 1;
    }

    struct v4l2_buffer buffer;
    memset(&buffer, 0, sizeof(buffer));
    buffer.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
    buffer.memory = V4L2_MEMORY_MMAP;
    buffer.index = 0;

    if (ioctl(fd, VIDIOC_QUERYBUF, &buffer) == -1) {
        perror("Querying buffer failed");
        close(fd);
        return 1;
    }

    void* bufferStart = mmap(nullptr, buffer.length, PROT_READ | PROT_WRITE, MAP_SHARED, fd, buffer.m.offset);
    if (bufferStart == MAP_FAILED) {
        perror("mmap failed");
        close(fd);
        return 1;
    }

    int type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
    if (ioctl(fd, VIDIOC_STREAMON, &type) == -1) {
        perror("Starting capture failed");
        munmap(bufferStart, buffer.length);
        close(fd);
        return 1;
    }

    int frame_count = 0;
    while (frame_count < 2000) {
        // Single mmap'd buffer: queue it, then block in DQBUF until the driver
        // has filled it via DMA.
        if (ioctl(fd, VIDIOC_QBUF, &buffer) == -1) {
            perror("Queueing buffer failed");
            break;
        }

        if (ioctl(fd, VIDIOC_DQBUF, &buffer) == -1) {
            perror("Dequeueing buffer failed");
            break;
        }

        // buffer.timestamp is filled in by the driver; for uvcvideo it is typically
        // taken from the monotonic clock, so comparing it against wall-clock time
        // (system_clock) is only meaningful if the driver reports realtime timestamps.
        struct timeval hardware_timestamp = buffer.timestamp;
        double hardware_time_sec = hardware_timestamp.tv_sec + hardware_timestamp.tv_usec / 1000000.0;

        auto user_end = std::chrono::system_clock::now();
        double user_time_sec = std::chrono::duration<double>(user_end.time_since_epoch()).count();
        double delay_sec = user_time_sec - hardware_time_sec;

        printf("Buffer Index: %u\n", buffer.index);
        printf("Hardware timestamp: %.6f ms\n", hardware_time_sec);
        printf("User-space timestamp: %.6f ms\n", user_time_sec);
        printf("Delay: %.3f ms\n", delay_sec * 1000);
        // yuyv_to_rgb(static_cast<unsigned char*>(bufferStart), rgb_buffer, 640, 480);
        // save_jpeg(rgb_buffer, 640, 480, frame_count);

        frame_count++;
    }

    ioctl(fd, VIDIOC_STREAMOFF, &type);
    munmap(bufferStart, buffer.length);
    close(fd);

    return 0;
}

Can anyone explain this result? Is such a difference between C++ and Python reasonable? Thank you so much!

Hi,
Do you have the source code of the Python binding? One possible reason is that there is some buffer copying in the Python binding, which generates the CPU load. I would suggest checking the source code. To compare system status, you may run sudo tegrastats
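
For reference, one way to log tegrastats output for the duration of a capture run might look like the sketch below. This assumes tegrastats is on the PATH, that its --interval flag takes milliseconds, and that sudo is available (if the script itself is not run as root, the sudo child may need to be stopped separately); run_with_tegrastats and the log path are illustrative names.

# Minimal sketch: log tegrastats samples while a capture function runs.
import subprocess

def run_with_tegrastats(capture_fn, log_path="tegrastats.log", interval_ms=500):
    with open(log_path, "w") as log:
        # Start tegrastats in the background and redirect its samples to the log file.
        proc = subprocess.Popen(
            ["sudo", "tegrastats", "--interval", str(interval_ms)],
            stdout=log, stderr=subprocess.STDOUT)
        try:
            capture_fn()          # e.g. the capture loop being measured
        finally:
            proc.terminate()      # stop sampling when the capture returns
            proc.wait()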

Here is the Python Code:

import os
import time

import cv2
import numpy as np
import psutil
import pyrealsense2 as rs


def video_capture(event, new_fps):
    """
    Basic video capture function using Intel RealSense and saving frames as images with OpenCV.

    Module-level names such as fps, cpu, resolution, sample_time, save_image,
    frame_interval_list, fd and get_xhci_hcd_interrupt_count() are defined elsewhere.
    """
    pipeline = rs.pipeline()
    config = rs.config()

    # Enable RGB stream with the desired resolution and fps
    config.enable_stream(rs.stream.color, resolution[0], resolution[1], rs.format.bgr8, fps)

    buffer_latency_list = []
    collection_latency_list = []
    try:
        # Start streaming from RealSense camera
        profile = pipeline.start(config)
        color_sensor = profile.get_device().first_color_sensor()
        color_sensor.set_option(rs.option.global_time_enabled, 0)  # disable librealsense global time synchronization

        # print('\nData stream stabilized. Starting data collection...')
        
        print('Video Start')
        start_time = time.time()

        frames_cnt = 1
        first = True
        last_kernel_timestamp = None
        
        start_interrupt_count = get_xhci_hcd_interrupt_count()
        process = psutil.Process()

        ctx_switches_start = process.num_ctx_switches()
        start_cpu_times = process.cpu_times()
   
        event.set()
        with open(f"logs/log_fps_{fps}_cpu_{cpu}.txt", "w") as log_file:  
            # while time.time() - start_time <= 10.:  # Save 10 seconds of frames
            while True:
                # Wait for a new frame (the argument is a timeout in milliseconds)
                frames = pipeline.wait_for_frames(100000000)

                dt1 = frames.get_frame_metadata(rs.frame_metadata_value.backend_timestamp)
                dt2 = frames.get_frame_metadata(rs.frame_metadata_value.frame_timestamp)
                dt3 = frames.get_frame_metadata(rs.frame_metadata_value.time_of_arrival)
                dt4 = frames.get_frame_metadata(rs.frame_metadata_value.sensor_timestamp)
                
                if first:
                    start_timestamp = dt1
                    first = False
                elif dt1 - start_timestamp >= sample_time * 1000:
                    print((dt1 - start_timestamp) / 1000)
                    break
                
                kernel_timestamp = dt1 / 1000
                userspace_timestamp = dt3 / 1000
                read_timestamp = time.time()
                buffer_latency = (userspace_timestamp - kernel_timestamp) * 1000
                collection_latency = (read_timestamp - userspace_timestamp) * 1000
                
                # Calculate time interval between frames
                if last_kernel_timestamp is not None:
                    frame_interval = (kernel_timestamp - last_kernel_timestamp) * 1000  # interval between frames, in ms
                    frame_interval = abs(frame_interval - 1000 / new_fps)  # deviation from the nominal interval, in ms
                    frame_interval_list.append(frame_interval)      
                else:
                    frame_interval = 0  # First frame has no previous interval

                # Update last_kernel_timestamp
                last_kernel_timestamp = kernel_timestamp
                
                log_str = (f'Frame {frames_cnt} - Kernel Timestamp: {kernel_timestamp}, '
                   f'Userspace Timestamp: {userspace_timestamp}, Read Timestamp: {read_timestamp}, '
                   f'Kernel Delay: {userspace_timestamp - kernel_timestamp}, '
                   f'Frame Interval: {frame_interval:.3f} ms\n')

                # print(log_str)
                
                log_file.write(log_str)
                # write_system_usage(log_file)
                
                collection_latency_list.append(collection_latency)
                buffer_latency_list.append(buffer_latency)

                if save_image:
                    color_frame = frames.get_color_frame()
                    if color_frame:
                        color_image = np.asanyarray(color_frame.get_data())
                        # frame_filename = f'{experiment}/frames_{fps}_cpu_{cpu}/frame_{frames_cnt:03d}.png'
                        frame_filename = f'frames/frame_{frames_cnt:03d}.png'
                        cv2.imwrite(frame_filename, color_image)

                frames_cnt += 1
                        
            end_time = time.time()
        
    finally:
        # Stop the RealSense pipeline
        pipeline.stop()
        os.close(fd)

Well, I want to compare the system usage of a specific process, and sudo tegrastats only shows the overall status of the whole system, so I use psutil to monitor the per-process usage.
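
For reference, a per-process sampling loop with psutil might look like the sketch below; it is a minimal sketch in the same spirit as the psutil calls already in video_capture(), and the function name and intervals are illustrative.

# Minimal sketch of per-process sampling with psutil.
import time
import psutil

def sample_process_usage(duration_s=10.0, interval_s=1.0):
    proc = psutil.Process()            # the current (capture) process
    proc.cpu_percent(interval=None)    # prime the counter; the first call returns 0.0
    samples = []
    deadline = time.time() + duration_s
    while time.time() < deadline:
        time.sleep(interval_s)
        samples.append({
            "cpu_percent": proc.cpu_percent(interval=None),   # % of a single core since the last call
            "rss_mb": proc.memory_info().rss / 1e6,           # resident memory in MB
            "ctx_switches": proc.num_ctx_switches(),          # voluntary/involuntary counts
        })
    return samples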

Any help would be appreciated!

Hi,
For further investigation, you would need to check the source code of pyrealsense2 to see how it is implemented. There may not be further information at the upper application layer.

OK, thank you. So is it common that in C++, collecting video data only requires such a small amount of CPU usage?

Hi,
Capturing frame data through V4L2 is direct, so it does not incur additional CPU usage. It seems that pyrealsense2 has some code that adds extra load. You would need to check the pyrealsense2 source code for further information.
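
As a cross-check, one could also capture the same device node through V4L2 from Python (for example with OpenCV's CAP_V4L2 backend) to separate the cost of the Python interpreter from the cost of pyrealsense2. The sketch below is minimal and assumes the device path, resolution and FPS from the C++ program; note that OpenCV still copies and converts each frame, so it will not be as lean as the mmap'd C++ loop.

# Hedged sketch: grab frames from the same V4L2 device via OpenCV, to compare
# CPU usage of "Python + V4L2" against "Python + pyrealsense2".
import cv2

cap = cv2.VideoCapture("/dev/video4", cv2.CAP_V4L2)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
cap.set(cv2.CAP_PROP_FPS, 60)

frames = 0
while frames < 2000:
    ok, frame = cap.read()     # blocks until the driver hands over the next frame
    if not ok:
        break
    frames += 1

cap.release()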

OK. I also have a question: does the V4L2 capture procedure use the GPU? The GPU usage increases occasionally, while at other times it stays at 0.

Hi,
V4L2 capture does not use the GPU. The GPU usage should come from other tasks such as the Ubuntu desktop (GUI).

OK, I see. Thank you.
