dlib causing memory leak

I have written a program that captures faces from a webcam and recognizes them. The program is an extension/modification of the example http://dlib.net/dnn_face_recognition_ex.cpp.html. When it runs on a desktop machine, there is no memory leak. Here is the code:

#include <dlib/opencv.h>
#include <opencv2/highgui/highgui.hpp>
#include <dlib/image_processing/frontal_face_detector.h>
#include <dlib/image_processing/render_face_detections.h>
#include <dlib/image_processing.h>
#include <dlib/gui_widgets.h>
#include <dlib/dnn.h>
#include <dlib/clustering.h>
#include <dlib/string.h>
#include <dlib/image_io.h>
#include <iostream>
#include <string>

using namespace dlib;
using namespace std;

// ----------------------------------------------------------------------------------------

// The next bit of code defines a ResNet network.  It's basically copied
// and pasted from the dnn_imagenet_ex.cpp example, except we replaced the loss
// layer with loss_metric and made the network somewhat smaller.  Go read the introductory
// dlib DNN examples to learn what all this stuff means.
//
// Also, the dnn_metric_learning_on_images_ex.cpp example shows how to train this network.
// The dlib_face_recognition_resnet_model_v1 model used by this example was trained using
// essentially the code shown in dnn_metric_learning_on_images_ex.cpp except the
// mini-batches were made larger (35x15 instead of 5x5), the iterations without progress
// was set to 10000, the jittering you can see below in jitter_image() was used during
// training, and the training dataset consisted of about 3 million images instead of 55.
// Also, the input layer was locked to images of size 150.
// residual: wraps `block` (instantiated with stride 1) in a skip connection.  tag1 marks
// the block's input and add_prev1 adds that tagged input to the block's output.
template <template <int,template<typename>class,int,typename> class block, int N, template<typename>class BN, typename SUBNET>
using residual = add_prev1<block<N,BN,1,tag1<SUBNET>>>;

// residual_down: same idea, but `block` is instantiated with stride 2.  The block output
// is tagged with tag2, skip1 goes back to the tag1 input, avg_pool<2,2,2,2> is applied to
// that input, and add_prev2 adds the tag2 output to it.
// NOTE(review): layer semantics are taken from dlib's DNN layer documentation -- confirm there.
template <template <int,template<typename>class,int,typename> class block, int N, template<typename>class BN, typename SUBNET>
using residual_down = add_prev2<avg_pool<2,2,2,2,skip1<tag2<block<N,BN,2,tag1<SUBNET>>>>>>;

// block: two con<N,3,3,...> layers, each wrapped in BN, with a relu between them.  The
// inner (first applied) con takes `stride` for both stride arguments; the outer one uses 1.
template <int N, template <typename> class BN, int stride, typename SUBNET> 
using block  = BN<con<N,3,3,1,1,relu<BN<con<N,3,3,stride,stride,SUBNET>>>>>;

// ares / ares_down: the two residual variants above with `affine` passed as BN and a relu
// applied to the result.
template <int N, typename SUBNET> using ares      = relu<residual<block,N,affine,SUBNET>>;
template <int N, typename SUBNET> using ares_down = relu<residual_down<block,N,affine,SUBNET>>;

// alevel4 .. alevel0: stacks of ares/ares_down with N = 32, 64, 128, 256 and 256
// respectively.  alevel4 is the only level that contains no ares_down.
template <typename SUBNET> using alevel0 = ares_down<256,SUBNET>;
template <typename SUBNET> using alevel1 = ares<256,ares<256,ares_down<256,SUBNET>>>;
template <typename SUBNET> using alevel2 = ares<128,ares<128,ares_down<128,SUBNET>>>;
template <typename SUBNET> using alevel3 = ares<64,ares<64,ares<64,ares_down<64,SUBNET>>>>;
template <typename SUBNET> using alevel4 = ares<32,ares<32,ares<32,SUBNET>>>;

// anet_type: the complete network.  Reading from the innermost layer outwards:
// input_rgb_image_sized<150>, con<32,7,7,2,2>, affine, relu, max_pool<3,3,2,2>,
// alevel4 .. alevel0, avg_pool_everything, fc_no_bias<128>, and loss_metric on top.
// main() deserializes dlib_face_recognition_resnet_model_v1.dat into an object of this type.
using anet_type = loss_metric<fc_no_bias<128,avg_pool_everything<
                            alevel0<
                            alevel1<
                            alevel2<
                            alevel3<
                            alevel4<
                            max_pool<3,3,2,2,relu<affine<con<32,7,7,2,2,
                            input_rgb_image_sized<150>
                            >>>>>>>>>>>>;

// ----------------------------------------------------------------------------------------

std::vector<matrix<rgb_pixel>> jitter_image(
    const matrix<rgb_pixel>& img
);

// ----------------------------------------------------------------------------------------

// Appends the non-hidden, non-directory entries of `directory` to `out`.
//
// For every accepted entry TWO strings are appended, in this order:
//   1. the bare file name
//   2. the full path (directory + "/" + file name)
// so `out` holds names at even offsets and paths at odd offsets (relative to the size
// `out` had on entry).  main() relies on this pair layout.  Both platform branches now
// produce it; previously the WINDOWS branch appended the full path only.
//
// Entries whose name starts with '.' and entries that are directories are skipped.
// If the directory cannot be opened, `out` is left untouched and the function returns.
// Entries are appended in the order the OS reports them; no sorting is done.
void GetFilesInDirectory(std::vector<std::string> &out, const std::string &directory)
{
#ifdef WINDOWS
    HANDLE dir;
    WIN32_FIND_DATA file_data;

    if ((dir = FindFirstFile((directory + "/*").c_str(), &file_data)) == INVALID_HANDLE_VALUE)
        return; /* No files found */

    do {
        const std::string file_name = file_data.cFileName;
        const std::string full_file_name = directory + "/" + file_name;
        const bool is_directory = (file_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0;

        if (file_name[0] == '.')
            continue;

        if (is_directory)
            continue;

        // Same (name, path) pair layout as the POSIX branch below.
        out.push_back(file_name);
        out.push_back(full_file_name);
    } while (FindNextFile(dir, &file_data));

    FindClose(dir);
#else
    DIR *dir = opendir(directory.c_str());
    if (dir == nullptr)
        return; /* Missing or unreadable directory: nothing to list */

    struct dirent *ent;
    struct stat st;

    while ((ent = readdir(dir)) != nullptr) {
        const std::string file_name = ent->d_name;
        const std::string full_file_name = directory + "/" + file_name;

        if (file_name[0] == '.')
            continue;

        if (stat(full_file_name.c_str(), &st) == -1)
            continue;

        // S_ISDIR compares the whole file-type field.  Testing the S_IFDIR bit alone
        // also matches other file types whose type code shares that bit.
        if (S_ISDIR(st.st_mode))
            continue;

        out.push_back(file_name);
        out.push_back(full_file_name);
    }
    closedir(dir);
#endif
} // GetFilesInDirectory

int main()
{
    try
    {

		std::vector<string> names;
		GetFilesInDirectory(names,"./faces/template");

		for(int i=0; i<names.size(); i++) cout<< names[i]<<endl;

		cv::VideoCapture cap(0);
        
        if (!cap.isOpened())
        {
            cerr << "Unable to connect to camera" << endl;
            return 1;
        }

        image_window win;

        // Load face detection and pose estimation models.
        frontal_face_detector detector = get_frontal_face_detector();
        // We will also use a face landmarking model to align faces to a standard pose:  (see face_landmark_detection_ex.cpp for an introduction)
		shape_predictor sp;
		deserialize("shape_predictor_5_face_landmarks.dat") >> sp;

		// And finally we load the DNN responsible for face recognition.
		anet_type net;
		deserialize("dlib_face_recognition_resnet_model_v1.dat") >> net;

		printf("loading done!\n\n");

		std::vector<matrix<rgb_pixel>> templates;
		matrix<rgb_pixel> temp;
		for(int i=1; i<names.size(); i+=2)
		{
			printf("i : %d\n",i);
			load_image(temp, names[i]);
			printf("debug 0\n");

			for (auto face : detector(temp))
			{
				auto shape = sp(temp, face);
				printf("debug 1\n");
				matrix<rgb_pixel> face_chip;
				extract_image_chip(temp, get_face_chip_details(shape,150,0.25), face_chip);
				templates.push_back(move(face_chip));
			}
		}
		std::vector<matrix<float,0,1>> template_descriptors = net(templates);

        // Grab and process frames until the main window is closed by the user.
        while(!win.is_closed())
        {
            // Grab a frame
            cv::Mat temp;
			//for(int i=0; i<5; i++) cap>> temp;
            if (!cap.read(temp))
            {
				cout<<"image not found!\n";
                continue;
            }
            // Turn OpenCV's Mat into something dlib can deal with.  Note that this just
            // wraps the Mat object, it doesn't copy anything.  So cimg is only valid as
            // long as temp is valid.  Also don't do anything to temp that would cause it
            // to reallocate the memory which stores the image as that will make cimg
            // contain dangling pointers.  This basically means you shouldn't modify temp
            // while using cimg.
          //  cv_image<bgr_pixel> img(temp);
                 array2d<rgb_pixel> img;
    		assign_image(img, dlib::cv_image<bgr_pixel>(temp));

			win.clear_overlay();
			win.set_image(img);

            // Run the face detector on the image of our action heroes, and for each face extract a
			// copy that has been normalized to 150x150 pixels in size and appropriately rotated
			// and centered.
			std::vector<matrix<rgb_pixel>> faces;
			std::vector<rectangle> rects = detector(img);
			for (auto face : rects)
			{
				auto shape = sp(img, face);
				matrix<rgb_pixel> face_chip;
				extract_image_chip(img, get_face_chip_details(shape,150,0.25), face_chip);
				faces.push_back(move(face_chip));
				// Also put some boxes on the faces so we can see that the detector is finding
				// them.
				win.add_overlay(face);
			}
            
			if (faces.size() == 0)
			{
				cout << "No faces found in image!" << endl;
				//return 1;
			}
			else
			{
				// This call asks the DNN to convert each face image in faces into a 128D vector.
				// In this 128D vector space, images from the same person will be close to each other
				// but vectors from different people will be far apart.  So we can use these vectors to
				// identify if a pair of images are from the same person or from different people.  
				std::vector<matrix<float,0,1>> face_descriptors = net(faces);

				// In particular, one simple thing we can do is face clustering.  This next bit of code
				// creates a graph of connected faces and then uses the Chinese whispers graph clustering
				// algorithm to identify how many people there are and which faces belong to whom.
				std::vector<sample_pair> edges;
				for (size_t i = 0; i < face_descriptors.size(); ++i)
				{
					for (size_t j = 0; j < template_descriptors.size(); ++j)
					{
						cout<<"i : "<<i<<" j : "<<j<<endl;
						// Faces are connected in the graph if they are close enough.  Here we check if
						// the distance between two face descriptors is less than 0.6, which is the
						// decision threshold the network was trained to use.  Although you can
						// certainly use any other threshold you find useful.
						if (length(face_descriptors[i]-template_descriptors[j]) < 0.5)
						{
							string temp = names[j+j] + "found!!";
							win.add_overlay(dlib::image_window::overlay_rect(rects[i], rgb_pixel(0,255,0),temp));
							cout<< "face no: "<<i<<" similar to : " <<names[j+j]<<endl;
						}
					}
				}

   std::vector<unsigned long> labels;
    const auto num_clusters = chinese_whispers(edges, labels);
    // This will correctly indicate that there are 4 people in the image.
    cout << "number of people found in the image: "<< num_clusters << endl;

// Now let's display the face clustering results on the screen.  You will see that it
    // correctly grouped all the faces. 
    std::vector<image_window> win_clusters(num_clusters);
    for (size_t cluster_id = 0; cluster_id < num_clusters; ++cluster_id)
    {
        std::vector<matrix<rgb_pixel>> temp;
        for (size_t j = 0; j < labels.size(); ++j)
        {
            if (cluster_id == labels[j])
                temp.push_back(faces[j]);
        }
        win_clusters[cluster_id].set_title("face cluster " + cast_to_string(cluster_id));
        win_clusters[cluster_id].set_image(tile_images(temp));
    }

				// Finally, let's print one of the face descriptors to the screen.  
				cout << "face descriptor for one face: " << trans(face_descriptors[0]) << endl;

				// It should also be noted that face recognition accuracy can be improved if jittering
				// is used when creating face descriptors.  In particular, to get 99.38% on the LFW
				// benchmark you need to use the jitter_image() routine to compute the descriptors,
				// like so:
				
				//matrix<float,0,1> face_descriptor = mean(mat(net(jitter_image(faces[0]))));
				//cout << "jittered face descriptor for one face: " << trans(face_descriptor) << endl;
				
				// If you use the model without jittering, as we did when clustering the bald guys, it
				// gets an accuracy of 99.13% on the LFW benchmark.  So jittering makes the whole
				// procedure a little more accurate but makes face descriptor calculation slower.
			}

			// Display it all on the screen
            //win.clear_overlay();
            //win.set_image(cimg);
            //win.add_overlay(render_face_detections(shapes));
        }
    }
    catch(serialization_error& e)
    {
        cout << "You need dlib's default face landmarking model file to run this example." << endl;
        cout << "You can get it from the following URL: " << endl;
        cout << "   http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2" << endl;
        cout << endl << e.what() << endl;
    }
    catch(exception& e)
    {
        cout << e.what() << endl;
    }
}

// ----------------------------------------------------------------------------------------

std::vector<matrix<rgb_pixel>> jitter_image(
    const matrix<rgb_pixel>& img
)
{
    // Returns 100 variants of img, each one zoomed, rotated, and translated a little bit
    // differently by a random_cropper.  The cropper is thread_local, so each thread keeps
    // its own instance across calls; its settings are re-applied on every call.
    thread_local random_cropper jitterer;
    jitterer.set_chip_dims(150,150);
    jitterer.set_randomly_flip(true);
    jitterer.set_max_object_size(0.99999);
    jitterer.set_background_crops_fraction(0);
    jitterer.set_min_object_size(0.97);
    jitterer.set_translate_amount(0.02);
    jitterer.set_max_rotation_degrees(3);

    // A single box: the image rectangle shrunk by 3.
    std::vector<mmod_rect> source_boxes(1);
    source_boxes[0] = shrink_rect(get_rect(img),3);

    // Receives the boxes reported by the cropper; the contents are not used here.
    std::vector<mmod_rect> discarded_boxes;

    std::vector<matrix<rgb_pixel>> jittered;
    matrix<rgb_pixel> chip;
    int remaining = 100;
    while (remaining-- > 0)
    {
        jitterer(img, source_boxes, chip, discarded_boxes);
        jittered.push_back(std::move(chip));
    }
    return jittered;
}

But when the same code runs on either a Jetson Xavier or a Jetson Nano, memory usage increases gradually. I have discovered that when I comment out the line

// assign_image(img, dlib::cv_image<bgr_pixel>(temp));

there is no memory leak, but face recognition also stops working. I wonder whether there is a bug or a compatibility issue between dlib and JetPack. My dlib version is 19.7 and my JetPack version is 4.2.

Hi,

Thanks for your feedback.
We are going to check this issue and will update more information soon.

May I know whether the dlib you used was built from source or installed via apt-get?
Thanks.

I have built dlib from source and compiled this program using

$ g++ -std=c++11 -O3 -I.. ../dlib/all/source.cpp -lpthread -lX11 -ljpeg -DDLIB_JPEG_SUPPORT webcam_face_recog.cpp $(pkg-config --cflags --libs opencv)

I have also found that

deserialize("dlib_face_recognition_resnet_model_v1.dat") >> net;
std::vector<matrix<float,0,1>> face_descriptors = net(faces);

causes the leak more specifically. When the net is not called, for example

std::vector<matrix<float,0,1>> face_descriptors;// = net(faces);

then there is no leak. I assume the deserialized dlib model is causing the leak.

Hi xhuv_NV,
did you find a solution for the memory leak?
Thanks.

Later I found that the leak is not directly related to dlib in particular, so I consider this issue resolved.