Uploading OpenCV Mat is too slow

Hi.
I'm now working on a Jetson TX1 with CUDA 8.0.
I compiled OpenCV 3.1 with CUDA support and the build looks successful.

However, when I tried some samples, I found the CUDA computation pretty slow, mostly because of uploading images.

For example, I tried SURF feature extraction and matching.
The results are below:

upLoad = 39.9022

Device 0:  "NVIDIA Tegra X1"  3995Mb, sm_53, Driver/Runtime ver.8.0/8.0
FOUND 158 keypoints on first image
FOUND 137 keypoints on second image

Findcuda = 0.000123487  Extraction = 0.0952315
Matching = 0.00152424 Download = 0.00137919

As you can see, uploading the two images took about 40 seconds!
Are there any solutions? Thank you.

Here is my code:

#include <iostream>

#include "opencv2/opencv_modules.hpp"

#ifdef HAVE_OPENCV_XFEATURES2D

#include "opencv2/core.hpp"
#include "opencv2/features2d.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/cudafeatures2d.hpp"
#include "opencv2/xfeatures2d/cuda.hpp"

using namespace std;
using namespace cv;
using namespace cv::cuda;

static void help()
{
    cout << "\nThis program demonstrates using SURF_CUDA features detector, descriptor extractor and BruteForceMatcher_CUDA" << endl;
    cout << "\nUsage:\n\tsurf_keypoint_matcher --left <image1> --right <image2>" << endl;
}

int main(int argc, char* argv[])
{
    if (argc != 5)
    {
        help();
        return -1;
    }

    GpuMat img1, img2;
    cv::Mat raw1, raw2;
    raw1 = imread(argv[2], IMREAD_GRAYSCALE);
    raw2 = imread(argv[4], IMREAD_GRAYSCALE);
    int64 t0 = cv::getTickCount();   // start of upload timing
    for (int i = 1; i < argc; ++i)
    {
        if (string(argv[i]) == "--left")
        {
            img1.upload(raw1);
            CV_Assert(!img1.empty());
        }
        else if (string(argv[i]) == "--right")
        {
            img2.upload(raw2);
            CV_Assert(!img2.empty());
        }
        else if (string(argv[i]) == "--help")
        {
            help();
            return -1;
        }
    }
    
    int64 t1 = cv::getTickCount();   // end of upload timing
    cout << " upLoad = " << (t1-t0)/cv::getTickFrequency() << endl;

    cv::cuda::printShortCudaDeviceInfo(cv::cuda::getDevice());
    
    int64 t2 = cv::getTickCount();   // start of detection/extraction timing
    SURF_CUDA surf;
    
    // detecting keypoints & computing descriptors
    GpuMat keypoints1GPU, keypoints2GPU;
    GpuMat descriptors1GPU, descriptors2GPU;
    surf(img1, GpuMat(), keypoints1GPU, descriptors1GPU);
    surf(img2, GpuMat(), keypoints2GPU, descriptors2GPU);
    
    int64 t3 = cv::getTickCount();   // end of detection/extraction timing
    cout << "FOUND " << keypoints1GPU.cols << " keypoints on first image" << endl;
    cout << "FOUND " << keypoints2GPU.cols << " keypoints on second image" << endl;

        cout << " Findcuda = " << (t2-t1)/cv::getTickFrequency() << " Extraction = " << (t3-t2)/cv::getTickFrequency() << endl;
        
    // matching descriptors
    Ptr<cv::cuda::DescriptorMatcher> matcher = cv::cuda::DescriptorMatcher::createBFMatcher(surf.defaultNorm());
    vector<DMatch> matches;
    matcher->match(descriptors1GPU, descriptors2GPU, matches);
    int64 t4 = cv::getTickCount();   // end of matching timing

    // downloading results
    vector<KeyPoint> keypoints1, keypoints2;
    vector<float> descriptors1, descriptors2;
    surf.downloadKeypoints(keypoints1GPU, keypoints1);
    surf.downloadKeypoints(keypoints2GPU, keypoints2);
    surf.downloadDescriptors(descriptors1GPU, descriptors1);
    surf.downloadDescriptors(descriptors2GPU, descriptors2);
    int64 t5 = cv::getTickCount();   // end of download timing

    cout <<  " Matching = " << (t4-t3)/cv::getTickFrequency() <<  " Download = " << (t5-t4)/cv::getTickFrequency() << endl;
    
    // drawing the results
    Mat img_matches;
    drawMatches(Mat(img1), keypoints1, Mat(img2), keypoints2, matches, img_matches);

    namedWindow("matches", 0);
    imshow("matches", img_matches);
    waitKey(0);

    return 0;
}

#else

int main()
{
    std::cerr << "OpenCV was built without xfeatures2d module" << std::endl;
    return 0;
}

#endif

Where is the source data for the images coming from? System memory or mass storage? What is the size, in MByte, of each image?
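
If it helps to check the second question, one way to report the in-memory size of a cv::Mat (a small sketch; printSizeMB is just an illustrative helper name) is:

#include <iostream>
#include "opencv2/core.hpp"

// Report the in-memory footprint of a cv::Mat in MB
// (element count times bytes per element).
static void printSizeMB(const char* name, const cv::Mat& m)
{
    double mb = m.total() * m.elemSize() / (1024.0 * 1024.0);
    std::cout << name << ": " << mb << " MB" << std::endl;
}

// e.g. printSizeMB("image1", raw1); printSizeMB("image2", raw2);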

I am wondering whether OpenCV might have substantial one-time initialization costs that accrue to the very first OpenCV function called. If so, can you force this initialization overhead to occur outside the timed portion of your test by issuing some “dummy” command to OpenCV?
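
For example, something like this called once at the top of main(), before t0, should pull any one-time cost out of the measurement (a rough sketch, untested on the TX1; warmUpCuda and the 16x16 dummy image are just illustrative choices):

#include "opencv2/core.hpp"
#include "opencv2/core/cuda.hpp"

// Warm up CUDA before the timed section: the first CUDA call in a process
// typically pays the one-time context-creation cost.
static void warmUpCuda()
{
    cv::cuda::setDevice(0);                         // create/attach the CUDA context
    cv::cuda::GpuMat dummy;
    dummy.upload(cv::Mat::zeros(16, 16, CV_8UC1));  // throwaway host-to-device copy
}

If the ~40 s then mostly disappears, the cost was initialization rather than the transfer itself.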

BTW, there is a subforum dedicated to the TX1 (https://devtalk.nvidia.com/default/board/164/jetson-tx1/), and the participants there might be able to help you more quickly, since they presumably have hands-on experience with the platform.