Strange execution time on Xavier

I found something strange when running code on Xavier. The code is as follows:

#include <iostream>
#include <chrono>
#include <vector>

// Micro-benchmark of a per-pixel arg-max over a planar (channel-major) float
// buffer: for every pixel the channel holding the largest value is found and
// its index is stored in a single-channel map.
//
// The three test_funcN() methods run the same loop nest and differ only in
//   - the image width (768 vs. 800 columns), and
//   - where the channel count comes from (member variable vs. literal 7).
// They are deliberately kept as three separate copies: sharing one helper
// would change what the compiler sees in each case and therefore what is
// being compared.
//
// Intervals are measured with std::chrono::steady_clock, which is monotonic;
// a wall clock can be adjusted while a measurement is running.
//
// NOTE(review): the result map is never read back, so an optimiser is in
// principle allowed to discard part of the work — confirm in the generated
// code before trusting absolute numbers.
class test{
public:
    test() : all_class_num_(7) {}

    // 800 x 768 image, channel count read from the member all_class_num_.
    // Prints "1 COST: <milliseconds> ms. " to std::cout.
    //
    // NOTE(review): the distance between two channel planes is
    // 800 * 768 * sizeof(float) = 2,457,600 bytes = 150 * 16 KiB. If the
    // target's L1 data cache repeats its set mapping every 16 KiB, all seven
    // channel streams would land in the same sets — presumably a factor in
    // the slow timings reported for this case; verify with perf counters.
    void test_func1(){
        const int rows = 800;
        const int cols = 768;
        const int size = rows * cols;
        const int all_channel = all_class_num_;

        // Both vectors are zero-filled by their constructors, so every page is
        // already touched before the timer starts.
        std::vector<float> classify_map1(size);
        std::vector<float> classify_data1(size * all_channel);
        const bench_clock::time_point start = bench_clock::now();
        for (int row = 0; row < rows; ++row)
        {
            int idx = row * cols;
            for (int col = 0; col < cols; ++col, ++idx)
            {
                // Channel 0 seeds the running maximum.
                float maxval = classify_data1[idx];
                int index = 0;

                // Channel c of pixel idx lives at c * size + idx (planar layout).
                for (int c = 1; c < all_channel; ++c)
                {
                    float tmp_val = classify_data1[c * size + idx];
                    if (maxval < tmp_val)
                    {
                        maxval = tmp_val;
                        index = c;
                    }
                }
                // The winning channel index is stored as a float.
                classify_map1[idx] = index;
            }
        }
        std::cout<<"1 COST: "<<elapsed_ms(start)<<" ms. "<<std::endl;
    }

    // 800 x 800 image, channel count read from the member all_class_num_.
    // Prints "2 COST: <milliseconds> ms. " to std::cout.
    //
    // The plane distance here is 800 * 800 * sizeof(float) = 2,560,000 bytes,
    // which is not a multiple of 16 KiB (156.25 * 16 KiB).
    void test_func2(){
        const int rows = 800;
        const int cols = 800;
        const int size = rows * cols;
        const int all_channel = all_class_num_;

        std::vector<float> classify_map1(size);
        std::vector<float> classify_data1(size * all_channel);
        const bench_clock::time_point start = bench_clock::now();
        for (int row = 0; row < rows; ++row)
        {
            int idx = row * cols;
            for (int col = 0; col < cols; ++col, ++idx)
            {
                float maxval = classify_data1[idx];
                int index = 0;

                for (int c = 1; c < all_channel; ++c)
                {
                    float tmp_val = classify_data1[c * size + idx];
                    if (maxval < tmp_val)
                    {
                        maxval = tmp_val;
                        index = c;
                    }
                }
                classify_map1[idx] = index;
            }
        }
        std::cout<<"2 COST: "<<elapsed_ms(start)<<" ms. "<<std::endl;
    }

    // 800 x 768 image like test_func1(), but the channel count is the literal
    // 7 instead of a member read. Prints "3 COST: <milliseconds> ms. ".
    //
    // NOTE(review): with a literal the inner trip count is visible to the
    // compiler — presumably why this variant is optimised differently from
    // test_func1(); compare the disassembly of both to confirm.
    void test_func3(){
        const int rows = 800;
        const int cols = 768;
        const int size = rows * cols;
        const int all_channel = 7;

        std::vector<float> classify_map1(size);
        std::vector<float> classify_data1(size * all_channel);
        const bench_clock::time_point start = bench_clock::now();
        for (int row = 0; row < rows; ++row)
        {
            int idx = row * cols;
            for (int col = 0; col < cols; ++col, ++idx)
            {
                float maxval = classify_data1[idx];
                int index = 0;

                for (int c = 1; c < all_channel; ++c)
                {
                    float tmp_val = classify_data1[c * size + idx];
                    if (maxval < tmp_val)
                    {
                        maxval = tmp_val;
                        index = c;
                    }
                }
                classify_map1[idx] = index;
            }
        }
        std::cout<<"3 COST: "<<elapsed_ms(start)<<" ms. "<<std::endl;
    }

    // Returns the current wall-clock time as seconds since the system_clock
    // epoch. Kept for callers that want a timestamp; the benchmarks above do
    // not use it because the wall clock is not guaranteed to be monotonic.
    double getTime(void) const {
        const auto t = std::chrono::system_clock::now();
        const auto t_sec = std::chrono::duration_cast<std::chrono::duration<double>>(t.time_since_epoch());
        return t_sec.count();
    }

private:
    // Monotonic clock used for all interval measurements.
    using bench_clock = std::chrono::steady_clock;

    // Milliseconds elapsed since `start`, as a double.
    static double elapsed_ms(bench_clock::time_point start) {
        return std::chrono::duration<double, std::milli>(bench_clock::now() - start).count();
    }

    int all_class_num_;
};


int main(){

    int n = 100;
    test t1;
    for (int i = 0; i < n; ++i) {
        t1.test_func1();
        t1.test_func2();
        t1.test_func3();
    }

}

For this code, I found that test 1 costs nearly 30 ms, test 2 just 6 ms, and test 3 8 ms, but they all have the same logic and nothing is substantially different. I tested on two more Xaviers and the result is the same.

I can’t answer, but it would be worth seeing if the first test simply takes longer. What happens if you change main() as follows:

int main(){

    int n = 100;
    test t1;
    // Run t1 test once and do nothing with it. Perhaps cache or other
    // temporary changes matter in terms of performance.
    t1.test_func1();
    for (int i = 0; i < n; ++i) {
        t1.test_func1();
        t1.test_func2();
        t1.test_func3();
        // Run all tests a second time, see if func1() changes due
        // to temporary effects.
        t1.test_func1();
        t1.test_func2();
        t1.test_func3();
    }

}
1 COST: 66.4811 ms. 
1 COST: 50.1866 ms. 
2 COST: 12.2271 ms. 
3 COST: 12.8722 ms. 
1 COST: 48.3041 ms. 
2 COST: 12.5794 ms. 
3 COST: 13.4814 ms. 
1 COST: 53.2944 ms. 
2 COST: 8.66556 ms. 
3 COST: 6.001 ms. 
1 COST: 53.5984 ms. 
2 COST: 8.51417 ms. 
3 COST: 5.98359 ms. 
1 COST: 57.905 ms. 
2 COST: 9.15194 ms. 
3 COST: 6.89697 ms. 
1 COST: 53.7579 ms. 
2 COST: 8.74329 ms. 
3 COST: 5.9824 ms. 
1 COST: 52.9311 ms. 
2 COST: 8.82268 ms. 
3 COST: 6.04916 ms. 
1 COST: 52.8085 ms. 
2 COST: 8.6503 ms. 
3 COST: 7.48086 ms. 
1 COST: 53.8533 ms. 
2 COST: 8.48484 ms. 
3 COST: 6.26445 ms. 
1 COST: 55.4714 ms. 
2 COST: 8.40855 ms. 
3 COST: 6.22916 ms. 
1 COST: 53.1902 ms. 
2 COST: 8.40759 ms. 
3 COST: 6.02627 ms. 
1 COST: 53.3967 ms. 
2 COST: 8.41641 ms. 
3 COST: 6.17576 ms. 
1 COST: 52.5427 ms. 
2 COST: 8.38852 ms. 
3 COST: 5.92113 ms. 
1 COST: 52.5582 ms. 
2 COST: 8.44216 ms. 
3 COST: 5.84698 ms. 
1 COST: 52.8677 ms. 
2 COST: 8.40569 ms. 
3 COST: 5.90205 ms. 
1 COST: 53.0884 ms. 
2 COST: 9.12142 ms. 
3 COST: 6.75368 ms. 
1 COST: 52.8762 ms. 
2 COST: 10.7553 ms. 
3 COST: 5.8949 ms. 
1 COST: 53.1464 ms. 
2 COST: 9.59063 ms. 
3 COST: 5.89299 ms. 
1 COST: 52.6597 ms. 
2 COST: 8.37326 ms. 
3 COST: 5.74017 ms. 
1 COST: 52.7501 ms. 
2 COST: 8.37874 ms. 
3 COST: 5.79929 ms. 
1 COST: 52.9511 ms. 
2 COST: 8.38208 ms. 
3 COST: 5.85079 ms. 
1 COST: 53.1948 ms. 
2 COST: 8.47721 ms. 
3 COST: 5.85604 ms. 
1 COST: 52.5942 ms. 
2 COST: 8.38614 ms. 
3 COST: 7.2279 ms. 
1 COST: 53.1187 ms. 
2 COST: 8.41212 ms. 
3 COST: 5.79214 ms. 
1 COST: 52.7303 ms. 
2 COST: 9.55915 ms. 
3 COST: 7.23958 ms. 
1 COST: 52.8436 ms. 
2 COST: 9.49407 ms. 
3 COST: 8.7235 ms. 
1 COST: 52.8786 ms. 
2 COST: 8.39949 ms. 
3 COST: 5.74756 ms. 
1 COST: 53.339 ms. 
2 COST: 8.35443 ms. 
3 COST: 5.89776 ms. 
1 COST: 55.6509 ms. 
2 COST: 8.42857 ms. 
3 COST: 6.14238 ms. 
1 COST: 53.2033 ms. 
2 COST: 8.32772 ms. 
3 COST: 5.85198 ms. 
1 COST: 52.5994 ms. 
2 COST: 8.42953 ms. 
3 COST: 5.71299 ms. 
1 COST: 52.6462 ms. 
2 COST: 8.35061 ms. 
3 COST: 5.86534 ms. 
1 COST: 53.7517 ms. 
2 COST: 9.62758 ms. 
3 COST: 6.67119 ms. 
1 COST: 53.0837 ms. 
2 COST: 9.48787 ms. 
3 COST: 5.84149 ms. 
1 COST: 52.9647 ms. 
2 COST: 8.43382 ms. 
3 COST: 5.89776 ms. 
1 COST: 53.1213 ms. 
2 COST: 8.25548 ms. 
3 COST: 5.68938 ms. 
1 COST: 52.8872 ms. 
2 COST: 8.37636 ms. 
3 COST: 5.759 ms. 
1 COST: 53.1888 ms. 
2 COST: 8.27765 ms. 
3 COST: 7.24173 ms. 
1 COST: 52.8986 ms. 
2 COST: 8.36778 ms. 
3 COST: 5.80263 ms. 
1 COST: 53.1559 ms. 
2 COST: 8.30746 ms. 
3 COST: 5.77164 ms. 
1 COST: 52.438 ms. 
2 COST: 9.54032 ms. 
3 COST: 5.74946 ms. 
1 COST: 52.3658 ms. 
2 COST: 9.51409 ms. 
3 COST: 5.88393 ms. 
1 COST: 51.9323 ms. 
2 COST: 8.39806 ms. 
3 COST: 5.77474 ms. 
1 COST: 51.9047 ms. 
2 COST: 8.26001 ms. 
3 COST: 5.6839 ms. 
1 COST: 51.9321 ms. 
2 COST: 8.28004 ms. 
3 COST: 7.2422 ms. 
1 COST: 51.899 ms. 
2 COST: 8.32891 ms. 
3 COST: 5.83553 ms. 
1 COST: 51.8959 ms. 
2 COST: 8.3065 ms. 
3 COST: 5.68342 ms. 
1 COST: 51.8646 ms. 
2 COST: 8.28266 ms. 
3 COST: 5.80072 ms. 
1 COST: 51.8782 ms. 
2 COST: 9.48906 ms. 
3 COST: 5.75662 ms. 
1 COST: 51.8916 ms. 
2 COST: 9.56011 ms. 
3 COST: 7.28297 ms. 
1 COST: 51.8858 ms. 
2 COST: 8.29315 ms. 
3 COST: 5.78022 ms. 
1 COST: 52.279 ms. 
2 COST: 8.30722 ms. 
3 COST: 5.71036 ms. 
1 COST: 51.9407 ms. 
2 COST: 8.28338 ms. 
3 COST: 5.75376 ms. 
1 COST: 51.9457 ms. 
2 COST: 8.27622 ms. 
3 COST: 5.67579 ms. 
1 COST: 52.3186 ms. 
2 COST: 8.30865 ms. 
3 COST: 6.65617 ms. 
1 COST: 52.2528 ms. 
2 COST: 8.33035 ms. 
3 COST: 5.8527 ms. 
1 COST: 52.0058 ms. 
2 COST: 9.57942 ms. 
3 COST: 5.85175 ms. 
1 COST: 51.8904 ms. 
2 COST: 9.50265 ms. 
3 COST: 5.78284 ms. 
1 COST: 52.3548 ms. 
2 COST: 8.31699 ms. 
3 COST: 5.82933 ms. 
1 COST: 52.5563 ms. 
2 COST: 8.3158 ms. 
3 COST: 6.04534 ms. 
1 COST: 51.9121 ms. 
2 COST: 8.32176 ms. 
3 COST: 5.76496 ms. 
1 COST: 52.4931 ms. 
2 COST: 8.33321 ms. 
3 COST: 5.84412 ms. 
1 COST: 52.4418 ms. 
2 COST: 8.33488 ms. 
3 COST: 5.8105 ms. 
1 COST: 52.1014 ms. 
2 COST: 8.32391 ms. 
3 COST: 7.89189 ms. 
1 COST: 52.2532 ms. 
2 COST: 8.32272 ms. 
3 COST: 5.73778 ms. 
1 COST: 54.4202 ms. 
2 COST: 9.57441 ms. 
3 COST: 5.83339 ms. 
1 COST: 51.8193 ms. 
2 COST: 9.51672 ms. 
3 COST: 7.24316 ms. 
1 COST: 51.8064 ms. 
2 COST: 8.3437 ms. 
3 COST: 5.85055 ms. 
1 COST: 51.8951 ms. 
2 COST: 8.31938 ms. 
3 COST: 5.75209 ms. 
1 COST: 52.2833 ms. 
2 COST: 8.2581 ms. 
3 COST: 5.69248 ms. 
1 COST: 51.816 ms. 
2 COST: 8.27169 ms. 
3 COST: 5.71036 ms. 
1 COST: 51.8861 ms. 
2 COST: 8.28362 ms. 
3 COST: 5.69344 ms. 
1 COST: 51.8365 ms. 
2 COST: 8.24976 ms. 
3 COST: 5.62072 ms. 
1 COST: 52.2089 ms. 
2 COST: 9.45187 ms. 
3 COST: 5.71132 ms. 
1 COST: 51.9099 ms. 
2 COST: 9.48572 ms. 
3 COST: 5.78308 ms. 
1 COST: 52.0639 ms. 
2 COST: 8.30317 ms. 
3 COST: 5.70703 ms. 
1 COST: 51.8997 ms. 
2 COST: 8.31723 ms. 
3 COST: 5.73754 ms. 
1 COST: 51.5256 ms. 
2 COST: 8.32963 ms. 
3 COST: 7.22718 ms. 
1 COST: 51.856 ms. 
2 COST: 8.32605 ms. 
3 COST: 7.18951 ms. 
1 COST: 52.078 ms. 
2 COST: 8.30936 ms. 
3 COST: 5.80311 ms. 
1 COST: 54.4484 ms. 
2 COST: 8.322 ms. 
3 COST: 5.79906 ms. 
1 COST: 51.8372 ms. 
2 COST: 9.47237 ms. 
3 COST: 5.86414 ms. 
1 COST: 52.1023 ms. 
2 COST: 9.5129 ms. 
3 COST: 5.84126 ms. 
1 COST: 52.31 ms. 
2 COST: 8.29911 ms. 
3 COST: 5.81884 ms. 
1 COST: 52.0194 ms. 
2 COST: 8.32939 ms. 
3 COST: 6.57415 ms. 
1 COST: 53.3748 ms. 
2 COST: 8.34322 ms. 
3 COST: 5.82933 ms. 
1 COST: 52.0277 ms. 
2 COST: 8.32152 ms. 
3 COST: 5.71465 ms. 
1 COST: 54.5907 ms. 
2 COST: 8.28052 ms. 
3 COST: 5.93495 ms. 
1 COST: 51.8346 ms. 
2 COST: 8.30984 ms. 
3 COST: 6.47449 ms. 
1 COST: 51.95 ms. 
2 COST: 9.32479 ms. 
3 COST: 5.78594 ms. 
1 COST: 52.1998 ms. 
2 COST: 9.46569 ms. 
3 COST: 5.80549 ms. 
1 COST: 52.0458 ms. 
2 COST: 8.33154 ms. 
3 COST: 5.76806 ms. 
1 COST: 52.3317 ms. 
2 COST: 8.32295 ms. 
3 COST: 5.73969 ms. 
1 COST: 52.0184 ms. 
2 COST: 8.33535 ms. 
3 COST: 5.87177 ms. 
1 COST: 51.8813 ms. 
2 COST: 8.31628 ms. 
3 COST: 5.79739 ms. 
1 COST: 51.9943 ms. 
2 COST: 9.40084 ms. 
3 COST: 5.75161 ms. 
1 COST: 51.8513 ms. 
2 COST: 8.32152 ms. 
3 COST: 5.77188 ms. 
1 COST: 52.0039 ms. 
2 COST: 8.30579 ms. 
3 COST: 5.79691 ms. 
1 COST: 52.285 ms. 
2 COST: 8.31175 ms. 
3 COST: 5.81026 ms. 
1 COST: 52.0294 ms. 
2 COST: 8.30579 ms. 
3 COST: 5.78475 ms. 
1 COST: 51.7836 ms. 
2 COST: 8.28218 ms. 
3 COST: 7.09414 ms. 
1 COST: 52.0051 ms. 
2 COST: 8.34322 ms. 
3 COST: 7.43747 ms. 
1 COST: 52.1696 ms. 
2 COST: 8.358 ms. 
3 COST: 5.83959 ms. 
1 COST: 52.0515 ms. 
2 COST: 8.35586 ms. 
3 COST: 5.81908 ms. 
1 COST: 51.9576 ms. 
2 COST: 8.29482 ms. 
3 COST: 5.69129 ms. 
1 COST: 51.9178 ms. 
2 COST: 8.26836 ms. 
3 COST: 5.74446 ms. 
1 COST: 51.9662 ms. 
2 COST: 8.27837 ms. 
3 COST: 5.69105 ms. 
1 COST: 51.8963 ms. 
2 COST: 9.31072 ms. 
3 COST: 5.73277 ms. 
1 COST: 52.0017 ms. 
2 COST: 8.28934 ms. 
3 COST: 5.71227 ms. 
1 COST: 52.0656 ms. 
2 COST: 8.33654 ms. 
3 COST: 5.85842 ms. 
1 COST: 54.755 ms. 
2 COST: 8.33917 ms. 
3 COST: 6.63614 ms. 
1 COST: 52.1076 ms. 
2 COST: 8.31485 ms. 
3 COST: 5.80287 ms. 
1 COST: 52.0229 ms. 
2 COST: 8.34417 ms. 
3 COST: 5.83029 ms. 
1 COST: 51.8756 ms. 
2 COST: 8.31509 ms. 
3 COST: 5.97906 ms. 
1 COST: 52.0275 ms. 
2 COST: 8.31175 ms. 
3 COST: 5.88369 ms. 
1 COST: 51.7797 ms. 
2 COST: 8.31342 ms. 
3 COST: 6.03294 ms. 
1 COST: 51.9655 ms. 
2 COST: 8.30173 ms. 
3 COST: 5.89156 ms. 
1 COST: 51.9745 ms. 
2 COST: 8.31628 ms. 
3 COST: 5.7981 ms. 
1 COST: 52.0422 ms. 
2 COST: 8.35037 ms. 
3 COST: 5.85604 ms. 
1 COST: 51.5277 ms. 
2 COST: 8.32462 ms. 
3 COST: 5.82457 ms. 
1 COST: 52.7437 ms. 
2 COST: 8.31604 ms. 
3 COST: 5.83172 ms. 
1 COST: 52.0449 ms. 
2 COST: 8.27432 ms. 
3 COST: 5.7323 ms. 
1 COST: 52.0289 ms. 
2 COST: 8.26502 ms. 
3 COST: 7.09486 ms. 
1 COST: 52.104 ms. 
2 COST: 8.32748 ms. 
3 COST: 5.79023 ms. 
1 COST: 52.074 ms. 
2 COST: 8.33273 ms. 
3 COST: 5.76758 ms. 
1 COST: 54.2047 ms. 
2 COST: 8.28576 ms. 
3 COST: 7.22432 ms. 
1 COST: 51.9505 ms. 
2 COST: 8.3189 ms. 
3 COST: 5.83053 ms. 
1 COST: 51.8956 ms. 
2 COST: 8.29148 ms. 
3 COST: 5.74851 ms. 
1 COST: 51.9142 ms. 
2 COST: 8.29458 ms. 
3 COST: 5.66792 ms. 
1 COST: 51.847 ms. 
2 COST: 8.28338 ms. 
3 COST: 5.7025 ms. 
1 COST: 51.8296 ms. 
2 COST: 8.27718 ms. 
3 COST: 5.69272 ms. 
1 COST: 51.9016 ms. 
2 COST: 8.27479 ms. 
3 COST: 5.65386 ms. 
1 COST: 51.9185 ms. 
2 COST: 8.26335 ms. 
3 COST: 5.77188 ms. 
1 COST: 51.9371 ms. 
2 COST: 8.28075 ms. 
3 COST: 5.71346 ms. 
1 COST: 51.9831 ms. 
2 COST: 8.25953 ms. 
3 COST: 5.7137 ms. 
1 COST: 52.0558 ms. 
2 COST: 8.28886 ms. 
3 COST: 7.19762 ms. 
1 COST: 52.053 ms. 
2 COST: 8.31509 ms. 
3 COST: 5.83529 ms. 
1 COST: 54.1608 ms. 
2 COST: 8.41403 ms. 
3 COST: 5.72491 ms. 
1 COST: 52.0711 ms. 
2 COST: 8.2922 ms. 
3 COST: 5.70416 ms. 
1 COST: 52.2485 ms. 
2 COST: 9.48596 ms. 
3 COST: 6.24967 ms. 
1 COST: 52.4774 ms. 
2 COST: 9.56631 ms. 
3 COST: 5.98931 ms. 
1 COST: 52.0403 ms. 
2 COST: 8.31556 ms. 
3 COST: 5.80239 ms. 
1 COST: 56.8178 ms. 
2 COST: 8.35848 ms. 
3 COST: 5.71895 ms. 
1 COST: 53.1688 ms. 
2 COST: 8.63695 ms. 
3 COST: 6.94704 ms. 
1 COST: 55.8786 ms. 
2 COST: 8.27575 ms. 
3 COST: 5.78189 ms. 
1 COST: 52.0422 ms. 
2 COST: 8.24714 ms. 
3 COST: 5.7025 ms. 
1 COST: 54.0986 ms. 
2 COST: 8.34036 ms. 
3 COST: 5.74303 ms. 
1 COST: 56.3061 ms. 
2 COST: 9.27758 ms. 
3 COST: 7.37405 ms. 
1 COST: 54.4386 ms. 
2 COST: 9.94325 ms. 
3 COST: 5.76735 ms. 
1 COST: 52.0685 ms. 
2 COST: 8.32248 ms. 
3 COST: 5.87821 ms. 
1 COST: 52.305 ms. 
2 COST: 8.64911 ms. 
3 COST: 6.58703 ms. 
1 COST: 52.5908 ms. 
2 COST: 8.26693 ms. 
3 COST: 5.75542 ms. 
1 COST: 55.2542 ms. 
2 COST: 8.92997 ms. 
3 COST: 6.60276 ms. 
1 COST: 51.9495 ms. 
2 COST: 8.32009 ms. 
3 COST: 5.84459 ms. 
1 COST: 52.4678 ms. 
2 COST: 8.33082 ms. 
3 COST: 5.80215 ms. 
1 COST: 51.9686 ms. 
2 COST: 9.5799 ms. 
3 COST: 5.75805 ms. 
1 COST: 52.12 ms. 
2 COST: 8.92663 ms. 
3 COST: 5.67055 ms. 
1 COST: 51.9292 ms. 
2 COST: 8.32701 ms. 
3 COST: 5.82504 ms. 
1 COST: 52.5486 ms. 
2 COST: 8.35085 ms. 
3 COST: 5.72705 ms. 
1 COST: 51.9519 ms. 
2 COST: 8.322 ms. 
3 COST: 5.82147 ms. 
1 COST: 52.4158 ms. 
2 COST: 8.32748 ms. 
3 COST: 5.75376 ms. 
1 COST: 51.9948 ms. 
2 COST: 8.33321 ms. 
3 COST: 6.32024 ms. 
1 COST: 52.2554 ms. 
2 COST: 8.8737 ms. 
3 COST: 5.77569 ms. 
1 COST: 51.9464 ms. 
2 COST: 9.60064 ms. 
3 COST: 5.74136 ms. 
1 COST: 51.9869 ms. 
2 COST: 8.31962 ms. 
3 COST: 5.86367 ms. 
1 COST: 52.4952 ms. 
2 COST: 8.65221 ms. 
3 COST: 6.89483 ms. 
1 COST: 52.0208 ms. 
2 COST: 8.27765 ms. 
3 COST: 5.67865 ms. 
1 COST: 52.3565 ms. 
2 COST: 8.31294 ms. 
3 COST: 5.77712 ms. 
1 COST: 52.0391 ms. 
2 COST: 8.32009 ms. 
3 COST: 5.83792 ms. 
1 COST: 54.3962 ms. 
2 COST: 8.33464 ms. 
3 COST: 5.85914 ms. 
1 COST: 52.0165 ms. 
2 COST: 8.82578 ms. 
3 COST: 5.72801 ms. 
1 COST: 51.8732 ms. 
2 COST: 9.57775 ms. 
3 COST: 7.00831 ms. 
1 COST: 53.0245 ms. 
2 COST: 8.7359 ms. 
3 COST: 6.5732 ms. 
1 COST: 52.9208 ms. 
2 COST: 8.31461 ms. 
3 COST: 5.8341 ms. 
1 COST: 54.2641 ms. 
2 COST: 8.32272 ms. 
3 COST: 5.79882 ms. 
1 COST: 56.5708 ms. 
2 COST: 8.3344 ms. 
3 COST: 5.86247 ms. 
1 COST: 54.1701 ms. 
2 COST: 8.33082 ms. 
3 COST: 5.72252 ms. 
1 COST: 52.0413 ms. 
2 COST: 8.34394 ms. 
3 COST: 5.77855 ms. 
1 COST: 52.0153 ms. 
2 COST: 9.73535 ms. 
3 COST: 8.71038 ms. 
1 COST: 52.2449 ms. 
2 COST: 9.45902 ms. 
3 COST: 5.77641 ms. 
1 COST: 54.6386 ms. 
2 COST: 8.32033 ms. 
3 COST: 5.68533 ms. 
1 COST: 52.1328 ms. 
2 COST: 8.27456 ms. 
3 COST: 5.80239 ms. 
1 COST: 52.027 ms. 
2 COST: 8.30221 ms. 
3 COST: 5.6808 ms. 
1 COST: 52.0282 ms. 
2 COST: 8.26073 ms. 
3 COST: 5.80144 ms. 
1 COST: 51.9633 ms. 
2 COST: 8.27432 ms. 
3 COST: 5.68151 ms. 
1 COST: 53.0481 ms. 
2 COST: 8.27241 ms. 
3 COST: 5.73587 ms. 
1 COST: 51.9454 ms. 
2 COST: 9.46617 ms. 
3 COST: 5.75137 ms. 
1 COST: 51.9414 ms. 
2 COST: 9.552 ms. 
3 COST: 5.83959 ms. 
1 COST: 51.8746 ms. 
2 COST: 8.30531 ms. 
3 COST: 5.75733 ms. 
1 COST: 51.868 ms. 
2 COST: 8.255 ms. 
3 COST: 5.66888 ms. 
1 COST: 51.8878 ms. 
2 COST: 8.25858 ms. 
3 COST: 6.84857 ms. 
1 COST: 51.8451 ms. 
2 COST: 8.30936 ms. 
3 COST: 5.74327 ms. 
1 COST: 53.9057 ms. 
2 COST: 8.42357 ms. 
3 COST: 6.93774 ms. 
1 COST: 57.7743 ms. 
2 COST: 8.28052 ms. 
3 COST: 5.73635 ms. 
1 COST: 54.5475 ms. 
2 COST: 8.27694 ms. 
3 COST: 5.66268 ms. 
1 COST: 54.2135 ms. 
2 COST: 9.44233 ms. 
3 COST: 7.47967 ms. 
1 COST: 52.002 ms. 
2 COST: 9.53007 ms. 
3 COST: 5.76997 ms. 
1 COST: 52.1691 ms. 
2 COST: 8.72374 ms. 
3 COST: 6.84357 ms. 
1 COST: 53.8597 ms. 
2 COST: 8.28195 ms. 
3 COST: 6.93583 ms. 
1 COST: 54.0051 ms. 
2 COST: 8.63576 ms. 
3 COST: 6.94346 ms. 

I tested it as suggested, but the pattern does not seem to change: test_func1 still costs much more time.

It seems to be due to a cache effect.
Can you run the same logic as C code?
Only the 768-column size seems to have the problem, and possibly some other sizes a little further away.

Can you convert these into C?

std::vector<float> classify_map1(size);
std::vector<float> classify_data1(size * all_channel);

Um… I am really not good at writing pure C code; my code follows. I am not sure the code is right.

#include <cstdlib>
#include <cstring>
#include <ctime>
#include <iostream>
//#include <chrono>
#include <vector>

#ifdef __cplusplus
extern "C" {
#endif

// "C-style" variant of the per-pixel arg-max benchmark: the buffers come from
// malloc() instead of std::vector and the interval is taken with clock().
//
// For every pixel of a planar (channel-major) float buffer the channel with
// the largest value is found and its index is written to a single-channel
// map. The three test_funcN() methods run the same loop nest and differ only
// in the image width (768 vs. 800 columns) and in where the channel count
// comes from (member variable vs. literal 7). They are intentionally kept as
// three separate copies so that each case is compiled on its own.
//
// Buffers are zero-filled before the timer starts. Raw malloc() memory has
// indeterminate contents, so reading it makes the comparison results
// meaningless, and untouched pages would otherwise be faulted in inside the
// timed region.
//
// NOTE: clock() reports processor time used by the program, not elapsed
// wall-clock time.
class test {
public:
    test() : all_class_num_(7) {}

    // 800 x 768 image, channel count read from the member all_class_num_.
    // Prints "1 COST: <milliseconds> ms. " to std::cout, or an error line to
    // std::cerr when the buffers cannot be allocated.
    void test_func1() {
        const int rows = 800;
        const int cols = 768;
        const int size = rows * cols;
        const int all_channel = all_class_num_;

        float *classify_map1 = alloc_zeroed(static_cast<std::size_t>(size));
        float *classify_data1 = alloc_zeroed(static_cast<std::size_t>(size) * static_cast<std::size_t>(all_channel));
        if (classify_map1 == nullptr || classify_data1 == nullptr) {
            std::cerr << "1 COST: allocation failed" << std::endl;
            release(classify_map1, classify_data1);
            return;
        }

        const std::clock_t start_t = std::clock();
        for (int row = 0; row < rows; ++row) {
            int idx = row * cols;
            for (int col = 0; col < cols; ++col, ++idx) {
                // Channel 0 seeds the running maximum.
                float maxval = classify_data1[idx];
                int index = 0;

                // Channel c of pixel idx lives at c * size + idx (planar layout).
                for (int c = 1; c < all_channel; ++c) {
                    float tmp_val = classify_data1[c * size + idx];
                    if (maxval < tmp_val) {
                        maxval = tmp_val;
                        index = c;
                    }
                }
                // The winning channel index is stored as a float.
                classify_map1[idx] = index;
            }
        }
        const std::clock_t end_t = std::clock();
        const double total_t = static_cast<double>(end_t - start_t) / CLOCKS_PER_SEC;
        std::cout << "1 COST: " << (total_t) * 1000. << " ms. " << std::endl;
        release(classify_map1, classify_data1);
    }

    // 800 x 800 image, channel count read from the member all_class_num_.
    // Prints "2 COST: <milliseconds> ms. " to std::cout, or an error line to
    // std::cerr when the buffers cannot be allocated.
    void test_func2() {
        const int rows = 800;
        const int cols = 800;
        const int size = rows * cols;
        const int all_channel = all_class_num_;

        float *classify_map1 = alloc_zeroed(static_cast<std::size_t>(size));
        float *classify_data1 = alloc_zeroed(static_cast<std::size_t>(size) * static_cast<std::size_t>(all_channel));
        if (classify_map1 == nullptr || classify_data1 == nullptr) {
            std::cerr << "2 COST: allocation failed" << std::endl;
            release(classify_map1, classify_data1);
            return;
        }

        const std::clock_t start_t = std::clock();
        for (int row = 0; row < rows; ++row) {
            int idx = row * cols;
            for (int col = 0; col < cols; ++col, ++idx) {
                float maxval = classify_data1[idx];
                int index = 0;

                for (int c = 1; c < all_channel; ++c) {
                    float tmp_val = classify_data1[c * size + idx];
                    if (maxval < tmp_val) {
                        maxval = tmp_val;
                        index = c;
                    }
                }
                classify_map1[idx] = index;
            }
        }
        const std::clock_t end_t = std::clock();
        const double total_t = static_cast<double>(end_t - start_t) / CLOCKS_PER_SEC;
        std::cout << "2 COST: " << (total_t) * 1000. << " ms. " << std::endl;
        release(classify_map1, classify_data1);
    }

    // 800 x 768 image like test_func1(), but the channel count is the literal
    // 7 instead of a member read. Prints "3 COST: <milliseconds> ms. " to
    // std::cout, or an error line to std::cerr when allocation fails.
    void test_func3() {
        const int rows = 800;
        const int cols = 768;
        const int size = rows * cols;
        const int all_channel = 7;

        float *classify_map1 = alloc_zeroed(static_cast<std::size_t>(size));
        float *classify_data1 = alloc_zeroed(static_cast<std::size_t>(size) * static_cast<std::size_t>(all_channel));
        if (classify_map1 == nullptr || classify_data1 == nullptr) {
            std::cerr << "3 COST: allocation failed" << std::endl;
            release(classify_map1, classify_data1);
            return;
        }

        const std::clock_t start_t = std::clock();
        for (int row = 0; row < rows; ++row) {
            int idx = row * cols;
            for (int col = 0; col < cols; ++col, ++idx) {
                float maxval = classify_data1[idx];
                int index = 0;

                for (int c = 1; c < all_channel; ++c) {
                    float tmp_val = classify_data1[c * size + idx];
                    if (maxval < tmp_val) {
                        maxval = tmp_val;
                        index = c;
                    }
                }
                classify_map1[idx] = index;
            }
        }
        const std::clock_t end_t = std::clock();
        const double total_t = static_cast<double>(end_t - start_t) / CLOCKS_PER_SEC;
        std::cout << "3 COST: " << (total_t) * 1000. << " ms. " << std::endl;
        release(classify_map1, classify_data1);
    }

private:
    // Allocates `count` floats with malloc() and sets every byte to zero.
    // Returns nullptr when the allocation fails. The memset also touches
    // every page, so no first-touch page faults occur inside a timed loop.
    static float *alloc_zeroed(std::size_t count) {
        float *buffer = static_cast<float *>(std::malloc(count * sizeof(float)));
        if (buffer != nullptr) {
            std::memset(buffer, 0, count * sizeof(float));
        }
        return buffer;
    }

    // Frees both benchmark buffers; either pointer may be nullptr.
    static void release(float *classify_map, float *classify_data) {
        std::free(classify_data);
        std::free(classify_map);
    }

    int all_class_num_;
};

#ifdef __cplusplus
}
#endif



int main(){

    int n = 100;
    test t1;
    for (int i = 0; i < n; ++i) {
        t1.test_func1();
        t1.test_func2();
        t1.test_func3();
    }

}

The result is as follows:

1 COST: 47.62 ms. 
2 COST: 30.662 ms. 
3 COST: 13.97 ms. 
1 COST: 43.357 ms. 
2 COST: 24.241 ms. 
3 COST: 10.533 ms. 
1 COST: 35.24 ms. 
2 COST: 24.531 ms. 
3 COST: 6.911 ms. 
1 COST: 34.539 ms. 
2 COST: 25.47 ms. 
3 COST: 7.358 ms. 
1 COST: 34.202 ms. 
2 COST: 23.208 ms. 
3 COST: 8.705 ms. 
1 COST: 39.11 ms. 
2 COST: 24.129 ms. 
3 COST: 11.775 ms. 
1 COST: 34.349 ms. 
2 COST: 23.57 ms. 
3 COST: 11.659 ms. 
1 COST: 33.455 ms. 
2 COST: 23.568 ms. 
3 COST: 11.124 ms. 
1 COST: 32.893 ms. 
2 COST: 23.574 ms. 
3 COST: 11.001 ms. 
1 COST: 34.406 ms. 
2 COST: 23.223 ms. 
3 COST: 10.94 ms. 
1 COST: 39.74 ms. 
2 COST: 23.536 ms. 
3 COST: 10.89 ms. 
1 COST: 34.493 ms. 
2 COST: 23.953 ms. 
3 COST: 11.034 ms. 
1 COST: 34.237 ms. 
2 COST: 25 ms. 
3 COST: 10.906 ms. 
1 COST: 32.422 ms. 
2 COST: 22.99 ms. 
3 COST: 10.875 ms. 
1 COST: 35.325 ms. 
2 COST: 23.617 ms. 
3 COST: 10.91 ms. 
1 COST: 39.816 ms. 
2 COST: 23.251 ms. 
3 COST: 10.883 ms. 
1 COST: 34.38 ms. 
2 COST: 23.176 ms. 
3 COST: 10.929 ms. 
1 COST: 33.5 ms. 
2 COST: 23.253 ms. 
3 COST: 11.019 ms. 
1 COST: 34.472 ms. 
2 COST: 23.216 ms. 
3 COST: 11.04 ms. 
1 COST: 35.34 ms. 
2 COST: 23.249 ms. 
3 COST: 10.991 ms. 
1 COST: 39.694 ms. 
2 COST: 23.538 ms. 
3 COST: 10.926 ms. 
1 COST: 34.398 ms. 
2 COST: 24.963 ms. 
3 COST: 11.029 ms. 
1 COST: 33.44 ms. 
2 COST: 23.278 ms. 
3 COST: 10.94 ms. 
1 COST: 32.432 ms. 
2 COST: 23.408 ms. 
3 COST: 11.715 ms. 
1 COST: 33.866 ms. 
2 COST: 23.236 ms. 
3 COST: 13.561 ms. 
1 COST: 39.799 ms. 
2 COST: 23.53 ms. 
3 COST: 11.023 ms. 
1 COST: 32.502 ms. 
2 COST: 23.245 ms. 
3 COST: 11.607 ms. 
1 COST: 32.48 ms. 
2 COST: 24.874 ms. 
3 COST: 11.098 ms. 
1 COST: 32.448 ms. 
2 COST: 23.396 ms. 
3 COST: 11.039 ms. 
1 COST: 33.73 ms. 
2 COST: 23.366 ms. 
3 COST: 11.069 ms. 
1 COST: 39.76 ms. 
2 COST: 23.562 ms. 
3 COST: 10.945 ms. 
1 COST: 32.49 ms. 
2 COST: 23.514 ms. 
3 COST: 10.965 ms. 
1 COST: 32.442 ms. 
2 COST: 23.602 ms. 
3 COST: 10.962 ms. 
1 COST: 32.514 ms. 
2 COST: 22.907 ms. 
3 COST: 11.711 ms. 
1 COST: 35.158 ms. 
2 COST: 23.004 ms. 
3 COST: 11.059 ms. 
1 COST: 39.656 ms. 
2 COST: 23.044 ms. 
3 COST: 10.96 ms. 
1 COST: 32.436 ms. 
2 COST: 23.288 ms. 
3 COST: 11.158 ms. 
1 COST: 32.562 ms. 
2 COST: 23.2 ms. 
3 COST: 11.55 ms. 
1 COST: 32.451 ms. 
2 COST: 23.181 ms. 
3 COST: 11.163 ms. 
1 COST: 34.676 ms. 
2 COST: 23.127 ms. 
3 COST: 10.984 ms. 
1 COST: 41.249 ms. 
2 COST: 23.264 ms. 
3 COST: 10.962 ms. 
1 COST: 30.244 ms. 
2 COST: 23.096 ms. 
3 COST: 10.888 ms. 
1 COST: 34.407 ms. 
2 COST: 23.076 ms. 
3 COST: 13.364 ms. 
1 COST: 33.402 ms. 
2 COST: 23.421 ms. 
3 COST: 16.866 ms. 
1 COST: 33.39 ms. 
2 COST: 23.113 ms. 
3 COST: 13.201 ms. 
1 COST: 32.806 ms. 
2 COST: 23.095 ms. 
3 COST: 10.92 ms. 
1 COST: 36.561 ms. 
2 COST: 23.093 ms. 
3 COST: 10.883 ms. 
1 COST: 32.434 ms. 
2 COST: 23.231 ms. 
3 COST: 11.032 ms. 
1 COST: 32.411 ms. 
2 COST: 23.206 ms. 
3 COST: 10.892 ms. 
1 COST: 33.384 ms. 
2 COST: 23.351 ms. 
3 COST: 10.945 ms. 
1 COST: 40.994 ms. 
2 COST: 23.068 ms. 
3 COST: 11.002 ms. 
1 COST: 39.986 ms. 
2 COST: 23.066 ms. 
3 COST: 10.913 ms. 
1 COST: 34.357 ms. 
2 COST: 23.269 ms. 
3 COST: 10.89 ms. 
1 COST: 33.388 ms. 
2 COST: 23.116 ms. 
3 COST: 10.922 ms. 
1 COST: 33.441 ms. 
2 COST: 23.1 ms. 
3 COST: 10.929 ms. 
1 COST: 32.67 ms. 
2 COST: 23.733 ms. 
3 COST: 10.908 ms. 
1 COST: 33.429 ms. 
2 COST: 23.618 ms. 
3 COST: 10.924 ms. 
1 COST: 35.127 ms. 
2 COST: 24.337 ms. 
3 COST: 10.972 ms. 
1 COST: 35.267 ms. 
2 COST: 23.26 ms. 
3 COST: 10.908 ms. 
1 COST: 34.458 ms. 
2 COST: 23.345 ms. 
3 COST: 11.002 ms. 
1 COST: 34.388 ms. 
2 COST: 23.563 ms. 
3 COST: 10.969 ms. 
1 COST: 38.737 ms. 
2 COST: 23.61 ms. 
3 COST: 10.941 ms. 
1 COST: 33.411 ms. 
2 COST: 23.105 ms. 
3 COST: 10.926 ms. 
1 COST: 33.396 ms. 
2 COST: 23.399 ms. 
3 COST: 10.939 ms. 
1 COST: 32.423 ms. 
2 COST: 23.492 ms. 
3 COST: 10.935 ms. 
1 COST: 32.435 ms. 
2 COST: 23.483 ms. 
3 COST: 10.92 ms. 
1 COST: 40.364 ms. 
2 COST: 23.545 ms. 
3 COST: 11.003 ms. 
1 COST: 39.734 ms. 
2 COST: 23.016 ms. 
3 COST: 10.963 ms. 
1 COST: 34.412 ms. 
2 COST: 23.576 ms. 
3 COST: 10.951 ms. 
1 COST: 32.429 ms. 
2 COST: 23.172 ms. 
3 COST: 13.172 ms. 
1 COST: 32.457 ms. 
2 COST: 23.067 ms. 
3 COST: 11.315 ms. 
1 COST: 32.742 ms. 
2 COST: 23.204 ms. 
3 COST: 11.268 ms. 
1 COST: 32.45 ms. 
2 COST: 23.2 ms. 
3 COST: 11.06 ms. 
1 COST: 32.419 ms. 
2 COST: 23.315 ms. 
3 COST: 10.901 ms. 
1 COST: 32.441 ms. 
2 COST: 23.206 ms. 
3 COST: 10.884 ms. 
1 COST: 39.842 ms. 
2 COST: 23.168 ms. 
3 COST: 10.975 ms. 
1 COST: 39.78 ms. 
2 COST: 23.093 ms. 
3 COST: 10.95 ms. 
1 COST: 32.376 ms. 
2 COST: 23.177 ms. 
3 COST: 10.901 ms. 
1 COST: 32.434 ms. 
2 COST: 23.201 ms. 
3 COST: 11.004 ms. 
1 COST: 32.41 ms. 
2 COST: 23.16 ms. 
3 COST: 10.902 ms. 
1 COST: 37.799 ms. 
2 COST: 23.042 ms. 
3 COST: 10.894 ms. 
1 COST: 30.233 ms. 
2 COST: 23.215 ms. 
3 COST: 13.335 ms. 
1 COST: 32.357 ms. 
2 COST: 23.14 ms. 
3 COST: 13.377 ms. 
1 COST: 32.426 ms. 
2 COST: 23.317 ms. 
3 COST: 10.904 ms. 
1 COST: 32.411 ms. 
2 COST: 23.138 ms. 
3 COST: 10.869 ms. 
1 COST: 34.096 ms. 
2 COST: 23.186 ms. 
3 COST: 10.987 ms. 
1 COST: 39.769 ms. 
2 COST: 23.211 ms. 
3 COST: 10.899 ms. 
1 COST: 33.432 ms. 
2 COST: 23.09 ms. 
3 COST: 10.947 ms. 
1 COST: 34.335 ms. 
2 COST: 23.199 ms. 
3 COST: 10.943 ms. 
1 COST: 32.924 ms. 
2 COST: 23.155 ms. 
3 COST: 10.929 ms. 
1 COST: 33.439 ms. 
2 COST: 23.272 ms. 
3 COST: 10.925 ms. 
1 COST: 36.702 ms. 
2 COST: 23.137 ms. 
3 COST: 10.968 ms. 
1 COST: 40.058 ms. 
2 COST: 23.249 ms. 
3 COST: 10.941 ms. 
1 COST: 34.431 ms. 
2 COST: 23.152 ms. 
3 COST: 10.895 ms. 
1 COST: 33.359 ms. 
2 COST: 23.18 ms. 
3 COST: 10.971 ms. 
1 COST: 33.373 ms. 
2 COST: 23.216 ms. 
3 COST: 10.924 ms. 
1 COST: 34.331 ms. 
2 COST: 23.329 ms. 
3 COST: 10.932 ms. 
1 COST: 35.397 ms. 
2 COST: 23.272 ms. 
3 COST: 10.999 ms. 
1 COST: 35.203 ms. 
2 COST: 23.256 ms. 
3 COST: 11.29 ms. 
1 COST: 32.645 ms. 
2 COST: 23.352 ms. 
3 COST: 13.246 ms. 

It still seems to look strange…