inference score isn't correct on TX1

Hi experts,
I’m running inference on a TX1 with an image as input, but the inference scores are always far too small and, I believe, clearly wrong: no matter which model I use or how I preprocess the image, I always get tiny scores.
I’m not sure whether the inference code below is correct. I just modified the demo file giexec.cpp that ships with the TX1.
I have no clues at this point. Could anyone offer some comments?
Thanks a lot!

my HW info:
R28 (release), REVISION: 2.0, GCID: 10567845, BOARD: t210ref, EABI: aarch64, DATE: Fri Mar…
CUDA Version 9.0.252
#define CUDNN_MAJOR 7
nv-tensorrt-repo-ubuntu1604-ga-cuda9.0-trt3.0.4-20180208 1-1

infer.cpp:

#include <assert.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cmath>
#include <sys/stat.h>
#include <time.h>
#include <cuda_runtime_api.h>
#include <algorithm>
#include <chrono>
#include <string.h>
#include <map>
#include <random>
#include <iterator>
#include <vector>


#include "NvInfer.h"
#include "NvCaffeParser.h"
#include "opencv2/opencv.hpp"

//using namespace cv;

using namespace nvinfer1;
using namespace nvcaffeparser1;

#define CHECK(status)									\
{														\
	if (status != 0)									\
	{													\
		std::cout << "Cuda failure: " << status;		\
		abort();										\
	}													\
}

struct Params
{
	std::string deployFile, modelFile, engine, calibrationCache{"CalibrationTable"};
	std::vector<std::string> outputs;
	std::string imageFile;
	int device{ 0 }, batchSize{ 1 }, workspaceSize{ 16 }, iterations{ 10 }, avgRuns{ 10 };
	bool half2{ false }, int8{ false }, verbose{ false }, hostTime{ false };
} gParams;

static inline int volume(DimsCHW dims)
{
	return dims.c()*dims.h()*dims.w();
}

std::vector<std::string> gInputs;
std::map<std::string, DimsCHW> gInputDimensions;

// Logger for GIE info/warning/errors
class Logger : public ILogger
{
	void log(Severity severity, const char* msg) override
	{
		// suppress info-level messages
		if (severity != Severity::kINFO || gParams.verbose)
			std::cout << msg << std::endl;
	}
} gLogger;


#include "opencv2/opencv.hpp"

//using namespace cv;

// data_format: NCHW
void fill_tensor_with_cvmat(const cv::Mat& img_in, float* localData, const int num, \
    const int channel, const int width, const int height, const float* mean, const float* scale) {
    cv::Mat im;
    cv::resize(img_in, im, cv::Size(width, height), 0.f, 0.f);
    int stride = width * height;
    for (int i = 0; i < num; i++) {
        float* ptr_in = localData + i * channel * height * width;
        for (int r = 0; r < height; r++) {
            for (int c = 0; c < width; c++) {
                ptr_in[r * width + c] = (im.at<cv::Vec3b>(r, c)[0] - mean[0]) * scale[0];
                ptr_in[stride + r * width + c] = (im.at<cv::Vec3b>(r, c)[1] - mean[1]) * scale[1];
                ptr_in[2 * stride + r * width + c] = (im.at<cv::Vec3b>(r, c)[2] - mean[2]) * scale[2];
            }
        }
    }
}


size_t numTensorElements(nvinfer1::Dims dimensions)
{
    if (dimensions.nbDims == 0)
        return 0;
    size_t size = 1;
    for (int i = 0; i < dimensions.nbDims; i++)
        size *= dimensions.d[i];
    return size;
}


//std::vector<size_t> argsort(float *tensor, nvinfer1::Dims dimensions, float (&scores)[5])
std::vector<size_t> argsort(float *tensor, nvinfer1::Dims dimensions, std::vector<float> &scores)
{
    size_t numel = numTensorElements(dimensions);
    std::vector<size_t> indices(numel);
    for (int i = 0; i < numel; i++)
        indices[i] = i;
    std::sort(indices.begin(), indices.begin() + numel, [tensor](size_t idx1, size_t idx2) {
        return tensor[idx1] > tensor[idx2];
    });

    for (int i=0; i<5; i++){
        int idx = indices[i];
        scores[i] = tensor[idx];
    }

    return indices;
}


class RndInt8Calibrator : public IInt8EntropyCalibrator
{
public:
	RndInt8Calibrator(int totalSamples, std::string cacheFile)
		: mTotalSamples(totalSamples)
		, mCurrentSample(0)
        , mCacheFile(cacheFile)
	{
		std::default_random_engine generator;
		std::uniform_real_distribution<float> distribution(-1.0F, 1.0F);
		for(auto& elem: gInputDimensions)
		{
			int elemCount = volume(elem.second);

			std::vector<float> rnd_data(elemCount);
			for(auto& val: rnd_data)
				val = distribution(generator);

			void * data;
			CHECK(cudaMalloc(&data, elemCount * sizeof(float)));
			CHECK(cudaMemcpy(data, &rnd_data[0], elemCount * sizeof(float), cudaMemcpyHostToDevice));

			mInputDeviceBuffers.insert(std::make_pair(elem.first, data));
		}
	}

	~RndInt8Calibrator()
	{
		for(auto& elem: mInputDeviceBuffers)
			CHECK(cudaFree(elem.second));
	}

	int getBatchSize() const override
	{
		return 1;
	}

	bool getBatch(void* bindings[], const char* names[], int nbBindings) override
	{
		if (mCurrentSample >= mTotalSamples)
			return false;

		for(int i = 0; i < nbBindings; ++i)
			bindings[i] = mInputDeviceBuffers[names[i]];

		++mCurrentSample;
		return true;
	}

	const void* readCalibrationCache(size_t& length) override
	{
        mCalibrationCache.clear();
		std::ifstream input(mCacheFile, std::ios::binary);
		input >> std::noskipws;
		if (input.good())
			std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(mCalibrationCache));

		length = mCalibrationCache.size();
		return length ? &mCalibrationCache[0] : nullptr;
	}

	virtual void writeCalibrationCache(const void* cache, size_t length) override
	{
	}

private:
	int mTotalSamples;
	int mCurrentSample;
    std::string mCacheFile;
	std::map<std::string, void*> mInputDeviceBuffers;
	std::vector<char> mCalibrationCache;
};

ICudaEngine* caffeToGIEModel()
{
	// create the builder
	IBuilder* builder = createInferBuilder(gLogger);

	// parse the caffe model to populate the network, then set the outputs
	INetworkDefinition* network = builder->createNetwork();
	ICaffeParser* parser = createCaffeParser();
	const IBlobNameToTensor* blobNameToTensor = parser->parse(gParams.deployFile.c_str(),
															  gParams.modelFile.empty() ? 0 : gParams.modelFile.c_str(),
															  *network,
															  gParams.half2 ? DataType::kHALF:DataType::kFLOAT);


	if (!blobNameToTensor)
		return nullptr;

	for (int i = 0, n = network->getNbInputs(); i < n; i++)
	{
		DimsCHW dims = static_cast<DimsCHW&&>(network->getInput(i)->getDimensions());
		gInputs.push_back(network->getInput(i)->getName());
		gInputDimensions.insert(std::make_pair(network->getInput(i)->getName(), dims));
		std::cout << "Input \"" << network->getInput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl;
	}

	// specify which tensors are outputs
	for (auto& s : gParams.outputs)
	{
		if (blobNameToTensor->find(s.c_str()) == nullptr)
		{
			std::cout << "could not find output blob " << s << std::endl;
			return nullptr;
		}
		network->markOutput(*blobNameToTensor->find(s.c_str()));
	}

	for (int i = 0, n = network->getNbOutputs(); i < n; i++)
	{
		DimsCHW dims = static_cast<DimsCHW&&>(network->getOutput(i)->getDimensions());
		std::cout << "Output \"" << network->getOutput(i)->getName() << "\": " << dims.c() << "x" << dims.h() << "x" << dims.w() << std::endl;
	}

	// Build the engine
	builder->setMaxBatchSize(gParams.batchSize);
	builder->setMaxWorkspaceSize(size_t(gParams.workspaceSize)<<20);
	builder->setHalf2Mode(gParams.half2);

	RndInt8Calibrator calibrator(1, gParams.calibrationCache);
	if (gParams.int8)
	{
		builder->setInt8Mode(true);
		builder->setInt8Calibrator(&calibrator);
	}

	ICudaEngine* engine = builder->buildCudaEngine(*network);
	if (engine == nullptr)
		std::cout << "could not build engine" << std::endl;

	parser->destroy();
	network->destroy();
	builder->destroy();
	shutdownProtobufLibrary();
	return engine;
}


void doInference(ICudaEngine& engine)
{
	IExecutionContext *context = engine.createExecutionContext();
	// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
	// of these, but in this case we know that there is exactly one input and one output.

    //size_t outputSize;
    //DimsCHW outputDims;
	std::vector<void*> buffers(gInputs.size() + gParams.outputs.size());
//	for (size_t i = 0; i < gInputs.size(); i++)
//		createMemory(engine, buffers, gInputs[i]);
//
//	for (size_t i = 0; i < gParams.outputs.size(); i++)
//		createMemory(engine, buffers, gParams.outputs[i]);
//    feedMemory(engine, buffers, gInputs[0], gParams.outputs[0], outputSize, outputDims);

	// getBindingIndex returns an int (-1 if the name is not found)
	int inputIndex = engine.getBindingIndex(gInputs[0].c_str());
	int outputIndex = engine.getBindingIndex(gParams.outputs[0].c_str());
	printf("inputName=%s, bindingIndex=%d, buffers.size()=%d\n", gInputs[0].c_str(), inputIndex, (int)buffers.size());
	printf("outputName=%s, bindingIndex=%d, buffers.size()=%d\n", gParams.outputs[0].c_str(), outputIndex, (int)buffers.size());

	DimsCHW inputDims = static_cast<DimsCHW&&>(engine.getBindingDimensions(inputIndex));
	DimsCHW outputDims = static_cast<DimsCHW&&>(engine.getBindingDimensions(outputIndex));

    /* convert from uint8+NHWC to float+NCHW */
    float *inputDataHost, *outputDataHost;
    size_t numInput, numOutput;
    numInput = numTensorElements(inputDims);
    numOutput = numTensorElements(outputDims);
    inputDataHost = (float*) malloc(numInput * sizeof(float));
    outputDataHost = (float*) malloc(numOutput * sizeof(float));

    cv::Mat img = cv::imread(gParams.imageFile, CV_LOAD_IMAGE_COLOR);
    if (img.empty()) {
        std::cout << "Read image failed!" << std::endl;
        return;
    }
    // reverse channel order
    cv::cvtColor(img, img, cv::COLOR_BGR2RGB, 3);
    //! set your mean value and scale value here
    float mean_mb[3] = {103.94f, 116.78f, 123.68f};
    float scale_mb[3] = {0.017f, 0.017f, 0.017f};
    fill_tensor_with_cvmat(img, inputDataHost, gParams.batchSize, inputDims.c(), inputDims.w(), inputDims.h(), mean_mb, scale_mb);

    for (int i=0; i<3*224*224; i++){
        if ( i % 1000 == 0 ){
            std::cout << i << ": " << inputDataHost[i] << " | ";
        }
        if ( i > 150518 )
            std::cout << i << ": " << inputDataHost[i] << " | ";
    }
    std::cout << "\n---------------------------------------------------------" << std::endl;

    /* transfer to device */
    float *inputDataDevice, *outputDataDevice;
    CHECK(cudaMalloc((void**)&inputDataDevice, numInput * sizeof(float)));
    CHECK(cudaMalloc((void**)&outputDataDevice, numOutput * sizeof(float)));
    CHECK(cudaMemcpy(inputDataDevice, inputDataHost, numInput * sizeof(float), cudaMemcpyHostToDevice));
    void *bindings[2];
    bindings[inputIndex] = (void*) inputDataDevice;
    bindings[outputIndex] = (void*) outputDataDevice;

	cudaStream_t stream;
	CHECK(cudaStreamCreate(&stream));
	cudaEvent_t start, end;
	CHECK(cudaEventCreateWithFlags(&start, cudaEventBlockingSync));
	CHECK(cudaEventCreateWithFlags(&end, cudaEventBlockingSync));

	for (int j = 0; j < gParams.iterations; j++)
	{
		float total = 0, ms;
		for (int i = 0; i < gParams.avgRuns; i++)
		{
			if (gParams.hostTime)
			{
				auto t_start = std::chrono::high_resolution_clock::now();
				context->execute(gParams.batchSize, bindings);
				auto t_end = std::chrono::high_resolution_clock::now();
				ms = std::chrono::duration<float, std::milli>(t_end - t_start).count();
			}
			else
			{
				cudaEventRecord(start, stream);
//				context->enqueue(gParams.batchSize, &buffers[0], stream, nullptr);
				context->enqueue(gParams.batchSize, bindings, stream, nullptr);
				cudaEventRecord(end, stream);
				cudaEventSynchronize(end);
				cudaEventElapsedTime(&ms, start, end);
			}
			total += ms;
		}
		total /= gParams.avgRuns;
		std::cout << "Average over " << gParams.avgRuns << " runs is " << total << " ms." << std::endl;
	}
	std::cout << "\n---------------------------------------------------------" << std::endl;

    /* transfer output back to host */
    CHECK(cudaMemcpy(outputDataHost, outputDataDevice, numOutput * sizeof(float), cudaMemcpyDeviceToHost));

    std::cout << "\n## output outputDataHost..." << std::endl;
    int j = 0;
    for (int i=0; i<1000; i++){
        //if ( i % 10 == 0 ){
        if ( outputDataHost[i] > 0.001 ){
            std::cout << i << ": " << outputDataHost[i] << " | ";
            ++j;
            if (j == 4){
                std::cout << std::endl;
                j = 0;
            }
        }
    }
    std::cout << "\n---------------------------------------------------------" << std::endl;

    /* parse output */
//    float scores[5] = {0,0,0,0,0};
    std::vector<float> scores(5);
    std::vector<size_t> sortedIndices = argsort(outputDataHost, outputDims, scores);

    std::cout << "\nThe top-5 indices are: " << std::endl;
    for (int i = 0; i < 5; i++)
        std::cout << " | " << sortedIndices[i] << ": " << scores[i] << std::endl;
    std::cout <<"\n" << std::endl;

//    ifstream labelsFile(labelFilename);
//
//    if (!labelsFile.is_open())
//    {
//        cout << "\nCould not open label file." << endl;
//        return 1;
//    }
//
//    vector<string> labelMap;
//    string label;
//    while(getline(labelsFile, label))
//    {
//        labelMap.push_back(label);
//    }
//
//    cout << "\nWhich corresponds to class labels: ";
//    for (int i = 0; i < 5; i++)
//        cout << endl << i << ". " << labelMap[sortedIndices[i]];
//    cout << endl;
	free(inputDataHost);
	free(outputDataHost);
	cudaFree(inputDataDevice);
	cudaFree(outputDataDevice);

	cudaStreamDestroy(stream);
	cudaEventDestroy(start);
	cudaEventDestroy(end);
	context->destroy();
}



static void printUsage()
{
	printf("\n");
	printf("Mandatory params:\n");
	printf("  --deploy=<file>      Caffe deploy file\n");
	printf("  --output=<name>      Output blob name (can be specified multiple times)\n");
	printf("  --image=<file>       input image file\n");

	printf("\nOptional params:\n");

	printf("  --model=<file>       Caffe model file (default = no model, random weights used)\n");
	printf("  --batch=N            Set batch size (default = %d)\n", gParams.batchSize);
	printf("  --device=N           Set cuda device to N (default = %d)\n", gParams.device);
	printf("  --iterations=N       Run N iterations (default = %d)\n", gParams.iterations);
	printf("  --avgRuns=N          Set avgRuns to N - perf is measured as an average of avgRuns (default=%d)\n", gParams.avgRuns);
	printf("  --workspace=N        Set workspace size in megabytes (default = %d)\n", gParams.workspaceSize);
	printf("  --half2              Run in paired fp16 mode (default = false)\n");
	printf("  --int8               Run in int8 mode (default = false)\n");
	printf("  --verbose            Use verbose logging (default = false)\n");
	printf("  --hostTime	       Measure host time rather than GPU time (default = false)\n");
	printf("  --engine=<file>      Generate a serialized GIE engine\n");
	printf("  --calib=<file>       Read INT8 calibration cache file\n");

	fflush(stdout);
}

bool parseString(const char* arg, const char* name, std::string& value)
{
	size_t n = strlen(name);
	bool match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, name, n) && arg[n + 2] == '=';
	if (match)
	{
		value = arg + n + 3;
		std::cout << name << ": " << value << std::endl;
	}
	return match;
}

bool parseInt(const char* arg, const char* name, int& value)
{
	size_t n = strlen(name);
	bool match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, name, n) && arg[n + 2] == '=';
	if (match)
	{
		value = atoi(arg + n + 3);
		std::cout << name << ": " << value << std::endl;
	}
	return match;
}

bool parseBool(const char* arg, const char* name, bool& value)
{
	size_t n = strlen(name);
	bool match = arg[0] == '-' && arg[1] == '-' && !strncmp(arg + 2, name, n);
	if (match)
	{
		std::cout << name << std::endl;
		value = true;
	}
	return match;

}


bool parseArgs(int argc, char* argv[])
{
	if (argc < 3)
	{
		printUsage();
		return false;
	}

	for (int j = 1; j < argc; j++)
	{
		if (parseString(argv[j], "model", gParams.modelFile) || parseString(argv[j], "deploy", gParams.deployFile) || parseString(argv[j], "engine", gParams.engine))
			continue;

		if (parseString(argv[j], "calib", gParams.calibrationCache))
			continue;

		std::string output;
		if (parseString(argv[j], "output", output))
		{
			gParams.outputs.push_back(output);
			continue;
		}

		if (parseString(argv[j], "image", gParams.imageFile))
			continue;

		if (parseInt(argv[j], "batch", gParams.batchSize) || parseInt(argv[j], "iterations", gParams.iterations) || parseInt(argv[j], "avgRuns", gParams.avgRuns)
			|| parseInt(argv[j], "device", gParams.device)	|| parseInt(argv[j], "workspace", gParams.workspaceSize))
			continue;

		if (parseBool(argv[j], "half2", gParams.half2) || parseBool(argv[j], "int8", gParams.int8)
			|| parseBool(argv[j], "verbose", gParams.verbose) || parseBool(argv[j], "hostTime", gParams.hostTime))
			continue;

		printf("Unknown argument: %s\n", argv[j]);
		return false;
	}
	return true;
}

static ICudaEngine* createEngine()
{
	ICudaEngine *engine;

	if (!gParams.deployFile.empty()) {
		engine = caffeToGIEModel();
		if (!engine)
		{
			std::cerr << "Engine could not be created" << std::endl;
			return nullptr;
		}


		if (!gParams.engine.empty())
		{
			std::ofstream p(gParams.engine);
			if (!p)
			{
				std::cerr << "could not open plan output file" << std::endl;
				return nullptr;
			}
			IHostMemory *ptr = engine->serialize();
            assert(ptr);
            p.write(reinterpret_cast<const char*>(ptr->data()), ptr->size());
            ptr->destroy();
		}
		return engine;
	}

	// load directly from serialized engine file if deploy not specified
	if (!gParams.engine.empty()) {
		char *gieModelStream{nullptr};
        size_t size{0};
		std::ifstream file(gParams.engine, std::ios::binary);
		if (file.good()) {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            gieModelStream = new char[size];
            assert(gieModelStream);
            file.read(gieModelStream, size);
			file.close();
		}

		IRuntime* infer = createInferRuntime(gLogger);
		engine = infer->deserializeCudaEngine(gieModelStream, size, nullptr);
        if (gieModelStream) delete [] gieModelStream;

		// assume input to be "data" for deserialized engine
		gInputs.push_back("data");
		return engine;
	}

	// complain about empty deploy file
	std::cerr << "Deploy file not specified" << std::endl;
	return nullptr;
}

int main(int argc, char** argv)
{
	// create a GIE model from the caffe model and serialize it to a stream

	if (!parseArgs(argc, argv))
		return -1;

	cudaSetDevice(gParams.device);

	if (gParams.outputs.size() == 0)
	{
		std::cerr << "At least one network output must be defined" << std::endl;
		return -1;
	}

	ICudaEngine* engine = createEngine();
	if (!engine)
	{
		std::cerr << "Engine could not be created" << std::endl;
		return -1;
	}

	doInference(*engine);
	engine->destroy();

	return 0;
}

Results:

nvidia@tegra-ubuntu:~/tools/giue/build$ ./infer --deploy=/media/nvidia/CCCOMA_X64F/tensorrt/caffe/MobileNetV1/mobilenet_deploy.prototxt --output=prob --image=/media/nvidia/CCCOMA_X64F/tensorrt/image/2.jpg
deploy: /media/nvidia/CCCOMA_X64F/tensorrt/caffe/MobileNetV1/mobilenet_deploy.prototxt
output: prob
image: /media/nvidia/CCCOMA_X64F/tensorrt/image/2.jpg
Input "data": 3x224x224
Output "prob": 1000x1x1
inputName=data, bindingIndex=0, buffers.size()=2
outputName=prob, bindingIndex=1, buffers.size()=2
0: 2.56802 | 1000: 2.56802 | 2000: 2.56802 | 3000: 2.56802 | 4000: 2.56802 | 5000: 2.56802 | 6000: 2.56802 | 7000: 0.90202 | 8000: 2.56802 | 9000: 0.32402 | 10000: 2.56802 | 11000: 1.71802 | 12000: 2.53402 | 13000: -0.78098 | 14000: 2.55102 | 15000: 2.56802 | 16000: 0.29002 | 17000: 2.41502 | 18000: 0.46002 | 19000: -0.66198 | 20000: 1.03802 | 21000: -1.56298 | 22000: 2.56802 | 23000: 1.65002 | 24000: 2.56802 | 25000: -0.23698 | 26000: 2.56802 | 27000: -1.42698 | 28000: 2.56802 | 29000: 0.34102 | 30000: 2.41502 | 31000: -0.55998 | 32000: 1.95602 | 33000: 2.56802 | 34000: 2.56802 | 35000: 2.56802 | 36000: 2.56802 | 37000: 2.56802 | 38000: 2.56802 | 39000: 2.56802 | 40000: 2.56802 | 41000: 2.56802 | 42000: 2.56802 | 43000: 2.36402 | 44000: 1.20802 | 45000: 2.56802 | 46000: 2.56802 | 47000: 1.93902 | 48000: 2.56802 | 49000: 2.56802 | 50000: 2.56802 | 51000: 2.34974 | 52000: 0.47974 | 53000: 2.34974 | 54000: 2.34974 | 55000: 2.34974 | 56000: 2.34974 | 57000: 2.34974 | 58000: 2.34974 | 59000: 2.33274 | 60000: 2.34974 | 61000: -0.76126 | 62000: -0.50626 | 63000: -1.96826 | 64000: -1.45826 | 65000: 0.10574 | 66000: 1.02374 | 67000: 2.34974 | 68000: 1.17674 | 69000: 2.34974 | 70000: 1.04074 | 71000: -0.81226 | 72000: 0.58174 | 73000: -1.30526 | 74000: -1.66226 | 75000: -0.42126 | 76000: -1.84926 | 77000: 0.41174 | 78000: 2.34974 | 79000: 0.08874 | 80000: 2.34974 | 81000: -0.23426 | 82000: 2.34974 | 83000: 2.31574 | 84000: 2.34974 | 85000: 2.34974 | 86000: 0.51374 | 87000: -1.54326 | 88000: -1.76426 | 89000: 2.34974 | 90000: 2.34974 | 91000: 2.34974 | 92000: 2.34974 | 93000: 2.34974 | 94000: 2.34974 | 95000: 2.34974 | 96000: 2.34974 | 97000: 2.34974 | 98000: 2.34974 | 99000: 2.31574 | 100000: 2.29874 | 101000: 2.23244 | 102000: 2.23244 | 103000: 2.23244 | 104000: 2.13044 | 105000: 2.23244 | 106000: 1.60344 | 107000: 2.23244 | 108000: 1.09344 | 109000: 2.23244 | 110000: -1.28656 | 111000: 2.23244 | 112000: -1.54156 | 113000: 2.19844 | 114000: 2.23244 | 115000: -1.64356 | 116000: -0.87856 | 117000: -0.91256 | 118000: 0.90644 | 119000: -1.84756 | 120000: -1.69456 | 121000: 2.23244 | 122000: 0.58344 | 123000: 2.23244 | 124000: -0.58956 | 125000: 2.23244 | 126000: -1.91556 | 127000: -1.25256 | 128000: -0.35156 | 129000: -0.40256 | 130000: -2.08556 | 131000: -0.98056 | 132000: 2.23244 | 133000: 2.23244 | 134000: 2.23244 | 135000: 2.23244 | 136000: 2.23244 | 137000: 2.23244 | 138000: 2.23244 | 139000: 2.23244 | 140000: 2.23244 | 141000: 2.23244 | 142000: -0.09656 | 143000: -0.65756 | 144000: -0.65756 | 145000: 2.23244 | 146000: 2.23244 | 147000: 2.23244 | 148000: 2.23244 | 149000: 2.23244 | 150000: 2.23244 | 150519: 2.23244 | 150520: 2.23244 | 150521: 2.23244 | 150522: 2.23244 | 150523: 2.23244 | 150524: 2.23244 | 150525: 2.23244 | 150526: 2.23244 | 150527: 2.23244 | 
---------------------------------------------------------
Average over 10 runs is 17.7887 ms.
Average over 10 runs is 10.5378 ms.
Average over 10 runs is 10.6131 ms.
Average over 10 runs is 10.4526 ms.
Average over 10 runs is 10.3486 ms.
Average over 10 runs is 10.4802 ms.
Average over 10 runs is 10.522 ms.
Average over 10 runs is 10.4813 ms.
Average over 10 runs is 10.473 ms.
Average over 10 runs is 10.4765 ms.

---------------------------------------------------------

## output outputDataHost...
1: 0.00100176 | 3: 0.00100891 | 6: 0.001011 | 8: 0.00100509 | 
9: 0.00100463 | 11: 0.00100788 | 13: 0.0010041 | 17: 0.00100307 | 
18: 0.00100199 | 23: 0.0010078 | 27: 0.00101147 | 29: 0.00100186 | 
31: 0.00100794 | 33: 0.00101418 | 34: 0.00100239 | 37: 0.00100047 | 
39: 0.00101143 | 40: 0.00100499 | 43: 0.00101598 | 44: 0.00100058 | 
45: 0.00101516 | 46: 0.00100557 | 48: 0.00100592 | 50: 0.00100013 | 
51: 0.00101982 | 54: 0.00100144 | 57: 0.00100787 | 59: 0.00100301 | 
60: 0.00101593 | 61: 0.0010033 | 63: 0.00100493 | 66: 0.00100349 | 
68: 0.00100197 | 69: 0.00100331 | 77: 0.00100139 | 78: 0.00100293 | 
81: 0.00100378 | 88: 0.00100256 | 92: 0.00101347 | 94: 0.00100456 | 
95: 0.00101509 | 98: 0.00102084 | 99: 0.00100775 | 100: 0.00100296 | 
101: 0.00100297 | 102: 0.00101557 | 104: 0.00101055 | 108: 0.00100622 | 
109: 0.00100182 | 112: 0.00100594 | 114: 0.00100332 | 115: 0.00100217 | 
116: 0.00100099 | 118: 0.00100657 | 119: 0.00101405 | 122: 0.00100451 | 
126: 0.00100141 | 127: 0.00100521 | 128: 0.00100674 | 130: 0.00102497 | 
133: 0.00100652 | 134: 0.001003 | 136: 0.00100436 | 137: 0.00101532 | 
139: 0.00101165 | 140: 0.00100288 | 145: 0.00100969 | 148: 0.00100324 | 
149: 0.00100161 | 150: 0.00100228 | 151: 0.00100679 | 154: 0.00100338 | 
158: 0.00100041 | 159: 0.0010205 | 160: 0.00101752 | 161: 0.00100093 | 
163: 0.00100032 | 164: 0.00100051 | 165: 0.00100647 | 167: 0.00100767 | 
169: 0.00100004 | 172: 0.00101219 | 175: 0.00100809 | 176: 0.00101735 | 
177: 0.0010066 | 182: 0.00100982 | 187: 0.00100597 | 188: 0.00100664 | 
189: 0.00101011 | 192: 0.00100212 | 196: 0.00101193 | 199: 0.00100128 | 
200: 0.0010016 | 204: 0.00100845 | 206: 0.00101261 | 207: 0.0010186 | 
211: 0.00100335 | 212: 0.00100283 | 216: 0.00100138 | 219: 0.00100148 | 
221: 0.00100532 | 222: 0.00101241 | 224: 0.00100317 | 227: 0.00101202 | 
228: 0.00101073 | 229: 0.00100664 | 234: 0.0010045 | 235: 0.00101004 | 
237: 0.00100297 | 238: 0.00100007 | 239: 0.0010058 | 240: 0.001006 | 
241: 0.00101286 | 242: 0.00100437 | 243: 0.00100849 | 244: 0.00101354 | 
246: 0.00100335 | 247: 0.00101245 | 248: 0.00100767 | 250: 0.00100595 | 
252: 0.00100366 | 253: 0.00100751 | 254: 0.00101152 | 258: 0.00100199 | 
259: 0.00100242 | 261: 0.00101787 | 263: 0.00100066 | 264: 0.00101393 | 
265: 0.0010103 | 269: 0.00100279 | 271: 0.00100995 | 272: 0.00100073 | 
274: 0.00100362 | 277: 0.0010062 | 278: 0.00101278 | 279: 0.00100047 | 
287: 0.00100234 | 289: 0.00100129 | 290: 0.00100741 | 292: 0.00100963 | 
295: 0.00100072 | 296: 0.00100278 | 299: 0.00100039 | 301: 0.00100107 | 
302: 0.00100841 | 305: 0.00101911 | 311: 0.00100541 | 312: 0.00101516 | 
318: 0.00100196 | 321: 0.00100006 | 323: 0.00100072 | 324: 0.00101523 | 
328: 0.00100403 | 330: 0.00101015 | 334: 0.00100417 | 335: 0.00100346 | 
336: 0.00101024 | 338: 0.00100288 | 339: 0.00101798 | 340: 0.00101026 | 
341: 0.00100343 | 342: 0.00100076 | 343: 0.00100167 | 344: 0.00100996 | 
345: 0.00100701 | 347: 0.001005 | 348: 0.00100239 | 351: 0.00100028 | 
352: 0.00100444 | 354: 0.00101864 | 357: 0.00100857 | 358: 0.00100161 | 
359: 0.00100246 | 360: 0.00100078 | 361: 0.00101185 | 365: 0.00100241 | 
368: 0.00101136 | 369: 0.00100957 | 370: 0.00101451 | 374: 0.00100633 | 
377: 0.00100034 | 378: 0.00100026 | 381: 0.00100852 | 382: 0.00100586 | 
384: 0.00100547 | 385: 0.00101368 | 387: 0.00100788 | 390: 0.00101543 | 
392: 0.00100853 | 393: 0.00100632 | 395: 0.00100455 | 398: 0.00100354 | 
400: 0.00100046 | 402: 0.00100098 | 403: 0.00100492 | 405: 0.00102073 | 
406: 0.0010081 | 407: 0.00100154 | 408: 0.00101542 | 410: 0.00100653 | 
411: 0.0010035 | 413: 0.00100261 | 414: 0.00100613 | 415: 0.00100788 | 
416: 0.00100973 | 417: 0.00101384 | 418: 0.0010004 | 419: 0.00101298 | 
421: 0.00100916 | 423: 0.00101004 | 424: 0.0010088 | 425: 0.00100752 | 
427: 0.00101059 | 430: 0.00100556 | 431: 0.00100287 | 436: 0.00100058 | 
438: 0.00100545 | 443: 0.00101369 | 445: 0.001009 | 446: 0.00101193 | 
449: 0.00100408 | 450: 0.00100165 | 451: 0.00100027 | 457: 0.00101233 | 
459: 0.00100239 | 460: 0.00101057 | 461: 0.0010059 | 462: 0.00101251 | 
463: 0.00101279 | 465: 0.00100548 | 466: 0.00100536 | 468: 0.00100647 | 
471: 0.00100252 | 479: 0.00100835 | 480: 0.0010217 | 481: 0.00100359 | 
482: 0.00100079 | 486: 0.00100631 | 488: 0.00100472 | 489: 0.00100927 | 
490: 0.00100808 | 492: 0.00100667 | 493: 0.00100819 | 494: 0.00100631 | 
495: 0.00100624 | 498: 0.00101535 | 499: 0.00100331 | 501: 0.00101228 | 
503: 0.00100923 | 505: 0.00100763 | 508: 0.00101055 | 509: 0.00100329 | 
510: 0.00101043 | 512: 0.0010027 | 513: 0.00101706 | 514: 0.00101132 | 
517: 0.00101729 | 518: 0.0010021 | 519: 0.00100591 | 522: 0.00100952 | 
523: 0.00100358 | 525: 0.00101868 | 527: 0.0010134 | 529: 0.00100458 | 
537: 0.00101444 | 538: 0.00100704 | 539: 0.00100786 | 543: 0.0010125 | 
547: 0.00102846 | 548: 0.0010058 | 550: 0.0010055 | 551: 0.00100112 | 
552: 0.00100825 | 554: 0.00100325 | 555: 0.00100276 | 556: 0.00100281 | 
557: 0.00100544 | 558: 0.00100892 | 560: 0.00100126 | 561: 0.00100338 | 
562: 0.00100066 | 564: 0.00100579 | 567: 0.00101314 | 568: 0.00100042 | 
569: 0.00100409 | 572: 0.0010086 | 573: 0.00101253 | 574: 0.00101144 | 
575: 0.00101084 | 579: 0.00100434 | 580: 0.00101123 | 581: 0.00100323 | 
585: 0.00100436 | 586: 0.00100729 | 587: 0.00100005 | 588: 0.0010053 | 
591: 0.00100738 | 592: 0.00100174 | 597: 0.00100559 | 598: 0.00100412 | 
599: 0.00100702 | 601: 0.00100155 | 602: 0.0010045 | 603: 0.00100545 | 
606: 0.00100841 | 607: 0.00100147 | 608: 0.00101834 | 609: 0.00100885 | 
610: 0.00100081 | 612: 0.00100241 | 615: 0.00100525 | 616: 0.001007 | 
617: 0.00100343 | 618: 0.00100282 | 619: 0.00100955 | 620: 0.00100462 | 
621: 0.00100268 | 627: 0.0010009 | 628: 0.00100442 | 629: 0.00100344 | 
632: 0.00100688 | 633: 0.00101679 | 634: 0.00100637 | 636: 0.00101418 | 
638: 0.00100319 | 639: 0.00100021 | 640: 0.00102051 | 641: 0.00100079 | 
642: 0.00100548 | 649: 0.00100887 | 650: 0.001007 | 651: 0.00100289 | 
654: 0.00100295 | 655: 0.00101246 | 658: 0.00100716 | 662: 0.00101014 | 
663: 0.00101476 | 671: 0.00100642 | 672: 0.00101006 | 673: 0.00101348 | 
674: 0.00101393 | 678: 0.00100266 | 680: 0.00100197 | 681: 0.00100933 | 
683: 0.00100837 | 690: 0.00100276 | 691: 0.0010078 | 692: 0.00100814 | 
695: 0.00101692 | 697: 0.00100327 | 699: 0.00100973 | 700: 0.00101934 | 
701: 0.0010015 | 705: 0.00100538 | 706: 0.00100871 | 707: 0.00101457 | 
709: 0.00100914 | 710: 0.00100112 | 712: 0.00100122 | 713: 0.00100244 | 
714: 0.00100928 | 716: 0.00100917 | 717: 0.00100683 | 727: 0.00100808 | 
728: 0.00100312 | 730: 0.00100515 | 731: 0.00100928 | 736: 0.0010037 | 
741: 0.00100153 | 742: 0.00100249 | 743: 0.00101802 | 744: 0.00101117 | 
745: 0.00100719 | 746: 0.00100203 | 748: 0.0010166 | 749: 0.00100436 | 
750: 0.00100317 | 753: 0.00100423 | 754: 0.00100291 | 761: 0.00102681 | 
762: 0.0010015 | 765: 0.00100564 | 766: 0.00101266 | 768: 0.00100761 | 
769: 0.00100017 | 771: 0.00100121 | 772: 0.00100528 | 774: 0.00100198 | 
776: 0.00100625 | 777: 0.00101811 | 779: 0.00100836 | 780: 0.00100702 | 
781: 0.00101923 | 782: 0.00101461 | 786: 0.0010066 | 787: 0.00100823 | 
788: 0.00100314 | 789: 0.00100422 | 792: 0.0010071 | 794: 0.00101356 | 
795: 0.00100563 | 797: 0.00100985 | 798: 0.00102489 | 800: 0.00100971 | 
801: 0.00101495 | 802: 0.0010063 | 803: 0.00100456 | 804: 0.00100351 | 
805: 0.00101834 | 808: 0.00100188 | 809: 0.00101025 | 811: 0.00101979 | 
812: 0.00101357 | 814: 0.00100164 | 815: 0.00100659 | 816: 0.00100193 | 
817: 0.00100422 | 819: 0.00100693 | 820: 0.00100505 | 821: 0.00101227 | 
824: 0.00100625 | 825: 0.00100294 | 827: 0.00100346 | 829: 0.00101092 | 
834: 0.00100827 | 836: 0.00100407 | 837: 0.00100388 | 838: 0.00100437 | 
842: 0.00100093 | 843: 0.00101566 | 845: 0.00100699 | 847: 0.00101061 | 
848: 0.00100415 | 851: 0.00100467 | 853: 0.00100905 | 855: 0.00100256 | 
857: 0.00101036 | 863: 0.00101344 | 867: 0.00100448 | 868: 0.00101037 | 
869: 0.00100013 | 870: 0.00100679 | 871: 0.00100209 | 873: 0.00101075 | 
874: 0.00100706 | 875: 0.00100141 | 879: 0.0010036 | 880: 0.00100463 | 
884: 0.00100785 | 885: 0.00100472 | 890: 0.0010059 | 891: 0.00101307 | 
892: 0.00101042 | 893: 0.00100473 | 897: 0.00101276 | 898: 0.00100446 | 
899: 0.00101436 | 900: 0.00100171 | 903: 0.00101381 | 904: 0.00100604 | 
905: 0.00100471 | 911: 0.00100035 | 912: 0.00100239 | 913: 0.00100238 | 
915: 0.00101145 | 916: 0.00100496 | 917: 0.00100779 | 918: 0.00101586 | 
921: 0.00100847 | 923: 0.0010045 | 924: 0.0010058 | 926: 0.00101469 | 
927: 0.00100556 | 928: 0.00100041 | 932: 0.00100655 | 936: 0.00101137 | 
937: 0.0010053 | 938: 0.00100542 | 940: 0.00101448 | 941: 0.00100164 | 
942: 0.00100397 | 943: 0.00100403 | 944: 0.00100239 | 947: 0.0010072 | 
949: 0.00100599 | 954: 0.00100441 | 955: 0.0010061 | 957: 0.00100254 | 
960: 0.00102748 | 962: 0.00101383 | 963: 0.00100568 | 965: 0.00100543 | 
966: 0.00100766 | 967: 0.00100629 | 969: 0.0010089 | 971: 0.00101128 | 
973: 0.00101659 | 977: 0.00100255 | 979: 0.00100143 | 982: 0.00101136 | 
983: 0.00100838 | 985: 0.00100052 | 987: 0.00100795 | 988: 0.00100005 | 
989: 0.00100596 | 991: 0.00100939 | 995: 0.0010023 | 996: 0.00102457 | 
997: 0.0010066 | 
---------------------------------------------------------

The top-5 indices are: 
 | 547: 0.00102846
 | 960: 0.00102748
 | 761: 0.00102681
 | 130: 0.00102497
 | 798: 0.00102489

Hi,

Could you share the prototxt file with us?
Is there any scaling or power layer at the beginning of your model?
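
For reference, this kind of preprocessing sometimes appears directly in the deploy prototxt as a Power layer; an illustrative snippet (the layer and blob names here are hypothetical) looks like:

layer {
  name: "data_scale"
  type: "Power"
  bottom: "data"
  top: "data_scaled"
  power_param {
    power: 1.0    # y = (shift + scale * x) ^ power
    scale: 0.017
    shift: 0.0
  }
}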

Thanks.

Hi,
The model files are uploaded to the link below.

link: Baidu Netdisk (please enter the extraction code)
password: x10h

@AastaLLL

Below are the contents of the CMakeLists.txt I’m using.
Thanks a lot for your help. :)

project(tensorrtInference)
cmake_minimum_required(VERSION "3.0")

set(CMAKE_CXX_STANDARD 11)

find_package(OpenCV REQUIRED)
message(STATUS "OpenCV library status:")
message(STATUS "    version: ${OpenCV_VERSION}")
message(STATUS "    libraries: ${OpenCV_LIBS}")
message(STATUS "    include path: ${OpenCV_INCLUDE_DIRS}")

message(STATUS "===========================================")
find_package(CUDA REQUIRED)
message(STATUS "CUDA library status:")
message(STATUS "    version: ${CUDA_VERSION}")
message(STATUS "    libraries: ${CUDA_LIBS}")
message(STATUS "    include path: ${CUDA_INCLUDE_DIRS}")
message(STATUS "===========================================")

#find_package(CUDNN REQUIRED)
#message(STATUS "CUDNN library status:")
#message(STATUS "    version: ${CUDNN_VERSION}")
#message(STATUS "    libraries: ${CUDNN_LIBS}")
#message(STATUS "    include path: ${CUDNN_INCLUDE_DIRS}")

include_directories("/usr/include" "/usr/local/cuda/include" "/usr/include/aarch64-linux-gnu" ${CUDA_INCLUDE_DIRS})
link_directories("/usr/lib/aarch64-linux-gnu" "/usr/local/cuda/lib64")
set(CUDA_LIBS ${CUDA_LIBS} nvinfer nvparsers nvinfer_plugin cudnn cublas cudart_static nvToolsExt cudart)
set(required_libs ${OpenCV_LIBS} ${CUDA_LIBS} pthread rt dl)
cuda_add_executable(infer infer.cpp)
target_link_libraries(infer ${required_libs})
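
With this file in place, a typical out-of-source build (matching the build/ directory seen in the results above) looks like:

mkdir -p build && cd build
cmake ..
make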

How’s it going? Any clues?

Hi,
The issue is resolved now. The root cause was that I didn’t pass the --model argument to the infer application (I thought a default model would be loaded, but it isn’t: without --model, the network is built with random weights, which explains the near-uniform ~1/1000 scores).
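
For reference, the working invocation presumably looks something like the following (the .caffemodel path is illustrative):

./infer --deploy=/media/nvidia/CCCOMA_X64F/tensorrt/caffe/MobileNetV1/mobilenet_deploy.prototxt \
        --model=/media/nvidia/CCCOMA_X64F/tensorrt/caffe/MobileNetV1/mobilenet.caffemodel \
        --output=prob \
        --image=/media/nvidia/CCCOMA_X64F/tensorrt/image/2.jpg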

Thanks anyway.

Good to know it works!