Problem loading weights in half precision mode

Hello,
I have a problem loading weight values for half-precision inference.
For weight loading I use the loadWeights function provided in many TensorRT samples; it works fine for float32, but when I switch to half precision I get the following error:

ERROR: Parameter check failed at: Network.cpp::addConvolution::32, condition: kernelWeights.values != NULL
demo: demo.cpp:255: void APIToModel(unsigned int, nvinfer1::IHostMemory**): Assertion `conv1 != nullptr' failed.
Aborted (core dumped)

I tried to modify the loadWeights function, forcing the conversion with the function __float2half (also found in the TensorRT samplePlugin sample), but the weights were not loaded successfully either.

I attach a self-contained piece of code that reproduces the problem; at line 77 it is possible to switch between half and full precision. The weight files I used can be found below.
I use TensorRT RC 4.0.0.3 with CUDA 9.0.

#include "NvInfer.h"
#include "NvCaffeParser.h"
#include "NvUtils.h"
#include "cuda_runtime_api.h"
#include <cassert>
#include <cmath>
#include <cstring>
#include <string>
#include <fstream>
#include <iostream>
#include <sstream>
#include <sys/stat.h>
#include <vector>
#include <algorithm>
#include <opencv2/opencv.hpp>
#include <opencv2/core/core.hpp>
#include <cublas_v2.h>

#define CHECK(status)									\
{														\
	if (status != 0)									\
	{													\
		std::cout << "Cuda failure: " << status;		\
		abort();										\
	}													\
}


// stuff we know about the network and the input/output blobs
static const int INPUT_H = 256;
static const int INPUT_W = 384;
static const int INPUT_C = 3;

static const int OUTPUT_TEST_H = 256;
static const int OUTPUT_TEST_W = 384;
static const int OUTPUT_TEST_C = 64;
static const int OUTPUT_TEST_SIZE = OUTPUT_TEST_H * OUTPUT_TEST_W * OUTPUT_TEST_C;

static const int BATCH_SIZE = 4;
static const int MAX_BATCH_SIZE = 4;

const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "out";
const char* OUTPUT_BLOB_NAME_TEST = "test_out";

const int N_BINDINGS = 2;
static void* buffers[N_BINDINGS];
static cudaStream_t stream;
static int inputIndex, outputIndexTest;

using namespace nvinfer1;

// Logger for GIE info/warning/errors
class Logger : public nvinfer1::ILogger			
{
    public:
	void log(nvinfer1::ILogger::Severity severity, const char* msg) override
	{
		// suppress info-level messages
        if (severity == Severity::kINFO) return;

        switch (severity)
        {
            case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break;
            case Severity::kERROR: std::cerr << "ERROR: "; break;
            case Severity::kWARNING: std::cerr << "WARNING: "; break;
            case Severity::kINFO: std::cerr << "INFO: "; break;
            default: std::cerr << "UNKNOWN: "; break;
        }
        std::cerr << msg << std::endl;
	}
};


static Logger gLogger;

static const DataType DATATYPE = DataType::kHALF;
const char* WEIGHTS_FILENAME = "demo16.wts";
//static const DataType DATATYPE = DataType::kFLOAT;
//const char* WEIGHTS_FILENAME = "demo32.wts";


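// reinterpret the raw bits of one type as another (type-punning helper, as in samplePlugin's fp16.h)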
template<typename T, typename U> T bitwise_cast(U u)
{
    return *reinterpret_cast<T*>(&u);
}

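// software float32 -> float16 conversion with round-to-nearest-even (from the samplePlugin sample)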
__half __float2half(float f)
{
    uint32_t x = bitwise_cast<uint32_t, float>(f);
    uint32_t u = (x & 0x7fffffff);

    // Get rid of +NaN/-NaN case first.
    if (u > 0x7f800000)
        return bitwise_cast<__half, uint16_t>(uint16_t(0x7fff));
  
    uint16_t sign = ((x >> 16) & 0x8000);
  
    // Get rid of +Inf/-Inf, +0/-0.
    if (u > 0x477fefff)
        return bitwise_cast<__half, uint16_t>(sign | uint16_t(0x7c00));

    if (u < 0x33000001)
        return bitwise_cast<__half, uint16_t>(sign | uint16_t(0x0000));

    uint32_t exponent = ((u >> 23) & 0xff);
    uint32_t mantissa = (u & 0x7fffff);

    uint32_t shift;
    if (exponent > 0x70)
    {
        shift = 13;
        exponent -= 0x70;
    }
    else
    {
        shift = 0x7e - exponent;
        exponent = 0;
        mantissa |= 0x800000;
    }

    uint32_t lsb    = (1 << shift);
    uint32_t lsb_s1 = (lsb >> 1);
    uint32_t lsb_m1 = (lsb - 1);
  
    // Round to nearest even.
    uint32_t remainder = (mantissa & lsb_m1);
    mantissa >>= shift;
    if ( (remainder > lsb_s1) || ((remainder == lsb_s1) && (mantissa & 0x1)) )
    {
        ++mantissa;
        if (!(mantissa & 0x3ff))
        {
            ++exponent;
            mantissa = 0;
        }
    }
    
    return bitwise_cast<__half, uint16_t>(sign | uint16_t(exponent<<10) | uint16_t(mantissa));
}


// Our weight files are in a very simple space-delimited format:
// <name> <type> <size> <size hex-encoded values>
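// For illustration, a weight file in this format might look like the following
// (hypothetical, shortened values; type 0 = kFLOAT, 1 = kHALF; each value is
// an IEEE-754 bit pattern written in hex; the first line is the tensor count):
//
//   2
//   conv1_b 0 2 3f800000 40000000
//   conv1_w 0 4 3e4ccccd bd4ccccd 3c23d70a 3f000000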
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::map<std::string, Weights> weightMap;
	std::ifstream input(file);
	assert(input.is_open() && "Unable to load weight file.");
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");
    while(count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t type, size;
        std::string name;
        input >> name >> std::dec >> type >> size;
        wt.type = static_cast<DataType>(type);
        if (wt.type == DataType::kFLOAT)
        {
            uint32_t *val = reinterpret_cast<uint32_t*>(malloc(sizeof(val) * size));
            for (uint32_t x = 0, y = size; x < y; ++x)
            {
                input >> std::hex >> val[x];

            }
            wt.values = val;
        } else if (wt.type == DataType::kHALF)
        {
            uint16_t *val = reinterpret_cast<uint16_t*>(malloc(sizeof(val) * size));
            for (uint32_t x = 0, y = size; x < y; ++x)
            {
                input >> std::hex >> val[x];
            }
            wt.values = val;
        }
        wt.count = size;
        weightMap[name] = wt;
    }
    return weightMap;
}


std::string locateFile(const std::string& input, const std::vector<std::string> & directories)
{
    std::string file;
	const int MAX_DEPTH{10};
    bool found{false};
    for (auto &dir : directories)
    {
        file = dir + input;
        for (int i = 0; i < MAX_DEPTH && !found; i++)
        {
            std::ifstream checkFile(file);
            found = checkFile.is_open();
            if (found) break;
            file = "../" + file;
        }
        if (found) break;
        file.clear();
    }

    assert(!file.empty() && "Could not find a file due to it not existing in the data directory.");
    return file;
}

// We have the data files located in a specific directory. This 
// searches for that directory format from the current directory.
std::string locateFile(const std::string& input)
{
    std::vector<std::string> dirs{"./data/"};
    return locateFile(input, dirs);
}

// print tensor dimensions
void printDims(ITensor* data)
{
    Dims dims = data->getDimensions();
    int nbDims = dims.nbDims;
    for (int d = 0; d < nbDims; d++)
        std::cout << dims.d[d] << " ";// << dims.d[1] << " " << dims.d[2] << " " << dims.d[3] << std::endl;
    std::string sss;    
    if (data->getType() == DataType::kHALF)
        sss = "float16";
    if (data->getType() == DataType::kFLOAT)
        sss = "float32";
    std::cout << sss << " ";
    std::cout << std::endl;
}


void APIToModel(unsigned int maxBatchSize, IHostMemory **modelStream)
{
	// create the builder
	IBuilder* builder = createInferBuilder(gLogger);


///////////////////////////////////////////////////////////
    INetworkDefinition* network = builder->createNetwork();
    
    // load weights values from disk
    std::map<std::string, Weights> weightMap = loadWeights(locateFile(WEIGHTS_FILENAME));

	// define input
	auto data = network->addInput(INPUT_BLOB_NAME, DATATYPE, DimsCHW{INPUT_C, INPUT_H, INPUT_W});
	assert(data != nullptr);
    std::cout << "input" << std::endl;
    printDims(data);


    // add layer
    // 1 ////////////////////////////////////
    auto conv1 = network->addConvolution(*data, 64, DimsHW{3, 3}, weightMap["conv1_w"], weightMap["conv1_b"]);
	assert(conv1 != nullptr);
	conv1->setStride(DimsHW{1, 1});
    conv1->setPadding(DimsHW{1, 1});

    // set output
    conv1->getOutput(0)->setName(OUTPUT_BLOB_NAME_TEST);
	network->markOutput(*conv1->getOutput(0));



///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////

	// Build the engine
	builder->setMaxBatchSize(maxBatchSize);
	builder->setMaxWorkspaceSize(1e9);
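	// enable fp16 kernels (Half2Mode) when building a half-precision engine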
    if (DATATYPE == DataType::kHALF)
    	builder->setHalf2Mode(true);
	
    std::cout << "building the engine..." << std::endl;

	auto engine = builder->buildCudaEngine(*network);
	assert(engine != nullptr);

    std::cout << "engine built!" << std::endl;

	// serialize the engine, then close everything down
	(*modelStream) = engine->serialize();

	// Once we have built the cuda engine, we can release all of our held memory.
	for (auto &mem : weightMap)
    {
        free((void*)(mem.second.values));
    }
///////////////////////////////////////////////////////////

    network->destroy();
	engine->destroy();
	builder->destroy();
}


void setUpDevice(IExecutionContext& context, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();
    // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
    // of these, but in this case we know that there is exactly one input and one output.
    assert(engine.getNbBindings() == N_BINDINGS);

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // note that indices are guaranteed to be less than IEngine::getNbBindings()
    inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    outputIndexTest = engine.getBindingIndex(OUTPUT_BLOB_NAME_TEST);

    // create GPU buffers and a stream
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * INPUT_C * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndexTest], batchSize * OUTPUT_TEST_SIZE * sizeof(float)));

    // create cuda stream
    CHECK(cudaStreamCreate(&stream));
}

void cleanUp()
{
  	// release the stream and the buffers
	cudaStreamDestroy(stream);
	CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndexTest]));
}

void doInference(IExecutionContext& context, float* input, float* outputtest, int batchSize)
{
	// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
	CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * INPUT_C * sizeof(float), cudaMemcpyHostToDevice, stream));
	context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(outputtest, buffers[outputIndexTest], batchSize * OUTPUT_TEST_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
	cudaStreamSynchronize(stream);
}


// rearrange image data to [N, C, H, W] order
void prepareDataBatch(float *data, std::vector<cv::Mat> &frames)
{
     assert(data && !frames.empty());
     unsigned int volChl = INPUT_H * INPUT_W;
     unsigned int volImg = INPUT_H * INPUT_W * INPUT_C;
     
     for (int b = 0; b < BATCH_SIZE; b++)
         for (int c = 0; c < INPUT_C; c++)
         {
              // convert interleaved HWC pixels to planar CHW (the frames were converted to RGB in main)
              for (unsigned j = 0; j < volChl; j++)
                   data[b * volImg + c * volChl + j] = float(frames[b].data[j * INPUT_C + c]) / 255.0;
         }
     
     return;
}



void printOutput(float *out, const int batch_size, const int output_c,  const int output_h,  const int output_w)
{
    int output_size(output_c * output_h * output_w);

    std::cout << "================="<< std::endl;   
    std::cout << "================="<< std::endl;
    std::cout << "-----------------"<< std::endl; 
    for (int b = 0; b < batch_size; b++)
        {
        for (int c = 0; c < output_c; c++)
            {
                for (int h = 0; h < output_h; h++)
                {
                    for (int w = 0; w < output_w; w++)
                        std::cout << out[b * output_size + c * output_h * output_w + h * output_w + w] << " ";
                    std::cout << std::endl;
                }
            std::cout << "-----------------"<< std::endl; 
            }
        std::cout << "================="<< std::endl;   
        std::cout << "================="<< std::endl;
        }

    return;
}





int main(int argc, char** argv)
{
    std::cout << sizeof(float) << std::endl;

    //read input, convert, resize
    std::vector<std::string> image_paths;
  
    image_paths.push_back("./images/2.jpg");
    image_paths.push_back("./images/9.jpg");
    image_paths.push_back("./images/18.jpg");
    image_paths.push_back("./images/20.jpg");

    std::vector<cv::Mat> images(BATCH_SIZE);

    for (int b = 0; b < BATCH_SIZE; b++)
    {
        std::cout << image_paths[b] << std::endl;
        cv::Mat image_bgr = cv::imread(image_paths[b]);
        cv::Mat image_rgb, image;
        cv::cvtColor(image_bgr, image_rgb, cv::COLOR_BGR2RGB);
        cv::resize(image_rgb, image_rgb, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_LINEAR);
        images[b] = image_rgb;
    }

    // allocate CPU memory for input and output
    int inputSize = sizeof(float) * BATCH_SIZE * INPUT_C * INPUT_H * INPUT_W;
    int outputSizeTest = sizeof(float) * BATCH_SIZE * OUTPUT_TEST_SIZE;
    float *data = (float *)malloc(inputSize);
    float *outtest = (float *)malloc(outputSizeTest);

	// init model stream variables
    IHostMemory *modelStream{nullptr};
	IRuntime* runtime = createInferRuntime(gLogger);

	// create a model using the API directly and serialize it to a stream
    APIToModel(MAX_BATCH_SIZE, &modelStream);
    ICudaEngine* engine = runtime->deserializeCudaEngine(modelStream->data(), modelStream->size(), nullptr);

    // create execution context
	IExecutionContext *context = engine->createExecutionContext();

    // allocate memory on device
    setUpDevice(*context, BATCH_SIZE);

    // flatten image Mats, convert to float, and scale to [0, 1]
    prepareDataBatch(data, images);

    // run inference
    doInference(*context, data, outtest, BATCH_SIZE);

	// destroy the engine
	context->destroy();
	engine->destroy();
	runtime->destroy();

    // clean-up device
    cleanUp();

    // print out
    printOutput(outtest, BATCH_SIZE, OUTPUT_TEST_C, OUTPUT_TEST_H, OUTPUT_TEST_W);

    //free mem
    free(data);
    free(outtest);

    std::cout << "done!" << std::endl;

    return 0;
}

Weight files I used to run the example:

thanks in advance for any help,

f

Hi,

Based on the log, it looks like the weight data is not being recognized correctly by TensorRT.

It appears you are using the TensorRT API to create the engine from layers directly.
Could you share how you save the kHALF weights when training?

Thanks.

Hi,
thanks for replying;

this is the Python script I use to generate both the full-precision and the half-precision .wts files, starting from the trained weights. The trained weights are stored as a Python dictionary in a pickle file, which I also include below; at lines 13/14 one can switch between full and half precision.

import os
import numpy as np
import pickle
import struct

### conv
# the weights are specified as a contiguous array in GKCRS order, where G is the number of groups, K the number of output feature maps, C the number of input channels, and R and S are the height and width of the filter
## fc
# set the kernel weights. The expected format is an array of KC values, where K is the number of outputs and C is the number of inputs. 
#G = 1

precision = 'full'
#precision = 'half'

assert(precision in ['full', 'half'])
if precision == 'full':
    datatype = np.float32
    datatype_id = 0
elif precision == 'half':
    datatype = np.float16
    datatype_id = 1
    
    

flatten_order = 'C'
transpose_order_conv = (3, 2, 0, 1)
transpose_order_fc = (1, 0)
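# note: (3, 2, 0, 1) assumes TF-style (R, S, C, K) kernel layout and yields the KCRS order TensorRT expects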


def float_to_hex(f):
    
    return hex(struct.unpack('<I', struct.pack('<f', f))[0])


def make_printable(in_array):
    
    hex_array = [float_to_hex(x)[2:] for x in in_array]
    
    return ' '.join(hex_array)


if __name__ == '__main__':
    
    data_path = './data'
    in_file = os.path.join(data_path, 'weights_demo.p')
    out_file = os.path.join(data_path, 'weights_demo.wts')
    weights_data = pickle.load(open(in_file, 'rb'))
    
    with open(out_file, 'w') as outfile:
    
        # first line, number of weight tensors to store
        outfile.write('%s\n' % len(weights_data.keys()) )
    
        # iterate across weight tensors
        for ll in range(1,2):
            
            # get weight name
            w_id = 'conv%d_w' % ll
            b_id = 'conv%d_b' % ll
            
            # retrieve values
            w = weights_data[w_id]
            b = weights_data[b_id]
            
            # adjust to RT format and convert
            w = np.transpose(w, transpose_order_conv)
            w = w.flatten(flatten_order)
            w = w.astype(datatype)
            b = b.astype(datatype)
            
            # line format: <name> <datatype> <size> <hex values> 
            w_line = w_id + (' %d %d ' % (datatype_id, w.size)) + make_printable(w) + '\n'
            b_line = b_id + (' %d %d ' % (datatype_id, b.size)) + make_printable(b) + '\n'
            
            # append to file
            outfile.write(b_line)
            outfile.write(w_line)

The input file with the trained weights can be found at:
http://s000.tinyupload.com/?file_id=78209385462446853379

I hope this information helps;

Thanks again,

f

Hi,

Could you try to save the model with DataType::kHALF rather than numpy.float16?
Thanks.

Hello,
thanks for replying;
I am a bit confused by the request.

Let's say I have a set of trained weight values from a training run I previously completed in some framework;
these are currently a dictionary of numpy arrays, but they could be in any Python format.

What exactly do you mean by saving these weights as DataType::kHALF? Is there a TensorRT routine to convert a float scalar or tensor to DataType::kHALF? How would you replace lines 68-69 of my Python script (the astype call)?

Thanks, and apologies in advance if the answer is obvious

regards,

f

Hi,

Sorry for our unclear explanation.

Originally we suggested saving the model with our data type, because we were not sure about the conversion numpy uses.
After some internal discussion, however, we now recommend saving the weights as float32 and converting them into
float16 afterwards (in C++).

Flow: hex -> float32 -> float16

float* input_ptr;    // holds DataType::kFLOAT values
__half* output_ptr;  // holds DataType::kHALF values
...
output_ptr[i] = __float2half(input_ptr[i]);

Thanks.

Hi,
Thanks for your support, but unfortunately we have not succeeded in running half-precision inference on an API-defined TensorRT model. We followed the data flow you suggested for the weight conversion (and had previously tested other possible combinations as well). While we are not completely sure the conversion itself is free of mistakes, we suspect we are also missing something in the rest of the code that we have not been able to detect.

Since there are several unclear points that we have not figured out how to handle, we think the most effective way forward is to refer to the TensorRT 4.0.0.3 sample closest to our case, namely sampleMLP.
From the release notes I understand that this sample does not support FP16, even though, looking at the code, it appears to be prepared to handle the kHALF type in most of its parts.

My question is: which part, exactly, is unimplemented and prevents half-precision inference from working in this specific case?

I am confident that if I can figure this out, and perhaps work around it, it will be extremely helpful in solving our problem as well.

thanks again;

regards,

f

Hi,

We will forward this issue to our internal team and share an example if possible.
In the meantime, could you share your pickle file and complete serialize/deserialize source code with us?

Thanks.

Hi,
this is the script I use to generate the .wts file:

import os
import numpy as np
import pickle
import struct

### conv
# the weights are specified as a contiguous array in GKCRS order, where G is the number of groups, K the number of output feature maps, C the number of input channels, and R and S are the height and width of the filter
## fc
# set the kernel weights. The expected format is an array of KC values, where K is the number of outputs and C is the number of inputs. 
#G = 1

#precision = 'full'
precision = 'half'

assert(precision in ['full', 'half'])
if precision == 'full':
    datatype = np.float32
    datatype_id = 0
elif precision == 'half':
#    datatype = np.float16
    datatype = np.float32
    datatype_id = 1

flatten_order = 'C'
transpose_order_conv = (3, 2, 0, 1)
transpose_order_fc = (1, 0)

def float_to_hex(f):
    
    return hex(struct.unpack('<I', struct.pack('<f', f))[0])

def make_printable(in_array):
    
    hex_array = [float_to_hex(x)[2:] for x in in_array]
    
    return ' '.join(hex_array)

if __name__ == '__main__':
    
    data_path = './data'
    in_file = os.path.join(data_path, 'weights_demo.p')
    out_file = os.path.join(data_path, 'weights_demo.wts')
    weights_data = pickle.load(open(in_file, 'rb'))
    
    with open(out_file, 'w') as outfile:
    
        # first line, number of weight tensors to store
        outfile.write('%s\n' % len(weights_data.keys()) )
    
        # iterate across weight tensors
        for ll in range(1,2):
            
            # get weight name
            w_id = 'conv%d_w' % ll
            b_id = 'conv%d_b' % ll
            
            # retrieve values
            w = weights_data[w_id]
            b = weights_data[b_id]
            
            # adjust to RT format and convert
            w = np.transpose(w, transpose_order_conv)
            w = w.flatten(flatten_order)
            w = w.astype(datatype)
            b = b.astype(datatype)
            
            # line format: <name> <datatype> <size> <hex values> 
            w_line = w_id + (' %d %d ' % (datatype_id, w.size)) + make_printable(w) + '\n'
            b_line = b_id + (' %d %d ' % (datatype_id, b.size)) + make_printable(b) + '\n'
            
            # append to file
            outfile.write(b_line)
            outfile.write(w_line)

you can download the input pickle file from here:
http://s000.tinyupload.com/?file_id=56300649554087776585

I also attach the latest version of my toy inference example, which I modified according to the approach used in the sampleMLP TensorRT sample code:

#include "NvInfer.h"
#include "NvCaffeParser.h"
#include "NvUtils.h"
#include "cuda_runtime_api.h"
#include <cassert>
#include <cmath>
#include <cstring>
#include <string>
#include <fstream>
#include <iostream>
#include <sstream>
#include <sys/stat.h>
#include <vector>
#include <algorithm>
#include <opencv2/opencv.hpp>
#include <opencv2/core/core.hpp>
#include <cublas_v2.h>

#define CHECK(status)									\
{														\
	if (status != 0)									\
	{													\
		std::cout << "Cuda failure: " << status;		\
		abort();										\
	}													\
}

// stuff we know about the network and the input/output blobs
static const int INPUT_H = 256;
static const int INPUT_W = 384;
static const int INPUT_C = 3;

static const int OUTPUT_TEST_H = 256;
static const int OUTPUT_TEST_W = 384;
static const int OUTPUT_TEST_C = 64;
static const int OUTPUT_TEST_SIZE = OUTPUT_TEST_H * OUTPUT_TEST_W * OUTPUT_TEST_C;

static const int BATCH_SIZE = 4;
static const int MAX_BATCH_SIZE = 4;

const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "out";
const char* OUTPUT_BLOB_NAME_TEST = "test_out";

const int N_BINDINGS = 2;
static void* buffers[N_BINDINGS];
static cudaStream_t stream;
static int inputIndex, outputIndexTest;

using namespace nvinfer1;

static const DataType DATATYPE = DataType::kHALF;
const char* WEIGHTS_FILENAME = "weights_demo16.wts";
//static const DataType DATATYPE = DataType::kFLOAT;
//const char* WEIGHTS_FILENAME = "weights_demo32.wts";

// Logger for GIE info/warning/errors
class Logger : public nvinfer1::ILogger			
{
    public:
	void log(nvinfer1::ILogger::Severity severity, const char* msg) override
	{
		// suppress info-level messages
        if (severity == Severity::kINFO) return;

        switch (severity)
        {
            case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break;
            case Severity::kERROR: std::cerr << "ERROR: "; break;
            case Severity::kWARNING: std::cerr << "WARNING: "; break;
            case Severity::kINFO: std::cerr << "INFO: "; break;
            default: std::cerr << "UNKNOWN: "; break;
        }
        std::cerr << msg << std::endl;
	}
};

static Logger gLogger;

// Our weight files are in a very simple space-delimited format:
// <name> <type> <size> <size hex-encoded values>
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::map<std::string, Weights> weightMap;
	std::ifstream input(file);
	assert(input.is_open() && "Unable to load weight file.");
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");
    while(count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t type, size;
        std::string name;
        input >> name >> std::dec >> type >> size;
        wt.type = static_cast<DataType>(type);
        
        if (wt.type == DataType::kFLOAT)
        {
            std::cout << "full precision weights loaded" << std::endl;
            uint32_t *val = reinterpret_cast<uint32_t*>(malloc(sizeof(val) * size));
            for (uint32_t x = 0, y = size; x < y; ++x)
            {
                input >> std::hex >> val[x];

            }
            wt.values = val;
        } else if (wt.type == DataType::kHALF)
        {
            uint16_t *val = reinterpret_cast<uint16_t*>(malloc(sizeof(uint16_t) * size));
            uint32_t val32;
            for (uint32_t x = 0, y = size; x < y; ++x)
            {
                input >> std::hex >> val32;
                val[x] = val32;
            }
            wt.values = val;
        }
        wt.count = size;
        weightMap[name] = wt;
    }
    return weightMap;
}

std::string locateFile(const std::string& input, const std::vector<std::string> & directories)
{
    std::string file;
	const int MAX_DEPTH{10};
    bool found{false};
    for (auto &dir : directories)
    {
        file = dir + input;
        for (int i = 0; i < MAX_DEPTH && !found; i++)
        {
            std::ifstream checkFile(file);
            found = checkFile.is_open();
            if (found) break;
            file = "../" + file;
        }
        if (found) break;
        file.clear();
    }

    assert(!file.empty() && "Could not find a file due to it not existing in the data directory.");
    return file;
}

// We have the data files located in a specific directory. This 
// searches for that directory format from the current directory.
std::string locateFile(const std::string& input)
{
    std::vector<std::string> dirs{"./data/"};
    return locateFile(input, dirs);
}

// print tensor dimensions
void printDims(ITensor* data)
{
    Dims dims = data->getDimensions();
    int nbDims = dims.nbDims;
    for (int d = 0; d < nbDims; d++)
        std::cout << dims.d[d] << " ";// << dims.d[1] << " " << dims.d[2] << " " << dims.d[3] << std::endl;
    std::string sss;    
    if (data->getType() == DataType::kHALF)
        sss = "float16";
    if (data->getType() == DataType::kFLOAT)
        sss = "float32";
    std::cout << sss << " ";
    std::cout << std::endl;
}

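// mark every tensor that is a network output as kHALF (approach borrowed from sampleMLP)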
static void setAllLayerOutputsToHalf(INetworkDefinition* network)
{
    for (int i = 0; i < network->getNbLayers(); i++)
    {
        nvinfer1::ILayer* layer = network->getLayer(i);
        for (int j = 0; j < layer->getNbOutputs(); j++)
        {
            if (layer->getOutput(j)->isNetworkOutput())
                layer->getOutput(j)->setType(DataType::kHALF);
        }
    }
}

void APIToModel(unsigned int maxBatchSize, IHostMemory **modelStream)
{
	// create the builder
	IBuilder* builder = createInferBuilder(gLogger);

///////////////////////////////////////////////////////////
    INetworkDefinition* network = builder->createNetwork();
    
    // load weights values from disk
    std::map<std::string, Weights> weightMap = loadWeights(locateFile(WEIGHTS_FILENAME));

	// define input
	auto data = network->addInput(INPUT_BLOB_NAME, DATATYPE, DimsCHW{INPUT_C, INPUT_H, INPUT_W});
	assert(data != nullptr);
    std::cout << "input" << std::endl;
    printDims(data);

// add layer
    // 1 ////////////////////////////////////
    auto conv1 = network->addConvolution(*data, 64, DimsHW{3, 3}, weightMap["conv1_w"], weightMap["conv1_b"]);
	assert(conv1 != nullptr);
	conv1->setStride(DimsHW{1, 1});
    conv1->setPadding(DimsHW{1, 1});

    // set output
    conv1->getOutput(0)->setName(OUTPUT_BLOB_NAME_TEST);
	network->markOutput(*conv1->getOutput(0));

///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////

	// Build the engine
	builder->setMaxBatchSize(maxBatchSize);
	builder->setMaxWorkspaceSize(1e6);
    if (DATATYPE == DataType::kHALF)
    {
        builder->setHalf2Mode(true);
        setAllLayerOutputsToHalf(network);
    }
    std::cout << "building the engine..." << std::endl;

	auto engine = builder->buildCudaEngine(*network);
	assert(engine != nullptr);

    std::cout << "engine built!" << std::endl;

	// serialize the engine, then close everything down
	(*modelStream) = engine->serialize();

	// Once we have built the cuda engine, we can release all of our held memory.
	for (auto &mem : weightMap)
    {
        free((void*)(mem.second.values));
    }
///////////////////////////////////////////////////////////

    network->destroy();
	engine->destroy();
	builder->destroy();
}

template <typename T>
void setUpDevice(IExecutionContext& context, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();
    // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
    // of these, but in this case we know that there is exactly one input and one output.
    assert(engine.getNbBindings() == N_BINDINGS);

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // note that indices are guaranteed to be less than IEngine::getNbBindings()
    inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    outputIndexTest = engine.getBindingIndex(OUTPUT_BLOB_NAME_TEST);

    // create GPU buffers and a stream
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * INPUT_C * sizeof(T)));
    CHECK(cudaMalloc(&buffers[outputIndexTest], batchSize * OUTPUT_TEST_SIZE * sizeof(T)));

    // create cuda stream
    CHECK(cudaStreamCreate(&stream));
}

void cleanUp()
{
  	// release the stream and the buffers
	cudaStreamDestroy(stream);
	CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndexTest]));
}

template <typename T>
void doInference(IExecutionContext& context, uint8_t* input_, uint8_t* output_, int batchSize)
{
    // cast input and output pointers
    T* input = reinterpret_cast<T*>(input_);
    T* output = reinterpret_cast<T*>(output_);

	// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
	CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * INPUT_C * sizeof(T), cudaMemcpyHostToDevice, stream));
	context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndexTest], batchSize * OUTPUT_TEST_SIZE * sizeof(T), cudaMemcpyDeviceToHost, stream));
	cudaStreamSynchronize(stream);
}

// rearrange image data to [N, C, H, W] order
void prepareDataBatch(uint8_t *data, std::vector<cv::Mat> &frames)
{
     assert(data && !frames.empty());
     unsigned int volChl = INPUT_H * INPUT_W;
     unsigned int volImg = INPUT_H * INPUT_W * INPUT_C;
     
     for (int b = 0; b < BATCH_SIZE; b++)
         for (int c = 0; c < INPUT_C; c++)
         {
              // convert interleaved HWC pixels to planar CHW (the frames were converted to RGB in main)
              for (unsigned j = 0; j < volChl; j++)
                   reinterpret_cast<float*>(data)[b * volImg + c * volChl + j] = float(frames[b].data[j * INPUT_C + c]) / 255.0;
         }
     
     return;
}

void printOutput(float *out, const int batch_size, const int output_c,  const int output_h,  const int output_w)
{
    int output_size(output_c * output_h * output_w);

    std::cout << "================="<< std::endl;   
    std::cout << "================="<< std::endl;
    std::cout << "-----------------"<< std::endl; 
    for (int b = 0; b < batch_size; b++)
        {
        for (int c = 0; c < output_c; c++)
            {
                for (int h = 0; h < output_h; h++)
                {
                    for (int w = 0; w < output_w; w++)
                        std::cout << out[b * output_size + c * output_h * output_w + h * output_w + w] << " ";
                    std::cout << std::endl;
                }
            std::cout << "-----------------"<< std::endl; 
            }
        std::cout << "================="<< std::endl;   
        std::cout << "================="<< std::endl;
        }

    return;
}

int main(int argc, char** argv)
{
    std::cout << sizeof(float) << std::endl;

    //read input, convert, resize
    std::vector<std::string> image_paths;
  
    image_paths.push_back("./images/2.jpg");
    image_paths.push_back("./images/9.jpg");
    image_paths.push_back("./images/18.jpg");
    image_paths.push_back("./images/20.jpg");

    std::vector<cv::Mat> images(BATCH_SIZE);

    for (int b = 0; b < BATCH_SIZE; b++)
    {
        std::cout << image_paths[b] << std::endl;
        cv::Mat image_bgr = cv::imread(image_paths[b]);
        cv::Mat image_rgb, image;
        cv::cvtColor(image_bgr, image_rgb, cv::COLOR_BGR2RGB);
        cv::resize(image_rgb, image_rgb, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_LINEAR);
        images[b] = image_rgb;
    }

    // allocate CPU memory for input and output
    int inputSize = sizeof(float) * BATCH_SIZE * INPUT_C * INPUT_H * INPUT_W;
    int outputSizeTest = sizeof(float) * BATCH_SIZE * OUTPUT_TEST_SIZE;
    uint8_t* data = new uint8_t[inputSize];
    uint8_t* outtest = new uint8_t[outputSizeTest];

	// init model stream variables
    IHostMemory *modelStream{nullptr};
	IRuntime* runtime = createInferRuntime(gLogger);

	// create a model using the API directly and serialize it to a stream
    APIToModel(MAX_BATCH_SIZE, &modelStream);
    ICudaEngine* engine = runtime->deserializeCudaEngine(modelStream->data(), modelStream->size(), nullptr);

    // create execution context
	IExecutionContext *context = engine->createExecutionContext();

    // allocate memory on device
    if (DATATYPE == DataType::kHALF)
        setUpDevice<uint16_t>(*context, BATCH_SIZE);
    else
        setUpDevice<float>(*context, BATCH_SIZE);

    // flatten image Mats, convert to float, and scale to [0, 1]
    prepareDataBatch(data, images);

    // run inference
    if (DATATYPE == DataType::kHALF)
        doInference<uint16_t>(*context, data, outtest, BATCH_SIZE);
    else
        doInference<float>(*context, data, outtest, BATCH_SIZE);

	// destroy the engine
	context->destroy();
	engine->destroy();
	runtime->destroy();

    // clean-up device
    cleanUp();

    // print out
    float* out{reinterpret_cast<float*>(outtest)};
    printOutput(out, BATCH_SIZE, OUTPUT_TEST_C, OUTPUT_TEST_H, OUTPUT_TEST_W);

    //free mem
    delete[] data;
    delete[] outtest;

    std::cout << "done!" << std::endl;

    return 0;
}

The whole content of this post can also be found here:
https://github.com/FrancescoB-Vintra/fp16tensorRT

It would be really helpful if you could provide a working example of half-precision inference on an API-generated TensorRT model.

Thanks again,

f

Hi,

Thanks for sharing your source.
We will check this internally and get back to you later.

Thanks.

Hi,

There are two suggestions for your issue:

1.
You can find the float2half and half2float implementations in /usr/src/tensorrt/samples/samplePlugin/fp16.h.

2.
Based on your source, it looks like there is some misunderstanding about the weight and buffer formats.
They can be set to FLOAT or HALF independently.

If you want to use half weights, there is no need to set the buffers to half.
If you want to use half buffers, please remember to set all the input/output data to half
(call float2half on the input and half2float on the output).
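
For example, roughly (a sketch assuming the fp16::__float2half / fp16::__half2float helpers from the fp16.h mentioned above; buffer names and counts are illustrative):

// hypothetical sketch: pack the float32 host input into half before the H2D copy
std::vector<__half> inputHalf(inputCount);
for (size_t i = 0; i < inputCount; ++i)
    inputHalf[i] = fp16::__float2half(inputFloat[i]);
// ... copy inputHalf to the device, enqueue, copy the half output into outputHalf ...
// then unpack the half output back to float32 on the host
for (size_t i = 0; i < outputCount; ++i)
    outputFloat[i] = fp16::__half2float(outputHalf[i]);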

Thanks.

Hi,
we tried to follow your guidelines, but we still could not solve our issue (obtaining similar results between half- and full-precision inference).

Following your suggestions, we modified our weight-loading routine as shown in the code below;
in particular, we are using the conversion tools from “tensorrt/samples/samplePlugin/fp16.h” and we removed all buffer conversions (the data is kept in float format).

std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::map<std::string, Weights> weightMap;
	std::ifstream input(file);
	assert(input.is_open() && "Unable to load weight file.");
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");
    while(count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t type, size;
        std::string name;
        input >> name >> std::dec >> type >> size;
        wt.type = static_cast<DataType>(type);
        
        if (wt.type == DataType::kFLOAT)
        {
            std::cout << "full precision weights loaded" << std::endl;
            uint32_t *val = reinterpret_cast<uint32_t*>(malloc(sizeof(val) * size));
            for (uint32_t x = 0, y = size; x < y; ++x)
            {
                input >> std::hex >> val[x];
            }
            wt.values = val;
        }
        else if (wt.type == DataType::kHALF)
        {
            __half *val = reinterpret_cast<__half*>(malloc(sizeof(__half) * size));
            uint32_t val32;
            __half tmp_var;
            for (uint32_t x = 0, y = size; x < y; ++x)
            {
                input >> std::hex >> val32;
                val[x] = fp16::__float2half(reinterpret_cast<float>(val32));
            }
            wt.values = val;
        }
        wt.count = size;
        weightMap[name] = wt;
    }
    return weightMap;
}

Please find the complete modified code below:

#include "NvInfer.h"
#include "NvCaffeParser.h"
#include "NvUtils.h"
#include "cuda_runtime_api.h"
#include <cassert>
#include <cmath>
#include <cstring>
#include <string>
#include <fstream>
#include <iostream>
#include <sstream>
#include <sys/stat.h>
#include <vector>
#include <algorithm>
#include <opencv2/opencv.hpp>
#include <opencv2/core/core.hpp>
#include "fp16.h"

#define CHECK(status)									\
{														\
	if (status != 0)									\
	{													\
		std::cout << "Cuda failure: " << status;		\
		abort();										\
	}													\
}


// stuff we know about the network and the input/output blobs
static const int INPUT_H = 256;
static const int INPUT_W = 384;
static const int INPUT_C = 3;

static const int OUTPUT_TEST_H = 256;
static const int OUTPUT_TEST_W = 384;
static const int OUTPUT_TEST_C = 64;
static const int OUTPUT_TEST_SIZE = OUTPUT_TEST_H * OUTPUT_TEST_W * OUTPUT_TEST_C;

static const int BATCH_SIZE = 4;
static const int MAX_BATCH_SIZE = 4;

const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "out";
const char* OUTPUT_BLOB_NAME_TEST = "test_out";

const int N_BINDINGS = 2;
static void* buffers[N_BINDINGS];
static cudaStream_t stream;
static int inputIndex, outputIndexTest;

using namespace nvinfer1;


static const DataType DATATYPE = DataType::kHALF;
const char* WEIGHTS_FILENAME = "weights_demo16.wts";
//static const DataType DATATYPE = DataType::kFLOAT;
//const char* WEIGHTS_FILENAME = "weights_demo32.wts";


// Logger for GIE info/warning/errors
class Logger : public nvinfer1::ILogger			
{
    public:
	void log(nvinfer1::ILogger::Severity severity, const char* msg) override
	{
		// suppress info-level messages
        if (severity == Severity::kINFO) return;

        switch (severity)
        {
            case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break;
            case Severity::kERROR: std::cerr << "ERROR: "; break;
            case Severity::kWARNING: std::cerr << "WARNING: "; break;
            case Severity::kINFO: std::cerr << "INFO: "; break;
            default: std::cerr << "UNKNOWN: "; break;
        }
        std::cerr << msg << std::endl;
	}
};


static Logger gLogger;


// Our weight files are in a very simple space-delimited format:
// <name> <type> <size> <size hex-encoded values>
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::map<std::string, Weights> weightMap;
	std::ifstream input(file);
	assert(input.is_open() && "Unable to load weight file.");
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");
    while(count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t type, size;
        std::string name;
        input >> name >> std::dec >> type >> size;
        wt.type = static_cast<DataType>(type);
        
        if (wt.type == DataType::kFLOAT)
        {
            std::cout << "full precision weights loaded" << std::endl;
            uint32_t *val = reinterpret_cast<uint32_t*>(malloc(sizeof(val) * size));
            for (uint32_t x = 0, y = size; x < y; ++x)
            {
                input >> std::hex >> val[x];
            }
            wt.values = val;
        }
        else if (wt.type == DataType::kHALF)
        {
            __half *val = reinterpret_cast<__half*>(malloc(sizeof(__half) * size));
            uint32_t val32;
            __half tmp_var;
            for (uint32_t x = 0, y = size; x < y; ++x)
            {
                input >> std::hex >> val32;
                val[x] = fp16::__float2half(reinterpret_cast<float>(val32));
            }
            wt.values = val;
        }
        wt.count = size;
        weightMap[name] = wt;
    }
    return weightMap;
}



std::string locateFile(const std::string& input, const std::vector<std::string> & directories)
{
    std::string file;
	const int MAX_DEPTH{10};
    bool found{false};
    for (auto &dir : directories)
    {
        file = dir + input;
        for (int i = 0; i < MAX_DEPTH && !found; i++)
        {
            std::ifstream checkFile(file);
            found = checkFile.is_open();
            if (found) break;
            file = "../" + file;
        }
        if (found) break;
        file.clear();
    }

    assert(!file.empty() && "Could not find a file due to it not existing in the data directory.");
    return file;
}

// We have the data files located in a specific directory. This 
// searches for that directory format from the current directory.
std::string locateFile(const std::string& input)
{
    std::vector<std::string> dirs{"./data/"};
    return locateFile(input, dirs);
}

// print tensor dimensions
void printDims(ITensor* data)
{
    Dims dims = data->getDimensions();
    int nbDims = dims.nbDims;
    for (int d = 0; d < nbDims; d++)
        std::cout << dims.d[d] << " ";// << dims.d[1] << " " << dims.d[2] << " " << dims.d[3] << std::endl;
    std::string sss;    
    if (data->getType() == DataType::kHALF)
        sss = "float16";
    if (data->getType() == DataType::kFLOAT)
        sss = "float32";
    std::cout << sss << " ";
    std::cout << std::endl;
}


static void setAllLayerOutputsToHalf(INetworkDefinition* network)
{
    for (int i = 0; i < network->getNbLayers(); i++)
    {
        nvinfer1::ILayer* layer = network->getLayer(i);
        for (int j = 0; j < layer->getNbOutputs(); j++)
        {
            if (layer->getOutput(j)->isNetworkOutput())
                layer->getOutput(j)->setType(DataType::kHALF);
        }
    }
}

void APIToModel(unsigned int maxBatchSize, IHostMemory **modelStream)
{
	// create the builder
	IBuilder* builder = createInferBuilder(gLogger);


///////////////////////////////////////////////////////////
    INetworkDefinition* network = builder->createNetwork();
    
    // load weights values from disk
    std::map<std::string, Weights> weightMap = loadWeights(locateFile(WEIGHTS_FILENAME));

	// define input
	auto data = network->addInput(INPUT_BLOB_NAME, DATATYPE, DimsCHW{INPUT_C, INPUT_H, INPUT_W});
	assert(data != nullptr);
    std::cout << "input" << std::endl;
    printDims(data);


    // add layer
    // 1 ////////////////////////////////////
    auto conv1 = network->addConvolution(*data, 64, DimsHW{3, 3}, weightMap["conv1_w"], weightMap["conv1_b"]);
	assert(conv1 != nullptr);
	conv1->setStride(DimsHW{1, 1});
    conv1->setPadding(DimsHW{1, 1});

    // set output
    conv1->getOutput(0)->setName(OUTPUT_BLOB_NAME_TEST);
	network->markOutput(*conv1->getOutput(0));



///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////

	// Build the engine
	builder->setMaxBatchSize(maxBatchSize);
	builder->setMaxWorkspaceSize(1e6);
    if (DATATYPE == DataType::kHALF)
    {
        builder->setHalf2Mode(true);
        setAllLayerOutputsToHalf(network);
    }
    std::cout << "building the engine..." << std::endl;

	auto engine = builder->buildCudaEngine(*network);
	assert(engine != nullptr);

    std::cout << "engine built!" << std::endl;

	// serialize the engine, then close everything down
	(*modelStream) = engine->serialize();

	// Once we have built the cuda engine, we can release all of our held memory.
	for (auto &mem : weightMap)
    {
        free((void*)(mem.second.values));
    }
///////////////////////////////////////////////////////////

    network->destroy();
	engine->destroy();
	builder->destroy();
}

void setUpDevice(IExecutionContext& context, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();
    // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
    // of these, but in this case we know that there is exactly one input and one output.
    assert(engine.getNbBindings() == N_BINDINGS);

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // note that indices are guaranteed to be less than IEngine::getNbBindings()
    inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    outputIndexTest = engine.getBindingIndex(OUTPUT_BLOB_NAME_TEST);

    // create GPU buffers and a stream
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * INPUT_C * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndexTest], batchSize * OUTPUT_TEST_SIZE * sizeof(float)));

    // create cuda stream
    CHECK(cudaStreamCreate(&stream));
}

void cleanUp()
{
  	// release the stream and the buffers
	cudaStreamDestroy(stream);
	CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndexTest]));
}


void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
	// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
	CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * INPUT_C * sizeof(float), cudaMemcpyHostToDevice, stream));
	context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndexTest], batchSize * OUTPUT_TEST_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
	cudaStreamSynchronize(stream);
}


// rearrange image data to [N, C, H, W] order
void prepareDataBatch(float *data, std::vector<cv::Mat> &frames)
{
     assert(data && !frames.empty());
     unsigned int volChl = INPUT_H * INPUT_W;
     unsigned int volImg = INPUT_H * INPUT_W * INPUT_C;
     
     for (int b = 0; b < BATCH_SIZE; b++)
         for (int c = 0; c < INPUT_C; c++)
         {
              // convert interleaved HWC pixels to planar CHW (the frames were converted to RGB in main)
              for (unsigned j = 0; j < volChl; j++)
                   data[b * volImg + c * volChl + j] = float(frames[b].data[j * INPUT_C + c]) / 255.0;
         }
     
     return;
}



void printOutput(float *out, const int batch_size, const int output_c,  const int output_h,  const int output_w)
{
    int output_size(output_c * output_h * output_w);

    std::cout << "================="<< std::endl;   
    std::cout << "================="<< std::endl;
    std::cout << "-----------------"<< std::endl; 
    for (int b = 0; b < batch_size; b++)
        {
        for (int c = 0; c < output_c; c++)
            {
                for (int h = 0; h < output_h; h++)
                {
                    for (int w = 0; w < output_w; w++)
                        std::cout << out[b * output_size + c * output_h * output_w + h * output_w + w] << " ";
                    std::cout << std::endl;
                }
            std::cout << "-----------------"<< std::endl; 
            }
        std::cout << "================="<< std::endl;   
        std::cout << "================="<< std::endl;
        }

    return;
}





int main(int argc, char** argv)
{
    std::cout << sizeof(float) << std::endl;

    //read input, convert, resize
    std::vector<std::string> image_paths;
  
    image_paths.push_back("./images/2.jpg");
    image_paths.push_back("./images/9.jpg");
    image_paths.push_back("./images/18.jpg");
    image_paths.push_back("./images/20.jpg");

    std::vector<cv::Mat> images(BATCH_SIZE);

    for (int b = 0; b < BATCH_SIZE; b++)
    {
        std::cout << image_paths[b] << std::endl;
        cv::Mat image_bgr = cv::imread(image_paths[b]);
        cv::Mat image_rgb, image;
        cv::cvtColor(image_bgr, image_rgb, cv::COLOR_BGR2RGB);
        cv::resize(image_rgb, image_rgb, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_LINEAR);
        images[b] = image_rgb;
    }

    // allocate CPU memory for input and output
    int inputSize = BATCH_SIZE * INPUT_C * INPUT_H * INPUT_W;
    int outputSizeTest = BATCH_SIZE * OUTPUT_TEST_SIZE;
    float* data = new float[inputSize];
    float* outtest = new float[outputSizeTest];

	// init model stream variables
    IHostMemory *modelStream{nullptr};
	IRuntime* runtime = createInferRuntime(gLogger);

	// create a model using the API directly and serialize it to a stream
    APIToModel(MAX_BATCH_SIZE, &modelStream);
    ICudaEngine* engine = runtime->deserializeCudaEngine(modelStream->data(), modelStream->size(), nullptr);

    // create execution context
	IExecutionContext *context = engine->createExecutionContext();

    // allocate memory on device
    setUpDevice(*context, BATCH_SIZE);

    // flatten image Mats, convert to float, and scale to [0, 1]
    prepareDataBatch(data, images);

    // run inference
    doInference(*context, data, outtest, BATCH_SIZE);

	// destroy the engine
	context->destroy();
	engine->destroy();
	runtime->destroy();

    // clean-up device
    cleanUp();

    // print out
    printOutput(outtest, BATCH_SIZE, OUTPUT_TEST_C, OUTPUT_TEST_H, OUTPUT_TEST_W);

    //free mem
    delete[] data;
    delete[] outtest;

    std::cout << "done!" << std::endl;

    return 0;
}

Everything else needed to reproduce this problem can be found at
https://github.com/FrancescoB-Vintra/fp16tensorRT.git

Thanks,

f

Hi,

Since you have set all the network output tensors to half:

line189: layer->getOutput(j)->setType(DataType::kHALF);

Please remember to convert input from float to half precision:

line315: data[b * volImg + c * volChl + j] = float(frames[b].data[j * INPUT_C + c]) / 255.0;

And convert the output from half back to float precision:

line337: std::cout << out[b * output_size + c * output_h * output_w + h * output_w + w] << " ";

Thanks.

Hi,
We finally arrived at a working version of the code.

The relevant changes were converting the buffers to half precision with the tools from “fp16.h” and converting the weights the same way. However, we found that inference with the data buffers in float format and the weights in half format (point 2 of https://devtalk.nvidia.com/default/topic/1036354/general/problem-loading-weights-in-half-precision-mode/post/5267846/#5267846) did not work as expected.

A working example of the code can be found below.

#include "NvInfer.h"
#include "NvCaffeParser.h"
#include "NvUtils.h"
#include "cuda_runtime_api.h"
#include <cassert>
#include <cmath>
#include <cstring>
#include <string>
#include <fstream>
#include <iostream>
#include <sstream>
#include <sys/stat.h>
#include <vector>
#include <algorithm>
#include <opencv2/opencv.hpp>
#include <opencv2/core/core.hpp>
#include <type_traits>
#include "fp16.h"

#define CHECK(status)									\
{														\
	if (status != 0)									\
	{													\
		std::cout << "Cuda failure: " << status;		\
		abort();										\
	}													\
}


// stuff we know about the network and the input/output blobs
static const int INPUT_H = 256;
static const int INPUT_W = 384;
static const int INPUT_C = 3;

static const int OUTPUT_TEST_H = 256;
static const int OUTPUT_TEST_W = 384;
static const int OUTPUT_TEST_C = 64;
static const int OUTPUT_TEST_SIZE = OUTPUT_TEST_H * OUTPUT_TEST_W * OUTPUT_TEST_C;

static const int BATCH_SIZE = 4;
static const int MAX_BATCH_SIZE = 4;

const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "out";

const int N_BINDINGS = 2;
static void* buffers[N_BINDINGS];
static cudaStream_t stream;
static int inputIndex, outputIndexTest;

using namespace nvinfer1;


static const DataType DATATYPE = DataType::kHALF;
const char* WEIGHTS_FILENAME = "weights_demo16.wts";
//static const DataType DATATYPE = DataType::kFLOAT;
//const char* WEIGHTS_FILENAME = "weights_demo32.wts";


// Logger for GIE info/warning/errors
class Logger : public nvinfer1::ILogger			
{
    public:
	void log(nvinfer1::ILogger::Severity severity, const char* msg) override
	{
		// suppress info-level messages
        if (severity == Severity::kINFO) return;

        switch (severity)
        {
            case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break;
            case Severity::kERROR: std::cerr << "ERROR: "; break;
            case Severity::kWARNING: std::cerr << "WARNING: "; break;
            case Severity::kINFO: std::cerr << "INFO: "; break;
            default: std::cerr << "UNKNOWN: "; break;
        }
        std::cerr << msg << std::endl;
	}
};


static Logger gLogger;

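// type-punning union used to reinterpret the 32-bit hex pattern read from the file as a float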
union hf
{
    uint32_t data_u;
    float data_f;
} converter;

// Our weight files are in a very simple space-delimited format:
// <name> <type> <size> <size hex-encoded values>
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::map<std::string, Weights> weightMap;
	std::ifstream input(file);
	assert(input.is_open() && "Unable to load weight file.");
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");
    while(count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t type, size;
        std::string name;
        input >> name >> std::dec >> type >> size;
        wt.type = static_cast<DataType>(type);
        
        if (wt.type == DataType::kFLOAT)
        {
            uint32_t *val = reinterpret_cast<uint32_t*>(malloc(sizeof(val) * size));
            for (uint32_t x = 0, y = size; x < y; ++x)
            {
                input >> std::hex >> val[x];
            }
            wt.values = val;
        }
        else if (wt.type == DataType::kHALF)
        {
            __half *val = reinterpret_cast<__half*>(malloc(sizeof(__half) * size));
            for (uint32_t x = 0, y = size; x < y; ++x)
            {
                input >> std::hex >> converter.data_u;		
                val[x] = fp16::__float2half(converter.data_f);
            }
            wt.values = val;
        }
        wt.count = size;
        weightMap[name] = wt;
    }
    return weightMap;
}
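
// For reference, each entry parsed by loadWeights looks like this in the .wts
// file (illustrative values only, not taken from the actual weight files):
//
//     conv1_w 0 1728 3f2a4e98 be0145ac ...
//
// i.e. [name] [type: 0 = kFLOAT, 1 = kHALF] [count], then `count` 32-bit hex
// words; the kHALF files also store fp32 bit patterns, which are converted to
// fp16 as they are read.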



std::string locateFile(const std::string& input, const std::vector<std::string> & directories)
{
    std::string file;
	const int MAX_DEPTH{10};
    bool found{false};
    for (auto &dir : directories)
    {
        file = dir + input;
        for (int i = 0; i < MAX_DEPTH && !found; i++)
        {
            std::ifstream checkFile(file);
            found = checkFile.is_open();
            if (found) break;
            file = "../" + file;
        }
        if (found) break;
        file.clear();
    }

    assert(!file.empty() && "Could not find the file in the data directories.");
    return file;
}

// We have the data files located in a specific directory. This 
// searches for that directory format from the current directory.
std::string locateFile(const std::string& input)
{
    std::vector<std::string> dirs{"./data/"};
    return locateFile(input, dirs);
}

// print tensor dimensions and data type
void printDims(ITensor* data)
{
    Dims dims = data->getDimensions();
    for (int d = 0; d < dims.nbDims; d++)
        std::cout << dims.d[d] << " ";
    if (data->getType() == DataType::kHALF)
        std::cout << "float16";
    else if (data->getType() == DataType::kFLOAT)
        std::cout << "float32";
    std::cout << std::endl;
}


// force the network outputs to fp16 so the output bindings match the __half
// host and device buffers (only tensors marked as network outputs are touched)
static void setAllLayerOutputsToHalf(INetworkDefinition* network)
{
    for (int i = 0; i < network->getNbLayers(); i++)
    {
        nvinfer1::ILayer* layer = network->getLayer(i);
        for (int j = 0; j < layer->getNbOutputs(); j++)
        {
            if (layer->getOutput(j)->isNetworkOutput())
                layer->getOutput(j)->setType(DataType::kHALF);
        }
    }
}

void APIToModel(unsigned int maxBatchSize, IHostMemory **modelStream)
{
	// create the builder
	IBuilder* builder = createInferBuilder(gLogger);


///////////////////////////////////////////////////////////
    INetworkDefinition* network = builder->createNetwork();
    
    // load weights values from disk
    std::map<std::string, Weights> weightMap = loadWeights(locateFile(WEIGHTS_FILENAME));

	// define input
	auto data = network->addInput(INPUT_BLOB_NAME, DATATYPE, DimsCHW{INPUT_C, INPUT_H, INPUT_W});
	assert(data != nullptr);
    std::cout << "input" << std::endl;
    printDims(data);


    // add layer
    // 1 ////////////////////////////////////
    auto conv1 = network->addConvolution(*data, 64, DimsHW{3, 3}, weightMap["conv1_w"], weightMap["conv1_b"]);
	assert(conv1 != nullptr);
	conv1->setStride(DimsHW{1, 1});
    conv1->setPadding(DimsHW{1, 1});

    // set output
    conv1->getOutput(0)->setName(OUTPUT_BLOB_NAME);
	network->markOutput(*conv1->getOutput(0));



///////////////////////////////////////////////////////////

    // Build the engine
    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(1 << 20);    // 1 MiB of scratch space
    if (DATATYPE == DataType::kHALF)
    {
        builder->setHalf2Mode(true);
        setAllLayerOutputsToHalf(network);
    }
    std::cout << "building the engine..." << std::endl;

    auto engine = builder->buildCudaEngine(*network);
    assert(engine != nullptr);

    std::cout << "engine built!" << std::endl;

	// serialize the engine, then close everything down
	(*modelStream) = engine->serialize();

	// Once we have built the cuda engine, we can release all of our held memory.
	for (auto &mem : weightMap)
    {
        free((void*)(mem.second.values));
    }
///////////////////////////////////////////////////////////

    network->destroy();
	engine->destroy();
	builder->destroy();
}
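
// Note: on GPUs without native fp16 support half2 mode is not available; a
// defensive variant (a sketch, not in the original code) would guard the
// switch on the builder's capability query:
//
//     if (DATATYPE == DataType::kHALF && builder->platformHasFastFp16())
//         builder->setHalf2Mode(true);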

template <typename T>
void setUpDevice(IExecutionContext& context, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();
    // input and output buffer pointers that we pass to the engine - the engine
    // requires exactly IEngine::getNbBindings() of these, but in this case we
    // know that there is exactly one input and one output.
    assert(engine.getNbBindings() == N_BINDINGS);

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // note that indices are guaranteed to be less than IEngine::getNbBindings()
    inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    outputIndexTest = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // create GPU buffers and a stream
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * INPUT_C * sizeof(T)));
    CHECK(cudaMalloc(&buffers[outputIndexTest], batchSize * OUTPUT_TEST_SIZE * sizeof(T)));

    // create cuda stream
    CHECK(cudaStreamCreate(&stream));
}
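
// Note: enqueue() indexes `buffers` by binding index, which is why the device
// pointers are stored at inputIndex and outputIndexTest rather than in a
// fixed order.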

void cleanUp()
{
    // release the stream and the buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndexTest]));
}

template <typename T>
void doInference(IExecutionContext& context, T* input, T* output, int batchSize)
{
    // DMA the input to the GPU, execute the batch asynchronously, and DMA the
    // result back; the final synchronize makes the call blocking, so `output`
    // is valid on return
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * INPUT_C * sizeof(T), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndexTest], batchSize * OUTPUT_TEST_SIZE * sizeof(T), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
}


// rearrange image data to [N, C, H, W] order
void prepareDataBatch(__half* data, std::vector<cv::Mat> &frames)
{
     assert(data && !frames.empty());
     unsigned int volChl = INPUT_H * INPUT_W;
     unsigned int volImg = INPUT_H * INPUT_W * INPUT_C;
     
     for (int b = 0; b < BATCH_SIZE; b++)
         for (int c = 0; c < INPUT_C; c++)
         {
              // the color image to input should be in BGR order
              for (unsigned j = 0; j < volChl; j++)
                   data[b * volImg + c * volChl + j] = fp16::__float2half( (frames[b].data[j * INPUT_C + c]) / 255.0);
         }
     
     return;
}
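
// Worked example of the index arithmetic above (NCHW destination layout): for
// batch item b = 1, channel c = 2, pixel j = 0, the value is read from the
// interleaved OpenCV buffer at 0 * INPUT_C + 2 and written to
// data[1 * volImg + 2 * volChl + 0].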

void prepareDataBatch(float* data, std::vector<cv::Mat> &frames)
{
     assert(data && !frames.empty());
     unsigned int volChl = INPUT_H * INPUT_W;
     unsigned int volImg = INPUT_H * INPUT_W * INPUT_C;
     
     for (int b = 0; b < BATCH_SIZE; b++)
         for (int c = 0; c < INPUT_C; c++)
         {
              // the color image to input should be in BGR order
              for (unsigned j = 0; j < volChl; j++)
                   data[b * volImg + c * volChl + j] = (frames[b].data[j * INPUT_C + c]) / 255.0;
         }
     
     return;
}



void printOutput(float *out, const int batch_size, const int output_c,  const int output_h,  const int output_w)
{
    int output_size(output_c * output_h * output_w);

    std::cout << "================="<< std::endl;   
    std::cout << "================="<< std::endl;
    std::cout << "-----------------"<< std::endl; 
    for (int b = 0; b < batch_size; b++)
        {
        for (int c = 0; c < output_c; c++)
            {
                for (int h = 0; h < 5; h++)    // print only the first 5 of output_h rows
                {
                    for (int w = 0; w < 5; w++)    // and the first 5 of output_w columns
                        std::cout << out[b * output_size + c * output_h * output_w + h * output_w + w] << " ";
                    std::cout << std::endl;
                }
            std::cout << "-----------------"<< std::endl; 
            }
        std::cout << "================="<< std::endl;   
        std::cout << "================="<< std::endl;
        }

    return;
}



// convert fp16 results to fp32 for printing
void castOutput(__half* data_h, float* data_f, int output_size)
{
    for (int i = 0; i < output_size; i++)
         data_f[i] = fp16::__half2float(data_h[i]);
    return;
}

// fp32 path: plain copy, no conversion needed
void castOutput(float* data_in, float* data_f, int output_size)
{
    for (int i = 0; i < output_size; i++)
         data_f[i] = data_in[i];
    return;
}


// pick the host element type at compile time to match the network precision
typedef std::conditional<DATATYPE == DataType::kHALF, __half, float>::type FloatPrecision;


int main(int argc, char** argv)
{

    //read input, convert, resize
    std::vector<std::string> image_paths;
    image_paths.push_back("./images/2.jpg");
    image_paths.push_back("./images/9.jpg");
    image_paths.push_back("./images/18.jpg");
    image_paths.push_back("./images/20.jpg");

    std::vector<cv::Mat> images(BATCH_SIZE);

    for (int b = 0; b < BATCH_SIZE; b++)
    {
        std::cout << image_paths[b] << std::endl;
        cv::Mat image_bgr = cv::imread(image_paths[b]);
        cv::Mat image_rgb;
        cv::cvtColor(image_bgr, image_rgb, cv::COLOR_BGR2RGB);
        cv::resize(image_rgb, image_rgb, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_LINEAR);
        images[b] = image_rgb;
    }

    // allocate CPU memory for input and output
    int inputSize = BATCH_SIZE * INPUT_C * INPUT_H * INPUT_W;
    int outputSizeTest = BATCH_SIZE * OUTPUT_TEST_SIZE;
    FloatPrecision* data = new FloatPrecision[inputSize];
    FloatPrecision* outtest = new FloatPrecision[outputSizeTest];
    float* outtest_f = new float[outputSizeTest];

	// init model stream variables
    IHostMemory *modelStream{nullptr};
	IRuntime* runtime = createInferRuntime(gLogger);

    // create a model using the API directly and serialize it to a stream
    APIToModel(MAX_BATCH_SIZE, &modelStream);
    ICudaEngine* engine = runtime->deserializeCudaEngine(modelStream->data(), modelStream->size(), nullptr);
    modelStream->destroy();    // the serialized blob is no longer needed once deserialized

    // create execution context
	IExecutionContext *context = engine->createExecutionContext();

    // allocate memory on device
    if (DATATYPE == DataType::kHALF)
        setUpDevice<__half>(*context, BATCH_SIZE);
    else
        setUpDevice<float>(*context, BATCH_SIZE);

    // rearrange the image data to NCHW and scale pixel values to [0, 1]
    prepareDataBatch(data, images);

    // run inference
    doInference(*context, data, outtest, BATCH_SIZE);

	// destroy the engine
	context->destroy();
	engine->destroy();
	runtime->destroy();

    // clean-up device
    cleanUp();

    // print out
    castOutput(outtest, outtest_f, outputSizeTest);
    printOutput(outtest_f, BATCH_SIZE, OUTPUT_TEST_C, OUTPUT_TEST_H, OUTPUT_TEST_W);

    //free mem
    delete[] data;
    delete[] outtest;
    delete[] outtest_f;

    std::cout << "done!" << std::endl;

    return 0;
}

Code and data to run the example are available at
https://github.com/FrancescoB-Vintra/fp16tensorRT.git

Many thanks for your support,

f