Problem loading weights in half precision mode

Hello,
I have a problem trying to load weight values from disk to be used for half-precision inference;
for weight loading I use the function loadWeights provided in many TensorRT samples; that works fine for float32 but when I switch to half precision I have the following error

ERROR: Parameter check failed at: Network.cpp::addConvolution::32, condition: kernelWeights.values != NULL
demo: demo.cpp:255: void APIToModel(unsigned int, nvinfer1::IHostMemory**): Assertion `conv1 != nullptr' failed.
Aborted (core dumped)

I tried to modify the loadWeights function, forcing the conversion with the function __float2half (also found in the TensorRT samplePlugin sample), but the weights were not loaded successfully either;

I attach a working piece of code to reproduce the problem; at line 77 it is possible to switch between half and full precision; the weight files I used can be found below;
I use TensorRT RC 4.0.0.3 with CUDA 9.0;

#include "NvInfer.h"
#include "NvCaffeParser.h"
#include "NvUtils.h"
#include "cuda_runtime_api.h"
#include <sys/stat.h>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>
#include <opencv2/opencv.hpp>
#include <opencv2/core/core.hpp>
#include <cublas_v2.h>

// Abort on any non-zero CUDA status code, reporting the failing value and
// location. Wrapped in do { } while (0) so the macro expands to a single
// statement and is safe inside un-braced if/else; the argument is
// parenthesized so expressions with operators expand correctly.
#define CHECK(status)                                       \
    do                                                      \
    {                                                       \
        if ((status) != 0)                                  \
        {                                                   \
            std::cout << "Cuda failure: " << (status)       \
                      << " at " << __FILE__ << ":"          \
                      << __LINE__ << std::endl;             \
            abort();                                        \
        }                                                   \
    } while (0)


// stuff we know about the network and the input/output blobs
static const int INPUT_H = 256;  // input height in pixels
static const int INPUT_W = 384;  // input width in pixels
static const int INPUT_C = 3;    // input channels (color image)

// The test output is the conv1 feature map: 64 maps at input resolution.
static const int OUTPUT_TEST_H = 256;
static const int OUTPUT_TEST_W = 384;
static const int OUTPUT_TEST_C = 64;
// elements per image in the test output
static const int OUTPUT_TEST_SIZE = OUTPUT_TEST_H * OUTPUT_TEST_W * OUTPUT_TEST_C;

static const int BATCH_SIZE = 4;      // images fed per inference call
static const int MAX_BATCH_SIZE = 4;  // batch limit baked into the engine

// Tensor (blob) names used to look up engine binding indices.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "out";
const char* OUTPUT_BLOB_NAME_TEST = "test_out";

// Device state shared by setUpDevice/doInference/cleanUp:
// one input binding and one output binding, plus the CUDA stream.
const int N_BINDINGS = 2;
static void* buffers[N_BINDINGS];
static cudaStream_t stream;
static int inputIndex, outputIndexTest;

using namespace nvinfer1;

// Logger for GIE info/warning/errors
class Logger : public nvinfer1::ILogger
{
    public:
        // Forward TensorRT messages to stderr with a severity prefix,
        // dropping plain informational output.
        void log(nvinfer1::ILogger::Severity severity, const char* msg) override
        {
            // suppress info-level messages
            if (severity == Severity::kINFO)
                return;

            const char* prefix = "UNKNOWN: ";
            if (severity == Severity::kINTERNAL_ERROR)
                prefix = "INTERNAL_ERROR: ";
            else if (severity == Severity::kERROR)
                prefix = "ERROR: ";
            else if (severity == Severity::kWARNING)
                prefix = "WARNING: ";
            else if (severity == Severity::kINFO)
                prefix = "INFO: ";

            std::cerr << prefix << msg << std::endl;
        }
};


static Logger gLogger;  // shared logger handed to all TensorRT factory calls

// Precision selection: kHALF loads the fp16 weight file, kFLOAT the fp32 one.
// Swap which pair of lines is commented out to switch precision.
static const DataType DATATYPE = DataType::kHALF;
const char* WEIGHTS_FILENAME = "demo16.wts";
//static const DataType DATATYPE = DataType::kFLOAT;
//const char* WEIGHTS_FILENAME = "demo32.wts";


// Reinterpret the bits of `u` as a value of type T (e.g. float -> uint32_t).
// Implemented with memcpy instead of the original reinterpret_cast:
// dereferencing a type-punned pointer violates strict aliasing and is
// undefined behavior. memcpy of same-size trivially-copyable types is the
// well-defined equivalent and compiles to the same code.
template<typename T, typename U> T bitwise_cast(U u)
{
    static_assert(sizeof(T) == sizeof(U), "bitwise_cast requires same-size types");
    T t;
    std::memcpy(&t, &u, sizeof(T));
    return t;
}

// Software float32 -> float16 conversion with round-to-nearest-even, taken
// from the TensorRT samplePlugin sample. Works purely on bit patterns, so it
// runs on the host without CUDA intrinsics.
__half __float2half(float f)
{
    uint32_t x = bitwise_cast<uint32_t, float>(f);
    uint32_t u = (x & 0x7fffffff);  // magnitude bits (sign dropped)

    // Get rid of +NaN/-NaN case first.
    if (u > 0x7f800000)
        return bitwise_cast<__half, uint16_t>(uint16_t(0x7fff));
  
    uint16_t sign = ((x >> 16) & 0x8000);
  
    // Get rid of +Inf/-Inf, +0/-0.
    // Above this magnitude the value cannot be represented: signed infinity.
    if (u > 0x477fefff)
        return bitwise_cast<__half, uint16_t>(sign | uint16_t(0x7c00));

    // Below this magnitude the value underflows: signed zero.
    if (u < 0x33000001)
        return bitwise_cast<__half, uint16_t>(sign | uint16_t(0x0000));

    uint32_t exponent = ((u >> 23) & 0xff);
    uint32_t mantissa = (u & 0x7fffff);

    // Normal halves drop 13 mantissa bits; smaller exponents produce a
    // subnormal half with a correspondingly larger shift.
    uint32_t shift;
    if (exponent > 0x70)
    {
        shift = 13;
        exponent -= 0x70;  // rebias from float (127) to half (15)
    }
    else
    {
        shift = 0x7e - exponent;
        exponent = 0;
        mantissa |= 0x800000;  // make the implicit leading 1 explicit
    }

    uint32_t lsb    = (1 << shift);
    uint32_t lsb_s1 = (lsb >> 1);
    uint32_t lsb_m1 = (lsb - 1);
  
    // Round to nearest even.
    uint32_t remainder = (mantissa & lsb_m1);
    mantissa >>= shift;
    if ( (remainder > lsb_s1) || ((remainder == lsb_s1) && (mantissa & 0x1)) )
    {
        ++mantissa;
        if (!(mantissa & 0x3ff))  // mantissa overflowed into the exponent
        {
            ++exponent;
            mantissa = 0;
        }
    }
    
    return bitwise_cast<__half, uint16_t>(sign | uint16_t(exponent<<10) | uint16_t(mantissa));
}


// Our weight files are in a very simple space delimited format:
//   [count]
//   [name] [type] [size] <data x size in hex>
// Returns a map from blob name to Weights. The value buffers are
// heap-allocated; the caller owns them and must free() each one after the
// engine has been built.
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::map<std::string, Weights> weightMap;
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");
    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t type, size;
        std::string name;
        input >> name >> std::dec >> type >> size;
        wt.type = static_cast<DataType>(type);

        // Fail loudly on types we cannot parse: silently skipping one leaves
        // wt.values == nullptr, which is exactly what later trips
        // "kernelWeights.values != NULL" inside addConvolution.
        assert((wt.type == DataType::kFLOAT || wt.type == DataType::kHALF) &&
               "Unsupported weight type in file.");

        if (wt.type == DataType::kFLOAT)
        {
            // BUG FIX: the original allocated sizeof(val) * size, i.e. one
            // POINTER-size per value; allocate one 32-bit word per value.
            uint32_t* val = static_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
            assert(val != nullptr && "Weight allocation failed.");
            for (uint32_t x = 0; x < size; ++x)
                input >> std::hex >> val[x];
            wt.values = val;
        }
        else // DataType::kHALF
        {
            uint16_t* val = static_cast<uint16_t*>(malloc(sizeof(uint16_t) * size));
            assert(val != nullptr && "Weight allocation failed.");
            for (uint32_t x = 0; x < size; ++x)
                input >> std::hex >> val[x];
            wt.values = val;
        }
        // Catch truncated files / non-hex tokens instead of storing garbage.
        assert(!input.fail() && "Malformed weight file (hex parse failed).");
        wt.count = size;
        weightMap[name] = wt;
    }
    return weightMap;
}


// Search each directory for `input`, probing up to MAX_DEPTH parent
// directories ("../" prefixes) per entry. Returns the first path that opens;
// asserts (and returns an empty string under NDEBUG) when nothing matches.
std::string locateFile(const std::string& input, const std::vector<std::string> & directories)
{
    const int MAX_DEPTH{10};
    for (const auto& dir : directories)
    {
        std::string candidate = dir + input;
        for (int depth = 0; depth < MAX_DEPTH; ++depth)
        {
            std::ifstream probe(candidate);
            if (probe.is_open())
                return candidate;
            candidate = "../" + candidate;
        }
    }

    assert(false && "Could not find a file due to it not existing in the data directory.");
    return std::string();
}

// We have the data files located in a specific directory. This 
// searches for that directory format from the current directory.
// Convenience overload: look for `input` under the project's data directory.
std::string locateFile(const std::string& input)
{
    const std::vector<std::string> searchDirs{"./data/"};
    return locateFile(input, searchDirs);
}

// Print a tensor's dimensions followed by its element type name.
void printDims(ITensor* data)
{
    const Dims dims = data->getDimensions();
    for (int d = 0; d < dims.nbDims; ++d)
        std::cout << dims.d[d] << " ";

    std::string typeName;
    if (data->getType() == DataType::kHALF)
        typeName = "float16";
    if (data->getType() == DataType::kFLOAT)
        typeName = "float32";
    std::cout << typeName << " " << std::endl;
}


// Build the one-convolution network via the TensorRT API, build an engine
// from it, and serialize the engine into *modelStream.
// maxBatchSize: batch limit baked into the engine.
// modelStream:  receives the serialized engine; caller owns it and must
//               destroy() it after deserializing.
void APIToModel(unsigned int maxBatchSize, IHostMemory **modelStream)
{
    // create the builder
    IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);

    INetworkDefinition* network = builder->createNetwork();
    assert(network != nullptr);

    // load weight values from disk; the buffers must stay alive until the
    // engine has been built
    std::map<std::string, Weights> weightMap = loadWeights(locateFile(WEIGHTS_FILENAME));

    // define input
    auto data = network->addInput(INPUT_BLOB_NAME, DATATYPE, DimsCHW{INPUT_C, INPUT_H, INPUT_W});
    assert(data != nullptr);
    std::cout << "input" << std::endl;
    printDims(data);

    // single 3x3 convolution, stride 1, pad 1, 64 output feature maps
    auto conv1 = network->addConvolution(*data, 64, DimsHW{3, 3}, weightMap["conv1_w"], weightMap["conv1_b"]);
    assert(conv1 != nullptr);
    conv1->setStride(DimsHW{1, 1});
    conv1->setPadding(DimsHW{1, 1});

    // set output
    conv1->getOutput(0)->setName(OUTPUT_BLOB_NAME_TEST);
    network->markOutput(*conv1->getOutput(0));

    // Build the engine
    builder->setMaxBatchSize(maxBatchSize);
    // Pass an integral byte count; the original used the double literal 1e9,
    // which relied on an implicit floating-to-integer conversion.
    builder->setMaxWorkspaceSize(1000000000ULL);
    if (DATATYPE == DataType::kHALF)
        builder->setHalf2Mode(true);

    std::cout << "building the engine..." << std::endl;
    auto engine = builder->buildCudaEngine(*network);
    assert(engine != nullptr);
    std::cout << "engine built!" << std::endl;

    // serialize the engine, then close everything down
    (*modelStream) = engine->serialize();
    assert(*modelStream != nullptr);

    // Once we have built the cuda engine, we can release all of our held memory.
    for (auto &mem : weightMap)
    {
        free(const_cast<void*>(mem.second.values));
    }

    network->destroy();
    engine->destroy();
    builder->destroy();
}


// Allocate the device-side input/output buffers and the CUDA stream used by
// doInference(). Results land in the file-scope buffers/stream globals.
void setUpDevice(IExecutionContext& context, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();
    // The engine requires exactly IEngine::getNbBindings() buffers; this
    // network has exactly one input and one output.
    assert(engine.getNbBindings() == N_BINDINGS);

    // Resolve binding slots by tensor name; indices are guaranteed to be
    // less than getNbBindings().
    inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    outputIndexTest = engine.getBindingIndex(OUTPUT_BLOB_NAME_TEST);

    // Device storage for one batch of input and one batch of output.
    const size_t inputBytes = sizeof(float) * batchSize * INPUT_H * INPUT_W * INPUT_C;
    const size_t outputBytes = sizeof(float) * batchSize * OUTPUT_TEST_SIZE;
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndexTest], outputBytes));

    // create cuda stream
    CHECK(cudaStreamCreate(&stream));
}

// Release the CUDA stream and device buffers allocated in setUpDevice().
void cleanUp()
{
    // CONSISTENCY FIX: the original ignored cudaStreamDestroy's return code
    // while CHECK-ing every other CUDA call; check it too.
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndexTest]));
}

// Run one batch: DMA the input to the GPU, execute asynchronously, DMA the
// test output back, and wait for the stream to drain.
// input:      host NCHW float buffer of batchSize images.
// outputtest: caller-allocated host buffer for batchSize * OUTPUT_TEST_SIZE floats.
void doInference(IExecutionContext& context, float* input, float* outputtest, int batchSize)
{
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * INPUT_C * sizeof(float), cudaMemcpyHostToDevice, stream));
    // BUG FIX: the original discarded enqueue's success flag.
    const bool enqueued = context.enqueue(batchSize, buffers, stream, nullptr);
    assert(enqueued && "enqueue failed");
    (void)enqueued;
    CHECK(cudaMemcpyAsync(outputtest, buffers[outputIndexTest], batchSize * OUTPUT_TEST_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    // BUG FIX: the original ignored the synchronize result as well.
    CHECK(cudaStreamSynchronize(stream));
}


// rearrange image data to [N, C, H, W] order
void prepareDataBatch(float *data, std::vector<cv::Mat> &frames)
{
     assert(data && !frames.empty());
     unsigned int volChl = INPUT_H * INPUT_W;
     unsigned int volImg = INPUT_H * INPUT_W * INPUT_C;
     
     for (int b = 0; b < BATCH_SIZE; b++)
         for (int c = 0; c < INPUT_C; c++)
         {
              // the color image to input should be in BGR order
              for (unsigned j = 0; j < volChl; j++)
                   data[b * volImg + c * volChl + j] = float(frames[b].data[j * INPUT_C + c]) / 255.0;
         }
     
     return;
}



// Dump a [N, C, H, W] float buffer to stdout: one H x W slab of numbers per
// channel, "-" separators between channels and "=" separators between images.
void printOutput(float *out, const int batch_size, const int output_c,  const int output_h,  const int output_w)
{
    const int chanSize = output_h * output_w;
    const int imgSize = output_c * chanSize;

    std::cout << "=================" << std::endl;
    std::cout << "=================" << std::endl;
    std::cout << "-----------------" << std::endl;
    for (int b = 0; b < batch_size; b++)
    {
        const float* img = out + b * imgSize;
        for (int c = 0; c < output_c; c++)
        {
            const float* chan = img + c * chanSize;
            for (int h = 0; h < output_h; h++)
            {
                for (int w = 0; w < output_w; w++)
                    std::cout << chan[h * output_w + w] << " ";
                std::cout << std::endl;
            }
            std::cout << "-----------------" << std::endl;
        }
        std::cout << "=================" << std::endl;
        std::cout << "=================" << std::endl;
    }

    return;
}





// Load a batch of images, build and serialize the engine, deserialize it,
// run one inference pass, and print the raw output tensor.
int main(int argc, char** argv)
{
    std::cout << sizeof(float) << std::endl;

    // read input, convert, resize
    std::vector<std::string> image_paths;
    image_paths.push_back("./images/2.jpg");
    image_paths.push_back("./images/9.jpg");
    image_paths.push_back("./images/18.jpg");
    image_paths.push_back("./images/20.jpg");

    std::vector<cv::Mat> images(BATCH_SIZE);
    for (int b = 0; b < BATCH_SIZE; b++)
    {
        std::cout << image_paths[b] << std::endl;
        cv::Mat image_bgr = cv::imread(image_paths[b]);
        // BUG FIX: imread returns an empty Mat on failure; the original fed
        // it straight into cvtColor, which would throw with a cryptic error.
        assert(!image_bgr.empty() && "failed to read image");
        cv::Mat image_rgb;
        cv::cvtColor(image_bgr, image_rgb, cv::COLOR_BGR2RGB);
        cv::resize(image_rgb, image_rgb, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_LINEAR);
        images[b] = image_rgb;
    }

    // allocate CPU memory for input and output
    const int inputSize = sizeof(float) * BATCH_SIZE * INPUT_C * INPUT_H * INPUT_W;
    const int outputSizeTest = sizeof(float) * BATCH_SIZE * OUTPUT_TEST_SIZE;
    float *data = (float *)malloc(inputSize);
    float *outtest = (float *)malloc(outputSizeTest);
    assert(data && outtest && "host allocation failed");  // original never checked malloc

    // create a model using the API directly and serialize it to a stream
    IHostMemory *modelStream{nullptr};
    IRuntime* runtime = createInferRuntime(gLogger);
    APIToModel(MAX_BATCH_SIZE, &modelStream);
    ICudaEngine* engine = runtime->deserializeCudaEngine(modelStream->data(), modelStream->size(), nullptr);
    assert(engine != nullptr);
    // BUG FIX: the serialized stream was never released in the original.
    modelStream->destroy();

    // create execution context
    IExecutionContext *context = engine->createExecutionContext();
    assert(context != nullptr);

    // allocate memory on device
    setUpDevice(*context, BATCH_SIZE);

    // flatten image Mat, convert to float, do TF-style whitening
    prepareDataBatch(data, images);

    // run inference
    doInference(*context, data, outtest, BATCH_SIZE);

    // destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // clean-up device
    cleanUp();

    // print out
    printOutput(outtest, BATCH_SIZE, OUTPUT_TEST_C, OUTPUT_TEST_H, OUTPUT_TEST_W);

    // free mem
    free(data);
    free(outtest);

    std::cout << "done!" << std::endl;

    return 0;
}

The weight files I used to run the example are attached below:

thanks in advance for any help,

f