Hello,
I have a problem trying to load weight values to be used for half-precision inference;
for weight loading I use the function loadWeights provided in many TensorRT samples; that works fine for float32 but when I switch to half precision I have the following error
ERROR: Parameter check failed at: Network.cpp::addConvolution::32, condition: kernelWeights.values != NULL
demo: demo.cpp:255: void APIToModel(unsigned int, nvinfer1::IHostMemory**): Assertion `conv1 != nullptr' failed.
Aborted (core dumped)
I tried to modify the loadWeights function, forcing the conversion with the function __float2half (also found in the TensorRT samplePlugin sample), but the weights were not loaded successfully either;
I attach a working piece of code to reproduce the problem; at line 77 it is possible to switch between half and full precision; the weight files I used can be found below;
I use TensorRT RC 4.0.0.3 with CUDA 9.0;
#include "NvInfer.h"
#include "NvCaffeParser.h"
#include "NvUtils.h"
#include "cuda_runtime_api.h"
#include <cublas_v2.h>
#include <sys/stat.h>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>
#include <opencv2/opencv.hpp>
#include <opencv2/core/core.hpp>
// Abort on any non-zero CUDA status, reporting the failing status code.
// Wrapped in do { } while (0) so the macro expands to a single statement
// and is safe inside un-braced if/else; the status argument is
// parenthesized and std::endl flushes the message before abort().
#define CHECK(status) \
    do \
    { \
        if ((status) != 0) \
        { \
            std::cout << "Cuda failure: " << (status) << std::endl; \
            abort(); \
        } \
    } while (0)
// stuff we know about the network and the input/output blobs
// Input image geometry (channels, height, width of the "data" blob).
static const int INPUT_H = 256;
static const int INPUT_W = 384;
static const int INPUT_C = 3;
// Geometry of the single debug output blob (the conv1 activation).
static const int OUTPUT_TEST_H = 256;
static const int OUTPUT_TEST_W = 384;
static const int OUTPUT_TEST_C = 64;
static const int OUTPUT_TEST_SIZE = OUTPUT_TEST_H * OUTPUT_TEST_W * OUTPUT_TEST_C;
// Batch actually run at inference vs. the maximum the engine is built for.
static const int BATCH_SIZE = 4;
static const int MAX_BATCH_SIZE = 4;
// Blob names used to look up engine binding indices.
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "out"; // NOTE(review): appears unused in this file — confirm
const char* OUTPUT_BLOB_NAME_TEST = "test_out";
const int N_BINDINGS = 2;
// Device buffers, CUDA stream and binding indices shared between
// setUpDevice(), doInference() and cleanUp().
static void* buffers[N_BINDINGS];
static cudaStream_t stream;
static int inputIndex, outputIndexTest;
using namespace nvinfer1;
// Logger for GIE info/warning/errors.
// TensorRT calls log() for every message it emits; INFO-level noise is
// dropped and everything else goes to stderr with a severity prefix.
class Logger : public nvinfer1::ILogger
{
public:
    void log(nvinfer1::ILogger::Severity severity, const char* msg) override
    {
        // suppress info-level messages
        if (severity == Severity::kINFO)
            return;

        const char* prefix = "UNKNOWN: ";
        switch (severity)
        {
            case Severity::kINTERNAL_ERROR: prefix = "INTERNAL_ERROR: "; break;
            case Severity::kERROR:          prefix = "ERROR: ";          break;
            case Severity::kWARNING:        prefix = "WARNING: ";        break;
            case Severity::kINFO:           prefix = "INFO: ";           break;
            default:                        break;
        }
        std::cerr << prefix << msg << std::endl;
    }
};
static Logger gLogger;
// Precision toggle: the active pair selects half precision; swap in the
// commented pair to run full float32. The weight file must match the
// chosen DataType (loadWeights reads 16-bit hex words for kHALF and
// 32-bit hex words for kFLOAT).
static const DataType DATATYPE = DataType::kHALF;
const char* WEIGHTS_FILENAME = "demo16.wts";
//static const DataType DATATYPE = DataType::kFLOAT;
//const char* WEIGHTS_FILENAME = "demo32.wts";
template<typename T, typename U> T bitwise_cast(U u)
{
return *reinterpret_cast<T*>(&u);
}
// Software float32 -> float16 conversion with round-to-nearest-even,
// taken from the TensorRT samplePlugin sample; operates purely on the
// IEEE-754 bit patterns via bitwise_cast.
// NOTE(review): the name __float2half matches a CUDA intrinsic and uses a
// reserved double-underscore identifier — consider renaming if cuda_fp16.h
// is ever included in this translation unit.
__half __float2half(float f)
{
    uint32_t x = bitwise_cast<uint32_t, float>(f);
    uint32_t u = (x & 0x7fffffff); // magnitude (sign bit cleared)
    // Get rid of +NaN/-NaN case first.
    if (u > 0x7f800000)
        return bitwise_cast<__half, uint16_t>(uint16_t(0x7fff));
    uint16_t sign = ((x >> 16) & 0x8000);
    // Get rid of +Inf/-Inf, +0/-0.
    // Magnitudes above the largest finite half round to signed infinity;
    // magnitudes below the smallest subnormal half flush to signed zero.
    if (u > 0x477fefff)
        return bitwise_cast<__half, uint16_t>(sign | uint16_t(0x7c00));
    if (u < 0x33000001)
        return bitwise_cast<__half, uint16_t>(sign | uint16_t(0x0000));
    uint32_t exponent = ((u >> 23) & 0xff);
    uint32_t mantissa = (u & 0x7fffff);
    uint32_t shift;
    if (exponent > 0x70)
    {
        // Normal half: rebias the exponent (bias 127 -> 15) and drop 13
        // mantissa bits.
        shift = 13;
        exponent -= 0x70;
    }
    else
    {
        // Subnormal half: restore the implicit leading 1 and shift the
        // mantissa right until the half exponent reaches zero.
        shift = 0x7e - exponent;
        exponent = 0;
        mantissa |= 0x800000;
    }
    // lsb   = value of the least significant kept bit
    // lsb_s1 = half of lsb (the rounding threshold)
    // lsb_m1 = mask of the discarded bits
    uint32_t lsb = (1 << shift);
    uint32_t lsb_s1 = (lsb >> 1);
    uint32_t lsb_m1 = (lsb - 1);
    // Round to nearest even.
    uint32_t remainder = (mantissa & lsb_m1);
    mantissa >>= shift;
    if ( (remainder > lsb_s1) || ((remainder == lsb_s1) && (mantissa & 0x1)) )
    {
        ++mantissa;
        if (!(mantissa & 0x3ff))
        {
            // Mantissa rounded up past 10 bits: carry into the exponent.
            ++exponent;
            mantissa = 0;
        }
    }
    return bitwise_cast<__half, uint16_t>(sign | uint16_t(exponent<<10) | uint16_t(mantissa));
}
// Our weight files are in a very simple space delimited format:
// the first token is the blob count, then per blob:
//   <name> <type> <count> <count hex words...>
// where <type> maps onto nvinfer1::DataType (0 = kFLOAT, 1 = kHALF).
// The caller owns the returned buffers and must free() each Weights::values.
std::map<std::string, Weights> loadWeights(const std::string file)
{
    std::map<std::string, Weights> weightMap;
    std::ifstream input(file);
    assert(input.is_open() && "Unable to load weight file.");
    int32_t count;
    input >> count;
    assert(count > 0 && "Invalid weight map file.");
    while (count--)
    {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        uint32_t type, size;
        std::string name;
        input >> name >> std::dec >> type >> size;
        wt.type = static_cast<DataType>(type);
        if (wt.type == DataType::kFLOAT)
        {
            // BUG FIX: the original allocated sizeof(val) * size where val is
            // a pointer (8 bytes on 64-bit), not the element size. Allocate
            // sizeof(*val) per 32-bit word instead.
            uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(*val) * size));
            assert(val != nullptr && "Weight allocation failed.");
            for (uint32_t x = 0; x < size; ++x)
            {
                input >> std::hex >> val[x];
            }
            wt.values = val;
        }
        else if (wt.type == DataType::kHALF)
        {
            // Same fix for the fp16 path: one 16-bit word per element.
            uint16_t* val = reinterpret_cast<uint16_t*>(malloc(sizeof(*val) * size));
            assert(val != nullptr && "Weight allocation failed.");
            for (uint32_t x = 0; x < size; ++x)
            {
                input >> std::hex >> val[x];
            }
            wt.values = val;
        }
        else
        {
            // The original silently fell through here, leaving wt.values
            // null — which later trips TensorRT's
            // "kernelWeights.values != NULL" parameter check. Fail loudly.
            assert(false && "Unsupported weight data type in file.");
        }
        wt.count = size;
        weightMap[name] = wt;
    }
    return weightMap;
}
// Search each directory (and up to MAX_DEPTH parent levels above it) for
// `input`; return the first candidate path that can be opened for
// reading. Asserts when no directory yields a readable file.
std::string locateFile(const std::string& input, const std::vector<std::string> & directories)
{
    static const int MAX_DEPTH = 10;
    std::string located;
    for (const auto& dir : directories)
    {
        std::string candidate = dir + input;
        for (int depth = 0; depth < MAX_DEPTH; ++depth)
        {
            std::ifstream probe(candidate);
            if (probe.is_open())
            {
                located = candidate;
                break;
            }
            // Not here — retry one directory level up.
            candidate = "../" + candidate;
        }
        if (!located.empty())
            break;
    }
    assert(!located.empty() && "Could not find a file due to it not existing in the data directory.");
    return located;
}
// We have the data files located in a specific directory. This
// searches for that directory format from the current directory.
std::string locateFile(const std::string& input)
{
    static const std::vector<std::string> kSearchDirs{"./data/"};
    return locateFile(input, kSearchDirs);
}
// print tensor dimensions
// Debug helper: writes each dimension of `data` followed by a readable
// name for its element type to stdout.
void printDims(ITensor* data)
{
    Dims dims = data->getDimensions();
    int nbDims = dims.nbDims;
    for (int d = 0; d < nbDims; d++)
        std::cout << dims.d[d] << " ";
    // BUG FIX: the original printed an empty string for any type other
    // than kHALF/kFLOAT (e.g. kINT8); report "unknown" instead.
    std::string typeName;
    switch (data->getType())
    {
        case DataType::kHALF:  typeName = "float16"; break;
        case DataType::kFLOAT: typeName = "float32"; break;
        default:               typeName = "unknown"; break;
    }
    std::cout << typeName << " ";
    std::cout << std::endl;
}
// Build the network layer-by-layer with the TensorRT builder API, using
// weights loaded from WEIGHTS_FILENAME, then serialize the engine into
// *modelStream (caller owns and must destroy it). Aborts via assert if
// any builder call fails.
void APIToModel(unsigned int maxBatchSize, IHostMemory **modelStream)
{
    // create the builder
    IBuilder* builder = createInferBuilder(gLogger);
    ///////////////////////////////////////////////////////////
    INetworkDefinition* network = builder->createNetwork();
    // load weights values from disk
    std::map<std::string, Weights> weightMap = loadWeights(locateFile(WEIGHTS_FILENAME));
    // define input
    auto data = network->addInput(INPUT_BLOB_NAME, DATATYPE, DimsCHW{INPUT_C, INPUT_H, INPUT_W});
    assert(data != nullptr);
    std::cout << "input" << std::endl;
    printDims(data);
    // add layer
    // 1 ////////////////////////////////////
    // NOTE(review): addConvolution rejects weights whose .values is null.
    // If "conv1_w"/"conv1_b" are absent from the file (or were skipped by
    // loadWeights for the active DataType), operator[] inserts an empty
    // Weights here, TensorRT logs "kernelWeights.values != NULL" and the
    // assert below fires — verify the weight file matches DATATYPE.
    auto conv1 = network->addConvolution(*data, 64, DimsHW{3, 3}, weightMap["conv1_w"], weightMap["conv1_b"]);
    assert(conv1 != nullptr);
    conv1->setStride(DimsHW{1, 1});
    conv1->setPadding(DimsHW{1, 1});
    // set output
    conv1->getOutput(0)->setName(OUTPUT_BLOB_NAME_TEST);
    network->markOutput(*conv1->getOutput(0));
    ///////////////////////////////////////////////////////////
    ///////////////////////////////////////////////////////////
    ///////////////////////////////////////////////////////////
    ///////////////////////////////////////////////////////////
    ///////////////////////////////////////////////////////////
    ///////////////////////////////////////////////////////////
    ///////////////////////////////////////////////////////////
    // Build the engine
    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(1e9);
    if (DATATYPE == DataType::kHALF)
        builder->setHalf2Mode(true); // enable fp16 execution mode
    std::cout << "building the engine..." << std::endl;
    auto engine = builder->buildCudaEngine(*network);
    assert(engine != nullptr);
    std::cout << "engine built!" << std::endl;
    // serialize the engine, then close everything down
    (*modelStream) = engine->serialize();
    // Once we have built the cuda engine, we can release all of our held memory.
    for (auto &mem : weightMap)
    {
        free((void*)(mem.second.values));
    }
    ///////////////////////////////////////////////////////////
    network->destroy();
    engine->destroy();
    builder->destroy();
}
// Allocate the device-side input/output buffers and the CUDA stream used
// by doInference(). Binding indices are resolved by blob name and cached
// in the file-scope globals shared with doInference()/cleanUp().
void setUpDevice(IExecutionContext& context, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();
    // The engine must expose exactly one input and one output binding
    // (IEngine::getNbBindings()); fail loudly if the layout differs.
    assert(engine.getNbBindings() == N_BINDINGS);
    // Binding indices are guaranteed to be < IEngine::getNbBindings().
    inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    outputIndexTest = engine.getBindingIndex(OUTPUT_BLOB_NAME_TEST);
    // One batch worth of float storage for each binding.
    const size_t inputBytes = static_cast<size_t>(batchSize) * INPUT_H * INPUT_W * INPUT_C * sizeof(float);
    const size_t outputBytes = static_cast<size_t>(batchSize) * OUTPUT_TEST_SIZE * sizeof(float);
    CHECK(cudaMalloc(&buffers[inputIndex], inputBytes));
    CHECK(cudaMalloc(&buffers[outputIndexTest], outputBytes));
    // create cuda stream
    CHECK(cudaStreamCreate(&stream));
}
// Release the CUDA stream and the device buffers created by setUpDevice().
void cleanUp()
{
    // CONSISTENCY FIX: the original left cudaStreamDestroy unchecked while
    // wrapping the frees in CHECK; treat all three calls uniformly.
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndexTest]));
}
// Copy one batch of input to the device, run the network, and copy the
// output back to `outputtest`. All transfers and the enqueue share one
// stream, so a single synchronize at the end suffices.
void doInference(IExecutionContext& context, float* input, float* outputtest, int batchSize)
{
    // DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * INPUT_C * sizeof(float), cudaMemcpyHostToDevice, stream));
    // BUG FIX: enqueue() returns false on failure; the original ignored it
    // and would have copied back garbage.
    bool enqueued = context.enqueue(batchSize, buffers, stream, nullptr);
    assert(enqueued && "TensorRT enqueue failed");
    CHECK(cudaMemcpyAsync(outputtest, buffers[outputIndexTest], batchSize * OUTPUT_TEST_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    // BUG FIX: the synchronize result was previously unchecked — an async
    // error surfacing here would have gone unnoticed.
    CHECK(cudaStreamSynchronize(stream));
}
// rearrange image data to [N, C, H, W] order
void prepareDataBatch(float *data, std::vector<cv::Mat> &frames)
{
assert(data && !frames.empty());
unsigned int volChl = INPUT_H * INPUT_W;
unsigned int volImg = INPUT_H * INPUT_W * INPUT_C;
for (int b = 0; b < BATCH_SIZE; b++)
for (int c = 0; c < INPUT_C; c++)
{
// the color image to input should be in BGR order
for (unsigned j = 0; j < volChl; j++)
data[b * volImg + c * volChl + j] = float(frames[b].data[j * INPUT_C + c]) / 255.0;
}
return;
}
// Dump a [N, C, H, W] float buffer to stdout: one row of W values per
// line, with dashed separators between channels and double separators
// between batch items (debug helper).
void printOutput(float *out, const int batch_size, const int output_c, const int output_h, const int output_w)
{
    const int output_size = output_c * output_h * output_w;
    const char* batchSep = "=================";
    const char* chanSep  = "-----------------";
    std::cout << batchSep << std::endl;
    std::cout << batchSep << std::endl;
    std::cout << chanSep << std::endl;
    for (int b = 0; b < batch_size; b++)
    {
        const int batchBase = b * output_size;
        for (int c = 0; c < output_c; c++)
        {
            const int chanBase = batchBase + c * output_h * output_w;
            for (int h = 0; h < output_h; h++)
            {
                for (int w = 0; w < output_w; w++)
                    std::cout << out[chanBase + h * output_w + w] << " ";
                std::cout << std::endl;
            }
            std::cout << chanSep << std::endl;
        }
        std::cout << batchSep << std::endl;
        std::cout << batchSep << std::endl;
    }
    return;
}
// Entry point: loads BATCH_SIZE images, builds and serializes the engine,
// runs one batch of inference and prints the raw output tensor.
int main(int argc, char** argv)
{
    std::cout << sizeof(float) << std::endl;
    //read input, convert, resize
    std::vector<std::string> image_paths;
    image_paths.push_back("./images/2.jpg");
    image_paths.push_back("./images/9.jpg");
    image_paths.push_back("./images/18.jpg");
    image_paths.push_back("./images/20.jpg");
    std::vector<cv::Mat> images(BATCH_SIZE);
    for (int b = 0; b < BATCH_SIZE; b++)
    {
        std::cout << image_paths[b] << std::endl;
        cv::Mat image_bgr = cv::imread(image_paths[b]);
        // BUG FIX: imread returns an empty Mat on failure and cvtColor
        // would then throw; fail with a clear message instead.
        assert(!image_bgr.empty() && "failed to read input image");
        cv::Mat image_rgb; // (removed unused local `image`)
        cv::cvtColor(image_bgr, image_rgb, cv::COLOR_BGR2RGB);
        cv::resize(image_rgb, image_rgb, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_LINEAR);
        images[b] = image_rgb;
    }
    // allocate CPU memory for input and output
    int inputSize = sizeof(float) * BATCH_SIZE * INPUT_C * INPUT_H * INPUT_W;
    int outputSizeTest = sizeof(float) * BATCH_SIZE * OUTPUT_TEST_SIZE;
    float *data = (float *)malloc(inputSize);
    float *outtest = (float *)malloc(outputSizeTest);
    assert(data != nullptr && outtest != nullptr);
    // init model stream variables
    IHostMemory *modelStream{nullptr};
    IRuntime* runtime = createInferRuntime(gLogger);
    // create a model using the API directly and serialize it to a stream
    APIToModel(MAX_BATCH_SIZE, &modelStream);
    ICudaEngine* engine = runtime->deserializeCudaEngine(modelStream->data(), modelStream->size(), nullptr);
    // BUG FIX: the serialized blob is no longer needed once deserialized;
    // the original leaked it.
    modelStream->destroy();
    // create execution context
    IExecutionContext *context = engine->createExecutionContext();
    // allocate memory on device
    setUpDevice(*context, BATCH_SIZE);
    // flatten image Mat, convert to float, do TF-style whitening
    prepareDataBatch(data, images);
    // run inference
    doInference(*context, data, outtest, BATCH_SIZE);
    // destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
    // clean-up device
    cleanUp();
    // print out
    printOutput(outtest, BATCH_SIZE, OUTPUT_TEST_C, OUTPUT_TEST_H, OUTPUT_TEST_W);
    //free mem
    free(data);
    free(outtest);
    std::cout << "done!" << std::endl;
    return 0;
}
Weight files I used to run the example:
- weights for half precision (demo16.wts) http://s000.tinyupload.com/?file_id=60272674837382850962
- weights for full precision (demo32.wts) http://s000.tinyupload.com/?file_id=13652338346346173172
thanks in advance for any help,
f