vgg16 layer visualization

I am using TensorRT 5 with the C++ API.
I want to visualize the output of the first convolution layer of the VGG16 network.
I use a pretrained model from Keras, converted to UFF format, and then load it through the C++ interface.
I get an output vector of the right size, but the image is wrong: compared with the Python results it is enlarged 2 times and has linear artifacts. I am probably converting formats incorrectly. Can anyone help me?

Output layer is block1_conv1 (Conv2D), Size - (224, 224, 64)

// Copies the raw output floats into a 224x224 matrix with 64 interleaved
// channels, then splits it into 64 single-channel images.
// NOTE(review): this treats `outputs` as channel-interleaved (HWC) data; the
// engine output is presumably planar (CHW) - confirm against the TensorRT docs.
std::vector<cv::Mat> channels;
cv::Mat fff(224, 224, CV_32FC(64));
std::memcpy(fff.data, outputs, fff.rows*fff.cols*64*sizeof(float));
cv::split(fff, channels);

Can you share a small repro that demonstrates the difference you are seeing between TensorRT C++ and TensorRT Python?

Visualization of the first 4 layers in Python and in OpenCV C++:

https://yadi.sk/i/s8vXHgUZCYlz_w

#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <cuda_runtime_api.h>
#include <fstream>
#include <iostream>
#include <string>
#include <sys/stat.h>
#include <unordered_map>
#include <cassert>
#include <vector>
#include "NvInfer.h"
#include "NvUffParser.h"

#include "NvUtils.h"

#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>

#include "common.h"

using namespace nvuffparser;
using namespace nvinfer1;


// Height, width and channel count of the network input image; used to size
// the host buffers and to register the UFF input in main().
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int INPUT_D = 3;

// Logger handed to the TensorRT builder and used by RETURN_AND_LOG.
static Logger gLogger;
// Set in main() from samplesCommon::parseDLA; when > 0 it is forwarded to
// samplesCommon::enableDLA while building the engine.
static int gDLA{0};

// Value passed to IBuilder::setMaxWorkspaceSize (1 << 30).
#define MAX_WORKSPACE (1 << 30)

// Logs `message` (prefixed with "sample_uff: ") through gLogger at
// ILogger::Severity::k<severity> and then returns `ret` from the enclosing
// function. Wrapped in do { } while(0) so it behaves as a single statement.
#define RETURN_AND_LOG(ret, severity, message)                                              	\
		do {                                                                                    \
			std::string error_message = "sample_uff: " + std::string(message);            \
			gLogger.log(ILogger::Severity::k ## severity, error_message.c_str());               \
			return (ret);                                                                       \
		} while(0)

/* Returns the product of the first d.nbDims extents stored in d.d
 * (1 when nbDims is not positive). */
inline int64_t volume(const Dims& d)
{
	int64_t product = 1;
	int64_t axis = 0;
	while (axis < d.nbDims)
	{
		product *= d.d[axis];
		++axis;
	}
	return product;
}


/* Returns the size in bytes of one element of the given TensorRT data type.
 * Unknown types trip the assert (and yield 0 when asserts are disabled). */
inline unsigned int elementSize(DataType t)
{
	// kINT32 and kFLOAT share the same width.
	if (t == DataType::kINT32 || t == DataType::kFLOAT)
		return 4;
	if (t == DataType::kHALF)
		return 2;
	if (t == DataType::kINT8)
		return 1;

	assert(0);
	return 0;
}

/* Loads an image as 8-bit 3-channel colour, resizes it to INPUT_W x INPUT_H
 * when needed, copies its raw pixel bytes into `fileData` and shows it in an
 * "input" window.
 *
 * filename - path of the image to read.
 * fileData - destination buffer for the pixel bytes, in the interleaved
 *            row-major layout OpenCV stores them in.
 * datasize - capacity of `fileData` in bytes.
 *
 * Terminates the process with an error message when the image cannot be read
 * or does not fit into `fileData`.
 */
void loadData(std::string filename, uint8_t *fileData, int datasize)
{
	cv::Mat image = cv::imread(filename, cv::IMREAD_COLOR);
	if (image.empty())
	{
		// imread reports failure through an empty Mat; resizing it would throw.
		std::cerr << "Unable to read image " << filename << std::endl;
		exit(1);
	}

	if ((image.cols != INPUT_W) || (image.rows != INPUT_H))
	{
		cv::resize(image, image, cv::Size(INPUT_W, INPUT_H));
	}

	// A single memcpy is only valid when the rows are stored back to back.
	if (!image.isContinuous())
		image = image.clone();

	const size_t imageBytes = image.total() * image.elemSize();
	const size_t capacity = datasize > 0 ? static_cast<size_t>(datasize) : 0;
	if (fileData == nullptr || imageBytes > capacity)
	{
		std::cerr << "Image " << filename << " needs " << imageBytes
		          << " bytes but the destination buffer holds " << capacity << std::endl;
		exit(1);
	}

	std::memcpy(fileData, image.data, imageBytes);
	cv::imshow("input", image);
	cv::waitKey(10);
}


/* Allocates `memSize` bytes with cudaMalloc and returns the resulting pointer.
 * The cudaMalloc status goes through CHECK; if the pointer is still null
 * afterwards, an "Out of memory" message is printed and the process exits.
 */
void* safeCudaMalloc(size_t memSize)
{
	// Start from nullptr so the check below never reads an indeterminate value
	// when cudaMalloc fails without writing to the pointer.
	void* deviceMem = nullptr;
	CHECK(cudaMalloc(&deviceMem, memSize));
	if (deviceMem == nullptr)
	{
		std::cerr << "Out of memory" << std::endl;
		exit(1);
	}
	return deviceMem;
}


std::vector<std::pair<int64_t, DataType>>
calculateBindingBufferSizes(const ICudaEngine& engine, int nbBindings, int batchSize)
{
	std::vector<std::pair<int64_t, DataType>> sizes;
	for (int i = 0; i < nbBindings; ++i)
	{
		Dims dims = engine.getBindingDimensions(i);
		DataType dtype = engine.getBindingDataType(i);

		int64_t eltCount = volume(dims) * batchSize;
		sizes.push_back(std::make_pair(eltCount, dtype));

		std::cout << "eltCount " << eltCount << std::endl;
	}

	return sizes;
}


/* Loads the test image, converts it to normalised float data in planar
 * channel-major (CHW) order and uploads it to a freshly allocated device
 * buffer.
 *
 * eltCount - number of elements of the input binding; must equal
 *            INPUT_H * INPUT_W * INPUT_D.
 * dtype    - element type of the input binding; must be 4 bytes wide (float).
 *
 * Returns the device pointer; the caller releases it with cudaFree.
 * Terminates the process when the binding does not match the expected input.
 */
void* createCudaBuffer(int64_t eltCount, DataType dtype)
{
	const int planeSize = INPUT_H * INPUT_W;
	const int expectedCount = planeSize * INPUT_D;

	/* in that specific case, eltCount == INPUT_H * INPUT_W * INPUT_D */
	assert(eltCount == expectedCount);
	assert(elementSize(dtype) == sizeof(float));
	// The asserts vanish in release builds, so enforce the same contract at
	// run time before indexing the host buffers.
	if (eltCount != expectedCount || elementSize(dtype) != sizeof(float))
	{
		std::cerr << "Unexpected input binding: " << eltCount << " elements" << std::endl;
		exit(1);
	}

	const size_t memSize = eltCount * elementSize(dtype);

	// Raw 8-bit pixels as delivered by OpenCV: interleaved, row-major (HWC).
	std::vector<uint8_t> fileData(expectedCount);
	loadData("/home/timyr/vscode-workspace/tensorflow-test/features_test.png", fileData.data(), expectedCount);

	// Preprocess data: scale to [0, 1] and de-interleave HWC -> CHW, because the
	// input is registered with UffInputOrder::kNCHW in main().
	// NOTE(review): the channel order stays as OpenCV delivers it (BGR);
	// confirm it matches the order used in the Python reference.
	std::vector<float> inputs(eltCount);
	for (int pixel = 0; pixel < planeSize; ++pixel)
		for (int channel = 0; channel < INPUT_D; ++channel)
			inputs[channel * planeSize + pixel] = float(fileData[pixel * INPUT_D + channel]) / 255.0;

	// Load data to device
	void* deviceMem = safeCudaMalloc(memSize);
	CHECK(cudaMemcpy(deviceMem, inputs.data(), memSize, cudaMemcpyHostToDevice));

	return deviceMem;
}


void printOutput(int64_t eltCount, DataType dtype, void* buffer)
{
	assert(elementSize(dtype) == sizeof(float));

	size_t memSize = eltCount * elementSize(dtype);
	float* outputs = new float[eltCount];

	CHECK(cudaMemcpy(outputs, buffer, memSize, cudaMemcpyDeviceToHost));


	std::vector<cv::Mat> channels;
	cv::Mat fff(224, 224, CV_32FC(64));
	std::memcpy(fff.data, outputs, fff.rows*fff.cols*64*sizeof(float));
	cv::split(fff, channels);


	for(int i = 0; i < 64; ++i)
	{
		cv::imshow("filter_"+ std::to_string(i), channels[i]);
		cv::moveWindow("filter_"+ std::to_string(i), 0 + 112 * (i%8), 0 + 140 * (i/8) );
	}

	delete[] outputs;
}


/* Parses a UFF model and builds a TensorRT engine from it.
 *
 * uffFile      - path of the UFF file to parse.
 * maxBatchSize - maximum batch size the engine is built for.
 * parser       - UFF parser with inputs/outputs already registered; it is
 *                NOT destroyed here and stays owned by the caller.
 *
 * Returns the engine (to be released with destroy()), or nullptr after
 * logging an error. The builder and the network are released on every path.
 */
ICudaEngine* loadModelAndCreateEngine(const char* uffFile, int maxBatchSize,
		IUffParser* parser)
{
	IBuilder* builder = createInferBuilder(gLogger);
	if (!builder)
		RETURN_AND_LOG(nullptr, ERROR, "Unable to create builder");

	INetworkDefinition* network = builder->createNetwork();
	if (!network)
	{
		builder->destroy();
		RETURN_AND_LOG(nullptr, ERROR, "Unable to create network");
	}

	if (!parser->parse(uffFile, *network, nvinfer1::DataType::kFLOAT))
	{
		// Release what was created so far before bailing out.
		network->destroy();
		builder->destroy();
		RETURN_AND_LOG(nullptr, ERROR, "Fail to parse");
	}

	/* we create the engine */
	builder->setMaxBatchSize(maxBatchSize);
	builder->setMaxWorkspaceSize(MAX_WORKSPACE);
	if (gDLA > 0) samplesCommon::enableDLA(builder, gDLA);

	ICudaEngine* engine = builder->buildCudaEngine(*network);

	/* the network and the builder are no longer needed, whether or not the
	 * build succeeded */
	network->destroy();
	builder->destroy();

	if (!engine)
		RETURN_AND_LOG(nullptr, ERROR, "Unable to create engine");

	return engine;
}


/* Runs one inference on `engine`: allocates device buffers for every binding,
 * uploads the test image as input, executes the network, reports the average
 * execution time and displays every output binding through printOutput.
 * All device buffers and the execution context are released before returning.
 */
void execute(ICudaEngine& engine)
{
	IExecutionContext* context = engine.createExecutionContext();
	if (!context)
	{
		gLogger.log(ILogger::Severity::kERROR, "sample_uff: Unable to create execution context");
		return;
	}

	const int batchSize = 1;

	const int nbBindings = engine.getNbBindings();
	assert(nbBindings == 2);

	std::vector<void*> buffers(nbBindings, nullptr);

	// pair inputs and outputs
	auto buffersSizes = calculateBindingBufferSizes(engine, nbBindings, batchSize);

	int bindingIdxInput = 0;
	for (int i = 0; i < nbBindings; ++i)
	{
		if (engine.bindingIsInput(i))
			bindingIdxInput = i;
		else
		{
			auto bufferSizesOutput = buffersSizes[i];
			buffers[i] = safeCudaMalloc(bufferSizesOutput.first * elementSize(bufferSizesOutput.second));
		}
	}

	auto bufferSizesInput = buffersSizes[bindingIdxInput];
	buffers[bindingIdxInput] = createCudaBuffer(bufferSizesInput.first, bufferSizesInput.second);

	const int iterations = 1;
	const int numberRun = 1;
	bool succeeded = true;

	for (int i = 0; i < iterations && succeeded; ++i)
	{
		float total = 0;
		for (int run = 0; run < numberRun; run++)
		{
			// Time each run separately so image loading/preprocessing and
			// earlier runs are not counted again.
			const auto t_start = std::chrono::high_resolution_clock::now();
			succeeded = context->execute(batchSize, buffers.data());
			const auto t_end = std::chrono::high_resolution_clock::now();
			if (!succeeded)
				break;

			total += std::chrono::duration<float, std::milli>(t_end - t_start).count();
		}

		if (succeeded)
		{
			total /= numberRun;
			std::cout << "Average over " << numberRun << " runs is " << total << " ms." << std::endl;
		}
	}

	if (succeeded)
	{
		for (int bindingIdx = 0; bindingIdx < nbBindings; ++bindingIdx)
		{
			if (engine.bindingIsInput(bindingIdx))
				continue;

			auto bufferSizesOutput = buffersSizes[bindingIdx];
			std::cout << bufferSizesOutput.first << "-----" << std::endl;
			printOutput(bufferSizesOutput.first, bufferSizesOutput.second, buffers[bindingIdx]);
		}
	}
	else
	{
		// Do not display buffers that were never filled by a successful run.
		gLogger.log(ILogger::Severity::kERROR, "sample_uff: Inference execution failed");
	}

	// Every entry of `buffers` (input and outputs) was allocated above.
	for (int bindingIdx = 0; bindingIdx < nbBindings; ++bindingIdx)
		CHECK(cudaFree(buffers[bindingIdx]));

	context->destroy();
}


int main(int argc, char** argv)
{
	gDLA = samplesCommon::parseDLA(argc, argv);
	auto fileName = "/home/timyr/vscode-workspace/tensorflow-test/vgg16features_lite.uff";
	std::cout << fileName << std::endl;

	int maxBatchSize = 1;
	auto parser = createUffParser();

	/* Register tensorflow input */
	parser->registerInput("input_1", Dims3(INPUT_D, INPUT_W, INPUT_H), UffInputOrder::kNCHW);
	parser->registerOutput("block1_conv1/Relu");

	ICudaEngine* engine = loadModelAndCreateEngine(fileName, maxBatchSize, parser);

	if (!engine)
		RETURN_AND_LOG(EXIT_FAILURE, ERROR, "Model load failed");
	parser->destroy();

	execute(*engine);

	engine->destroy();
	shutdownProtobufLibrary();


	cv::waitKey(0);
	return EXIT_SUCCESS;
}

With an ONNX model it works fine.
It seems I saved the model incorrectly.