// Visualization of the first 4 layers in Python and OpenCV C++:
// https://yadi.sk/i/s8vXHgUZCYlz_w
#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <cuda_runtime_api.h>
#include <fstream>
#include <iostream>
#include <string>
#include <sys/stat.h>
#include <unordered_map>
#include <cassert>
#include <vector>
#include "NvInfer.h"
#include "NvUffParser.h"
#include "NvUtils.h"
#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
#include "common.h"
using namespace nvuffparser;
using namespace nvinfer1;
// Height, width and channel count of the network input image. Used to resize
// the loaded image, to size the host staging buffers, and to register the
// input tensor with the UFF parser.
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int INPUT_D = 3;
// Logger handed to createInferBuilder() and used by RETURN_AND_LOG.
static Logger gLogger;
// Result of samplesCommon::parseDLA(); DLA is enabled on the builder only when this is > 0.
static int gDLA{0};
// Value passed to IBuilder::setMaxWorkspaceSize() when the engine is built.
#define MAX_WORKSPACE (1 << 30)
// Logs "sample_uff: <message>" through gLogger at ILogger::Severity::k<severity>
// and then returns `ret` from the enclosing function.
// `severity` is token-pasted onto `k`, so pass the bare suffix (e.g. ERROR).
// The do { ... } while(0) wrapper makes the expansion behave as one statement.
#define RETURN_AND_LOG(ret, severity, message) \
do { \
std::string error_message = "sample_uff: " + std::string(message); \
gLogger.log(ILogger::Severity::k ## severity, error_message.c_str()); \
return (ret); \
} while(0)
// Returns the product of the first d.nbDims extents stored in d.d
// (1 when nbDims is zero).
inline int64_t volume(const Dims& d)
{
    int64_t product = 1;
    int64_t axis = 0;
    while (axis < d.nbDims)
    {
        product *= d.d[axis];
        ++axis;
    }
    return product;
}
// Returns the size in bytes of one element of the given data type.
// Asserts (and returns 0 when asserts are compiled out) for any type
// other than kINT32, kFLOAT, kHALF and kINT8.
inline unsigned int elementSize(DataType t)
{
    // 32-bit integers and 32-bit floats share the same width.
    if (t == DataType::kINT32 || t == DataType::kFLOAT)
    {
        return 4;
    }
    if (t == DataType::kHALF)
    {
        return 2;
    }
    if (t == DataType::kINT8)
    {
        return 1;
    }
    assert(0);
    return 0;
}
// Loads an image from disk as a colour image, resizes it to INPUT_W x INPUT_H
// if needed, copies its raw pixel bytes into `fileData` and shows it in a
// window named "input".
//
// filename : path of the image to read.
// fileData : destination buffer for the raw pixel bytes, in the same order
//            as cv::Mat::data.
// datasize : capacity of `fileData` in bytes; at most this many bytes are
//            written.
//
// Terminates the process with exit(1) if the image cannot be read.
void loadData(std::string filename, uint8_t *fileData, int datasize)
{
    cv::Mat image = cv::imread(filename, 1);
    // imread signals failure with an empty matrix; resizing or copying from it
    // would throw or dereference a null data pointer.
    if (image.empty())
    {
        std::cerr << "Failed to read image: " << filename << std::endl;
        exit(1);
    }
    if ((image.cols != INPUT_W) || (image.rows != INPUT_H))
    {
        cv::resize(image, image, cv::Size(INPUT_W, INPUT_H));
    }
    const size_t imageBytes =
        static_cast<size_t>(image.cols) * image.rows * image.channels() * sizeof(char);
    const size_t capacity = datasize > 0 ? static_cast<size_t>(datasize) : 0;
    if (capacity < imageBytes)
    {
        std::cerr << "loadData: destination holds " << capacity << " bytes but the image has "
                  << imageBytes << "; copy truncated" << std::endl;
    }
    // Never write past the caller-provided buffer.
    std::memcpy(fileData, image.data, std::min(imageBytes, capacity));
    cv::imshow("input", image);
    cv::waitKey(10);
}
// Allocates `memSize` bytes with cudaMalloc and returns the resulting pointer.
// The cudaMalloc status goes through CHECK; if the pointer is still null
// afterwards, prints "Out of memory" and terminates the process with exit(1).
void* safeCudaMalloc(size_t memSize)
{
    // Start from nullptr so the check below never reads an indeterminate
    // pointer if cudaMalloc leaves the argument unwritten.
    void* deviceMem = nullptr;
    CHECK(cudaMalloc(&deviceMem, memSize));
    if (deviceMem == nullptr)
    {
        std::cerr << "Out of memory" << std::endl;
        exit(1);
    }
    return deviceMem;
}
std::vector<std::pair<int64_t, DataType>>
calculateBindingBufferSizes(const ICudaEngine& engine, int nbBindings, int batchSize)
{
std::vector<std::pair<int64_t, DataType>> sizes;
for (int i = 0; i < nbBindings; ++i)
{
Dims dims = engine.getBindingDimensions(i);
DataType dtype = engine.getBindingDataType(i);
int64_t eltCount = volume(dims) * batchSize;
sizes.push_back(std::make_pair(eltCount, dtype));
std::cout << "eltCount " << eltCount << std::endl;
}
return sizes;
}
// Loads the test image, converts it to floats in [0, 1] and uploads it to a
// freshly allocated device buffer.
//
// eltCount : number of elements of the input binding; must equal
//            INPUT_H * INPUT_W * INPUT_D.
// dtype    : data type of the input binding; must be 4 bytes wide (float).
//
// Returns the device pointer; the caller releases it with cudaFree.
// Terminates the process if the size or type does not match.
void* createCudaBuffer(int64_t eltCount, DataType dtype)
{
    const int64_t planeSize = static_cast<int64_t>(INPUT_H) * INPUT_W;
    const int64_t pixelCount = planeSize * INPUT_D;
    /* in that specific case, eltCount == INPUT_H * INPUT_W * INPUT_D*/
    assert(eltCount == pixelCount);
    assert(elementSize(dtype) == sizeof(float));
    // The asserts vanish under NDEBUG; keep a runtime guard so a mismatching
    // binding can never make the loops below run past the staging buffers.
    if (eltCount != pixelCount || elementSize(dtype) != sizeof(float))
    {
        std::cerr << "createCudaBuffer: unexpected input binding size or data type" << std::endl;
        exit(1);
    }
    const size_t memSize = static_cast<size_t>(eltCount) * elementSize(dtype);

    // Heap-backed staging buffers: released automatically, and they avoid a
    // ~150 KB array on the stack.
    std::vector<uint8_t> fileData(static_cast<size_t>(pixelCount));
    // Load data
    loadData("/home/timyr/vscode-workspace/tensorflow-test/features_test.png", fileData.data(),
             static_cast<int>(pixelCount));

    // Preprocess data.
    // loadData delivers OpenCV's interleaved layout (all channels of pixel 0,
    // then pixel 1, ...), whereas the input tensor is registered channel-first
    // (Dims3(INPUT_D, ...) with UffInputOrder::kNCHW in main). Regroup the
    // pixels into one contiguous plane per channel while scaling to [0, 1].
    // NOTE(review): channel order stays as OpenCV loads it (BGR) -- confirm
    // this matches what the model was trained with.
    std::vector<float> inputs(static_cast<size_t>(eltCount));
    for (int c = 0; c < INPUT_D; ++c)
    {
        for (int64_t p = 0; p < planeSize; ++p)
        {
            inputs[c * planeSize + p] = static_cast<float>(fileData[p * INPUT_D + c]) / 255.0f;
        }
    }

    // Load data to device
    void* deviceMem = safeCudaMalloc(memSize);
    CHECK(cudaMemcpy(deviceMem, inputs.data(), memSize, cudaMemcpyHostToDevice));
    return deviceMem;
}
void printOutput(int64_t eltCount, DataType dtype, void* buffer)
{
assert(elementSize(dtype) == sizeof(float));
size_t memSize = eltCount * elementSize(dtype);
float* outputs = new float[eltCount];
CHECK(cudaMemcpy(outputs, buffer, memSize, cudaMemcpyDeviceToHost));
std::vector<cv::Mat> channels;
cv::Mat fff(224, 224, CV_32FC(64));
std::memcpy(fff.data, outputs, fff.rows*fff.cols*64*sizeof(float));
cv::split(fff, channels);
for(int i = 0; i < 64; ++i)
{
cv::imshow("filter_"+ std::to_string(i), channels[i]);
cv::moveWindow("filter_"+ std::to_string(i), 0 + 112 * (i%8), 0 + 140 * (i/8) );
}
delete[] outputs;
}
// Parses a UFF model file and builds a TensorRT engine from it.
//
// uffFile      : path of the UFF file handed to the parser.
// maxBatchSize : maximum batch size configured on the builder.
// parser       : UFF parser with inputs/outputs already registered; it is
//                not destroyed here.
//
// Returns the built engine (the caller destroys it), or nullptr after logging
// an error if any step fails. The builder and network created here are
// destroyed on every path.
ICudaEngine* loadModelAndCreateEngine(const char* uffFile, int maxBatchSize,
                                      IUffParser* parser)
{
    IBuilder* builder = createInferBuilder(gLogger);
    if (!builder)
        RETURN_AND_LOG(nullptr, ERROR, "Unable to create builder");
    INetworkDefinition* network = builder->createNetwork();
    if (!network)
    {
        builder->destroy();
        RETURN_AND_LOG(nullptr, ERROR, "Unable to create network");
    }
    if (!parser->parse(uffFile, *network, nvinfer1::DataType::kFLOAT))
    {
        // Release what was created so far before bailing out.
        network->destroy();
        builder->destroy();
        RETURN_AND_LOG(nullptr, ERROR, "Fail to parse");
    }
    /* we create the engine */
    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(MAX_WORKSPACE);
    if (gDLA > 0) samplesCommon::enableDLA(builder, gDLA);
    ICudaEngine* engine = builder->buildCudaEngine(*network);
    /* we can clean the network and the builder, whether or not the build succeeded */
    network->destroy();
    builder->destroy();
    if (!engine)
        RETURN_AND_LOG(nullptr, ERROR, "Unable to create engine");
    return engine;
}
// Runs one inference on the engine and displays the result.
//
// Allocates a device buffer for every output binding, uploads the test image
// into the input binding, executes the engine, prints the average execution
// time, passes every output binding to printOutput, and finally frees all
// device buffers and the execution context.
//
// engine : engine with exactly two bindings (asserted).
void execute(ICudaEngine& engine)
{
    IExecutionContext* context = engine.createExecutionContext();
    if (!context)
    {
        std::cerr << "Unable to create execution context" << std::endl;
        return;
    }
    int batchSize = 1;
    int nbBindings = engine.getNbBindings();
    assert(nbBindings == 2);
    // Value-initialised, so every slot is nullptr until it is allocated.
    std::vector<void*> buffers(nbBindings);
    // pair inputs and outputs
    auto buffersSizes = calculateBindingBufferSizes(engine, nbBindings, batchSize);
    int bindingIdxInput = 0;
    for (int i = 0; i < nbBindings; ++i)
    {
        if (engine.bindingIsInput(i))
            bindingIdxInput = i;
        else
        {
            auto bufferSizesOutput = buffersSizes[i];
            buffers[i] = safeCudaMalloc(bufferSizesOutput.first * elementSize(bufferSizesOutput.second));
        }
    }
    auto bufferSizesInput = buffersSizes[bindingIdxInput];
    buffers[bindingIdxInput] = createCudaBuffer(bufferSizesInput.first, bufferSizesInput.second);
    int iterations = 1;
    int numberRun = 1;
    bool succeeded = true;
    for (int i = 0; i < iterations; ++i)
    {
        float total = 0;
        for (int run = 0; run < numberRun; run++)
        {
            // Start the clock right before each execution. A single start
            // point taken before the image upload would fold disk/display
            // time into the figure and make every later run include all
            // earlier ones.
            auto t_start = std::chrono::high_resolution_clock::now();
            if (!context->execute(batchSize, buffers.data()))
            {
                std::cerr << "Engine execution failed" << std::endl;
                succeeded = false;
            }
            auto t_end = std::chrono::high_resolution_clock::now();
            total += std::chrono::duration<float, std::milli>(t_end - t_start).count();
        }
        total /= numberRun;
        std::cout << "Average over " << numberRun << " runs is " << total << " ms." << std::endl;
    }
    // Only visualise outputs that a successful execution actually produced.
    if (succeeded)
    {
        for (int bindingIdx = 0; bindingIdx < nbBindings; ++bindingIdx)
        {
            if (engine.bindingIsInput(bindingIdx))
                continue;
            auto bufferSizesOutput = buffersSizes[bindingIdx];
            std::cout << bufferSizesOutput.first << "-----" << std::endl;
            printOutput(bufferSizesOutput.first, bufferSizesOutput.second, buffers[bindingIdx]);
        }
    }
    // Release every device buffer, input and outputs alike.
    for (void* deviceBuffer : buffers)
        CHECK(cudaFree(deviceBuffer));
    context->destroy();
}
int main(int argc, char** argv)
{
gDLA = samplesCommon::parseDLA(argc, argv);
auto fileName = "/home/timyr/vscode-workspace/tensorflow-test/vgg16features_lite.uff";
std::cout << fileName << std::endl;
int maxBatchSize = 1;
auto parser = createUffParser();
/* Register tensorflow input */
parser->registerInput("input_1", Dims3(INPUT_D, INPUT_W, INPUT_H), UffInputOrder::kNCHW);
parser->registerOutput("block1_conv1/Relu");
ICudaEngine* engine = loadModelAndCreateEngine(fileName, maxBatchSize, parser);
if (!engine)
RETURN_AND_LOG(EXIT_FAILURE, ERROR, "Model load failed");
parser->destroy();
execute(*engine);
engine->destroy();
shutdownProtobufLibrary();
cv::waitKey(0);
return EXIT_SUCCESS;
}