I modified a few things based on your code, and it works now. Please check:
#include <cassert>
#include <chrono>
#include <fstream>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <vector>
#include <cuda.h>
#include "cuda_runtime_api.h"
#include "NvInfer.h"
#include <opencv2/opencv.hpp>
#define DEVICE 0 // GPU id
#define BATCH_SIZE 1
static const int INPUT_H = 48;
static const int INPUT_W = 96;
static const int OUTPUT_SIZE = 24;
const char *INPUT_BLOB_NAME = "image_input";
const char *OUTPUT_BLOB_NAME_1 = "tf_op_layer_ArgMax";
const char *OUTPUT_BLOB_NAME_2 = "tf_op_layer_Max";
const std::string alphabet[] = {
"0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
"A", "B", "C", "D", "E", "F", "G", "H", "I", "J",
"K", "L", "M", "N", "P", "Q", "R", "S", "T", "U",
"V", "W", "X", "Y", "Z"
};
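// 35 classes: digits 0-9 plus letters A-Z without O (10 + 25). Index 35, one past the
// last class, is the CTC blank label checked for in the decode loop below.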
#define CHECK(status) \
    do \
    { \
        auto ret = (status); \
        if (ret != 0) \
        { \
            std::cerr << "Cuda failure: " << ret << std::endl; \
            abort(); \
        } \
    } while (0)
class Logger : public nvinfer1::ILogger {
    void log(Severity severity, const char *msg) override {
        if (severity <= Severity::kWARNING) {
            std::cout << msg << std::endl;
        }
    }
} logger;
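// TensorRT severities are ordered most-severe-first, so the "severity <= kWARNING" filter
// above prints internal errors, errors, and warnings while suppressing INFO/VERBOSE output.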
void doInference(nvinfer1::IExecutionContext &context, float *input, int *output_1, float *output_2, int batchSize) {
    const nvinfer1::ICudaEngine &engine = context.getEngine();
    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 3);
    void *buffers[3];
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings().
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex_1 = engine.getBindingIndex(OUTPUT_BLOB_NAME_1);
    const int outputIndex_2 = engine.getBindingIndex(OUTPUT_BLOB_NAME_2);
    // Create GPU buffers on device. The ArgMax output holds class indices and the Max
    // output holds confidences; both are 4 bytes per element here.
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex_1], batchSize * OUTPUT_SIZE * sizeof(int)));
    CHECK(cudaMalloc(&buffers[outputIndex_2], batchSize * OUTPUT_SIZE * sizeof(float)));
    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output_1, buffers[outputIndex_1], batchSize * OUTPUT_SIZE * sizeof(int), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(output_2, buffers[outputIndex_2], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaStreamSynchronize(stream));
    // Release stream and buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex_1]));
    CHECK(cudaFree(buffers[outputIndex_2]));
}
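// Note: doInference allocates and frees the device buffers and the stream on every call,
// which is fine for a one-shot demo; for repeated inference you would allocate them once
// and reuse them across calls.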
int main(int argc, char *argv[]) {
    cudaSetDevice(DEVICE);
    // Read the serialized engine from disk
    char *trtModelStream{nullptr};
    size_t size{0};
    std::ifstream file("/workspace/demo_2.0/lprnet/lpr_us_onnx_b16.engine", std::ios::binary);
    if (file.good()) {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    } else {
        std::cerr << "failed to open engine file" << std::endl;
        return -1;
    }
    std::cout << "size:" << size << "\n";
    nvinfer1::IRuntime *runtime = nvinfer1::createInferRuntime(logger);
    assert(runtime != nullptr);
    nvinfer1::ICudaEngine *engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    delete[] trtModelStream; // the engine keeps its own copy, so the host buffer is no longer needed
    nvinfer1::IExecutionContext *context = engine->createExecutionContext();
    assert(context != nullptr);
    static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
    cv::Mat img = cv::imread("/workspace/demo_2.0/lprnet/data/openalpr/train/image/wts-lg-000158.jpg");
    if (img.empty()) {
        std::cerr << "failed to read input image" << std::endl;
        return -1;
    }
    cv::Mat pr_img;
    cv::resize(img, pr_img, cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_CUBIC);
    int i = 0;
    for (int row = 0; row < INPUT_H; ++row) {
        uchar *uc_pixel = pr_img.data + row * pr_img.step;
        for (int col = 0; col < INPUT_W; ++col) {
            data[i + 2 * INPUT_H * INPUT_W] = ((float)uc_pixel[2] - 127.5) * 0.003921568627451;
            data[i + INPUT_H * INPUT_W] = ((float)uc_pixel[1] - 127.5) * 0.003921568627451;
            data[i] = ((float)uc_pixel[0] - 127.5) * 0.003921568627451;
            uc_pixel += 3;
            ++i;
        }
    }
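    // The loop above converts OpenCV's interleaved BGR (HWC) layout to planar CHW and
    // normalizes each channel to roughly [-0.5, 0.5] via (pixel - 127.5) / 255.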
    // Run inference
    static int tf_op_layer_ArgMax[BATCH_SIZE * OUTPUT_SIZE];
    static float tf_op_layer_Max[BATCH_SIZE * OUTPUT_SIZE];
    auto start = std::chrono::system_clock::now();
    printf("running inference\n");
    doInference(*context, data, tf_op_layer_ArgMax, tf_op_layer_Max, BATCH_SIZE);
    auto end = std::chrono::system_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << "us" << std::endl;
    std::cout << std::endl;
    std::vector<int> preds;
    for (int i = 0; i < OUTPUT_SIZE; ++i) {
        preds.push_back(tf_op_layer_ArgMax[i]);
    }
    // Greedy CTC decode: drop the blank label (index 35) and collapse consecutive repeats
    int pre_c = preds[0];
    std::vector<int> no_repeat_blank_label;
    if (pre_c != 35) no_repeat_blank_label.push_back(pre_c); // keep the first symbol if it is not blank
    for (auto c : preds) {
        if (c == pre_c || c == 35) {
            if (c == 35) pre_c = c;
            continue;
        }
        no_repeat_blank_label.push_back(c);
        pre_c = c;
    }
    // Print the decoded character list
    std::string str;
    for (auto v : no_repeat_blank_label) {
        str += alphabet[v];
    }
    std::cout << "result:" << str << std::endl;
    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return 0;
}
/*
time g++ trt_lprnet.cpp -lnvinfer -pthread $(pkg-config --cflags --libs opencv4) -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcuda -lcudart -O0 -p -g && time ./a.out
*/
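One caveat: this is written against the TensorRT 7.x implicit-batch API (enqueue()/destroy()); newer TensorRT releases deprecate those calls in favor of enqueueV2() and plain delete.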