TensorRT: How do I convert cv::Mat to NCHW format?

I run inference using TensorRT with the code below.
There are no errors, but the output result is wrong.

According to the User Guide, TensorRT input/output must use NCHW format.
What is NCHW format?
How do I convert a cv::Mat to NCHW format?

int batchSize = 1;
int size_of_single_input = 256 * 256 * 3 * sizeof(float);
int size_of_single_output = 100 * 1 * 1 * sizeof(float); 

IBuilder* builder = createInferBuilder(gLogger);
 
INetworkDefinition* network = builder->createNetwork();
 
CaffeParser parser;
auto blob_name_to_tensor = parser.parse(“deploy.prototxt”,
                                        "sample.caffemodel",
                                        *network,
                                        DataType::kFLOAT);

network->markOutput(*blob_name_to_tensor->find("prob"));

builder->setMaxBatchSize(1);
builder->setMaxWorkspaceSize(1 << 30); 
ICudaEngine* engine = builder->buildCudaEngine(*network);

IExecutionContext *context = engine->createExecutionContext();

int inputIndex = engine->getBindingIndex(INPUT_LAYER_NAME),
int outputIndex = engine->getBindingIndex(OUTPUT_LAYER_NAME);

cv::Mat input;
input = imread("./sample.jpg");
cvtColor(input, input, CV_BGR2RGB);
cv::resize(input, input, cv::Size(256, 256));

float output[OUTPUTSIZE];

void* buffers = malloc(engine->getNbBindings() * sizeof(void*));
cudaMalloc(&buffers[inputIndex], batchSize * size_of_single_input);
cudaMalloc(&buffers[outputIndex], batchSize * size_of_single_output);

cudaStream_t stream;
cudaStreamCreate(&stream);

cudaMemcpyAsync(buffers[inputIndex], (float *)input, 
                batchSize * size_of_single_input, 
                cudaMemcpyHostToDevice, stream);

context.enqueue(batchSize, buffers, stream, nullptr);

cudaMemcpyAsync(output, buffers[outputIndex], 
                batchSize * size_of_single_output, 
                cudaMemcpyDeviceToHost, stream));

cudaStreamSynchronize(stream);
Here is a corrected conversion routine. The original snippet had three bugs:
`size[2] = channel * 2` doubled the channel count, the `new` expression was
malformed (and dropped the height dimension), and `memset` zeroed the wrong
buffer. The fixed version, as a reusable function:

    // Convert an interleaved HWC float image (OpenCV layout) to planar
    // CHW (TensorRT layout). src and dst must each hold
    // height * width * channels floats and must not overlap.
    static void hwcToChw(const float* src, float* dst,
                         int height, int width, int channels) {
        for (int c = 0; c < channels; ++c)
            for (int h = 0; h < height; ++h)
                for (int w = 0; w < width; ++w)
                    dst[c * height * width + h * width + w] =
                        src[(h * width + w) * channels + c];
    }

    // Usage with a CV_32FC3 cv::Mat:
    //   std::vector<float> chw(image.rows * image.cols * image.channels());
    //   hwcToChw(reinterpret_cast<const float*>(image.ptr()), chw.data(),
    //            image.rows, image.cols, image.channels());

The resulting planar buffer can then be copied to GPU memory directly with cudaMemcpyAsync.

1 Like