How to to transplant pre-processing codes with opencv to nvinfer plugin

Hi all.

How to to transplant pre-processing codes bellow to nvinfer plugin

cv::Mat InputImageBlob(Laneparams *params, cv::Mat mat)
{
    cv::Mat temp;
    cv::resize(mat, temp, cv::Size(params->inputDims.d[2], params->inputDims.d[1]));
    cv::Mat data_mean = cv::Mat(temp.rows, temp.cols, CV_32FC3, cv::Scalar(0.4914, 0.4822, 0.4465));
    cv::Mat data_std = cv::Mat(temp.rows, temp.cols, CV_32FC3, cv::Scalar(0.2023, 0.1994, 0.2010));
    temp.convertTo(temp, CV_32F, 1. / 255.); // unsigned int8 → float32
    cv::subtract(temp, data_mean, temp);  //
    cv::divide(temp, data_std, temp);     //

    cv::Mat inputBlob = cv::dnn::blobFromImage(temp, 1.0,
                        cv::Size(params->inputDims.d[2], params->inputDims.d[1]),
						cv::Scalar(0, 0, 0), false, false);
    return inputBlob;
}

my config file is :

[property]
gpu-id=0
net-scale-factor=0.003922
offsets=0.2023;0.1994;0.2010
divides=0.4914;0.4822;0.4465
#0=RGB, 1=BGR
model-color-format=1
onnx-file=model.onnx
model-engine-file=model.engine
labelfile-path=lebal.txt

batch-size=16
model-color-format=1
## 0=FP32, 1=INT8, 2=FP16 mode
network-mode=2
is-classifier=1
process-mode=2
classifier-async-mode=0
classifier-threshold=0
input-object-min-width=64
input-object-min-height=64
gpu-id=0
gie-unique-id=1
operate-on-gie-id=1
operate-on-class-ids=1;2

I had changed nvdsinfer_conversion.cu to add meanDivideDataBuffer (data come from the divides node in the config file) as bellows.

__global__ void
NvDsInferConvert_CxToP3FloatKernelWithMean(
    float *outBuffer,
    unsigned char *inBuffer,
    unsigned int width,
    unsigned int height,
    unsigned int pitch,
    unsigned int inputPixelSize,
    float scaleFactor,
    float *meanOffsetsDataBuffer,
    float *meanDivideDataBuffer)
{
    unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;

    if(col < width && row < height)
    {
        for(unsigned int k = 0; k < 3; k++)
        {
            outBuffer[width * height * k + row * width + col] =
                scaleFactor * (((float) inBuffer[row * pitch + col * inputPixelSize + k] -
                                meanOffsetsDataBuffer[(row * width * 3) + (col * 3) + k]) /
                               meanDivideDataBuffer[(row * width * 3) + (col * 3) + k]);
        }
    }
}

Now the model run ,but give wrong result.

How do you call NvDsInferConvert_CxToP3FloatKernelWithMean()?

Cross-checking NvDsInferConvert_CxToP3FloatKernelWithMean() and InputImageBlob(), the mapping between the parameters of two functions should be:

inBuffer : CUDA buffer of temp after “temp.convertTo(temp, CV_32F, 1. / 255.)”
meanOffsetsDataBuffer : CUDA buffer of data_mean
meanDivideDataBuffer: data_std

Hi mchi.

I had tested the configuration bellow,and had’t get the results I was hoping for.

offsets=0.2023;0.1994;0.2010
divides=0.4914;0.4822;0.4465
offsets=0.4914;0.4822;0.4465
divides=0.2023;0.1994;0.2010

The Related Code is as bellows.

convertFcn = NvDsInferConvert_C4ToP3RFloat;
convertFcn(outPtr, (unsigned char*) batchInput.inputFrames[i],
                   m_NetworkInfo.width, m_NetworkInfo.height,
                   batchInput.inputPitch, m_NetworkScaleFactor,
                   m_MeanOffsetDataBuffer, m_MeanDivideDataBuffer, m_PreProcessStream);
void
NvDsInferConvert_C4ToP3RFloat(
    float *outBuffer,
    unsigned char *inBuffer,
    unsigned int width,
    unsigned int height,
    unsigned int pitch,
    float scaleFactor,
    float *meanOffsetsDataBuffer,
    float *meanDivideDataBuffer,
    cudaStream_t stream)
{
    dim3 threadsPerBlock(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
    dim3 blocks((width + THREADS_PER_BLOCK_1) / threadsPerBlock.x, (height + THREADS_PER_BLOCK_1) / threadsPerBlock.y);

    if(meanOffsetsDataBuffer != NULL && meanDivideDataBuffer != NULL)
    {
        NvDsInferConvert_CxToP3RFloatKernelWithMean <<< blocks, threadsPerBlock, 0, stream>>>
        (outBuffer, inBuffer, width, height, pitch, 4, scaleFactor, meanOffsetsDataBuffer, meanDivideDataBuffer);
    }
}
__global__ void
NvDsInferConvert_CxToP3RFloatKernelWithMean(
    float *outBuffer,
    unsigned char *inBuffer,
    unsigned int width,
    unsigned int height,
    unsigned int pitch,
    unsigned int inputPixelSize,
    float scaleFactor,
    float *meanOffsetsDataBuffer,
    float *meanDivideDataBuffer
)
{
    unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;

    if(col < width && row < height)
    {
        for(unsigned int k = 0; k < 3; k++)
        {
            outBuffer[width * height * k + row * width + col] =
                scaleFactor * (((float) inBuffer[row * pitch + col * inputPixelSize + (2 - k)] -
                                meanOffsetsDataBuffer[(row * width * 3) + (col * 3) + k]) /
                               meanDivideDataBuffer[(row * width * 3) + (col * 3) + k]);
        }
    }
}

Could you check if you have below mapping in your NvDsInferConvert_C4ToP3RFloat() call ? From the code you pasted, there is not clear informantion to confirm this. You can add print in NvDsInferConvert_CxToP3RFloatKernelWithMean() to confirm that.

inBuffer : CUDA buffer of temp after “temp.convertTo(temp, CV_32F, 1. / 255.)”
meanOffsetsDataBuffer : CUDA buffer of data_mean
meanDivideDataBuffer: CUDA buffer of data_std

mchi:
I had add printf in NvDsInferConvert_CxToP3RFloatKernelWithMean() :
printf("=====%s %d\n",func,LINE);

and got the output message :
=====NvDsInferConvert_CxToP3RFloatKernelWithMean 155

I mean adding print in NvDsInferConvert_CxToP3RFloatKernelWithMean() to check if you pass correct “temp”, “data_mean” and “data_std” data to the CUDA kernel.

inBuffer : CUDA buffer of temp after “temp.convertTo(temp, CV_32F, 1. / 255.)”
meanOffsetsDataBuffer : CUDA buffer of data_mean
meanDivideDataBuffer: CUDA buffer of data_std

Hi, mchi.
The data is pass correctly, I had just tested for it.

Ok, so you could use the same method to check the output data.

Thanks,mchi.
I had found the program.
In our needs,the config “secondary-reinfer-interval=0” is Necessary.