VPI image pointing to managed memory

I have a memory buffer allocated with cudaMallocManaged; this buffer is used as the input binding of the AI model. I do the image preprocessing with OpenCV as follows: I create an OpenCV GPU matrix pointing to that buffer and preprocess the images in place. Like this:

// Allocates managed input/output buffers for the model, registers them as
// bindings, and wraps the input buffer in a cv::cuda::GpuMat so preprocessing
// can write directly into it.
cv::Size inputSize = _modelConfig.inputSize;
int numBindings = engine->getNbIOTensors();
void *buffer_in, *buffer_out; // buffers for input and output of the model
// Input size in bytes: height * width * 3 values of type float.
int size_in = inputSize.height * inputSize.width * 3 * sizeof(float);
// NOTE(review): the return values of both cudaMallocManaged calls are ignored here.
cudaMallocManaged(&buffer_in, size_in);
int size_out = 1 * sizeof(float); // output is just a single float number
cudaMallocManaged(&buffer_out, size_out);

// One pointer slot per binding; only slots 0 (input) and 1 (output) are filled below.
void** bindings = new void*[numBindings];
bindings[0] = buffer_in;
bindings[1] = buffer_out;

// Create cv::cuda::GpuMat outputMat pointing to bindings[0], which is buffer_in.
// No step is passed, so the rows are treated as tightly packed.
cv::cuda::GpuMat outputMat = cv::cuda::GpuMat(inputSize.height, inputSize.width, CV_32FC3, bindings[0]);
preprocess outputMat with the OpenCV…

Now I want to do the same with the VPI library, but a VPIImage cannot point to the managed buffer bindings[0]: it seems to require either an NvBufSurface or a cudaArray. Is it possible to make, for example, an NvBufSurface from the managed memory? What would you advise in order to make a VPIImage point to bindings[0]?

With the best wishes,
Valeriy

Hi,

VPI can wrap the GPU buffer.
Which VPI version do you use? 1.2 or 2.x?

Thanks.

I use vpi 2.
How can I wrap the GPU buffer? As I see it, the code should look like:

cudaMallocManaged(&GPU_buffer, buffer_size);
VPIIMage img_vpi;
VPIImageData img_data;
img_data.bufferType = ?;
img_data.buffer.? = GPU_buffer;

VPIStatus err_vpi = vpiImageCreateWrapper(&img_data, nullptr, VPI_BACKEND_CUDA, &img_vpi);

What should I use instead of “?”?
For bufferType, the options are: VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR, VPI_IMAGE_BUFFER_CUDA_ARRAY, VPI_IMAGE_BUFFER_EGLIMAGE, VPI_IMAGE_BUFFER_NVBUFFER

Could you please help me with my question?

Hi,

Sorry for the late update.
Here is an example of wrapping a CUDA buffer with VPI 2.x, for your reference:

// Given width, height, d_ptr (GPU buffer pointer):
// wrap the buffer as a single-plane, pitch-linear CUDA VPIImage.
// NOTE(review): this snippet sets neither data.buffer.pitch.format nor
// planes[0].pitchBytes; the complete sample posted later in this thread
// assigns both, and the asker reports that the missing pitch was the problem.

VPIImage img;
VPIImageData data = {}; // zero-initialize every field that is not assigned below
data.bufferType = VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR;
data.buffer.pitch.numPlanes = 1;
data.buffer.pitch.planes[0].width  = width;
data.buffer.pitch.planes[0].height = height;
data.buffer.pitch.planes[0].data   = d_ptr;

CHECK_STATUS(vpiImageCreateWrapper(&data, nullptr, 0, &img));

Thanks.

Thank you very much. I tried your code on Nvidia DevKit with Jetson AGX Xavier as follows:

// Repro: allocate a device buffer and try to wrap it as a VPIImage,
// then print the name of the returned status.
void* buff;
int width = 100;
int height = 100;
// Buffer size in bytes: width * height * 4 values of type float.
// NOTE(review): the cudaMalloc return value is not checked.
cudaMalloc(&buff, width * height * 4 * sizeof(float));
VPIImage img_vpi;
VPIImageData img_data = {};
img_data.bufferType = VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR;
img_data.buffer.pitch.numPlanes = 1;
img_data.buffer.pitch.planes[0].width = width;
img_data.buffer.pitch.planes[0].height = height;
img_data.buffer.pitch.planes[0].data = buff;
// NOTE(review): neither buffer.pitch.format nor planes[0].pitchBytes is assigned
// before this call; the working sample later in this thread sets both.
VPIStatus err_vpi = vpiImageCreateWrapper(&img_data, nullptr, 0, &img_vpi);
std::cout << vpiStatusGetName(err_vpi) << std::endl;

the output is:
VPI_ERROR_INVALID_ARGUMENT

Hi,

We want to check the invalid argument error in our environment.
Could you wrap the code into a runnable source and share it with us?

Thanks.

#include <iostream>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/cudawarping.hpp> // for cv::cuda::resize
#include <cuda.h>
#include <cuda_runtime.h>
#include <vpi/Image.h>
#include <vpi/Status.h>
#include <vpi/ImageFormat.h>

// Repro program: resizes an image with OpenCV's CUDA module using a GpuMat that
// wraps a preallocated device buffer, then tries to wrap the same buffer as a
// VPIImage and prints the name of the resulting VPI status.
int main()
{
cuInit(0);
int img_width_output = 224;
int img_height_output = 224;
// Output size in bytes: width * height * 3 values of type float.
int size_output = img_width_output * img_height_output * 3 * sizeof(float);
void* output_buff; // a preprocessed image will be put here; this buffer will be used as a model input (binding)
// NOTE(review): the cudaMalloc return value is not checked.
cudaMalloc(&output_buff, size_output);

// how I do it with OpenCV GPU version
cv::Mat img_input;
cv::cuda::GpuMat img_gpu_in, img_gpu_processed;
std::string path = "304194-17086108.jpg"; // here use your image
// NOTE(review): the result of cv::imread is not checked for an empty image.
img_input = cv::imread(path);
img_gpu_in = cv::cuda::GpuMat(img_input); // build OpenCV GPU image based on the CPU version
// wrap an OpenCV GPU image around previously allocated output_buff
img_gpu_processed = cv::cuda::GpuMat(img_height_output, img_width_output, CV_32FC3, output_buff);
// NOTE(review): cv::Size is built here as (height, width); presumably it expects
// (width, height) - harmless in this program because both values are 224, but confirm.
// NOTE(review): img_gpu_in comes straight from cv::imread while the wrapper is
// CV_32FC3 - verify that resize really writes into output_buff instead of
// reallocating its destination.
cv::cuda::resize(img_gpu_in, img_gpu_processed, cv::Size(img_height_output, img_width_output), 0, 0, cv::INTER_NEAREST);
// the resulting resized image is stored in output_buff and I can use it as model input in future

// now I try to do the same with VPIimage instead of OpenCV GPU version
VPIImage img_vpi;
VPIImageData img_data = {};
img_data.bufferType = VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR;
img_data.buffer.pitch.numPlanes = 1;
img_data.buffer.pitch.planes[0].width = img_width_output;
img_data.buffer.pitch.planes[0].height = img_height_output;
img_data.buffer.pitch.planes[0].pixelType = VPI_PIXEL_TYPE_3F32;
img_data.buffer.pitch.planes[0].data = output_buff;
// NOTE(review): planes[0].pitchBytes and buffer.pitch.format are never assigned
// in this program; the working sample later in this thread sets both.
//VPIStatus err_vpi = vpiImageCreateWrapper(&img_data, nullptr, VPI_BACKEND_CUDA, &img_vpi);
VPIStatus err_vpi = vpiImageCreateWrapper(&img_data, nullptr, 0, &img_vpi);
std::cout << vpiStatusGetName(err_vpi) << std::endl; // Output: VPI_ERROR_INVALID_ARGUMENT

// Release the GpuMat headers before freeing the device buffer that one of them wraps.
img_gpu_in.release();
img_gpu_processed.release();
cudaFree(output_buff);

return 0;

}

Compile it with:
nvcc -std=c++17 -I /usr/local/cuda/include/ -I /usr/local/include/opencv4/ -I /opt/nvidia/vpi2/include/ -lcuda -lcudart -lopencv_core -lopencv_imgcodecs -lopencv_cudawarping -lnvvpi ForGuysFromNvidia.cu -o ForGuysFromNvidia

Thanks.

Will get back to you after checking.

Hi,

The line below is not correct:

img_data.buffer.pitch.planes[0].pixelType = VPI_PIXEL_TYPE_3F32;

Only the formats listed below are supported by VPI Image:
https://docs.nvidia.com/vpi/ImageFormat_8h.html
For example, please try this instead:

img_data.buffer.pitch.format = VPI_IMAGE_FORMAT_F32;

Thanks.

I replaced the line
img_data.buffer.pitch.planes[0].pixelType = VPI_PIXEL_TYPE_3F32;

with
img_data.buffer.pitch.format = VPI_IMAGE_FORMAT_F32;

but still have the same output VPI_ERROR_INVALID_ARGUMENT

Hi,

Could you check whether the setting below works?

// Suggested setup: describe the buffer as a pitch-linear CUDA image in
// VPI_IMAGE_FORMAT_Y8_ER with the default pixel type for plane 0.
// NOTE(review): unlike the other snippets in this thread, `data` is not
// zero-initialized (no `= {}`), and numPlanes and planes[0].pitchBytes are
// never assigned here; the working sample posted later sets both.
VPIImageData data;
data.bufferType = VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR;
data.buffer.pitch.format = VPI_IMAGE_FORMAT_Y8_ER;
data.buffer.pitch.planes[0].data      = image_ptr;
data.buffer.pitch.planes[0].width     = image_width;
data.buffer.pitch.planes[0].height    = image_height;
data.buffer.pitch.planes[0].pixelType = VPI_PIXEL_TYPE_DEFAULT;

Thanks.

I tried. The whole code is the following:

#include <iostream>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/cudawarping.hpp> // for cv::cuda::resize
#include <cuda.h>
#include <cuda_runtime.h>
#include <vpi/Image.h>
#include <vpi/Status.h>
#include <vpi/ImageFormat.h>

// Second repro program: same OpenCV preprocessing as before, followed by an
// attempt to wrap the device buffer as a VPI_IMAGE_FORMAT_Y8_ER VPIImage; the
// name of the resulting VPI status is printed.
int main()
{
cuInit(0);
int img_width_output = 224;
int img_height_output = 224;
// Output size in bytes: width * height * 3 values of type float.
int size_output = img_width_output * img_height_output * 3 * sizeof(float);
void* output_buff; // a preprocessed image will be put here; this buffer will be used as a model input (binding)
// NOTE(review): the cudaMalloc return value is not checked.
cudaMalloc(&output_buff, size_output);

// how I do it with OpenCV GPU version
cv::Mat img_input;
cv::cuda::GpuMat img_gpu_in, img_gpu_processed;
std::string path = "frame0001.png"; // here use your image
// NOTE(review): the result of cv::imread is not checked for an empty image.
img_input = cv::imread(path);
img_gpu_in = cv::cuda::GpuMat(img_input); // build OpenCV GPU image based on the CPU version
// wrap an OpenCV GPU image around previously allocated output_buff
img_gpu_processed = cv::cuda::GpuMat(img_height_output, img_width_output, CV_32FC3, output_buff);
// NOTE(review): cv::Size is built here as (height, width); presumably it expects
// (width, height) - harmless in this program because both values are 224, but confirm.
cv::cuda::resize(img_gpu_in, img_gpu_processed, cv::Size(img_height_output, img_width_output), 0, 0, cv::INTER_NEAREST);
// the resulting resized image is stored in output_buff and I can use it as model input in future

// now I try to do the same with VPIimage instead of OpenCV GPU version
VPIImage img_vpi;

// NOTE(review): `data` is declared without `= {}`, so every field that is not
// assigned below (including numPlanes and planes[0].pitchBytes) is left
// uninitialized; the working sample later in this thread zero-initializes the
// struct and sets both fields.
VPIImageData data;
data.bufferType = VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR;
data.buffer.pitch.format = VPI_IMAGE_FORMAT_Y8_ER;
data.buffer.pitch.planes[0].data      = output_buff;
data.buffer.pitch.planes[0].width     = img_width_output;
data.buffer.pitch.planes[0].height    = img_height_output;
data.buffer.pitch.planes[0].pixelType = VPI_PIXEL_TYPE_DEFAULT;

//VPIStatus err_vpi = vpiImageCreateWrapper(&data, nullptr, VPI_BACKEND_CUDA, &img_vpi);
VPIStatus err_vpi = vpiImageCreateWrapper(&data, nullptr, 0, &img_vpi);
std::cout << vpiStatusGetName(err_vpi) << std::endl; // Output: VPI_ERROR_INVALID_ARGUMENT

// Release the GpuMat headers before freeing the device buffer that one of them wraps.
img_gpu_in.release();
img_gpu_processed.release();
cudaFree(output_buff);

return 0;

}

The error is still the same.

Hi,

Sorry for the late reply.
Please see the sample below:

#include <cuda_runtime.h>
#include <vpi/Image.h>
#include <vpi/Status.h>

#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>

// Evaluates STMT (an expression yielding a VPIStatus) exactly once. If the
// status is anything other than VPI_SUCCESS, fetches the last VPI status
// message and throws std::runtime_error with the text
// "line <N>: <status name>: <message>".
//
// The do/while(0) wrapper deliberately has NO trailing semicolon: the caller
// writes `CHECK_STATUS(...);`, so the macro expands to exactly one statement
// and stays safe inside an unbraced if/else.
#define CHECK_STATUS(STMT)                                    \
    do                                                        \
    {                                                         \
        VPIStatus status = (STMT);                            \
        if (status != VPI_SUCCESS)                            \
        {                                                     \
            char buffer[VPI_MAX_STATUS_MESSAGE_LENGTH];       \
            vpiGetLastStatusMessage(buffer, sizeof(buffer));  \
            std::ostringstream ss;                            \
            ss << "line " << __LINE__ <<": ";                 \
            ss << vpiStatusGetName(status) << ": " << buffer; \
            throw std::runtime_error(ss.str());               \
        }                                                     \
    } while (0)


int main(int argc, char *argv[])
{
    int width  = 224;
    int height = 224;
    int pitch  = (int((width-1)/256)+1)*256;

    char* ptr;
    cudaMalloc((void**)&ptr, pitch*height*sizeof(char));

    VPIImage img = NULL;
    VPIImageData data = {};
    data.bufferType = VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR;
    data.buffer.pitch.numPlanes = 1;
    data.buffer.pitch.planes[0].width      = width;
    data.buffer.pitch.planes[0].height     = height;
    data.buffer.pitch.planes[0].pitchBytes = pitch;
    data.buffer.pitch.planes[0].data       = ptr;
    data.buffer.pitch.format               = VPI_IMAGE_FORMAT_Y8_ER;

    CHECK_STATUS(vpiImageCreateWrapper(&data, nullptr, 0, &img));

    vpiImageDestroy(img);
    cudaFree(ptr);
    return 0;
}

Thanks.

Thank you very much! The problem was that I didn’t set the pitch value. Could you please explain the idea behind this formula?
int pitch = (int((width-1)/256)+1)*256;

I understand that it has something to do with the memory layout. But what exactly does it mean, and where does the value 256 come from?

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.