I have a memory buffer allocated with cudaMallocManaged; this buffer is used as input to the AI model as a binding. I do the image preprocessing with OpenCV as follows: I create an OpenCV matrix pointing to that buffer on the GPU and preprocess the images in place. Like:
// Allocate unified-memory (cudaMallocManaged) buffers that will serve as the
// model's input/output bindings, then wrap the input binding in a GpuMat.
cv::Size inputSize = _modelConfig.inputSize;
int numBindings = engine->getNbIOTensors();
void *buffer_in, *buffer_out; // buffers for input and output of the model
// Dense HWC float32 image: height * width * 3 channels * sizeof(float) bytes.
int size_in = inputSize.height * inputSize.width * 3 * sizeof(float);
cudaMallocManaged(&buffer_in, size_in); // NOTE(review): return status unchecked
int size_out = 1 * sizeof(float); // output is just a single float number
cudaMallocManaged(&buffer_out, size_out);
// create cv::cuda::GpuMat outputMat pointing to bindings[0], which is buffer_in
// (non-owning view; this constructor assumes a dense row step of width * 3 * sizeof(float))
cv::cuda::GpuMat outputMat = cv::cuda::GpuMat(inputSize.height, inputSize.width, CV_32FC3, bindings[0]);
preprocess outputMat with the OpenCV…
Now I want to do the same with the VPI library. But it cannot point to the managed buffer bindings[0]. It requires either an NvBufSurface or a cudaArray. Is it possible to create, for example, an NvBufSurface from the managed memory? What would you advise in order to make a VPIImage point to bindings[0]?
what should I use instead of “?”
for buffertype there are options: VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR, VPI_IMAGE_BUFFER_CUDA_ARRAY, VPI_IMAGE_BUFFER_EGLIMAGE, VPI_IMAGE_BUFFER_NVBUFFER
int main()
{
// Driver-API init; the CUDA runtime calls below would lazily create a context anyway.
cuInit(0);
int img_width_output = 224;
int img_height_output = 224;
// Dense HWC float32 buffer: width * height * 3 channels * sizeof(float) bytes.
int size_output = img_width_output * img_height_output * 3 * sizeof(float);
void* output_buff; // a preprocessed image will be put here; this buffer will be used as a model input (binding)
cudaMalloc(&output_buff, size_output); // NOTE(review): return status unchecked
// how I do it with OpenCV GPU version
cv::Mat img_input;
cv::cuda::GpuMat img_gpu_in, img_gpu_processed;
std::string path = "304194-17086108.jpg"; // here use your image
img_input = cv::imread(path); // NOTE(review): not checked with empty() for load failure
img_gpu_in = cv::cuda::GpuMat(img_input); // build OpenCV GPU image based on the CPU version
// wrap an OpenCV GPU image around previously allocated output_buff
// (non-owning view; assumes a dense row step of width * 3 * sizeof(float))
img_gpu_processed = cv::cuda::GpuMat(img_height_output, img_width_output, CV_32FC3, output_buff);
// NOTE(review): cv::Size takes (width, height) — the arguments look swapped,
// which is harmless here only because both dimensions are 224; confirm.
cv::cuda::resize(img_gpu_in, img_gpu_processed, cv::Size(img_height_output, img_width_output), 0, 0, cv::INTER_NEAREST);
// the resulting resized image is stored in output_buff and I can use it as model input in future
// now I try to do the same with VPIimage instead of OpenCV GPU version
VPIImage img_vpi;
// Zero-initialized: every field not explicitly set below stays 0 — including
// buffer.pitch.format and buffer.pitch.planes[0].pitchBytes.
VPIImageData img_data = {};
img_data.bufferType = VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR;
img_data.buffer.pitch.numPlanes = 1;
img_data.buffer.pitch.planes[0].width = img_width_output;
img_data.buffer.pitch.planes[0].height = img_height_output;
img_data.buffer.pitch.planes[0].pixelType = VPI_PIXEL_TYPE_3F32;
img_data.buffer.pitch.planes[0].data = output_buff;
// NOTE(review): planes[0].pitchBytes is never assigned (stays 0); the follow-up
// later in this thread confirms the missing pitch value caused the error below.
//VPIStatus err_vpi = vpiImageCreateWrapper(&img_data, nullptr, VPI_BACKEND_CUDA, &img_vpi);
VPIStatus err_vpi = vpiImageCreateWrapper(&img_data, nullptr, 0, &img_vpi);
std::cout << vpiStatusGetName(err_vpi) << std::endl; // Output: VPI_ERROR_INVALID_ARGUMENT
img_gpu_in.release();
img_gpu_processed.release();
cudaFree(output_buff); // frees the buffer; img_vpi (if created) would now wrap freed memory
return 0;
int main()
{
// Driver-API init; the CUDA runtime calls below would lazily create a context anyway.
cuInit(0);
int img_width_output = 224;
int img_height_output = 224;
// Dense HWC float32 buffer: width * height * 3 channels * sizeof(float) bytes.
int size_output = img_width_output * img_height_output * 3 * sizeof(float);
void* output_buff; // a preprocessed image will be put here; this buffer will be used as a model input (binding)
cudaMalloc(&output_buff, size_output); // NOTE(review): return status unchecked
// how I do it with OpenCV GPU version
cv::Mat img_input;
cv::cuda::GpuMat img_gpu_in, img_gpu_processed;
std::string path = "frame0001.png"; // here use your image
img_input = cv::imread(path); // NOTE(review): not checked with empty() for load failure
img_gpu_in = cv::cuda::GpuMat(img_input); // build OpenCV GPU image based on the CPU version
// wrap an OpenCV GPU image around previously allocated output_buff
// (non-owning view; assumes a dense row step of width * 3 * sizeof(float))
img_gpu_processed = cv::cuda::GpuMat(img_height_output, img_width_output, CV_32FC3, output_buff);
// NOTE(review): cv::Size takes (width, height) — the arguments look swapped,
// which is harmless here only because both dimensions are 224; confirm.
cv::cuda::resize(img_gpu_in, img_gpu_processed, cv::Size(img_height_output, img_width_output), 0, 0, cv::INTER_NEAREST);
// the resulting resized image is stored in output_buff and I can use it as model input in future
// now I try to do the same with VPIimage instead of OpenCV GPU version
VPIImage img_vpi;
// NOTE(review): unlike the first attempt, `data` is NOT zero-initialized
// (no `= {}`), so unset fields such as buffer.pitch.numPlanes and
// planes[0].pitchBytes hold indeterminate values.
VPIImageData data;
data.bufferType = VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR;
// NOTE(review): Y8_ER denotes a single-plane 8-bit grayscale format, but the
// wrapped buffer was written as CV_32FC3 (3-channel float) — format and data
// disagree; confirm the intended image format.
data.buffer.pitch.format = VPI_IMAGE_FORMAT_Y8_ER;
data.buffer.pitch.planes[0].data = output_buff;
data.buffer.pitch.planes[0].width = img_width_output;
data.buffer.pitch.planes[0].height = img_height_output;
data.buffer.pitch.planes[0].pixelType = VPI_PIXEL_TYPE_DEFAULT;
// NOTE(review): planes[0].pitchBytes is still never assigned; the follow-up
// later in this thread confirms the missing pitch value caused the error below.
//VPIStatus err_vpi = vpiImageCreateWrapper(&data, nullptr, VPI_BACKEND_CUDA, &img_vpi);
VPIStatus err_vpi = vpiImageCreateWrapper(&data, nullptr, 0, &img_vpi);
std::cout << vpiStatusGetName(err_vpi) << std::endl; // Output: VPI_ERROR_INVALID_ARGUMENT
img_gpu_in.release();
img_gpu_processed.release();
cudaFree(output_buff); // frees the buffer; img_vpi (if created) would now wrap freed memory
return 0;
Thank you very much! The problem was that I didn’t set the pitch value. Could you please explain the idea behind this formula?
int pitch = (int((width-1)/256)+1)*256;
I understand that it has something to do with the memory layout. But what is its exact meaning, and where does the 256 value come from? (It appears to round the row width up to the next multiple of 256, i.e., align each row's start address.)