I attempted to allocate pitched memory and copy the data into it from the CUDA array like this:
// Describe a two-plane NV12_ER image in pitch-linear form, allocate one pitched
// CUDA buffer per plane, copy each plane out of the EGL frame's CUDA arrays,
// and hand the result to VPI as a wrapped image.
VPIImageBuffer vpi_image_buffer;
vpi_image_buffer.pitch.format = VPI_IMAGE_FORMAT_NV12_ER;
vpi_image_buffer.pitch.numPlanes = 2;
// Plane 0 (Y): same width/height as the EGL frame, VPI_PIXEL_TYPE_U8 pixels.
vpi_image_buffer.pitch.planes[0].width = cuda_egl_frame_.width;
vpi_image_buffer.pitch.planes[0].height = cuda_egl_frame_.height;
vpi_image_buffer.pitch.planes[0].pixelType = VPI_PIXEL_TYPE_U8;
// Plane 1 (UV): half the frame's width and height, VPI_PIXEL_TYPE_2U8 pixels.
vpi_image_buffer.pitch.planes[1].width = cuda_egl_frame_.width / 2;
vpi_image_buffer.pitch.planes[1].height = cuda_egl_frame_.height / 2;
vpi_image_buffer.pitch.planes[1].pixelType = VPI_PIXEL_TYPE_2U8;
// Allocate memory
// NOTE(review): cuda_response is reassigned by every CUDA call below but is
// never compared against cudaSuccess in this snippet.
size_t y_pitch;
auto cuda_response = cudaMallocPitch(
&vpi_image_buffer.pitch.planes[0].data,
&y_pitch,
vpi_image_buffer.pitch.planes[0].width,
vpi_image_buffer.pitch.planes[0].height
);
vpi_image_buffer.pitch.planes[0].pitchBytes = y_pitch;
size_t uv_pitch;
// NOTE(review): the width argument of cudaMallocPitch is a byte count, while
// planes[1].width is a pixel count and plane 1 is declared as 2U8 — presumably
// two bytes per pixel, so this requests half of each UV row. Confirm against
// the CUDA and VPI documentation.
cuda_response = cudaMallocPitch(
&vpi_image_buffer.pitch.planes[1].data,
&uv_pitch,
vpi_image_buffer.pitch.planes[1].width,
vpi_image_buffer.pitch.planes[1].height
);
// NOTE(review): this overwrites planes[0].pitchBytes (set to y_pitch above)
// with uv_pitch; planes[1].pitchBytes is never assigned anywhere in this
// snippet. The index was likely meant to be 1.
vpi_image_buffer.pitch.planes[0].pitchBytes = uv_pitch;
// Copy data
// NOTE(review): the destination pitch passed for plane 0 is uv_pitch, not the
// y_pitch returned by the plane 0 allocation.
cuda_response = cudaMemcpy2DFromArray(
vpi_image_buffer.pitch.planes[0].data,
uv_pitch,
reinterpret_cast<cudaArray_t>(cuda_egl_frame_.frame.pArray[0]),
0, 0, vpi_image_buffer.pitch.planes[0].width,
vpi_image_buffer.pitch.planes[0].height,
cudaMemcpyDeviceToDevice
);
// NOTE(review): as with the allocation, the copy width is a byte count; for the
// 2U8 plane planes[1].width presumably covers only half of each row — confirm.
cuda_response = cudaMemcpy2DFromArray(
vpi_image_buffer.pitch.planes[1].data,
uv_pitch,
reinterpret_cast<cudaArray_t>(cuda_egl_frame_.frame.pArray[1]),
0, 0, vpi_image_buffer.pitch.planes[1].width,
vpi_image_buffer.pitch.planes[1].height,
cudaMemcpyDeviceToDevice
);
// Create Image data: attach the plane description and mark it as CUDA
// pitch-linear memory.
VPIImageData vpi_image_data;
vpi_image_data.buffer = vpi_image_buffer;
vpi_image_data.bufferType = VPI_IMAGE_BUFFER_CUDA_PITCH_LINEAR;
// Create VPIImage
// NOTE(review): vpi_image is written here and then written again by
// vpiImageCreateWrapper below; the handle from this first call is not destroyed
// in this snippet, so it presumably leaks. The 1920x1200 size is hard-coded
// rather than taken from cuda_egl_frame_.
VPIImage vpi_image;
CHECK_STATUS(
vpiImageCreate(1920, 1200,
VPI_IMAGE_FORMAT_NV12_ER, VPI_BACKEND_CUDA, &vpi_image));
// Wrap the image
CHECK_STATUS(vpiImageCreateWrapper(&vpi_image_data, nullptr, VPI_BACKEND_CUDA, &vpi_image));
The wrapping step results in the following error:
VPI_ERROR_INVALID_ARGUMENT: Plane row stride must be greater or equal row length in bytes
Edit:
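I got it to work. The width arguments of cudaMallocPitch and cudaMemcpy2DFromArray are in bytes, and the UV plane stores two bytes per pixel (interleaved U and V), so for the UV plane the width has to be doubled during both allocation and copying, like this: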
cuda_response = cudaMallocPitch(
&vpi_image_buffer.pitch.planes[1].data,
&uv_pitch,
vpi_image_buffer.pitch.planes[1].width * 2,
vpi_image_buffer.pitch.planes[1].height
);
cuda_response = cudaMemcpy2DFromArray(
vpi_image_buffer.pitch.planes[1].data,
uv_pitch,
reinterpret_cast<cudaArray_t>(cuda_egl_frame_.frame.pArray[1]),
0, 0, vpi_image_buffer.pitch.planes[1].width * 2,
vpi_image_buffer.pitch.planes[1].height,
cudaMemcpyDeviceToDevice
);