Import block linear NvSciBufObj with pitch to CUDA

Oh, it's still available here. Anyway, I'll paste it below:

// gcc -o cuda_nvmedia_test ./samples/cuda_nvmedia_test.cpp -I ./nvmedia_include/usr/include/ -I /usr/local/cuda-11.4/targets/aarch64-linux/include -std=c++17 -lnvmedia_dla -lnvscibuf -lnvscisync -lnvmedia2d -lnvmedia_iep_sci -lnvmedia_ide_sci -lnvmedialdc -lpthread -lstdc++ -L /usr/local/cuda-11.4/targets/aarch64-linux/lib/ -lcudart -DDRIVEOS_6

// gcc -o cuda_nvmedia_test ./samples/cuda_nvmedia_test.cpp -I /usr/local/cuda-10.2/targets/aarch64-linux/include -std=c++17 -lnvmedia_dla -lnvscibuf -lnvscisync -lnvmedia_2d -lnvmedia_core -lnvmedia -lpthread -lstdc++ -L /usr/local/cuda-10.2/targets/aarch64-linux/lib/ -lcudart -DDRIVEOS_5_COMPAT

#include <nvscibuf.h>
#include <nvscisync.h>
#ifdef DRIVEOS_5_COMPAT
    #include <nvmedia_surface.h>
    #include <nvmedia_ldc.h>
    #include <nvmedia_2d.h>
    #include <nvmedia_image.h>
    #include <nvmedia_image_nvscibuf.h>
    #include <nvmedia_2d_nvscisync.h>
    #include <nvmedia_ldc_nvscisync.h>
#else
    #include <nvmedia_6x/nvmedia_ldc.h>
    #include <nvmedia_6x/nvmedia_2d.h>
    #include <nvmedia_6x/nvmedia_2d_sci.h>
    #include <nvmedia_6x/nvmedia_ldc_sci.h>
    #include <nvmedia_6x/nvmedia_iep.h>
    #include <nvmedia_6x/nvmedia_ldc_util.h>
#endif
#include <cuda_runtime.h>

#include <stddef.h>
#include <array>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <thread>
#include <type_traits>
#include <vector>

// Prints a printf-style message followed by a newline.
// Wrapped in do { } while (0) so the macro expands to exactly one statement and
// therefore behaves correctly inside an unbraced if/else.
#define LOG_ERR(...) do { printf(__VA_ARGS__); printf("\n"); } while (0)

// Dimensions, in pixels, of the test image that main() allocates and fills.
static constexpr size_t imgW = 2912;
static constexpr size_t imgH = 1080;

// Host-side input planes that main() uploads with cudaMemcpy2DToArray.
// Y holds one byte per pixel; UV holds imgH / 2 rows of imgW bytes, which main()
// fills as consecutive (U, V) byte pairs.
std::array<uint8_t, imgW * imgH> inImgCudaY;
std::array<uint8_t, (imgW) * (imgH / 2)> inImgCudaUV;

// Host-side output planes (full-size Y, half-width/half-height U and V) that
// main() reads the buffer back into and then writes to the .yuv file.
std::array<uint8_t, imgW * imgH> outImgCpuY;
std::array<uint8_t, (imgW / 2) * (imgH / 2)> outImgCpuU;
std::array<uint8_t, (imgW / 2) * (imgH / 2)> outImgCpuV;
// Destination pointer table, in Y, U, V order, handed to the read-back call in main().
std::array<void*, 3> outImgPtrs{outImgCpuY.data(), outImgCpuU.data(), outImgCpuV.data()};

// Owning smart-pointer aliases for the C handles used in this file.
// Each deleter is a std::integral_constant carrying the matching free/destroy
// function pointer as a compile-time value, so the unique_ptr stores no
// per-instance deleter state and releases the handle through that function.
using ScopedSciBufObj = std::unique_ptr<NvSciBufObjRefRec, std::integral_constant<decltype(NvSciBufObjFree)*, NvSciBufObjFree>>;
using ScopedCpuWaitContext = std::unique_ptr<NvSciSyncCpuWaitContextRec, std::integral_constant<decltype(NvSciSyncCpuWaitContextFree)*, NvSciSyncCpuWaitContextFree>>;
using ScopedNvMedia2D = std::unique_ptr<NvMedia2D, std::integral_constant<decltype(NvMedia2DDestroy)*, NvMedia2DDestroy>>;
using ScopedNvSciSyncAttrList = std::unique_ptr<NvSciSyncAttrListRec, std::integral_constant<decltype(NvSciSyncAttrListFree)*, NvSciSyncAttrListFree>>;
using ScopedNvSciBufAttrList = std::unique_ptr<NvSciBufAttrListRec, std::integral_constant<decltype(NvSciBufAttrListFree)*, NvSciBufAttrListFree>>;
using ScopedSciSyncObj = std::unique_ptr<NvSciSyncObjRec, std::integral_constant<decltype(NvSciSyncObjFree)*, NvSciSyncObjFree>>;

#ifdef DRIVEOS_5_COMPAT
    static void initNvMediaImageNvSciBuf()
    {
        static constexpr auto initLambda{[]() -> void* {
            
            if(auto const err{NvMediaImageNvSciBufInit()}; err != NVMEDIA_STATUS_OK) {
                LOG_ERR("Could not init NvMediaImageNvSciBuf: %d", err);
                return nullptr;
            }
            return reinterpret_cast<void*>(1); // important to return something non-null (unique_ptr won't call the deleter with nullptr)
        }};

        static constexpr auto dummyDeleterFunctionPtr{
            +[](void*) {
                #ifdef DRIVEOS_5_COMPAT
                    NvMediaImageNvSciBufDeinit();
                #endif
            }
        };

        static std::unique_ptr<void, void(*)(void*)> const nvMediaSciBufInitDeinit{
            initLambda(),
            dummyDeleterFunctionPtr
        };
    }

    // Returns the process-wide NvMediaDevice, creating it on first use.
    // The result is nullptr when NvMediaDeviceCreate() failed; the failure is
    // logged once, at creation time.
    static NvMediaDevice* getNvMediaDevice()
    {
        using DeviceDeleter = std::integral_constant<decltype(NvMediaDeviceDestroy)*, NvMediaDeviceDestroy>;
        using ScopedDevice = std::unique_ptr<NvMediaDevice, DeviceDeleter>;

        static ScopedDevice const device{[]() {
            // The NvMediaImage/NvSciBuf layer is initialised before the device is created.
            initNvMediaImageNvSciBuf();
            auto const created{NvMediaDeviceCreate()};
            if(created == nullptr) {
                LOG_ERR("Could not create NvMediaDevice");
            }
            return created;
        }()};

        return device.get();
    }
#endif

// Returns the process-wide NvSciBufModule, opening it on first use.
// The handle is held by a function-local static unique_ptr whose deleter is
// NvSciBufModuleClose. Returns nullptr when NvSciBufModuleOpen() failed.
static NvSciBufModule getNvSciBufModule()
{
    using ModuleDeleter = std::integral_constant<decltype(NvSciBufModuleClose)*, NvSciBufModuleClose>;
    using ScopedModule = std::unique_ptr<NvSciBufModuleRec, ModuleDeleter>;

    static ScopedModule const holder{[]() -> NvSciBufModule {
        #ifdef DRIVEOS_5_COMPAT
            initNvMediaImageNvSciBuf();
        #endif
        NvSciBufModule opened{};
        auto const status{NvSciBufModuleOpen(&opened)};
        if(status != NvSciError_Success)
        {
            LOG_ERR("NvSciBufModuleOpen failed: %d", status);
            return nullptr;
        }
        return opened;
    }()};

    return holder.get();
}

// Returns the process-wide NvSciSyncModule, opening it on first use.
// The handle is held by a function-local static unique_ptr whose deleter is
// NvSciSyncModuleClose. Returns nullptr when NvSciSyncModuleOpen() failed.
static NvSciSyncModule getNvSciSyncModule()
{
    using ModuleDeleter = std::integral_constant<decltype(NvSciSyncModuleClose)*, NvSciSyncModuleClose>;
    using ScopedModule = std::unique_ptr<NvSciSyncModuleRec, ModuleDeleter>;

    static ScopedModule const holder{[]() -> NvSciSyncModule {
        #ifdef DRIVEOS_5_COMPAT
            initNvMediaImageNvSciBuf();
        #endif
        NvSciSyncModule opened{};
        auto const status{NvSciSyncModuleOpen(&opened)};
        if(status != NvSciError_Success)
        {
            LOG_ERR("NvSciSyncModuleOpen failed: %d", status);
            return nullptr;
        }
        return opened;
    }()};

    return holder.get();
}

#ifdef DRIVEOS_5_COMPAT
    // Builds a reconciled NvSciBufAttrList from two unreconciled lists: one filled
    // by NvMedia for the given surface type / allocation attributes, and one for
    // the CUDA side that requests an image buffer with the given access permission.
    // Returns the reconciled list (the caller takes ownership), or nullptr when any
    // step fails; each failure is logged.
    NvSciBufAttrList nvMediaSurfAttrsToNvSciBufAttrList(NvMediaSurfaceType surfType, NvMediaSurfAllocAttr const *surfAttrs, uint32_t numSurfAttrs, NvSciBufAttrValAccessPerm accessPerm)
    {
        // Creates an empty attribute list and stores it in `out`; false on failure.
        auto const createList{[](ScopedNvSciBufAttrList& out) -> bool {
            NvSciBufAttrList raw{nullptr};
            if (NvSciBufAttrListCreate(getNvSciBufModule(), &raw) != NvSciError_Success) {
                LOG_ERR("Couldn't create NvSciBufAttr");
                return false;
            }
            out.reset(raw);
            return true;
        }};

        // NvMedia requirements
        ScopedNvSciBufAttrList nvmediaList{nullptr};
        if (!createList(nvmediaList)) {
            return nullptr;
        }

        auto const fillStatus{NvMediaImageFillNvSciBufAttrs(getNvMediaDevice(), surfType, surfAttrs, numSurfAttrs, 0, nvmediaList.get())};
        if (fillStatus != NVMEDIA_STATUS_OK) {
            LOG_ERR("Couldn't fill NvSciBufAttrs");
            return nullptr;
        }

        // CUDA + NPPI requirements
        ScopedNvSciBufAttrList cudaList{nullptr};
        if (!createList(cudaList)) {
            return nullptr;
        }

        static constexpr NvSciBufType bufType = NvSciBufType_Image;
        NvSciBufAttrKeyValuePair cudaPairs[] = {
            { NvSciBufGeneralAttrKey_Types, &bufType, sizeof(bufType) },
            { NvSciBufGeneralAttrKey_RequiredPerm, &accessPerm, sizeof(accessPerm) },
        };
        auto const setStatus{NvSciBufAttrListSetAttrs(cudaList.get(), cudaPairs, sizeof(cudaPairs) / sizeof(cudaPairs[0]))};
        if (setStatus != NvSciError_Success) {
            LOG_ERR("Couldn't fill NvSciBufAttrs");
            return nullptr;
        }

        // Reconcile both requirement sets into the list that is handed back.
        NvSciBufAttrList unreconciled[2] = {nvmediaList.get(), cudaList.get()};
        NvSciBufAttrList reconciledList{nullptr};
        NvSciBufAttrList conflictList{nullptr};
        auto const reconcileStatus{NvSciBufAttrListReconcile(unreconciled, 2, &reconciledList, &conflictList)};
        if (reconcileStatus != NvSciError_Success) {
            LOG_ERR("Couldn't reconcile NvSciBufAttr and allocate NvSciBufObj");
            if (conflictList != nullptr) {
                NvSciBufAttrListFree(conflictList);
            }
        }
        return reconciledList;
    }
#else
    // Returns the NvSciRmGpuId of every CUDA device, computed once and cached.
    // Each id is a byte copy of the device UUID reported by cudaGetDeviceProperties.
    // The vector is empty when the device count or any device's properties could
    // not be queried; the failing call is logged.
    static const std::vector<NvSciRmGpuId>& getGpuIds()
    {
        static const std::vector<NvSciRmGpuId> cachedIds{[]() -> std::vector<NvSciRmGpuId> {
            int deviceCount{0};
            auto const countStatus{cudaGetDeviceCount(&deviceCount)};
            if (countStatus != cudaSuccess)
            {
                LOG_ERR("Couldn't get cuda device count: %d", countStatus);
                return {};
            }

            std::vector<NvSciRmGpuId> ids{};
            ids.reserve(deviceCount);

            int device{0};
            while (device < deviceCount)
            {
                cudaDeviceProp props{};
                auto const propStatus{cudaGetDeviceProperties(&props, device)};
                if (propStatus != cudaSuccess)
                {
                    LOG_ERR("Couldn't get cuda device properties: %d", propStatus);
                    return {};
                }

                // Copy the 16 UUID bytes into a zero-initialised id, then store it.
                NvSciRmGpuId id{};
                memcpy(id.bytes, props.uuid.bytes, 16 * sizeof(uint8_t));
                ids.push_back(id);
                ++device;
            }
            return ids;
        }()};

        return cachedIds;
    }
#endif

/**
 * Allocates an NvSciBuf image of the requested size.
 *
 * With DRIVEOS_5_COMPAT the attribute list is derived from an NvMedia surface
 * description (YUV 420 semi-planar, 8-bit unsigned, block linear) and reconciled
 * together with a CUDA-side list. Otherwise a single attribute list is filled by
 * hand (block-linear layout, two planes: Y8 and U8_V8, CPU access enabled, every
 * CUDA GPU listed) and extended by NvMedia2DFillNvSciBufAttrList before it is
 * reconciled.
 *
 * @param width  Width of the luma plane in pixels.
 * @param height Height of the luma plane in pixels.
 * @return Owning handle to the allocated buffer object, or nullptr on any
 *         failure (the failing step is logged).
 */
ScopedSciBufObj allocateImage(uint32_t width, uint32_t height)
{
    ScopedNvSciBufAttrList imageAttrReconciled{nullptr};

    #ifdef DRIVEOS_5_COMPAT
        {
            NvMediaSurfAllocAttr surfAllocAttr[] = {
                { NVM_SURF_ATTR_WIDTH, width },
                { NVM_SURF_ATTR_HEIGHT, height },
                { NVM_SURF_ATTR_EMB_LINES_TOP, 0 },
                { NVM_SURF_ATTR_EMB_LINES_BOTTOM, 0 },
                { NVM_SURF_ATTR_CPU_ACCESS, NVM_SURF_ATTR_CPU_ACCESS_CACHED },
                { NVM_SURF_ATTR_ALLOC_TYPE, NVM_SURF_ATTR_ALLOC_ISOCHRONOUS },
                { NVM_SURF_ATTR_PEER_VM_ID, 0 },
                { NVM_SURF_ATTR_SCAN_TYPE, NVM_SURF_ATTR_SCAN_PROGRESSIVE },
                { NVM_SURF_ATTR_COLOR_STD_TYPE, NVM_SURF_ATTR_COLOR_STD_REC709_ER }
            };

            NvMediaSurfFormatAttr surfFormatAttr[NVM_SURF_FMT_ATTR_MAX]{};
            NVM_SURF_FMT_SET_ATTR_YUV(surfFormatAttr, YUV, 420, SEMI_PLANAR, UINT, 8, BL);

            NvMediaSurfaceType const surfType = NvMediaSurfaceFormatGetType(surfFormatAttr, NVM_SURF_FMT_ATTR_MAX);

            NvSciBufAttrList const rawImageAttrReconciled{nvMediaSurfAttrsToNvSciBufAttrList(surfType, surfAllocAttr, sizeof(surfAllocAttr) / sizeof(NvMediaSurfAllocAttr), NvSciBufAccessPerm_ReadWrite)};
            imageAttrReconciled.reset(rawImageAttrReconciled);
        }
    #else
        static constexpr NvSciBufAttrValImageLayoutType layout{NvSciBufImage_BlockLinearType};
        static constexpr NvSciBufAttrValAccessPerm accessPerm{NvSciBufAccessPerm_ReadWrite};
        static constexpr NvSciBufType bufType{NvSciBufType_Image};
        static constexpr bool needCPUAccess{true};
        static constexpr bool enableCpuCache{true};
        static constexpr uint32_t planeCount{2};
        static constexpr uint64_t padding[]{0, 0};
        static constexpr NvSciBufAttrValColorFmt colorFormat[]{NvSciColor_Y8, NvSciColor_U8_V8};
        static constexpr NvSciBufAttrValColorStd colorStd[]{NvSciColorStd_REC709_ER, NvSciColorStd_REC709_ER};
        static constexpr int32_t planeBaseAddrAlign[]{1024, 1024};
        // The chroma plane is half-size in both directions; each of its pixels is a U8_V8 pair.
        uint32_t const planeWidth[]{width, width / 2};
        uint32_t const planeHeight[]{height, height / 2};
        static constexpr NvSciBufAttrValImageScanType scanType{NvSciBufScan_ProgressiveType};
        static constexpr bool vpr{false};
        static constexpr uint64_t imageCount{1};
        static std::vector<NvSciRmGpuId> const& gpuIds{getGpuIds()};
        // One cache entry per GPU, each with GPU caching disabled.
        static std::vector<NvSciBufAttrValGpuCache> const& gpuCache{[](){
            std::vector<NvSciBufAttrValGpuCache> result{};
            result.reserve(gpuIds.size());
            for(auto const& currGpu: gpuIds) {
                result.emplace_back(NvSciBufAttrValGpuCache{currGpu, false});
            }
            return result;
        }()};

        if(gpuIds.empty())
        {
            LOG_ERR("Couldn't get GPU vector.");
            return nullptr;
        }

        ScopedNvSciBufAttrList imageAttr{nullptr};
        {
            {
                NvSciBufAttrList rawImageAttr{nullptr};
                if (auto const err{NvSciBufAttrListCreate(getNvSciBufModule(), &rawImageAttr)}; NvSciError_Success != err) {
                    LOG_ERR("Couldn't create NvSciBufAttr: %d", err);
                    return nullptr;
                }
                imageAttr.reset(rawImageAttr);
            }

            {
                NvSciBufAttrKeyValuePair setAttrs[] = {
                    { NvSciBufGeneralAttrKey_Types, &bufType, sizeof(bufType) },
                    { NvSciBufGeneralAttrKey_RequiredPerm, &accessPerm, sizeof(accessPerm) },
                    { NvSciBufGeneralAttrKey_NeedCpuAccess, &needCPUAccess, sizeof(needCPUAccess) },
                    { NvSciBufGeneralAttrKey_EnableCpuCache, &enableCpuCache, sizeof(enableCpuCache) },
                    { NvSciBufGeneralAttrKey_GpuId, gpuIds.data(), sizeof(NvSciRmGpuId) * gpuIds.size() },
                    { NvSciBufGeneralAttrKey_EnableGpuCache, gpuCache.data(), sizeof(NvSciBufAttrValGpuCache) * gpuCache.size() },
                    { NvSciBufImageAttrKey_Layout, &layout, sizeof(layout) },
                    { NvSciBufImageAttrKey_PlaneCount, &planeCount, sizeof(planeCount) },
                    { NvSciBufImageAttrKey_TopPadding, &padding, sizeof(padding) },
                    { NvSciBufImageAttrKey_BottomPadding, &padding, sizeof(padding) },
                    { NvSciBufImageAttrKey_LeftPadding, &padding, sizeof(padding) },
                    { NvSciBufImageAttrKey_RightPadding, &padding, sizeof(padding) },
                    { NvSciBufImageAttrKey_PlaneColorFormat, &colorFormat, sizeof(colorFormat) },
                    { NvSciBufImageAttrKey_PlaneColorStd, &colorStd, sizeof(colorStd) },
                    { NvSciBufImageAttrKey_PlaneBaseAddrAlign, &planeBaseAddrAlign, sizeof(planeBaseAddrAlign) },
                    { NvSciBufImageAttrKey_PlaneWidth, &planeWidth, sizeof(planeWidth) },
                    { NvSciBufImageAttrKey_PlaneHeight, &planeHeight, sizeof(planeHeight) },
                    { NvSciBufImageAttrKey_ScanType, &scanType, sizeof(scanType) },
                    { NvSciBufImageAttrKey_VprFlag, &vpr, sizeof(vpr) },
                    { NvSciBufImageAttrKey_ImageCount, &imageCount, sizeof(imageCount) },
                };

                if (auto const err{NvSciBufAttrListSetAttrs(imageAttr.get(), setAttrs, sizeof(setAttrs) / sizeof(NvSciBufAttrKeyValuePair))}; NvSciError_Success != err) {
                    LOG_ERR("Couldn't fill NvSciBufAttrs: %d", err);
                    return nullptr;
                }
            }

            // We need to call this to be able to use the buffer from HW engines
            if (auto const err{NvMedia2DFillNvSciBufAttrList(nullptr, imageAttr.get())}; NVMEDIA_STATUS_OK != err) {
                LOG_ERR("NvMedia2DFillNvSciBufAttrList failed: %d", err);
                return nullptr;
            }
        }

        {
            NvSciBufAttrList attrList[]{imageAttr.get()};
            NvSciBufAttrList reconciled{nullptr};
            NvSciBufAttrList conflicts{nullptr};
            if (auto const err{NvSciBufAttrListReconcile(attrList, sizeof(attrList) / sizeof(NvSciBufAttrList), &reconciled, &conflicts)}; NvSciError_Success != err)
            {
                LOG_ERR("Couldn't reconcile NvSciBufAttr and allocate NvSciBufObj: %d", err);
                if(conflicts != nullptr)
                {
                    void* dump{};
                    size_t len{};
                    if (auto const dumpErr{NvSciBufAttrListDebugDump(conflicts, &dump, &len)}; NvSciError_Success != dumpErr) {
                        LOG_ERR("NvSciBufAttrListDebugDump failed: %d", dumpErr);
                    } else {
                        // "%.*s" caps the output at len bytes. The previous "%-*s" only set a
                        // minimum field width, so printf kept reading until it found a NUL byte.
                        // NOTE(review): ownership of `dump` is not visible from this file - confirm
                        // whether the caller has to release it.
                        LOG_ERR("Conflicted args: %.*s", static_cast<int>(len), static_cast<char const*>(dump));
                    }
                    NvSciBufAttrListFree(conflicts);
                } else {
                    LOG_ERR("No conflict list");
                }
            }
            imageAttrReconciled.reset(reconciled);
        }
    #endif

    // Do not hand a null attribute list to NvSciBufObjAlloc when reconciliation failed.
    if(imageAttrReconciled == nullptr)
    {
        LOG_ERR("No reconciled NvSciBufAttrList available");
        return nullptr;
    }

    NvSciBufObj result{nullptr};
    if (auto const err{NvSciBufObjAlloc(imageAttrReconciled.get(), &result)}; err != NvSciError_Success) {
        LOG_ERR("NvSciBufObjAlloc failed: %d", err);
        return nullptr;
    }
    return ScopedSciBufObj{result};
}

int main()
{
    // Allocate images
    auto srcImage{allocateImage(imgW, imgH)};
    uint64_t fullBufferSize{};

    struct CUexternalMemory_st *externalMemory;
    struct cudaMipmappedArray *mipmapArray[2];
    struct cudaArray *imagePlanes[2];
    unsigned long long cudaSurfaceNvmediaBuf[2];

    {
        // Get cuda maps
        NvSciBufAttrList sciBufAttr{nullptr};
        // Don't need to free the returned attribute list
        if (auto const err{NvSciBufObjGetAttrList(srcImage.get(), &sciBufAttr)}; NvSciError_Success != err) {
            LOG_ERR("Couldn't get sciBufObj attribute list: %d", err);
        }

        NvSciBufAttrKeyValuePair queriedAttrs[]{
            { NvSciBufImageAttrKey_Size, nullptr, 0 },
            { NvSciBufImageAttrKey_PlaneChannelCount, nullptr, 0 },
            { NvSciBufImageAttrKey_PlaneOffset, nullptr, 0 },
            { NvSciBufImageAttrKey_PlaneWidth, nullptr, 0 },
            { NvSciBufImageAttrKey_PlaneHeight, nullptr, 0 },
            { NvSciBufImageAttrKey_PlanePitch, nullptr, 0 },
            { NvSciBufImageAttrKey_PlaneBitsPerPixel, nullptr, 0 },
            { NvSciBufImageAttrKey_PlaneCount, nullptr, 0 },
        };

        // Query args returned filled by NvMedia
        if (auto const err{NvSciBufAttrListGetAttrs(sciBufAttr, queriedAttrs, sizeof(queriedAttrs) / sizeof(NvSciBufAttrKeyValuePair))}; NvSciError_Success != err) {
            LOG_ERR("Couldn't get NvSciBufAttrs: %d", err);
        }

        fullBufferSize = *static_cast<uint64_t const*>(queriedAttrs[0].value);
        uint8_t const *const channelCountPerPlane{static_cast<uint8_t const*>(queriedAttrs[1].value)};
        uint64_t const *planeOffset{static_cast<uint64_t const*>(queriedAttrs[2].value)};
        uint32_t const *planeWidth{static_cast<uint32_t const*>(queriedAttrs[3].value)};
        uint32_t const *planeHeight{static_cast<uint32_t const*>(queriedAttrs[4].value)};
        uint32_t const *planePitch{static_cast<uint32_t const*>(queriedAttrs[5].value)};
        uint32_t const *planePixelBits{static_cast<uint32_t const*>(queriedAttrs[6].value)};
        uint32_t const planeCount{*static_cast<uint32_t const*>(queriedAttrs[7].value)};


        // Import external memory
        {
            cudaExternalMemoryHandleDesc memHandleDesc{};
            memHandleDesc.type = cudaExternalMemoryHandleTypeNvSciBuf;
            memHandleDesc.handle.nvSciBufObject = srcImage.get();
            memHandleDesc.size = fullBufferSize;
            if(auto const error{cudaImportExternalMemory(&externalMemory, &memHandleDesc)}; error != cudaSuccess)
            {
                LOG_ERR("Could not import external memory: %d", error);
            }
        }

        for (uint32_t planeIndex{0}; planeIndex < planeCount; ++planeIndex)
        {
            LOG_ERR("Plane %d W: %d H: %d P: %d", planeIndex, planeWidth[planeIndex], planeHeight[planeIndex], planePitch[planeIndex]);
            /* Create mipmapArray */
            cudaExtent const extent{make_cudaExtent(planeWidth[planeIndex], planeHeight[planeIndex], 0)}; // It works only on DriveOS 5.2.6, on DriveOS 6.0.8.1 the image breaks
            //cudaExtent const extent{make_cudaExtent(planePitch[planeIndex] / channelCountPerPlane[planeIndex], planeHeight[planeIndex], 0)}; // It makes both versions work
            cudaChannelFormatDesc desc{};
            uint32_t const bitsPerPixel{planePixelBits[planeIndex] / channelCountPerPlane[planeIndex]};
            switch (channelCountPerPlane[planeIndex]) {
            case 1:
                desc = cudaCreateChannelDesc(bitsPerPixel, 0, 0, 0, cudaChannelFormatKindUnsigned);
            break;
            case 2:
                desc = cudaCreateChannelDesc(bitsPerPixel, bitsPerPixel, 0, 0, cudaChannelFormatKindUnsigned);
            break;
            default: 
                LOG_ERR("Unsupported channel count");
            }

            cudaExternalMemoryMipmappedArrayDesc mipmapDesc{};
            mipmapDesc.offset = planeOffset[planeIndex];
            mipmapDesc.formatDesc = desc;
            mipmapDesc.extent = extent;
            mipmapDesc.flags = 0;
            mipmapDesc.numLevels = 1;

            if(auto const error{cudaExternalMemoryGetMappedMipmappedArray(&mipmapArray[planeIndex], externalMemory, &mipmapDesc)}; error != cudaSuccess)
            {
                LOG_ERR("Could not create mipmapped array: %d", error);
            }

            /* Create cuda arrays */

            if(auto const error{cudaGetMipmappedArrayLevel(&imagePlanes[planeIndex], mipmapArray[planeIndex], 0 /*mipmapLevel*/)}; error != cudaSuccess)
            {
                LOG_ERR("Could not create cuda array from mipmapArray: %d", error);
            }

            /* Create cuda surface */
            {
                cudaResourceDesc resourceDesc{};
                resourceDesc.resType = cudaResourceTypeArray;
                resourceDesc.res.array.array = imagePlanes[planeIndex];

                if(auto const error{cudaCreateSurfaceObject(&cudaSurfaceNvmediaBuf[planeIndex], &resourceDesc)}; error != cudaSuccess)
                {
                    LOG_ERR("Could not create cuda surface from array: %d", error);
                }
            }
        }
    }

    // init input image
    for(size_t y = 0; y < imgH; ++y)
    {
        for(size_t x = 0; x < imgW; ++x)
        {
            inImgCudaY[y * imgW + x] = (y * 128 / imgH) + (x * 128 / imgW);
            if(x % 2 == 0 && y % 2 == 0)
            {
                inImgCudaUV[(y / 2) * imgW + x] = (y * 128 / imgH) + (x * 128 / imgW);
                inImgCudaUV[(y / 2) * imgW + x + 1] = 255 - (y * 128 / imgH) + (x * 128 / imgW);
            }
        }
    }

    // Fill image from cuda
    {
        cudaError_t cudaError = cudaSuccess;
        cudaError = cudaMemcpy2DToArray(imagePlanes[0], 0, 0, inImgCudaY.data(), imgW, imgW, imgH, cudaMemcpyHostToDevice);
        if(cudaError != cudaSuccess)
        {
            LOG_ERR("Couldn't copy image to host memory, cudaMemcpy failed: %d", cudaError);
            return -1;
        }
        cudaError = cudaMemcpy2DToArray(imagePlanes[1], 0, 0, inImgCudaUV.data(), imgW, imgW, imgH / 2, cudaMemcpyHostToDevice);
        if(cudaError != cudaSuccess)
        {
            LOG_ERR("Couldn't copy image to host memory, cudaMemcpy failed: %d", cudaError);
            return -1;
        }
    }

    // Get output data
    {
        std::array<uint32_t, 3> outImgPitches{imgW, imgW / 2, imgW / 2};

        #ifdef DRIVEOS_5_COMPAT
            NvMediaImage* nvMediaImage{};
            if (auto const err{NvMediaImageCreateFromNvSciBuf(getNvMediaDevice(), srcImage.get(), &nvMediaImage)}; err != NVMEDIA_STATUS_OK) {
                LOG_ERR("NvMediaImageCreateFromNvSciBuf failed: %d", err);
                return -1;
            }
            NvMediaImageSurfaceMap surfaceMap{};
            if(auto const error{NvMediaImageLock(nvMediaImage, NVMEDIA_IMAGE_ACCESS_READ, &surfaceMap)}; error != NVMEDIA_STATUS_OK) 
            {
                LOG_ERR("NvMediaImageLock failed: %d", error);
            }
            if(auto const error{NvMediaImageGetBits(nvMediaImage, nullptr, const_cast<void**>(outImgPtrs.data()), outImgPitches.data())}; error != NVMEDIA_STATUS_OK) 
            {
                LOG_ERR("NvMediaImageGetBits failed: %d", error);
            }
            NvMediaImageUnlock(nvMediaImage);
        #else
            std::array<uint32_t, 3> outImgSizes{outImgCpuY.size(), outImgCpuU.size(), outImgCpuV.size()};
            if(auto const error{NvSciBufObjGetPixels(srcImage.get(), nullptr, outImgPtrs.data(), outImgSizes.data(), outImgPitches.data())}; error != NvSciError_Success) {
                LOG_ERR("NvSciBufObjGetPixels failed: %d", error);
                return -1;
            }
        #endif
    }

    // Save images
    {
        std::ofstream imgFs("img_cuda" + std::to_string(imgW) + "x" + std::to_string(imgH) + ".yuv", std::ios::out | std::ios::binary | std::ios::trunc);
        imgFs.write(reinterpret_cast<char*>(outImgCpuY.data()), outImgCpuY.size());
        imgFs.write(reinterpret_cast<char*>(outImgCpuU.data()), outImgCpuU.size());
        imgFs.write(reinterpret_cast<char*>(outImgCpuV.data()), outImgCpuV.size());
        imgFs.close();
    }
}