How to convert cv::Mat to dwImageCUDA?

I am currently working on a segmentation model. To decode the segmentation label mask and convert it into a color mask, I copy the model output GPU buffer to the CPU, process it, and convert it to a cv::Mat for rendering. I then want to convert this cv::Mat to dwImageCUDA. I tried the code block shown below and ran into an error.

// Host-side buffer for the decoded color mask: 3 bytes per pixel,
// sized from m_imageHeight x m_imageWidth.
uint8_t* colorMask = new uint8_t[m_imageHeight*m_imageWidth*3];

/// MASK DECODING ///

// Wrap colorMask in a cv::Mat header (CV_8UC3) using the caller-provided data pointer.
// NOTE(review): the Mat is sized from m_networkOutputDimensions[0] while the buffer above is
// sized from m_imageHeight/m_imageWidth — confirm these agree, otherwise the Mat can extend
// past the allocation.
cv::Mat colorMaskMat(m_networkOutputDimensions[0].height, m_networkOutputDimensions[0].width, CV_8UC3, (void *)(colorMask));
// NOTE(review): colorMaskMat.data points at the `new`-allocated host buffer above. If
// m_displayProperties describes a CUDA image, this call presumably expects device memory
// (and a per-plane pointer/pitch array rather than a single pointer and NULL) — verify
// against the DriveWorks dwImage_createAndBindBuffer documentation.
dwImage_createAndBindBuffer(&m_maskCUDA, m_displayProperties, colorMaskMat.data, NULL, 1, m_context);

ss

I would like to know what is going wrong here and how to solve this issue and send the cv::Mat to CUDA.

Hi @priyam1

May I know which DRIVE platform and DRIVE OS you're using?

@kayccc I am using Drive OS 6 and Driveworks 5.6.

Dear @priyam1,
We do not support opencv officially on DRIVE Orin release.
Just curious whether you were able to integrate DRIVE OS + DW 5.6 + an opencv sample? If so, please share the steps to help others in the community.
The error indicates you are accessing illegal memory, causing a segfault. The root cause is not very clear from the snippet. Could you share the function's code snippet if possible? Do you see any issue if you try dwImageCPU?

@SivaRamaKrishnaNV I shifted my postprocessing pipeline to GPU. I try to send the decoded mask through GLStreamer and keep on receiving:

[13-10-2022 17:59:16] Driveworks exception thrown: ImageStreamer(CUDA->GL)::receiveImpl, cannot unmap. Error cudaErrorInvalidGraphicsContext: invalid OpenGL or DirectX context

terminate called after throwing an instance of 'std::runtime_error'
  what():  [2022-10-13 17:59:16] DW Error DW_CUDA_ERROR executing DW function:
 dwImageStreamerGL_consumerReceive(&m_maskGL, 32000, m_streamerGL_CUDA2GL)
 at /home/mz/mz/src/perception/segmentation.hpp:364
Aborted (core dumped)

I have shared my post processing cuda kernel below:

// Converts per-class segmentation scores into an RGBA color mask, one thread per pixel.
//
// Launch layout: 2D grid of 2D blocks; thread (x, y) handles pixel (x, y). Threads that fall
// outside width x height return immediately, so the grid may overshoot the image.
//
// mask    Output image, written as 4 bytes per pixel (R, G, B, A) with rows `pitch` bytes apart.
// label   Scores, indexed as label[c * height * width + y * width + x] for class c in [0, 26).
// pitch   Row stride of `mask` in bytes.
// width   Image width in pixels.
// height  Image height in pixels.
// format  Unused by the kernel; kept for interface compatibility.
// maxVal  Initial threshold: a class is selected only if its score is strictly greater than the
//         best score seen so far, starting from this value.
//
// If no class score exceeds maxVal (e.g. all scores are below the threshold, or NaN), the pixel
// is written as opaque black instead of indexing the color table with -1.
__global__ void decodeLabelKernel(uint8_t* mask, const float32_t* label, size_t pitch,
                                  const uint32_t width, const uint32_t height,
                                  const dwImageFormat format, float32_t maxVal)
{
    constexpr int kNumClasses = 26;

    const uint8_t colors[kNumClasses][3] = {
        /// COLOR MAP ///
    };

    const uint32_t tidx = blockDim.x * blockIdx.x + threadIdx.x;
    const uint32_t tidy = blockDim.y * blockIdx.y + threadIdx.y;

    if (tidx >= width || tidy >= height)
        return;

    // Do the index arithmetic in size_t so class * height * width cannot wrap in 32 bits.
    const size_t planeSize   = static_cast<size_t>(width) * height;
    const size_t pixelOffset = static_cast<size_t>(tidy) * width + tidx;

    // Argmax over the class planes for this pixel.
    int maxIdx = -1;
    for (int i = 0; i < kNumClasses; ++i)
    {
        const float32_t score = label[pixelOffset + static_cast<size_t>(i) * planeSize];
        if (score > maxVal)
        {
            maxIdx = i;
            maxVal = score;
        }
    }

    uint8_t* pixel = mask + static_cast<size_t>(tidy) * pitch + 4 * static_cast<size_t>(tidx);

    if (maxIdx < 0)
    {
        // No class beat the threshold: avoid the out-of-bounds read of colors[-1].
        pixel[0] = 0;
        pixel[1] = 0;
        pixel[2] = 0;
        pixel[3] = 255;
        return;
    }

    pixel[0] = colors[maxIdx][0];
    pixel[1] = colors[maxIdx][1];
    pixel[2] = colors[maxIdx][2];
    pixel[3] = 255;
}

// Launches decodeLabelKernel to fill `mask` with a color-coded segmentation image.
//
// mask       Destination image handle. Must have format DW_IMAGE_FORMAT_RGBA_UINT8; the kernel
//            is only launched when its type is DW_IMAGE_CUDA (other types are left untouched).
// gpuBuffer  Per-class scores passed straight to the kernel as `label`.
//            NOTE(review): must be a device-accessible pointer — confirm at the call site.
//
// Throws std::runtime_error if the image format is unsupported, if the CUDA view of the image
// cannot be obtained, or if the kernel launch is rejected by the CUDA runtime.
//
// The launch is asynchronous on the default stream; this function does not wait for the kernel
// to finish.
void decodeLabel(dwImageHandle_t mask, const float32_t* gpuBuffer)
{
    // Zero-initialise so a failed query does not leave indeterminate fields behind.
    dwImageProperties prop{};
    dwImage_getProperties(&prop, mask);

    if (prop.format != DW_IMAGE_FORMAT_RGBA_UINT8)
    {
        throw std::runtime_error("unsupported format in decodeLabel");
    }
    if (prop.type != DW_IMAGE_CUDA)
    {
        // Only CUDA images are handled; other image types are ignored, as before.
        return;
    }

    dwImageCUDA* maskCUDA = nullptr;
    dwImage_getCUDA(&maskCUDA, mask);
    if (maskCUDA == nullptr)
    {
        throw std::runtime_error("decodeLabel: could not obtain dwImageCUDA from image handle");
    }

    // lowest() is the most negative finite float. min() is the smallest *positive* value, which
    // would reject every non-positive score and leave the argmax without a winner.
    const float32_t maxVal = std::numeric_limits<float32_t>::lowest();

    // 32 x 8 = 256 threads per block. The previous 128 x 120 = 15360 exceeded the 1024
    // threads-per-block limit, so the launch failed and the kernel never ran.
    const dim3 numThreads(32, 8);
    const dim3 numBlocks(iDivUp(prop.width, numThreads.x),
                         iDivUp(prop.height, numThreads.y));

    decodeLabelKernel<<<numBlocks, numThreads>>>(static_cast<uint8_t*>(maskCUDA->dptr[0]), gpuBuffer,
                                                 maskCUDA->pitch[0], prop.width,
                                                 prop.height, prop.format, maxVal);

    // Kernel launches do not return a status; a bad launch configuration only shows up here.
    const cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess)
    {
        throw std::runtime_error(cudaGetErrorString(launchStatus));
    }
}

Sharing the interpretOutput function, which uses the GLStreamer, below:

void interpretOutput( [[maybe_unused]] const float32_t* out, [[maybe_unused]] dwContextHandle_t m_context)
{
     generateImage(m_maskCUDA, 0);
     cudaDeviceSynchronize();
     decodeLabel(m_maskCUDA, out); 
     cudaDeviceSynchronize();

     CHECK_DW_ERROR(dwImageStreamerGL_producerSend(m_maskCUDA, m_streamerGL_CUDA2GL));
     CHECK_DW_ERROR(dwImageStreamerGL_consumerReceive(&m_maskGL, 32000, m_streamerGL_CUDA2GL));
     dwImage_getGL(&m_imgGl, m_maskGL);
     CHECK_DW_ERROR(dwImageStreamerGL_consumerReturn(&m_maskGL, m_streamerGL_CUDA2GL));
     CHECK_DW_ERROR(dwImageStreamerGL_producerReturn(&m_maskCUDA, 32000, m_streamerGL_CUDA2GL));
}

Am I traversing the model output GPU buffer and creating the mask properly? Is the output of a segmentation model stored in a linear memory block?