@SivaRamaKrishnaNV I shifted my postprocessing pipeline to GPU. I try to send the decoded mask through GLStreamer and keep on receiving:
[13-10-2022 17:59:16] Driveworks exception thrown: ImageStreamer(CUDA->GL)::receiveImpl, cannot unmap. Error cudaErrorInvalidGraphicsContext: invalid OpenGL or DirectX context
terminate called after throwing an instance of 'std::runtime_error'
what(): [2022-10-13 17:59:16] DW Error DW_CUDA_ERROR executing DW function:
dwImageStreamerGL_consumerReceive(&m_maskGL, 32000, m_streamerGL_CUDA2GL)
at /home/mz/mz/src/perception/segmentation.hpp:364
Aborted (core dumped)
I have shared my post-processing CUDA kernel below:
/// Per-pixel argmax decode of segmentation scores into an RGBA8 color mask.
///
/// Launch layout: 2D grid, one thread per output pixel; threads outside
/// (width, height) exit early, so any legal block shape works.
///
/// @param mask   device pointer to the RGBA8 output image (4 bytes/pixel)
/// @param label  device pointer to class scores in planar (CHW) layout:
///               plane i for class i lives at offset i * height * width
///               — assumes the network output is one contiguous linear
///               buffer; TODO confirm against the model's output binding
/// @param pitch  row pitch of @p mask in BYTES (not pixels)
/// @param width  image width in pixels
/// @param height image height in pixels
/// @param format unused on the device; format is validated by the host wrapper
/// @param maxVal initial score threshold; a class is selected only if its
///               score exceeds this value
__global__ void decodeLabelKernel(uint8_t* mask, const float32_t* label, size_t pitch,
                                  const uint32_t width, const uint32_t height,
                                  const dwImageFormat format, float32_t maxVal)
{
    const uint8_t colors[26][3] = {
        /// COLOR MAP ///
    };
    (void)format; // validated on the host; kept for interface compatibility

    const uint32_t tidx = blockDim.x * blockIdx.x + threadIdx.x;
    const uint32_t tidy = blockDim.y * blockIdx.y + threadIdx.y;
    if (tidx >= width || tidy >= height)
        return;

    // BUG FIX: maxIdx previously started at -1; if every class score was
    // <= maxVal (possible with negative logits), colors[-1] was read out
    // of bounds. Default to class 0 so the lookup is always in range.
    int maxIdx = 0;
    const size_t startPos  = static_cast<size_t>(tidy) * width + tidx;
    const size_t planeSize = static_cast<size_t>(height) * width; // size_t avoids int overflow on large images

    // Argmax over the 26 class planes for this pixel.
    for (int i = 0; i < 26; ++i)
    {
        const float32_t score = label[startPos + static_cast<size_t>(i) * planeSize];
        if (score > maxVal)
        {
            maxIdx = i;
            maxVal = score;
        }
    }

    // pitch is in bytes and mask is uint8_t*, so each RGBA pixel occupies
    // 4 consecutive bytes starting at tidy * pitch + 4 * tidx.
    const size_t row = static_cast<size_t>(tidy) * pitch;
    mask[row + 4 * tidx + 0] = colors[maxIdx][0];
    mask[row + 4 * tidx + 1] = colors[maxIdx][1];
    mask[row + 4 * tidx + 2] = colors[maxIdx][2];
    mask[row + 4 * tidx + 3] = 255; // fully opaque alpha
}
/// Host wrapper: validates the destination image and launches
/// decodeLabelKernel to colorize the segmentation output.
///
/// @param mask      DriveWorks image handle; must be RGBA_UINT8 and of
///                  type DW_IMAGE_CUDA to be processed
/// @param gpuBuffer device pointer to the network's planar class scores
/// @throws std::runtime_error on unsupported format or kernel launch failure
void decodeLabel(dwImageHandle_t mask, const float32_t* gpuBuffer)
{
    dwImageProperties prop;
    CHECK_DW_ERROR(dwImage_getProperties(&prop, mask));

    // BUG FIX: numeric_limits::min() is the smallest POSITIVE float, so any
    // all-negative score vector could never beat it and the kernel's argmax
    // never fired. lowest() is the true minimum representable value.
    float32_t maxVal = std::numeric_limits<float32_t>::lowest();

    if (prop.format != DW_IMAGE_FORMAT_RGBA_UINT8)
    {
        throw std::runtime_error("decodeLabel: unsupported image format, expected DW_IMAGE_FORMAT_RGBA_UINT8");
    }
    else if (prop.type == DW_IMAGE_CUDA)
    {
        dwImageCUDA* maskCUDA;
        CHECK_DW_ERROR(dwImage_getCUDA(&maskCUDA, mask));

        // BUG FIX: the previous dim3(128, 120) block requested 15360 threads
        // per block, far above the 1024-thread hardware limit, so every
        // launch failed with cudaErrorInvalidConfiguration and the mask was
        // never written. 32x8 = 256 threads/block is a safe, efficient shape.
        const dim3 numThreads(32, 8);
        const dim3 numBlocks(iDivUp(prop.width, numThreads.x),
                             iDivUp(prop.height, numThreads.y));

        decodeLabelKernel<<<numBlocks, numThreads>>>(
            static_cast<uint8_t*>(maskCUDA->dptr[0]), gpuBuffer,
            maskCUDA->pitch[0], prop.width, prop.height, prop.format, maxVal);

        // Launch-configuration errors never throw — they only surface via
        // cudaGetLastError(); without this check the failure stays invisible
        // until a later, unrelated CUDA call.
        const cudaError_t launchErr = cudaGetLastError();
        if (launchErr != cudaSuccess)
        {
            throw std::runtime_error(std::string("decodeLabel: kernel launch failed: ") +
                                     cudaGetErrorString(launchErr));
        }
    }
}
Sharing the interpretOutput function, which uses the GL streamer, below:
/// Decodes the network output into the CUDA mask image and streams it to GL.
///
/// NOTE(review): dwImageStreamerGL consumer calls require a CURRENT OpenGL
/// context on the calling thread — the reported cudaErrorInvalidGraphicsContext
/// from consumerReceive typically means this function runs on a thread that
/// does not own the GL context (or the streamer was created before the GL
/// context existed). Verify that the streamer creation, consumerReceive, and
/// rendering all happen on the same GL-context thread.
void interpretOutput( [[maybe_unused]] const float32_t* out, [[maybe_unused]] dwContextHandle_t m_context)
{
    generateImage(m_maskCUDA, 0);

    // BUG FIX: cudaDeviceSynchronize() returns the first async error from the
    // preceding work; ignoring it lets a failed kernel masquerade as a
    // streamer failure further down the pipeline. Check both sync points.
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        throw std::runtime_error(std::string("interpretOutput: generateImage failed: ") +
                                 cudaGetErrorString(err));
    }

    decodeLabel(m_maskCUDA, out);
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        throw std::runtime_error(std::string("interpretOutput: decodeLabel failed: ") +
                                 cudaGetErrorString(err));
    }

    // Producer/consumer handshake: send the CUDA image, receive its GL view
    // (32000 us timeout), extract the GL handle, then return both sides so
    // the streamer can recycle the buffers.
    CHECK_DW_ERROR(dwImageStreamerGL_producerSend(m_maskCUDA, m_streamerGL_CUDA2GL));
    CHECK_DW_ERROR(dwImageStreamerGL_consumerReceive(&m_maskGL, 32000, m_streamerGL_CUDA2GL));
    dwImage_getGL(&m_imgGl, m_maskGL);
    CHECK_DW_ERROR(dwImageStreamerGL_consumerReturn(&m_maskGL, m_streamerGL_CUDA2GL));
    CHECK_DW_ERROR(dwImageStreamerGL_producerReturn(&m_maskCUDA, 32000, m_streamerGL_CUDA2GL));
}
Am I traversing the model output GPU buffer and creating the mask properly? Is the output of a segmentation model stored in a linear (planar, contiguous) memory block?