nppiCompressedMarkerLabelsUFInfo does not return the same result

I am building an image-processing pipeline using NPP (CUDA 11.4).
In the process, I ran into a problem.
The problem is that the labeling function nppiCompressedMarkerLabelsUFInfo returns different results each time, even though it is processing the same image.
Specifically, the function fills in a list of NppiCompressedMarkerLabelsInfo entries, and although nMarkerLabelPixelCount is the same each time, oMarkerLabelBoundingBox is slightly different from run to run. This is a problem.

The code and the output that show the problem are as follows.

	//// test image
	int testwidth = 256;
	int testheight = 256;
	cv::Mat test = cv::Mat::zeros(cv::Size(testwidth,testheight), CV_8UC1);
	cv::Vec<uchar,1> *src;
	src = test.ptr<cv::Vec<uchar,1>>(2);
	src[2] = 255;
	src[3] = 255;
    cudaMemcpyAsync(cudaMem_u8_A, test.data, test.rows * test.step,cudaMemcpyHostToDevice, nppStreamCtx.hStream);
	//// LabelMarker
	Npp8u *buffer;
	int hpBufferSize;
	nppiLabelMarkersUFGetBufferSize_32u_C1R({256, 256}, &hpBufferSize);
	cudaMallocAsync((void**)&buffer, hpBufferSize, stream);
	nppiLabelMarkersUF_8u32u_C1R_Ctx(cudaMem_u8_A, 256*sizeof(Npp8u), cudaMem_u32_A, 256*sizeof(Npp32u), {256,256}, nppiNormInf, buffer, nppStreamCtx);
	cudaFreeAsync(buffer, stream);
	//// Compress Marker
	nppiCompressMarkerLabelsGetBufferSize_32u_C1R(256*256, &hpBufferSize);
	cudaMallocAsync((void**)&buffer, hpBufferSize, stream);
	int nCompressedLabelCount;
	nppiCompressMarkerLabelsUF_32u_C1IR_Ctx(cudaMem_u32_A, 256*sizeof(Npp32u), {256,256}, 256*256, &nCompressedLabelCount, buffer, nppStreamCtx);
	cudaFreeAsync(buffer, stream);
	//// Get LabelMarker Info
	unsigned int nInfoListSize;
	nppiCompressedMarkerLabelsUFGetInfoListSize_32u_C1R(nCompressedLabelCount, &nInfoListSize);
	NppiCompressedMarkerLabelsInfo *pMarkerLabelsInfoList, *pMarkerLabelsInfoListHost;
	cudaMallocAsync((void**)&pMarkerLabelsInfoList, nInfoListSize,stream);
	cudaMallocHost((void**)&pMarkerLabelsInfoListHost, nInfoListSize);
	nppiCompressedMarkerLabelsUFInfo_32u_C1R_Ctx(cudaMem_u32_A, 256*sizeof(Npp32u), {256,256}, nCompressedLabelCount, pMarkerLabelsInfoList,
												NULL,0,NULL,0,NULL,NULL,NULL,NULL,nppStreamCtx);
	cudaMemcpyAsync(pMarkerLabelsInfoListHost,pMarkerLabelsInfoList,nInfoListSize,cudaMemcpyDeviceToHost, nppStreamCtx.hStream);
	cudaStreamSynchronize(nppStreamCtx.hStream);
	for (unsigned int l = 0; l <= nCompressedLabelCount; l++){
		printf(" Rect # %2d :  PixelCount: %6d  @  BoundingBox.x %4d, y %4d, width %4d, height %4d\n",
			l,
			pMarkerLabelsInfoListHost[l].nMarkerLabelPixelCount,
			pMarkerLabelsInfoListHost[l].oMarkerLabelBoundingBox.x,
			pMarkerLabelsInfoListHost[l].oMarkerLabelBoundingBox.y,
			pMarkerLabelsInfoListHost[l].oMarkerLabelBoundingBox.width,
			pMarkerLabelsInfoListHost[l].oMarkerLabelBoundingBox.height);
	}
	cudaFreeAsync(pMarkerLabelsInfoList, stream);
    cudaFreeHost(pMarkerLabelsInfoListHost);

The output is as follows. You can see that the bounding box values change from run to run.

 Rect #  0 :  PixelCount:  65534  @  BoundingBox.x    0, y   11, width  255, height  255
 Rect #  1 :  PixelCount:      2  @  BoundingBox.x    5, y    4, width    0, height    0

 Rect #  0 :  PixelCount:  65534  @  BoundingBox.x    0, y    2, width  255, height  255
 Rect #  1 :  PixelCount:      2  @  BoundingBox.x    5, y    4, width    0, height    0

 Rect #  0 :  PixelCount:  65534  @  BoundingBox.x    0, y    2, width  255, height  255
 Rect #  1 :  PixelCount:      2  @  BoundingBox.x    5, y    4, width    0, height    0

 Rect #  0 :  PixelCount:  65534  @  BoundingBox.x    0, y   19, width  255, height  254
 Rect #  1 :  PixelCount:      2  @  BoundingBox.x    5, y    4, width    0, height    0

 Rect #  0 :  PixelCount:  65534  @  BoundingBox.x    0, y    1, width  255, height  255
 Rect #  1 :  PixelCount:      2  @  BoundingBox.x    5, y    4, width    0, height    0

 Rect #  0 :  PixelCount:  65534  @  BoundingBox.x    0, y    0, width  255, height  253
 Rect #  1 :  PixelCount:      2  @  BoundingBox.x    5, y    4, width    0, height    0

 Rect #  0 :  PixelCount:  65534  @  BoundingBox.x    0, y    0, width  255, height  255
 Rect #  1 :  PixelCount:      2  @  BoundingBox.x    5, y    4, width    0, height    0

 Rect #  0 :  PixelCount:  65534  @  BoundingBox.x    0, y    0, width  255, height  255
 Rect #  1 :  PixelCount:      2  @  BoundingBox.x    5, y    4, width    0, height    0

 Rect #  0 :  PixelCount:  65534  @  BoundingBox.x    0, y    2, width  255, height  255
 Rect #  1 :  PixelCount:      2  @  BoundingBox.x    5, y    4, width    0, height    0

 Rect #  0 :  PixelCount:  65534  @  BoundingBox.x    0, y    0, width  255, height  248
 Rect #  1 :  PixelCount:      2  @  BoundingBox.x    5, y    4, width    0, height    0

I would appreciate if you could tell me the cause of the problem and what to do about it.

I faced the same problem in CUDA 12.2.

#include <iostream>
#include <npp.h>
#include <opencv2/opencv.hpp>

int main()
{
	cudaStream_t stream;
	NppStreamContext nppStreamCtx{};
	Npp8u* cudaMem_u8_A;
	Npp32u* cudaMem_u32_A;

	if (cudaStreamCreate(&stream) != cudaSuccess)
	{
		std::cout << "ERROR: cuda stream creation failed." << std::endl;
	}

	nppGetStreamContext(&nppStreamCtx);
	nppStreamCtx.hStream = stream;

	int testwidth = 256;
	int testheight = 256;
	for (int i = 0; i <= 10; i++) {

		cudaMallocAsync((void**)&cudaMem_u8_A, testwidth * testheight * sizeof(Npp8u), stream);
		cudaMallocAsync((void**)&cudaMem_u32_A, testwidth * testheight * sizeof(Npp32u), nppStreamCtx.hStream);

		//// test image
		cv::Mat test = cv::Mat::zeros(cv::Size(testwidth, testheight), CV_8UC1);
		cv::Vec<uchar, 1>* src;
		src = test.ptr<cv::Vec<uchar, 1>>(2);
		src[2] = 255;
		src[3] = 255;
		cudaMemcpyAsync(cudaMem_u8_A, test.data, test.rows * test.step, cudaMemcpyHostToDevice, nppStreamCtx.hStream);
		//// LabelMarker
		Npp8u* buffer;
		int hpBufferSize;
		nppiLabelMarkersUFGetBufferSize_32u_C1R({ 256, 256 }, &hpBufferSize);
		cudaMallocAsync((void**)&buffer, hpBufferSize, stream);
		nppiLabelMarkersUF_8u32u_C1R_Ctx(cudaMem_u8_A, 256 * sizeof(Npp8u), cudaMem_u32_A, 256 * sizeof(Npp32u), { 256,256 }, nppiNormInf, buffer, nppStreamCtx);
		cudaFreeAsync(buffer, stream);
		//// Compress Marker
		nppiCompressMarkerLabelsGetBufferSize_32u_C1R(256 * 256, &hpBufferSize);
		cudaMallocAsync((void**)&buffer, hpBufferSize, stream);
		int nCompressedLabelCount;
		nppiCompressMarkerLabelsUF_32u_C1IR_Ctx(cudaMem_u32_A, 256 * sizeof(Npp32u), { 256,256 }, 256 * 256, &nCompressedLabelCount, buffer, nppStreamCtx);
		cudaFreeAsync(buffer, stream);
		//// Get LabelMarker Info
		unsigned int nInfoListSize;
		nppiCompressedMarkerLabelsUFGetInfoListSize_32u_C1R(nCompressedLabelCount, &nInfoListSize);
		NppiCompressedMarkerLabelsInfo* pMarkerLabelsInfoList, * pMarkerLabelsInfoListHost;
		cudaMallocAsync((void**)&pMarkerLabelsInfoList, nInfoListSize, stream);
		cudaMallocHost((void**)&pMarkerLabelsInfoListHost, nInfoListSize);
		nppiCompressedMarkerLabelsUFInfo_32u_C1R_Ctx(cudaMem_u32_A, 256 * sizeof(Npp32u), { 256,256 }, nCompressedLabelCount, pMarkerLabelsInfoList,
			NULL, 0, NULL, 0, NULL, NULL, NULL, NULL, NULL, nppStreamCtx);
		cudaMemcpyAsync(pMarkerLabelsInfoListHost, pMarkerLabelsInfoList, nInfoListSize, cudaMemcpyDeviceToHost, nppStreamCtx.hStream);
		cudaStreamSynchronize(nppStreamCtx.hStream);
		for (unsigned int l = 0; l <= nCompressedLabelCount; l++) {
			printf(" Rect # %2d :  PixelCount: %6d  @  BoundingBox.x %4d, y %4d, width %4d, height %4d\n",
				l,
				pMarkerLabelsInfoListHost[l].nMarkerLabelPixelCount,
				pMarkerLabelsInfoListHost[l].oMarkerLabelBoundingBox.x,
				pMarkerLabelsInfoListHost[l].oMarkerLabelBoundingBox.y,
				pMarkerLabelsInfoListHost[l].oMarkerLabelBoundingBox.width,
				pMarkerLabelsInfoListHost[l].oMarkerLabelBoundingBox.height);
		}
		cudaFreeAsync(pMarkerLabelsInfoList, stream);
		cudaFreeHost(pMarkerLabelsInfoListHost);
		printf("-----\n");
	}
}

The output is as follows.

 Rect #  0 :  PixelCount:  65534  @  BoundingBox.x    0, y    1, width  255, height  255   <----
 Rect #  1 :  PixelCount:      2  @  BoundingBox.x    2, y    2, width    2, height    2
-----
 Rect #  0 :  PixelCount:  65534  @  BoundingBox.x    0, y    0, width  255, height  255
 Rect #  1 :  PixelCount:      2  @  BoundingBox.x    2, y    2, width    2, height    2
-----
 Rect #  0 :  PixelCount:  65534  @  BoundingBox.x    0, y    0, width  255, height  255
 Rect #  1 :  PixelCount:      2  @  BoundingBox.x    2, y    2, width    2, height    2
-----
 Rect #  0 :  PixelCount:  65534  @  BoundingBox.x    0, y   16, width  255, height  255   <----
 Rect #  1 :  PixelCount:      2  @  BoundingBox.x    2, y    2, width    2, height    2
-----
 Rect #  0 :  PixelCount:  65534  @  BoundingBox.x    0, y    0, width  255, height  255
 Rect #  1 :  PixelCount:      2  @  BoundingBox.x    2, y    2, width    2, height    2
-----
 Rect #  0 :  PixelCount:  65534  @  BoundingBox.x    0, y    5, width  255, height  255   <----
 Rect #  1 :  PixelCount:      2  @  BoundingBox.x    2, y    2, width    2, height    2
-----
 Rect #  0 :  PixelCount:  65534  @  BoundingBox.x    0, y    0, width  255, height  255
 Rect #  1 :  PixelCount:      2  @  BoundingBox.x    2, y    2, width    2, height    2
-----
 Rect #  0 :  PixelCount:  65534  @  BoundingBox.x    0, y    0, width  255, height  255
 Rect #  1 :  PixelCount:      2  @  BoundingBox.x    2, y    2, width    2, height    2
-----
 Rect #  0 :  PixelCount:  65534  @  BoundingBox.x    0, y    0, width  255, height  255
 Rect #  1 :  PixelCount:      2  @  BoundingBox.x    2, y    2, width    2, height    2
-----
 Rect #  0 :  PixelCount:  65534  @  BoundingBox.x    0, y    0, width  255, height  255
 Rect #  1 :  PixelCount:      2  @  BoundingBox.x    2, y    2, width    2, height    2
-----
 Rect #  0 :  PixelCount:  65534  @  BoundingBox.x    0, y    0, width  255, height  255
 Rect #  1 :  PixelCount:      2  @  BoundingBox.x    2, y    2, width    2, height    2
-----

When I run your code, I am getting an npp error. I suggest proper error checking. Furthermore, if you want assistance, my suggestion is don’t build in unnecessary dependencies to your test case. In my view, there is no reason to use OpenCV for this test case, if you want help with npp. It makes it harder for me and others to work on it if I must also have a properly functioning OpenCV environment to run your test case. Do as you wish of course, just making suggestions.

Thanks for the reply.
As you indicated, nppiCompressedMarkerLabelsUFInfo_32u_C1R_Ctx was returning the error NPP_CUDA_KERNEL_EXECUTION_ERROR.
The error has been resolved and also the dependency on opencv has been removed.
The results still vary from run to run, as shown below, although the symptoms are not the same as in the earlier output.

#include <iostream>
#include <npp.h>

int main()
{
	NppStreamContext nppStreamCtx{};
	Npp8u* cudaMem_u8_A;
	Npp32u* cudaMem_u32_A;

	Npp8u* pContoursImageDev;
	NppiContourPixelDirectionInfo* pContoursDirectionImageDev;
	NppiContourTotalsInfo oContoursTotalsInfoHost;
	Npp32u* pContoursPixelCountsListDev;
	Npp32u* pContoursPixelCountsListHost;
	Npp32u* pContoursPixelStartingOffsetDev;
	Npp32u* pContoursPixelStartingOffsetHost;

	NppStatus nppstatus;
	cudaError cudaerror;

	//// test image
	int imsize = 8;
	cudaerror = cudaMalloc((void**)&cudaMem_u8_A, imsize * imsize * sizeof(Npp8u));
	cudaerror = cudaMalloc((void**)&cudaMem_u32_A, imsize * imsize * sizeof(Npp32u));

	Npp8u* hostMem_8u_A;
	hostMem_8u_A = reinterpret_cast<Npp8u*>(malloc(sizeof(Npp8u) * (imsize * imsize)));
	for (int i = 0; i < imsize*imsize; i++) {
		hostMem_8u_A[i] = 0;
	}

	hostMem_8u_A[3 * imsize + 2] = 255;
	hostMem_8u_A[3 * imsize + 3] = 255;
	hostMem_8u_A[4 * imsize + 3] = 255;
	hostMem_8u_A[4 * imsize + 4] = 255;
	
	for (int i = 0; i < imsize; i++) {
		for (int j = 0; j < imsize; j++) {
			printf("%3d ", hostMem_8u_A[i * imsize + j]);
		}
		printf("\n");
	}

	cudaerror = cudaMemcpy(cudaMem_u8_A, hostMem_8u_A,  imsize*imsize*sizeof(Npp8u), cudaMemcpyHostToDevice);
	//// LabelMarker
	Npp8u* buffer;
	int hpBufferSize;
	nppstatus = nppiLabelMarkersUFGetBufferSize_32u_C1R({ imsize, imsize }, &hpBufferSize);
	cudaerror = cudaMalloc((void**)&buffer, hpBufferSize);
	nppstatus = nppiLabelMarkersUF_8u32u_C1R(cudaMem_u8_A, imsize * sizeof(Npp8u), cudaMem_u32_A, imsize * sizeof(Npp32u), { imsize,imsize }, nppiNormInf, buffer);
	cudaerror = cudaFree(buffer);
	//// Compress Marker
	nppstatus = nppiCompressMarkerLabelsGetBufferSize_32u_C1R(imsize * imsize, &hpBufferSize);
	cudaerror = cudaMalloc((void**)&buffer, hpBufferSize);
	int nCompressedLabelCount;
	nppstatus = nppiCompressMarkerLabelsUF_32u_C1IR(cudaMem_u32_A, imsize * sizeof(Npp32u), { imsize,imsize }, imsize * imsize, &nCompressedLabelCount, buffer);
	cudaerror = cudaFree(buffer);
	//// Get LabelMarker Info
	unsigned int nInfoListSize;
	nppstatus = nppiCompressedMarkerLabelsUFGetInfoListSize_32u_C1R(nCompressedLabelCount, &nInfoListSize);
	NppiCompressedMarkerLabelsInfo* pMarkerLabelsInfoList, * pMarkerLabelsInfoListHost;
	cudaerror = cudaMalloc((void**)&pMarkerLabelsInfoList, nInfoListSize);
	pMarkerLabelsInfoListHost = reinterpret_cast<NppiCompressedMarkerLabelsInfo*>(malloc(nInfoListSize));

	cudaerror = cudaMalloc((void**)&pContoursImageDev, imsize * sizeof(Npp8u) * imsize);
	cudaerror = cudaMalloc((void**)&pContoursDirectionImageDev, imsize * sizeof(NppiContourPixelDirectionInfo) * imsize);
	cudaerror = cudaMalloc((void**)&pContoursPixelCountsListDev, sizeof(Npp32u) * (nCompressedLabelCount + 4));
	cudaerror = cudaMalloc((void**)&pContoursPixelStartingOffsetDev, sizeof(Npp32u) * (nCompressedLabelCount + 4));
	pContoursPixelCountsListHost = reinterpret_cast<Npp32u*>(malloc(sizeof(Npp32u) * (nCompressedLabelCount + 4)));
	pContoursPixelStartingOffsetHost = reinterpret_cast<Npp32u*>(malloc(sizeof(Npp32u) * (nCompressedLabelCount + 4)));

	for (int i = 0; i < 10; i++) {
		nppstatus = nppiCompressedMarkerLabelsUFInfo_32u_C1R_Ctx(
			cudaMem_u32_A,
			imsize * sizeof(Npp32u),
			{ imsize,imsize },
			nCompressedLabelCount,
			pMarkerLabelsInfoList,
			pContoursImageDev,
			imsize * sizeof(Npp8u),
			pContoursDirectionImageDev,
			imsize * sizeof(NppiContourPixelDirectionInfo),
			&oContoursTotalsInfoHost,
			pContoursPixelCountsListDev,
			pContoursPixelCountsListHost,
			pContoursPixelStartingOffsetDev,
			pContoursPixelStartingOffsetHost,
			nppStreamCtx);


		cudaerror = cudaMemcpy(pMarkerLabelsInfoListHost, pMarkerLabelsInfoList, nInfoListSize, cudaMemcpyDeviceToHost);
		printf("-----\n");
		for (unsigned int l = 0; l <= nCompressedLabelCount; l++) {
			printf(" Rect%d : PixelCount:%d @ BoundingBox.x %d, y %d, width %d, height %d, contour %d, x %d, y %d\n",
				l,
				pMarkerLabelsInfoListHost[l].nMarkerLabelPixelCount,
				pMarkerLabelsInfoListHost[l].oMarkerLabelBoundingBox.x,
				pMarkerLabelsInfoListHost[l].oMarkerLabelBoundingBox.y,
				pMarkerLabelsInfoListHost[l].oMarkerLabelBoundingBox.width,
				pMarkerLabelsInfoListHost[l].oMarkerLabelBoundingBox.height,
				pMarkerLabelsInfoListHost[l].nContourPixelCount,
				pMarkerLabelsInfoListHost[l].oContourFirstPixelLocation.x,
				pMarkerLabelsInfoListHost[l].oContourFirstPixelLocation.y
			);
		}
		printf("-----\n");
	}
	cudaerror = cudaFree(pContoursImageDev);
	cudaerror = cudaFree(pContoursDirectionImageDev);
	cudaerror = cudaFree(pContoursPixelCountsListDev);
	cudaerror = cudaFree(pContoursPixelStartingOffsetDev);
	free(pContoursPixelCountsListHost);
	free(pContoursPixelStartingOffsetHost);
	cudaerror = cudaFree(pMarkerLabelsInfoList);
	free(pMarkerLabelsInfoListHost);
	cudaerror = cudaFree(cudaMem_u8_A);
	cudaerror = cudaFree(cudaMem_u32_A);
}

The results are as follows.

  0   0   0   0   0   0   0   0
  0   0   0   0   0   0   0   0
  0   0   0   0   0   0   0   0
  0   0 255 255   0   0   0   0
  0   0   0 255 255   0   0   0
  0   0   0   0   0   0   0   0
  0   0   0   0   0   0   0   0
  0   0   0   0   0   0   0   0
-----
 Rect0 : PixelCount:60 @ BoundingBox.x 0, y 0, width 7, height 7, contour 14, x 5, y 4
 Rect1 : PixelCount:4 @ BoundingBox.x 2, y 3, width 3, height 4, contour 4, x 4, y 4
-----
-----
 Rect0 : PixelCount:60 @ BoundingBox.x 0, y 0, width 7, height 7, contour 14, x 5, y 4
 Rect1 : PixelCount:4 @ BoundingBox.x 2, y 3, width 4, height 4, contour 4, x 4, y 4
-----
-----
 Rect0 : PixelCount:60 @ BoundingBox.x 0, y 0, width 7, height 7, contour 14, x 5, y 4
 Rect1 : PixelCount:4 @ BoundingBox.x 2, y 3, width 4, height 4, contour 4, x 4, y 4
-----
-----
 Rect0 : PixelCount:60 @ BoundingBox.x 0, y 0, width 7, height 7, contour 14, x 5, y 4
 Rect1 : PixelCount:4 @ BoundingBox.x 2, y 3, width 4, height 4, contour 4, x 4, y 4
-----
-----
 Rect0 : PixelCount:60 @ BoundingBox.x 0, y 0, width 7, height 7, contour 14, x 5, y 4
 Rect1 : PixelCount:4 @ BoundingBox.x 2, y 3, width 4, height 4, contour 4, x 4, y 4
-----
-----
 Rect0 : PixelCount:60 @ BoundingBox.x 0, y 0, width 7, height 7, contour 14, x 5, y 4
 Rect1 : PixelCount:4 @ BoundingBox.x 2, y 3, width 4, height 4, contour 4, x 4, y 4
-----
-----
 Rect0 : PixelCount:60 @ BoundingBox.x 0, y 0, width 7, height 7, contour 14, x 5, y 4
 Rect1 : PixelCount:4 @ BoundingBox.x 2, y 3, width 4, height 4, contour 4, x 4, y 4
-----
-----
 Rect0 : PixelCount:60 @ BoundingBox.x 0, y 0, width 7, height 7, contour 14, x 5, y 4
 Rect1 : PixelCount:4 @ BoundingBox.x 2, y 3, width 4, height 4, contour 4, x 4, y 4
-----
-----
 Rect0 : PixelCount:60 @ BoundingBox.x 0, y 0, width 7, height 7, contour 14, x 5, y 3   <---is not same
 Rect1 : PixelCount:4 @ BoundingBox.x 2, y 3, width 3, height 4, contour 4, x 3, y 3   <---is not same
-----
-----
 Rect0 : PixelCount:60 @ BoundingBox.x 0, y 0, width 7, height 7, contour 14, x 4, y 2   <---is not same
 Rect1 : PixelCount:4 @ BoundingBox.x 2, y 3, width 3, height 4, contour 4, x 3, y 3   <---is not same
-----

I note that it says here:

“Note that while the bounding box is relatively accurate occasionally a few contour pixels may extend beyond the bounding box limits.”

I’m not that familiar with what the algorithm is doing, but it’s possible there may be some variability run-to-run. You’re welcome to file a bug if you wish.

1 Like

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.