I replied here on github but I wanted to copy and paste my response. As I’m unsure where things are best tracked.
We originally did not encounter this issue on Jetson Orin Nano 4GB. However, since upgrading to Jetpack 6 with Cuda NPP version 12.3.1
we seem to be observing this same problem now on Jetson. Here’s a sample program
#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "nppdefs.h"
#include "nppcore.h"
#include "nppi_filtering_functions.h"
#include "npps_initialization.h"
#define min(x, y) (((x) < (y)) ? (x) : (y))
#define nImageWidth 64
#define nImageHeight 64
int test_npp()
{
NppStreamContext nppStreamCtx;
nppStreamCtx.hStream = 0;
cudaGetDevice(&nppStreamCtx.nCudaDeviceId);
cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor,
cudaDevAttrComputeCapabilityMajor,
nppStreamCtx.nCudaDeviceId);
cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor,
cudaDevAttrComputeCapabilityMinor,
nppStreamCtx.nCudaDeviceId);
cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags);
struct cudaDeviceProp oDeviceProperties;
cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId);
nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount;
nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor;
nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock;
nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock;
NppiSize oImageSizeROI = {nImageWidth, nImageHeight};
Npp8u hostInputBuffer[nImageHeight * nImageWidth] = {0};
for (int i = 0; i < nImageWidth; i++) {
// Fill first row with 1s
hostInputBuffer[i] = 1;
}
printf("Input array (%dx%d):\n", nImageWidth, nImageHeight);
if (nImageWidth != 8) {
printf("Truncating printed width to %d\n", 8);
}
for (int y = 0; y < nImageHeight; y++) {
for (int x = 0; x < min(8, nImageWidth); x++) {
printf("%d ", hostInputBuffer[y * nImageWidth + x]);
}
printf("\n");
}
Npp8u *pInputImage_Device = 0;
Npp32f *pOutputImage_Device = 0;
Npp8u *pScratchBuffer = 0;
Npp32f *hostOutputBuffer = 0;
size_t nScratchBufferSize;
nppiDistanceTransformPBAGetBufferSize(oImageSizeROI, &nScratchBufferSize);
printf("Scratch buffer size: %zu bytes\n", nScratchBufferSize);
cudaMalloc((void **)&pScratchBuffer, nScratchBufferSize);
cudaMalloc((void **)&pInputImage_Device, oImageSizeROI.width * sizeof(Npp8u) * oImageSizeROI.height);
cudaMalloc((void **)&pOutputImage_Device, oImageSizeROI.width * sizeof(Npp32f) * oImageSizeROI.height);
hostOutputBuffer = (Npp32f *)malloc(oImageSizeROI.width * sizeof(Npp32f) * oImageSizeROI.height);
if (pScratchBuffer == 0 || pInputImage_Device == 0 || pOutputImage_Device == 0 || hostOutputBuffer == 0)
{
printf("Memory allocation failed\n");
if (pScratchBuffer)
cudaFree(pScratchBuffer);
if (pInputImage_Device)
cudaFree(pInputImage_Device);
if (pOutputImage_Device)
cudaFree(pOutputImage_Device);
if (hostOutputBuffer)
free(hostOutputBuffer);
return -1;
}
cudaMemcpy(pInputImage_Device, hostInputBuffer,
oImageSizeROI.width * oImageSizeROI.height * sizeof(Npp8u),
cudaMemcpyHostToDevice);
Npp8u nMinSiteValue = 1;
Npp8u nMaxSiteValue = 1;
printf("Running distance transform...\n");
NppStatus status = nppiDistanceTransformPBA_8u32f_C1R_Ctx(
pInputImage_Device, oImageSizeROI.width * sizeof(Npp8u),
nMinSiteValue, nMaxSiteValue,
0, 0,
0, 0,
0, 0,
pOutputImage_Device, oImageSizeROI.width * sizeof(Npp32f),
oImageSizeROI, pScratchBuffer, nppStreamCtx);
if (status != NPP_SUCCESS)
{
printf("Distance transform failed with error: %d\n", status);
}
else
{
printf("Distance transform completed successfully!\n");
cudaMemcpy(hostOutputBuffer, pOutputImage_Device,
oImageSizeROI.width * sizeof(Npp32f) * oImageSizeROI.height,
cudaMemcpyDeviceToHost);
if (nImageHeight != 8) {
printf("Truncating printed width to %d\n", 8);
}
for (int y = 0; y < nImageHeight; y++) {
for (int x = 0; x < min(8, nImageWidth); x++) {
printf("%.1f ", hostOutputBuffer[y * nImageWidth + x]);
}
printf("\n");
}
}
cudaDeviceSynchronize();
cudaFree(pScratchBuffer);
cudaFree(pInputImage_Device);
cudaFree(pOutputImage_Device);
free(hostOutputBuffer);
printf("Done!\n");
return 0;
}
int main() {
void *x = 0;
cudaMallocHost((void **)&x, 8);
if (x != 0) {
cudaFreeHost(x);
x = 0;
}
NppLibraryVersion * version = nppGetLibVersion();
printf("%d %d %d\n", version->major, version->minor, version->build);
test_npp();
}
as well as its output on jetson
12 3 1
Input array (64x64):
Truncating printed width to 8
1 1 1 1 1 1 1 1
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
Scratch buffer size: 98840 bytes
Running distance transform...
Distance transform completed successfully!
Truncating printed width to 8
0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0
3.5 3.5 3.5 3.5 3.5 3.5 3.5 3.5
4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5
5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5
6.5 6.5 6.5 6.5 6.5 6.5 6.5 6.5
7.5 7.5 7.5 7.5 7.5 7.5 7.5 7.5
8.5 8.5 8.5 8.5 8.5 8.5 8.5 8.5
9.5 9.5 9.5 9.5 9.5 9.5 9.5 9.5
10.5 10.5 10.5 10.5 10.5 10.5 10.5 10.5
11.5 11.5 11.5 11.5 11.5 11.5 11.5 11.5
12.5 12.5 12.5 12.5 12.5 12.5 12.5 12.5
13.5 13.5 13.5 13.5 13.5 13.5 13.5 13.5
14.5 14.5 14.5 14.5 14.5 14.5 14.5 14.5
15.5 15.5 15.5 15.5 15.5 15.5 15.5 15.5
16.5 16.5 16.5 16.5 16.5 16.5 16.5 16.5
17.5 17.5 17.5 17.5 17.5 17.5 17.5 17.5
18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5
19.5 19.5 19.5 19.5 19.5 19.5 19.5 19.5
20.5 20.5 20.5 20.5 20.5 20.5 20.5 20.5
21.5 21.5 21.5 21.5 21.5 21.5 21.5 21.5
22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5
23.5 23.5 23.5 23.5 23.5 23.5 23.5 23.5
24.5 24.5 24.5 24.5 24.5 24.5 24.5 24.5
25.5 25.5 25.5 25.5 25.5 25.5 25.5 25.5
26.5 26.5 26.5 26.5 26.5 26.5 26.5 26.5
27.5 27.5 27.5 27.5 27.5 27.5 27.5 27.5
28.5 28.5 28.5 28.5 28.5 28.5 28.5 28.5
29.5 29.5 29.5 29.5 29.5 29.5 29.5 29.5
30.5 30.5 30.5 30.5 30.5 30.5 30.5 30.5
31.5 31.5 31.5 31.5 31.5 31.5 31.5 31.5
32.5 32.5 32.5 32.5 32.5 32.5 32.5 32.5
33.5 33.5 33.5 33.5 33.5 33.5 33.5 33.5
34.5 34.5 34.5 34.5 34.5 34.5 34.5 34.5
35.5 35.5 35.5 35.5 35.5 35.5 35.5 35.5
36.5 36.5 36.5 36.5 36.5 36.5 36.5 36.5
37.5 37.5 37.5 37.5 37.5 37.5 37.5 37.5
38.5 38.5 38.5 38.5 38.5 38.5 38.5 38.5
39.5 39.5 39.5 39.5 39.5 39.5 39.5 39.5
40.5 40.5 40.5 40.5 40.5 40.5 40.5 40.5
41.5 41.5 41.5 41.5 41.5 41.5 41.5 41.5
42.5 42.5 42.5 42.5 42.5 42.5 42.5 42.5
43.5 43.5 43.5 43.5 43.5 43.5 43.5 43.5
44.5 44.5 44.5 44.5 44.5 44.5 44.5 44.5
45.5 45.5 45.5 45.5 45.5 45.5 45.5 45.5
46.5 46.5 46.5 46.5 46.5 46.5 46.5 46.5
47.5 47.5 47.5 47.5 47.5 47.5 47.5 47.5
48.5 48.5 48.5 48.5 48.5 48.5 48.5 48.5
49.5 49.5 49.5 49.5 49.5 49.5 49.5 49.5
50.5 50.5 50.5 50.5 50.5 50.5 50.5 50.5
51.5 51.5 51.5 51.5 51.5 51.5 51.5 51.5
52.5 52.5 52.5 52.5 52.5 52.5 52.5 52.5
53.5 53.5 53.5 53.5 53.5 53.5 53.5 53.5
54.5 54.5 54.5 54.5 54.5 54.5 54.5 54.5
55.5 55.5 55.5 55.5 55.5 55.5 55.5 55.5
56.5 56.5 56.5 56.5 56.5 56.5 56.5 56.5
57.5 57.5 57.5 57.5 57.5 57.5 57.5 57.5
58.5 58.5 58.5 58.5 58.5 58.5 58.5 58.5
59.5 59.5 59.5 59.5 59.5 59.5 59.5 59.5
60.5 60.5 60.5 60.5 60.5 60.5 60.5 60.5
61.5 61.5 61.5 61.5 61.5 61.5 61.5 61.5
62.5 62.5 62.5 62.5 62.5 62.5 62.5 62.5
Done!