Hi, I filed the following issue on Github here but I thought I’d also do it here as I was unsure which is preferred.
Description
Problem
When passing an “empty” array (where empty means no values in [MINSITEVALUE, MAXSITEVALUE]
) nppDistanceTransform
will lead to issues with compute-sanitizer
The following Nvidia docs mention mention
Note that an input image that does not contain at least one site pixel is considered to be an invalid image. If you suspect that your input image may be invalid you can call an NPP function like nppiCountInRange() first to confirm that the image is valid before calling the distance transform function.
However, I assumed that invalid would mean incorrect output which is fine. The issue however is if you run the below code with compute-sanitizer
you’ll see
========= Invalid __global__ read of size 4 bytes
========= at kernelColor(short2 *, short2 *, int)+0x1d0
========= by thread (55,15,0) in block (0,0,0)
========= Address 0x7fe7039ffcdc is out of bounds
========= and is 804 bytes before the nearest allocation at 0x7fe703a00000 of size 1581080 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x335f87]
========= in /lib/x86_64-linux-gnu/libcuda.so.1
========= Host Frame: [0x40b2c2]
========= in /usr/local/cuda/targets/x86_64-linux/lib/libnppif.so.12
========= Host Frame: [0x46ef2d]
========= in /usr/local/cuda/targets/x86_64-linux/lib/libnppif.so.12
========= Host Frame: [0x98e5e]
========= in /usr/local/cuda/targets/x86_64-linux/lib/libnppif.so.12
========= Host Frame: [0x98f92]
========= in /usr/local/cuda/targets/x86_64-linux/lib/libnppif.so.12
========= Host Frame:nppiDistanceTransformPBA_8u32f_C1R_Ctx [0x9b9c9]
========= in /usr/local/cuda/targets/x86_64-linux/lib/libnppif.so.12
========= Host Frame:main in /usr/src/project/NPP/distanceTransform/distanceTransform.cpp:706 [0x16de]
========= in /usr/src/project/NPP/distanceTransform/build/./distanceTransform
========= Host Frame: [0x29d8f]
========= in /lib/x86_64-linux-gnu/libc.so.6
========= Host Frame:__libc_start_main [0x29e3f]
========= in /lib/x86_64-linux-gnu/libc.so.6
========= Host Frame:_start [0x12a4]
========= ERROR SUMMARY: 453 errors
Anecdotally while this example runs fine I’ve seen this same code produce crashes and other problems. Finally, it’s hard to implement the above fix on GPU streams where many enqueued operations run without branching.
Reproducible Example
This was reproduced with
NPP Library Version 12.3.1
CUDA Driver Version: 12.7
CUDA Runtime Version: 12.6
on X86 as well as on Jetson.
Below is a minimum reproducible example
#include <stdio.h>
#include <stdlib.h>
#include <nppdefs.h>
#include <nppcore.h>
#include <nppi_filtering_functions.h>
#include <npps_initialization.h>
int main()
{
NppStreamContext nppStreamCtx;
nppStreamCtx.hStream = 0;
cudaGetDevice(&nppStreamCtx.nCudaDeviceId);
cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor,
cudaDevAttrComputeCapabilityMinor,
nppStreamCtx.nCudaDeviceId);
cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor,
cudaDevAttrComputeCapabilityMinor,
nppStreamCtx.nCudaDeviceId);
cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags);
cudaDeviceProp oDeviceProperties;
cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId);
nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount;
nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor;
nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock;
nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock;
int nImageWidth = 256;
int nImageHeight = 256;
NppiSize oImageSizeROI = {nImageWidth, nImageHeight};
Npp8u *pInputImage_Device = 0;
Npp32f *pOutputImage_Device = 0;
Npp8u *pScratchBuffer = 0;
size_t nScratchBufferSize;
nppiDistanceTransformPBAGetBufferSize(oImageSizeROI, &nScratchBufferSize);
printf("Scratch buffer size: %zu bytes\n", nScratchBufferSize);
cudaMalloc((void **)&pScratchBuffer, nScratchBufferSize);
cudaMalloc((void **)&pInputImage_Device, oImageSizeROI.width * sizeof(Npp8u) * oImageSizeROI.height);
cudaMalloc((void **)&pOutputImage_Device, oImageSizeROI.width * sizeof(Npp32f) * oImageSizeROI.height);
if (pScratchBuffer == 0 || pInputImage_Device == 0 || pOutputImage_Device == 0)
{
if (pScratchBuffer)
cudaFree(pScratchBuffer);
if (pInputImage_Device)
cudaFree(pInputImage_Device);
if (pOutputImage_Device)
cudaFree(pOutputImage_Device);
return -1;
}
cudaMemset(pInputImage_Device, 0, oImageSizeROI.width * oImageSizeROI.height * sizeof(Npp8u));
Npp8u nMinSiteValue = 1;
Npp8u nMaxSiteValue = 1;
printf("Running distance transform...\n");
NppStatus status = nppiDistanceTransformPBA_8u32f_C1R_Ctx(
pInputImage_Device, oImageSizeROI.width * sizeof(Npp8u),
nMinSiteValue, nMaxSiteValue,
0, 0,
0, 0,
0, 0,
pOutputImage_Device, oImageSizeROI.width * sizeof(Npp32f),
oImageSizeROI, pScratchBuffer, nppStreamCtx);
if (status != NPP_SUCCESS)
{
printf("Distance transform failed with error: %d\n", status);
}
else
{
printf("Distance transform completed successfully!\n");
}
cudaDeviceSynchronize();
cudaFree(pScratchBuffer);
cudaFree(pInputImage_Device);
cudaFree(pOutputImage_Device);
printf("Done!\n");
return 0;
}