Invalid Memory reads with NPP Distance Transform on Empty Image

Hi, I filed the following issue on Github here but I thought I’d also do it here as I was unsure which is preferred.

Description

Problem

When passing an “empty” array (where empty means no pixel values in [MINSITEVALUE, MAXSITEVALUE]), nppiDistanceTransformPBA performs invalid global memory reads, which compute-sanitizer reports.

The following NVIDIA documentation mentions:

Note that an input image that does not contain at least one site pixel is considered to be an invalid image. If you suspect that your input image may be invalid you can call an NPP function like nppiCountInRange() first to confirm that the image is valid before calling the distance transform function.

However, I assumed that invalid would mean incorrect output which is fine. The issue however is if you run the below code with compute-sanitizer you’ll see

========= Invalid __global__ read of size 4 bytes
=========     at kernelColor(short2 *, short2 *, int)+0x1d0
=========     by thread (55,15,0) in block (0,0,0)
=========     Address 0x7fe7039ffcdc is out of bounds
=========     and is 804 bytes before the nearest allocation at 0x7fe703a00000 of size 1581080 bytes
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x335f87]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0x40b2c2]
=========                in /usr/local/cuda/targets/x86_64-linux/lib/libnppif.so.12
=========     Host Frame: [0x46ef2d]
=========                in /usr/local/cuda/targets/x86_64-linux/lib/libnppif.so.12
=========     Host Frame: [0x98e5e]
=========                in /usr/local/cuda/targets/x86_64-linux/lib/libnppif.so.12
=========     Host Frame: [0x98f92]
=========                in /usr/local/cuda/targets/x86_64-linux/lib/libnppif.so.12
=========     Host Frame:nppiDistanceTransformPBA_8u32f_C1R_Ctx [0x9b9c9]
=========                in /usr/local/cuda/targets/x86_64-linux/lib/libnppif.so.12
=========     Host Frame:main in /usr/src/project/NPP/distanceTransform/distanceTransform.cpp:706 [0x16de]
=========                in /usr/src/project/NPP/distanceTransform/build/./distanceTransform
=========     Host Frame: [0x29d8f]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:__libc_start_main [0x29e3f]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0x12a4]
========= ERROR SUMMARY: 453 errors

Anecdotally, while this example runs to completion, I’ve seen the same code produce crashes and other problems. Finally, the suggested workaround (calling nppiCountInRange() first to validate the image) is hard to implement in pipelines built on GPU streams, where many enqueued operations run without host-side branching.

Reproducible Example

This was reproduced with

NPP Library Version 12.3.1
CUDA Driver Version: 12.7
CUDA Runtime Version: 12.6

on X86 as well as on Jetson.

Below is a minimum reproducible example

#include <stdio.h>
#include <stdlib.h>

#include <nppdefs.h>
#include <nppcore.h>
#include <nppi_filtering_functions.h>
#include <npps_initialization.h>

int main()
{
    NppStreamContext nppStreamCtx;
    nppStreamCtx.hStream = 0;

    cudaGetDevice(&nppStreamCtx.nCudaDeviceId);
    cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMajor,
                           cudaDevAttrComputeCapabilityMinor,
                           nppStreamCtx.nCudaDeviceId);
    cudaDeviceGetAttribute(&nppStreamCtx.nCudaDevAttrComputeCapabilityMinor,
                           cudaDevAttrComputeCapabilityMinor,
                           nppStreamCtx.nCudaDeviceId);
    cudaStreamGetFlags(nppStreamCtx.hStream, &nppStreamCtx.nStreamFlags);

    cudaDeviceProp oDeviceProperties;
    cudaGetDeviceProperties(&oDeviceProperties, nppStreamCtx.nCudaDeviceId);
    nppStreamCtx.nMultiProcessorCount = oDeviceProperties.multiProcessorCount;
    nppStreamCtx.nMaxThreadsPerMultiProcessor = oDeviceProperties.maxThreadsPerMultiProcessor;
    nppStreamCtx.nMaxThreadsPerBlock = oDeviceProperties.maxThreadsPerBlock;
    nppStreamCtx.nSharedMemPerBlock = oDeviceProperties.sharedMemPerBlock;

    int nImageWidth = 256;
    int nImageHeight = 256;
    NppiSize oImageSizeROI = {nImageWidth, nImageHeight};

    Npp8u *pInputImage_Device = 0;
    Npp32f *pOutputImage_Device = 0;
    Npp8u *pScratchBuffer = 0;

    size_t nScratchBufferSize;
    nppiDistanceTransformPBAGetBufferSize(oImageSizeROI, &nScratchBufferSize);
    printf("Scratch buffer size: %zu bytes\n", nScratchBufferSize);

    cudaMalloc((void **)&pScratchBuffer, nScratchBufferSize);
    cudaMalloc((void **)&pInputImage_Device, oImageSizeROI.width * sizeof(Npp8u) * oImageSizeROI.height);
    cudaMalloc((void **)&pOutputImage_Device, oImageSizeROI.width * sizeof(Npp32f) * oImageSizeROI.height);

    if (pScratchBuffer == 0 || pInputImage_Device == 0 || pOutputImage_Device == 0)
    {
        if (pScratchBuffer)
            cudaFree(pScratchBuffer);
        if (pInputImage_Device)
            cudaFree(pInputImage_Device);
        if (pOutputImage_Device)
            cudaFree(pOutputImage_Device);
        return -1;
    }

    cudaMemset(pInputImage_Device, 0, oImageSizeROI.width * oImageSizeROI.height * sizeof(Npp8u));

    Npp8u nMinSiteValue = 1;
    Npp8u nMaxSiteValue = 1;

    printf("Running distance transform...\n");
    NppStatus status = nppiDistanceTransformPBA_8u32f_C1R_Ctx(
        pInputImage_Device, oImageSizeROI.width * sizeof(Npp8u),
        nMinSiteValue, nMaxSiteValue,
        0, 0,
        0, 0,
        0, 0,
        pOutputImage_Device, oImageSizeROI.width * sizeof(Npp32f),
        oImageSizeROI, pScratchBuffer, nppStreamCtx);

    if (status != NPP_SUCCESS)
    {
        printf("Distance transform failed with error: %d\n", status);
    }
    else
    {
        printf("Distance transform completed successfully!\n");
    }

    cudaDeviceSynchronize();

    cudaFree(pScratchBuffer);
    cudaFree(pInputImage_Device);
    cudaFree(pOutputImage_Device);

    printf("Done!\n");
    return 0;
}