nppiFilterGaussAdvanced_8u_C1R produces discolored band on the right

Hi,
I applied gaussian convolution to an image filled with constant value and got obvious error in the rightmost few dozen pixels. Below is the code and attached is the result.
I am using Orin 36.3.0
Thank you
nppiFilterGaussAdvanced_8u_C1R_bad

/*
g++ -I /usr/local/include/opencv4 \
    -L /usr/local/lib \
    -I /usr/local/cuda/targets/aarch64-linux/include \
    -L /usr/local/cuda/targets/aarch64-linux/lib \
    -o testNpp testNpp.cpp \
    -lopencv_core -lopencv_cudaimgproc -lopencv_imgcodecs -lopencv_imgproc \
    -lnppif -lnppc -lnppisu

width=500 height=500 outputFileName=out.png ./testNpp

*/
#include <npp.h>
#include <opencv2/opencv.hpp> //OpenCV only for cv::write, upload and download

int main(int argc, char ** argv)
{
    const char * outputFileName = getenv("outputFileName");
    if(!outputFileName)
    {
        printf("no outputFileName\n");
        return -1;
    }
    
    int width = 512;
    int height = 512;
    const char * temp = getenv("width");
    if(temp)
    {
        width = strtol(temp, NULL, 10);
    }
    temp = getenv("height");
    if(temp)
    {
        height = strtol(temp, NULL, 10);
    }
    printf("width %d height %d\n", width, height);
    
    int kernelSize {5};
    cv::cuda::GpuMat gpuInput;
    gpuInput.create(height, width, CV_8UC1);
    gpuInput.setTo(cv::Scalar(0x80));
    
    if(getenv("withCross"))
    {
        //Let's make a cross
        cv::cuda::GpuMat rect1(gpuInput, cv::Rect (0, height/2, width, 10));
        rect1.setTo(cv::Scalar(0));
        cv::cuda::GpuMat rect2(gpuInput, cv::Rect (width/2, 0, 10, height));
        rect2.setTo(cv::Scalar(255));
    }
    
    double sigma {3.0};
    cv::cuda::GpuMat gpuKernel;
    cv::Mat uf(1, kernelSize + 2, CV_32F, 0.0);//should be 2 pixel wider than the kernel.
    uf.at<float>(0, 1 + kernelSize/2) = 1.0;
    std::cout << uf << std::endl;
    cv::Mat kf(1, kernelSize, CV_32F, 0.0);
    cv::GaussianBlur(uf, kf, cv::Size(kernelSize, 1), sigma, 1);
    cv::Mat kernel = kf(cv::Rect(1, 0, kernelSize, 1));//Exclude starting and ending 0
    std::cout << kernel << std::endl;
    cv::cuda::GpuMat gpuResult;
    
    gpuKernel.upload(kernel);
    
    gpuResult.create(height, width, CV_8UC1);

    NppiPoint oSrcOffset {0,0};
    NppiSize roi { width, height};
    NppiPoint anchor { kernelSize/2, kernelSize/2 };
    printf("%p %d %p %d %p\n", gpuInput.data, (int)gpuInput.step, gpuResult.data, (int)gpuResult.step, gpuKernel.data);
    NppStatus nppStatus = nppiFilterGaussAdvanced_8u_C1R(
        gpuInput.data, gpuInput.step, 
        gpuResult.data, gpuResult.step, 
        roi, kernelSize, (const Npp32f *)gpuKernel.data);
    if(nppStatus)
    {
        printf("nppiFilterGaussAdvanced_8u_C1R failed: %d\n", nppStatus);
        return -1;
    }
    
    cv::Mat output;
    gpuResult.download(output);
    if(!cv::imwrite(outputFileName, output))
    {
        printf("cv::imwrite %s failed\n", outputFileName);
        return -1;
    }
    return 0;
}
  1. I won’t be able to debug your openCV code or the interface from openCV to npp
  2. You don’t seem to be handling borders correctly
  3. You may not have a proper kernel setup. openCV gpuMat usually creates a pitched allocation, this is not what you want.
  4. try running your code with compute-sanitizer (or cuda-memcheck if you are running on an older GPU like I am - but compute-sanitizer should be correct for your Orin) to get an idea if there is something obviously wrong with what you are doing.

Here is a non-OpenCV example that seems to work correctly, for me.

$ cat t40.cpp
#include <npp.h>
#include <iostream>
#include <cstdio>
#include "lodepng.h"

// for png encoding of output images
void encodeOneStep(const char* filename, const unsigned char* image, unsigned width, unsigned height) {
  /*Encode the image*/
  unsigned error = lodepng_encode32_file(filename, image, width, height);

  /*if there's an error, display it*/
  if(error) printf("error %u: %s\n", error, lodepng_error_text(error));
}
template <typename T>
void write_u8(const char *fname, int width, int height, T *d){
    unsigned char out_im[width*height*4];
    for (int i = 0; i < width*height; i++) {
	out_im[i*4+0] = d[i];    
	out_im[i*4+1] = d[i];    
	out_im[i*4+2] = d[i];    
	out_im[i*4+3] = 255;}    
    encodeOneStep(fname, out_im, width, height);
}

int main(int argc, char ** argv)
{
    const char * outputFileName = "out.png";
    
    int width = 500;
    int height = 500;
    printf("width %d height %d\n", width, height);
    const int kernelSize {5};
    Npp8u *gpuInput;
    cudaMalloc(&gpuInput, width*height*sizeof(Npp8u));
    cudaMemset(gpuInput, 0x040, width*height*sizeof(Npp8u)); 
    
    Npp8u *gpuResult;
    cudaMalloc(&gpuResult, (width-kernelSize)*(height-kernelSize)*sizeof(Npp8u));
    cudaMemset(gpuResult, 0x0, (width-kernelSize)*(height-kernelSize)*sizeof(Npp8u)); 
    Npp32f *gpuKernel;
    cudaMalloc(&gpuKernel, kernelSize*sizeof(Npp32f));
    Npp32f gpuKernel_h[kernelSize] = {1, 4, 7, 4, 1};
    for (int i = 0; i < kernelSize; i++) gpuKernel_h[i] /= 17.0f;
    NppiSize roi { width-kernelSize, height-kernelSize};
    cudaMemcpy(gpuKernel, gpuKernel_h, kernelSize*sizeof(Npp32f), cudaMemcpyHostToDevice);
    NppStatus nppStatus = nppiFilterGaussAdvanced_8u_C1R(
        gpuInput+(kernelSize/2)*(width+1), width, 
        gpuResult, width-kernelSize,
        roi, kernelSize, gpuKernel);
    if(nppStatus)
    {
        printf("nppiFilterGaussAdvanced_8u_C1R failed: %d\n", nppStatus);
        return -1;
    }
    Npp8u *gpuResult_h = new Npp8u[(width-kernelSize)*(height-kernelSize)];
    cudaMemcpy(gpuResult_h, gpuResult, (width-kernelSize)*(height-kernelSize)*sizeof(Npp8u), cudaMemcpyDeviceToHost);
    write_u8(outputFileName, width-kernelSize, height-kernelSize, gpuResult_h);
    return 0;
}
$ nvcc -o t40 t40.cpp lodepng.cpp -lnppif
$ cuda-memcheck ./t40
========= CUDA-MEMCHECK
width 500 height 500
========= ERROR SUMMARY: 0 errors
$

out (1)

You’ll note the output image is 495x495, rather than 500x500. You can get the lodepng stuff here.

It appears that openCV is not the problem - it allows assigning any step/pitch, for example:

gpuInput.step = gpuInput.cols;
gpuResult.step = gpuResult.cols;

and then nppiFilterGaussAdvanced_8u_C1R works OK, just like in your example.

The requirement to have pitch padded to 256 bytes is coming from Nvidia - on Orin we are supposed to use NvBufSurfaceAllocate, which always pads pitch to 256. OpenCV works fine with that pitch, but not nppiFilterGaussAdvanced, even if has parameters “nSrcStep” and “nDstStep”, which suggests that step can be larger than width. I found that I can, indeed, provide nSrcStep = 512 and get correct result, but nDstStep has to be 495, otherwise the discolored band appears.

It is interesting that there are no examples or discussions on the Web about NPP APIs. It appears that I am the first person using it after 15 years of its existence.

I don’t think there is any requirement that pitch be padded to 256 in NPP. If you want to provide an example that uses NPP and CUDA C++ and nothing else, that you think is misbehaving, I’ll take a look as time permits. Defects are always possible.

There are various questions about NPP here on these forums.
You can put the following into a chrome search bar:

site:https://forums.developer.nvidia.com/ npp

No, I meant that Orin has a requirement that image rows are padded to 256 for some operations. So, if NPP cannot handle padded rows, then I will need to include an additional step or two to convert, which is wasteful.

Yes, I googled everywhere, but prior to your answer above there were no working examples of nppiFilterGaussAdvanced_8u_C1R and no explanation that it cannot handle padded rows.

NPP can handle “padded rows” (i.e. a pitched allocation) on both the input and output arrays/data. The place where I indicated care is needed is for the filter kernel definition. That is not expected to be padded in any way. If your kernel width is 5, then you are expected to provide a data array resident in device memory that has a contiguous allocation of 5 elements. If you “pad” the end of those 5 elements, it’s not going to cause any issues, but other types of arrangement might.

No, kernel is not a problem - it is a simple linear array of 5 numbers in device memory without any padding.

What was wrong with my original example is that I used full image dimensions 500x500 for both source, destination and ROI.
And that caused an Invalid global read, as found by compute-sanitizer.
It appears that NPP accesses area within Dst image of size = Src image size + ROI, which causes invalid access.

I made this mistake because the documentation for nppiFilterGaussAdvanced parameters at
Image Filtering Functions — npp 12.6 documentation never mention that Dst and ROI must be reduced by kernel.
And there were no examples on the Web for using NPP until you posted correct code on 7/3/2024:

NppiSize roi { width-kernelSize, height-kernelSize};
nppiFilterGaussAdvanced_8u_C1R(
gpuInput+(kernelSize/2)*(width+1), width,
gpuResult, width-kernelSize,
thank you.

But your code produces 495x495 resulting image.
But is there a way to produce full 500x500 image?
I tried calling nppiFilterGaussAdvancedBorder_8u_C1R with different size, but cannot make 500x500 image without discolored band.
Is there some trick with parameters to make 500x500 output with some kind of border?
Thank you

The documentation for that is here.

It is the user’s responsibility to avoid Sampling Beyond Image Boundaries .

And I made a reference to it here:

nppiFilterGaussAdvanced has no border control.

If you want to have border control, you should use nppiFilterGaussAdvancedBorder

Without border control, you must manage edge access yourself, and it would be undefined behavior to place a filter over an edge pixel if the filter extends to undefined region. (The user’s responsibility to avoid … refers to avoiding this scenario.)

The xxxxBorder variants of the filter functions address this concern, and allow you to specify behavior around the border where filter placement would “normally” extend to an undefined area of the image.

With this sort of “border control” you should be able to get a predictable, legal output that is 500x500 when using a 500x500 input image.

There are other ways to do something, if you wish. You could start with a 505x505 image to get to 500x500, or you could start with a 500x500 image, create a 495x495 output image in the center of the output buffer, the copy the 5 pixel border from input to output - or really do anything you want, “manually”, with the 5 pixel output image border. (copying the entire image from input to output before you run the nppi op into the 495x495 center is an approach I have seen, anyway.)

If none of those ideas work for you, I don’t know any other “tricks with parameters”. I acknowledge you have already brought up the idea of nppiFilterGaussAdvancedBorder, so feel free to disregard any of my comments.

The general suggestion given in the documentation is:

Borderless output can be accomplished by using a larger source image than the destination and adjusting oSrcSize and oSrcOffset parameters accordingly

OK, Thank you for reference to “Sampling Beyond Image Boundaries” - I missed it originally.
Now I meet the requirements and do not sample beyond boundaries and compute-sanitizer does not find errors.
Now back to the original question: how to get rid of discolored bars.
I understand now that those bars are caused by pitch, which is larger than width - this is how NvBufSurfaceAllocate does this - always padds to 256 bytes.
I tried to modify sample that you posted on 7/13/2024 (attached).
nppiFilterGaussAdvanced_padded.txt (5.7 KB)

I tried nppiFilterGaussAdvanced_8u_C1R with params
nppiFilterGaussAdvanced_8u_C1R 0x485567000+1026 pitch 512 0x4855a5800 pitch 512 roi 495 495 ks 5
and nppiFilterGaussAdvancedBorder_8u_C1R with params
nppiFilterGaussAdvancedBorder_8u_C1R Src 0x485567000+1026 step 512 size 495 495 off 2 2 Dst 0x4855a5800+1026 step 512 roi 495 495 ks 5 NPP_BORDER_REPLICATE
Also
nppiFilterGaussAdvancedBorder_8u_C1R Src 0x485567000+2565 step 512 size 495 495 off 5 5 Dst 0x4855a5800+2565 step 512 roi 495 495 ks 5 NPP_BORDER_REPLICATE
Also
nppiFilterGaussAdvancedBorder_8u_C1R Src 0x485567000+2565 step 512 size 495 495 off 5 5 Dst 0x4855a5800+2565 step 512 roi 490 490 ks 5 NPP_BORDER_REPLICATE
Also
nppiFilterGaussAdvancedBorder_8u_C1R Src 0x485567000+2565 step 512 size 490 490 off 5 5 Dst 0x4855a5800+2565 step 512 roi 490 490 ks 5 NPP_BORDER_REPLICATE
(trying to make offset larger and size and roi smaller and smaller).

Nothing helps: nppiFilterGaussAdvanced_8u_C1R still one discolored bar on the right
and nppiFilterGaussAdvancedBorder_8u_C1R produces 2 bars: on the left and right.

Any other ideas?
Thank you
out

please don’t make postings with code as attachments. Post code inline in your post like you did in your very first posting, or like I did in my posting on 7/13. There are several reasons for it. It makes it easier to inspect, easier to quote using forum tools, and makes it searchable.

So our objective now is to demonstrate a nppiFilterGaussAdvanced with allocation pitch = 512, input image width of 500x500, and output image width of 500x500, with no discolored bars or obvious borders, using the sample image I already used.

Here is how I would do that:

#include <npp.h>
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include "lodepng.h"

// for png encoding of output images
void encodeOneStep(const char* filename, const unsigned char* image, unsigned width, unsigned height) {
  /*Encode the image*/
  unsigned error = lodepng_encode32_file(filename, image, width, height);

  /*if there's an error, display it*/
  if(error) printf("error %u: %s\n", error, lodepng_error_text(error));
}
template <typename T>
void write_u8(const char *fname, int width, int height, T *d){
    unsigned char out_im[width*height*4];
    for (int i = 0; i < width*height; i++) {
	out_im[i*4+0] = d[i];    
	out_im[i*4+1] = d[i];    
	out_im[i*4+2] = d[i];    
	out_im[i*4+3] = 255;}    
    encodeOneStep(fname, out_im, width, height);
}
using pt = Npp8u;
int main(int argc, char ** argv)
{
    const char * outputFileName = "out.png";
    
    int width = 500;
    int height = 500;
    printf("width %d height %d\n", width, height);
    const int kernelSize {5};
    pt *gpuInput;
    int pitched_alloc_width=500;
    if (argc > 1) pitched_alloc_width = atoi(argv[1]);
    printf("pitch %d\n", pitched_alloc_width);
    int input_width = pitched_alloc_width;
    // allocate pitched image storage, and zero out
    // we could use cudaMallocPitch here also, but
    // this allows me to set the pitch (input_width) exactly, for 
    // demonstration purposes
    cudaMalloc(&gpuInput, input_width*height*sizeof(pt));
    cudaMemset(gpuInput, 0x0, input_width*height*sizeof(pt));
    // setup 500 x 500 image in pitched allocation
    cudaMemset2D(gpuInput, input_width,  0x040, width*sizeof(pt), height);
    // allocate output image
    pt *gpuResult;
    int output_width = pitched_alloc_width;
    cudaMalloc(&gpuResult, output_width*height*sizeof(pt));
    cudaMemset(gpuResult, 0x0, output_width*height*sizeof(pt)); 
    // copy input image (500x500) to output image, to "cover" the border
    cudaMemcpy2D(gpuResult, output_width*sizeof(pt), gpuInput, input_width*sizeof(pt), width*sizeof(pt), height, cudaMemcpyDeviceToDevice);
    // setup filter kernel 
    Npp32f *gpuKernel;
    cudaMalloc(&gpuKernel, kernelSize*sizeof(Npp32f));
    Npp32f gpuKernel_h[kernelSize] = {1, 4, 7, 4, 1};
    for (int i = 0; i < kernelSize; i++) gpuKernel_h[i] /= 17.0f;
    NppiSize roi { width-kernelSize, height-kernelSize};
    cudaMemcpy(gpuKernel, gpuKernel_h, kernelSize*sizeof(Npp32f), cudaMemcpyHostToDevice);
    // filter: 500x500 -> 495x495, centered in the 500x500 output image, which is in a pitched allocation
    NppStatus nppStatus = nppiFilterGaussAdvanced_8u_C1R(
        gpuInput+(kernelSize/2)*(input_width+1), input_width*sizeof(pt), 
        gpuResult+(kernelSize/2)*(output_width+1), output_width*sizeof(pt),
        roi, kernelSize, gpuKernel);
    if(nppStatus)
    {
        printf("nppiFilterGaussAdvanced_8u_C1R failed: %d\n", nppStatus);
        return -1;
    }
    pt *gpuResult_h = new pt[width*height];
    cudaMemcpy2D(gpuResult_h, width*sizeof(pt), gpuResult, output_width*sizeof(pt), width, height, cudaMemcpyDeviceToHost);
    write_u8(outputFileName, width, height, gpuResult_h);
#ifdef DEBUG
    for (int i = 0; i < 128; i++) std::cout << (int)gpuResult_h[200*width+i] << " ";
    std::cout << std::endl;
#endif
    return 0;
}

specify the pitch as a command-line parameter

However what I observed was some strange behavior for this npp function (I think) when the output pitch is 512, we get the banding you are describing. You can play with it using the above code. If you set the pitch to any value other than 512 (but 500 or larger) then you get a “normal-looking” output image. If you set the pitch to 512 via the command-line argument, then you get the strange banding. You can see what it looks like if you compile the code with -DDEBUG.

I have already determined that the issue is related to the output image pitch, not the input image pitch.

So I have filed an internal bug to have this looked at (4861931). In the meantime workarounds might be:

  • use a pitched input image, but not a pitched output image (set output_width to width)
  • use an output image pitch that is some number other than 512 (but 500 or larger, of course)

If you do need an output image that is exactly 500x500 (width, height) in a 512x500 pitched device allocation, then the only workaround suggestion I have is to use my code above to generate an output image in a 500x500 allocation (which seems to avoid the weirdness), and use cudaMemcpy2D (device-to-device) to copy that result to a 512x500 allocation.

NVBUG 4861931 is fixed and verified on pitch 512 . This change will target a next second CUDA release .

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.