NPP; Morphological Operations; Dilate operation gives strange results

I’ve tried all conceivable permutations of image sizes, channel configurations, etc.

To make this post searchable, here is also the method used:

nppiDilateBorder_16u_C1R with border type NPP_BORDER_REPLICATE.

Here is my full function call:

    // Dilate a single-channel 16-bit unsigned image; NPP_BORDER_REPLICATE selects
    // the replicate border mode for pixels the mask reaches outside the source.
    NppStatus eStatusNPP = nppiDilateBorder_16u_C1R(
        sourceVRAMImage.data(), // source image data
        static_cast<int>(sourceVRAMImage.pitch()), // source step (the image's pitch)
        sourceImageSize, // source size
        sourceOffset, // source offset
        destinationVRAMImage.data(), // destination image data
        static_cast<int>(destinationVRAMImage.pitch()), // destination step (the image's pitch)
        regionOfInterest, // ROI
        maskVRAMImage.data(), // mask data -- must be tightly packed (no row padding); see the fix at the end of this thread
        maskSize, // mask size
        oAnchor, // mask anchor point
        NPP_BORDER_REPLICATE // border handling mode
    );

I don’t seem to have any difficulty with it. Here is a complete test case:

# cat t195.cu
#include <cstring>
#include <iostream>
#include <vector>

#include <nppi.h>

#include "lodepng.h"

// Byte offsets of the red, green, blue and alpha components inside each
// 4-byte pixel that write_png() fills in.
const int R = 0;
const int G = 1;
const int B = 2;
const int A = 3;
// Factor each input sample is multiplied by to obtain its gray level in write_png().
const int C = 220;
// Alpha value written to every pixel by write_png().
const int Aval = 255;

/**
 * Encodes a single-channel image as a grayscale PNG with 4 bytes per pixel.
 *
 * Every sample is multiplied by C and the product is written to the R, G and
 * B bytes of its pixel; the alpha byte is set to Aval. Products outside
 * [0, 255] are clamped, so bright samples saturate instead of wrapping
 * around when narrowed to unsigned char.
 *
 * @param data  w*h contiguous samples, indexed 0 .. w*h-1
 * @param w     image width in pixels
 * @param h     image height in pixels
 * @param fn    output file name handed to lodepng::encode
 * @return      the error code returned by lodepng::encode
 */
template <typename T>
unsigned  write_png(T *data, const int w, const int h, const char *fn) {
  // Do the size arithmetic in size_t so w*h*4 cannot overflow int.
  const size_t npix = static_cast<size_t>(w) * static_cast<size_t>(h);
  std::vector<unsigned char> d(npix * 4);
  for (size_t i = 0; i < npix; i++){
    const auto scaled = data[i]*C;
    // Saturate to the 8-bit range before narrowing.
    const unsigned char gray =
        (scaled > 255) ? static_cast<unsigned char>(255)
                       : ((scaled > 0) ? static_cast<unsigned char>(scaled)
                                       : static_cast<unsigned char>(0));
    d[4*i+R] = gray;
    d[4*i+G] = gray;
    d[4*i+B] = gray;
    d[4*i+A] = Aval;}
  unsigned error = lodepng::encode(fn, d, w, h);
  return error;
}

int main(){
  const int dim = 2048;
  const int mdim = 39;
  int r = 800;
  Npp16u *data;
  Npp32s pitch;
  NppiSize sourceImageSize = {dim,dim};
  NppiPoint sourceOffset = {0,0};
  Npp16u *odata;
  Npp32s        opitch;
  NppiSize regionOfInterest = {dim, dim};
  Npp8u *mdata;
  NppiSize maskSize = {mdim,mdim};
  NppiPoint oAnchor = {0,0};

  cudaMallocManaged(&data, sizeof(Npp16u)*dim*dim);
  pitch = sizeof(Npp16u)*dim;
  memset(data, 0, sizeof(Npp16u)*dim*dim);
  for (int x = 0; x < dim; x++)
    for (int y = 0; y < dim; y++){
      int mx = dim/2-x;
      int my = dim/2-y;
      if ((mx*mx+my*my >= r*r) && (mx*mx+my*my < (r+1)*(r+1))) data[y*dim+x] = 1;
    }
  cudaMallocManaged(&odata, sizeof(Npp16u)*dim*dim);
  opitch = sizeof(Npp16u)*dim;
  memset(odata, 0, sizeof(Npp16u)*dim*dim);
  cudaMallocManaged(&mdata, sizeof(Npp8u)*mdim*mdim);
  for (int i = 0; i < mdim*mdim; i++) mdata[i] = 1;

#ifdef DEBUG
  std::cout << "Input:" << std::endl;
  for (int y = 0; y < dim; y++) {
    for (int x = 0; x < dim; x++) std::cout << data[y*dim+x] << " ";
    std::cout << std::endl;}
#endif
#ifdef IFILE
  write_png(data, dim, dim, "input.png");
#endif
  NppStatus eStatusNPP = nppiDilateBorder_16u_C1R(
        data, // source
        pitch, // source step
        sourceImageSize, // source size
        sourceOffset, // source offset
        odata, // destination
        opitch, // destination step
        regionOfInterest, // ROI
        mdata, // mask data
        maskSize, // mask size
        oAnchor,
        NPP_BORDER_REPLICATE);
  cudaDeviceSynchronize();
  if (eStatusNPP != NPP_SUCCESS) std::cout << (int)eStatusNPP;

#ifdef DEBUG
  std::cout << "Output:" << std::endl;
  for (int y = 0; y < dim; y++) {
    for (int x = 0; x < dim; x++) std::cout << odata[y*dim+x] << " ";
    std::cout << std::endl;}
#endif
#ifdef OFILE
  write_png(odata, dim, dim, "output.png");
#endif
}


# nvcc -o t195 t195.cu lodepng.cpp -lnppim -DOFILE
# compute-sanitizer ./t195
========= COMPUTE-SANITIZER
========= ERROR SUMMARY: 0 errors
#

And here is the output.png file produced from the above run:

If you want to see what the input file looks like, compile with -DIFILE and look for the input.png that is produced. You can find the necessary PNG helper files here (lodepng.cpp and lodepng.h)

CUDA 12.2, L4 GPU.

Thanks! Was a big help and it works for me now.

By trying portions of the test you supplied I could conclude that the error was with the mask data I was using.

I’m using the npp helper classes for images to do this for the mask (defective code):

    npp::ImageCPU_8u_C1 maskRAMImage;
    dipImageToNppCPU(maskRAMImage, maskDipImage); // internal conversion function
    // DEFECT: constructed without the "tight" flag, so the rows of the device
    // image are not tightly packed; using this buffer as the dilation mask is
    // what produced the strange results.
    npp::ImageNPP_8u_C1 maskVRAMImage(maskRAMImage);

The error is in the last line: `maskVRAMImage` is not tightly packed, but the mask data passed to `nppiDilateBorder_16u_C1R` has to be.

So, the fix for my code is:

    npp::ImageCPU_8u_C1 maskRAMImage;
    dipImageToNppCPU(maskRAMImage, maskDipImage); // internal conversion function
    npp::ImageNPP_8u_C1 maskVRAMImage(maskRAMImage, true); // the second argument (true) requests "tight" packing of rows in memory, which the mask needs