I don’t seem to have any difficulty with it. Here is a complete test case:
# cat t195.cu
#include <nppi.h>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <vector>
#include "lodepng.h"
// Byte offsets of the red, green, blue and alpha channels inside one RGBA pixel.
const int R = 0, G = 1, B = 2, A = 3;
// Multiplier applied to every sample, and the alpha value stored for every pixel.
const int C = 220, Aval = 255;
// Writes a single-channel image to a PNG file as opaque grayscale RGBA.
//
// Each sample is multiplied by C and the result is stored in the R, G and B
// bytes of the pixel; the alpha byte is always Aval. The scaled value is
// saturated to the 0..255 range of an 8-bit channel, so samples larger than 1
// render as white instead of wrapping around modulo 256.
//
//   data  w*h contiguous samples, read on the host
//   w, h  image width and height in pixels
//   fn    output file name passed to lodepng::encode
//
// Returns the error code produced by lodepng::encode.
template <typename T>
unsigned write_png(T *data, const int w, const int h, const char *fn) {
  // Count pixels in size_t so the 4x RGBA expansion cannot overflow int.
  const size_t pixels = static_cast<size_t>(w) * h;
  std::vector<unsigned char> d(pixels * 4);
  for (size_t i = 0; i < pixels; i++) {
    const double scaled = static_cast<double>(data[i]) * C;
    // Clamp before narrowing: a plain conversion to unsigned char would wrap.
    const unsigned char level =
        scaled >= 255.0 ? static_cast<unsigned char>(255)
        : scaled <= 0.0 ? static_cast<unsigned char>(0)
                        : static_cast<unsigned char>(scaled);
    d[4*i+R] = level;
    d[4*i+G] = level;
    d[4*i+B] = level;
    d[4*i+A] = Aval;
  }
  return lodepng::encode(fn, d, w, h);
}
// Reports a failed CUDA runtime call on stderr; returns true when err is cudaSuccess.
static bool cudaOk(cudaError_t err, const char *what) {
  if (err == cudaSuccess) return true;
  std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
  return false;
}

// Builds the test input (a thin ring of ones of radius r centred in a dim x dim
// image), dilates it with an all-ones mdim x mdim mask via
// nppiDilateBorder_16u_C1R, and optionally dumps or saves the images depending
// on the DEBUG / IFILE / OFILE macros.
//
//   data, odata  dim*dim Npp16u buffers accessed by both host and device
//   mdata        mdim*mdim Npp8u mask buffer accessed by both host and device
//
// Returns EXIT_SUCCESS, or EXIT_FAILURE when NPP, the synchronisation or a PNG
// write reports an error.
static int runDilateTest(Npp16u *data, Npp16u *odata, Npp8u *mdata,
                         const int dim, const int mdim, const int r) {
  const size_t imageBytes = sizeof(Npp16u)*dim*dim;
  // Rows are tightly packed, so the step in bytes is just the row width.
  const Npp32s pitch = static_cast<Npp32s>(sizeof(Npp16u)*dim);
  const Npp32s opitch = static_cast<Npp32s>(sizeof(Npp16u)*dim);
  NppiSize sourceImageSize = {dim, dim};
  NppiPoint sourceOffset = {0, 0};
  NppiSize regionOfInterest = {dim, dim};
  NppiSize maskSize = {mdim, mdim};
  // NOTE(review): anchor {0,0} is kept from the original test case; a centred
  // structuring element would presumably use {mdim/2, mdim/2} - confirm intent.
  NppiPoint oAnchor = {0, 0};

  memset(data, 0, imageBytes);
  memset(odata, 0, imageBytes);

  // Mark pixels whose squared distance from the centre lies in [r^2, (r+1)^2).
  // Rows are walked in the outer loop so the writes are contiguous in memory.
  const int innerSq = r*r;
  const int outerSq = (r+1)*(r+1);
  for (int y = 0; y < dim; y++) {
    const int my = dim/2-y;
    for (int x = 0; x < dim; x++) {
      const int mx = dim/2-x;
      const int distSq = mx*mx+my*my;
      if (distSq >= innerSq && distSq < outerSq) data[y*dim+x] = 1;
    }
  }
  for (int i = 0; i < mdim*mdim; i++) mdata[i] = 1;

#ifdef DEBUG
  std::cout << "Input:" << std::endl;
  for (int y = 0; y < dim; y++) {
    for (int x = 0; x < dim; x++) std::cout << data[y*dim+x] << " ";
    std::cout << std::endl;
  }
#endif
#ifdef IFILE
  {
    const unsigned pngError = write_png(data, dim, dim, "input.png");
    if (pngError) {
      std::cerr << "write_png(input.png) failed with lodepng error " << pngError << std::endl;
      return EXIT_FAILURE;
    }
  }
#endif

  NppStatus eStatusNPP = nppiDilateBorder_16u_C1R(
      data,              // source
      pitch,             // source step
      sourceImageSize,   // source size
      sourceOffset,      // source offset
      odata,             // destination
      opitch,            // destination step
      regionOfInterest,  // ROI
      mdata,             // mask data
      maskSize,          // mask size
      oAnchor,           // mask anchor
      NPP_BORDER_REPLICATE);
  // The host reads odata below, so wait for the device work to finish first.
  const bool synced = cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
  if (eStatusNPP != NPP_SUCCESS)
    std::cerr << "nppiDilateBorder_16u_C1R returned status " << (int)eStatusNPP << std::endl;
  // NPP reports errors as negative status codes; positive codes are warnings.
  if (eStatusNPP < 0 || !synced) return EXIT_FAILURE;

#ifdef DEBUG
  std::cout << "Output:" << std::endl;
  for (int y = 0; y < dim; y++) {
    for (int x = 0; x < dim; x++) std::cout << odata[y*dim+x] << " ";
    std::cout << std::endl;
  }
#endif
#ifdef OFILE
  {
    const unsigned pngError = write_png(odata, dim, dim, "output.png");
    if (pngError) {
      std::cerr << "write_png(output.png) failed with lodepng error " << pngError << std::endl;
      return EXIT_FAILURE;
    }
  }
#endif
  return EXIT_SUCCESS;
}

// Test driver: allocates the managed buffers, runs the dilation test and
// releases the buffers again. The exit code is non-zero when any step fails.
int main(){
  const int dim = 2048;  // image width and height in pixels
  const int mdim = 39;   // mask width and height
  const int r = 800;     // ring radius in pixels
  Npp16u *data = nullptr;
  Npp16u *odata = nullptr;
  Npp8u *mdata = nullptr;
  int rc = EXIT_FAILURE;

  // Only run the test when all three allocations succeeded; otherwise the
  // host-side initialisation would write through an invalid pointer.
  if (cudaOk(cudaMallocManaged(&data, sizeof(Npp16u)*dim*dim), "cudaMallocManaged(data)") &&
      cudaOk(cudaMallocManaged(&odata, sizeof(Npp16u)*dim*dim), "cudaMallocManaged(odata)") &&
      cudaOk(cudaMallocManaged(&mdata, sizeof(Npp8u)*mdim*mdim), "cudaMallocManaged(mdata)")) {
    rc = runDilateTest(data, odata, mdata, dim, mdim, r);
  }

  // Pointers that were never allocated are still null here, which cudaFree accepts.
  cudaFree(mdata);
  cudaFree(odata);
  cudaFree(data);
  return rc;
}
# nvcc -o t195 t195.cu lodepng.cpp -lnppim -DOFILE
# compute-sanitizer ./t195
========= COMPUTE-SANITIZER
========= ERROR SUMMARY: 0 errors
#
And here is the output.png file produced from the above run:
If you want to see what the input image looks like, also compile with -DIFILE and look for the input.png that is produced. The necessary PNG helper files (lodepng.cpp and lodepng.h) are available from the LodePNG project.
CUDA 12.2, L4 GPU.