Hello.
I am using the NVIDIA Performance Primitives to perform some image processing on a Jetson TX1 with CUDA 8.
I just want to use a simple function like ‘nppiDilate_8u_C1R’. The issue is that the output image is different each time I apply the function, even though the input image is always the same.
I wrote a short program to prove it (compiled with ‘nvcc prog.cu -lnppi -arch=sm_53’):
#include <cstdlib>
#include <iostream>
#include <vector>
#include <npp.h>
int main(void) {
const int width = 2592;
const int height = 1944;
const int nbPixels = width * height;
unsigned char img[nbPixels];
for (int i = 0; i < nbPixels; i++) img[i] = i % 10 == 0 ? 255 : 0;
unsigned char *d_img;
unsigned char *d_mask;
NppiSize maskSize = {3, 3};
Npp8u mask[9] = {
1, 1, 1,
1, 1, 1,
1, 1, 1
};
NppiPoint anchor = {1, 1};
NppiSize sizeROI = {width, height};
cudaMalloc(reinterpret_cast<void **>(&d_img), nbPixels * sizeof(unsigned char));
cudaMalloc(reinterpret_cast<void **>(&d_mask), sizeof(unsigned char) * maskSize.height * maskSize.width);
cudaMemcpy(d_mask, mask, maskSize.height * maskSize.width, cudaMemcpyHostToDevice);
cudaMemcpy(d_img, img, nbPixels * sizeof(unsigned char), cudaMemcpyHostToDevice);
nppiDilate_8u_C1R(d_img, width, d_img, width, sizeROI, d_mask, maskSize, anchor);
cudaMemcpy(img, d_img, width * height * sizeof(unsigned char), cudaMemcpyDeviceToHost);
int count = 0;
for (int i = 0; i < nbPixels; i++) count += img[i] / 255;
std::cout << count << std::endl;
return 0;
}
The number of white pixels can be 3 830 585 or 3 859 498 for example.
I find it strange that the result is non-deterministic and that a simple dilation operation appears to have a random component.
Is this the expected behavior, or am I doing something wrong?