nppiFilterMedian_32f_C1R - cuda-memcheck Invalid Access

Hello everyone,

I am trying to use nppiFilterMedian_32f_C1R from NPP. My code seems to work, but when I run it under cuda-memcheck, it reports countless invalid memory accesses.

I reproduced this behavior with the following minimal sample. Am I doing something wrong?

EDIT: I am using CUDA 10.1 on Ubuntu 18.04.

// includes
#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>
#include <npp.h>

// defines
#define WIDTH 1920
#define HEIGHT 1080
#define MEDIAN_SIZE 5

int main()
{
// input data
float* inData;
// output data
float* outData;
size_t pitch;

// Alloc input
cudaError_t err = cudaMallocPitch((void**)&inData, &pitch , WIDTH*sizeof(float) , HEIGHT );
if( err != cudaSuccess)
{
	std::cerr << "error allocating input \n";
	std::cerr << cudaGetErrorName(err) << std::endl;
	return -1;
}

// alloc output
err = cudaMallocPitch((void**)&outData, &pitch, WIDTH*sizeof(float) , HEIGHT);
if( err != cudaSuccess)
{
	std::cerr << "error allocating output \n";
	std::cerr << cudaGetErrorName(err) << std::endl;
	return -1;
}

// Npp ROI
int halfKernelSize = MEDIAN_SIZE / 2;
int left = halfKernelSize;
int right = WIDTH - 1 - halfKernelSize;
int top = halfKernelSize;
int bottom = HEIGHT -1 - halfKernelSize;
NppiSize sizeROI;
sizeROI.width = bottom - top;
sizeROI.height = right - left;

NppiSize size;
size.width = WIDTH;
size.height = HEIGHT;

// npp parameters - anchor
NppiPoint anchor;
anchor.x = halfKernelSize;
anchor.y = halfKernelSize;

// npp parameters - mask
NppiSize maskSize;
maskSize.width = MEDIAN_SIZE;
maskSize.height = MEDIAN_SIZE;

// alloc scratch Memory
Npp8u* scratchMemory;
uint32_t scratchMemorySize;
NppStatus errNpp = nppiFilterMedianGetBufferSize_32f_C1R(size, maskSize, &scratchMemorySize);
if(errNpp != NPP_SUCCESS)
{
	std::cerr << "NPP error Median filter BufferSize : " << errNpp << std::endl;
	return -1;
}

err = cudaMalloc( (void**)&scratchMemory, scratchMemorySize);
if (err != cudaSuccess) {
	std::cerr << "error allocating scratch \n";
	std::cerr << cudaGetErrorName(err) << std::endl;
	return -1;
}

// NPP launch
// offset because borders are not processed
int offset = top *  pitch + left * sizeof(float);

errNpp = nppiFilterMedian_32f_C1R(  (Npp32f*) (inData + offset), pitch,
                (Npp32f*)( outData + offset), pitch,
                sizeROI, maskSize, anchor, scratchMemory );

cudaDeviceSynchronize();
if(errNpp != NPP_SUCCESS)
{
	std::cerr << "NPP error Median filter : " << errNpp << std::endl;
	return -1;
}


return 0;

}

I will be grateful for the help.