NppStreamContext usage for nppi_Ctx functions

Hello,
I am trying to run an NPP `_Ctx` function concurrently in multiple streams, namely nppiSqrDistanceValid_Norm_8u32f_C1R_Ctx (pattern matching) from the latest CUDA 10.2. The run is slow: it takes the same time as the non-stream version, nppiSqrDistanceValid_Norm_8u32f_C1R. My code is as follows:

int nstreams = 10;

Npp32f** pDst_array = new Npp32f * [nstreams];
int* pPitches_dst = new int[nstreams];

cudaStream_t* streams = new cudaStream_t[nstreams];
NppStreamContext* pNppStreamContext = new NppStreamContext[nstreams];

for (int i = 0; i < nstreams; i++)
{
cudaStreamCreate(&(streams[i]));
}

for (int i = 0; i < nstreams; i++)
{
int width = (roi_src[i]).width - (roi_pattern[i]).width + 1;
int height = (roi_src[i]).height - (roi_pattern[i]).height + 1;

nppSetStream(streams[i]);
nppGetStreamContext(&(pNppStreamContext[i]));

pDst_array[i] = nppiMalloc_32f_C1(width, height, &(pPitches_dst[i]));		

nppiSqrDistanceValid_Norm_8u32f_C1R_Ctx(d_src + (roi_src[i]).y * nSrcPitch + (roi_src[i]).x * sizeof(Npp8u),
										nSrcPitch,
										{ (roi_src[i]).width, (roi_src[i]).height },
										d_patterns_array[i],
										d_patterns_pitch[i],
										{ (roi_pattern[i]).width, (roi_pattern[i]).height },
										pDst_array[i],
										pPitches_dst[i],
										pNppStreamContext[i]);

}

What is wrong with this implementation, and what is the correct usage of NppStreamContext to make the function calls run concurrently?

Thanks a lot in advance.