Hello,
I am trying to run NPP `_Ctx` functions concurrently in CUDA streams, specifically nppiSqrDistanceValid_Norm_8u32f_C1R_Ctx (used for pattern matching) from the latest CUDA 10.2 release. The run is slow: it takes the same time as the non-stream version, nppiSqrDistanceValid_Norm_8u32f_C1R. My code is as follows:
int nstreams = 10;
Npp32f** pDst_array = new Npp32f * [nstreams];
int* pPitches_dst = new int[nstreams];
cudaStream_t* streams = new cudaStream_t[nstreams];
NppStreamContext* pNppStreamContext = new NppStreamContext[nstreams];
for (int i = 0; i < nstreams; i++)
{
cudaStreamCreate(&(streams[i]));
}
for (int i = 0; i < nstreams; i++)
{
int width = (roi_src[i]).width - (roi_pattern[i]).width + 1;
int height = (roi_src[i]).height - (roi_pattern[i]).height + 1;
nppSetStream(streams[i]);
nppGetStreamContext(&(pNppStreamContext[i]));
pDst_array[i] = nppiMalloc_32f_C1(width, height, &(pPitches_dst[i]));
nppiSqrDistanceValid_Norm_8u32f_C1R_Ctx(d_src + (roi_src[i]).y * nSrcPitch + (roi_src[i]).x * sizeof(Npp8u),
nSrcPitch,
{ (roi_src[i]).width, (roi_src[i]).height },
d_patterns_array[i],
d_patterns_pitch[i],
{ (roi_pattern[i]).width, (roi_pattern[i]).height },
pDst_array[i],
pPitches_dst[i],
pNppStreamContext[i]);
}
What is wrong with this implementation, and what is the correct usage of NppStreamContext to make the function calls run concurrently?
Thanks a lot in advance.