About inaccuracy in nppiCrossCorrValid_NormLevel()

I ran the function nppiCrossCorrValid_NormLevel_32f_C1R() on the toy example below, using CUDA 11.4 on a Quadro T2000 GPU. The results of this function are not quite accurate: I compared them with the cross-correlation coefficient calculated manually using the formulae given here.

#include <string.h>
#include <fstream>
#include <iostream>
#include <chrono>

#include <cuda_runtime.h>
#include <npp.h>

#include <helper_cuda.h>
#include <helper_string.h>

int main(int argc, char* argv[]) {
	printf("%s Starting...\n\n", argv[0]);

	try {
		

		// CrossNorm Example

		NppiSize srcRoiSize = { 5,4 };
		NppiSize tplRoiSize = { 3,3 };
		NppiSize dstRoiSize = { srcRoiSize.width - tplRoiSize.width + 1,srcRoiSize.height - tplRoiSize.height + 1 };

		Npp32f pSrc[5 * 4] = { 1.0f, 2.0f, 1.5f, 4.1f,  3.6f,
							 0.2f, 3.2f, 2.5f, 1.5f, 10.0f,
							 5.0f, 6.8f, 0.5f, 4.1f,  1.1f,
							 7.1f, 4.2f, 2.2f, 8.7f, 10.0f };

		for (int i = 0; i < srcRoiSize.width*srcRoiSize.height; i++) {
			pSrc[i] = 0;
		}

		pSrc[1*5+0] = 1.0f;
		pSrc[1 * 5 + 1] = 1.0f;
		pSrc[1 * 5 + 4] = 1.0f;
		
             for (int i = 0; i < srcRoiSize.height * srcRoiSize.width; i++) {
			if (i % srcRoiSize.width == 0)
				std::cout << "\n";
			printf("%.2f ", pSrc[i]);
		}
		std::cout << "\n";

		Npp32f pTpl[3 * 3] = { 0, 0, 0,
							0, 1.0f, 0,
							0, 0, 0.0f };
		Npp32f pDst[5*4];

		int srcStep = srcRoiSize.width * sizeof(Npp32f);
		int tplStep = tplRoiSize.width * sizeof(Npp32f);
		int dstStep = dstRoiSize.width * sizeof(Npp32f);

		Npp32f* DpSrc, * DpTpl, * DpDst;
		DpSrc = nppsMalloc_32f(srcRoiSize.width* srcRoiSize.height);
		DpTpl = nppsMalloc_32f(tplRoiSize.width* tplRoiSize.height);
		DpDst = nppsMalloc_32f(dstRoiSize.width* dstRoiSize.height);
		

		
		cudaError_t status = cudaMemcpy(DpTpl, pTpl, tplRoiSize.width * tplRoiSize.height * sizeof(Npp32f), cudaMemcpyHostToDevice);
		cudaMemcpy(DpSrc, pSrc, srcRoiSize.width * srcRoiSize.height * sizeof(Npp32f), cudaMemcpyHostToDevice);
		

		int BufferSize;
		NPP_CHECK_NPP(nppiValidNormLevelGetBufferHostSize_32f_C1R(srcRoiSize, &BufferSize));
		std::cout << "BufferSize : " << BufferSize << "\n";
		Npp8u* pDeviceBuffer;
		pDeviceBuffer = nppsMalloc_8u(BufferSize);
		

		NPP_CHECK_NPP(nppiCrossCorrValid_NormLevel_32f_C1R(DpSrc, srcStep, srcRoiSize, DpTpl, tplStep, tplRoiSize, DpDst, dstStep, pDeviceBuffer));
		
		for (int i = 0; i < dstRoiSize.height * dstRoiSize.width; i++) {
			if (i % dstRoiSize.width == 0)
				std::cout << "\n";
			printf("%.2f ", pDst[i]);
		}
		std::cout << "\n";
	}

	catch (...) {
		std::cerr << "Program error! An unknow type of exception occurred. \n";
		std::cerr << "Aborting." << std::endl;

		exit(EXIT_FAILURE);
		return -1;
	}
	return 0;
}

The results of the above program are as follows:
pSrc :

0.00 0.00 0.00 0.00 0.00
1.00 1.00 0.00 0.00 1.00
0.00 0.00 0.00 0.00 0.00
0.00 0.00 0.00 0.00 0.00

pTpl :

0.00 0.00 0.00
0.00 1.00 0.00
0.00 0.00 0.00

pDst :

0.64 -0.12 -0.12
-0.18 -0.12 -0.12

But calculating manually (for pDst[0,0]):
The formulae given for Rss_tilda and Rtt_tilda appear to be wrong. I think each is the sum of the squares of its terms.

Mean_t = 1/9, Mean_s = 2/9
Rst_tilda = (8/9)*(7/9)
Rss_tilda = (7/9)*(7/9)*2
Rtt_tilda = (8/9)*(8/9)

pDst[0,0] = (8/9)*(7/9) / sqrt(Rss_tilda * Rtt_tilda)
= 1/sqrt(2) = 0.707

These manual results match closely for nppiCrossCorrValid_Norm() and nppiCrossCorrValid(), but not for this particular function. Can you please point out what is wrong, or explain how the function computes its result internally?