I tried running the function nppiCrossCorrValid_NormLevel_32f_C1R() on the toy example below, using CUDA 11.4 on a Quadro T2000 GPU. The output of this function is not quite accurate: I compared its results against a manual calculation of the cross-correlation coefficient using the formulae given here.
#include <string.h>
#include <fstream>
#include <iostream>
#include <chrono>
#include <cuda_runtime.h>
#include <npp.h>
#include <helper_cuda.h>
#include <helper_string.h>
int main(int argc, char* argv[]) {
printf("%s Starting...\n\n", argv[0]);
try {
// CrossNorm Example
NppiSize srcRoiSize = { 5,4 };
NppiSize tplRoiSize = { 3,3 };
NppiSize dstRoiSize = { srcRoiSize.width - tplRoiSize.width + 1,srcRoiSize.height - tplRoiSize.height + 1 };
Npp32f pSrc[5 * 4] = { 1.0f, 2.0f, 1.5f, 4.1f, 3.6f,
0.2f, 3.2f, 2.5f, 1.5f, 10.0f,
5.0f, 6.8f, 0.5f, 4.1f, 1.1f,
7.1f, 4.2f, 2.2f, 8.7f, 10.0f };
for (int i = 0; i < srcRoiSize.width*srcRoiSize.height; i++) {
pSrc[i] = 0;
}
pSrc[1*5+0] = 1.0f;
pSrc[1 * 5 + 1] = 1.0f;
pSrc[1 * 5 + 4] = 1.0f;
for (int i = 0; i < srcRoiSize.height * srcRoiSize.width; i++) {
if (i % srcRoiSize.width == 0)
std::cout << "\n";
printf("%.2f ", pSrc[i]);
}
std::cout << "\n";
Npp32f pTpl[3 * 3] = { 0, 0, 0,
0, 1.0f, 0,
0, 0, 0.0f };
Npp32f pDst[5*4];
int srcStep = srcRoiSize.width * sizeof(Npp32f);
int tplStep = tplRoiSize.width * sizeof(Npp32f);
int dstStep = dstRoiSize.width * sizeof(Npp32f);
Npp32f* DpSrc, * DpTpl, * DpDst;
DpSrc = nppsMalloc_32f(srcRoiSize.width* srcRoiSize.height);
DpTpl = nppsMalloc_32f(tplRoiSize.width* tplRoiSize.height);
DpDst = nppsMalloc_32f(dstRoiSize.width* dstRoiSize.height);
cudaError_t status = cudaMemcpy(DpTpl, pTpl, tplRoiSize.width * tplRoiSize.height * sizeof(Npp32f), cudaMemcpyHostToDevice);
cudaMemcpy(DpSrc, pSrc, srcRoiSize.width * srcRoiSize.height * sizeof(Npp32f), cudaMemcpyHostToDevice);
int BufferSize;
NPP_CHECK_NPP(nppiValidNormLevelGetBufferHostSize_32f_C1R(srcRoiSize, &BufferSize));
std::cout << "BufferSize : " << BufferSize << "\n";
Npp8u* pDeviceBuffer;
pDeviceBuffer = nppsMalloc_8u(BufferSize);
NPP_CHECK_NPP(nppiCrossCorrValid_NormLevel_32f_C1R(DpSrc, srcStep, srcRoiSize, DpTpl, tplStep, tplRoiSize, DpDst, dstStep, pDeviceBuffer));
for (int i = 0; i < dstRoiSize.height * dstRoiSize.width; i++) {
if (i % dstRoiSize.width == 0)
std::cout << "\n";
printf("%.2f ", pDst[i]);
}
std::cout << "\n";
}
catch (...) {
std::cerr << "Program error! An unknow type of exception occurred. \n";
std::cerr << "Aborting." << std::endl;
exit(EXIT_FAILURE);
return -1;
}
return 0;
}
The results for the above program are as follows:
pSrc :
0.00 0.00 0.00 0.00 0.00
1.00 1.00 0.00 0.00 1.00
0.00 0.00 0.00 0.00 0.00
0.00 0.00 0.00 0.00 0.00
pTpl :
0.00 0.00 0.00
0.00 1.00 0.00
0.00 0.00 0.00
pDst :
0.64 -0.12 -0.12
-0.18 -0.12 -0.12
But calculating manually (for pDst[0,0]):
The formulae given for Rss_tilda and Rtt_tilda appear to be wrong; I think each should be the sum of the squares of the individual terms.
Mean_t = 1/9, Mean_s = 2/9
Rst_tilda = (8/9)*(7/9)
Rss_tilda = (7/9)*(7/9)*2
Rtt_tilda = (8/9)*(8/9)
pDst[0,0] = (8/9)*(7/9) / sqrt(Rss_tilda * Rtt_tilda)
= 1/sqrt(2) = 0.707
The manual results match quite well for nppiCrossCorrValid_Norm() and nppiCrossCorrValid(), but not for this particular function. Can you please point out what is wrong, or explain how the function/API calculates the result internally?