I tried to calculate the integral image using NVIDIA's Performance Primitives (NPP). The call appears to work (the return code is 0), but the result seems to be wrong.
#include <cstdio>
#include <iostream>
#include <windows.h>
#include <cuda_runtime.h>
#include <npp.h>

// Create N x M matrix
const int w = 10, h = 10;
Npp8u* quadrM = new Npp8u[w*h];
printf("\nSource matrix\n");
for(int i = 0; i < w*h; i++)
{
    quadrM[i] = (Npp8u)i;
    if(i % w == 0) printf("\n");
    printf("%d ", quadrM[i]);
}
// Set region of interest
NppiSize roi;
roi.width = 10;
roi.height = 10;
// Transfer to CUDA device
Npp8u* pSrc;
size_t sizeSrc = w*h*sizeof(Npp8u);
cudaMalloc(&pSrc, sizeSrc);
cudaMemcpy(pSrc, quadrM, sizeSrc, cudaMemcpyHostToDevice);
check_CUDA_Error("Error while copying memory");
// Allocate destination memory
Npp32s* pDest;
size_t sizeDest = (roi.width+1)*(roi.height+1)*sizeof(Npp32s);
cudaMalloc(&pDest, sizeDest);
// Pointer for square sum
Npp32f* pSqr;
size_t sizeSqr = (roi.width+1)*(roi.height+1)*sizeof(Npp32f);
cudaMalloc(&pSqr, sizeSqr);
// Calculate integral image
LARGE_INTEGER li_start, li_stop, li_frequency;
QueryPerformanceFrequency(&li_frequency);
int t = 0;
QueryPerformanceCounter(&li_start);
NppStatus status = nppiSqrIntegral_8u32s32f_C1R(
    pSrc,                          // Source image (device memory)
    w,                             // Source line step: bytes between successive source rows
    pDest,                         // Integral image (device memory)
    (roi.width+1)*sizeof(Npp32s),  // Integral-image line step in bytes
    pSqr,                          // Squared-integral image (device memory)
    (roi.width+1)*sizeof(Npp32f),  // Squared-integral line step in bytes
    roi,                           // Region of interest in the source image
    0,                             // nVal: value added to the integral results
    0,                             // nValSqr: value added to the squared-integral results
    roi.height+1);                 // Height of the destination images
cudaDeviceSynchronize(); // NPP launches are asynchronous; synchronize before stopping the timer
QueryPerformanceCounter(&li_stop);
t = (int)((1000000 * (li_stop.QuadPart - li_start.QuadPart)) / li_frequency.QuadPart);
printf("\nTime for integral image: %d us\n", t);
// Show status
std::cout << status << std::endl;
// Print matrix
//printCudaMemory(pSrc, (roi.width)*(roi.height), roi.width);
printf("\nIntegral image\n");
printCudaMemory(pDest, (roi.width+1)*(roi.height+1), roi.width+1);
printf("\nSquare sum\n");
printCudaMemory(pSqr, (roi.width+1)*(roi.height+1), roi.width+1);
// Free memory
cudaFree(pSrc);
cudaFree(pDest);
cudaFree(pSqr);
delete[] quadrM;
// Needed for debugging
system("pause");
Input matrix and output: [screenshots not reproduced here]
If you look at the second row of the integral image, the values shouldn't start with 8; they should be 10, 22, …
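For comparison, here is a small host-side reference (my own sketch, not NPP code) that computes the bordered integral and squared-integral images the call should produce; for this input its second non-border integral row comes out as 10, 22, 36, …

// Host-side reference (sketch): bordered integral and squared-integral images.
// Outputs are (w+1) x (h+1); the first row and first column are zero.
void referenceSqrIntegral(const Npp8u* src, int w, int h,
                          Npp32s* integral, Npp32f* sqrIntegral)
{
    for (int x = 0; x <= w; x++) { integral[x] = 0; sqrIntegral[x] = 0.0f; }
    for (int y = 1; y <= h; y++)
    {
        integral[y*(w+1)] = 0;
        sqrIntegral[y*(w+1)] = 0.0f;
        for (int x = 1; x <= w; x++)
        {
            int v = src[(y-1)*w + (x-1)];
            // I(y,x) = v + I(y-1,x) + I(y,x-1) - I(y-1,x-1)
            integral[y*(w+1)+x] = v + integral[(y-1)*(w+1)+x]
                                    + integral[y*(w+1)+x-1]
                                    - integral[(y-1)*(w+1)+x-1];
            sqrIntegral[y*(w+1)+x] = (Npp32f)(v*v) + sqrIntegral[(y-1)*(w+1)+x]
                                                   + sqrIntegral[y*(w+1)+x-1]
                                                   - sqrIntegral[(y-1)*(w+1)+x-1];
        }
    }
}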
Looking through your code I can't see anything obviously wrong, so there is a pretty good chance that you've discovered a bug. NPP primitives are generally better tested, and will also have much higher performance, if you allocate memory using the CUDA 2D allocator (i.e. cudaMallocPitch()) or the NPP-provided memory allocators (i.e. nppiMalloc_<data_type>_()).
We will investigate this issue, but that may take some time. If you can, please try the 2D memory allocators; I'd expect that to produce correct results.
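For illustration, a minimal sketch of that suggestion (my example, assuming the 10×10 Npp8u setup from the question): nppiMalloc_8u_C1 and its siblings return the row pitch in bytes, and cudaMemcpy2D copies the packed host matrix into the pitched device buffer.

// Sketch: pitched allocation via the NPP allocators, then pass the real pitches
// as the line-step arguments. Variable names follow the question's code.
int srcStep = 0, dstStep = 0, sqrStep = 0;
Npp8u*  pSrc  = nppiMalloc_8u_C1(w, h, &srcStep);           // pitched source
Npp32s* pDest = nppiMalloc_32s_C1(w + 1, h + 1, &dstStep);  // pitched integral image
Npp32f* pSqr  = nppiMalloc_32f_C1(w + 1, h + 1, &sqrStep);  // pitched squared integral

// Host-to-device copy of the tightly packed matrix into the pitched buffer
cudaMemcpy2D(pSrc, srcStep, quadrM, w * sizeof(Npp8u),
             w * sizeof(Npp8u), h, cudaMemcpyHostToDevice);

NppStatus status = nppiSqrIntegral_8u32s32f_C1R(pSrc, srcStep,
                                                pDest, dstStep,
                                                pSqr, sqrStep,
                                                roi, 0, 0, roi.height + 1);

nppiFree(pSrc);
nppiFree(pDest);
nppiFree(pSqr);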
Thanks for your answer. I tried to use the NPP functions for memory allocation and memory copy, which results in an NPP_TEXTURE_BIND_ERROR. As there's zero documentation on this error, I can't really debug it.
// Create N x M matrix
const int w = 10, h = 10;
Npp8u* quadrM = new Npp8u[w*h];
printf("\nSource matrix\n");
for(int i = 0; i < w*h; i++)
{
    quadrM[i] = (Npp8u)i;
    if(i % w == 0) printf("\n");
    printf("%d ", quadrM[i]);
}
// Set region of interest
NppiSize roi;
roi.width = 10;
roi.height = 10;
// Transfer to CUDA device
Npp8u* pSrc;
size_t sizeSrc = w*h*sizeof(Npp8u);
//cudaMalloc(&pSrc, sizeSrc);
int pStepBytes = 0;
pSrc = nppiMalloc_8u_C1(w, h, &pStepBytes);
printf("pStepBytes %d\n", pStepBytes);
// Note: nppiCopy_8u_C1R expects device pointers on both sides; quadrM is host memory
NppStatus stMemCopy = nppiCopy_8u_C1R(quadrM, w, pSrc, pStepBytes, roi);
//cudaMemcpy(pSrc, quadrM, sizeSrc, cudaMemcpyHostToDevice);
//check_CUDA_Error("Error while copying memory");
printf("Mem copy status %d\n", stMemCopy);