I developed the module to get the shrink and integral image by cuda npp.
I have some questions about it.
My code follows.
__global__ void convertValue(DWORD* pdwIntImage, __int64* pn64SqrIntImage, Npp32f* pn32fIntImage, int nPitch1, Npp64f* pn64fSqrIntImage, int nPitch2, int nIntH, int nIntW)
{
int nX = blockIdx.x * blockDim.x + threadIdx.x;
int nY = blockIdx.y * blockDim.y + threadIdx.y;
if ((nX > nIntW - 1) || (nY > nIntH - 1))
{
return;
}
pdwIntImage[nY * nIntW + nX] = (DWORD)pn32fIntImage[nPitch1 * nY + nX];
pn64SqrIntImage[nY * nIntW + nX] = (__int64)pn64fSqrIntImage[nPitch2 * nY + nX];
}
void GetShrAndIntImage(Npp8u* pbImage, int nLineStep, int nH, int nW, DWORD* pdwIntImage, __int64* pn64SqrIntImage, float rRate)
{
NppiSize xSize = {nW, nH}, xShrinkSize = {nW / rRate, nH / rRate}, xIntSize = {nW / rRate + 1, nH / rRate + 1};
NppiRect xRect = {0, 0, nW, nH};
int nShrinkLineStep, nIntStep;
size_t nSqrIntStep;
Npp8u *pbShrinkImage = nppiMalloc_8u_C1(xShrinkSize.width, xShrinkSize.height, &nShrinkLineStep);
Npp32f *p32fIntImage = nppiMalloc_32f_C1(xIntSize.width, xIntSize.height, &nIntStep);
Npp64f *p64fSqrIntImage;
cudaMallocPitch(&p64fSqrIntImage, &nSqrIntStep, xIntSize.width * sizeof(Npp64f), xIntSize.height);
NppStatus xStatusShrink = nppiResize_8u_C1R(pbImage, xSize, nLineStep, xRect, pbShrinkImage, nShrinkLineStep, xShrinkSize, 1 / rRate, 1 / rRate, 4);
NppStatus xStatusInt = nppiSqrIntegral_8u32f64f_C1R(pbShrinkImage, nShrinkLineStep, p32fIntImage, nIntStep, p64fSqrIntImage, nSqrIntStep, xShrinkSize, 0, 0); // returns pitch error at here
dim3 Block(32,32);
dim3 Grid((xIntSize.width + Block.x - 1)/Block.x, (xIntSize.height + Block.y - 1)/Block.y);
convertValue<<<Grid, Block>>>(pdwIntImage, pn64SqrIntImage, p32fIntImage, nIntStep, p64fSqrIntImage, nSqrIntStep, xIntSize.height, xIntSize.width);
cudaDeviceSynchronize();
cudaError_t error = cudaGetLastError(); // returns cudaErrorLaunchFailure at here
nppiFree(pbShrinkImage);
nppiFree(p32fIntImage);
cudaFree(p64fSqrIntImage);
}
First question: why convertValue function returns cudaErrorLaunchFailure?
Second question: what do they mean - last 3 parameters of nppiReseize_8u_C1R and last 2 parameters of nppiSqrIntegral_8u32f64f_C1R?
Third question: there is no method to malloc Npp64f pointer in npp? then can i use cudaMallocPitch instead like above?
Please help me.
Thanks in advance.