How to get the shrink and integral image by cuda npp

I developed the module to get the shrink and integral image by cuda npp.
I have some questions about it.
My code follows.

__global__ void convertValue(DWORD* pdwIntImage, __int64* pn64SqrIntImage, Npp32f* pn32fIntImage, int nPitch1, Npp64f* pn64fSqrIntImage, int nPitch2, int nIntH, int nIntW)
{
    int nX = blockIdx.x * blockDim.x + threadIdx.x;
    int nY = blockIdx.y * blockDim.y + threadIdx.y;
    if ((nX > nIntW - 1) || (nY > nIntH - 1))
    {
        return;
    }
    pdwIntImage[nY * nIntW + nX] = (DWORD)pn32fIntImage[nPitch1 * nY + nX];
    pn64SqrIntImage[nY * nIntW + nX] = (__int64)pn64fSqrIntImage[nPitch2 * nY + nX];
}

void GetShrAndIntImage(Npp8u* pbImage, int nLineStep, int nH, int nW, DWORD* pdwIntImage, __int64* pn64SqrIntImage, float rRate)
{
    NppiSize xSize = {nW, nH}, xShrinkSize = {nW / rRate, nH / rRate}, xIntSize = {nW / rRate + 1, nH / rRate + 1};
    NppiRect xRect = {0, 0, nW, nH};
    int nShrinkLineStep, nIntStep;
    size_t nSqrIntStep;
    Npp8u *pbShrinkImage = nppiMalloc_8u_C1(xShrinkSize.width, xShrinkSize.height, &nShrinkLineStep);
    Npp32f *p32fIntImage = nppiMalloc_32f_C1(xIntSize.width, xIntSize.height, &nIntStep);
    Npp64f *p64fSqrIntImage;
    cudaMallocPitch(&p64fSqrIntImage, &nSqrIntStep, xIntSize.width * sizeof(Npp64f), xIntSize.height);

    NppStatus xStatusShrink = nppiResize_8u_C1R(pbImage, xSize, nLineStep, xRect, pbShrinkImage, nShrinkLineStep, xShrinkSize, 1 / rRate, 1 / rRate, 4);
    NppStatus xStatusInt = nppiSqrIntegral_8u32f64f_C1R(pbShrinkImage, nShrinkLineStep, p32fIntImage, nIntStep, p64fSqrIntImage, nSqrIntStep, xShrinkSize, 0, 0); // returns pitch error at here
    dim3 Block(32,32);
    dim3 Grid((xIntSize.width + Block.x - 1)/Block.x, (xIntSize.height + Block.y - 1)/Block.y);
    convertValue<<<Grid, Block>>>(pdwIntImage, pn64SqrIntImage, p32fIntImage, nIntStep, p64fSqrIntImage, nSqrIntStep, xIntSize.height, xIntSize.width);
    cudaDeviceSynchronize();
    cudaError_t error = cudaGetLastError(); // returns cudaErrorLaunchFailure at here
    nppiFree(pbShrinkImage);
    nppiFree(p32fIntImage);
    cudaFree(p64fSqrIntImage);
}

First question: why convertValue function returns cudaErrorLaunchFailure?

Second question: nppiSqrIntegral_8u32f64f_C1R does not give the right result. what is the reason?

Third question: what do they mean - last 3 parameters of nppiReseize_8u_C1R and last 2 parameters of nppiSqrIntegral_8u32f64f_C1R?

Fourth question: there is no method to malloc Npp64f pointer in npp? then can i use cudaMallocPitch instead like above?

Please help me.

Thanks in advance.