Error in calculating the integral image by NPP

I have developed the module to calculate the integral image by NPP as below.
But it returns cudaErrorLaunchFailure.
What is the problem?

__global__ void convertValue(DWORD* pdwIntImage, __int64* pn64SqrIntImage, Npp32f* pn32fIntImage, int nPitch1, Npp64f* pn64fSqrIntImage, int nPitch2, int nIntH, int nIntW)
{
	int nX = blockIdx.x * blockDim.x + threadIdx.x;
	int nY = blockIdx.y * blockDim.y + threadIdx.y;
	if ((nX > nIntW - 1) || (nY > nIntH - 1))
	{
		return;
	}
	pdwIntImage[nY * nIntW + nX] = (DWORD)pn32fIntImage[nPitch1 * nY + nX];
	pn64SqrIntImage[nY * nIntW + nX] = (__int64)pn64fSqrIntImage[nPitch2 * nY + nX];
}

void CreateIntegralImage64ByNpp(BYTE* pbImage, DWORD* pdwIntImage, __int64* pn64SqrIntImage, int nIntH, int nIntW)
{
	NppStatus xStatus;
	Npp8u* p8uImage;
	Npp32f* p32fIntImage;
	Npp64f* p64fSqrIntImage;
	NppiSize xSize;
	xSize.height = nIntH - 1;
	xSize.width = nIntW - 1;
	size_t nPitch1, nPitch2, nPitch3;
	cudaMallocPitch(&p8uImage, &nPitch1, (nIntW - 1) * sizeof(Npp8u), nIntH - 1);
	cudaMemcpy2D(p8uImage, nPitch1, pbImage, nIntW - 1, (nIntW - 1) * sizeof(Npp8u), nIntH - 1, cudaMemcpyDeviceToDevice);
	cudaMallocPitch(&p32fIntImage, &nPitch2, nIntW * sizeof(Npp32f), nIntH);
	cudaMallocPitch(&p64fSqrIntImage, &nPitch3, nIntW * sizeof(Npp64f), nIntH);
	xStatus = nppiSqrIntegral_8u32f64f_C1R(p8uImage, nPitch1, p32fIntImage, nPitch2, p64fSqrIntImage, nPitch3, xSize, 0, 0);
	cudaDeviceSynchronize();
	cudaError_t error = cudaGetLastError();
	dim3 Block(32,32);
	dim3 Grid((nIntW + Block.x - 1)/Block.x, (nIntH + Block.y - 1)/Block.y);
	convertValue<<<Grid, Block>>>(pdwIntImage, pn64SqrIntImage, p32fIntImage, nPitch2, p64fSqrIntImage, nPitch3, nIntH, nIntW);

	cudaDeviceSynchronize();
	error = cudaGetLastError();
	cudaFree(p8uImage);
	cudaFree(p32fIntImage);
	cudaFree(p64fSqrIntImage);
}

int nH = 1200, nW = 1600;
Npp8u* d_pbImage;
DWORD* d_pdwIntImage;
__int64* d_pn64SqrIntImage;
cudaMalloc(&d_pbImage, nW * nH);
cudaMalloc(&d_pdwIntImage, (nW + 1) * (nH + 1) * sizeof(DWORD));
cudaMalloc(&d_pn64SqrIntImage, (nW + 1) * (nH + 1) * sizeof(__int64));
CreateIntegralImage64ByNpp(d_pbImage, d_pdwIntImage, d_pn64SqrIntImage, nH + 1, nW + 1);
cudaDeviceSynchronize();
cudaError_t error;
error = cudaGetLastError();

Thanks in advance.

No one had experience about NPP?