This is a module calculating the integral image by NPP.
But it returns cudaErrorLaunchFailure, i don’t know the reason.
Please help me.
Host Code
void CreateIntegralImage64ByNpp(BYTE* pbImage, DWORD* pdwIntImage, __int64* pn64SqrIntImage, int nIntH, int nIntW)
{
NppStatus xStatus;
Npp8u* p8uImage;
Npp32f* p32fIntImage;
Npp64f* p64fSqrIntImage;
NppiSize xSize;
xSize.height = nIntH - 1;
xSize.width = nIntW - 1;
size_t nPitch1, nPitch2, nPitch3;
cudaMallocPitch(&p8uImage, &nPitch1, (nIntW - 1) * sizeof(Npp8u), nIntH - 1);
cudaMemcpy2D(p8uImage, nPitch1, pbImage, nIntW - 1, (nIntW - 1) * sizeof(Npp8u), nIntH - 1, cudaMemcpyDeviceToDevice);
cudaMallocPitch(&p32fIntImage, &nPitch2, nIntW * sizeof(Npp32f), nIntH);
cudaMallocPitch(&p64fSqrIntImage, &nPitch3, nIntW * sizeof(Npp64f), nIntH);
xStatus = nppiSqrIntegral_8u32f64f_C1R(p8uImage, nPitch1, p32fIntImage, nPitch2, p64fSqrIntImage, nPitch3, xSize, 0, 0);
cudaDeviceSynchronize();
cudaError_t error = cudaGetLastError();
dim3 Block(32,32);
dim3 Grid((nIntW + Block.x - 1)/Block.x, (nIntH + Block.y - 1)/Block.y);
convertValue<<<Grid, Block>>>(pdwIntImage, pn64SqrIntImage, p32fIntImage, nPitch2, p64fSqrIntImage, nPitch3, nIntH, nIntW);
cudaDeviceSynchronize();
error = cudaGetLastError();// return cudaErrorLaunchFailure at here
cudaFree(p8uImage);
cudaFree(p32fIntImage);
cudaFree(p64fSqrIntImage);
}
Device Code
__global__ void convertValue(DWORD* pdwIntImage, __int64* pn64SqrIntImage, Npp32f* pn32fIntImage, int nPitch1, Npp64f* pn64fSqrIntImage, int nPitch2, int nIntH, int nIntW)
{
int nX = blockIdx.x * blockDim.x + threadIdx.x;
int nY = blockIdx.y * blockDim.y + threadIdx.y;
if ((nX > nIntW - 1) || (nY > nIntH - 1))
{
return;
}
pdwIntImage[nY * nIntW + nX] = (DWORD)pn32fIntImage[nPitch1 * nY + nX];
pn64SqrIntImage[nY * nIntW + nX] = (__int64)pn64fSqrIntImage[nPitch2 * nY + nX];
}