I have developed the module to calculate the integral image by NPP as below.
But it returns cudaErrorLaunchFailure.
What is the problem?
__global__ void convertValue(DWORD* pdwIntImage, __int64* pn64SqrIntImage, Npp32f* pn32fIntImage, int nPitch1, Npp64f* pn64fSqrIntImage, int nPitch2, int nIntH, int nIntW)
{
int nX = blockIdx.x * blockDim.x + threadIdx.x;
int nY = blockIdx.y * blockDim.y + threadIdx.y;
if ((nX > nIntW - 1) || (nY > nIntH - 1))
{
return;
}
pdwIntImage[nY * nIntW + nX] = (DWORD)pn32fIntImage[nPitch1 * nY + nX];
pn64SqrIntImage[nY * nIntW + nX] = (__int64)pn64fSqrIntImage[nPitch2 * nY + nX];
}
void CreateIntegralImage64ByNpp(BYTE* pbImage, DWORD* pdwIntImage, __int64* pn64SqrIntImage, int nIntH, int nIntW)
{
NppStatus xStatus;
Npp8u* p8uImage;
Npp32f* p32fIntImage;
Npp64f* p64fSqrIntImage;
NppiSize xSize;
xSize.height = nIntH - 1;
xSize.width = nIntW - 1;
size_t nPitch1, nPitch2, nPitch3;
cudaMallocPitch(&p8uImage, &nPitch1, (nIntW - 1) * sizeof(Npp8u), nIntH - 1);
cudaMemcpy2D(p8uImage, nPitch1, pbImage, nIntW - 1, (nIntW - 1) * sizeof(Npp8u), nIntH - 1, cudaMemcpyDeviceToDevice);
cudaMallocPitch(&p32fIntImage, &nPitch2, nIntW * sizeof(Npp32f), nIntH);
cudaMallocPitch(&p64fSqrIntImage, &nPitch3, nIntW * sizeof(Npp64f), nIntH);
xStatus = nppiSqrIntegral_8u32f64f_C1R(p8uImage, nPitch1, p32fIntImage, nPitch2, p64fSqrIntImage, nPitch3, xSize, 0, 0);
cudaDeviceSynchronize();
cudaError_t error = cudaGetLastError();
dim3 Block(32,32);
dim3 Grid((nIntW + Block.x - 1)/Block.x, (nIntH + Block.y - 1)/Block.y);
convertValue<<<Grid, Block>>>(pdwIntImage, pn64SqrIntImage, p32fIntImage, nPitch2, p64fSqrIntImage, nPitch3, nIntH, nIntW);
cudaDeviceSynchronize();
error = cudaGetLastError();
cudaFree(p8uImage);
cudaFree(p32fIntImage);
cudaFree(p64fSqrIntImage);
}
int nH = 1200, nW = 1600;
Npp8u* d_pbImage;
DWORD* d_pdwIntImage;
__int64* d_pn64SqrIntImage;
cudaMalloc(&d_pbImage, nW * nH);
cudaMalloc(&d_pdwIntImage, (nW + 1) * (nH + 1) * sizeof(DWORD));
cudaMalloc(&d_pn64SqrIntImage, (nW + 1) * (nH + 1) * sizeof(__int64));
CreateIntegralImage64ByNpp(d_pbImage, d_pdwIntImage, d_pn64SqrIntImage, nH + 1, nW + 1);
cudaDeviceSynchronize();
cudaError_t error;
error = cudaGetLastError();
Thanks in advance.