I’m seeing some odd behavior. On my primary dev box, a Linux machine with a Titan X GPU, I get the expected behavior from nppiWarpPerspectiveBack_X_C1R. On a Windows 10 machine with a Titan V GPU, I see different behavior: nppiWarpPerspectiveBack_X_C1R returns an all-zero buffer without reporting an error code. On that same machine, nppiWarpAffine_8u_C1R and nppiWarpAffine_32f_C1R behave as expected. So it seems I’m seeing machine/GPU-specific behavior only with nppiWarpPerspectiveBack. Can someone confirm whether this is a known issue?
template <typename PIXEL_T, typename NPP_FWARP, typename NPP_FSET>
void callNPP(const PIXEL_T* pSrc,
const std::vector<size_t> srcSize,
NppiRect &srcRoi,
PIXEL_T* pDst,
NppiRect &dstRoi,
NppiSize &dstRoiSize,
const double *tformCoeffs,
const double *fillVal,
const std::string &interpolation,
size_t numelSrc,
const std::vector<size_t> dstSize,
NPP_FWARP nppWarpFuncPtr,
NPP_FSET nppSetFuncPtr)
{
NppiSize sizeSrc = { static_cast<int>(srcSize[0]), static_cast<int>(srcSize[1]) };
int dstStep = static_cast<int>(dstSize[0]) * sizeof(PIXEL_T);
int srcStep = static_cast<int>(srcSize[0]) * sizeof(PIXEL_T);
size_t inPlanes = (numelSrc == 0) ?
0 : numelSrc / srcSize[0] / srcSize[1];
mwSize dstSizePerPlane = dstSize[0]*dstSize[1];
int interpMethod = imagesgpu::getInterpEnumFromString(interpolation);
double T[3][3];
convertTransformToFormExpectedByNPP(tformCoeffs,T);
NppStatus statusCode;
for (mwSize k = 0; k < inPlanes; k++)
{
// Initialize plane with fill value. Dst pixels that map out of bounds in src will not be touched and therefore will have this initialized value.
statusCode = (*nppSetFuncPtr)(static_cast<PIXEL_T>(fillVal[k]),pDst+k*dstSizePerPlane, dstStep, dstRoiSize);
if (statusCode != NPP_SUCCESS)
mxErrMsgId(images::gpugeneric::nppFailure(statusCode));
statusCode = (*nppWarpFuncPtr)(
(pSrc + k*srcSize[0]*srcSize[1]),
sizeSrc,
srcStep,
srcRoi,
(pDst +k*dstSize[0]*dstSize[1]),
dstStep,
dstRoi,
T,
interpMethod);
if (statusCode != NPP_SUCCESS)
mxErrMsgId(images::gpugeneric::nppFailure(statusCode));
}
}
CUDADevice with properties:
Name: 'TITAN V'
Index: 1
ComputeCapability: '7.0'
SupportsDouble: 1
DriverVersion: 10.1000
ToolkitVersion: 10.1000
MaxThreadsPerBlock: 1024
MaxShmemPerBlock: 49152
MaxThreadBlockSize: [1024 1024 64]
MaxGridSize: [2.1475e+09 65535 65535]
SIMDWidth: 32
TotalMemory: 1.2747e+10
AvailableMemory: 1.1943e+10
MultiprocessorCount: 80
ClockRateKHz: 1455000
ComputeMode: 'Default'
GPUOverlapsTransfers: 1
KernelExecutionTimeout: 0
CanMapHostMemory: 1
DeviceSupported: 1
DeviceSelected: 1