I can reproduce this bug with the OpenCV 3.4.0 cascadeclassifier sample on a Tesla T4, but not on a P4. This is the same issue as the one described in the link ChrisDing provided above.
After some debugging, I came up with the attached change. With this change, cascadeclassifier no longer hangs in the GPU detectMultiScale() call and performs face detection correctly.
Until you get an official fix from OpenCV (if you report this to them), you could try the attached workaround (WAR).
WAR: OpenCV-3.4.0: fix cascadeclassifier sample hang/block on
Turing GPU
On Turing GPUs (e.g. T4), the cascadeclassifier sample (samples/gpu/cascadeclassifier.cpp)
hangs in the call below:
cascade_gpu->detectMultiScale(resized_gpu, facesBuf_gpu);
This sample already works on Tesla P4 without the change below.
---
modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu | 2 +-
modules/cudalegacy/src/cuda/NPP_staging.cu | 5 ++++-
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu b/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu
index e6e5e52..47982f2 100644
--- a/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu
+++ b/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu
@@ -173,7 +173,7 @@ __device__ Ncv32u scan1Inclusive(Ncv32u idata, volatile Ncv32u *s_Data)
const Ncv32u MAX_GRID_DIM = 65535;
-const Ncv32u NUM_THREADS_ANCHORSPARALLEL = 64;
+const Ncv32u NUM_THREADS_ANCHORSPARALLEL = 32; //64;
#define NUM_THREADS_CLASSIFIERPARALLEL_LOG2 6
diff --git a/modules/cudalegacy/src/cuda/NPP_staging.cu b/modules/cudalegacy/src/cuda/NPP_staging.cu
index a96f44f..b29c6d3 100644
--- a/modules/cudalegacy/src/cuda/NPP_staging.cu
+++ b/modules/cudalegacy/src/cuda/NPP_staging.cu
@@ -99,11 +99,14 @@ inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
#if __CUDA_ARCH__ >= 300
const unsigned int laneId = cv::cuda::device::Warp::laneId();
+ unsigned mask = __activemask();// need to get mask for active threads
+
// scan on shuffl functions
#pragma unroll
for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2)
{
- const T n = cv::cuda::device::shfl_up(idata, i);
+ //const T n = cv::cuda::device::shfl_up(idata, i);
+ const T n = __shfl_up_sync(mask, idata, i, 32);//need to call with proper mask
if (laneId >= i)
idata += n;
}