==8807== Profiling result: Type Time(%) Time Calls Avg Min Max Name GPU activities: 57.35% 17.3676s 1665 10.431ms 276.76us 155.83ms void applyHaarClassifierAnchorParallel(unsigned int*, unsigned int, float*, unsigned int, HaarFeature64*, HaarClassifierNode128*, HaarStage64*, unsigned int*, unsigned int*, unsigned int, unsigned int, NcvSize32u, unsigned int, unsigned int, float) 34.84% 10.5522s 1665 6.3376ms 103.40us 94.287ms void applyHaarClassifierAnchorParallel(unsigned int*, unsigned int, float*, unsigned int, HaarFeature64*, HaarClassifierNode128*, HaarStage64*, unsigned int*, unsigned int*, unsigned int, unsigned int, NcvSize32u, unsigned int, unsigned int, float) 2.97% 900.80ms 1665 541.02us 7.8440us 6.0432ms void applyHaarClassifierClassifierParallel(unsigned int*, unsigned int, float*, unsigned int, HaarFeature64*, HaarClassifierNode128*, HaarStage64*, unsigned int*, unsigned int*, unsigned int, unsigned int, NcvSize32u, unsigned int, unsigned int, float) 0.86% 261.30ms 333 784.69us 622.89us 5.8686ms void transpose(unsigned int*, unsigned int, unsigned int*, unsigned int, NcvSize32u) 0.63% 192.17ms 111 1.7313ms 1.4662ms 5.6529ms void scanRows(unsigned int*, unsigned int, unsigned int, unsigned int, __int64*, unsigned int) 0.53% 159.24ms 1665 95.639us 1.9200us 2.5828ms void rectStdDev_32f_C1R(unsigned int*, unsigned int, __int64*, unsigned int, float*, unsigned int, NcvSize32u, NcvRect32u, float) 0.52% 158.58ms 5109 31.039us 320ns 3.6070ms [CUDA memcpy HtoD] 0.46% 139.81ms 1665 83.970us 4.5460us 1.4372ms void decimate_C1R<__int64, bool=1>(__int64*, unsigned int, __int64*, unsigned int, NcvSize32u, unsigned int) 0.34% 102.00ms 111 918.90us 693.73us 1.8336ms void transpose<__int64>(__int64*, unsigned int, __int64*, unsigned int, NcvSize32u) 0.30% 90.375ms 1665 54.279us 3.9050us 1.1182ms void decimate_C1R(unsigned int*, unsigned int, unsigned int*, unsigned int, NcvSize32u, unsigned int) 0.30% 90.174ms 1665 54.158us 1.6000us 414.83us [CUDA memcpy DtoD] 0.30% 89.864ms 111 809.59us 577.59us 7.5202ms void scanRows(unsigned int*, unsigned int, unsigned int, unsigned int, unsigned int*, unsigned int) 0.23% 71.084ms 111 640.40us 495.63us 3.3368ms void scanRows(unsigned char*, unsigned int, unsigned int, unsigned int, unsigned int*, unsigned int) 0.19% 58.290ms 111 525.13us 496.85us 3.3756ms void scanRows(unsigned char*, unsigned int, unsigned int, unsigned int, unsigned int*, unsigned int) 0.16% 47.204ms 111 425.26us 401.93us 2.8850ms void cv::cudev::grid_transform_detail::transformSimple, unsigned char, cv::cudev::BGR_to_GRAY_func, cv::cudev::WithOutMask>(uchar3, cv::cudev::GlobPtr>, unsigned char, unsigned char, int, int) 0.01% 2.3638ms 5103 463ns 320ns 4.7060us [CUDA memcpy DtoH] 0.00% 754.43us 262 2.8790us 2.1770us 3.6810us growDetectionsKernel(unsigned int*, unsigned int, NcvRect32u*, unsigned int, unsigned int, float) API calls: 78.84% 30.1433s 16978 1.7754ms 7.8450us 155.95ms cudaStreamSynchronize 12.18% 4.65492s 3 1.55164s 65.662us 4.65246s cudaMallocHost 5.15% 1.97074s 11251 175.16us 92.424us 2.8773ms cudaLaunchKernel 0.92% 352.35ms 4995 70.539us 36.341us 424.22us cudaMemcpyFromSymbolAsync 0.69% 263.78ms 111 2.3764ms 1.6450ms 4.3629ms cudaMemcpy2D 0.68% 260.51ms 4995 52.154us 32.909us 312.98us cudaMemcpyToSymbolAsync 0.46% 175.17ms 8547 20.494us 12.676us 267.89us cudaBindTexture 0.43% 164.43ms 1776 92.582us 52.176us 754.31us cudaMemcpy 0.31% 116.97ms 5 23.394ms 51.263us 59.521ms cudaMalloc 0.19% 73.963ms 111 666.34us 482.53us 3.3831ms cudaDeviceSynchronize 0.04% 17.101ms 1 17.101ms 17.101ms 17.101ms cudaHostAlloc 0.04% 14.154ms 11251 1.2570us 769ns 70.962us cudaGetLastError 0.02% 8.8179ms 8547 1.0310us 448ns 67.047us cudaCreateChannelDesc 0.02% 6.0676ms 2 3.0338ms 2.1710ms 3.8966ms cudaMallocPitch 0.01% 3.4590ms 225 15.373us 4.7720us 101.19us cudaGetDevice 0.01% 3.2501ms 6 541.69us 157.23us 2.0153ms cudaFree 0.01% 2.0453ms 4 511.33us 108.74us 1.3174ms cudaFreeHost 0.00% 446.41us 3 148.80us 124.64us 170.97us cudaGetDeviceProperties 0.00% 365.39us 96 3.8060us 2.2400us 87.900us cuDeviceGetAttribute 0.00% 18.399us 1 18.399us 18.399us 18.399us cuDeviceTotalMem 0.00% 17.440us 3 5.8130us 2.6560us 7.4880us cuDeviceGetCount 0.00% 9.4110us 6 1.5680us 800ns 3.7120us cudaGetDeviceCount 0.00% 7.3270us 2 3.6630us 3.4870us 3.8400us cuDeviceGet 0.00% 4.8320us 1 4.8320us 4.8320us 4.8320us cuDeviceGetName 0.00% 2.9110us 1 2.9110us 2.9110us 2.9110us cuDeviceGetUuid 0.00% 1.6960us 1 1.6960us 1.6960us 1.6960us cudaDriverGetVersion 0.00% 1.4400us 1 1.4400us 1.4400us 1.4400us cudaRuntimeGetVersion