==21320== Profiling application: /usr/src/tensorrt/bin/trtexec --loadEngine=yolo.engine --plugins=/home/xavier1/Documents/git/tensorrtx/yolov5/build/libmyplugins.so ==21320== Profiling result: Type Time(%) Time Calls Avg Min Max Name GPU activities: 25.22% 797.84ms 10277 77.633us 12.448us 272.30us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 11.63% 367.83ms 2151 171.01us 86.882us 413.55us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 10.79% 341.39ms 717 476.14us 153.06us 850.84us void cuResizeLayer::ResizeNearestGenericKernel(float*, cuResizeLayer::ResizeNearestGenericKernel const *, cuResizeLayer::LaunchParams) 8.46% 267.61ms 717 373.23us 131.97us 652.76us void CUTENSOR_NAMESPACE::vectorized_tensor_elementwise_kernel, float, float, char, float, bool=0, cutensorOperator_t=126, cutensorOperator_t, cutensorOperator_t, cutensorOperator_t, cutensorOperator_t>(CUTENSOR_NAMESPACE::pw_params_t, int, int, unsigned int=1, int=32 const *, CUTENSOR_NAMESPACE::pw_params_t, unsigned int=256 const *, CUTENSOR_NAMESPACE::pw_params_t, unsigned int=1 const *, unsigned int=256 const **, cutensorOperator_t, void const *, cutensorOperator_t, void const , cutensorOperator_t, void const , cutensorOperator_t, void const , cutensorOperator_t, void const ) 6.73% 212.81ms 956 222.60us 219.65us 240.94us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) 6.32% 200.00ms 2390 83.682us 27.969us 187.65us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_interior_nt_v1 5.81% 183.67ms 333 551.57us 288ns 4.3680ms [CUDA memcpy HtoD] 5.51% 174.34ms 239 729.47us 716.28us 744.41us trt_volta_int8x4_icudnn_int8x4_128x32_relu_small_c32_nn_v1 3.98% 125.85ms 1912 65.822us 19.361us 141.76us void cuEltwise::eltwise, cuEltwise::Compute>(cuEltwise::LaunchParams) 3.88% 122.69ms 1195 102.67us 25.344us 229.74us cuInt8::nc32hw32ToNcqhw4(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float const *, float const *) 1.96% 62.006ms 3346 18.531us 3.8400us 72.066us cuInt8::nc32hw32ToNc32hw32(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float4 const *, float4 const *) 1.78% 56.267ms 478 117.71us 112.58us 122.37us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_interior_nt_v1 1.75% 55.420ms 956 57.970us 23.169us 119.94us trt_volta_fp32_icudnn_int8x4_128x64_relu_interior_nn_v1 1.54% 48.682ms 717 67.897us 22.848us 123.08us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) 1.49% 47.223ms 239 197.58us 193.77us 205.32us cuInt8::nchwToNcqhw4(float const *, unsigned int*, int, int, int, int, int, int, int, float const *, cuInt8::ReducedDivisorParameters) 1.42% 44.916ms 239 187.94us 185.16us 190.09us trt_volta_int8x4_icudnn_int8x4_128x64_relu_interior_nn_v1 1.02% 32.347ms 239 135.34us 133.19us 137.48us cuInt8::ncqhw4ToNc32hw32(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float const *, float const *) 0.47% 14.860ms 717 20.725us 9.8570us 33.953us void nvinfer1::poolNCxHWxInt8(nvinfer1::IMMAInt8PackedArray const *, nvinfer1::poolNCxHWxInt8*, int, int, nvinfer1::rt::reduced_divisor, int, int, int, nvinfer1::rt, int, int, int, int, int, int, nvinfer1::rt, float, float, nvinfer1::IMMAFloatPackedArray const *, nvinfer1::IMMAFloatPackedArray const , int, int) 0.16% 4.9064ms 956 5.1320us 2.2400us 10.465us nvinfer1::CalDetection(float const *, float*, int, int, int, int, int, int, float const *, int, int) 0.06% 1.9079ms 140 13.627us 2.4010us 80.675us [CUDA memcpy DtoD] 0.02% 626.39us 239 2.6200us 1.9200us 3.0720us [CUDA memcpy DtoH] 0.00% 86.178us 247 348ns 288ns 1.1200us [CUDA memset] API calls: 37.40% 2.82639s 134 21.092ms 1.1840us 1.79281s cudaFree 33.42% 2.52565s 239 10.568ms 4.7241ms 13.473ms cudaEventSynchronize 16.49% 1.24614s 16 77.884ms 2.5280us 1.24605s cudaStreamCreateWithFlags 5.95% 449.36ms 84 5.3495ms 7.0080us 55.874ms cuModuleUnload 5.62% 424.59ms 28441 14.928us 11.008us 820.99us cudaLaunchKernel 0.34% 25.777ms 700 36.824us 9.8880us 4.6337ms cudaMemcpyAsync 0.20% 15.407ms 27963 550ns 256ns 718.53us cudaGetLastError 0.18% 13.952ms 1674 8.3340us 2.2400us 804.51us cudaEventRecord 0.12% 8.7104ms 117 74.447us 4.6400us 3.9342ms cudaMalloc 0.08% 6.0245ms 239 25.207us 15.392us 69.536us cudaMemset 0.07% 5.0789ms 1673 3.0350us 1.4400us 42.048us cudaEventElapsedTime 0.03% 2.3822ms 479 4.9730us 2.6880us 41.408us cudaStreamWaitEvent 0.02% 1.5020ms 6 250.34us 31.712us 1.1569ms cudaFreeHost 0.01% 846.21us 2 423.10us 109.12us 737.09us cudaMallocHost 0.01% 833.79us 360 2.3160us 1.3760us 30.752us cudaFuncSetAttribute 0.01% 822.94us 12 68.578us 13.984us 323.74us cudaMemcpy 0.01% 672.83us 230 2.9250us 1.2800us 24.224us cudaEventDestroy 0.01% 609.82us 156 3.9090us 928ns 56.896us cudaDeviceGetAttribute 0.01% 551.10us 230 2.3960us 1.4080us 25.440us cudaEventCreateWithFlags 0.01% 489.06us 1 489.06us 489.06us 489.06us cudaLaunchHostFunc 0.01% 487.07us 4 121.77us 38.048us 358.02us cudaHostAlloc 0.01% 435.10us 379 1.1480us 416ns 42.880us cuDeviceGetAttribute 0.00% 267.01us 8 33.376us 2.3040us 247.55us cudaStreamCreateWithPriority 0.00% 256.19us 7 36.598us 19.424us 76.832us cudaGetDeviceProperties 0.00% 246.43us 28 8.8010us 2.5600us 27.232us cudaStreamDestroy 0.00% 140.99us 8 17.624us 8.8000us 40.224us cudaMemsetAsync 0.00% 97.376us 16 6.0860us 3.1040us 11.648us cudaDeviceSynchronize 0.00% 88.064us 4 22.016us 6.8480us 31.264us cudaStreamCreate 0.00% 62.272us 4 15.568us 9.5040us 22.784us cuDeviceTotalMem 0.00% 58.592us 14 4.1850us 1.2800us 14.688us cudaGetDevice 0.00% 54.528us 3 18.176us 14.272us 23.904us cudaStreamSynchronize 0.00% 45.792us 2 22.896us 21.312us 24.480us cudaSetDevice 0.00% 18.656us 4 4.6640us 832ns 8.8000us cuDeviceGetUuid 0.00% 15.040us 3 5.0130us 4.5760us 5.5040us cuInit 0.00% 13.632us 5 2.7260us 608ns 10.176us cudaGetDeviceCount 0.00% 11.456us 6 1.9090us 1.0560us 4.0320us cuDeviceGetCount 0.00% 11.168us 4 2.7920us 1.3760us 6.1760us cuDeviceGetName 0.00% 10.048us 2 5.0240us 4.4800us 5.5680us cudaHostGetDevicePointer 0.00% 8.5760us 3 2.8580us 2.5920us 3.2320us cuDriverGetVersion 0.00% 5.4080us 2 2.7040us 2.4000us 3.0080us cudaDeviceGetStreamPriorityRange 0.00% 5.3440us 5 1.0680us 896ns 1.6320us cuDeviceGet 0.00% 3.2000us 1 3.2000us 3.2000us 3.2000us cuDevicePrimaryCtxRelease 0.00% 2.6240us 3 874ns 800ns 992ns cudaRuntimeGetVersion ==21320== NVTX result: ==21320== Thread "" (id = 1971313072) ==21320== Domain "TensorRT" ==21320== Range "(Unnamed Layer* 0) [Slice]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 9.3727ms 239 39.216us 21.504us 131.49us (Unnamed Layer* 0) [Slice] GPU activities: 100.00% 53.600ms 239 224.27us 221.61us 240.94us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 6.3856ms 239 26.717us 14.816us 102.82us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 1) [Slice]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.4842ms 239 22.946us 16.800us 62.944us (Unnamed Layer* 1) [Slice] GPU activities: 100.00% 52.986ms 239 221.70us 219.85us 224.93us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 4.0114ms 239 16.784us 12.224us 56.480us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 102) [Convolution] + (Unnamed Layer* 104) [Activation] || (Unnamed Layer* 99) [Convolution] + (Unnamed Layer* 101) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.5160ms 239 18.895us 15.648us 56.096us (Unnamed Layer* 102) [Convolution] + (Unnamed Layer* 104) [Activation] || (Unnamed Layer* 99) [Convolution] + (Unnamed Layer* 101) [Activation] GPU activities: 100.00% 11.147ms 239 46.640us 44.994us 50.210us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3395ms 239 13.972us 11.776us 51.392us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 104) [Activation]_output copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.7375ms 239 24.006us 19.616us 184.58us (Unnamed Layer* 104) [Activation]_output copy GPU activities: 100.00% 2.5647ms 239 10.730us 10.144us 11.616us cuInt8::nc32hw32ToNc32hw32(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float4 const *, float4 const *) API calls: 100.00% 3.5688ms 239 14.932us 12.768us 31.904us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 105) [Convolution] + (Unnamed Layer* 107) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.8977ms 239 20.492us 16.608us 55.488us (Unnamed Layer* 105) [Convolution] + (Unnamed Layer* 107) [Activation] GPU activities: 100.00% 6.8905ms 239 28.830us 28.289us 29.696us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_interior_nt_v1 API calls: 100.00% 3.5853ms 239 15.001us 12.224us 50.240us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 108) [Convolution] + (Unnamed Layer* 110) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.7711ms 239 24.146us 16.128us 697.18us (Unnamed Layer* 108) [Convolution] + (Unnamed Layer* 110) [Activation] GPU activities: 100.00% 21.037ms 239 88.022us 86.914us 89.795us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 4.1800ms 239 17.489us 11.968us 690.75us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 111) [ElementWise]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.4250ms 239 18.514us 15.968us 42.688us (Unnamed Layer* 111) [ElementWise] GPU activities: 100.00% 4.7150ms 239 19.727us 19.361us 20.225us void cuEltwise::eltwise, cuEltwise::Compute>(cuEltwise::LaunchParams) API calls: 100.00% 3.2394ms 239 13.553us 11.872us 37.728us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 113) [Convolution] + (Unnamed Layer* 115) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.5595ms 239 19.077us 15.936us 44.384us (Unnamed Layer* 113) [Convolution] + (Unnamed Layer* 115) [Activation] GPU activities: 100.00% 12.154ms 239 50.854us 49.665us 55.106us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3745ms 239 14.119us 11.968us 40.064us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 116) [Convolution] + (Unnamed Layer* 118) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.5032ms 239 18.841us 15.360us 75.200us (Unnamed Layer* 116) [Convolution] + (Unnamed Layer* 118) [Activation] GPU activities: 100.00% 22.340ms 239 93.471us 92.835us 94.275us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3879ms 239 14.175us 11.744us 68.896us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 119) [Convolution] + (Unnamed Layer* 121) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.2620ms 239 17.832us 15.200us 43.744us (Unnamed Layer* 119) [Convolution] + (Unnamed Layer* 121) [Activation] GPU activities: 100.00% 4.0640ms 239 17.004us 16.608us 17.441us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.2011ms 239 13.393us 11.648us 35.072us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 121) [Activation]_output copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.6841ms 239 23.782us 19.872us 56.416us (Unnamed Layer* 121) [Activation]_output copy GPU activities: 100.00% 973.79us 239 4.0740us 3.8400us 4.3520us cuInt8::nc32hw32ToNc32hw32(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float4 const *, float4 const *) API calls: 100.00% 3.6018ms 239 15.070us 12.896us 44.160us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 122) [Pooling]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.0710ms 239 21.217us 17.568us 49.504us (Unnamed Layer* 122) [Pooling] GPU activities: 100.00% 2.4326ms 239 10.178us 9.8570us 10.560us void nvinfer1::poolNCxHWxInt8(nvinfer1::IMMAInt8PackedArray const *, nvinfer1::poolNCxHWxInt8*, int, int, nvinfer1::rt::reduced_divisor, int, int, int, nvinfer1::rt, int, int, int, int, int, int, nvinfer1::rt, float, float, nvinfer1::IMMAFloatPackedArray const *, nvinfer1::IMMAFloatPackedArray const , int, int) API calls: 100.00% 3.4706ms 239 14.521us 12.224us 38.720us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 123) [Pooling]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.1918ms 239 21.722us 17.824us 64.832us (Unnamed Layer* 123) [Pooling] GPU activities: 100.00% 4.5936ms 239 19.220us 18.721us 19.841us void nvinfer1::poolNCxHWxInt8(nvinfer1::IMMAInt8PackedArray const *, nvinfer1::poolNCxHWxInt8*, int, int, nvinfer1::rt::reduced_divisor, int, int, int, nvinfer1::rt, int, int, int, int, int, int, nvinfer1::rt, float, float, nvinfer1::IMMAFloatPackedArray const *, nvinfer1::IMMAFloatPackedArray const , int, int) API calls: 100.00% 3.5417ms 239 14.818us 12.256us 58.464us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 124) [Pooling]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.5430ms 239 27.376us 19.904us 97.920us (Unnamed Layer* 124) [Pooling] GPU activities: 100.00% 7.8342ms 239 32.779us 32.225us 33.953us void nvinfer1::poolNCxHWxInt8(nvinfer1::IMMAInt8PackedArray const *, nvinfer1::poolNCxHWxInt8*, int, int, nvinfer1::rt::reduced_divisor, int, int, int, nvinfer1::rt, int, int, int, int, int, int, nvinfer1::rt, float, float, nvinfer1::IMMAFloatPackedArray const *, nvinfer1::IMMAFloatPackedArray const , int, int) API calls: 100.00% 3.9984ms 239 16.729us 13.344us 46.944us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 126) [Convolution] + (Unnamed Layer* 128) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.5658ms 239 19.103us 15.488us 62.144us (Unnamed Layer* 126) [Convolution] + (Unnamed Layer* 128) [Activation] GPU activities: 100.00% 7.8923ms 239 33.022us 32.321us 33.729us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3128ms 239 13.861us 11.936us 46.208us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 132) [Convolution] + (Unnamed Layer* 134) [Activation] || (Unnamed Layer* 129) [Convolution] + (Unnamed Layer* 131) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.4107ms 239 18.454us 15.200us 53.728us (Unnamed Layer* 132) [Convolution] + (Unnamed Layer* 134) [Activation] || (Unnamed Layer* 129) [Convolution] + (Unnamed Layer* 131) [Activation] GPU activities: 100.00% 5.3461ms 239 22.368us 21.857us 22.912us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3301ms 239 13.933us 11.744us 48.032us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 134) [Activation]_output copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.6114ms 239 23.478us 19.520us 76.128us (Unnamed Layer* 134) [Activation]_output copy GPU activities: 100.00% 1.2449ms 239 5.2080us 4.9280us 5.8880us cuInt8::nc32hw32ToNc32hw32(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float4 const *, float4 const *) API calls: 100.00% 3.6300ms 239 15.188us 12.864us 61.600us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 135) [Convolution] + (Unnamed Layer* 137) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.1669ms 239 21.618us 15.040us 737.60us (Unnamed Layer* 135) [Convolution] + (Unnamed Layer* 137) [Activation] GPU activities: 100.00% 3.0475ms 239 12.751us 12.448us 13.120us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3507ms 239 14.019us 11.616us 58.816us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 138) [Convolution] + (Unnamed Layer* 140) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.2804ms 239 17.909us 14.976us 53.728us (Unnamed Layer* 138) [Convolution] + (Unnamed Layer* 140) [Activation] GPU activities: 100.00% 10.718ms 239 44.844us 44.001us 45.666us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.1672ms 239 13.251us 11.552us 40.736us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 14) [Convolution] + (Unnamed Layer* 16) [Activation] || (Unnamed Layer* 11) [Convolution] + (Unnamed Layer* 13) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.2193ms 239 21.838us 16.608us 105.63us (Unnamed Layer* 14) [Convolution] + (Unnamed Layer* 16) [Activation] || (Unnamed Layer* 11) [Convolution] + (Unnamed Layer* 13) [Activation] GPU activities: 100.00% 43.740ms 239 183.01us 179.33us 187.65us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_interior_nt_v1 API calls: 100.00% 3.7705ms 239 15.776us 12.128us 97.696us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 142) [Convolution] + (Unnamed Layer* 144) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.5691ms 239 19.117us 15.136us 62.688us (Unnamed Layer* 142) [Convolution] + (Unnamed Layer* 144) [Activation] GPU activities: 100.00% 4.9750ms 239 20.816us 20.321us 21.312us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.4150ms 239 14.288us 11.712us 57.888us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 145) [Convolution] + (Unnamed Layer* 147) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.3473ms 239 18.189us 15.168us 67.488us (Unnamed Layer* 145) [Convolution] + (Unnamed Layer* 147) [Activation] GPU activities: 100.00% 5.2809ms 239 22.095us 21.569us 22.593us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.2252ms 239 13.494us 11.808us 39.296us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 147) [Activation]_output copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.6494ms 239 23.637us 19.456us 67.904us (Unnamed Layer* 147) [Activation]_output copy GPU activities: 100.00% 1.6657ms 239 6.9690us 6.5280us 7.8090us cuInt8::nc32hw32ToNc32hw32(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float4 const *, float4 const *) API calls: 100.00% 3.6859ms 239 15.422us 12.800us 44.128us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 148) [Resize]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.3015ms 239 26.366us 17.472us 727.01us (Unnamed Layer* 148) [Resize] GPU activities: 100.00% 38.821ms 239 162.43us 153.06us 168.20us void cuResizeLayer::ResizeNearestGenericKernel(float*, cuResizeLayer::ResizeNearestGenericKernel const *, cuResizeLayer::LaunchParams) API calls: 100.00% 3.8156ms 239 15.965us 12.320us 58.752us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 148) [Resize] input reformatter 0" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 7.1109ms 239 29.752us 21.984us 89.728us (Unnamed Layer* 148) [Resize] input reformatter 0 GPU activities: 100.00% 5.5402ms 239 23.180us 22.848us 23.745us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) API calls: 100.00% 4.2608ms 239 17.827us 13.920us 59.200us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 148) [Resize]_output copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 14.454ms 239 60.478us 38.752us 141.06us (Unnamed Layer* 148) [Resize]_output copy GPU activities: 100.00% 32.647ms 239 136.60us 131.97us 139.81us void CUTENSOR_NAMESPACE::vectorized_tensor_elementwise_kernel, float, float, char, float, bool=0, cutensorOperator_t=126, cutensorOperator_t, cutensorOperator_t, cutensorOperator_t, cutensorOperator_t>(CUTENSOR_NAMESPACE::pw_params_t, int, int, unsigned int=1, int=32 const *, CUTENSOR_NAMESPACE::pw_params_t, unsigned int=256 const *, CUTENSOR_NAMESPACE::pw_params_t, unsigned int=1 const *, unsigned int=256 const **, cutensorOperator_t, void const *, cutensorOperator_t, void const , cutensorOperator_t, void const , cutensorOperator_t, void const , cutensorOperator_t, void const ) API calls: 100.00% 4.6531ms 239 19.469us 15.040us 52.672us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 153) [Convolution] + (Unnamed Layer* 155) [Activation] || (Unnamed Layer* 150) [Convolution] + (Unnamed Layer* 152) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.9872ms 239 20.867us 16.704us 63.392us (Unnamed Layer* 153) [Convolution] + (Unnamed Layer* 155) [Activation] || (Unnamed Layer* 150) [Convolution] + (Unnamed Layer* 152) [Activation] GPU activities: 100.00% 17.628ms 239 73.756us 71.810us 80.322us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.5956ms 239 15.044us 12.384us 48.192us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 155) [Activation]_output copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.8982ms 239 24.678us 19.968us 70.624us (Unnamed Layer* 155) [Activation]_output copy GPU activities: 100.00% 2.5984ms 239 10.871us 10.401us 11.680us cuInt8::nc32hw32ToNc32hw32(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float4 const *, float4 const *) API calls: 100.00% 3.7915ms 239 15.863us 13.024us 62.368us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 156) [Convolution] + (Unnamed Layer* 158) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.8158ms 239 20.149us 16.640us 69.952us (Unnamed Layer* 156) [Convolution] + (Unnamed Layer* 158) [Activation] GPU activities: 100.00% 6.8429ms 239 28.631us 28.097us 31.105us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_interior_nt_v1 API calls: 100.00% 3.5138ms 239 14.702us 12.384us 64.448us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 159) [Convolution] + (Unnamed Layer* 161) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.7895ms 239 20.039us 16.256us 73.760us (Unnamed Layer* 159) [Convolution] + (Unnamed Layer* 161) [Activation] GPU activities: 100.00% 21.032ms 239 87.998us 86.882us 102.18us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 3.4586ms 239 14.470us 12.064us 66.336us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 16) [Activation]_output copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.2376ms 239 26.098us 20.032us 66.304us (Unnamed Layer* 16) [Activation]_output copy GPU activities: 100.00% 16.704ms 239 69.891us 69.154us 72.066us cuInt8::nc32hw32ToNc32hw32(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float4 const *, float4 const *) API calls: 100.00% 3.8547ms 239 16.128us 12.832us 50.528us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 163) [Convolution] + (Unnamed Layer* 165) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.5539ms 239 19.053us 15.968us 43.552us (Unnamed Layer* 163) [Convolution] + (Unnamed Layer* 165) [Activation] GPU activities: 100.00% 12.221ms 239 51.135us 49.794us 55.042us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.2963ms 239 13.791us 12.000us 37.344us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 166) [Convolution] + (Unnamed Layer* 168) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.5666ms 239 19.107us 15.648us 74.176us (Unnamed Layer* 166) [Convolution] + (Unnamed Layer* 168) [Activation] GPU activities: 100.00% 8.1632ms 239 34.155us 33.441us 35.009us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3744ms 239 14.118us 11.808us 58.944us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 168) [Activation]_output copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.4642ms 239 22.862us 19.424us 81.760us (Unnamed Layer* 168) [Activation]_output copy GPU activities: 100.00% 3.1509ms 239 13.183us 12.161us 13.985us cuInt8::nc32hw32ToNc32hw32(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float4 const *, float4 const *) API calls: 100.00% 3.5438ms 239 14.827us 12.832us 46.528us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 169) [Resize]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.9621ms 239 20.762us 16.736us 68.480us (Unnamed Layer* 169) [Resize] GPU activities: 100.00% 102.06ms 239 427.03us 415.37us 434.38us void cuResizeLayer::ResizeNearestGenericKernel(float*, cuResizeLayer::ResizeNearestGenericKernel const *, cuResizeLayer::LaunchParams) API calls: 100.00% 3.4531ms 239 14.447us 12.064us 40.032us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 169) [Resize] input reformatter 0" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.9431ms 239 24.866us 20.608us 80.032us (Unnamed Layer* 169) [Resize] input reformatter 0 GPU activities: 100.00% 14.560ms 239 60.919us 60.098us 61.857us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) API calls: 100.00% 3.7125ms 239 15.533us 13.376us 66.176us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 169) [Resize]_output copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 11.237ms 239 47.018us 34.976us 127.81us (Unnamed Layer* 169) [Resize]_output copy GPU activities: 100.00% 80.180ms 239 335.48us 327.50us 346.99us void CUTENSOR_NAMESPACE::vectorized_tensor_elementwise_kernel, float, float, char, float, bool=0, cutensorOperator_t=126, cutensorOperator_t, cutensorOperator_t, cutensorOperator_t, cutensorOperator_t>(CUTENSOR_NAMESPACE::pw_params_t, int, int, unsigned int=1, int=32 const *, CUTENSOR_NAMESPACE::pw_params_t, unsigned int=256 const *, CUTENSOR_NAMESPACE::pw_params_t, unsigned int=1 const *, unsigned int=256 const **, cutensorOperator_t, void const *, cutensorOperator_t, void const , cutensorOperator_t, void const , cutensorOperator_t, void const , cutensorOperator_t, void const ) API calls: 100.00% 4.1221ms 239 17.247us 14.368us 60.096us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 17) [Convolution] + (Unnamed Layer* 19) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.5010ms 239 18.832us 15.328us 55.552us (Unnamed Layer* 17) [Convolution] + (Unnamed Layer* 19) [Activation] GPU activities: 100.00% 38.623ms 239 161.60us 159.33us 164.55us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_interior_nt_v1 API calls: 100.00% 3.3804ms 239 14.143us 11.808us 48.704us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 174) [Convolution] + (Unnamed Layer* 176) [Activation] || (Unnamed Layer* 171) [Convolution] + (Unnamed Layer* 173) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.8334ms 239 20.223us 16.640us 64.096us (Unnamed Layer* 174) [Convolution] + (Unnamed Layer* 176) [Activation] || (Unnamed Layer* 171) [Convolution] + (Unnamed Layer* 173) [Activation] GPU activities: 100.00% 30.993ms 239 129.68us 126.95us 135.91us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.4841ms 239 14.577us 12.352us 50.112us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 176) [Activation]_output copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.6577ms 239 23.672us 19.904us 73.312us (Unnamed Layer* 176) [Activation]_output copy GPU activities: 100.00% 5.0623ms 239 21.181us 20.384us 22.048us cuInt8::nc32hw32ToNc32hw32(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float4 const *, float4 const *) API calls: 100.00% 3.5308ms 239 14.773us 12.992us 47.584us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 177) [Convolution] + (Unnamed Layer* 179) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.4703ms 239 18.704us 15.520us 68.800us (Unnamed Layer* 177) [Convolution] + (Unnamed Layer* 179) [Activation] GPU activities: 100.00% 8.8979ms 239 37.229us 36.193us 38.465us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3118ms 239 13.857us 11.744us 52.160us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 180) [Convolution] + (Unnamed Layer* 182) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.3062ms 239 18.017us 15.296us 61.408us (Unnamed Layer* 180) [Convolution] + (Unnamed Layer* 182) [Activation] GPU activities: 100.00% 29.702ms 239 124.28us 122.15us 125.80us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.2585ms 239 13.633us 11.712us 56.768us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 184) [Convolution] + (Unnamed Layer* 186) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.5694ms 239 19.118us 15.840us 67.424us (Unnamed Layer* 184) [Convolution] + (Unnamed Layer* 186) [Activation] GPU activities: 100.00% 19.983ms 239 83.612us 81.250us 87.299us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3204ms 239 13.893us 11.744us 60.352us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 187) [Convolution] + (Unnamed Layer* 189) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.3308ms 239 18.120us 15.328us 48.288us (Unnamed Layer* 187) [Convolution] + (Unnamed Layer* 189) [Activation] GPU activities: 100.00% 11.695ms 239 48.935us 46.273us 52.386us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.2337ms 239 13.530us 11.648us 43.488us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 189) [Activation]_output copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.9148ms 239 24.747us 19.616us 92.288us (Unnamed Layer* 189) [Activation]_output copy GPU activities: 100.00% 5.1872ms 239 21.703us 21.025us 22.592us cuInt8::nc32hw32ToNc32hw32(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float4 const *, float4 const *) API calls: 100.00% 3.6868ms 239 15.425us 12.864us 63.744us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 190) [Resize]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.9588ms 239 20.748us 16.576us 87.616us (Unnamed Layer* 190) [Resize] GPU activities: 100.00% 200.51ms 239 838.95us 823.10us 850.84us void cuResizeLayer::ResizeNearestGenericKernel(float*, cuResizeLayer::ResizeNearestGenericKernel const *, cuResizeLayer::LaunchParams) API calls: 100.00% 3.5776ms 239 14.969us 12.064us 59.808us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 190) [Resize] input reformatter 0" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.9575ms 239 24.926us 20.512us 75.776us (Unnamed Layer* 190) [Resize] input reformatter 0 GPU activities: 100.00% 28.582ms 239 119.59us 116.48us 123.08us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) API calls: 100.00% 3.7319ms 239 15.614us 13.280us 40.096us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 190) [Resize]_output copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 11.485ms 239 48.056us 33.888us 147.14us (Unnamed Layer* 190) [Resize]_output copy GPU activities: 100.00% 154.78ms 239 647.61us 642.93us 652.76us void CUTENSOR_NAMESPACE::vectorized_tensor_elementwise_kernel, float, float, char, float, bool=0, cutensorOperator_t=126, cutensorOperator_t, cutensorOperator_t, cutensorOperator_t, cutensorOperator_t>(CUTENSOR_NAMESPACE::pw_params_t, int, int, unsigned int=1, int=32 const *, CUTENSOR_NAMESPACE::pw_params_t, unsigned int=256 const *, CUTENSOR_NAMESPACE::pw_params_t, unsigned int=1 const *, unsigned int=256 const **, cutensorOperator_t, void const *, cutensorOperator_t, void const , cutensorOperator_t, void const , cutensorOperator_t, void const , cutensorOperator_t, void const ) API calls: 100.00% 4.1967ms 239 17.559us 14.368us 63.232us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 195) [Convolution] + (Unnamed Layer* 197) [Activation] || (Unnamed Layer* 192) [Convolution] + (Unnamed Layer* 194) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.7645ms 239 19.935us 16.288us 62.304us (Unnamed Layer* 195) [Convolution] + (Unnamed Layer* 197) [Activation] || (Unnamed Layer* 192) [Convolution] + (Unnamed Layer* 194) [Activation] GPU activities: 100.00% 37.857ms 239 158.40us 152.10us 162.98us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.4238ms 239 14.325us 12.128us 57.792us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 197) [Activation]_output copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.5310ms 239 23.142us 19.040us 78.688us (Unnamed Layer* 197) [Activation]_output copy GPU activities: 100.00% 22.171ms 239 92.767us 91.106us 94.626us cuInt8::nc32hw32ToNcqhw4(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float const *, float const *) API calls: 100.00% 3.5558ms 239 14.877us 12.448us 65.504us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 198) [Convolution] + (Unnamed Layer* 200) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.4925ms 239 18.797us 15.936us 46.240us (Unnamed Layer* 198) [Convolution] + (Unnamed Layer* 200) [Activation] GPU activities: 100.00% 13.910ms 239 58.201us 56.353us 60.642us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_interior_nt_v1 API calls: 100.00% 3.3563ms 239 14.042us 12.160us 39.584us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 2) [Slice]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.0643ms 239 21.189us 16.160us 73.216us (Unnamed Layer* 2) [Slice] GPU activities: 100.00% 53.075ms 239 222.07us 219.65us 225.03us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 3.5721ms 239 14.945us 11.744us 63.232us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 20) [Convolution] + (Unnamed Layer* 22) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.3877ms 239 18.358us 15.456us 72.928us (Unnamed Layer* 20) [Convolution] + (Unnamed Layer* 22) [Activation] GPU activities: 100.00% 75.878ms 239 317.48us 312.07us 323.56us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 3.2359ms 239 13.539us 11.808us 37.568us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 201) [Convolution] + (Unnamed Layer* 203) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.4457ms 239 18.601us 15.520us 63.360us (Unnamed Layer* 201) [Convolution] + (Unnamed Layer* 203) [Activation] GPU activities: 100.00% 33.394ms 239 139.72us 137.32us 145.76us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 3.2882ms 239 13.758us 11.840us 43.648us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 201) [Convolution] + (Unnamed Layer* 203) [Activation] output reformatter 0" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.8494ms 239 24.474us 20.096us 67.648us (Unnamed Layer* 201) [Convolution] + (Unnamed Layer* 203) [Activation] output reformatter 0 GPU activities: 100.00% 23.028ms 239 96.353us 94.978us 97.859us cuInt8::nc32hw32ToNcqhw4(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float const *, float const *) API calls: 100.00% 3.6938ms 239 15.455us 13.120us 59.552us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 205) [Convolution] + (Unnamed Layer* 207) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.9783ms 239 20.829us 16.288us 58.784us (Unnamed Layer* 205) [Convolution] + (Unnamed Layer* 207) [Activation] GPU activities: 100.00% 44.916ms 239 187.94us 185.16us 190.09us trt_volta_int8x4_icudnn_int8x4_128x64_relu_interior_nn_v1 API calls: 100.00% 3.8720ms 239 16.200us 12.608us 47.232us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 208) [Convolution] + (Unnamed Layer* 210) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.4496ms 239 18.617us 15.616us 48.640us (Unnamed Layer* 208) [Convolution] + (Unnamed Layer* 210) [Activation] GPU activities: 100.00% 32.065ms 239 134.16us 132.55us 136.04us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3196ms 239 13.889us 11.936us 38.400us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 208) [Convolution] + (Unnamed Layer* 210) [Activation] input reformatter 0" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.1200ms 239 25.606us 20.288us 61.760us (Unnamed Layer* 208) [Convolution] + (Unnamed Layer* 210) [Activation] input reformatter 0 GPU activities: 100.00% 32.347ms 239 135.34us 133.19us 137.48us cuInt8::ncqhw4ToNc32hw32(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float const *, float const *) API calls: 100.00% 3.8452ms 239 16.088us 13.280us 43.424us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 215) [Convolution] + (Unnamed Layer* 217) [Activation] || (Unnamed Layer* 212) [Convolution] + (Unnamed Layer* 214) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.7730ms 239 19.970us 15.936us 86.112us (Unnamed Layer* 215) [Convolution] + (Unnamed Layer* 217) [Activation] || (Unnamed Layer* 212) [Convolution] + (Unnamed Layer* 214) [Activation] GPU activities: 100.00% 19.982ms 239 83.608us 81.571us 87.298us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.5337ms 239 14.785us 11.840us 81.344us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 217) [Activation]_output copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.4429ms 239 22.773us 19.328us 64.768us (Unnamed Layer* 217) [Activation]_output copy GPU activities: 100.00% 4.9589ms 239 20.748us 20.064us 21.601us cuInt8::nc32hw32ToNc32hw32(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float4 const *, float4 const *) API calls: 100.00% 3.5363ms 239 14.796us 12.768us 52.960us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 218) [Convolution] + (Unnamed Layer* 220) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.5471ms 239 19.025us 15.136us 62.144us (Unnamed Layer* 218) [Convolution] + (Unnamed Layer* 220) [Activation] GPU activities: 100.00% 8.9467ms 239 37.433us 36.193us 40.065us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.4044ms 239 14.244us 11.616us 57.824us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 221) [Convolution] + (Unnamed Layer* 223) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.5296ms 239 18.952us 15.104us 88.832us (Unnamed Layer* 221) [Convolution] + (Unnamed Layer* 223) [Activation] GPU activities: 100.00% 29.736ms 239 124.42us 122.63us 125.60us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3627ms 239 14.069us 11.584us 84.544us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 225) [Convolution] + (Unnamed Layer* 227) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.5314ms 239 18.959us 15.808us 42.496us (Unnamed Layer* 225) [Convolution] + (Unnamed Layer* 227) [Activation] GPU activities: 100.00% 20.075ms 239 83.996us 81.442us 87.523us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3357ms 239 13.956us 11.936us 36.960us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 228) [Convolution] + (Unnamed Layer* 230) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.4574ms 239 18.650us 15.648us 70.912us (Unnamed Layer* 228) [Convolution] + (Unnamed Layer* 230) [Activation] GPU activities: 100.00% 30.414ms 239 127.25us 125.51us 128.39us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3100ms 239 13.849us 11.904us 65.184us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 23) [ElementWise]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.0395ms 239 25.269us 16.672us 766.14us (Unnamed Layer* 23) [ElementWise] GPU activities: 100.00% 33.671ms 239 140.88us 140.48us 141.76us void cuEltwise::eltwise, cuEltwise::Compute>(cuEltwise::LaunchParams) API calls: 100.00% 4.4970ms 239 18.815us 12.352us 758.88us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 235) [Convolution] + (Unnamed Layer* 237) [Activation] || (Unnamed Layer* 232) [Convolution] + (Unnamed Layer* 234) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.5249ms 239 18.932us 15.904us 45.024us (Unnamed Layer* 235) [Convolution] + (Unnamed Layer* 237) [Activation] || (Unnamed Layer* 232) [Convolution] + (Unnamed Layer* 234) [Activation] GPU activities: 100.00% 13.687ms 239 57.266us 54.370us 60.418us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3029ms 239 13.819us 11.936us 39.904us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 237) [Activation]_output copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.6127ms 239 23.483us 19.616us 70.496us (Unnamed Layer* 237) [Activation]_output copy GPU activities: 100.00% 2.6056ms 239 10.902us 10.272us 11.648us cuInt8::nc32hw32ToNc32hw32(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float4 const *, float4 const *) API calls: 100.00% 3.6656ms 239 15.337us 12.864us 58.336us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 238) [Convolution] + (Unnamed Layer* 240) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.6402ms 239 19.415us 16.448us 37.088us (Unnamed Layer* 238) [Convolution] + (Unnamed Layer* 240) [Activation] GPU activities: 100.00% 6.8105ms 239 28.495us 27.969us 31.809us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_interior_nt_v1 API calls: 100.00% 3.3892ms 239 14.180us 12.192us 31.936us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 241) [Convolution] + (Unnamed Layer* 243) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.8370ms 239 20.238us 16.032us 105.38us (Unnamed Layer* 241) [Convolution] + (Unnamed Layer* 243) [Activation] GPU activities: 100.00% 21.080ms 239 88.202us 87.298us 89.187us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 3.5121ms 239 14.695us 11.904us 55.936us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 245) [Convolution] + (Unnamed Layer* 247) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.6658ms 239 19.522us 15.968us 70.592us (Unnamed Layer* 245) [Convolution] + (Unnamed Layer* 247) [Activation] GPU activities: 100.00% 12.315ms 239 51.527us 50.017us 55.010us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3644ms 239 14.077us 11.968us 65.728us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 248) [Convolution] + (Unnamed Layer* 250) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.5531ms 239 19.050us 15.360us 60.096us (Unnamed Layer* 248) [Convolution] + (Unnamed Layer* 250) [Activation] GPU activities: 100.00% 22.281ms 239 93.227us 92.483us 93.987us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3877ms 239 14.174us 11.840us 56.064us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 25) [Convolution] + (Unnamed Layer* 27) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.8282ms 239 20.201us 16.576us 55.424us (Unnamed Layer* 25) [Convolution] + (Unnamed Layer* 27) [Activation] GPU activities: 100.00% 43.106ms 239 180.36us 176.20us 184.10us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_interior_nt_v1 API calls: 100.00% 3.5520ms 239 14.862us 12.288us 50.720us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 255) [Convolution] + (Unnamed Layer* 257) [Activation] || (Unnamed Layer* 252) [Convolution] + (Unnamed Layer* 254) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.4878ms 239 18.777us 15.520us 86.784us (Unnamed Layer* 255) [Convolution] + (Unnamed Layer* 257) [Activation] || (Unnamed Layer* 252) [Convolution] + (Unnamed Layer* 254) [Activation] GPU activities: 100.00% 6.5190ms 239 27.276us 26.752us 28.161us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3784ms 239 14.135us 11.904us 80.480us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 257) [Activation]_output copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.4068ms 239 22.622us 19.488us 69.472us (Unnamed Layer* 257) [Activation]_output copy GPU activities: 100.00% 1.2691ms 239 5.3090us 4.9280us 5.8560us cuInt8::nc32hw32ToNc32hw32(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float4 const *, float4 const *) API calls: 100.00% 3.5306ms 239 14.772us 12.768us 41.280us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 258) [Convolution] + (Unnamed Layer* 260) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.3135ms 239 18.048us 15.296us 57.824us (Unnamed Layer* 258) [Convolution] + (Unnamed Layer* 260) [Activation] GPU activities: 100.00% 3.0536ms 239 12.776us 12.480us 13.120us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.2221ms 239 13.481us 11.776us 34.848us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 261) [Convolution] + (Unnamed Layer* 263) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.4160ms 239 18.476us 15.136us 60.192us (Unnamed Layer* 261) [Convolution] + (Unnamed Layer* 263) [Activation] GPU activities: 100.00% 10.695ms 239 44.749us 43.841us 45.506us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3270ms 239 13.920us 11.616us 53.856us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 265) [Convolution] + (Unnamed Layer* 267) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.5861ms 239 19.188us 15.232us 59.584us (Unnamed Layer* 265) [Convolution] + (Unnamed Layer* 267) [Activation] GPU activities: 100.00% 4.9828ms 239 20.848us 20.385us 21.281us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.4437ms 239 14.408us 11.776us 54.496us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 268) [Convolution]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.8672ms 239 20.364us 16.544us 45.312us (Unnamed Layer* 268) [Convolution] GPU activities: 100.00% 28.109ms 239 117.61us 115.52us 119.94us trt_volta_fp32_icudnn_int8x4_128x64_relu_interior_nn_v1 API calls: 100.00% 3.6264ms 239 15.173us 12.576us 33.216us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 269) [Convolution]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.6880ms 239 19.615us 16.096us 58.560us (Unnamed Layer* 269) [Convolution] GPU activities: 100.00% 13.841ms 239 57.911us 56.098us 59.490us trt_volta_fp32_icudnn_int8x4_128x64_relu_interior_nn_v1 API calls: 100.00% 3.5392ms 239 14.808us 12.224us 54.144us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 269) [Convolution] input reformatter 0" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.7906ms 239 24.228us 19.840us 102.46us (Unnamed Layer* 269) [Convolution] input reformatter 0 GPU activities: 100.00% 53.200ms 239 222.59us 215.27us 229.74us cuInt8::nc32hw32ToNcqhw4(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float const *, float const *) API calls: 100.00% 3.6340ms 239 15.204us 12.704us 62.784us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 270) [Convolution]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.6722ms 239 23.733us 16.224us 778.14us (Unnamed Layer* 270) [Convolution] GPU activities: 100.00% 7.7703ms 239 32.511us 31.201us 33.057us trt_volta_fp32_icudnn_int8x4_128x64_relu_interior_nn_v1 API calls: 100.00% 3.7173ms 239 15.553us 12.224us 61.760us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 270) [Convolution] input reformatter 0" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.5416ms 239 23.186us 19.808us 81.056us (Unnamed Layer* 270) [Convolution] input reformatter 0 GPU activities: 100.00% 18.143ms 239 75.913us 75.043us 76.866us cuInt8::nc32hw32ToNcqhw4(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float const *, float const *) API calls: 100.00% 3.4966ms 239 14.630us 12.608us 62.784us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 271) [Convolution]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.6589ms 239 19.493us 16.128us 66.304us (Unnamed Layer* 271) [Convolution] GPU activities: 100.00% 5.6999ms 239 23.848us 23.169us 24.993us trt_volta_fp32_icudnn_int8x4_128x64_relu_interior_nn_v1 API calls: 100.00% 3.4561ms 239 14.460us 12.160us 62.112us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 271) [Convolution] input reformatter 0" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.5861ms 239 23.372us 19.712us 70.528us (Unnamed Layer* 271) [Convolution] input reformatter 0 GPU activities: 100.00% 6.1516ms 239 25.738us 25.344us 26.177us cuInt8::nc32hw32ToNcqhw4(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float const *, float const *) API calls: 100.00% 3.4898ms 239 14.601us 12.640us 61.824us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 272) [PluginV2IOExt]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 23.385ms 239 97.846us 74.752us 207.26us (Unnamed Layer* 272) [PluginV2IOExt] GPU activities: 98.38% 4.9064ms 956 5.1320us 2.2400us 10.465us nvinfer1::CalDetection(float const *, float*, int, int, int, int, int, int, float const *, int, int) 1.62% 80.642us 239 337ns 288ns 768ns [CUDA memset] API calls: 70.30% 14.261ms 956 14.917us 11.008us 54.656us cudaLaunchKernel 29.70% 6.0245ms 239 25.207us 15.392us 69.536us cudaMemset ==21320== Range "(Unnamed Layer* 28) [Convolution] + (Unnamed Layer* 30) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.8445ms 239 20.269us 15.680us 79.008us (Unnamed Layer* 28) [Convolution] + (Unnamed Layer* 30) [Activation] GPU activities: 100.00% 63.758ms 239 266.77us 261.99us 272.30us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.6020ms 239 15.071us 11.808us 73.536us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 3) [Slice]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.8096ms 239 20.123us 15.968us 52.576us (Unnamed Layer* 3) [Slice] GPU activities: 100.00% 53.148ms 239 222.38us 220.01us 225.26us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 3.3934ms 239 14.198us 11.520us 44.480us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 34) [Convolution] + (Unnamed Layer* 36) [Activation] || (Unnamed Layer* 31) [Convolution] + (Unnamed Layer* 33) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.7514ms 239 19.880us 16.320us 51.328us (Unnamed Layer* 34) [Convolution] + (Unnamed Layer* 36) [Activation] || (Unnamed Layer* 31) [Convolution] + (Unnamed Layer* 33) [Activation] GPU activities: 100.00% 28.157ms 239 117.81us 112.58us 121.64us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_interior_nt_v1 API calls: 100.00% 3.5558ms 239 14.877us 12.256us 44.480us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 36) [Activation]_output copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.8027ms 239 24.278us 19.712us 60.512us (Unnamed Layer* 36) [Activation]_output copy GPU activities: 100.00% 8.9801ms 239 37.573us 36.769us 39.041us cuInt8::nc32hw32ToNc32hw32(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float4 const *, float4 const *) API calls: 100.00% 3.6731ms 239 15.368us 12.864us 52.992us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 37) [Convolution] + (Unnamed Layer* 39) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.5658ms 239 19.103us 15.712us 64.800us (Unnamed Layer* 37) [Convolution] + (Unnamed Layer* 39) [Activation] GPU activities: 100.00% 13.739ms 239 57.486us 55.874us 59.682us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_interior_nt_v1 API calls: 100.00% 3.4156ms 239 14.291us 11.904us 57.824us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 40) [Convolution] + (Unnamed Layer* 42) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.4100ms 239 22.635us 15.424us 829.34us (Unnamed Layer* 40) [Convolution] + (Unnamed Layer* 42) [Activation] GPU activities: 100.00% 33.390ms 239 139.71us 137.09us 146.50us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 4.2319ms 239 17.706us 11.744us 820.99us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 43) [ElementWise]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.5893ms 239 19.202us 16.064us 55.776us (Unnamed Layer* 43) [ElementWise] GPU activities: 100.00% 19.343ms 239 80.931us 80.546us 81.698us void cuEltwise::eltwise, cuEltwise::Compute>(cuEltwise::LaunchParams) API calls: 100.00% 3.3366ms 239 13.960us 11.968us 45.728us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 44) [Convolution] + (Unnamed Layer* 46) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.7902ms 239 20.042us 16.096us 65.504us (Unnamed Layer* 44) [Convolution] + (Unnamed Layer* 46) [Activation] GPU activities: 100.00% 13.211ms 239 55.276us 53.346us 57.474us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_interior_nt_v1 API calls: 100.00% 3.6005ms 239 15.064us 12.096us 59.968us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 47) [Convolution] + (Unnamed Layer* 49) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.5701ms 239 19.121us 15.584us 54.304us (Unnamed Layer* 47) [Convolution] + (Unnamed Layer* 49) [Activation] GPU activities: 100.00% 32.675ms 239 136.72us 134.72us 142.95us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 3.3983ms 239 14.218us 11.872us 47.904us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 5) [Convolution] + (Unnamed Layer* 7) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.6460ms 239 23.623us 16.544us 65.632us (Unnamed Layer* 5) [Convolution] + (Unnamed Layer* 7) [Activation] GPU activities: 100.00% 174.34ms 239 729.47us 716.28us 744.41us trt_volta_int8x4_icudnn_int8x4_128x32_relu_small_c32_nn_v1 API calls: 100.00% 3.9759ms 239 16.635us 12.448us 49.920us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 5) [Convolution] + (Unnamed Layer* 7) [Activation] input reformatter 0" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 7.9069ms 239 33.083us 22.656us 138.05us (Unnamed Layer* 5) [Convolution] + (Unnamed Layer* 7) [Activation] input reformatter 0 GPU activities: 100.00% 47.223ms 239 197.58us 193.77us 205.32us cuInt8::nchwToNcqhw4(float const *, unsigned int*, int, int, int, int, int, int, int, float const *, cuInt8::ReducedDivisorParameters) API calls: 100.00% 4.1217ms 239 17.245us 13.344us 95.552us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 50) [ElementWise]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.6094ms 239 19.286us 15.776us 68.288us (Unnamed Layer* 50) [ElementWise] GPU activities: 100.00% 19.294ms 239 80.729us 80.322us 81.443us void cuEltwise::eltwise, cuEltwise::Compute>(cuEltwise::LaunchParams) API calls: 100.00% 3.4183ms 239 14.302us 11.776us 55.968us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 51) [Convolution] + (Unnamed Layer* 53) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.7543ms 239 19.892us 15.936us 57.440us (Unnamed Layer* 51) [Convolution] + (Unnamed Layer* 53) [Activation] GPU activities: 100.00% 13.127ms 239 54.923us 53.441us 57.697us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_interior_nt_v1 API calls: 100.00% 3.4936ms 239 14.617us 12.000us 43.264us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 54) [Convolution] + (Unnamed Layer* 56) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.3966ms 239 18.395us 15.552us 62.208us (Unnamed Layer* 54) [Convolution] + (Unnamed Layer* 56) [Activation] GPU activities: 100.00% 32.588ms 239 136.35us 133.06us 142.88us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 3.2581ms 239 13.632us 11.808us 58.112us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 57) [ElementWise]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.6867ms 239 19.609us 15.776us 75.232us (Unnamed Layer* 57) [ElementWise] GPU activities: 100.00% 19.251ms 239 80.549us 80.034us 81.091us void cuEltwise::eltwise, cuEltwise::Compute>(cuEltwise::LaunchParams) API calls: 100.00% 3.4292ms 239 14.348us 11.744us 42.432us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 59) [Convolution] + (Unnamed Layer* 61) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.7428ms 239 19.844us 16.352us 63.776us (Unnamed Layer* 59) [Convolution] + (Unnamed Layer* 61) [Activation] GPU activities: 100.00% 28.110ms 239 117.61us 113.51us 122.37us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_interior_nt_v1 API calls: 100.00% 3.5010ms 239 14.648us 12.288us 57.280us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 62) [Convolution] + (Unnamed Layer* 64) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.6425ms 239 19.424us 15.776us 62.112us (Unnamed Layer* 62) [Convolution] + (Unnamed Layer* 64) [Activation] GPU activities: 100.00% 56.508ms 239 236.43us 233.45us 245.29us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3492ms 239 14.013us 11.840us 51.200us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 68) [Convolution] + (Unnamed Layer* 70) [Activation] || (Unnamed Layer* 65) [Convolution] + (Unnamed Layer* 67) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.5786ms 239 19.157us 15.616us 56.832us (Unnamed Layer* 68) [Convolution] + (Unnamed Layer* 70) [Activation] || (Unnamed Layer* 65) [Convolution] + (Unnamed Layer* 67) [Activation] GPU activities: 100.00% 20.007ms 239 83.710us 81.955us 87.491us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3314ms 239 13.939us 11.712us 50.976us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 70) [Activation]_output copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.8024ms 239 24.277us 19.712us 88.672us (Unnamed Layer* 70) [Activation]_output copy GPU activities: 100.00% 5.0406ms 239 21.090us 20.384us 21.792us cuInt8::nc32hw32ToNc32hw32(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float4 const *, float4 const *) API calls: 100.00% 3.7956ms 239 15.881us 12.832us 63.200us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 71) [Convolution] + (Unnamed Layer* 73) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.3825ms 239 18.336us 15.264us 58.976us (Unnamed Layer* 71) [Convolution] + (Unnamed Layer* 73) [Activation] GPU activities: 100.00% 8.8124ms 239 36.871us 35.841us 39.905us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.2627ms 239 13.651us 11.712us 41.952us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 74) [Convolution] + (Unnamed Layer* 76) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.2461ms 239 21.950us 15.168us 825.73us (Unnamed Layer* 74) [Convolution] + (Unnamed Layer* 76) [Activation] GPU activities: 100.00% 29.751ms 239 124.48us 122.31us 125.86us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 4.1335ms 239 17.294us 11.648us 820.13us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 77) [ElementWise]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.6444ms 239 19.432us 15.968us 55.744us (Unnamed Layer* 77) [ElementWise] GPU activities: 100.00% 9.8707ms 239 41.299us 40.866us 41.825us void cuEltwise::eltwise, cuEltwise::Compute>(cuEltwise::LaunchParams) API calls: 100.00% 3.4569ms 239 14.463us 11.840us 50.752us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 78) [Convolution] + (Unnamed Layer* 80) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.5017ms 239 18.835us 15.296us 56.512us (Unnamed Layer* 78) [Convolution] + (Unnamed Layer* 80) [Activation] GPU activities: 100.00% 8.8480ms 239 37.020us 35.713us 39.713us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3884ms 239 14.177us 11.648us 50.336us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 8) [Convolution] + (Unnamed Layer* 10) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.3658ms 239 22.451us 16.032us 54.592us (Unnamed Layer* 8) [Convolution] + (Unnamed Layer* 10) [Activation] GPU activities: 100.00% 96.759ms 239 404.85us 370.41us 413.55us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 3.6595ms 239 15.311us 12.064us 44.928us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 81) [Convolution] + (Unnamed Layer* 83) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.4288ms 239 18.530us 15.392us 52.544us (Unnamed Layer* 81) [Convolution] + (Unnamed Layer* 83) [Activation] GPU activities: 100.00% 29.910ms 239 125.15us 123.17us 126.50us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.2665ms 239 13.667us 11.648us 45.984us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 84) [ElementWise]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.3565ms 239 18.228us 15.712us 43.104us (Unnamed Layer* 84) [ElementWise] GPU activities: 100.00% 9.8691ms 239 41.293us 40.993us 41.761us void cuEltwise::eltwise, cuEltwise::Compute>(cuEltwise::LaunchParams) API calls: 100.00% 3.2498ms 239 13.597us 11.808us 35.328us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 85) [Convolution] + (Unnamed Layer* 87) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.5349ms 239 18.974us 15.328us 72.352us (Unnamed Layer* 85) [Convolution] + (Unnamed Layer* 87) [Activation] GPU activities: 100.00% 8.8653ms 239 37.093us 35.585us 40.162us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3647ms 239 14.078us 11.712us 62.976us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 88) [Convolution] + (Unnamed Layer* 90) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.4700ms 239 18.703us 15.328us 65.184us (Unnamed Layer* 88) [Convolution] + (Unnamed Layer* 90) [Activation] GPU activities: 100.00% 29.783ms 239 124.62us 121.67us 125.83us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3518ms 239 14.024us 11.744us 33.472us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 91) [ElementWise]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.6595ms 239 19.495us 15.744us 57.952us (Unnamed Layer* 91) [ElementWise] GPU activities: 100.00% 9.8395ms 239 41.169us 40.865us 41.601us void cuEltwise::eltwise, cuEltwise::Compute>(cuEltwise::LaunchParams) API calls: 100.00% 3.4439ms 239 14.409us 11.904us 53.888us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 93) [Convolution] + (Unnamed Layer* 95) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.7807ms 239 20.002us 16.064us 64.192us (Unnamed Layer* 93) [Convolution] + (Unnamed Layer* 95) [Activation] GPU activities: 100.00% 19.902ms 239 83.272us 81.090us 87.043us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.4298ms 239 14.350us 11.936us 41.600us cudaLaunchKernel ==21320== Range "(Unnamed Layer* 96) [Convolution] + (Unnamed Layer* 98) [Activation]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.6549ms 239 19.476us 15.904us 78.592us (Unnamed Layer* 96) [Convolution] + (Unnamed Layer* 98) [Activation] GPU activities: 100.00% 42.839ms 239 179.24us 171.05us 189.35us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.3261ms 239 13.916us 11.808us 47.808us cudaLaunchKernel ==21320== Range "ExecutionContext::enqueue" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 665.54ms 239 2.7847ms 2.3189ms 8.9007ms ExecutionContext::enqueue GPU activities: 26.79% 797.84ms 10277 77.633us 12.448us 272.30us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 12.35% 367.83ms 2151 171.01us 86.882us 413.55us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 11.46% 341.39ms 717 476.14us 153.06us 850.84us void cuResizeLayer::ResizeNearestGenericKernel(float*, cuResizeLayer::ResizeNearestGenericKernel const *, cuResizeLayer::LaunchParams) 8.99% 267.61ms 717 373.23us 131.97us 652.76us void CUTENSOR_NAMESPACE::vectorized_tensor_elementwise_kernel, float, float, char, float, bool=0, cutensorOperator_t=126, cutensorOperator_t, cutensorOperator_t, cutensorOperator_t, cutensorOperator_t>(CUTENSOR_NAMESPACE::pw_params_t, int, int, unsigned int=1, int=32 const *, CUTENSOR_NAMESPACE::pw_params_t, unsigned int=256 const *, CUTENSOR_NAMESPACE::pw_params_t, unsigned int=1 const *, unsigned int=256 const **, cutensorOperator_t, void const *, cutensorOperator_t, void const , cutensorOperator_t, void const , cutensorOperator_t, void const , cutensorOperator_t, void const ) 7.15% 212.81ms 956 222.60us 219.65us 240.94us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) 6.72% 200.00ms 2390 83.682us 27.969us 187.65us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_interior_nt_v1 5.85% 174.34ms 239 729.47us 716.28us 744.41us trt_volta_int8x4_icudnn_int8x4_128x32_relu_small_c32_nn_v1 4.23% 125.85ms 1912 65.822us 19.361us 141.76us void cuEltwise::eltwise, cuEltwise::Compute>(cuEltwise::LaunchParams) 4.12% 122.69ms 1195 102.67us 25.344us 229.74us cuInt8::nc32hw32ToNcqhw4(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float const *, float const *) 2.08% 62.006ms 3346 18.531us 3.8400us 72.066us cuInt8::nc32hw32ToNc32hw32(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float4 const *, float4 const *) 1.89% 56.267ms 478 117.71us 112.58us 122.37us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_interior_nt_v1 1.86% 55.420ms 956 57.970us 23.169us 119.94us trt_volta_fp32_icudnn_int8x4_128x64_relu_interior_nn_v1 1.63% 48.682ms 717 67.897us 22.848us 123.08us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) 1.59% 47.223ms 239 197.58us 193.77us 205.32us cuInt8::nchwToNcqhw4(float const *, unsigned int*, int, int, int, int, int, int, int, float const *, cuInt8::ReducedDivisorParameters) 1.51% 44.916ms 239 187.94us 185.16us 190.09us trt_volta_int8x4_icudnn_int8x4_128x64_relu_interior_nn_v1 1.09% 32.347ms 239 135.34us 133.19us 137.48us cuInt8::ncqhw4ToNc32hw32(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, float const *, float const *) 0.50% 14.860ms 717 20.725us 9.8570us 33.953us void nvinfer1::poolNCxHWxInt8(nvinfer1::IMMAInt8PackedArray const *, nvinfer1::poolNCxHWxInt8*, int, int, nvinfer1::rt::reduced_divisor, int, int, int, nvinfer1::rt, int, int, int, int, int, int, nvinfer1::rt, float, float, nvinfer1::IMMAFloatPackedArray const *, nvinfer1::IMMAFloatPackedArray const , int, int) 0.16% 4.9064ms 956 5.1320us 2.2400us 10.465us nvinfer1::CalDetection(float const *, float*, int, int, int, int, int, int, float const *, int, int) 0.03% 927.39us 70 13.248us 2.4640us 74.722us [CUDA memcpy DtoD] 0.00% 80.642us 239 337ns 288ns 768ns [CUDA memset] API calls: 97.99% 424.59ms 28441 14.928us 11.008us 820.99us cudaLaunchKernel 1.39% 6.0245ms 239 25.207us 15.392us 69.536us cudaMemset 0.62% 2.6838ms 70 38.339us 22.112us 118.98us cudaMemcpyAsync