I converted a UNet model to both FP16 and INT8 using TensorRT 5.1.6. When I measure latency with trtexec, both engines come in at around 16 ms on Xavier — could anyone help me understand why the INT8 engine isn't faster than FP16? The input shape is 512 × 512 × 3. Here's the nvprof output:
**int8:**
Type Time(%) Time Calls Avg Min Max Name
Range: 100.00% 1.28353s 13860 92.606us 24.128us 5.0280ms <unnamed>
GPU activities: 19.88% 666.07ms 1100 605.52us 51.266us 3.4125ms trt_volta_fp32_igemm_int8_128x128_ldg4_relu_nn_v0
13.37% 447.94ms 220 2.0361ms 1.9471ms 2.9640ms trt_volta_fp32_icudnn_int8x4_128x32_relu_small_nn_v1
12.65% 423.80ms 880 481.59us 24.961us 2.5001ms void nhwkuv_to_nkpq_ker<int=4>(char*, float const *, int, int, int, int, int, int, nvinfer1::rt::reduced_divisor, int, int, nv\
infer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, int, float const *, float const *)
11.97% 400.91ms 2640 151.86us 11.680us 1.7901ms void cuScale::scale<char, float, bool=1, cuScale::Mode, bool=1, bool=1, bool=0, int=4, nvinfer1::FusedActType>(char const *, c\
uScale::scale<char, float, bool=1, cuScale::Mode, bool=1, bool=1, bool=0, int=4, nvinfer1::FusedActType>*, cuScale::KernelParameters<cuScale::scale<char, float, bool=1, cuScale::Mode, bool=1, bool=1, boo\
l=0, int=4, nvinfer1::FusedActType>>, nvinfer1::rt::reduced_divisor, nvinfer1::rt, nvinfer1::rt, nvinfer1::rt, nvinfer1::rt)
10.15% 340.11ms 2420 140.54us 62.594us 1.2858ms trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1
8.69% 291.05ms 220 1.3230ms 1.2353ms 2.2325ms trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_medium_nt_v1
7.39% 247.60ms 1100 225.09us 4.2880us 1.4815ms cuInt8::nc32hw32ToNcqhw4(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::r\
t::reduced_divisor, int, int)
4.37% 146.34ms 1980 73.911us 37.058us 620.09us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1
3.15% 105.59ms 1100 95.988us 2.8160us 923.75us cuInt8::ncqhw4ToNc32hw32(char4 const *, char4*, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, nvinfer1::r\
t::reduced_divisor, int, int)
2.68% 89.738ms 220 407.90us 401.81us 416.24us trt_volta_int8x4_icudnn_int8x4_128x32_relu_medium_c32_nn_v1
2.42% 80.989ms 880 92.032us 6.1760us 277.26us void nchw_to_nhwcin_ker<unsigned int>(unsigned int*, unsigned int const *, int, int, nvinfer1::rt::reduced_divisor, nvinfer1::\
rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, int, int, int, int, int, int)
0.59% 19.930ms 220 90.591us 89.827us 92.228us void nvinfer1::poolNCxHWxInt8<nvinfer1::PoolingType>(nvinfer1::IMMAInt8PackedArray const *, nvinfer1::poolNCxHWxInt8<nvinfer1:\
:PoolingType>*, int, int, nvinfer1::rt::reduced_divisor, int, int, int, nvinfer1::rt, int, int, int, int, int, int, nvinfer1::rt, float, float, nvinfer1::IMMAFloatPackedArray const *, nvinfer1::IMMAFloat\
PackedArray const , int, int)
0.59% 19.791ms 440 44.979us 9.2480us 661.69us void cuScale::scale<char, float, bool=1, cuScale::Mode, bool=1, bool=0, bool=0, int=4, nvinfer1::FusedActType>(char const *, c\
uScale::scale<char, float, bool=1, cuScale::Mode, bool=1, bool=0, bool=0, int=4, nvinfer1::FusedActType>*, cuScale::KernelParameters<cuScale::scale<char, float, bool=1, cuScale::Mode, bool=1, bool=0, boo\
l=0, int=4, nvinfer1::FusedActType>>, nvinfer1::rt::reduced_divisor, nvinfer1::rt, nvinfer1::rt, nvinfer1::rt, nvinfer1::rt)
0.50% 16.768ms 440 38.108us 25.409us 53.378us trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_interior_nt_v1
0.41% 13.722ms 220 62.371us 61.506us 64.035us void cuScale::scale<char, float, bool=1, cuScale::Mode, bool=1, bool=0, bool=0, int=4, nvinfer1::FusedActType>(char const *, c\
uScale::scale<char, float, bool=1, cuScale::Mode, bool=1, bool=0, bool=0, int=4, nvinfer1::FusedActType>*, cuScale::KernelParameters<cuScale::scale<char, float, bool=1, cuScale::Mode, bool=1, bool=0, boo\
l=0, int=4, nvinfer1::FusedActType>>, nvinfer1::rt::reduced_divisor, nvinfer1::rt, nvinfer1::rt, nvinfer1::rt, nvinfer1::rt)
0.28% 9.3616ms 440 21.276us 4.6090us 40.578us cuInt8::nchwToNcqhw4(float const *, unsigned int*, int, int, int, int, int, int, int, float const *, cuInt8::ReducedDivisorPar\
ameters)
0.21% 6.9184ms 220 31.447us 30.146us 33.058us void genericReformat::copyPackedKernel<float, float, bool=0, bool=1, genericReformat::IdentityCoordMapper<int=4>, int=4>(unsig\
ned int, unsigned int, void const *, genericReformat::ArrayN<genericReformat::IdentityCoordMapper<int=4>>, genericReformat::ArrayNWithReducedDivisors<genericReformat::IdentityCoordMapper<int=4>>, generic\
Reformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, \
int, float const , int=4)
0.20% 6.7198ms 220 30.544us 29.537us 31.810us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_interior_nt_v1
0.18% 6.0753ms 220 27.615us 24.993us 485.68us trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_medium_nt_v1
0.16% 5.2601ms 220 23.909us 22.369us 25.569us void op_generic_tensor_kernel<int=2, float, float, float, int=256, cudnnGenericOp_t=6, cudnnNanPropagation_t=0, cudnnDimOrder_\
t=0, int=1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, dimArray, reducedDivisorArray, bool)
0.09% 2.9764ms 220 13.528us 12.705us 14.561us void nchw_to_nhwcin_ker<double4>(double4*, double4 const *, int, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_div\
isor, int, nvinfer1::rt::reduced_divisor, int, int, int, int, int, int)
0.08% 2.5876ms 220 11.762us 11.040us 12.577us void nhwkuv_to_nkpq_ker<int=32>(float*, float const *, int, int, int, int, int, int, nvinfer1::rt::reduced_divisor, int, int, \
nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, int, float const *, float const *)
API calls: 100.00% 455.84ms 15840 28.777us 18.145us 882.87us cudaLaunchKernel
**fp16:**
Type Time(%) Time Calls Avg Min Max Name
Range: 100.00% 3.41539s 53000 64.441us 20.289us 4.5327ms <unnamed>
GPU activities: 42.34% 6.83722s 5000 1.3674ms 195.75us 3.7681ms trt_volta_h884cudnn_256x64_ldg8_relu_exp_small_nhwc_tn_v1
11.60% 1.87379s 3000 624.60us 111.27us 2.0342ms volta_h884gemm_256x128_ldg8_nn
10.65% 1.71947s 14000 122.82us 6.3370us 1.0250ms void cuScale::scale<__half, __half, bool=1, cuScale::Mode, bool=0, bool=0, bool=1, int=4, nvinfer1::FusedActType>(__half const *, cuScale::scale<__half, __half, bool=1, cuScale::Mode, bool=0, bool=0, bool=1, int=4, nvinfer1::FusedActType>*, cuScale::KernelParameters<cuScale::scale<__half, __half, bool=1, cuScale::Mode, bool=0, bool=0, bool=1, int=4, nvinfer1::FusedActType>>, nvinfer1::rt::reduced_divisor, nvinfer1::rt, nvinfer1::rt, nvinfer1::rt, nvinfer1::rt)
6.87% 1.10945s 7000 158.49us 94.917us 350.29us trt_volta_h884cudnn_256x64_sliced1x2_ldg8_relu_exp_medium_nhwc_tn_v1
6.33% 1.02213s 1000 1.0221ms 1.0168ms 1.0621ms trt_volta_scudnn_128x64_relu_medium_nn_v1
6.03% 974.01ms 8000 121.75us 26.082us 154.50us trt_volta_h884cudnn_256x128_ldg8_relu_exp_medium_nhwc_tn_v1
5.06% 816.80ms 5000 163.36us 4.9600us 647.39us void nhwuvk_to_npqk_ker<int=1>(__half*, __half const *, int, nvinfer1::rt::reduced_divisor, int, int, int, int, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, int, int, __half const *)
3.38% 545.76ms 5000 109.15us 7.3610us 421.94us void nhwc_to_nhwcin_ker<int=1>(int4*, int4 const *, int, int, nvinfer1::rt::reduced_divisor, nvinfer1::rt::reduced_divisor, int, nvinfer1::rt::reduced_divisor, int, int, int, int)
2.54% 410.01ms 2000 205.01us 187.82us 223.37us trt_volta_h884cudnn_256x64_ldg8_relu_exp_medium_nhwc_tn_v1
1.49% 241.37ms 1000 241.37us 236.78us 283.44us void cuInt8::nchwTonhwc<float, int=32, int=32, int=2>(float const *, __half*, int, int, int, int, int, int, int, int)
0.67% 108.06ms 1000 108.06us 107.08us 109.64us void nvinfer1::poolCoalescedC<nvinfer1::PoolingType, int=3, bool=0>(nvinfer1::half2pack const *, nvinfer1::poolCoalescedC<nvinfer1::PoolingType, int=3, bool=0>*, int, int, nvinfer1::rt::reduced_divisor, int, int, int, nvinfer1::rt, int, int, int, int, int, int, int, int, float)
0.53% 84.783ms 1000 84.782us 81.988us 90.500us trt_volta_h884cudnn_256x128_ldg8_relu_exp_small_nhwc_tn_v1
0.49% 78.753ms 1000 78.753us 76.100us 81.604us volta_h884gemm_64x64_ldg8_nn
0.42% 68.070ms 1000 68.070us 66.691us 69.667us trt_volta_h884cudnn_256x64_ldg8_relu_exp_interior_nhwc_tn_v1
0.37% 60.172ms 1000 60.172us 58.627us 70.148us void cuScale::scale<float, float, bool=0, cuScale::Mode, bool=0, bool=0, bool=0, int=4, nvinfer1::FusedActType>(float const *, cuScale::scale<float, float, bool=0, cuScale::Mode, bool=0, bool=0, bool=0, int=4, nvinfer1::FusedActType>*, cuScale::KernelParameters<cuScale::scale<float, float, bool=0, cuScale::Mode, bool=0, bool=0, bool=0, int=4, nvinfer1::FusedActType>>, nvinfer1::rt::reduced_divisor, nvinfer1::rt, nvinfer1::rt, nvinfer1::rt, nvinfer1::rt)
0.35% 55.707ms 2000 27.853us 26.657us 29.217us trt_volta_h884cudnn_128x128_ldg8_relu_exp_small_nhwc_tn_v1
0.30% 48.509ms 1000 48.508us 47.266us 51.618us void cuInt8::nhwcTonchw<float, int=32, int=4, int=2>(__half const *, float*, int, int, int, int, int, int)
0.23% 37.893ms 1000 37.892us 36.738us 39.522us volta_h884gemm_128x64_ldg8_nn
0.19% 31.093ms 1000 31.092us 29.729us 32.354us void genericReformat::copyPackedKernel<float, float, bool=0, bool=1, genericReformat::IdentityCoordMapper<int=4>, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN<genericReformat::IdentityCoordMapper<int=4>>, genericReformat::ArrayNWithReducedDivisors<genericReformat::IdentityCoordMapper<int=4>>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4)
0.15% 23.868ms 1000 23.867us 22.721us 26.241us void op_generic_tensor_kernel<int=2, float, float, float, int=256, cudnnGenericOp_t=6, cudnnNanPropagation_t=0, cudnnDimOrder_t=0, int=1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, dimArray, reducedDivisorArray, bool)
API calls: 100.00% 1.19678s 62000 19.302us 16.065us 1.3211ms cudaLaunchKernel