==25751== Profiling application: trtexec --loadEngine=best_qat_24_int8_b1_sparse.trt ==25751== Profiling result: Type Time(%) Time Calls Avg Min Max Name GPU activities: 24.15% 771.22ms 12992 59.361us 2.8160us 5.4506ms generatedNativePointwise 23.49% 750.07ms 7504 99.956us 32.579us 2.3475ms trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 13.11% 418.58ms 2800 149.49us 86.535us 731.29us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 7.12% 227.37ms 3920 58.001us 19.490us 1.7740ms void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) 6.66% 212.65ms 2576 82.552us 41.699us 1.1712ms void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) 5.23% 167.15ms 1008 165.83us 103.40us 1.0296ms trt_volta_fp32_icudnn_int8x4_128x64_relu_interior_nn_v1 5.22% 166.60ms 560 297.50us 127.11us 5.3590ms trt_volta_fp32_icudnn_int8x4_128x64_relu_small_nn_v1 2.36% 75.400ms 224 336.61us 236.05us 1.0020ms void cudnn::ops::pooling_fw_4d_kernel, cudnnPoolingMode_t=0, bool=0>(cudnnTensorStruct, float const *, cudnn::ops::pooling_fw_4d_kernel, cudnnPoolingMode_t=0, bool=0>, cudnnTensorStruct*, cudnnPoolingStruct, float, cudnnPoolingStruct, int, cudnn::reduced_divisor, float) 2.10% 66.916ms 112 597.46us 532.20us 1.7369ms trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_large_nt_v1 1.30% 41.653ms 896 46.487us 32.419us 144.24us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) 1.23% 39.306ms 784 50.134us 17.089us 149.71us cuInt8::nchwToNcqhw4(char const *, unsigned int*, int, int, int, int, int, int, float const *, float const *) 1.12% 35.755ms 124 288.35us 928ns 19.597ms [CUDA memcpy HtoD] 0.88% 28.134ms 224 125.60us 82.375us 238.58us void cuResizeLayer::ResizeNearestGenericKernel(float*, cuResizeLayer::ResizeNearestGenericKernel const *, cuResizeLayer::LaunchParams) 0.80% 25.536ms 112 228.00us 221.68us 336.09us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=16, bool=0, bool=0, bool=1, bool=0, bool=1>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=16) 0.71% 22.820ms 672 33.958us 12.609us 2.0256ms cuInt8::nc32hw32ToNcqhw4_block(char4 const *, char4*, int, int, int, int, float const *, float const *) 0.55% 17.408ms 672 25.904us 14.657us 51.620us void cuFillLayer::fill(cuFillLayer::KernelArgs) 0.46% 14.674ms 112 131.02us 121.45us 178.93us trt_volta_scudnn_128x32_relu_interior_nn_v1 0.39% 12.403ms 336 36.912us 33.699us 110.89us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=4, int=256, int=64, char=4, bool=0, bool=0, bool=1, bool=0, bool=1>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) 0.36% 11.544ms 1344 8.5890us 1.9840us 28.194us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) 0.36% 11.389ms 112 101.68us 95.623us 141.16us void fused::fusedConvolutionReluKernel, fused::KpqkPtrWriter, float, float, int=4, int=7, int=2, int=1, int=1, int=1, int=1>(fused::ConvolutionParams) 0.27% 8.7035ms 458 19.003us 1.1200us 81.927us [CUDA memcpy DtoH] 0.26% 8.4329ms 216 39.041us 6.8480us 1.3463ms [CUDA memcpy DtoD] 0.24% 7.7303ms 1344 5.7510us 1.8560us 19.329us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) 0.22% 7.1360ms 336 21.238us 17.441us 57.540us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=1, int=2048, int=1, int=128, char=4, bool=1, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) 0.22% 7.0220ms 112 62.696us 60.293us 88.711us void nvinfer1::rt::cuda::poolCHW_PQT(nvinfer1::rt::cuda::TiledPoolingParams, int) 0.22% 6.9970ms 112 62.473us 57.477us 132.94us cuInt8::ncqhw4ToNc32hw32(char4 const *, char4*, nvinfer1::rt::ReducedDivisor, int, nvinfer1::rt::ReducedDivisor, nvinfer1::rt::ReducedDivisor, int, int, float const *, float const *) 0.18% 5.8340ms 112 52.088us 48.355us 72.070us void fused::fusedConvolutionReluKernel, fused::KpqkPtrWriter, float, float, int=4, int=7, int=8, int=1, int=1, int=1, int=1>(fused::ConvolutionParams) 0.16% 5.1520ms 336 15.333us 5.6320us 37.251us void genericReformat::copyPackedKernel, int=5>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, void const *, int, int, int, float const *, void*, void const *, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, void const *, int, int, int, float const , int=5) 0.15% 4.7297ms 112 42.229us 40.100us 121.32us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=4, int=256, int=64, char=4, bool=1, bool=0, bool=1, bool=0, bool=1>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) 0.13% 4.1140ms 112 36.732us 34.659us 101.96us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=4, int=256, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=1>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) 0.13% 4.0091ms 10 400.91us 1.3440us 2.6686ms [CUDA memset] 0.07% 2.1084ms 672 3.1370us 1.6960us 7.0400us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) 0.07% 2.0914ms 672 3.1120us 1.7930us 5.7930us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) 0.05% 1.5643ms 336 4.6550us 2.6880us 9.5360us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) 0.03% 1.1095ms 336 3.3020us 2.1120us 6.4330us void cuCastLayer::cast(float*, int, int const *, int, float const *, float const , int, int, int, int, bool, nvinfer1::rt::ReducedDivisor, nvinfer1::rt, nvinfer1::rt, int) API calls: 27.06% 2.62058s 10 262.06ms 9.9200us 2.62040s cudaMemGetInfo 20.77% 2.01150s 16 125.72ms 2.5600us 2.01077s cudaStreamCreateWithFlags 18.97% 1.83720s 112 16.404ms 13.974ms 28.470ms cudaEventSynchronize 14.65% 1.41842s 51 27.812ms 1.7920us 936.87ms cudaFree 6.94% 671.75ms 199 3.3756ms 7.2000us 91.680ms cuModuleUnload 5.92% 573.56ms 28560 20.082us 11.617us 2.6013ms cudaLaunchKernel 2.54% 246.10ms 12992 18.942us 11.872us 1.1696ms cuLaunchKernel 1.73% 167.23ms 116 1.4416ms 936.57us 9.3130ms cuModuleLoadData 0.50% 48.334ms 792 61.027us 11.168us 19.727ms cudaMemcpyAsync 0.38% 36.548ms 39 937.13us 7.4240us 9.7707ms cudaMalloc 0.16% 15.503ms 22522 688ns 288ns 97.701us cudaGetLastError 0.11% 10.917ms 785 13.907us 4.1280us 219.09us cudaEventRecord 0.07% 6.3573ms 7504 847ns 448ns 53.859us cudaPeekAtLastError 0.06% 5.3570ms 784 6.8320us 1.8240us 57.635us cudaEventElapsedTime 0.03% 2.6011ms 225 11.560us 3.9040us 746.09us cudaStreamWaitEvent 0.02% 1.8308ms 5 366.16us 15.905us 1.6572ms cudaMallocHost 0.01% 1.4453ms 14 103.23us 4.7360us 1.3135ms cudaStreamSynchronize 0.01% 1.2439ms 480 2.5910us 1.5040us 38.082us cudaFuncSetAttribute 0.01% 1.2147ms 505 2.4050us 1.5680us 51.747us cudaEventCreateWithFlags 0.01% 1.1928ms 2 596.42us 567.01us 625.83us cudaHostAlloc 0.01% 1.0127ms 505 2.0050us 1.2480us 86.053us cudaEventDestroy 0.01% 904.46us 473 1.9120us 544ns 63.875us cuDeviceGetAttribute 0.01% 779.76us 6 129.96us 63.171us 254.70us cudaMemcpy 0.01% 641.48us 207 3.0980us 960ns 49.955us cudaDeviceGetAttribute 0.00% 462.33us 10 46.233us 10.656us 162.09us cudaMemsetAsync 0.00% 394.65us 1 394.65us 394.65us 394.65us cudaLaunchHostFunc 0.00% 389.27us 11 35.388us 20.417us 72.836us cudaGetDeviceProperties 0.00% 300.50us 2 150.25us 63.107us 237.39us cudaFreeHost 0.00% 245.90us 30 8.1960us 2.6240us 37.698us cudaStreamDestroy 0.00% 243.98us 116 2.1030us 1.2160us 26.562us cuModuleGetFunction 0.00% 133.35us 6 22.225us 13.025us 32.289us cudaStreamCreate 0.00% 113.58us 5 22.715us 8.2560us 38.370us cuDeviceTotalMem 0.00% 107.33us 18 5.9620us 2.8160us 13.505us cudaDeviceSynchronize 0.00% 97.413us 2 48.706us 43.586us 53.827us cudaCreateTextureObject 0.00% 68.388us 16 4.2740us 1.4720us 15.681us cudaGetDevice 0.00% 67.140us 2 33.570us 19.745us 47.395us cudaSetDevice 0.00% 33.954us 2 16.977us 6.2730us 27.681us cudaDestroyTextureObject 0.00% 26.819us 4 6.7040us 4.1930us 7.6810us cuInit 0.00% 25.409us 8 3.1760us 2.6240us 4.5760us cudaStreamCreateWithPriority 0.00% 20.193us 7 2.8840us 544ns 13.825us cudaGetDeviceCount 0.00% 17.089us 2 8.5440us 8.1290us 8.9600us cudaHostGetDevicePointer 0.00% 16.481us 4 4.1200us 1.9520us 6.1440us cuDriverGetVersion 0.00% 15.200us 7 2.1710us 1.2800us 3.5840us cuDeviceGetCount 0.00% 10.240us 5 2.0480us 1.2800us 2.4320us cuDeviceGetName 0.00% 9.0570us 6 1.5090us 929ns 2.2080us cuDeviceGet 0.00% 8.6410us 2 4.3200us 2.6880us 5.9530us cudaDeviceGetStreamPriorityRange 0.00% 6.1120us 5 1.2220us 992ns 1.6320us cuDeviceGetUuid 0.00% 3.9040us 5 780ns 544ns 1.0560us cudaRuntimeGetVersion 0.00% 2.6240us 2 1.3120us 928ns 1.6960us cudaDriverGetVersion 0.00% 2.2400us 1 2.2400us 2.2400us 2.2400us cuDevicePrimaryCtxRelease 0.00% 2.1760us 2 1.0880us 672ns 1.5040us cudaCreateChannelDesc ==25751== NVTX result: ==25751== Thread "" (id = 2239747200) ==25751== Domain "TensorRT" ==25751== Range "(Unnamed Layer* 2154) [Shuffle]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 165.55us 112 1.4780us 1.1200us 2.4650us (Unnamed Layer* 2154) [Shuffle] No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "(Unnamed Layer* 2361) [Shuffle]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 192.56us 112 1.7190us 1.0240us 25.858us (Unnamed Layer* 2361) [Shuffle] No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "(Unnamed Layer* 2568) [Shuffle]" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 192.78us 112 1.7210us 1.0240us 21.505us (Unnamed Layer* 2568) [Shuffle] No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "2157 copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 7.8724ms 112 70.289us 44.835us 131.14us 2157 copy GPU activities: 100.00% 4.3091ms 112 38.473us 36.450us 110.89us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=4, int=256, int=64, char=4, bool=0, bool=0, bool=1, bool=0, bool=1>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 3.1244ms 112 27.896us 17.889us 80.836us cudaLaunchKernel ==25751== Range "2167 copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 7.2340ms 112 64.589us 39.490us 122.57us 2167 copy GPU activities: 100.00% 3.9976ms 112 35.692us 33.699us 107.82us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=4, int=256, int=64, char=4, bool=0, bool=0, bool=1, bool=0, bool=1>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.8091ms 112 25.081us 15.009us 61.220us cudaLaunchKernel ==25751== Range "2177 copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 7.1025ms 112 63.415us 38.083us 158.15us 2177 copy GPU activities: 100.00% 4.0959ms 112 36.570us 34.754us 107.85us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=4, int=256, int=64, char=4, bool=0, bool=0, bool=1, bool=0, bool=1>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.7913ms 112 24.922us 15.232us 78.213us cudaLaunchKernel ==25751== Range "3991 copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.1106ms 112 36.701us 24.353us 213.07us 3991 copy GPU activities: 100.00% 520.23us 112 4.6440us 4.1280us 6.4960us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) API calls: 100.00% 2.4039ms 112 21.463us 15.361us 122.28us cudaLaunchKernel ==25751== Range "3992 copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.6649ms 112 23.793us 19.201us 83.141us 3992 copy GPU activities: 100.00% 255.96us 112 2.2850us 2.1440us 3.6170us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) API calls: 100.00% 1.8140ms 112 16.196us 13.409us 35.074us cudaLaunchKernel ==25751== Range "4050 copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.4317ms 112 30.639us 22.881us 76.484us 4050 copy GPU activities: 100.00% 1.3381ms 112 11.947us 10.753us 15.361us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) API calls: 100.00% 2.1818ms 112 19.480us 14.529us 41.250us cudaLaunchKernel ==25751== Range "4061 copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.6391ms 112 23.563us 18.881us 43.490us 4061 copy GPU activities: 100.00% 1.4116ms 112 12.603us 11.169us 15.233us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) API calls: 100.00% 1.8704ms 112 16.699us 13.088us 35.202us cudaLaunchKernel ==25751== Range "4066 copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.7626ms 112 24.666us 18.914us 87.237us 4066 copy GPU activities: 100.00% 2.0710ms 112 18.491us 16.865us 21.506us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) API calls: 100.00% 1.9786ms 112 17.666us 13.185us 80.773us cudaLaunchKernel ==25751== Range "4074 copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.3996ms 112 30.353us 19.362us 172.39us 4074 copy GPU activities: 100.00% 2.5293ms 112 22.583us 20.769us 28.194us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) API calls: 100.00% 2.4449ms 112 21.829us 12.961us 164.97us cudaLaunchKernel ==25751== Range "4118 copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1076ms 112 27.746us 20.513us 86.917us 4118 copy GPU activities: 100.00% 465.98us 112 4.1600us 3.7760us 7.0400us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) API calls: 100.00% 1.9147ms 112 17.095us 13.825us 53.315us cudaLaunchKernel ==25751== Range "4119 copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.7472ms 112 24.528us 19.170us 66.596us 4119 copy GPU activities: 100.00% 217.45us 112 1.9410us 1.8240us 2.8800us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) API calls: 100.00% 1.9392ms 112 17.314us 13.281us 60.324us cudaLaunchKernel ==25751== Range "4177 copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.8930ms 112 25.830us 20.193us 117.70us 4177 copy GPU activities: 100.00% 699.61us 112 6.2460us 5.2490us 8.1930us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) API calls: 100.00% 1.8224ms 112 16.271us 13.313us 43.715us cudaLaunchKernel ==25751== Range "4188 copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.8209ms 112 25.186us 18.913us 214.83us 4188 copy GPU activities: 100.00% 429.12us 112 3.8310us 3.4880us 5.3440us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) API calls: 100.00% 2.0753ms 112 18.529us 13.024us 206.70us cudaLaunchKernel ==25751== Range "4193 copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.9501ms 112 26.340us 18.657us 158.09us 4193 copy GPU activities: 100.00% 646.26us 112 5.7700us 5.0560us 7.4890us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) API calls: 100.00% 2.1344ms 112 19.057us 12.897us 149.87us cudaLaunchKernel ==25751== Range "4201 copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.5617ms 112 40.729us 19.393us 222.35us 4201 copy GPU activities: 100.00% 951.40us 112 8.4940us 7.6800us 11.585us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) API calls: 100.00% 3.7710ms 112 33.669us 12.737us 213.61us cudaLaunchKernel ==25751== Range "4245 copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.4523ms 112 30.823us 20.065us 175.53us 4245 copy GPU activities: 100.00% 438.46us 112 3.9140us 3.6160us 5.5050us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) API calls: 100.00% 2.5282ms 112 22.573us 13.664us 166.89us cudaLaunchKernel ==25751== Range "4246 copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0559ms 112 27.285us 19.009us 177.93us 4246 copy GPU activities: 100.00% 210.33us 112 1.8770us 1.6960us 3.3290us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) API calls: 100.00% 2.2082ms 112 19.715us 13.121us 169.13us cudaLaunchKernel ==25751== Range "4304 copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.8623ms 112 25.556us 19.681us 71.588us 4304 copy GPU activities: 100.00% 476.99us 112 4.2580us 3.9360us 5.6320us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) API calls: 100.00% 1.9962ms 112 17.823us 12.929us 64.932us cudaLaunchKernel ==25751== Range "4315 copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.8586ms 112 25.522us 18.753us 133.74us 4315 copy GPU activities: 100.00% 240.02us 112 2.1420us 1.9840us 3.4240us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) API calls: 100.00% 2.0533ms 112 18.333us 12.992us 125.83us cudaLaunchKernel ==25751== Range "4320 copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.2203ms 112 28.752us 18.785us 123.14us 4320 copy GPU activities: 100.00% 270.27us 112 2.4130us 2.2720us 3.7770us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) API calls: 100.00% 2.4461ms 112 21.840us 13.025us 117.16us cudaLaunchKernel ==25751== Range "4328 copy" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.2410ms 112 37.866us 19.009us 196.43us 4328 copy GPU activities: 100.00% 480.75us 112 4.2920us 3.8080us 6.2730us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) API calls: 100.00% 3.3682ms 112 30.073us 12.737us 187.50us cudaLaunchKernel ==25751== Range "4339" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 274.00us 112 2.4460us 1.2480us 25.730us 4339 No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "4353" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 121.86us 112 1.0880us 320ns 30.434us 4353 No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "4367" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 67.108us 112 599ns 320ns 1.0880us 4367 No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "Cast_1848" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.9577ms 112 26.408us 18.049us 159.05us Cast_1848 GPU activities: 100.00% 508.33us 112 4.5380us 3.9050us 6.4330us void cuCastLayer::cast(float*, int, int const *, int, float const *, float const , int, int, int, int, bool, nvinfer1::rt::ReducedDivisor, nvinfer1::rt, nvinfer1::rt, int) API calls: 100.00% 2.1362ms 112 19.073us 13.313us 121.67us cudaLaunchKernel ==25751== Range "Cast_1861" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 208.78us 112 1.8640us 1.1840us 3.2970us Cast_1861 No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "Cast_1947" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3520ms 112 21.000us 16.769us 61.348us Cast_1947 GPU activities: 100.00% 311.60us 112 2.7820us 2.3360us 4.2560us void cuCastLayer::cast(float*, int, int const *, int, float const *, float const , int, int, int, int, bool, nvinfer1::rt::ReducedDivisor, nvinfer1::rt, nvinfer1::rt, int) API calls: 100.00% 1.7628ms 112 15.739us 12.513us 56.131us cudaLaunchKernel ==25751== Range "Cast_1960" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 179.83us 112 1.6050us 992ns 18.849us Cast_1960 No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "Cast_2046" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1985ms 112 28.558us 16.257us 217.16us Cast_2046 GPU activities: 100.00% 289.56us 112 2.5850us 2.1120us 4.6730us void cuCastLayer::cast(float*, int, int const *, int, float const *, float const , int, int, int, int, bool, nvinfer1::rt::ReducedDivisor, nvinfer1::rt, nvinfer1::rt, int) API calls: 100.00% 2.5544ms 112 22.806us 12.257us 210.16us cudaLaunchKernel ==25751== Range "Cast_2059" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 163.14us 112 1.4560us 1.0240us 4.1600us Cast_2059 No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "Conv_1794" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.3755ms 112 30.138us 22.945us 85.093us Conv_1794 GPU activities: 100.00% 14.674ms 112 131.02us 121.45us 178.93us trt_volta_scudnn_128x32_relu_interior_nn_v1 API calls: 100.00% 2.5231ms 112 22.527us 16.801us 76.324us cudaLaunchKernel ==25751== Range "Conv_1893" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1848ms 112 28.435us 21.441us 67.268us Conv_1893 GPU activities: 100.00% 11.389ms 112 101.68us 95.623us 141.16us void fused::fusedConvolutionReluKernel, fused::KpqkPtrWriter, float, float, int=4, int=7, int=2, int=1, int=1, int=1, int=1>(fused::ConvolutionParams) API calls: 100.00% 2.2587ms 112 20.167us 15.681us 47.682us cudaLaunchKernel ==25751== Range "Conv_1992" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.3975ms 112 30.334us 22.081us 118.06us Conv_1992 GPU activities: 100.00% 5.8340ms 112 52.088us 48.355us 72.070us void fused::fusedConvolutionReluKernel, fused::KpqkPtrWriter, float, float, int=4, int=7, int=8, int=1, int=1, int=1, int=1>(fused::ConvolutionParams) API calls: 100.00% 2.5053ms 112 22.368us 15.329us 104.61us cudaLaunchKernel ==25751== Range "ExecutionContext::enqueue" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.75164s 224 12.284ms 11.127ms 27.632ms ExecutionContext::enqueue GPU activities: 24.56% 771.22ms 12992 59.361us 2.8160us 5.4506ms generatedNativePointwise 23.89% 750.07ms 7504 99.956us 32.579us 2.3475ms trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 13.33% 418.58ms 2800 149.49us 86.535us 731.29us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 7.24% 227.37ms 3920 58.001us 19.490us 1.7740ms void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) 6.77% 212.65ms 2576 82.552us 41.699us 1.1712ms void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) 5.32% 167.15ms 1008 165.83us 103.40us 1.0296ms trt_volta_fp32_icudnn_int8x4_128x64_relu_interior_nn_v1 5.31% 166.60ms 560 297.50us 127.11us 5.3590ms trt_volta_fp32_icudnn_int8x4_128x64_relu_small_nn_v1 2.40% 75.400ms 224 336.61us 236.05us 1.0020ms void cudnn::ops::pooling_fw_4d_kernel, cudnnPoolingMode_t=0, bool=0>(cudnnTensorStruct, float const *, cudnn::ops::pooling_fw_4d_kernel, cudnnPoolingMode_t=0, bool=0>, cudnnTensorStruct*, cudnnPoolingStruct, float, cudnnPoolingStruct, int, cudnn::reduced_divisor, float) 2.13% 66.916ms 112 597.46us 532.20us 1.7369ms trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_large_nt_v1 1.33% 41.653ms 896 46.487us 32.419us 144.24us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) 1.25% 39.306ms 784 50.134us 17.089us 149.71us cuInt8::nchwToNcqhw4(char const *, unsigned int*, int, int, int, int, int, int, float const *, float const *) 0.90% 28.134ms 224 125.60us 82.375us 238.58us void cuResizeLayer::ResizeNearestGenericKernel(float*, cuResizeLayer::ResizeNearestGenericKernel const *, cuResizeLayer::LaunchParams) 0.81% 25.536ms 112 228.00us 221.68us 336.09us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=16, bool=0, bool=0, bool=1, bool=0, bool=1>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=16) 0.73% 22.820ms 672 33.958us 12.609us 2.0256ms cuInt8::nc32hw32ToNcqhw4_block(char4 const *, char4*, int, int, int, int, float const *, float const *) 0.55% 17.408ms 672 25.904us 14.657us 51.620us void cuFillLayer::fill(cuFillLayer::KernelArgs) 0.47% 14.674ms 112 131.02us 121.45us 178.93us trt_volta_scudnn_128x32_relu_interior_nn_v1 0.39% 12.403ms 336 36.912us 33.699us 110.89us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=4, int=256, int=64, char=4, bool=0, bool=0, bool=1, bool=0, bool=1>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) 0.37% 11.544ms 1344 8.5890us 1.9840us 28.194us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) 0.36% 11.389ms 112 101.68us 95.623us 141.16us void fused::fusedConvolutionReluKernel, fused::KpqkPtrWriter, float, float, int=4, int=7, int=2, int=1, int=1, int=1, int=1>(fused::ConvolutionParams) 0.25% 7.7303ms 1344 5.7510us 1.8560us 19.329us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) 0.23% 7.1360ms 336 21.238us 17.441us 57.540us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=1, int=2048, int=1, int=128, char=4, bool=1, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) 0.22% 7.0220ms 112 62.696us 60.293us 88.711us void nvinfer1::rt::cuda::poolCHW_PQT(nvinfer1::rt::cuda::TiledPoolingParams, int) 0.22% 6.9970ms 112 62.473us 57.477us 132.94us cuInt8::ncqhw4ToNc32hw32(char4 const *, char4*, nvinfer1::rt::ReducedDivisor, int, nvinfer1::rt::ReducedDivisor, nvinfer1::rt::ReducedDivisor, int, int, float const *, float const *) 0.19% 5.8340ms 112 52.088us 48.355us 72.070us void fused::fusedConvolutionReluKernel, fused::KpqkPtrWriter, float, float, int=4, int=7, int=8, int=1, int=1, int=1, int=1>(fused::ConvolutionParams) 0.16% 5.1520ms 336 15.333us 5.6320us 37.251us void genericReformat::copyPackedKernel, int=5>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, void const *, int, int, int, float const *, void*, void const *, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, void const *, int, int, int, float const , int=5) 0.15% 4.7297ms 112 42.229us 40.100us 121.32us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=4, int=256, int=64, char=4, bool=1, bool=0, bool=1, bool=0, bool=1>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) 0.13% 4.1140ms 112 36.732us 34.659us 101.96us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=4, int=256, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=1>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) 0.11% 3.4918ms 108 32.331us 6.8800us 280.79us [CUDA memcpy DtoD] 0.07% 2.1084ms 672 3.1370us 1.6960us 7.0400us void genericReformat::copyPackedKernel, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) 0.07% 2.0914ms 672 3.1120us 1.7930us 5.7930us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) 0.05% 1.5643ms 336 4.6550us 2.6880us 9.5360us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) 0.04% 1.1095ms 336 3.3020us 2.1120us 6.4330us void cuCastLayer::cast(float*, int, int const *, int, float const *, float const , int, int, int, int, bool, nvinfer1::rt::ReducedDivisor, nvinfer1::rt, nvinfer1::rt, int) 0.00% 14.370us 10 1.4370us 1.1200us 2.7840us [CUDA memcpy DtoH] 0.00% 5.9840us 5 1.1960us 928ns 2.0800us [CUDA memcpy HtoD] API calls: 69.55% 573.56ms 28560 20.082us 11.617us 2.6013ms cudaLaunchKernel 29.84% 246.10ms 12992 18.942us 11.872us 1.1696ms cuLaunchKernel 0.61% 5.0392ms 123 40.968us 14.497us 813.71us cudaMemcpyAsync ==25751== Range "Expand_1828" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.7881ms 112 24.893us 18.625us 82.373us Expand_1828 GPU activities: 100.00% 454.89us 112 4.0610us 3.6480us 5.7930us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 1.9472ms 112 17.385us 13.281us 53.539us cudaLaunchKernel ==25751== Range "Expand_1832" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4587ms 112 21.952us 15.841us 210.09us Expand_1832 GPU activities: 100.00% 361.12us 112 3.2240us 2.8160us 5.4730us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 1.8069ms 112 16.132us 12.096us 104.61us cudaLaunchKernel ==25751== Range "Expand_1847" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5790ms 112 23.026us 17.121us 142.57us Expand_1847 GPU activities: 100.00% 765.79us 112 6.8370us 6.4000us 9.5360us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 1.9365ms 112 17.289us 12.769us 135.05us cudaLaunchKernel ==25751== Range "Expand_1860" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1287ms 112 27.934us 19.073us 69.412us Expand_1860 GPU activities: 100.00% 696.60us 112 6.2190us 5.6330us 9.2810us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 2.0649ms 112 18.437us 13.473us 62.020us cudaLaunchKernel ==25751== Range "Expand_1927" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4169ms 112 21.579us 16.833us 181.23us Expand_1927 GPU activities: 100.00% 430.31us 112 3.8420us 3.4560us 5.6970us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 1.8261ms 112 16.304us 12.545us 167.85us cudaLaunchKernel ==25751== Range "Expand_1931" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2240ms 112 19.857us 15.809us 44.674us Expand_1931 GPU activities: 100.00% 328.35us 112 2.9310us 2.5920us 4.7680us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 1.6630ms 112 14.848us 11.745us 36.322us cudaLaunchKernel ==25751== Range "Expand_1946" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5207ms 112 22.506us 15.937us 217.01us Expand_1946 GPU activities: 100.00% 454.33us 112 4.0560us 3.6800us 6.0160us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 1.8807ms 112 16.791us 12.225us 210.03us cudaLaunchKernel ==25751== Range "Expand_1959" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4967ms 112 22.291us 17.409us 53.027us Expand_1959 GPU activities: 100.00% 484.68us 112 4.3270us 3.9360us 6.3680us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 1.7627ms 112 15.738us 12.512us 46.786us cudaLaunchKernel ==25751== Range "Expand_2026" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.2683ms 112 29.181us 16.033us 247.02us Expand_2026 GPU activities: 100.00% 301.27us 112 2.6890us 2.3360us 3.6480us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 2.6855ms 112 23.977us 11.905us 239.73us cudaLaunchKernel ==25751== Range "Expand_2030" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5131ms 112 22.438us 15.393us 106.82us Expand_2030 GPU activities: 100.00% 215.51us 112 1.9240us 1.7930us 3.2640us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 1.9959ms 112 17.820us 11.712us 102.21us cudaLaunchKernel ==25751== Range "Expand_2045" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.8974ms 112 25.870us 15.489us 171.98us Expand_2045 GPU activities: 100.00% 344.15us 112 3.0720us 2.6880us 5.0240us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 2.3285ms 112 20.790us 11.745us 167.40us cudaLaunchKernel ==25751== Range "Expand_2058" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.8769ms 112 25.686us 16.545us 239.57us Expand_2058 GPU activities: 100.00% 351.45us 112 3.1370us 2.5920us 5.3450us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 2.2036ms 112 19.674us 12.257us 230.35us cudaLaunchKernel ==25751== Range "MaxPool_977" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.0630ms 112 36.276us 28.162us 128.81us MaxPool_977 GPU activities: 100.00% 7.0220ms 112 62.696us 60.293us 88.711us void nvinfer1::rt::cuda::poolCHW_PQT(nvinfer1::rt::cuda::TiledPoolingParams, int) API calls: 100.00% 2.3264ms 112 20.771us 17.089us 46.659us cudaLaunchKernel ==25751== Range "MaxPool_978" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.1765ms 112 55.147us 40.962us 211.02us MaxPool_978 GPU activities: 100.00% 27.478ms 112 245.34us 236.05us 574.96us void cudnn::ops::pooling_fw_4d_kernel, cudnnPoolingMode_t=0, bool=0>(cudnnTensorStruct, float const *, cudnn::ops::pooling_fw_4d_kernel, cudnnPoolingMode_t=0, bool=0>, cudnnTensorStruct*, cudnnPoolingStruct, float, cudnnPoolingStruct, int, cudnn::reduced_divisor, float) API calls: 100.00% 2.7481ms 112 24.536us 19.617us 55.971us cudaLaunchKernel ==25751== Range "MaxPool_979" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.8849ms 112 25.758us 19.937us 49.411us MaxPool_979 GPU activities: 100.00% 47.923ms 112 427.88us 411.84us 1.0020ms void cudnn::ops::pooling_fw_4d_kernel, cudnnPoolingMode_t=0, bool=0>(cudnnTensorStruct, float const *, cudnn::ops::pooling_fw_4d_kernel, cudnnPoolingMode_t=0, bool=0>, cudnnTensorStruct*, cudnnPoolingStruct, float, cudnnPoolingStruct, int, cudnn::reduced_divisor, float) API calls: 100.00% 1.8555ms 112 16.566us 13.185us 33.602us cudaLaunchKernel ==25751== Range "PWN(PWN(4056 + (Unnamed Layer* 2247) [Shuffle] + Mul_1881, PWN(4343 + (Unnamed Layer* 2250) [Shuffle], Pow_1882)), Mul_1883)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3912ms 112 21.349us 16.417us 68.452us PWN(PWN(4056 + (Unnamed Layer* 2247) [Shuffle] + Mul_1881, PWN(4343 + (Unnamed Layer* 2250) [Shuffle], Pow_1882)), Mul_1883) GPU activities: 100.00% 901.63us 112 8.0500us 7.2000us 11.105us generatedNativePointwise API calls: 100.00% 1.8645ms 112 16.647us 12.833us 63.588us cuLaunchKernel ==25751== Range "PWN(PWN(4183 + (Unnamed Layer* 2454) [Shuffle] + Mul_1980, PWN(4357 + (Unnamed Layer* 2457) [Shuffle], Pow_1981)), Mul_1982)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4530ms 112 21.901us 15.841us 117.42us PWN(PWN(4183 + (Unnamed Layer* 2454) [Shuffle] + Mul_1980, PWN(4357 + (Unnamed Layer* 2457) [Shuffle], Pow_1981)), Mul_1982) GPU activities: 100.00% 484.58us 112 4.3260us 3.9360us 6.2400us generatedNativePointwise API calls: 100.00% 1.9394ms 112 17.316us 12.321us 107.91us cuLaunchKernel ==25751== Range "PWN(PWN(4310 + (Unnamed Layer* 2661) [Shuffle] + Mul_2079, PWN(4371 + (Unnamed Layer* 2664) [Shuffle], Pow_2080)), Mul_2081)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1521ms 112 28.143us 15.457us 166.06us PWN(PWN(4310 + (Unnamed Layer* 2661) [Shuffle] + Mul_2079, PWN(4371 + (Unnamed Layer* 2664) [Shuffle], Pow_2080)), Mul_2081) GPU activities: 100.00% 387.39us 112 3.4580us 3.0080us 4.9600us generatedNativePointwise API calls: 100.00% 2.6149ms 112 23.347us 12.065us 159.88us cuLaunchKernel ==25751== Range "PWN(PWN(PWN(4044 + (Unnamed Layer* 2214) [Shuffle] + Mul_1869, PWN(4046 + (Unnamed Layer* 2217) [Shuffle], Sub_1871)), Add_1872), 4049 + (Unnamed Layer* 2221) [Shuffle] + Mul_1874)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4440ms 112 21.821us 15.617us 89.125us PWN(PWN(PWN(4044 + (Unnamed Layer* 2214) [Shuffle] + Mul_1869, PWN(4046 + (Unnamed Layer* 2217) [Shuffle], Sub_1871)), Add_1872), 4049 + (Unnamed Layer* 2221) [Shuffle] + Mul_1874) GPU activities: 100.00% 830.65us 112 7.4160us 6.6560us 11.329us generatedNativePointwise API calls: 100.00% 1.8709ms 112 16.704us 12.417us 68.612us cuLaunchKernel ==25751== Range "PWN(PWN(PWN(4171 + (Unnamed Layer* 2421) [Shuffle] + Mul_1968, PWN(4173 + (Unnamed Layer* 2424) [Shuffle], Sub_1970)), Add_1971), 4176 + (Unnamed Layer* 2428) [Shuffle] + Mul_1973)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3881ms 112 21.322us 15.841us 95.397us PWN(PWN(PWN(4171 + (Unnamed Layer* 2421) [Shuffle] + Mul_1968, PWN(4173 + (Unnamed Layer* 2424) [Shuffle], Sub_1970)), Add_1971), 4176 + (Unnamed Layer* 2428) [Shuffle] + Mul_1973) GPU activities: 100.00% 500.49us 112 4.4680us 3.9370us 6.6890us generatedNativePointwise API calls: 100.00% 1.9188ms 112 17.132us 12.448us 88.581us cuLaunchKernel ==25751== Range "PWN(PWN(PWN(4298 + (Unnamed Layer* 2628) [Shuffle] + Mul_2067, PWN(4300 + (Unnamed Layer* 2631) [Shuffle], Sub_2069)), Add_2070), 4303 + (Unnamed Layer* 2635) [Shuffle] + Mul_2072)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.7445ms 112 24.504us 15.232us 120.36us PWN(PWN(PWN(4298 + (Unnamed Layer* 2628) [Shuffle] + Mul_2067, PWN(4300 + (Unnamed Layer* 2631) [Shuffle], Sub_2069)), Add_2070), 4303 + (Unnamed Layer* 2635) [Shuffle] + Mul_2072) GPU activities: 100.00% 356.79us 112 3.1850us 2.8160us 4.9280us generatedNativePointwise API calls: 100.00% 2.2876ms 112 20.425us 12.064us 114.25us cuLaunchKernel ==25751== Range "PWN(PWN(Sigmoid_119, Mul_120), Add_121)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.2101ms 112 28.661us 18.561us 61.476us PWN(PWN(Sigmoid_119, Mul_120), Add_121) GPU activities: 100.00% 19.256ms 112 171.93us 160.53us 515.11us generatedNativePointwise API calls: 100.00% 2.4928ms 112 22.256us 14.305us 45.378us cuLaunchKernel ==25751== Range "PWN(PWN(Sigmoid_152, Mul_153), Add_154)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0808ms 112 27.506us 18.433us 116.49us PWN(PWN(Sigmoid_152, Mul_153), Add_154) GPU activities: 100.00% 18.989ms 112 169.54us 160.40us 248.21us generatedNativePointwise API calls: 100.00% 2.3472ms 112 20.957us 14.112us 49.187us cuLaunchKernel ==25751== Range "PWN(PWN(Sigmoid_185, Mul_186), Add_187)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0577ms 112 27.300us 19.297us 88.805us PWN(PWN(Sigmoid_185, Mul_186), Add_187) GPU activities: 100.00% 14.793ms 112 132.08us 126.99us 186.10us generatedNativePointwise API calls: 100.00% 2.3236ms 112 20.746us 14.881us 38.914us cuLaunchKernel ==25751== Range "PWN(PWN(Sigmoid_283, Mul_284), Add_285)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0431ms 112 27.170us 19.361us 61.283us PWN(PWN(Sigmoid_283, Mul_284), Add_285) GPU activities: 100.00% 9.8602ms 112 88.037us 81.543us 124.43us generatedNativePointwise API calls: 100.00% 2.3777ms 112 21.229us 15.233us 55.587us cuLaunchKernel ==25751== Range "PWN(PWN(Sigmoid_316, Mul_317), Add_318)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0797ms 112 27.497us 19.457us 125.86us PWN(PWN(Sigmoid_316, Mul_317), Add_318) GPU activities: 100.00% 9.8580ms 112 88.017us 82.278us 124.75us generatedNativePointwise API calls: 100.00% 2.4505ms 112 21.879us 15.297us 120.36us cuLaunchKernel ==25751== Range "PWN(PWN(Sigmoid_349, Mul_350), Add_351)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1087ms 112 27.755us 19.201us 127.69us PWN(PWN(Sigmoid_349, Mul_350), Add_351) GPU activities: 100.00% 9.8823ms 112 88.234us 82.183us 124.75us generatedNativePointwise API calls: 100.00% 2.3817ms 112 21.265us 15.137us 63.620us cuLaunchKernel ==25751== Range "PWN(PWN(Sigmoid_382, Mul_383), Add_384)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1199ms 112 27.856us 18.721us 120.52us PWN(PWN(Sigmoid_382, Mul_383), Add_384) GPU activities: 100.00% 9.9127ms 112 88.506us 81.830us 124.71us generatedNativePointwise API calls: 100.00% 2.4756ms 112 22.103us 14.656us 114.70us cuLaunchKernel ==25751== Range "PWN(PWN(Sigmoid_415, Mul_416), Add_417)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.8618ms 112 25.551us 17.793us 68.004us PWN(PWN(Sigmoid_415, Mul_416), Add_417) GPU activities: 100.00% 9.9025ms 112 88.415us 82.343us 129.71us generatedNativePointwise API calls: 100.00% 2.2445ms 112 20.040us 13.985us 61.507us cuLaunchKernel ==25751== Range "PWN(PWN(Sigmoid_448, Mul_449), Add_450)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.9424ms 112 26.271us 18.977us 63.171us PWN(PWN(Sigmoid_448, Mul_449), Add_450) GPU activities: 100.00% 9.8550ms 112 87.991us 81.991us 126.51us generatedNativePointwise API calls: 100.00% 2.3291ms 112 20.795us 15.105us 58.499us cuLaunchKernel ==25751== Range "PWN(PWN(Sigmoid_481, Mul_482), Add_483)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.9598ms 112 26.426us 18.145us 59.395us PWN(PWN(Sigmoid_481, Mul_482), Add_483) GPU activities: 100.00% 9.8630ms 112 88.062us 81.830us 124.91us generatedNativePointwise API calls: 100.00% 2.2935ms 112 20.477us 14.368us 53.603us cuLaunchKernel ==25751== Range "PWN(PWN(Sigmoid_514, Mul_515), Add_516)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.9011ms 112 34.830us 18.465us 952.28us PWN(PWN(Sigmoid_514, Mul_515), Add_516) GPU activities: 100.00% 9.8997ms 112 88.389us 83.206us 125.58us generatedNativePointwise API calls: 100.00% 3.2479ms 112 28.999us 14.881us 924.82us cuLaunchKernel ==25751== Range "PWN(PWN(Sigmoid_547, Mul_548), Add_549)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.9464ms 112 26.306us 18.817us 61.891us PWN(PWN(Sigmoid_547, Mul_548), Add_549) GPU activities: 100.00% 7.8489ms 112 70.079us 65.989us 95.464us generatedNativePointwise API calls: 100.00% 2.2872ms 112 20.421us 14.593us 55.491us cuLaunchKernel ==25751== Range "PWN(PWN(Sigmoid_645, Mul_646), Add_647)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.7627ms 112 24.666us 18.241us 42.019us PWN(PWN(Sigmoid_645, Mul_646), Add_647) GPU activities: 100.00% 5.0951ms 112 45.491us 42.435us 64.965us generatedNativePointwise API calls: 100.00% 2.1802ms 112 19.465us 14.560us 33.282us cuLaunchKernel ==25751== Range "PWN(PWN(Sigmoid_678, Mul_679), Add_680)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.8171ms 112 25.152us 18.753us 78.149us PWN(PWN(Sigmoid_678, Mul_679), Add_680) GPU activities: 100.00% 5.1145ms 112 45.665us 41.988us 66.149us generatedNativePointwise API calls: 100.00% 2.2136ms 112 19.764us 14.849us 72.932us cuLaunchKernel ==25751== Range "PWN(PWN(Sigmoid_711, Mul_712), Add_713)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.7274ms 112 24.351us 17.793us 88.869us PWN(PWN(Sigmoid_711, Mul_712), Add_713) GPU activities: 100.00% 5.0850ms 112 45.401us 42.147us 65.477us generatedNativePointwise API calls: 100.00% 2.1339ms 112 19.052us 13.953us 78.788us cuLaunchKernel ==25751== Range "PWN(PWN(Sigmoid_744, Mul_745), Add_746)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5554ms 112 22.816us 17.313us 47.203us PWN(PWN(Sigmoid_744, Mul_745), Add_746) GPU activities: 100.00% 5.0760ms 112 45.321us 41.859us 65.413us generatedNativePointwise API calls: 100.00% 1.9653ms 112 17.547us 13.440us 34.690us cuLaunchKernel ==25751== Range "PWN(PWN(Sigmoid_777, Mul_778), Add_779)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.7840ms 112 24.856us 17.857us 96.965us PWN(PWN(Sigmoid_777, Mul_778), Add_779) GPU activities: 100.00% 5.1228ms 112 45.739us 42.019us 65.157us generatedNativePointwise API calls: 100.00% 2.1097ms 112 18.837us 13.920us 73.348us cuLaunchKernel ==25751== Range "PWN(PWN(Sigmoid_810, Mul_811), Add_812)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5069ms 112 22.383us 17.985us 76.356us PWN(PWN(Sigmoid_810, Mul_811), Add_812) GPU activities: 100.00% 5.1196ms 112 45.710us 42.083us 66.021us generatedNativePointwise API calls: 100.00% 1.9260ms 112 17.196us 14.144us 31.362us cuLaunchKernel ==25751== Range "PWN(PWN(Sigmoid_843, Mul_844), Add_845)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5781ms 112 23.018us 17.281us 74.372us PWN(PWN(Sigmoid_843, Mul_844), Add_845) GPU activities: 100.00% 5.0951ms 112 45.492us 42.083us 64.869us generatedNativePointwise API calls: 100.00% 2.0099ms 112 17.945us 13.697us 69.220us cuLaunchKernel ==25751== Range "PWN(PWN(Sigmoid_876, Mul_877), Add_878)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5301ms 112 22.590us 18.113us 60.676us PWN(PWN(Sigmoid_876, Mul_877), Add_878) GPU activities: 100.00% 5.0818ms 112 45.372us 42.307us 64.997us generatedNativePointwise API calls: 100.00% 1.9933ms 112 17.797us 14.208us 55.971us cuLaunchKernel ==25751== Range "PWN(PWN(Sigmoid_909, Mul_910), Add_911)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5901ms 112 23.126us 17.793us 112.17us PWN(PWN(Sigmoid_909, Mul_910), Add_911) GPU activities: 100.00% 4.4356ms 112 39.603us 36.707us 53.413us generatedNativePointwise API calls: 100.00% 2.0148ms 112 17.989us 14.017us 105.73us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1011, Mul_1012)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4471ms 112 21.849us 16.961us 65.028us PWN(Sigmoid_1011, Mul_1012) GPU activities: 100.00% 2.4094ms 112 21.512us 19.394us 28.290us generatedNativePointwise API calls: 100.00% 1.9405ms 112 17.325us 13.601us 61.316us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1027, Mul_1028)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3552ms 112 21.028us 16.449us 74.276us PWN(Sigmoid_1027, Mul_1028) GPU activities: 100.00% 2.3022ms 112 20.555us 19.041us 27.842us generatedNativePointwise API calls: 100.00% 1.8929ms 112 16.900us 13.313us 70.852us cuLaunchKernel ==25751== Range "PWN(Sigmoid_103, Mul_104)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1343ms 112 27.985us 16.929us 121.26us PWN(Sigmoid_103, Mul_104) GPU activities: 100.00% 9.3095ms 112 83.120us 78.054us 246.93us generatedNativePointwise API calls: 100.00% 2.5002ms 112 22.323us 13.537us 115.66us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1043, Mul_1044)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3977ms 112 21.408us 16.609us 46.499us PWN(Sigmoid_1043, Mul_1044) GPU activities: 100.00% 2.3634ms 112 21.101us 18.817us 29.090us generatedNativePointwise API calls: 100.00% 1.9389ms 112 17.311us 13.472us 42.594us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1059, Mul_1060)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4371ms 112 21.759us 16.897us 68.868us PWN(Sigmoid_1059, Mul_1060) GPU activities: 100.00% 2.3175ms 112 20.692us 18.977us 28.802us generatedNativePointwise API calls: 100.00% 1.9882ms 112 17.751us 13.889us 64.420us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1075, Mul_1076)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4573ms 112 21.940us 16.737us 51.171us PWN(Sigmoid_1075, Mul_1076) GPU activities: 100.00% 2.2842ms 112 20.395us 18.466us 27.618us generatedNativePointwise API calls: 100.00% 1.9534ms 112 17.441us 13.185us 47.139us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1091, Mul_1092)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3845ms 112 21.290us 16.481us 82.629us PWN(Sigmoid_1091, Mul_1092) GPU activities: 100.00% 2.2573ms 112 20.154us 18.337us 26.114us generatedNativePointwise API calls: 100.00% 1.8821ms 112 16.804us 13.249us 77.893us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1107, Mul_1108)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2946ms 112 20.487us 15.617us 51.363us PWN(Sigmoid_1107, Mul_1108) GPU activities: 100.00% 2.2732ms 112 20.296us 18.434us 30.178us generatedNativePointwise API calls: 100.00% 1.8577ms 112 16.586us 12.961us 43.618us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1123, Mul_1124)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2736ms 112 20.299us 16.289us 43.811us PWN(Sigmoid_1123, Mul_1124) GPU activities: 100.00% 2.3503ms 112 20.984us 19.041us 27.234us generatedNativePointwise API calls: 100.00% 1.8251ms 112 16.295us 12.961us 39.171us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1140, Mul_1141)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3841ms 112 21.286us 17.217us 69.380us PWN(Sigmoid_1140, Mul_1141) GPU activities: 100.00% 4.2075ms 112 37.566us 34.626us 51.460us generatedNativePointwise API calls: 100.00% 1.8951ms 112 16.920us 13.761us 31.138us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1156, Mul_1157)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3124ms 112 20.646us 15.744us 67.652us PWN(Sigmoid_1156, Mul_1157) GPU activities: 100.00% 2.0092ms 112 17.939us 16.257us 24.610us generatedNativePointwise API calls: 100.00% 1.8061ms 112 16.125us 12.640us 33.921us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1175, Mul_1176)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4494ms 112 21.869us 16.513us 98.597us PWN(Sigmoid_1175, Mul_1176) GPU activities: 100.00% 4.1806ms 112 37.326us 34.627us 50.564us generatedNativePointwise API calls: 100.00% 1.9774ms 112 17.655us 13.089us 93.061us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1191, Mul_1192)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3829ms 112 21.276us 16.673us 75.588us PWN(Sigmoid_1191, Mul_1192) GPU activities: 100.00% 4.2148ms 112 37.632us 34.563us 50.308us generatedNativePointwise API calls: 100.00% 1.9154ms 112 17.101us 13.568us 70.116us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1207, Mul_1208)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2439ms 112 20.035us 15.808us 38.467us PWN(Sigmoid_1207, Mul_1208) GPU activities: 100.00% 4.1930ms 112 37.437us 34.723us 52.580us generatedNativePointwise API calls: 100.00% 1.8094ms 112 16.155us 12.768us 33.378us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1223, Mul_1224)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.9408ms 112 26.257us 15.264us 767.95us PWN(Sigmoid_1223, Mul_1224) GPU activities: 100.00% 4.1999ms 112 37.499us 34.467us 51.332us generatedNativePointwise API calls: 100.00% 2.5013ms 112 22.332us 12.512us 762.51us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1239, Mul_1240)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2768ms 112 20.328us 15.201us 64.676us PWN(Sigmoid_1239, Mul_1240) GPU activities: 100.00% 4.1765ms 112 37.290us 34.530us 50.084us generatedNativePointwise API calls: 100.00% 1.8128ms 112 16.185us 12.321us 36.162us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1255, Mul_1256)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4076ms 112 21.496us 15.681us 76.132us PWN(Sigmoid_1255, Mul_1256) GPU activities: 100.00% 4.2066ms 112 37.559us 34.499us 50.116us generatedNativePointwise API calls: 100.00% 1.9652ms 112 17.546us 12.769us 72.100us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1271, Mul_1272)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2748ms 112 20.310us 15.393us 63.908us PWN(Sigmoid_1271, Mul_1272) GPU activities: 100.00% 4.2371ms 112 37.831us 34.531us 52.196us generatedNativePointwise API calls: 100.00% 1.8388ms 112 16.417us 12.609us 59.140us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1287, Mul_1288)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2842ms 112 20.394us 15.969us 67.204us PWN(Sigmoid_1287, Mul_1288) GPU activities: 100.00% 4.1887ms 112 37.399us 34.786us 50.884us generatedNativePointwise API calls: 100.00% 1.8220ms 112 16.267us 12.961us 61.859us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1304, Mul_1305)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2618ms 112 20.194us 16.289us 58.980us PWN(Sigmoid_1304, Mul_1305) GPU activities: 100.00% 7.8064ms 112 69.700us 65.733us 95.175us generatedNativePointwise API calls: 100.00% 1.8123ms 112 16.180us 13.057us 54.691us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1320, Mul_1321)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4774ms 112 22.119us 16.385us 71.108us PWN(Sigmoid_1320, Mul_1321) GPU activities: 100.00% 3.6236ms 112 32.354us 29.763us 44.228us generatedNativePointwise API calls: 100.00% 2.0181ms 112 18.018us 13.441us 67.076us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1339, Mul_1340)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4734ms 112 22.084us 16.769us 68.964us PWN(Sigmoid_1339, Mul_1340) GPU activities: 100.00% 7.9530ms 112 71.009us 66.661us 95.368us generatedNativePointwise API calls: 100.00% 1.9765ms 112 17.647us 13.632us 59.556us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1355, Mul_1356)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2391ms 112 19.992us 15.521us 47.779us PWN(Sigmoid_1355, Mul_1356) GPU activities: 100.00% 7.9377ms 112 70.872us 66.373us 95.399us generatedNativePointwise API calls: 100.00% 1.7754ms 112 15.851us 12.544us 38.434us cuLaunchKernel ==25751== Range "PWN(Sigmoid_136, Mul_137)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1251ms 112 27.902us 18.177us 117.57us PWN(Sigmoid_136, Mul_137) GPU activities: 100.00% 9.3439ms 112 83.427us 79.879us 119.21us generatedNativePointwise API calls: 100.00% 2.4946ms 112 22.273us 13.857us 111.59us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1371, Mul_1372)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3261ms 112 20.768us 15.585us 64.100us PWN(Sigmoid_1371, Mul_1372) GPU activities: 100.00% 7.9843ms 112 71.288us 66.725us 95.496us generatedNativePointwise API calls: 100.00% 1.7857ms 112 15.943us 12.641us 43.331us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1387, Mul_1388)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2680ms 112 20.250us 15.585us 56.739us PWN(Sigmoid_1387, Mul_1388) GPU activities: 100.00% 7.9066ms 112 70.594us 65.349us 97.352us generatedNativePointwise API calls: 100.00% 1.7921ms 112 16.001us 12.641us 53.091us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1403, Mul_1404)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2868ms 112 20.417us 16.033us 40.354us PWN(Sigmoid_1403, Mul_1404) GPU activities: 100.00% 7.9184ms 112 70.700us 66.757us 96.488us generatedNativePointwise API calls: 100.00% 1.8416ms 112 16.442us 12.801us 35.714us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1419, Mul_1420)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3868ms 112 21.310us 16.289us 73.829us PWN(Sigmoid_1419, Mul_1420) GPU activities: 100.00% 7.8956ms 112 70.496us 65.893us 98.856us generatedNativePointwise API calls: 100.00% 1.9029ms 112 16.990us 13.313us 58.660us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1435, Mul_1436)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.1616ms 112 19.299us 15.201us 37.570us PWN(Sigmoid_1435, Mul_1436) GPU activities: 100.00% 7.7950ms 112 69.598us 65.669us 92.615us generatedNativePointwise API calls: 100.00% 1.7097ms 112 15.265us 12.288us 31.362us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1451, Mul_1452)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.1952ms 112 19.600us 14.880us 44.707us PWN(Sigmoid_1451, Mul_1452) GPU activities: 100.00% 7.7327ms 112 69.042us 64.581us 91.207us generatedNativePointwise API calls: 100.00% 1.7526ms 112 15.647us 11.872us 40.194us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1468, Mul_1469)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3566ms 112 21.041us 16.129us 81.477us PWN(Sigmoid_1468, Mul_1469) GPU activities: 100.00% 13.071ms 112 116.71us 110.09us 160.43us generatedNativePointwise API calls: 100.00% 1.8450ms 112 16.473us 13.025us 42.915us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1484, Mul_1485)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3611ms 112 21.081us 16.961us 40.322us PWN(Sigmoid_1484, Mul_1485) GPU activities: 100.00% 4.1954ms 112 37.459us 34.211us 50.820us generatedNativePointwise API calls: 100.00% 1.8259ms 112 16.302us 13.664us 27.874us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1501, Mul_1502)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3013ms 112 20.547us 15.873us 69.732us PWN(Sigmoid_1501, Mul_1502) GPU activities: 100.00% 4.2156ms 112 37.639us 34.755us 50.084us generatedNativePointwise API calls: 100.00% 1.7700ms 112 15.803us 12.416us 36.386us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1517, Mul_1518)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3901ms 112 21.339us 16.513us 132.65us PWN(Sigmoid_1517, Mul_1518) GPU activities: 100.00% 4.1755ms 112 37.281us 34.498us 52.868us generatedNativePointwise API calls: 100.00% 1.8856ms 112 16.835us 13.121us 102.66us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1533, Mul_1534)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4043ms 112 21.466us 15.617us 73.572us PWN(Sigmoid_1533, Mul_1534) GPU activities: 100.00% 4.2899ms 112 38.302us 35.683us 50.404us generatedNativePointwise API calls: 100.00% 1.9047ms 112 17.006us 12.289us 68.356us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1549, Mul_1550)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2077ms 112 19.711us 15.713us 50.179us PWN(Sigmoid_1549, Mul_1550) GPU activities: 100.00% 4.1806ms 112 37.326us 34.562us 50.404us generatedNativePointwise API calls: 100.00% 1.7281ms 112 15.429us 12.513us 23.522us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1565, Mul_1566)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3077ms 112 20.604us 16.289us 73.476us PWN(Sigmoid_1565, Mul_1566) GPU activities: 100.00% 4.2354ms 112 37.816us 34.819us 50.244us generatedNativePointwise API calls: 100.00% 1.7958ms 112 16.033us 13.024us 37.858us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1581, Mul_1582)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2829ms 112 20.383us 14.849us 61.667us PWN(Sigmoid_1581, Mul_1582) GPU activities: 100.00% 4.1510ms 112 37.062us 34.178us 50.404us generatedNativePointwise API calls: 100.00% 1.8292ms 112 16.332us 11.904us 55.715us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1597, Mul_1598)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.1816ms 112 19.478us 15.008us 33.922us PWN(Sigmoid_1597, Mul_1598) GPU activities: 100.00% 4.1815ms 112 37.334us 34.627us 50.788us generatedNativePointwise API calls: 100.00% 1.7694ms 112 15.798us 12.160us 30.434us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1613, Mul_1614)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2023ms 112 19.663us 15.745us 66.692us PWN(Sigmoid_1613, Mul_1614) GPU activities: 100.00% 4.1802ms 112 37.323us 34.275us 51.684us generatedNativePointwise API calls: 100.00% 1.7154ms 112 15.315us 12.609us 45.315us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1630, Mul_1631)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4484ms 112 21.860us 16.033us 63.332us PWN(Sigmoid_1630, Mul_1631) GPU activities: 100.00% 6.9156ms 112 61.746us 57.573us 83.142us generatedNativePointwise API calls: 100.00% 1.9747ms 112 17.631us 12.961us 57.188us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1646, Mul_1647)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4300ms 112 21.696us 16.097us 73.093us PWN(Sigmoid_1646, Mul_1647) GPU activities: 100.00% 2.3192ms 112 20.707us 18.401us 28.290us generatedNativePointwise API calls: 100.00% 1.9432ms 112 17.350us 12.929us 67.460us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1663, Mul_1664)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2771ms 112 20.330us 16.321us 40.835us PWN(Sigmoid_1663, Mul_1664) GPU activities: 100.00% 2.3832ms 112 21.278us 18.849us 29.187us generatedNativePointwise API calls: 100.00% 1.8139ms 112 16.195us 13.216us 37.666us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1679, Mul_1680)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4206ms 112 21.612us 16.353us 78.468us PWN(Sigmoid_1679, Mul_1680) GPU activities: 100.00% 2.2863ms 112 20.412us 18.817us 26.882us generatedNativePointwise API calls: 100.00% 1.9655ms 112 17.548us 13.121us 75.076us cuLaunchKernel ==25751== Range "PWN(Sigmoid_169, Mul_170)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.9620ms 112 26.446us 18.977us 102.02us PWN(Sigmoid_169, Mul_170) GPU activities: 100.00% 9.4403ms 112 84.288us 80.422us 119.82us generatedNativePointwise API calls: 100.00% 2.3464ms 112 20.950us 15.073us 96.549us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1695, Mul_1696)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.6065ms 112 23.271us 15.489us 138.31us PWN(Sigmoid_1695, Mul_1696) GPU activities: 100.00% 2.4507ms 112 21.881us 19.970us 27.906us generatedNativePointwise API calls: 100.00% 2.0415ms 112 18.227us 12.449us 118.92us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1711, Mul_1712)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3310ms 112 20.812us 16.001us 113.38us PWN(Sigmoid_1711, Mul_1712) GPU activities: 100.00% 2.3348ms 112 20.846us 19.073us 26.626us generatedNativePointwise API calls: 100.00% 1.8751ms 112 16.742us 12.449us 107.97us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1727, Mul_1728)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.2753ms 112 29.243us 15.424us 925.59us PWN(Sigmoid_1727, Mul_1728) GPU activities: 100.00% 2.3254ms 112 20.762us 18.785us 27.778us generatedNativePointwise API calls: 100.00% 2.8474ms 112 25.422us 12.320us 919.73us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1743, Mul_1744)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.6926ms 112 24.041us 14.817us 253.10us PWN(Sigmoid_1743, Mul_1744) GPU activities: 100.00% 2.2959ms 112 20.499us 18.561us 30.722us generatedNativePointwise API calls: 100.00% 2.2469ms 112 20.061us 11.873us 246.70us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1759, Mul_1760)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3577ms 112 21.050us 15.905us 122.22us PWN(Sigmoid_1759, Mul_1760) GPU activities: 100.00% 2.2961ms 112 20.500us 18.625us 28.450us generatedNativePointwise API calls: 100.00% 1.9523ms 112 17.431us 12.992us 116.23us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1775, Mul_1776)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3803ms 112 21.252us 15.777us 103.08us PWN(Sigmoid_1775, Mul_1776) GPU activities: 100.00% 2.3182ms 112 20.698us 18.850us 27.171us generatedNativePointwise API calls: 100.00% 1.9114ms 112 17.066us 12.961us 97.926us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1792, Mul_1793)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.2792ms 112 38.207us 15.745us 1.1763ms PWN(Sigmoid_1792, Mul_1793) GPU activities: 100.00% 3.6257ms 112 32.371us 29.730us 44.324us generatedNativePointwise API calls: 100.00% 3.6869ms 112 32.918us 12.513us 1.1696ms cuLaunchKernel ==25751== Range "PWN(Sigmoid_1862)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4260ms 112 21.660us 16.641us 47.555us PWN(Sigmoid_1862) GPU activities: 100.00% 1.9956ms 112 17.818us 16.129us 23.202us generatedNativePointwise API calls: 100.00% 1.9282ms 112 17.215us 13.089us 41.955us cuLaunchKernel ==25751== Range "PWN(Sigmoid_1961)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5493ms 112 22.761us 16.449us 222.22us PWN(Sigmoid_1961) GPU activities: 100.00% 888.49us 112 7.9320us 6.6890us 11.937us generatedNativePointwise API calls: 100.00% 1.9944ms 112 17.807us 12.928us 215.60us cuLaunchKernel ==25751== Range "PWN(Sigmoid_202, Mul_203)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0428ms 112 27.167us 17.473us 146.99us PWN(Sigmoid_202, Mul_203) GPU activities: 100.00% 9.3763ms 112 83.716us 78.502us 247.41us generatedNativePointwise API calls: 100.00% 2.4392ms 112 21.778us 14.049us 138.98us cuLaunchKernel ==25751== Range "PWN(Sigmoid_2060)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1865ms 112 28.450us 15.265us 797.52us PWN(Sigmoid_2060) GPU activities: 100.00% 388.16us 112 3.4650us 3.1370us 5.4410us generatedNativePointwise API calls: 100.00% 2.7515ms 112 24.567us 12.289us 792.85us cuLaunchKernel ==25751== Range "PWN(Sigmoid_219, Mul_220)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0408ms 112 27.149us 18.593us 88.005us PWN(Sigmoid_219, Mul_220) GPU activities: 100.00% 18.114ms 112 161.73us 155.69us 231.79us generatedNativePointwise API calls: 100.00% 2.4317ms 112 21.711us 14.849us 84.197us cuLaunchKernel ==25751== Range "PWN(Sigmoid_235, Mul_236)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.9529ms 112 26.365us 18.113us 44.930us PWN(Sigmoid_235, Mul_236) GPU activities: 100.00% 14.867ms 112 132.74us 124.14us 181.33us generatedNativePointwise API calls: 100.00% 2.3125ms 112 20.647us 14.401us 37.538us cuLaunchKernel ==25751== Range "PWN(Sigmoid_251, Mul_252)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0702ms 112 27.412us 18.081us 105.67us PWN(Sigmoid_251, Mul_252) GPU activities: 100.00% 6.7999ms 112 60.713us 55.237us 83.079us generatedNativePointwise API calls: 100.00% 2.4698ms 112 22.051us 14.657us 100.84us cuLaunchKernel ==25751== Range "PWN(Sigmoid_267, Mul_268)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.9745ms 112 26.558us 17.345us 62.563us PWN(Sigmoid_267, Mul_268) GPU activities: 100.00% 7.9719ms 112 71.177us 65.638us 95.336us generatedNativePointwise API calls: 100.00% 2.3725ms 112 21.183us 13.985us 56.227us cuLaunchKernel ==25751== Range "PWN(Sigmoid_300, Mul_301)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.9647ms 112 26.470us 18.017us 139.34us PWN(Sigmoid_300, Mul_301) GPU activities: 100.00% 7.9769ms 112 71.221us 66.662us 95.336us generatedNativePointwise API calls: 100.00% 2.3507ms 112 20.988us 14.145us 133.32us cuLaunchKernel ==25751== Range "PWN(Sigmoid_333, Mul_334)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.9627ms 112 26.452us 18.593us 93.221us PWN(Sigmoid_333, Mul_334) GPU activities: 100.00% 7.9657ms 112 71.122us 66.661us 95.879us generatedNativePointwise API calls: 100.00% 2.3399ms 112 20.892us 14.657us 85.541us cuLaunchKernel ==25751== Range "PWN(Sigmoid_366, Mul_367)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.7661ms 112 24.697us 17.857us 38.659us PWN(Sigmoid_366, Mul_367) GPU activities: 100.00% 8.0011ms 112 71.438us 66.693us 96.039us generatedNativePointwise API calls: 100.00% 2.1777ms 112 19.443us 14.337us 33.218us cuLaunchKernel ==25751== Range "PWN(Sigmoid_399, Mul_400)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.9593ms 112 26.422us 19.169us 104.55us PWN(Sigmoid_399, Mul_400) GPU activities: 100.00% 8.0116ms 112 71.532us 66.885us 96.743us generatedNativePointwise API calls: 100.00% 2.3784ms 112 21.235us 15.073us 98.949us cuLaunchKernel ==25751== Range "PWN(Sigmoid_432, Mul_433)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.8750ms 112 25.669us 19.041us 87.301us PWN(Sigmoid_432, Mul_433) GPU activities: 100.00% 7.9708ms 112 71.167us 66.566us 93.447us generatedNativePointwise API calls: 100.00% 2.2890ms 112 20.437us 15.041us 81.253us cuLaunchKernel ==25751== Range "PWN(Sigmoid_465, Mul_466)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.9263ms 112 26.127us 17.665us 76.068us PWN(Sigmoid_465, Mul_466) GPU activities: 100.00% 7.9438ms 112 70.926us 65.925us 99.240us generatedNativePointwise API calls: 100.00% 2.2867ms 112 20.417us 13.985us 62.115us cuLaunchKernel ==25751== Range "PWN(Sigmoid_498, Mul_499)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.8317ms 112 25.282us 17.121us 58.627us PWN(Sigmoid_498, Mul_499) GPU activities: 100.00% 8.0164ms 112 71.575us 66.501us 95.111us generatedNativePointwise API calls: 100.00% 2.2024ms 112 19.663us 13.825us 43.586us cuLaunchKernel ==25751== Range "PWN(Sigmoid_531, Mul_532)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.8654ms 112 25.584us 17.185us 116.23us PWN(Sigmoid_531, Mul_532) GPU activities: 100.00% 7.9922ms 112 71.359us 64.645us 96.487us generatedNativePointwise API calls: 100.00% 2.2499ms 112 20.088us 13.057us 111.08us cuLaunchKernel ==25751== Range "PWN(Sigmoid_55, Mul_56)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.5663ms 112 31.842us 20.289us 102.63us PWN(Sigmoid_55, Mul_56) GPU activities: 100.00% 74.829ms 112 668.12us 605.87us 5.4506ms generatedNativePointwise API calls: 100.00% 2.7098ms 112 24.194us 14.625us 94.694us cuLaunchKernel ==25751== Range "PWN(Sigmoid_564, Mul_565)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0069ms 112 26.847us 17.857us 116.97us PWN(Sigmoid_564, Mul_565) GPU activities: 100.00% 7.8225ms 112 69.843us 64.997us 96.839us generatedNativePointwise API calls: 100.00% 2.3917ms 112 21.354us 14.368us 111.88us cuLaunchKernel ==25751== Range "PWN(Sigmoid_581, Mul_582)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.8175ms 112 25.155us 17.025us 57.251us PWN(Sigmoid_581, Mul_582) GPU activities: 100.00% 13.093ms 112 116.90us 110.28us 160.85us generatedNativePointwise API calls: 100.00% 2.2732ms 112 20.296us 13.729us 52.259us cuLaunchKernel ==25751== Range "PWN(Sigmoid_597, Mul_598)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.7596ms 112 24.639us 18.465us 47.522us PWN(Sigmoid_597, Mul_598) GPU activities: 100.00% 7.8207ms 112 69.827us 65.989us 95.655us generatedNativePointwise API calls: 100.00% 2.1980ms 112 19.625us 14.785us 42.050us cuLaunchKernel ==25751== Range "PWN(Sigmoid_613, Mul_614)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.8027ms 112 25.023us 17.377us 68.132us PWN(Sigmoid_613, Mul_614) GPU activities: 100.00% 3.6621ms 112 32.697us 29.922us 44.355us generatedNativePointwise API calls: 100.00% 2.2483ms 112 20.074us 14.209us 63.076us cuLaunchKernel ==25751== Range "PWN(Sigmoid_629, Mul_630)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.7365ms 112 24.432us 17.569us 85.765us PWN(Sigmoid_629, Mul_630) GPU activities: 100.00% 4.2289ms 112 37.758us 34.594us 50.596us generatedNativePointwise API calls: 100.00% 2.1783ms 112 19.448us 14.048us 80.324us cuLaunchKernel ==25751== Range "PWN(Sigmoid_662, Mul_663)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.7743ms 112 24.770us 17.537us 90.662us PWN(Sigmoid_662, Mul_663) GPU activities: 100.00% 4.2239ms 112 37.713us 34.691us 49.700us generatedNativePointwise API calls: 100.00% 2.2381ms 112 19.982us 14.241us 86.853us cuLaunchKernel ==25751== Range "PWN(Sigmoid_695, Mul_696)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.7420ms 112 24.482us 18.305us 105.45us PWN(Sigmoid_695, Mul_696) GPU activities: 100.00% 4.1802ms 112 37.323us 34.818us 50.724us generatedNativePointwise API calls: 100.00% 2.2086ms 112 19.719us 14.337us 100.13us cuLaunchKernel ==25751== Range "PWN(Sigmoid_71, Mul_72)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1702ms 112 28.305us 18.209us 101.89us PWN(Sigmoid_71, Mul_72) GPU activities: 100.00% 32.284ms 112 288.25us 256.63us 2.4834ms generatedNativePointwise API calls: 100.00% 2.5189ms 112 22.490us 14.496us 96.006us cuLaunchKernel ==25751== Range "PWN(Sigmoid_728, Mul_729)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.7817ms 112 24.836us 18.721us 74.052us PWN(Sigmoid_728, Mul_729) GPU activities: 100.00% 4.1976ms 112 37.478us 34.595us 51.812us generatedNativePointwise API calls: 100.00% 2.2268ms 112 19.881us 14.848us 68.164us cuLaunchKernel ==25751== Range "PWN(Sigmoid_761, Mul_762)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.8463ms 112 25.413us 17.409us 115.21us PWN(Sigmoid_761, Mul_762) GPU activities: 100.00% 4.2047ms 112 37.541us 34.819us 53.220us generatedNativePointwise API calls: 100.00% 2.1631ms 112 19.313us 13.857us 99.398us cuLaunchKernel ==25751== Range "PWN(Sigmoid_794, Mul_795)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5371ms 112 22.652us 16.673us 69.956us PWN(Sigmoid_794, Mul_795) GPU activities: 100.00% 4.1965ms 112 37.469us 34.435us 51.044us generatedNativePointwise API calls: 100.00% 1.9896ms 112 17.763us 13.121us 64.548us cuLaunchKernel ==25751== Range "PWN(Sigmoid_827, Mul_828)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5872ms 112 23.100us 17.889us 44.771us PWN(Sigmoid_827, Mul_828) GPU activities: 100.00% 4.1957ms 112 37.461us 34.915us 49.828us generatedNativePointwise API calls: 100.00% 2.0587ms 112 18.381us 14.401us 39.746us cuLaunchKernel ==25751== Range "PWN(Sigmoid_860, Mul_861)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.6205ms 112 23.397us 18.145us 80.005us PWN(Sigmoid_860, Mul_861) GPU activities: 100.00% 4.2014ms 112 37.512us 35.011us 50.852us generatedNativePointwise API calls: 100.00% 2.0585ms 112 18.379us 14.305us 36.258us cuLaunchKernel ==25751== Range "PWN(Sigmoid_87, Mul_88)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1723ms 112 28.324us 18.209us 87.365us PWN(Sigmoid_87, Mul_88) GPU activities: 100.00% 16.001ms 112 142.87us 110.02us 2.2492ms generatedNativePointwise API calls: 100.00% 2.5251ms 112 22.545us 14.529us 81.796us cuLaunchKernel ==25751== Range "PWN(Sigmoid_893, Mul_894)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5166ms 112 22.469us 16.897us 77.892us PWN(Sigmoid_893, Mul_894) GPU activities: 100.00% 4.2039ms 112 37.534us 35.010us 49.796us generatedNativePointwise API calls: 100.00% 2.0153ms 112 17.994us 13.665us 71.844us cuLaunchKernel ==25751== Range "PWN(Sigmoid_926, Mul_927)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.7041ms 112 24.143us 17.057us 55.203us PWN(Sigmoid_926, Mul_927) GPU activities: 100.00% 4.2209ms 112 37.686us 34.691us 51.364us generatedNativePointwise API calls: 100.00% 2.1232ms 112 18.957us 13.697us 44.130us cuLaunchKernel ==25751== Range "PWN(Sigmoid_943, Mul_944)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3279ms 112 20.784us 16.897us 26.722us PWN(Sigmoid_943, Mul_944) GPU activities: 100.00% 6.8948ms 112 61.561us 57.477us 84.071us generatedNativePointwise API calls: 100.00% 1.8574ms 112 16.584us 13.472us 21.569us cuLaunchKernel ==25751== Range "PWN(Sigmoid_959, Mul_960)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.6318ms 112 23.498us 17.089us 108.20us PWN(Sigmoid_959, Mul_960) GPU activities: 100.00% 4.2081ms 112 37.572us 34.339us 50.948us generatedNativePointwise API calls: 100.00% 2.0740ms 112 18.518us 13.377us 103.24us cuLaunchKernel ==25751== Range "PWN(Sigmoid_975, Mul_976)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5247ms 112 22.542us 17.120us 96.454us PWN(Sigmoid_975, Mul_976) GPU activities: 100.00% 2.0064ms 112 17.914us 16.289us 24.834us generatedNativePointwise API calls: 100.00% 1.9463ms 112 17.378us 13.632us 39.042us cuLaunchKernel ==25751== Range "PWN(Sigmoid_995, Mul_996)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.7350ms 112 24.419us 18.081us 115.17us PWN(Sigmoid_995, Mul_996) GPU activities: 100.00% 4.2091ms 112 37.581us 34.979us 51.941us generatedNativePointwise API calls: 100.00% 2.1748ms 112 19.418us 14.561us 105.77us cuLaunchKernel ==25751== Range "QuantizeLinear_1163_quantize_scale_node_clone_0" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.6610ms 112 41.615us 34.914us 95.078us QuantizeLinear_1163_quantize_scale_node_clone_0 GPU activities: 100.00% 6.3493ms 112 56.689us 52.804us 69.925us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.0184ms 112 18.021us 14.369us 54.211us cudaLaunchKernel ==25751== Range "QuantizeLinear_1163_quantize_scale_node_clone_1" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.0980ms 112 45.517us 35.746us 136.74us QuantizeLinear_1163_quantize_scale_node_clone_1 GPU activities: 100.00% 7.0861ms 112 63.268us 59.909us 77.894us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 1.9908ms 112 17.774us 14.401us 29.634us cudaLaunchKernel ==25751== Range "QuantizeLinear_124_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 7.7887ms 112 69.541us 38.659us 143.59us QuantizeLinear_124_quantize_scale_node GPU activities: 100.00% 13.226ms 112 118.09us 98.472us 1.7740ms void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.8063ms 112 25.056us 15.585us 85.317us cudaLaunchKernel ==25751== Range "QuantizeLinear_1327_quantize_scale_node_clone_0" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.7946ms 112 42.809us 34.434us 221.96us QuantizeLinear_1327_quantize_scale_node_clone_0 GPU activities: 100.00% 10.982ms 112 98.054us 93.351us 129.23us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.0194ms 112 18.030us 14.369us 74.116us cudaLaunchKernel ==25751== Range "QuantizeLinear_1327_quantize_scale_node_clone_1" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.9860ms 112 44.518us 35.394us 106.85us QuantizeLinear_1327_quantize_scale_node_clone_1 GPU activities: 100.00% 12.462ms 112 111.27us 102.50us 139.63us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.0725ms 112 18.504us 14.529us 84.901us cudaLaunchKernel ==25751== Range "QuantizeLinear_1472_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.3388ms 112 47.667us 36.034us 96.518us QuantizeLinear_1472_quantize_scale_node GPU activities: 100.00% 12.234ms 112 109.23us 104.27us 137.10us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.0443ms 112 18.252us 14.401us 46.050us cudaLaunchKernel ==25751== Range "QuantizeLinear_1489_quantize_scale_node_clone_1" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.1401ms 112 45.893us 35.266us 137.48us QuantizeLinear_1489_quantize_scale_node_clone_1 GPU activities: 100.00% 4.2224ms 112 37.700us 35.459us 46.691us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.0957ms 112 18.711us 14.400us 67.363us cudaLaunchKernel ==25751== Range "QuantizeLinear_157_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.4184ms 112 57.307us 37.154us 90.533us QuantizeLinear_157_quantize_scale_node GPU activities: 100.00% 11.835ms 112 105.67us 101.48us 133.26us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.5503ms 112 22.770us 14.753us 44.035us cudaLaunchKernel ==25751== Range "QuantizeLinear_1634_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.1352ms 112 54.778us 38.242us 784.30us QuantizeLinear_1634_quantize_scale_node GPU activities: 100.00% 6.7508ms 112 60.274us 57.061us 75.718us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.0515ms 112 18.316us 15.041us 41.410us cudaLaunchKernel ==25751== Range "QuantizeLinear_1651_quantize_scale_node_clone_1" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.1196ms 112 54.639us 36.450us 1.0081ms QuantizeLinear_1651_quantize_scale_node_clone_1 GPU activities: 100.00% 2.8572ms 112 25.511us 23.522us 32.514us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.8701ms 112 25.626us 14.560us 979.77us cudaLaunchKernel ==25751== Range "QuantizeLinear_255_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 7.4511ms 112 66.528us 37.282us 157.80us QuantizeLinear_255_quantize_scale_node GPU activities: 100.00% 6.7892ms 112 60.618us 57.573us 76.486us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.7164ms 112 24.253us 14.721us 60.579us cudaLaunchKernel ==25751== Range "QuantizeLinear_288_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.5828ms 112 58.775us 37.442us 154.28us QuantizeLinear_288_quantize_scale_node GPU activities: 100.00% 6.6259ms 112 59.160us 55.301us 75.814us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.7334ms 112 24.405us 15.489us 123.46us cudaLaunchKernel ==25751== Range "QuantizeLinear_321_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.5406ms 112 58.398us 36.706us 108.65us QuantizeLinear_321_quantize_scale_node GPU activities: 100.00% 6.5211ms 112 58.223us 54.084us 74.149us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.5763ms 112 23.002us 15.265us 47.042us cudaLaunchKernel ==25751== Range "QuantizeLinear_354_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.2808ms 112 56.078us 35.938us 137.29us QuantizeLinear_354_quantize_scale_node GPU activities: 100.00% 6.5032ms 112 58.064us 54.852us 77.318us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.6070ms 112 23.276us 14.561us 102.28us cudaLaunchKernel ==25751== Range "QuantizeLinear_387_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.6024ms 112 58.949us 36.866us 104.07us QuantizeLinear_387_quantize_scale_node GPU activities: 100.00% 6.6100ms 112 59.017us 55.652us 76.582us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.5963ms 112 23.181us 15.457us 69.572us cudaLaunchKernel ==25751== Range "QuantizeLinear_420_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.2444ms 112 55.753us 36.866us 140.33us QuantizeLinear_420_quantize_scale_node GPU activities: 100.00% 6.5736ms 112 58.693us 54.724us 75.078us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.5358ms 112 22.641us 14.817us 53.251us cudaLaunchKernel ==25751== Range "QuantizeLinear_43_quantize_scale_node_clone_0" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.5699ms 112 58.659us 35.426us 112.93us QuantizeLinear_43_quantize_scale_node_clone_0 GPU activities: 100.00% 2.1699ms 112 19.373us 17.474us 51.140us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=1, int=2048, int=1, int=128, char=4, bool=1, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.6678ms 112 23.819us 14.625us 74.852us cudaLaunchKernel ==25751== Range "QuantizeLinear_43_quantize_scale_node_clone_1" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.7246ms 112 60.041us 36.450us 111.56us QuantizeLinear_43_quantize_scale_node_clone_1 GPU activities: 100.00% 2.1681ms 112 19.358us 17.441us 50.756us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=1, int=2048, int=1, int=128, char=4, bool=1, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.7965ms 112 24.968us 14.689us 71.780us cudaLaunchKernel ==25751== Range "QuantizeLinear_43_quantize_scale_node_clone_2" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 8.1308ms 112 72.596us 46.083us 132.26us QuantizeLinear_43_quantize_scale_node_clone_2 GPU activities: 100.00% 2.7980ms 112 24.981us 22.785us 57.540us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=1, int=2048, int=1, int=128, char=4, bool=1, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 3.2082ms 112 28.645us 18.689us 82.533us cudaLaunchKernel ==25751== Range "QuantizeLinear_43_quantize_scale_node_clone_3" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 11.453ms 112 102.26us 61.060us 215.85us QuantizeLinear_43_quantize_scale_node_clone_3 GPU activities: 100.00% 4.1140ms 112 36.732us 34.659us 101.96us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=4, int=256, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=1>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 3.5573ms 112 31.761us 19.137us 150.60us cudaLaunchKernel ==25751== Range "QuantizeLinear_453_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.6674ms 112 59.530us 35.906us 152.01us QuantizeLinear_453_quantize_scale_node GPU activities: 100.00% 6.6354ms 112 59.244us 55.876us 76.006us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.5793ms 112 23.029us 14.945us 62.436us cudaLaunchKernel ==25751== Range "QuantizeLinear_486_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.0822ms 112 54.305us 36.162us 88.197us QuantizeLinear_486_quantize_scale_node GPU activities: 100.00% 6.5775ms 112 58.727us 55.620us 76.518us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.5316ms 112 22.603us 14.944us 53.699us cudaLaunchKernel ==25751== Range "QuantizeLinear_519_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.5784ms 112 58.735us 36.738us 139.24us QuantizeLinear_519_quantize_scale_node GPU activities: 100.00% 6.6148ms 112 59.061us 54.949us 74.950us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.5680ms 112 22.928us 14.593us 55.331us cudaLaunchKernel ==25751== Range "QuantizeLinear_585_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.3642ms 112 56.823us 36.674us 138.09us QuantizeLinear_585_quantize_scale_node GPU activities: 100.00% 12.422ms 112 110.91us 104.81us 137.74us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.6029ms 112 23.240us 14.689us 92.101us cudaLaunchKernel ==25751== Range "QuantizeLinear_617_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.9231ms 112 52.885us 36.642us 123.43us QuantizeLinear_617_quantize_scale_node GPU activities: 100.00% 4.2316ms 112 37.782us 35.651us 47.491us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.4457ms 112 21.836us 14.785us 67.108us cudaLaunchKernel ==25751== Range "QuantizeLinear_650_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.2508ms 112 55.810us 36.002us 101.73us QuantizeLinear_650_quantize_scale_node GPU activities: 100.00% 4.2067ms 112 37.559us 35.170us 46.628us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.5007ms 112 22.327us 15.041us 60.995us cudaLaunchKernel ==25751== Range "QuantizeLinear_683_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.9892ms 112 53.474us 37.154us 187.98us QuantizeLinear_683_quantize_scale_node GPU activities: 100.00% 4.1138ms 112 36.730us 34.946us 47.172us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.4331ms 112 21.724us 15.105us 124.10us cudaLaunchKernel ==25751== Range "QuantizeLinear_716_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.3337ms 112 56.550us 35.970us 145.70us QuantizeLinear_716_quantize_scale_node GPU activities: 100.00% 4.2237ms 112 37.711us 35.618us 46.915us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.5087ms 112 22.399us 14.817us 121.96us cudaLaunchKernel ==25751== Range "QuantizeLinear_749_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.6592ms 112 50.528us 36.067us 91.045us QuantizeLinear_749_quantize_scale_node GPU activities: 100.00% 4.1462ms 112 37.019us 34.979us 47.428us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.3834ms 112 21.280us 14.881us 49.283us cudaLaunchKernel ==25751== Range "QuantizeLinear_782_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.5730ms 112 49.758us 35.554us 115.78us QuantizeLinear_782_quantize_scale_node GPU activities: 100.00% 4.2015ms 112 37.513us 35.363us 46.563us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.2178ms 112 19.802us 14.849us 37.154us cudaLaunchKernel ==25751== Range "QuantizeLinear_815_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.2920ms 112 47.249us 35.682us 101.48us QuantizeLinear_815_quantize_scale_node GPU activities: 100.00% 4.1131ms 112 36.723us 34.403us 48.196us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.2020ms 112 19.660us 14.752us 42.402us cudaLaunchKernel ==25751== Range "QuantizeLinear_848_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.4321ms 112 48.501us 36.002us 82.021us QuantizeLinear_848_quantize_scale_node GPU activities: 100.00% 4.1745ms 112 37.272us 34.659us 47.652us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.1500ms 112 19.196us 14.817us 39.458us cudaLaunchKernel ==25751== Range "QuantizeLinear_881_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.2753ms 112 47.100us 36.067us 89.574us QuantizeLinear_881_quantize_scale_node GPU activities: 100.00% 4.1357ms 112 36.926us 34.050us 48.643us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.1525ms 112 19.218us 14.720us 39.971us cudaLaunchKernel ==25751== Range "QuantizeLinear_91_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 8.5505ms 112 76.344us 40.802us 184.30us QuantizeLinear_91_quantize_scale_node GPU activities: 100.00% 12.957ms 112 115.69us 100.49us 1.1899ms void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 3.1705ms 112 28.308us 17.377us 121.03us cudaLaunchKernel ==25751== Range "QuantizeLinear_947_quantize_scale_node" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.9895ms 112 53.477us 36.802us 180.49us QuantizeLinear_947_quantize_scale_node GPU activities: 100.00% 7.0067ms 112 62.560us 59.621us 78.278us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.1849ms 112 19.508us 14.529us 95.077us cudaLaunchKernel ==25751== Range "QuantizeLinear_983_quantize_scale_node_clone_0" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.8191ms 112 43.027us 34.530us 128.33us QuantizeLinear_983_quantize_scale_node_clone_0 GPU activities: 100.00% 2.3696ms 112 21.157us 19.490us 28.099us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.0542ms 112 18.341us 14.241us 48.834us cudaLaunchKernel ==25751== Range "QuantizeLinear_983_quantize_scale_node_clone_1" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.9587ms 112 44.274us 35.138us 178.03us QuantizeLinear_983_quantize_scale_node_clone_1 GPU activities: 100.00% 2.3687ms 112 21.149us 19.490us 27.842us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.1393ms 112 19.101us 14.337us 102.92us cudaLaunchKernel ==25751== Range "QuantizeLinear_983_quantize_scale_node_clone_2" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.2317ms 112 46.711us 34.626us 92.485us QuantizeLinear_983_quantize_scale_node_clone_2 GPU activities: 100.00% 2.3725ms 112 21.183us 19.778us 28.131us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.1708ms 112 19.381us 14.241us 64.676us cudaLaunchKernel ==25751== Range "QuantizeLinear_983_quantize_scale_node_clone_3" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.4574ms 112 48.727us 37.218us 99.014us QuantizeLinear_983_quantize_scale_node_clone_3 GPU activities: 100.00% 2.8684ms 112 25.610us 23.906us 32.867us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=4, bool=0, bool=0, bool=1, bool=0, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.2216ms 112 19.835us 14.689us 51.747us cudaLaunchKernel ==25751== Range "Range_1813" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.9168ms 112 52.828us 20.417us 2.6134ms Range_1813 GPU activities: 100.00% 4.5313ms 112 40.458us 39.043us 51.620us void cuFillLayer::fill(cuFillLayer::KernelArgs) API calls: 100.00% 4.9825ms 112 44.486us 15.169us 2.6013ms cudaLaunchKernel ==25751== Range "Range_1817" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.1336ms 112 19.049us 15.393us 60.004us Range_1817 GPU activities: 100.00% 1.8370ms 112 16.402us 15.330us 23.042us void cuFillLayer::fill(cuFillLayer::KernelArgs) API calls: 100.00% 1.7449ms 112 15.579us 12.577us 43.587us cudaLaunchKernel ==25751== Range "Range_1912" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.8308ms 112 25.274us 17.249us 362.49us Range_1912 GPU activities: 100.00% 3.8652ms 112 34.510us 33.378us 44.483us void cuFillLayer::fill(cuFillLayer::KernelArgs) API calls: 100.00% 2.2645ms 112 20.219us 13.728us 337.24us cudaLaunchKernel ==25751== Range "Range_1916" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.1625ms 112 19.308us 15.457us 67.812us Range_1916 GPU activities: 100.00% 1.7214ms 112 15.369us 14.690us 24.386us void cuFillLayer::fill(cuFillLayer::KernelArgs) API calls: 100.00% 1.7565ms 112 15.683us 12.384us 62.019us cudaLaunchKernel ==25751== Range "Range_2011" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4766ms 112 22.112us 16.865us 99.366us Range_2011 GPU activities: 100.00% 3.7726ms 112 33.684us 32.706us 43.044us void cuFillLayer::fill(cuFillLayer::KernelArgs) API calls: 100.00% 1.9783ms 112 17.663us 13.057us 92.710us cudaLaunchKernel ==25751== Range "Range_2015" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4721ms 112 22.072us 15.136us 121.13us Range_2015 GPU activities: 100.00% 1.6802ms 112 15.001us 14.657us 22.657us void cuFillLayer::fill(cuFillLayer::KernelArgs) API calls: 100.00% 2.0711ms 112 18.492us 12.256us 117.61us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_283, Mul_284), Add_285)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 7.7907ms 112 69.560us 46.979us 255.69us Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_283, Mul_284), Add_285) GPU activities: 100.00% 12.492ms 112 111.53us 106.95us 160.08us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 3.0093ms 112 26.868us 18.209us 69.316us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_316, Mul_317), Add_318)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 7.0893ms 112 63.297us 37.794us 105.86us Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_316, Mul_317), Add_318) GPU activities: 100.00% 11.943ms 112 106.63us 101.13us 156.84us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.7194ms 112 24.280us 15.137us 54.883us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_349, Mul_350), Add_351)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.7278ms 112 60.069us 38.467us 141.16us Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_349, Mul_350), Add_351) GPU activities: 100.00% 11.973ms 112 106.90us 101.32us 160.88us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.6519ms 112 23.677us 15.393us 52.707us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_382, Mul_383), Add_384)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.9786ms 112 62.309us 37.890us 174.19us Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_382, Mul_383), Add_384) GPU activities: 100.00% 12.957ms 112 115.69us 100.04us 1.1712ms void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.7987ms 112 24.988us 15.393us 112.77us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_415, Mul_416), Add_417)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.5240ms 112 58.249us 37.538us 132.71us Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_415, Mul_416), Add_417) GPU activities: 100.00% 12.089ms 112 107.94us 101.67us 155.21us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.6339ms 112 23.517us 14.881us 48.003us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_448, Mul_449), Add_450)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.9399ms 112 61.963us 37.698us 114.85us Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_448, Mul_449), Add_450) GPU activities: 100.00% 12.037ms 112 107.47us 102.63us 158.35us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.8296ms 112 25.264us 15.169us 89.798us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_481, Mul_482), Add_483)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.3297ms 112 56.514us 37.250us 118.09us Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_481, Mul_482), Add_483) GPU activities: 100.00% 12.101ms 112 108.05us 102.34us 158.99us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.6106ms 112 23.308us 14.977us 93.446us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_514, Mul_515), Add_516)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.7261ms 112 60.054us 36.802us 143.53us Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_514, Mul_515), Add_516) GPU activities: 100.00% 12.017ms 112 107.30us 99.816us 159.44us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.6617ms 112 23.764us 15.137us 103.37us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_547, Mul_548), Add_549)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.5642ms 112 58.609us 37.218us 148.20us Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_547, Mul_548), Add_549) GPU activities: 100.00% 12.029ms 112 107.40us 102.41us 157.84us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.6690ms 112 23.830us 15.201us 58.084us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_645, Mul_646), Add_647)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 7.2949ms 112 65.133us 37.602us 1.2999ms Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_645, Mul_646), Add_647) GPU activities: 100.00% 7.6755ms 112 68.530us 65.029us 97.640us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 3.6682ms 112 32.752us 15.073us 1.2650ms cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_678, Mul_679), Add_680)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.1890ms 112 55.258us 36.866us 155.91us Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_678, Mul_679), Add_680) GPU activities: 100.00% 7.7593ms 112 69.279us 65.829us 96.679us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.5450ms 112 22.723us 14.913us 70.148us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_711, Mul_712), Add_713)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.0786ms 112 54.273us 37.474us 161.93us Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_711, Mul_712), Add_713) GPU activities: 100.00% 7.7212ms 112 68.939us 65.254us 96.263us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.5753ms 112 22.993us 15.265us 133.48us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_744, Mul_745), Add_746)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.0934ms 112 54.405us 37.666us 206.54us Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_744, Mul_745), Add_746) GPU activities: 100.00% 7.7432ms 112 69.136us 64.837us 101.26us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.3970ms 112 21.401us 15.264us 53.379us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_777, Mul_778), Add_779)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.6917ms 112 50.818us 36.930us 98.309us Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_777, Mul_778), Add_779) GPU activities: 100.00% 7.7052ms 112 68.795us 64.645us 99.592us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.2714ms 112 20.280us 15.105us 43.523us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_810, Mul_811), Add_812)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.7100ms 112 50.982us 36.994us 135.37us Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_810, Mul_811), Add_812) GPU activities: 100.00% 7.7120ms 112 68.857us 65.093us 103.30us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.3009ms 112 20.544us 14.945us 75.973us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_843, Mul_844), Add_845)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.7196ms 112 51.067us 36.674us 145.58us Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_843, Mul_844), Add_845) GPU activities: 100.00% 7.7329ms 112 69.043us 64.901us 97.672us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.3322ms 112 20.823us 15.009us 106.09us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_876, Mul_877), Add_878)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.5371ms 112 49.438us 37.058us 104.17us Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_876, Mul_877), Add_878) GPU activities: 100.00% 7.7013ms 112 68.761us 64.997us 99.528us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.1932ms 112 19.582us 15.105us 42.211us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_909, Mul_910), Add_911)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.1809ms 112 46.257us 36.898us 75.300us Reformatting CopyNode for Input Tensor 0 to PWN(PWN(Sigmoid_909, Mul_910), Add_911) GPU activities: 100.00% 8.0820ms 112 72.160us 65.190us 404.64us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.1859ms 112 19.516us 15.137us 45.475us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(Sigmoid_1156, Mul_1157)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.9459ms 112 53.088us 38.562us 109.00us Reformatting CopyNode for Input Tensor 0 to PWN(Sigmoid_1156, Mul_1157) GPU activities: 100.00% 4.9936ms 112 44.585us 42.564us 61.765us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.2963ms 112 20.502us 14.816us 76.132us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(Sigmoid_1320, Mul_1321)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.8181ms 112 51.947us 38.595us 152.68us Reformatting CopyNode for Input Tensor 0 to PWN(Sigmoid_1320, Mul_1321) GPU activities: 100.00% 7.7218ms 112 68.944us 64.613us 94.343us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.1812ms 112 19.475us 15.073us 57.027us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(Sigmoid_1792, Mul_1793)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.2500ms 112 55.803us 41.091us 252.30us Reformatting CopyNode for Input Tensor 0 to PWN(Sigmoid_1792, Mul_1793) GPU activities: 100.00% 7.7244ms 112 68.968us 65.477us 95.816us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.5672ms 112 22.921us 15.680us 190.25us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(Sigmoid_613, Mul_614)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 6.9375ms 112 61.941us 37.122us 170.22us Reformatting CopyNode for Input Tensor 0 to PWN(Sigmoid_613, Mul_614) GPU activities: 100.00% 7.7362ms 112 69.072us 64.549us 96.776us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.5229ms 112 22.525us 15.233us 47.907us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to PWN(Sigmoid_975, Mul_976)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.2121ms 112 46.536us 36.610us 102.73us Reformatting CopyNode for Input Tensor 0 to PWN(Sigmoid_975, Mul_976) GPU activities: 100.00% 5.0066ms 112 44.701us 41.699us 61.605us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=64, int=64, int=256, char=4, bool=0, bool=1, bool=1, bool=1, bool=0>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 2.0951ms 112 18.706us 14.881us 31.585us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to model.0.conv.conv.module.weight + QuantizeLinear_49_quantize_scale_node + Conv_53" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.8659ms 112 43.445us 27.489us 115.69us Reformatting CopyNode for Input Tensor 0 to model.0.conv.conv.module.weight + QuantizeLinear_49_quantize_scale_node + Conv_53 GPU activities: 100.00% 6.9970ms 112 62.473us 57.477us 132.94us cuInt8::ncqhw4ToNc32hw32(char4 const *, char4*, nvinfer1::rt::ReducedDivisor, int, nvinfer1::rt::ReducedDivisor, nvinfer1::rt::ReducedDivisor, int, int, float const *, float const *) API calls: 100.00% 3.0822ms 112 27.519us 16.641us 100.97us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to model.17.cv3.conv.module.weight + QuantizeLinear_1462_quantize_scale_node + Conv_1466" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.2353ms 112 28.886us 21.569us 74.276us Reformatting CopyNode for Input Tensor 0 to model.17.cv3.conv.module.weight + QuantizeLinear_1462_quantize_scale_node + Conv_1466 GPU activities: 100.00% 3.9025ms 112 34.843us 32.291us 45.924us cuInt8::nc32hw32ToNcqhw4_block(char4 const *, char4*, int, int, int, int, float const *, float const *) API calls: 100.00% 1.9232ms 112 17.171us 13.504us 43.587us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to model.2.m.0.cv2.conv.module.weight + QuantizeLinear_113_quantize_scale_node + Conv_117" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.8156ms 112 34.068us 20.738us 79.525us Reformatting CopyNode for Input Tensor 0 to model.2.m.0.cv2.conv.module.weight + QuantizeLinear_113_quantize_scale_node + Conv_117 GPU activities: 100.00% 6.7287ms 112 60.077us 54.821us 149.71us cuInt8::nchwToNcqhw4(char const *, unsigned int*, int, int, int, int, int, int, float const *, float const *) API calls: 100.00% 2.4924ms 112 22.253us 13.601us 56.867us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to model.2.m.1.cv2.conv.module.weight + QuantizeLinear_146_quantize_scale_node + Conv_150" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.7309ms 112 33.311us 21.153us 69.604us Reformatting CopyNode for Input Tensor 0 to model.2.m.1.cv2.conv.module.weight + QuantizeLinear_146_quantize_scale_node + Conv_150 GPU activities: 100.00% 6.7729ms 112 60.472us 56.676us 75.494us cuInt8::nchwToNcqhw4(char const *, unsigned int*, int, int, int, int, int, int, float const *, float const *) API calls: 100.00% 2.4313ms 112 21.707us 13.665us 58.563us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to model.2.m.2.cv2.conv.module.weight + QuantizeLinear_179_quantize_scale_node + Conv_183" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.0658ms 112 36.301us 20.609us 133.90us Reformatting CopyNode for Input Tensor 0 to model.2.m.2.cv2.conv.module.weight + QuantizeLinear_179_quantize_scale_node + Conv_183 GPU activities: 100.00% 6.6979ms 112 59.802us 56.197us 74.822us cuInt8::nchwToNcqhw4(char const *, unsigned int*, int, int, int, int, int, int, float const *, float const *) API calls: 100.00% 2.6901ms 112 24.018us 13.345us 122.70us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to model.20.cv3.conv.module.weight + QuantizeLinear_1624_quantize_scale_node + Conv_1628" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0735ms 112 27.441us 21.538us 85.284us Reformatting CopyNode for Input Tensor 0 to model.20.cv3.conv.module.weight + QuantizeLinear_1624_quantize_scale_node + Conv_1628 GPU activities: 100.00% 2.2997ms 112 20.532us 18.594us 27.298us cuInt8::nc32hw32ToNcqhw4_block(char4 const *, char4*, int, int, int, int, float const *, float const *) API calls: 100.00% 1.8959ms 112 16.928us 13.473us 56.899us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to model.3.conv.module.weight + QuantizeLinear_229_quantize_scale_node + Conv_233" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 8.8816ms 112 79.299us 49.571us 172.97us Reformatting CopyNode for Input Tensor 0 to model.3.conv.module.weight + QuantizeLinear_229_quantize_scale_node + Conv_233 GPU activities: 100.00% 25.536ms 112 228.00us 221.68us 336.09us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=32, int=64, int=128, char=16, bool=0, bool=0, bool=1, bool=0, bool=1>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=16) API calls: 100.00% 3.0068ms 112 26.846us 19.681us 35.810us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Input Tensor 0 to model.4.cv1.conv.module.weight + QuantizeLinear_245_quantize_scale_node + Conv_249" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.8815ms 112 34.655us 22.273us 105.32us Reformatting CopyNode for Input Tensor 0 to model.4.cv1.conv.module.weight + QuantizeLinear_245_quantize_scale_node + Conv_249 GPU activities: 100.00% 3.8874ms 112 34.708us 32.450us 46.468us cuInt8::nc32hw32ToNcqhw4_block(char4 const *, char4*, int, int, int, int, float const *, float const *) API calls: 100.00% 2.5216ms 112 22.514us 14.337us 90.725us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Output Tensor 0 to PWN(PWN(Sigmoid_185, Mul_186), Add_187)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.5337ms 112 31.550us 19.809us 64.516us Reformatting CopyNode for Output Tensor 0 to PWN(PWN(Sigmoid_185, Mul_186), Add_187) GPU activities: 100.00% 6.6138ms 112 59.051us 54.948us 76.326us cuInt8::nchwToNcqhw4(char const *, unsigned int*, int, int, int, int, int, int, float const *, float const *) API calls: 100.00% 2.3808ms 112 21.257us 13.153us 53.731us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Output Tensor 0 to PWN(PWN(Sigmoid_547, Mul_548), Add_549)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.6631ms 112 32.706us 21.154us 77.860us Reformatting CopyNode for Output Tensor 0 to PWN(PWN(Sigmoid_547, Mul_548), Add_549) GPU activities: 100.00% 3.5763ms 112 31.931us 29.443us 40.100us cuInt8::nchwToNcqhw4(char const *, unsigned int*, int, int, int, int, int, int, float const *, float const *) API calls: 100.00% 2.3618ms 112 21.087us 13.857us 67.268us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Output Tensor 0 to PWN(PWN(Sigmoid_909, Mul_910), Add_911)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.2838ms 112 29.319us 21.761us 114.18us Reformatting CopyNode for Output Tensor 0 to PWN(PWN(Sigmoid_909, Mul_910), Add_911) GPU activities: 100.00% 2.0770ms 112 18.544us 17.089us 24.034us cuInt8::nchwToNcqhw4(char const *, unsigned int*, int, int, int, int, int, int, float const *, float const *) API calls: 100.00% 2.0829ms 112 18.597us 13.824us 86.437us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Output Tensor 0 to PWN(Sigmoid_202, Mul_203)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 5.5529ms 112 49.579us 24.258us 1.0121ms Reformatting CopyNode for Output Tensor 0 to PWN(Sigmoid_202, Mul_203) GPU activities: 100.00% 6.8390ms 112 61.062us 56.676us 148.65us cuInt8::nchwToNcqhw4(char const *, unsigned int*, int, int, int, int, int, int, float const *, float const *) API calls: 100.00% 2.9476ms 112 26.318us 14.849us 368.79us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Output Tensor 0 to PWN(Sigmoid_564, Mul_565)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.5930ms 112 32.080us 21.601us 57.731us Reformatting CopyNode for Output Tensor 0 to PWN(Sigmoid_564, Mul_565) GPU activities: 100.00% 2.2236ms 112 19.853us 18.209us 25.922us cuInt8::nc32hw32ToNcqhw4_block(char4 const *, char4*, int, int, int, int, float const *, float const *) API calls: 100.00% 2.3049ms 112 20.579us 13.249us 46.947us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Output Tensor 0 to PWN(Sigmoid_71, Mul_72)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.9935ms 112 35.656us 23.873us 71.492us Reformatting CopyNode for Output Tensor 0 to PWN(Sigmoid_71, Mul_72) GPU activities: 100.00% 8.9381ms 112 79.804us 57.412us 2.0256ms cuInt8::nc32hw32ToNcqhw4_block(char4 const *, char4*, int, int, int, int, float const *, float const *) API calls: 100.00% 2.5399ms 112 22.677us 14.720us 48.323us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Output Tensor 0 to PWN(Sigmoid_926, Mul_927)" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.5423ms 112 31.627us 21.954us 65.316us Reformatting CopyNode for Output Tensor 0 to PWN(Sigmoid_926, Mul_927) GPU activities: 100.00% 1.5686ms 112 14.005us 12.609us 16.833us cuInt8::nc32hw32ToNcqhw4_block(char4 const *, char4*, int, int, int, int, float const *, float const *) API calls: 100.00% 2.2260ms 112 19.875us 13.537us 44.930us cudaLaunchKernel ==25751== Range "Reformatting CopyNode for Output Tensor 0 to QuantizeLinear_43_quantize_scale_node_clone_3" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 10.197ms 112 91.042us 62.979us 174.63us Reformatting CopyNode for Output Tensor 0 to QuantizeLinear_43_quantize_scale_node_clone_3 GPU activities: 100.00% 4.7297ms 112 42.229us 40.100us 121.32us void CUTENSOR_NAMESPACE::permutationKernelPLC3>, CUTENSOR_NAMESPACE::VectorRead2DTensorView>, CUTENSOR_NAMESPACE::ThreadLevelElementwise, CUTENSOR_NAMESPACE::GeneralBinary, int=2, int=4, int=256, int=64, char=4, bool=1, bool=0, bool=1, bool=0, bool=1>, float>, CUTENSOR_NAMESPACE::ElementwiseRuntimePLC3::Params>(unsigned int=4) API calls: 100.00% 3.4825ms 112 31.093us 20.833us 101.64us cudaLaunchKernel ==25751== Range "Reshape_1808 + Transpose_1809" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.1858ms 112 37.373us 26.818us 60.387us Reshape_1808 + Transpose_1809 GPU activities: 100.00% 3.1822ms 112 28.412us 26.338us 37.251us void genericReformat::copyPackedKernel, int=5>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, void const *, int, int, int, float const *, void*, void const *, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, void const *, int, int, int, float const , int=5) API calls: 100.00% 2.2348ms 112 19.953us 15.617us 33.026us cudaLaunchKernel ==25751== Range "Reshape_1819 + Reshape_1827" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 207.95us 112 1.8560us 1.0880us 3.9690us Reshape_1819 + Reshape_1827 No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "Reshape_1821 + Reshape_1831" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 883.92us 112 7.8920us 736ns 766.86us Reshape_1821 + Reshape_1831 No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "Reshape_1892" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 156.84us 112 1.4000us 992ns 3.5520us Reshape_1892 No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "Reshape_1907 + Transpose_1908" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1758ms 112 28.355us 21.857us 76.516us Reshape_1907 + Transpose_1908 GPU activities: 100.00% 1.2917ms 112 11.532us 10.337us 14.338us void genericReformat::copyPackedKernel, int=5>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, void const *, int, int, int, float const *, void*, void const *, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, void const *, int, int, int, float const , int=5) API calls: 100.00% 1.9517ms 112 17.425us 13.921us 66.436us cudaLaunchKernel ==25751== Range "Reshape_1918 + Reshape_1926" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 168.46us 112 1.5040us 992ns 17.729us Reshape_1918 + Reshape_1926 No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "Reshape_1920 + Reshape_1930" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 124.78us 112 1.1140us 864ns 3.2960us Reshape_1920 + Reshape_1930 No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "Reshape_1991" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 147.82us 112 1.3190us 928ns 3.2960us Reshape_1991 No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "Reshape_2006 + Transpose_2007" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.3892ms 112 30.260us 20.897us 176.72us Reshape_2006 + Transpose_2007 GPU activities: 100.00% 678.07us 112 6.0540us 5.6320us 8.6090us void genericReformat::copyPackedKernel, int=5>(unsigned int, unsigned int, void const *, genericReformat::ArrayN>, genericReformat::ArrayNWithReducedDivisors>, void const *, int, int, int, float const *, void*, void const *, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, void const *, int, int, int, float const , int=5) API calls: 100.00% 2.2001ms 112 19.643us 13.569us 163.53us cudaLaunchKernel ==25751== Range "Reshape_2017 + Reshape_2025" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 168.33us 112 1.5020us 1.0240us 18.721us Reshape_2017 + Reshape_2025 No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "Reshape_2019 + Reshape_2029" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 120.75us 112 1.0780us 640ns 1.9520us Reshape_2019 + Reshape_2029 No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "Reshape_2090" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 148.65us 112 1.3270us 896ns 3.0720us Reshape_2090 No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "Resize_1159" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.2933ms 112 29.404us 21.665us 69.572us Resize_1159 GPU activities: 100.00% 9.6311ms 112 85.991us 82.375us 122.35us void cuResizeLayer::ResizeNearestGenericKernel(float*, cuResizeLayer::ResizeNearestGenericKernel const *, cuResizeLayer::LaunchParams) API calls: 100.00% 2.2894ms 112 20.441us 15.713us 48.227us cudaLaunchKernel ==25751== Range "Resize_1323" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5696ms 112 22.943us 17.889us 71.108us Resize_1323 GPU activities: 100.00% 18.503ms 112 165.21us 158.19us 238.58us void cuResizeLayer::ResizeNearestGenericKernel(float*, cuResizeLayer::ResizeNearestGenericKernel const *, cuResizeLayer::LaunchParams) API calls: 100.00% 1.8093ms 112 16.154us 12.865us 55.299us cudaLaunchKernel ==25751== Range "Slice_14" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.7700ms 112 33.660us 19.170us 79.973us Slice_14 GPU activities: 100.00% 6.0531ms 112 54.045us 49.892us 139.66us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 2.7831ms 112 24.849us 14.560us 72.868us cudaLaunchKernel ==25751== Range "Slice_1867" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5185ms 112 22.486us 16.641us 111.62us Slice_1867 GPU activities: 100.00% 1.2204ms 112 10.896us 9.8890us 14.497us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 1.8072ms 112 16.135us 12.065us 102.28us cudaLaunchKernel ==25751== Range "Slice_1879" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2502ms 112 20.090us 15.328us 77.572us Slice_1879 GPU activities: 100.00% 1.1554ms 112 10.316us 9.2800us 14.114us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 1.6777ms 112 14.979us 11.648us 68.228us cudaLaunchKernel ==25751== Range "Slice_1888" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2381ms 112 19.983us 15.360us 74.916us Slice_1888 GPU activities: 100.00% 1.6456ms 112 14.693us 13.249us 19.329us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 1.6377ms 112 14.622us 11.648us 59.171us cudaLaunchKernel ==25751== Range "Slice_19" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.4313ms 112 30.636us 15.841us 90.949us Slice_19 GPU activities: 100.00% 4.1031ms 112 36.634us 32.643us 102.02us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 2.5615ms 112 22.870us 12.033us 54.275us cudaLaunchKernel ==25751== Range "Slice_1966" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3775ms 112 21.227us 16.353us 98.181us Slice_1966 GPU activities: 100.00% 584.79us 112 5.2210us 4.6410us 7.8400us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 1.7675ms 112 15.781us 11.841us 89.797us cudaLaunchKernel ==25751== Range "Slice_1978" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4242ms 112 21.644us 15.521us 254.64us Slice_1978 GPU activities: 100.00% 462.91us 112 4.1330us 3.7760us 6.2400us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 1.8658ms 112 16.658us 11.777us 248.37us cudaLaunchKernel ==25751== Range "Slice_1987" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.8842ms 112 25.752us 15.393us 794.51us Slice_1987 GPU activities: 100.00% 430.28us 112 3.8410us 3.4560us 5.8560us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 2.3556ms 112 21.031us 11.617us 785.23us cudaLaunchKernel ==25751== Range "Slice_2065" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5097ms 112 22.408us 16.193us 123.30us Slice_2065 GPU activities: 100.00% 222.61us 112 1.9870us 1.8560us 2.9760us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 1.9101ms 112 17.054us 11.936us 113.58us cudaLaunchKernel ==25751== Range "Slice_2077" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.7534ms 112 24.583us 15.745us 112.52us Slice_2077 GPU activities: 100.00% 230.42us 112 2.0570us 1.8880us 3.6800us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 2.1658ms 112 19.337us 11.745us 106.15us cudaLaunchKernel ==25751== Range "Slice_2086" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.9086ms 112 25.969us 15.520us 170.92us Slice_2086 GPU activities: 100.00% 245.18us 112 2.1890us 2.0480us 3.6800us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 2.3325ms 112 20.826us 11.712us 166.92us cudaLaunchKernel ==25751== Range "Slice_24" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.3317ms 112 29.747us 17.249us 79.749us Slice_24 GPU activities: 100.00% 6.1253ms 112 54.690us 49.796us 138.48us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 2.4710ms 112 22.062us 13.024us 71.140us cudaLaunchKernel ==25751== Range "Slice_29" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1580ms 112 28.196us 15.713us 65.059us Slice_29 GPU activities: 100.00% 4.0114ms 112 35.816us 32.706us 102.28us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 2.3746ms 112 21.201us 11.841us 55.043us cudaLaunchKernel ==25751== Range "Slice_34" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1660ms 112 28.268us 16.257us 57.156us Slice_34 GPU activities: 100.00% 6.2865ms 112 56.129us 51.748us 138.03us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 2.4044ms 112 21.467us 12.096us 43.682us cudaLaunchKernel ==25751== Range "Slice_39" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1737ms 112 28.336us 15.681us 66.052us Slice_39 GPU activities: 100.00% 4.0088ms 112 35.792us 32.419us 103.05us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 2.3533ms 112 21.011us 11.969us 61.667us cudaLaunchKernel ==25751== Range "Slice_4" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 7.6245ms 112 68.076us 38.690us 191.79us Slice_4 GPU activities: 100.00% 6.9809ms 112 62.329us 58.052us 144.24us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 5.7260ms 112 51.124us 29.794us 144.36us cudaLaunchKernel ==25751== Range "Slice_9" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.3337ms 112 29.765us 15.841us 116.26us Slice_9 GPU activities: 100.00% 4.0840ms 112 36.464us 32.802us 102.38us void cuSliceLayer::naiveSlice(cuSliceLayer::LaunchParams) API calls: 100.00% 2.4639ms 112 21.999us 11.937us 109.51us cudaLaunchKernel ==25751== Range "Unsqueeze_1833" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 138.37us 112 1.2350us 704ns 28.674us Unsqueeze_1833 No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "Unsqueeze_1834" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 123.91us 112 1.1060us 704ns 2.0160us Unsqueeze_1834 No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "Unsqueeze_1932" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 108.20us 112 966ns 704ns 1.9840us Unsqueeze_1932 No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "Unsqueeze_1933" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 121.64us 112 1.0860us 736ns 2.8480us Unsqueeze_1933 No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "Unsqueeze_2031" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 108.91us 112 972ns 640ns 1.6320us Unsqueeze_2031 No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "Unsqueeze_2032" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 123.88us 112 1.1060us 736ns 3.4880us Unsqueeze_2032 No kernels were profiled in this range. No API activities were profiled in this range. ==25751== Range "model.0.conv.conv.module.weight + QuantizeLinear_49_quantize_scale_node + Conv_53" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.8126ms 112 42.969us 29.090us 86.053us model.0.conv.conv.module.weight + QuantizeLinear_49_quantize_scale_node + Conv_53 GPU activities: 100.00% 66.916ms 112 597.46us 532.20us 1.7369ms trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_large_nt_v1 API calls: 100.00% 3.0444ms 112 27.181us 17.409us 67.780us cudaLaunchKernel ==25751== Range "model.1.conv.module.weight + QuantizeLinear_65_quantize_scale_node + Conv_69" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.5495ms 112 31.691us 19.649us 70.436us model.1.conv.module.weight + QuantizeLinear_65_quantize_scale_node + Conv_69 GPU activities: 100.00% 48.232ms 112 430.64us 365.15us 2.3475ms trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.4786ms 112 22.130us 13.889us 62.884us cudaLaunchKernel ==25751== Range "model.10.conv.module.weight + QuantizeLinear_1150_quantize_scale_node + Conv_1154" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3857ms 112 21.300us 16.929us 49.251us model.10.conv.module.weight + QuantizeLinear_1150_quantize_scale_node + Conv_1154 GPU activities: 100.00% 5.6902ms 112 50.805us 47.588us 72.229us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.7484ms 112 15.610us 12.417us 44.483us cudaLaunchKernel ==25751== Range "model.13.cv1.conv.module.weight + QuantizeLinear_1169_quantize_scale_node + Conv_1173" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4430ms 112 21.812us 17.249us 54.851us model.13.cv1.conv.module.weight + QuantizeLinear_1169_quantize_scale_node + Conv_1173 GPU activities: 100.00% 10.930ms 112 97.585us 87.463us 480.97us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.7488ms 112 15.614us 12.769us 34.210us cudaLaunchKernel ==25751== Range "model.13.cv2.conv.module.weight + QuantizeLinear_1281_quantize_scale_node + Conv_1285" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2661ms 112 20.232us 16.385us 44.738us model.13.cv2.conv.module.weight + QuantizeLinear_1281_quantize_scale_node + Conv_1285 GPU activities: 100.00% 10.198ms 112 91.049us 84.774us 130.19us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.6944ms 112 15.128us 12.321us 38.466us cudaLaunchKernel ==25751== Range "model.13.cv3.conv.module.weight + QuantizeLinear_1298_quantize_scale_node + Conv_1302" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3974ms 112 21.405us 16.929us 44.419us model.13.cv3.conv.module.weight + QuantizeLinear_1298_quantize_scale_node + Conv_1302 GPU activities: 100.00% 12.949ms 112 115.61us 104.46us 161.23us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.7536ms 112 15.656us 12.257us 34.274us cudaLaunchKernel ==25751== Range "model.13.m.0.cv1.conv.module.weight + QuantizeLinear_1185_quantize_scale_node + Conv_1189" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3854ms 112 21.298us 16.993us 84.933us model.13.m.0.cv1.conv.module.weight + QuantizeLinear_1185_quantize_scale_node + Conv_1189 GPU activities: 100.00% 5.6827ms 112 50.738us 45.507us 75.014us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.7442ms 112 15.572us 12.513us 77.060us cudaLaunchKernel ==25751== Range "model.13.m.0.cv2.conv.module.weight + QuantizeLinear_1201_quantize_scale_node + Conv_1205" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5202ms 112 22.501us 17.185us 92.165us model.13.m.0.cv2.conv.module.weight + QuantizeLinear_1201_quantize_scale_node + Conv_1205 GPU activities: 100.00% 17.093ms 112 152.62us 147.28us 221.75us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 1.8559ms 112 16.570us 12.577us 82.277us cudaLaunchKernel ==25751== Range "model.13.m.1.cv1.conv.module.weight + QuantizeLinear_1217_quantize_scale_node + Conv_1221" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3870ms 112 21.312us 17.024us 66.244us model.13.m.1.cv1.conv.module.weight + QuantizeLinear_1217_quantize_scale_node + Conv_1221 GPU activities: 100.00% 6.0009ms 112 53.579us 48.836us 79.078us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.7324ms 112 15.467us 12.608us 59.043us cudaLaunchKernel ==25751== Range "model.13.m.1.cv2.conv.module.weight + QuantizeLinear_1233_quantize_scale_node + Conv_1237" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3321ms 112 20.822us 16.993us 41.891us model.13.m.1.cv2.conv.module.weight + QuantizeLinear_1233_quantize_scale_node + Conv_1237 GPU activities: 100.00% 17.196ms 112 153.53us 140.94us 731.29us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 1.7052ms 112 15.224us 12.480us 35.106us cudaLaunchKernel ==25751== Range "model.13.m.2.cv1.conv.module.weight + QuantizeLinear_1249_quantize_scale_node + Conv_1253" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3345ms 112 20.843us 16.961us 64.292us model.13.m.2.cv1.conv.module.weight + QuantizeLinear_1249_quantize_scale_node + Conv_1253 GPU activities: 100.00% 5.8742ms 112 52.448us 45.603us 75.814us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.7329ms 112 15.472us 12.352us 58.115us cudaLaunchKernel ==25751== Range "model.13.m.2.cv2.conv.module.weight + QuantizeLinear_1265_quantize_scale_node + Conv_1269" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3663ms 112 21.127us 16.993us 42.594us model.13.m.2.cv2.conv.module.weight + QuantizeLinear_1265_quantize_scale_node + Conv_1269 GPU activities: 100.00% 16.707ms 112 149.17us 144.08us 217.49us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 1.7328ms 112 15.471us 12.577us 31.106us cudaLaunchKernel ==25751== Range "model.14.conv.module.weight + QuantizeLinear_1314_quantize_scale_node + Conv_1318" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3084ms 112 20.610us 16.673us 50.979us model.14.conv.module.weight + QuantizeLinear_1314_quantize_scale_node + Conv_1318 GPU activities: 100.00% 7.3607ms 112 65.720us 60.485us 91.368us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.6894ms 112 15.084us 12.225us 45.250us cudaLaunchKernel ==25751== Range "model.17.cv1.conv.module.weight + QuantizeLinear_1333_quantize_scale_node + Conv_1337" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5178ms 112 22.480us 17.345us 83.909us model.17.cv1.conv.module.weight + QuantizeLinear_1333_quantize_scale_node + Conv_1337 GPU activities: 100.00% 13.845ms 112 123.61us 112.81us 168.94us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.8704ms 112 16.700us 12.769us 75.076us cudaLaunchKernel ==25751== Range "model.17.cv2.conv.module.weight + QuantizeLinear_1445_quantize_scale_node + Conv_1449" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3239ms 112 20.749us 16.609us 87.141us model.17.cv2.conv.module.weight + QuantizeLinear_1445_quantize_scale_node + Conv_1449 GPU activities: 100.00% 13.448ms 112 120.07us 107.79us 169.29us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.7589ms 112 15.704us 12.384us 81.668us cudaLaunchKernel ==25751== Range "model.17.cv3.conv.module.weight + QuantizeLinear_1462_quantize_scale_node + Conv_1466" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5984ms 112 23.200us 18.657us 44.931us model.17.cv3.conv.module.weight + QuantizeLinear_1462_quantize_scale_node + Conv_1466 GPU activities: 100.00% 21.804ms 112 194.68us 186.58us 278.77us trt_volta_fp32_icudnn_int8x4_128x64_relu_interior_nn_v1 API calls: 100.00% 1.8048ms 112 16.114us 12.993us 31.522us cudaLaunchKernel ==25751== Range "model.17.m.0.cv1.conv.module.weight + QuantizeLinear_1349_quantize_scale_node + Conv_1353" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3595ms 112 21.066us 16.929us 46.786us model.17.m.0.cv1.conv.module.weight + QuantizeLinear_1349_quantize_scale_node + Conv_1353 GPU activities: 100.00% 8.8085ms 112 78.647us 71.077us 115.69us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.7051ms 112 15.223us 12.352us 33.890us cudaLaunchKernel ==25751== Range "model.17.m.0.cv2.conv.module.weight + QuantizeLinear_1365_quantize_scale_node + Conv_1369" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2468ms 112 20.060us 16.673us 39.682us model.17.m.0.cv2.conv.module.weight + QuantizeLinear_1365_quantize_scale_node + Conv_1369 GPU activities: 100.00% 19.185ms 112 171.30us 154.96us 247.22us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.6472ms 112 14.707us 12.160us 32.706us cudaLaunchKernel ==25751== Range "model.17.m.1.cv1.conv.module.weight + QuantizeLinear_1381_quantize_scale_node + Conv_1385" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3952ms 112 21.385us 17.057us 51.523us model.17.m.1.cv1.conv.module.weight + QuantizeLinear_1381_quantize_scale_node + Conv_1385 GPU activities: 100.00% 8.8061ms 112 78.626us 71.558us 113.19us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.7401ms 112 15.536us 12.512us 45.859us cudaLaunchKernel ==25751== Range "model.17.m.1.cv2.conv.module.weight + QuantizeLinear_1397_quantize_scale_node + Conv_1401" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3519ms 112 20.998us 16.993us 90.533us model.17.m.1.cv2.conv.module.weight + QuantizeLinear_1397_quantize_scale_node + Conv_1401 GPU activities: 100.00% 18.909ms 112 168.83us 154.70us 257.14us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.6993ms 112 15.172us 12.352us 79.492us cudaLaunchKernel ==25751== Range "model.17.m.2.cv1.conv.module.weight + QuantizeLinear_1413_quantize_scale_node + Conv_1417" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2843ms 112 20.395us 16.833us 43.170us model.17.m.2.cv1.conv.module.weight + QuantizeLinear_1413_quantize_scale_node + Conv_1417 GPU activities: 100.00% 9.1403ms 112 81.609us 66.054us 391.65us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.6527ms 112 14.755us 12.417us 37.794us cudaLaunchKernel ==25751== Range "model.17.m.2.cv2.conv.module.weight + QuantizeLinear_1429_quantize_scale_node + Conv_1433" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3953ms 112 21.386us 16.673us 70.436us model.17.m.2.cv2.conv.module.weight + QuantizeLinear_1429_quantize_scale_node + Conv_1433 GPU activities: 100.00% 18.911ms 112 168.85us 152.72us 245.01us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.7781ms 112 15.875us 12.256us 64.484us cudaLaunchKernel ==25751== Range "model.18.conv.module.weight + QuantizeLinear_1478_quantize_scale_node + Conv_1482" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2933ms 112 20.476us 16.961us 53.635us model.18.conv.module.weight + QuantizeLinear_1478_quantize_scale_node + Conv_1482 GPU activities: 100.00% 17.812ms 112 159.03us 151.56us 227.09us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.7124ms 112 15.289us 12.609us 48.227us cudaLaunchKernel ==25751== Range "model.2.cv1.conv.module.weight + QuantizeLinear_81_quantize_scale_node + Conv_85" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.7325ms 112 33.325us 21.857us 130.57us model.2.cv1.conv.module.weight + QuantizeLinear_81_quantize_scale_node + Conv_85 GPU activities: 100.00% 15.383ms 112 137.35us 128.75us 383.29us trt_volta_fp32_icudnn_int8x4_128x64_relu_small_nn_v1 API calls: 100.00% 2.7576ms 112 24.621us 15.361us 119.94us cudaLaunchKernel ==25751== Range "model.2.cv2.conv.module.weight + QuantizeLinear_196_quantize_scale_node + Conv_200" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1612ms 112 28.225us 17.121us 71.908us model.2.cv2.conv.module.weight + QuantizeLinear_196_quantize_scale_node + Conv_200 GPU activities: 100.00% 15.077ms 112 134.61us 127.11us 375.58us trt_volta_fp32_icudnn_int8x4_128x64_relu_small_nn_v1 API calls: 100.00% 2.3277ms 112 20.783us 12.705us 64.964us cudaLaunchKernel ==25751== Range "model.2.cv3.conv.module.weight + QuantizeLinear_213_quantize_scale_node + Conv_217" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.3261ms 112 29.697us 18.689us 109.13us model.2.cv3.conv.module.weight + QuantizeLinear_213_quantize_scale_node + Conv_217 GPU activities: 100.00% 29.177ms 112 260.51us 243.92us 1.0296ms trt_volta_fp32_icudnn_int8x4_128x64_relu_interior_nn_v1 API calls: 100.00% 2.4124ms 112 21.539us 13.248us 101.38us cudaLaunchKernel ==25751== Range "model.2.m.0.cv1.conv.module.weight + QuantizeLinear_97_quantize_scale_node + Conv_101" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.8365ms 112 34.254us 22.017us 75.621us model.2.m.0.cv1.conv.module.weight + QuantizeLinear_97_quantize_scale_node + Conv_101 GPU activities: 100.00% 13.638ms 112 121.77us 115.50us 306.71us trt_volta_fp32_icudnn_int8x4_128x64_relu_interior_nn_v1 API calls: 100.00% 2.6709ms 112 23.847us 14.817us 46.787us cudaLaunchKernel ==25751== Range "model.2.m.0.cv2.conv.module.weight + QuantizeLinear_113_quantize_scale_node + Conv_117" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.2961ms 112 29.429us 17.857us 87.493us model.2.m.0.cv2.conv.module.weight + QuantizeLinear_113_quantize_scale_node + Conv_117 GPU activities: 100.00% 48.106ms 112 429.51us 378.72us 5.3590ms trt_volta_fp32_icudnn_int8x4_128x64_relu_small_nn_v1 API calls: 100.00% 2.4191ms 112 21.598us 12.865us 78.949us cudaLaunchKernel ==25751== Range "model.2.m.1.cv1.conv.module.weight + QuantizeLinear_130_quantize_scale_node + Conv_134" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.3820ms 112 30.196us 19.297us 111.30us model.2.m.1.cv1.conv.module.weight + QuantizeLinear_130_quantize_scale_node + Conv_134 GPU activities: 100.00% 13.347ms 112 119.17us 114.06us 157.39us trt_volta_fp32_icudnn_int8x4_128x64_relu_interior_nn_v1 API calls: 100.00% 2.4406ms 112 21.790us 13.856us 103.01us cudaLaunchKernel ==25751== Range "model.2.m.1.cv2.conv.module.weight + QuantizeLinear_146_quantize_scale_node + Conv_150" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0994ms 112 27.673us 17.569us 54.819us model.2.m.1.cv2.conv.module.weight + QuantizeLinear_146_quantize_scale_node + Conv_150 GPU activities: 100.00% 44.044ms 112 393.25us 376.99us 1.2457ms trt_volta_fp32_icudnn_int8x4_128x64_relu_small_nn_v1 API calls: 100.00% 2.2987ms 112 20.524us 12.704us 47.907us cudaLaunchKernel ==25751== Range "model.2.m.2.cv1.conv.module.weight + QuantizeLinear_163_quantize_scale_node + Conv_167" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.3205ms 112 29.647us 18.689us 88.550us model.2.m.2.cv1.conv.module.weight + QuantizeLinear_163_quantize_scale_node + Conv_167 GPU activities: 100.00% 13.443ms 112 120.03us 115.02us 159.31us trt_volta_fp32_icudnn_int8x4_128x64_relu_interior_nn_v1 API calls: 100.00% 2.4125ms 112 21.540us 13.537us 79.620us cudaLaunchKernel ==25751== Range "model.2.m.2.cv2.conv.module.weight + QuantizeLinear_179_quantize_scale_node + Conv_183" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.2288ms 112 28.828us 17.921us 172.14us model.2.m.2.cv2.conv.module.weight + QuantizeLinear_179_quantize_scale_node + Conv_183 GPU activities: 100.00% 43.989ms 112 392.76us 378.40us 1.2566ms trt_volta_fp32_icudnn_int8x4_128x64_relu_small_nn_v1 API calls: 100.00% 2.4450ms 112 21.830us 13.249us 163.95us cudaLaunchKernel ==25751== Range "model.20.cv1.conv.module.weight + QuantizeLinear_1495_quantize_scale_node + Conv_1499" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4533ms 112 21.904us 17.121us 82.661us model.20.cv1.conv.module.weight + QuantizeLinear_1495_quantize_scale_node + Conv_1499 GPU activities: 100.00% 7.7530ms 112 69.223us 65.222us 97.704us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.6854ms 112 15.048us 12.448us 41.346us cudaLaunchKernel ==25751== Range "model.20.cv2.conv.module.weight + QuantizeLinear_1607_quantize_scale_node + Conv_1611" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2410ms 112 20.008us 16.737us 42.499us model.20.cv2.conv.module.weight + QuantizeLinear_1607_quantize_scale_node + Conv_1611 GPU activities: 100.00% 7.3278ms 112 65.427us 59.909us 91.143us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.6554ms 112 14.780us 12.449us 35.522us cudaLaunchKernel ==25751== Range "model.20.cv3.conv.module.weight + QuantizeLinear_1624_quantize_scale_node + Conv_1628" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.6397ms 112 23.569us 18.017us 65.668us model.20.cv3.conv.module.weight + QuantizeLinear_1624_quantize_scale_node + Conv_1628 GPU activities: 100.00% 20.765ms 112 185.40us 179.37us 284.18us trt_volta_fp32_icudnn_int8x4_128x64_relu_interior_nn_v1 API calls: 100.00% 1.8272ms 112 16.314us 12.801us 35.682us cudaLaunchKernel ==25751== Range "model.20.m.0.cv1.conv.module.weight + QuantizeLinear_1511_quantize_scale_node + Conv_1515" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4115ms 112 21.531us 16.737us 116.55us model.20.m.0.cv1.conv.module.weight + QuantizeLinear_1511_quantize_scale_node + Conv_1515 GPU activities: 100.00% 5.5249ms 112 49.329us 44.260us 69.957us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.6523ms 112 14.752us 12.225us 34.562us cudaLaunchKernel ==25751== Range "model.20.m.0.cv2.conv.module.weight + QuantizeLinear_1527_quantize_scale_node + Conv_1531" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4492ms 112 21.867us 17.185us 51.555us model.20.m.0.cv2.conv.module.weight + QuantizeLinear_1527_quantize_scale_node + Conv_1531 GPU activities: 100.00% 17.004ms 112 151.83us 146.67us 220.85us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 1.7919ms 112 15.999us 12.385us 42.883us cudaLaunchKernel ==25751== Range "model.20.m.1.cv1.conv.module.weight + QuantizeLinear_1543_quantize_scale_node + Conv_1547" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3608ms 112 21.078us 17.153us 40.834us model.20.m.1.cv1.conv.module.weight + QuantizeLinear_1543_quantize_scale_node + Conv_1547 GPU activities: 100.00% 6.8739ms 112 61.373us 51.044us 606.86us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.6810ms 112 15.008us 12.193us 35.842us cudaLaunchKernel ==25751== Range "model.20.m.1.cv2.conv.module.weight + QuantizeLinear_1559_quantize_scale_node + Conv_1563" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3650ms 112 21.116us 16.865us 81.701us model.20.m.1.cv2.conv.module.weight + QuantizeLinear_1559_quantize_scale_node + Conv_1563 GPU activities: 100.00% 17.053ms 112 152.26us 147.02us 219.70us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 1.6936ms 112 15.121us 12.320us 43.427us cudaLaunchKernel ==25751== Range "model.20.m.2.cv1.conv.module.weight + QuantizeLinear_1575_quantize_scale_node + Conv_1579" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3137ms 112 20.657us 16.865us 78.948us model.20.m.2.cv1.conv.module.weight + QuantizeLinear_1575_quantize_scale_node + Conv_1579 GPU activities: 100.00% 5.9627ms 112 53.238us 48.004us 75.142us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.6603ms 112 14.823us 12.352us 73.316us cudaLaunchKernel ==25751== Range "model.20.m.2.cv2.conv.module.weight + QuantizeLinear_1591_quantize_scale_node + Conv_1595" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3251ms 112 20.760us 16.833us 43.715us model.20.m.2.cv2.conv.module.weight + QuantizeLinear_1591_quantize_scale_node + Conv_1595 GPU activities: 100.00% 16.849ms 112 150.44us 144.81us 218.39us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 1.7059ms 112 15.231us 12.257us 36.546us cudaLaunchKernel ==25751== Range "model.21.conv.module.weight + QuantizeLinear_1640_quantize_scale_node + Conv_1644" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5079ms 112 22.392us 17.089us 114.47us model.21.conv.module.weight + QuantizeLinear_1640_quantize_scale_node + Conv_1644 GPU activities: 100.00% 16.461ms 112 146.98us 141.32us 205.39us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 1.8733ms 112 16.725us 12.641us 106.95us cudaLaunchKernel ==25751== Range "model.23.cv1.conv.module.weight + QuantizeLinear_1657_quantize_scale_node + Conv_1661" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 4.2506ms 112 37.952us 16.993us 1.9135ms model.23.cv1.conv.module.weight + QuantizeLinear_1657_quantize_scale_node + Conv_1661 GPU activities: 100.00% 6.1043ms 112 54.502us 49.156us 361.08us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 3.5549ms 112 31.740us 12.289us 1.9007ms cudaLaunchKernel ==25751== Range "model.23.cv2.conv.module.weight + QuantizeLinear_1769_quantize_scale_node + Conv_1773" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4553ms 112 21.922us 16.769us 209.39us model.23.cv2.conv.module.weight + QuantizeLinear_1769_quantize_scale_node + Conv_1773 GPU activities: 100.00% 5.5198ms 112 49.283us 46.403us 70.597us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.8560ms 112 16.571us 12.385us 197.26us cudaLaunchKernel ==25751== Range "model.23.cv3.conv.module.weight + QuantizeLinear_1786_quantize_scale_node + Conv_1790" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5469ms 112 22.739us 17.121us 96.453us model.23.cv3.conv.module.weight + QuantizeLinear_1786_quantize_scale_node + Conv_1790 GPU activities: 100.00% 11.053ms 112 98.684us 88.935us 663.96us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 1.9133ms 112 17.082us 12.417us 87.493us cudaLaunchKernel ==25751== Range "model.23.m.0.cv1.conv.module.weight + QuantizeLinear_1673_quantize_scale_node + Conv_1677" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3851ms 112 21.295us 16.705us 102.63us model.23.m.0.cv1.conv.module.weight + QuantizeLinear_1673_quantize_scale_node + Conv_1677 GPU activities: 100.00% 3.9755ms 112 35.495us 32.579us 51.140us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.7835ms 112 15.924us 12.193us 94.405us cudaLaunchKernel ==25751== Range "model.23.m.0.cv2.conv.module.weight + QuantizeLinear_1689_quantize_scale_node + Conv_1693" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.2909ms 112 20.454us 16.769us 65.092us model.23.m.0.cv2.conv.module.weight + QuantizeLinear_1689_quantize_scale_node + Conv_1693 GPU activities: 100.00% 15.711ms 112 140.28us 136.14us 202.00us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 1.6048ms 112 14.328us 12.161us 24.033us cudaLaunchKernel ==25751== Range "model.23.m.1.cv1.conv.module.weight + QuantizeLinear_1705_quantize_scale_node + Conv_1709" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4760ms 112 22.106us 17.089us 111.65us model.23.m.1.cv1.conv.module.weight + QuantizeLinear_1705_quantize_scale_node + Conv_1709 GPU activities: 100.00% 4.4539ms 112 39.766us 38.147us 55.812us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.7528ms 112 15.650us 12.096us 103.08us cudaLaunchKernel ==25751== Range "model.23.m.1.cv2.conv.module.weight + QuantizeLinear_1721_quantize_scale_node + Conv_1725" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4535ms 112 21.906us 16.961us 102.66us model.23.m.1.cv2.conv.module.weight + QuantizeLinear_1721_quantize_scale_node + Conv_1725 GPU activities: 100.00% 16.052ms 112 143.32us 136.75us 442.98us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 1.8246ms 112 16.290us 12.353us 95.238us cudaLaunchKernel ==25751== Range "model.23.m.2.cv1.conv.module.weight + QuantizeLinear_1737_quantize_scale_node + Conv_1741" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5289ms 112 22.579us 16.673us 138.82us model.23.m.2.cv1.conv.module.weight + QuantizeLinear_1737_quantize_scale_node + Conv_1741 GPU activities: 100.00% 4.0982ms 112 36.590us 34.723us 51.908us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.8775ms 112 16.763us 12.000us 131.30us cudaLaunchKernel ==25751== Range "model.23.m.2.cv2.conv.module.weight + QuantizeLinear_1753_quantize_scale_node + Conv_1757" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4611ms 112 21.974us 16.737us 130.09us model.23.m.2.cv2.conv.module.weight + QuantizeLinear_1753_quantize_scale_node + Conv_1757 GPU activities: 100.00% 15.354ms 112 137.09us 130.54us 199.60us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 1.8444ms 112 16.467us 12.160us 122.41us cudaLaunchKernel ==25751== Range "model.3.conv.module.weight + QuantizeLinear_229_quantize_scale_node + Conv_233" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.2757ms 112 29.247us 18.401us 104.58us model.3.conv.module.weight + QuantizeLinear_229_quantize_scale_node + Conv_233 GPU activities: 100.00% 38.313ms 112 342.08us 301.69us 1.9295ms trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.3065ms 112 20.594us 13.281us 44.738us cudaLaunchKernel ==25751== Range "model.4.cv1.conv.module.weight + QuantizeLinear_245_quantize_scale_node + Conv_249" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.3654ms 112 30.048us 19.649us 125.64us model.4.cv1.conv.module.weight + QuantizeLinear_245_quantize_scale_node + Conv_249 GPU activities: 100.00% 12.136ms 112 108.36us 103.40us 151.95us trt_volta_fp32_icudnn_int8x4_128x64_relu_interior_nn_v1 API calls: 100.00% 2.3756ms 112 21.210us 13.441us 112.84us cudaLaunchKernel ==25751== Range "model.4.cv2.conv.module.weight + QuantizeLinear_558_quantize_scale_node + Conv_562" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0600ms 112 27.321us 17.153us 98.150us model.4.cv2.conv.module.weight + QuantizeLinear_558_quantize_scale_node + Conv_562 GPU activities: 100.00% 10.532ms 112 94.033us 84.711us 143.72us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.2793ms 112 20.350us 12.641us 90.085us cudaLaunchKernel ==25751== Range "model.4.cv3.conv.module.weight + QuantizeLinear_575_quantize_scale_node + Conv_579" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1612ms 112 28.225us 18.689us 43.043us model.4.cv3.conv.module.weight + QuantizeLinear_575_quantize_scale_node + Conv_579 GPU activities: 100.00% 21.729ms 112 194.01us 185.46us 275.80us trt_volta_fp32_icudnn_int8x4_128x64_relu_interior_nn_v1 API calls: 100.00% 2.2389ms 112 19.990us 12.929us 35.106us cudaLaunchKernel ==25751== Range "model.4.m.0.cv1.conv.module.weight + QuantizeLinear_261_quantize_scale_node + Conv_265" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.2348ms 112 28.881us 17.569us 103.91us model.4.m.0.cv1.conv.module.weight + QuantizeLinear_261_quantize_scale_node + Conv_265 GPU activities: 100.00% 9.1225ms 112 81.450us 73.445us 114.38us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.3983ms 112 21.413us 12.832us 96.709us cudaLaunchKernel ==25751== Range "model.4.m.0.cv2.conv.module.weight + QuantizeLinear_277_quantize_scale_node + Conv_281" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.3424ms 112 29.842us 17.697us 128.97us model.4.m.0.cv2.conv.module.weight + QuantizeLinear_277_quantize_scale_node + Conv_281 GPU activities: 100.00% 18.602ms 112 166.09us 149.87us 265.46us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.5033ms 112 22.351us 12.897us 119.27us cudaLaunchKernel ==25751== Range "model.4.m.1.cv1.conv.module.weight + QuantizeLinear_294_quantize_scale_node + Conv_298" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1601ms 112 28.215us 17.601us 114.60us model.4.m.1.cv1.conv.module.weight + QuantizeLinear_294_quantize_scale_node + Conv_298 GPU activities: 100.00% 9.0622ms 112 80.912us 75.110us 120.91us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.3389ms 112 20.883us 13.088us 105.74us cudaLaunchKernel ==25751== Range "model.4.m.1.cv2.conv.module.weight + QuantizeLinear_310_quantize_scale_node + Conv_314" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0676ms 112 27.389us 17.313us 77.060us model.4.m.1.cv2.conv.module.weight + QuantizeLinear_310_quantize_scale_node + Conv_314 GPU activities: 100.00% 18.737ms 112 167.29us 148.88us 248.76us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.1941ms 112 19.590us 12.769us 65.155us cudaLaunchKernel ==25751== Range "model.4.m.2.cv1.conv.module.weight + QuantizeLinear_327_quantize_scale_node + Conv_331" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1551ms 112 28.170us 18.369us 99.590us model.4.m.2.cv1.conv.module.weight + QuantizeLinear_327_quantize_scale_node + Conv_331 GPU activities: 100.00% 9.0487ms 112 80.792us 74.854us 118.92us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.2842ms 112 20.394us 13.409us 91.845us cudaLaunchKernel ==25751== Range "model.4.m.2.cv2.conv.module.weight + QuantizeLinear_343_quantize_scale_node + Conv_347" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0656ms 112 27.371us 17.217us 109.29us model.4.m.2.cv2.conv.module.weight + QuantizeLinear_343_quantize_scale_node + Conv_347 GPU activities: 100.00% 19.436ms 112 173.54us 153.29us 924.33us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.2302ms 112 19.912us 12.481us 90.533us cudaLaunchKernel ==25751== Range "model.4.m.3.cv1.conv.module.weight + QuantizeLinear_360_quantize_scale_node + Conv_364" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0474ms 112 27.208us 18.178us 57.635us model.4.m.3.cv1.conv.module.weight + QuantizeLinear_360_quantize_scale_node + Conv_364 GPU activities: 100.00% 9.0709ms 112 80.989us 73.542us 120.20us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.2416ms 112 20.013us 13.248us 50.563us cudaLaunchKernel ==25751== Range "model.4.m.3.cv2.conv.module.weight + QuantizeLinear_376_quantize_scale_node + Conv_380" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.9213ms 112 26.083us 17.089us 43.043us model.4.m.3.cv2.conv.module.weight + QuantizeLinear_376_quantize_scale_node + Conv_380 GPU activities: 100.00% 18.953ms 112 169.22us 154.89us 249.01us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.1011ms 112 18.759us 12.641us 32.674us cudaLaunchKernel ==25751== Range "model.4.m.4.cv1.conv.module.weight + QuantizeLinear_393_quantize_scale_node + Conv_397" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0732ms 112 27.439us 17.537us 52.611us model.4.m.4.cv1.conv.module.weight + QuantizeLinear_393_quantize_scale_node + Conv_397 GPU activities: 100.00% 9.0913ms 112 81.172us 74.918us 127.40us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.2241ms 112 19.857us 12.832us 35.874us cudaLaunchKernel ==25751== Range "model.4.m.4.cv2.conv.module.weight + QuantizeLinear_409_quantize_scale_node + Conv_413" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0670ms 112 27.383us 17.281us 119.14us model.4.m.4.cv2.conv.module.weight + QuantizeLinear_409_quantize_scale_node + Conv_413 GPU activities: 100.00% 19.040ms 112 170.00us 153.84us 244.76us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.2328ms 112 19.935us 12.321us 109.32us cudaLaunchKernel ==25751== Range "model.4.m.5.cv1.conv.module.weight + QuantizeLinear_426_quantize_scale_node + Conv_430" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0312ms 112 27.064us 17.857us 48.067us model.4.m.5.cv1.conv.module.weight + QuantizeLinear_426_quantize_scale_node + Conv_430 GPU activities: 100.00% 8.9812ms 112 80.189us 73.222us 117.51us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.2297ms 112 19.907us 13.120us 36.962us cudaLaunchKernel ==25751== Range "model.4.m.5.cv2.conv.module.weight + QuantizeLinear_442_quantize_scale_node + Conv_446" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0013ms 112 26.797us 17.761us 84.965us model.4.m.5.cv2.conv.module.weight + QuantizeLinear_442_quantize_scale_node + Conv_446 GPU activities: 100.00% 18.838ms 112 168.19us 153.32us 246.04us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.2090ms 112 19.723us 12.609us 77.828us cudaLaunchKernel ==25751== Range "model.4.m.6.cv1.conv.module.weight + QuantizeLinear_459_quantize_scale_node + Conv_463" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1036ms 112 27.711us 17.409us 74.244us model.4.m.6.cv1.conv.module.weight + QuantizeLinear_459_quantize_scale_node + Conv_463 GPU activities: 100.00% 9.1012ms 112 81.260us 74.054us 114.92us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.2769ms 112 20.329us 12.769us 50.146us cudaLaunchKernel ==25751== Range "model.4.m.6.cv2.conv.module.weight + QuantizeLinear_475_quantize_scale_node + Conv_479" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1593ms 112 28.208us 17.569us 104.26us model.4.m.6.cv2.conv.module.weight + QuantizeLinear_475_quantize_scale_node + Conv_479 GPU activities: 100.00% 18.847ms 112 168.27us 150.73us 238.35us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.2097ms 112 19.729us 12.481us 80.517us cudaLaunchKernel ==25751== Range "model.4.m.7.cv1.conv.module.weight + QuantizeLinear_492_quantize_scale_node + Conv_496" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1471ms 112 28.099us 18.017us 91.750us model.4.m.7.cv1.conv.module.weight + QuantizeLinear_492_quantize_scale_node + Conv_496 GPU activities: 100.00% 9.0329ms 112 80.650us 74.758us 124.30us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.2292ms 112 19.903us 13.185us 56.995us cudaLaunchKernel ==25751== Range "model.4.m.7.cv2.conv.module.weight + QuantizeLinear_508_quantize_scale_node + Conv_512" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.9106ms 112 25.987us 17.121us 97.638us model.4.m.7.cv2.conv.module.weight + QuantizeLinear_508_quantize_scale_node + Conv_512 GPU activities: 100.00% 18.841ms 112 168.22us 154.25us 238.29us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.1410ms 112 19.115us 12.577us 89.734us cudaLaunchKernel ==25751== Range "model.4.m.8.cv1.conv.module.weight + QuantizeLinear_525_quantize_scale_node + Conv_529" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0199ms 112 26.962us 17.857us 69.028us model.4.m.8.cv1.conv.module.weight + QuantizeLinear_525_quantize_scale_node + Conv_529 GPU activities: 100.00% 8.9688ms 112 80.078us 70.373us 116.59us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.2497ms 112 20.086us 13.120us 60.388us cudaLaunchKernel ==25751== Range "model.4.m.8.cv2.conv.module.weight + QuantizeLinear_541_quantize_scale_node + Conv_545" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.8673ms 112 25.600us 17.473us 43.842us model.4.m.8.cv2.conv.module.weight + QuantizeLinear_541_quantize_scale_node + Conv_545 GPU activities: 100.00% 18.763ms 112 167.53us 152.59us 244.95us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.0873ms 112 18.636us 12.705us 35.330us cudaLaunchKernel ==25751== Range "model.5.conv.module.weight + QuantizeLinear_591_quantize_scale_node + Conv_595" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0286ms 112 27.041us 17.953us 54.819us model.5.conv.module.weight + QuantizeLinear_591_quantize_scale_node + Conv_595 GPU activities: 100.00% 33.454ms 112 298.70us 275.86us 417.31us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.2125ms 112 19.754us 13.344us 41.826us cudaLaunchKernel ==25751== Range "model.6.cv1.conv.module.weight + QuantizeLinear_607_quantize_scale_node + Conv_611" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.9742ms 112 26.554us 17.185us 53.731us model.6.cv1.conv.module.weight + QuantizeLinear_607_quantize_scale_node + Conv_611 GPU activities: 100.00% 7.3939ms 112 66.016us 60.293us 94.151us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.1673ms 112 19.351us 12.705us 47.651us cudaLaunchKernel ==25751== Range "model.6.cv2.conv.module.weight + QuantizeLinear_920_quantize_scale_node + Conv_924" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0094ms 112 26.870us 17.025us 120.39us model.6.cv2.conv.module.weight + QuantizeLinear_920_quantize_scale_node + Conv_924 GPU activities: 100.00% 7.2938ms 112 65.122us 58.980us 95.272us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.2882ms 112 20.430us 12.704us 113.32us cudaLaunchKernel ==25751== Range "model.6.cv3.conv.module.weight + QuantizeLinear_937_quantize_scale_node + Conv_941" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.9440ms 112 26.286us 18.849us 70.052us model.6.cv3.conv.module.weight + QuantizeLinear_937_quantize_scale_node + Conv_941 GPU activities: 100.00% 21.110ms 112 188.48us 178.70us 493.32us trt_volta_fp32_icudnn_int8x4_128x64_relu_interior_nn_v1 API calls: 100.00% 2.0414ms 112 18.226us 13.313us 58.947us cudaLaunchKernel ==25751== Range "model.6.m.0.cv1.conv.module.weight + QuantizeLinear_623_quantize_scale_node + Conv_627" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.8184ms 112 25.163us 17.121us 41.026us model.6.m.0.cv1.conv.module.weight + QuantizeLinear_623_quantize_scale_node + Conv_627 GPU activities: 100.00% 6.2700ms 112 55.982us 51.748us 78.982us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.0624ms 112 18.414us 12.384us 33.698us cudaLaunchKernel ==25751== Range "model.6.m.0.cv2.conv.module.weight + QuantizeLinear_639_quantize_scale_node + Conv_643" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1452ms 112 28.082us 20.577us 77.701us model.6.m.0.cv2.conv.module.weight + QuantizeLinear_639_quantize_scale_node + Conv_643 GPU activities: 100.00% 17.170ms 112 153.31us 148.56us 221.43us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 2.3022ms 112 20.555us 14.785us 63.427us cudaLaunchKernel ==25751== Range "model.6.m.1.cv1.conv.module.weight + QuantizeLinear_656_quantize_scale_node + Conv_660" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.8956ms 112 25.853us 18.113us 72.101us model.6.m.1.cv1.conv.module.weight + QuantizeLinear_656_quantize_scale_node + Conv_660 GPU activities: 100.00% 6.2326ms 112 55.647us 51.652us 86.727us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.1348ms 112 19.061us 12.993us 64.771us cudaLaunchKernel ==25751== Range "model.6.m.1.cv2.conv.module.weight + QuantizeLinear_672_quantize_scale_node + Conv_676" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.9846ms 112 26.648us 18.017us 87.397us model.6.m.1.cv2.conv.module.weight + QuantizeLinear_672_quantize_scale_node + Conv_676 GPU activities: 100.00% 17.144ms 112 153.07us 147.88us 219.95us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 2.1794ms 112 19.458us 12.705us 70.340us cudaLaunchKernel ==25751== Range "model.6.m.2.cv1.conv.module.weight + QuantizeLinear_689_quantize_scale_node + Conv_693" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.0427ms 112 27.167us 19.073us 156.43us model.6.m.2.cv1.conv.module.weight + QuantizeLinear_689_quantize_scale_node + Conv_693 GPU activities: 100.00% 6.2047ms 112 55.399us 51.044us 79.014us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.2671ms 112 20.242us 13.697us 148.10us cudaLaunchKernel ==25751== Range "model.6.m.2.cv2.conv.module.weight + QuantizeLinear_705_quantize_scale_node + Conv_709" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.7566ms 112 24.612us 16.993us 48.643us model.6.m.2.cv2.conv.module.weight + QuantizeLinear_705_quantize_scale_node + Conv_709 GPU activities: 100.00% 17.134ms 112 152.98us 147.88us 221.97us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 2.0186ms 112 18.022us 12.577us 38.210us cudaLaunchKernel ==25751== Range "model.6.m.3.cv1.conv.module.weight + QuantizeLinear_722_quantize_scale_node + Conv_726" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.9566ms 112 26.398us 18.049us 93.189us model.6.m.3.cv1.conv.module.weight + QuantizeLinear_722_quantize_scale_node + Conv_726 GPU activities: 100.00% 6.2555ms 112 55.852us 52.132us 78.502us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.1397ms 112 19.104us 13.088us 80.900us cudaLaunchKernel ==25751== Range "model.6.m.3.cv2.conv.module.weight + QuantizeLinear_738_quantize_scale_node + Conv_742" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.8454ms 112 25.405us 17.729us 77.156us model.6.m.3.cv2.conv.module.weight + QuantizeLinear_738_quantize_scale_node + Conv_742 GPU activities: 100.00% 17.122ms 112 152.88us 146.86us 221.71us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 2.0963ms 112 18.717us 12.673us 70.372us cudaLaunchKernel ==25751== Range "model.6.m.4.cv1.conv.module.weight + QuantizeLinear_755_quantize_scale_node + Conv_759" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.8134ms 112 25.119us 17.889us 80.740us model.6.m.4.cv1.conv.module.weight + QuantizeLinear_755_quantize_scale_node + Conv_759 GPU activities: 100.00% 6.2767ms 112 56.042us 52.516us 79.334us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.1015ms 112 18.763us 12.897us 73.284us cudaLaunchKernel ==25751== Range "model.6.m.4.cv2.conv.module.weight + QuantizeLinear_771_quantize_scale_node + Conv_775" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.6657ms 112 32.729us 17.697us 965.50us model.6.m.4.cv2.conv.module.weight + QuantizeLinear_771_quantize_scale_node + Conv_775 GPU activities: 100.00% 17.098ms 112 152.66us 147.05us 220.59us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 2.9488ms 112 26.328us 12.993us 955.29us cudaLaunchKernel ==25751== Range "model.6.m.5.cv1.conv.module.weight + QuantizeLinear_788_quantize_scale_node + Conv_792" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.6988ms 112 24.096us 17.889us 60.836us model.6.m.5.cv1.conv.module.weight + QuantizeLinear_788_quantize_scale_node + Conv_792 GPU activities: 100.00% 6.1944ms 112 55.307us 50.308us 79.750us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.9759ms 112 17.642us 13.089us 54.883us cudaLaunchKernel ==25751== Range "model.6.m.5.cv2.conv.module.weight + QuantizeLinear_804_quantize_scale_node + Conv_808" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.6492ms 112 23.653us 16.929us 45.091us model.6.m.5.cv2.conv.module.weight + QuantizeLinear_804_quantize_scale_node + Conv_808 GPU activities: 100.00% 17.096ms 112 152.64us 148.08us 220.27us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 1.9511ms 112 17.420us 12.416us 38.082us cudaLaunchKernel ==25751== Range "model.6.m.6.cv1.conv.module.weight + QuantizeLinear_821_quantize_scale_node + Conv_825" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5850ms 112 23.080us 17.377us 34.882us model.6.m.6.cv1.conv.module.weight + QuantizeLinear_821_quantize_scale_node + Conv_825 GPU activities: 100.00% 6.2223ms 112 55.556us 51.780us 80.679us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.8876ms 112 16.853us 12.864us 28.705us cudaLaunchKernel ==25751== Range "model.6.m.6.cv2.conv.module.weight + QuantizeLinear_837_quantize_scale_node + Conv_841" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5982ms 112 23.198us 17.537us 58.500us model.6.m.6.cv2.conv.module.weight + QuantizeLinear_837_quantize_scale_node + Conv_841 GPU activities: 100.00% 17.171ms 112 153.31us 148.11us 219.76us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 1.9083ms 112 17.038us 12.961us 51.426us cudaLaunchKernel ==25751== Range "model.6.m.7.cv1.conv.module.weight + QuantizeLinear_854_quantize_scale_node + Conv_858" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.7882ms 112 24.894us 18.049us 90.533us model.6.m.7.cv1.conv.module.weight + QuantizeLinear_854_quantize_scale_node + Conv_858 GPU activities: 100.00% 6.2484ms 112 55.789us 51.140us 79.622us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 2.0934ms 112 18.691us 12.929us 82.052us cudaLaunchKernel ==25751== Range "model.6.m.7.cv2.conv.module.weight + QuantizeLinear_870_quantize_scale_node + Conv_874" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.6406ms 112 23.576us 17.697us 97.414us model.6.m.7.cv2.conv.module.weight + QuantizeLinear_870_quantize_scale_node + Conv_874 GPU activities: 100.00% 17.196ms 112 153.54us 148.24us 220.91us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 1.8825ms 112 16.808us 12.800us 45.955us cudaLaunchKernel ==25751== Range "model.6.m.8.cv1.conv.module.weight + QuantizeLinear_887_quantize_scale_node + Conv_891" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4856ms 112 22.192us 17.569us 44.067us model.6.m.8.cv1.conv.module.weight + QuantizeLinear_887_quantize_scale_node + Conv_891 GPU activities: 100.00% 6.1955ms 112 55.316us 50.916us 80.550us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.8452ms 112 16.474us 12.865us 37.282us cudaLaunchKernel ==25751== Range "model.6.m.8.cv2.conv.module.weight + QuantizeLinear_903_quantize_scale_node + Conv_907" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.7394ms 112 24.458us 17.313us 90.789us model.6.m.8.cv2.conv.module.weight + QuantizeLinear_903_quantize_scale_node + Conv_907 GPU activities: 100.00% 17.199ms 112 153.56us 147.98us 220.75us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 1.9480ms 112 17.392us 12.737us 58.243us cudaLaunchKernel ==25751== Range "model.7.conv.module.weight + QuantizeLinear_953_quantize_scale_node + Conv_957" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.7404ms 112 24.467us 17.633us 97.702us model.7.conv.module.weight + QuantizeLinear_953_quantize_scale_node + Conv_957 GPU activities: 100.00% 30.457ms 112 271.94us 263.35us 393.50us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 2.0069ms 112 17.918us 12.737us 74.500us cudaLaunchKernel ==25751== Range "model.8.cv1.conv.module.weight + QuantizeLinear_969_quantize_scale_node + Conv_973" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4865ms 112 22.200us 17.121us 73.220us model.8.cv1.conv.module.weight + QuantizeLinear_969_quantize_scale_node + Conv_973 GPU activities: 100.00% 5.9063ms 112 52.735us 49.956us 75.014us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.8262ms 112 16.304us 12.609us 66.660us cudaLaunchKernel ==25751== Range "model.8.cv2.conv.module.weight + QuantizeLinear_989_quantize_scale_node + Conv_993" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5989ms 112 23.204us 17.761us 103.27us model.8.cv2.conv.module.weight + QuantizeLinear_989_quantize_scale_node + Conv_993 GPU activities: 100.00% 16.721ms 112 149.29us 141.42us 212.56us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.8719ms 112 16.713us 12.800us 94.693us cudaLaunchKernel ==25751== Range "model.9.cv1.conv.module.weight + QuantizeLinear_1005_quantize_scale_node + Conv_1009" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4799ms 112 22.141us 16.993us 62.787us model.9.cv1.conv.module.weight + QuantizeLinear_1005_quantize_scale_node + Conv_1009 GPU activities: 100.00% 5.6622ms 112 50.555us 48.163us 71.014us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.7358ms 112 15.498us 12.481us 35.042us cudaLaunchKernel ==25751== Range "model.9.cv2.conv.module.weight + QuantizeLinear_1117_quantize_scale_node + Conv_1121" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.6557ms 112 23.711us 16.993us 208.88us model.9.cv2.conv.module.weight + QuantizeLinear_1117_quantize_scale_node + Conv_1121 GPU activities: 100.00% 5.5571ms 112 49.617us 47.620us 70.502us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.9797ms 112 17.676us 12.545us 152.20us cudaLaunchKernel ==25751== Range "model.9.cv3.conv.module.weight + QuantizeLinear_1134_quantize_scale_node + Conv_1138" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4279ms 112 21.677us 16.961us 63.940us model.9.cv3.conv.module.weight + QuantizeLinear_1134_quantize_scale_node + Conv_1138 GPU activities: 100.00% 10.275ms 112 91.739us 86.535us 134.06us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 1.7930ms 112 16.008us 12.545us 58.404us cudaLaunchKernel ==25751== Range "model.9.m.0.cv1.conv.module.weight + QuantizeLinear_1021_quantize_scale_node + Conv_1025" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.5074ms 112 22.387us 16.833us 66.595us model.9.m.0.cv1.conv.module.weight + QuantizeLinear_1021_quantize_scale_node + Conv_1025 GPU activities: 100.00% 4.0167ms 112 35.863us 32.834us 52.068us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.8246ms 112 16.291us 12.416us 59.587us cudaLaunchKernel ==25751== Range "model.9.m.0.cv2.conv.module.weight + QuantizeLinear_1037_quantize_scale_node + Conv_1041" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4945ms 112 22.272us 17.825us 51.843us model.9.m.0.cv2.conv.module.weight + QuantizeLinear_1037_quantize_scale_node + Conv_1041 GPU activities: 100.00% 16.050ms 112 143.30us 138.28us 201.36us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 1.8226ms 112 16.273us 13.185us 41.058us cudaLaunchKernel ==25751== Range "model.9.m.1.cv1.conv.module.weight + QuantizeLinear_1053_quantize_scale_node + Conv_1057" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.3506ms 112 20.987us 17.217us 40.706us model.9.m.1.cv1.conv.module.weight + QuantizeLinear_1053_quantize_scale_node + Conv_1057 GPU activities: 100.00% 4.0677ms 112 36.318us 34.883us 52.292us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.7185ms 112 15.343us 12.673us 33.442us cudaLaunchKernel ==25751== Range "model.9.m.1.cv2.conv.module.weight + QuantizeLinear_1069_quantize_scale_node + Conv_1073" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 3.1909ms 112 28.490us 17.025us 734.41us model.9.m.1.cv2.conv.module.weight + QuantizeLinear_1069_quantize_scale_node + Conv_1073 GPU activities: 100.00% 15.426ms 112 137.73us 133.26us 199.18us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 2.5205ms 112 22.504us 12.577us 726.73us cudaLaunchKernel ==25751== Range "model.9.m.2.cv1.conv.module.weight + QuantizeLinear_1085_quantize_scale_node + Conv_1089" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.4335ms 112 21.727us 17.185us 59.395us model.9.m.2.cv1.conv.module.weight + QuantizeLinear_1085_quantize_scale_node + Conv_1089 GPU activities: 100.00% 4.1454ms 112 37.012us 35.298us 52.900us trt_volta_fp32_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 API calls: 100.00% 1.8008ms 112 16.078us 12.673us 54.147us cudaLaunchKernel ==25751== Range "model.9.m.2.cv2.conv.module.weight + QuantizeLinear_1101_quantize_scale_node + Conv_1105" Type Time(%) Time Calls Avg Min Max Name Range: 100.00% 2.6298ms 112 23.480us 17.377us 84.357us model.9.m.2.cv2.conv.module.weight + QuantizeLinear_1101_quantize_scale_node + Conv_1105 GPU activities: 100.00% 15.513ms 112 138.51us 133.61us 198.90us trt_volta_fp32_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 API calls: 100.00% 1.9652ms 112 17.546us 12.513us 78.853us cudaLaunchKernel