Hi, I followed the steps in the pyprof README.md, but I can't get pyprof to work with nvprof.
nvprof alone seems to work, but then I lose the benefit pyprof provides of knowing which PyTorch operations are involved. It looks like a parser error on the output file, so it may be a bug in pyprof.
1. When I use pyprof inside my code (it doesn't work):
# Either of these
sudo /usr/local/cuda/bin/nvprof -f -o my_output_file.sql --profile-from-start off -- python3 my_main.py
sudo /usr/local/cuda/bin/nvprof -f -o my_output_file.sql python3 my_main.py
# When trying to parse
python3 -m apex.pyprof.parse all_measurements_pyprof.sql > all_measurements_pyprof.dict
Traceback (most recent call last):
File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/nvidia/.local/lib/python3.6/site-packages/apex/pyprof/parse/__main__.py", line 10, in <module>
File "/home/nvidia/.local/lib/python3.6/site-packages/apex/pyprof/parse/parse.py", line 63, in main
info = nvvp.getMarkerInfo(k.objId, k.rStartTime, k.rEndTime)
File "/home/nvidia/.local/lib/python3.6/site-packages/apex/pyprof/parse/nvvp.py", line 277, in getMarkerInfo
File "/home/nvidia/.local/lib/python3.6/site-packages/apex/pyprof/parse/nvvp.py", line 169, in seqcompare
assert (", seq = " in elem)
The output when nvprof is run without an output file (printed to the console) is:
==8594== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 14.52% 1.26781s 1761 719.94us 298.92us 1.5383ms volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1
13.81% 1.20576s 479 2.5172ms 109.35us 83.370ms volta_gcgemm_32x32_nt
10.80% 942.43ms 12 78.536ms 30.946us 443.27ms void transpose_readWrite_alignment_kernel<float2, float2, int=1, bool=0, int=6, int=4, int=4>(cublasTransposeParams<float2>, float2 const *, float2*, float2 const *)
9.09% 793.36ms 5536 143.31us 2.6240us 1.1046ms void nchwToNhwcKernel<__half, __half, float, bool=1, bool=0>(int, int, int, int, __half const *, __half*, float, float)
5.86% 511.51ms 24 21.313ms 7.3242ms 59.667ms volta_sgemm_128x128_nn
5.01% 437.53ms 2875 152.19us 18.209us 386.16us void op_generic_tensor_kernel<int=2, __half, float, __half, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, cudnnDimOrder_t=0, int=0>(cudnnTensorStruct, __half*, cudnnTensorStruct, __half const *, cudnnTensorStruct, __half const *, float, float, float, float, dimArray, reducedDivisorArray)
3.54% 309.40ms 3 103.13ms 102.48ms 104.10ms volta_cgemm_32x64_tn
3.49% 304.24ms 377 807.00us 611.48us 1.1826ms volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1
3.46% 302.41ms 544 555.91us 360.62us 763.49us void fermiCgemm_v3_kernel<bool=1, bool=0, bool=0, bool=0, int=5, int=5, int=3, int=8, int=8>(int, int, int, float2 const *, int, float2 const *, int, float2*, int, int, int, float2 const *, float2 const *, float2, float2, int)
2.89% 252.10ms 2250 112.04us 16.096us 289.58us void kernelPointwiseApply2<ThresholdUpdateOutput<at::Half>, at::Half, at::Half, unsigned int, int=1, int=1>(OffsetInfo<ThresholdUpdateOutput<at::Half>, at::Half, unsigned int>, OffsetInfo<at::Half, at::Half, int=1>, at::Half, at::Half)
2.43% 212.27ms 18 11.793ms 26.273us 64.494ms void fft2d_r2c_32x32<__half, bool=0, unsigned int=1, bool=1>(float2*, __half const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
2.32% 202.55ms 500 405.09us 115.40us 957.00us void kernelPointwiseApply2<CopyOp<float, float>, float, float, unsigned int, int=-1, int=1>(OffsetInfo<float, float, float>, OffsetInfo<CopyOp<float, float>, float, unsigned int>, float, float)
2.31% 201.26ms 500 402.52us 259.44us 580.57us void nearest_neighbor_4d_kernel<float, float>(int, THCDeviceTensor<float, int=4, int, DefaultPtrTraits>, THCDeviceTensor<float, int=4, int, DefaultPtrTraits>)
2.07% 180.63ms 252 716.78us 668.89us 784.10us volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_interior_nhwc2nchw_tn_v1
1.54% 134.34ms 671 200.21us 1.4720us 786.34us void kernelPointwiseApply2<CopyOp<at::Half, float>, at::Half, float, unsigned int, int=1, int=1>(OffsetInfo<float, at::Half, float>, OffsetInfo<CopyOp<at::Half, float>, at::Half, unsigned int>, at::Half, at::Half)
1.23% 107.49ms 500 214.98us 38.594us 574.10us void kernelPointwiseApply1<TensorFillOp<float>, float, unsigned int, int=1>(OffsetInfo<TensorFillOp<float>, float, unsigned int>, float, float)
1.23% 107.36ms 500 214.73us 60.834us 480.53us void CatArrayBatchedCopy<at::Half, unsigned int, int=4>(at::Half*, CatArrInputTensor<at::Half, unsigned int>*, OutputTensorSizeStride<unsigned int, unsigned int=4>, int, unsigned int)
1.23% 107.33ms 28 3.8331ms 235.21us 18.183ms void cudnn::winograd_nonfused::winogradForwardData4x4<float, __half>(cudnn::winograd_nonfused::WinogradDataParams<float, __half>)
0.91% 79.518ms 20 3.9759ms 1.9273ms 7.7055ms void cudnn::detail::implicit_convolve_sgemm<__half, __half, int=512, int=6, int=8, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>(int, int, int, __half const *, int, __half*, cudnn::detail::implicit_convolve_sgemm<__half, __half, int=512, int=6, int=8, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>*, kernel_conv_params, int, float, float, int, __half, __half, int, int)
0.81% 71.067ms 126 564.02us 560.02us 578.07us volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_f2f_exp_interior_nhwc2nchw_tn_v1
0.76% 66.093ms 28 2.3605ms 344.43us 6.0229ms void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, __half>(cudnn::winograd_nonfused::WinogradOutputParams<float, __half>)
0.73% 63.637ms 126 505.06us 500.73us 515.57us volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_interior_nhwc2nchw_tn_v1
0.67% 58.897ms 13 4.5305ms 1.8310ms 8.0788ms void cudnn::winograd::winograd3x3Kernel<__half, float, int=1, int=4, int=8, bool=0>(cudnn::maxwell::winograd::KernelParams)
0.67% 58.668ms 500 117.34us 32.578us 252.27us void MaxPoolForward<at::Half, float>(int, at::Half const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::Half*, long*)
0.56% 48.904ms 766 63.843us 256ns 4.7354ms [CUDA memcpy HtoD]
0.52% 45.616ms 4 11.404ms 385.30us 43.667ms void DSE::regular_fft_pad<int=0, int=1, int=256, int=16, int=16, int=1, __half, float, float2>(float2*, __half*, int, int3, __half*, int, __half*, __half*, int, int, int, int, int, bool)
0.52% 44.998ms 126 357.12us 348.72us 368.27us volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1
0.48% 42.185ms 4 10.546ms 7.0822ms 14.003ms volta_sgemm_128x64_nn
0.48% 41.850ms 10 4.1850ms 331.21us 10.334ms void cudnn::detail::implicit_convolve_sgemm<__half, __half, int=1024, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>(int, int, int, __half const *, int, __half*, cudnn::detail::implicit_convolve_sgemm<__half, __half, int=1024, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>*, kernel_conv_params, int, float, float, int, __half, __half, int, int)
0.47% 41.211ms 10 4.1211ms 1.7906ms 7.2531ms volta_fp16_scudnn_fp16_128x64_relu_small_nn_v1
0.46% 40.404ms 2 20.202ms 219.02us 40.185ms void DSE::regular_fft_pad<int=0, int=1, int=128, int=16, int=32, int=1, __half, float, float2>(float2*, __half*, int, int3, __half*, int, __half*, __half*, int, int, int, int, int, bool)
0.46% 40.324ms 4 10.081ms 308.78us 38.627ms void DSE::vector_fft<int=0, int=1, int=256, int=16, int=16, int=1, __half, float, float2>(float2*, float2, int, int3, float2*)
0.44% 38.366ms 10 3.8366ms 1.8658ms 7.4346ms void cudnn::detail::explicit_convolve_sgemm<__half, int, int=512, int=6, int=8, int=3, int=3, int=5, int=0, bool=1>(int, int, int, __half const *, int, __half const , int, cudnn::detail::explicit_convolve_sgemm<__half, int, int=512, int=6, int=8, int=3, int=3, int=5, int=0, bool=1>*, kernel_conv_params, int, int, float, float, int, __half const *, __half const *)
0.44% 38.252ms 2 19.126ms 157.96us 38.094ms void DSE::vector_fft<int=0, int=1, int=128, int=8, int=8, int=1, __half, float, float2>(float2*, float2, int, int3, float2*)
0.43% 37.554ms 625 60.086us 17.697us 185.58us void kernelPointwiseApply2<CopyOp<float, at::Half>, float, at::Half, unsigned int, int=1, int=1>(OffsetInfo<at::Half, float, at::Half>, OffsetInfo<CopyOp<float, at::Half>, float, unsigned int>, float, float)
0.42% 36.334ms 8 4.5418ms 2.1470ms 8.1404ms void cudnn::detail::implicit_convolve_sgemm<__half, __half, int=1024, int=6, int=7, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>(int, int, int, __half const *, int, __half*, cudnn::detail::implicit_convolve_sgemm<__half, __half, int=1024, int=6, int=7, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>*, kernel_conv_params, int, float, float, int, __half, __half, int, int)
0.39% 34.189ms 2 17.095ms 106.95us 34.082ms void fft2d_r2c_64x64<__half>(float2*, __half const *, int, int, int, int, int, int, int, int)
0.36% 31.830ms 4 7.9576ms 52.162us 31.622ms void fft2d_r2c_32x32<__half, bool=0, unsigned int=1, bool=0>(float2*, __half const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
0.28% 24.520ms 126 194.61us 190.31us 200.97us volta_s884cudnn_fp16_256x64_ldg8_relu_exp_interior_nhwc_tn_v1
0.23% 19.715ms 5 3.9429ms 295.31us 9.6794ms void cudnn::detail::explicit_convolve_sgemm<__half, int, int=1024, int=5, int=5, int=3, int=3, int=3, int=0, bool=1>(int, int, int, __half const *, int, __half const , int, cudnn::detail::explicit_convolve_sgemm<__half, int, int=1024, int=5, int=5, int=3, int=3, int=3, int=0, bool=1>*, kernel_conv_params, int, int, float, float, int, __half const *, __half const *)
0.19% 16.622ms 4 4.1554ms 1.9669ms 7.4844ms void cudnn::detail::explicit_convolve_sgemm<__half, int, int=1024, int=6, int=7, int=3, int=3, int=5, int=0, bool=1>(int, int, int, __half const *, int, __half const , int, cudnn::detail::explicit_convolve_sgemm<__half, int, int=1024, int=6, int=7, int=3, int=3, int=5, int=0, bool=1>*, kernel_conv_params, int, int, float, float, int, __half const *, __half const *)
0.19% 16.463ms 171 96.273us 768ns 4.1981ms [CUDA memcpy DtoH]
0.14% 12.202ms 3 4.0674ms 1.7587ms 6.9460ms volta_fp16_scudnn_fp16_128x128_relu_small_nn_v1
0.13% 11.769ms 476 24.725us 5.7920us 141.48us void fft2d_r2c_32x32<__half, bool=0, unsigned int=0, bool=0>(float2*, __half const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
0.13% 10.956ms 252 43.476us 4.1600us 84.100us void nchwToNhwcKernel<__half, __half, float, bool=1, bool=1>(int, int, int, int, __half const *, __half*, float, float)
0.12% 10.468ms 2913 3.5930us 1.8240us 9.8240us cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
0.11% 9.8755ms 126 78.376us 77.667us 79.459us void nhwcToNchwKernel<float, __half, float, bool=1, bool=1>(int, int, int, int, float const *, __half*, float, float)
0.11% 9.6742ms 19 509.17us 59.234us 2.1930ms void im2col4d_kernel<__half, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, __half const *, __half*, int)
0.11% 9.4702ms 476 19.895us 10.208us 129.64us void fft2d_c2r_32x32<__half, bool=0, bool=0, unsigned int=0, bool=0, bool=0>(__half*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, __half*, __half*, int2, int, int)
0.11% 9.4329ms 3 3.1443ms 3.0958ms 3.1996ms volta_fp16_scudnn_fp16_128x64_relu_interior_nn_v1
0.10% 9.0751ms 28 324.11us 4.6080us 1.8901ms void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, __half>(cudnn::winograd_nonfused::WinogradFilterParams<float, __half>)
0.05% 4.5987ms 2 2.2993ms 903.46us 3.6952ms void flip_filter<__half, __half>(__half*, __half const *, int, int, int, int)
0.04% 3.7668ms 1 3.7668ms 3.7668ms 3.7668ms void fermiPlusCgemmLDS128_batched<bool=1, bool=0, bool=0, bool=0, int=4, int=4, int=4, int=3, int=3, bool=1, bool=0>(float2* const *, float2* const *, float2* const *, float2*, float2 const *, float2 const *, int, int, int, int, int, int, __int64, __int64, __int64, float2 const *, float2 const *, float2, float2, int)
0.04% 3.1032ms 1 3.1032ms 3.1032ms 3.1032ms volta_fp16_scudnn_fp16_128x128_relu_interior_nn_v1
0.03% 2.8060ms 14 200.43us 7.9680us 1.0719ms void cudnn::winograd::generateWinogradTilesKernel<int=0, __half, float>(cudnn::winograd::GenerateWinogradTilesParams<__half, float>)
0.01% 1.0265ms 2 513.27us 44.226us 982.31us void DSE::regular_fft_clip<int=1, int=2, int=256, int=16, int=16, int=1, __half, float, float2>(__half*, float2*, int, int3, float2*, int, float2*, float2*, int, int, int, int, int, float, float, bool, int, __half, __half)
0.01% 722.59us 1 722.59us 722.59us 722.59us void fft1d_r2c_256<__half, float, float2, bool=0, bool=0>(float2*, __half const *, int3, int3, int2, int2)
0.01% 647.52us 1 647.52us 647.52us 647.52us void cudnn::winograd::winograd3x3Kernel<__half, float, int=1, int=4, int=8, bool=1>(cudnn::maxwell::winograd::KernelParams)
0.01% 593.05us 2 296.52us 22.401us 570.65us void DSE::vector_fft<int=1, int=2, int=256, int=16, int=16, int=1, __half, float, float2>(float2*, float2, int, int3, float2*)
0.01% 494.26us 4 123.57us 15.489us 211.88us compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, float2 const *, int, int)
0.01% 494.04us 1 494.04us 494.04us 494.04us void DSE::regular_fft_clip<int=1, int=2, int=128, int=16, int=32, int=1, __half, float, float2>(__half*, float2*, int, int3, float2*, int, float2*, float2*, int, int, int, int, int, float, float, bool, int, __half, __half)
0.00% 319.82us 1 319.82us 319.82us 319.82us void DSE::vector_fft<int=1, int=2, int=128, int=8, int=8, int=1, __half, float, float2>(float2*, float2, int, int3, float2*)
0.00% 294.89us 1 294.89us 294.89us 294.89us volta_fp16_scudnn_fp16_128x32_relu_small_nn_v1
0.00% 269.23us 756 356ns 256ns 1.3760us [CUDA memset]
0.00% 265.52us 1 265.52us 265.52us 265.52us volta_fp16_scudnn_fp16_128x32_relu_interior_nn_v1
0.00% 249.61us 3 83.203us 80.228us 86.499us void fft2d_c2r_32x32<__half, bool=0, bool=0, unsigned int=1, bool=0, bool=0>(__half*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, __half*, __half*, int2, int, int)
0.00% 175.11us 1 175.11us 175.11us 175.11us void fft2d_c2r_64x64<__half, bool=0>(__half*, float2*, int, int, int, int, int, int, int, int, int, int, float, float, int, __half*, __half*)
0.00% 41.409us 1 41.409us 41.409us 41.409us void fft1d_c2r_256<float2, float, __half, bool=0, bool=1, bool=0, bool=0>(__half*, float2 const *, int3, int3, int2, int, float, float, __half*, __half*)
0.00% 29.793us 1 29.793us 29.793us 29.793us void fft1d_r2c_256<__half, float, float2, bool=1, bool=0>(float2*, __half const *, int3, int3, int2, int2)
API calls: 46.73% 8.51666s 98 86.905ms 13.281us 4.79676s cudaMalloc
21.89% 3.98890s 161 24.776ms 308.59us 632.52ms cudaEventSynchronize
17.35% 3.16134s 8 395.17ms 28.897us 3.16113s cudaStreamCreateWithFlags
5.66% 1.03126s 22923 44.988us 20.257us 1.6283ms cudaLaunchKernel
3.21% 584.21ms 56 10.432ms 3.0080us 89.673ms cudaFree
1.51% 274.56ms 132316 2.0750us 1.3760us 1.0325ms cudaGetDevice
0.97% 176.84ms 890 198.70us 29.089us 5.2887ms cudaMemcpyAsync
0.89% 162.95ms 1 162.95ms 162.95ms 162.95ms cudaDeviceSynchronize
0.53% 97.009ms 37969 2.5540us 1.4400us 1.0500ms cudaSetDevice
0.45% 81.743ms 756 108.13us 19.425us 248.62us cudaMemsetAsync
0.32% 58.447ms 3915 14.928us 2.2080us 118.69us cudaEventRecord
0.11% 20.652ms 47 439.41us 37.986us 4.9120ms cudaMemcpy
0.10% 18.342ms 30262 606ns 224ns 57.603us cudaGetLastError
0.05% 9.3394ms 390 23.947us 6.5600us 149.29us cudaStreamSynchronize
0.05% 9.2902ms 1164 7.9810us 6.2080us 59.970us cudaBindTexture
0.04% 7.3940ms 499 14.817us 10.177us 51.970us cudaEventQuery
0.04% 6.8859ms 566 12.165us 2.2400us 62.946us cudaEventCreateWithFlags
0.02% 4.5489ms 818 5.5600us 2.2720us 37.697us cudaStreamWaitEvent
0.02% 4.4700ms 556 8.0390us 2.0480us 58.722us cudaEventDestroy
0.02% 3.6358ms 1164 3.1230us 2.0800us 35.713us cudaUnbindTexture
0.01% 1.9415ms 161 12.059us 4.2880us 86.851us cudaEventElapsedTime
0.00% 867.68us 2 433.84us 201.96us 665.72us cudaHostAlloc
0.00% 693.24us 19 36.486us 30.914us 55.939us cudaMemGetInfo
0.00% 551.25us 282 1.9540us 768ns 48.546us cuDeviceGetAttribute
0.00% 233.99us 243 962ns 256ns 23.713us cudaGetDeviceCount
0.00% 214.02us 4 53.506us 28.545us 126.09us cudaStreamCreateWithPriority
0.00% 197.35us 19 10.387us 8.8650us 12.225us cudaEventCreate
0.00% 141.38us 30 4.7120us 2.8160us 28.609us cudaFuncSetAttribute
0.00% 61.026us 1 61.026us 61.026us 61.026us cudaGetDeviceProperties
0.00% 45.217us 27 1.6740us 1.5040us 3.6480us cudaDeviceGetAttribute
0.00% 41.185us 5 8.2370us 1.3440us 30.369us cuDeviceGetCount
0.00% 38.754us 1 38.754us 38.754us 38.754us cudaProfilerStart
0.00% 36.034us 76 474ns 224ns 1.2160us cudaCreateChannelDesc
0.00% 35.393us 3 11.797us 7.4560us 20.033us cuDeviceTotalMem
0.00% 15.168us 2 7.5840us 3.6160us 11.552us cuDriverGetVersion
0.00% 9.0560us 4 2.2640us 1.5680us 4.0000us cuDeviceGet
0.00% 8.0320us 3 2.6770us 1.6320us 3.5200us cuDeviceGetUuid
0.00% 8.0320us 2 4.0160us 2.7840us 5.2480us cuInit
0.00% 6.1770us 3 2.0590us 1.8240us 2.3680us cuDeviceGetName
0.00% 5.7920us 1 5.7920us 5.7920us 5.7920us cudaHostGetDevicePointer
0.00% 4.7360us 1 4.7360us 4.7360us 4.7360us cudaDeviceGetStreamPriorityRange
2. When I don't use pyprof inside my code:
The output is pretty much the same as when using pyprof, as can be seen:
==16381== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 15.86% 1.51712s 2111 718.67us 298.13us 1.4808ms volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1
12.59% 1.20421s 479 2.5140ms 110.28us 83.299ms volta_gcgemm_32x32_nt
9.92% 948.72ms 6636 142.97us 2.7200us 1.1057ms void nchwToNhwcKernel<__half, __half, float, bool=1, bool=0>(int, int, int, int, __half const *, __half*, float, float)
9.85% 942.28ms 12 78.523ms 30.786us 442.70ms void transpose_readWrite_alignment_kernel<float2, float2, int=1, bool=0, int=6, int=4, int=4>(cublasTransposeParams<float2>, float2 const *, float2*, float2 const *)
5.50% 526.44ms 3450 152.59us 18.081us 381.27us void op_generic_tensor_kernel<int=2, __half, float, __half, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, cudnnDimOrder_t=0, int=0>(cudnnTensorStruct, __half*, cudnnTensorStruct, __half const *, cudnnTensorStruct, __half const *, float, float, float, float, dimArray, reducedDivisorArray)
5.35% 511.69ms 24 21.320ms 7.3095ms 59.391ms volta_sgemm_128x128_nn
3.82% 365.56ms 452 808.77us 613.98us 1.1781ms volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1
3.23% 309.42ms 3 103.14ms 102.48ms 104.10ms volta_cgemm_32x64_tn
3.14% 300.47ms 2700 111.29us 15.937us 291.47us void kernelPointwiseApply2<ThresholdUpdateOutput<at::Half>, at::Half, at::Half, unsigned int, int=1, int=1>(OffsetInfo<ThresholdUpdateOutput<at::Half>, at::Half, unsigned int>, OffsetInfo<at::Half, at::Half, int=1>, at::Half, at::Half)
3.07% 293.37ms 544 539.29us 361.71us 783.84us void fermiCgemm_v3_kernel<bool=1, bool=0, bool=0, bool=0, int=5, int=5, int=3, int=8, int=8>(int, int, int, float2 const *, int, float2 const *, int, float2*, int, int, int, float2 const *, float2 const *, float2, float2, int)
2.55% 244.14ms 600 406.91us 262.22us 573.91us void nearest_neighbor_4d_kernel<float, float>(int, THCDeviceTensor<float, int=4, int, DefaultPtrTraits>, THCDeviceTensor<float, int=4, int, DefaultPtrTraits>)
2.55% 244.02ms 600 406.70us 115.53us 958.16us void kernelPointwiseApply2<CopyOp<float, float>, float, float, unsigned int, int=-1, int=1>(OffsetInfo<float, float, float>, OffsetInfo<CopyOp<float, float>, float, unsigned int>, float, float)
2.26% 216.27ms 302 716.12us 662.17us 776.71us volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_interior_nhwc2nchw_tn_v1
2.22% 212.78ms 18 11.821ms 26.562us 64.670ms void fft2d_r2c_32x32<__half, bool=0, unsigned int=1, bool=1>(float2*, __half const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
1.67% 159.76ms 796 200.70us 1.4720us 803.27us void kernelPointwiseApply2<CopyOp<at::Half, float>, at::Half, float, unsigned int, int=1, int=1>(OffsetInfo<float, at::Half, float>, OffsetInfo<CopyOp<at::Half, float>, at::Half, unsigned int>, at::Half, at::Half)
1.35% 129.51ms 600 215.85us 60.163us 480.63us void CatArrayBatchedCopy<at::Half, unsigned int, int=4>(at::Half*, CatArrInputTensor<at::Half, unsigned int>*, OutputTensorSizeStride<unsigned int, unsigned int=4>, int, unsigned int)
1.23% 117.89ms 600 196.49us 40.354us 549.27us void kernelPointwiseApply1<TensorFillOp<float>, float, unsigned int, int=1>(OffsetInfo<TensorFillOp<float>, float, unsigned int>, float, float)
1.12% 107.59ms 28 3.8425ms 236.20us 18.196ms void cudnn::winograd_nonfused::winogradForwardData4x4<float, __half>(cudnn::winograd_nonfused::WinogradDataParams<float, __half>)
0.89% 85.471ms 151 566.03us 561.15us 576.76us volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_f2f_exp_interior_nhwc2nchw_tn_v1
0.83% 79.490ms 20 3.9745ms 1.9291ms 7.6921ms void cudnn::detail::implicit_convolve_sgemm<__half, __half, int=512, int=6, int=8, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>(int, int, int, __half const *, int, __half*, cudnn::detail::implicit_convolve_sgemm<__half, __half, int=512, int=6, int=8, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>*, kernel_conv_params, int, float, float, int, __half, __half, int, int)
0.80% 76.094ms 151 503.94us 499.25us 517.59us volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_interior_nhwc2nchw_tn_v1
0.74% 70.377ms 600 117.30us 32.865us 254.44us void MaxPoolForward<at::Half, float>(int, at::Half const *, int, int, int, int, int, int, int, int, int, int, int, int, int, int, at::Half*, long*)
0.69% 65.989ms 28 2.3568ms 347.86us 6.0013ms void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, __half>(cudnn::winograd_nonfused::WinogradOutputParams<float, __half>)
0.62% 59.202ms 13 4.5540ms 1.7954ms 8.2444ms void cudnn::winograd::winograd3x3Kernel<__half, float, int=1, int=4, int=8, bool=0>(cudnn::maxwell::winograd::KernelParams)
0.56% 53.614ms 151 355.06us 347.57us 362.22us volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1
0.52% 50.098ms 891 56.226us 288ns 5.3730ms [CUDA memcpy HtoD]
0.47% 45.327ms 750 60.435us 17.216us 169.80us void kernelPointwiseApply2<CopyOp<float, at::Half>, float, at::Half, unsigned int, int=1, int=1>(OffsetInfo<at::Half, float, at::Half>, OffsetInfo<CopyOp<float, at::Half>, float, unsigned int>, float, float)
0.47% 45.320ms 4 11.330ms 384.88us 43.379ms void DSE::regular_fft_pad<int=0, int=1, int=256, int=16, int=16, int=1, __half, float, float2>(float2*, __half*, int, int3, __half*, int, __half*, __half*, int, int, int, int, int, bool)
0.44% 42.249ms 4 10.562ms 7.0908ms 14.012ms volta_sgemm_128x64_nn
0.44% 41.993ms 10 4.1993ms 333.39us 10.387ms void cudnn::detail::implicit_convolve_sgemm<__half, __half, int=1024, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>(int, int, int, __half const *, int, __half*, cudnn::detail::implicit_convolve_sgemm<__half, __half, int=1024, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>*, kernel_conv_params, int, float, float, int, __half, __half, int, int)
0.43% 41.194ms 10 4.1194ms 1.7911ms 7.2555ms volta_fp16_scudnn_fp16_128x64_relu_small_nn_v1
0.42% 40.298ms 2 20.149ms 216.65us 40.082ms void DSE::regular_fft_pad<int=0, int=1, int=128, int=16, int=32, int=1, __half, float, float2>(float2*, __half*, int, int3, __half*, int, __half*, __half*, int, int, int, int, int, bool)
0.42% 39.841ms 4 9.9603ms 302.86us 38.143ms void DSE::vector_fft<int=0, int=1, int=256, int=16, int=16, int=1, __half, float, float2>(float2*, float2, int, int3, float2*)
0.40% 38.357ms 10 3.8357ms 1.8650ms 7.4328ms void cudnn::detail::explicit_convolve_sgemm<__half, int, int=512, int=6, int=8, int=3, int=3, int=5, int=0, bool=1>(int, int, int, __half const *, int, __half const , int, cudnn::detail::explicit_convolve_sgemm<__half, int, int=512, int=6, int=8, int=3, int=3, int=5, int=0, bool=1>*, kernel_conv_params, int, int, float, float, int, __half const *, __half const *)
0.40% 37.997ms 2 18.999ms 157.32us 37.840ms void DSE::vector_fft<int=0, int=1, int=128, int=8, int=8, int=1, __half, float, float2>(float2*, float2, int, int3, float2*)
0.38% 36.555ms 8 4.5693ms 2.1408ms 8.1353ms void cudnn::detail::implicit_convolve_sgemm<__half, __half, int=1024, int=6, int=7, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>(int, int, int, __half const *, int, __half*, cudnn::detail::implicit_convolve_sgemm<__half, __half, int=1024, int=6, int=7, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>*, kernel_conv_params, int, float, float, int, __half, __half, int, int)
0.36% 34.120ms 2 17.060ms 109.09us 34.011ms void fft2d_r2c_64x64<__half>(float2*, __half const *, int, int, int, int, int, int, int, int)
0.33% 31.785ms 4 7.9463ms 50.531us 31.574ms void fft2d_r2c_32x32<__half, bool=0, unsigned int=1, bool=0>(float2*, __half const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
0.31% 29.443ms 151 194.99us 192.46us 198.44us volta_s884cudnn_fp16_256x64_ldg8_relu_exp_interior_nhwc_tn_v1
0.21% 19.897ms 5 3.9795ms 296.21us 9.7205ms void cudnn::detail::explicit_convolve_sgemm<__half, int, int=1024, int=5, int=5, int=3, int=3, int=3, int=0, bool=1>(int, int, int, __half const *, int, __half const , int, cudnn::detail::explicit_convolve_sgemm<__half, int, int=1024, int=5, int=5, int=3, int=3, int=3, int=0, bool=1>*, kernel_conv_params, int, int, float, float, int, __half const *, __half const *)
0.17% 16.686ms 4 4.1716ms 2.0288ms 7.4851ms void cudnn::detail::explicit_convolve_sgemm<__half, int, int=1024, int=6, int=7, int=3, int=3, int=5, int=0, bool=1>(int, int, int, __half const *, int, __half const , int, cudnn::detail::explicit_convolve_sgemm<__half, int, int=1024, int=6, int=7, int=3, int=3, int=5, int=0, bool=1>*, kernel_conv_params, int, int, float, float, int, __half const *, __half const *)
0.17% 16.375ms 196 83.547us 736ns 4.3144ms [CUDA memcpy DtoH]
0.14% 13.206ms 302 43.728us 4.2250us 84.516us void nchwToNhwcKernel<__half, __half, float, bool=1, bool=1>(int, int, int, int, __half const *, __half*, float, float)
0.13% 12.384ms 3488 3.5500us 1.8240us 10.304us cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
0.13% 12.200ms 3 4.0667ms 1.7532ms 6.9506ms volta_fp16_scudnn_fp16_128x128_relu_small_nn_v1
0.13% 12.036ms 151 79.711us 78.883us 80.804us void nhwcToNchwKernel<float, __half, float, bool=1, bool=1>(int, int, int, int, float const *, __half*, float, float)
0.12% 11.723ms 476 24.629us 5.6960us 123.56us void fft2d_r2c_32x32<__half, bool=0, unsigned int=0, bool=0>(float2*, __half const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
0.10% 9.7015ms 19 510.60us 58.499us 2.2062ms void im2col4d_kernel<__half, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, __half const *, __half*, int)
0.10% 9.5072ms 476 19.973us 10.209us 106.60us void fft2d_c2r_32x32<__half, bool=0, bool=0, unsigned int=0, bool=0, bool=0>(__half*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, __half*, __half*, int2, int, int)
0.10% 9.4330ms 3 3.1443ms 3.0944ms 3.2000ms volta_fp16_scudnn_fp16_128x64_relu_interior_nn_v1
0.09% 8.8468ms 28 315.96us 4.5760us 1.7903ms void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, __half>(cudnn::winograd_nonfused::WinogradFilterParams<float, __half>)
0.05% 4.5737ms 2 2.2869ms 904.62us 3.6691ms void flip_filter<__half, __half>(__half*, __half const *, int, int, int, int)
0.04% 3.7634ms 1 3.7634ms 3.7634ms 3.7634ms void fermiPlusCgemmLDS128_batched<bool=1, bool=0, bool=0, bool=0, int=4, int=4, int=4, int=3, int=3, bool=1, bool=0>(float2* const *, float2* const *, float2* const *, float2*, float2 const *, float2 const *, int, int, int, int, int, int, __int64, __int64, __int64, float2 const *, float2 const *, float2, float2, int)
0.03% 3.1034ms 1 3.1034ms 3.1034ms 3.1034ms volta_fp16_scudnn_fp16_128x128_relu_interior_nn_v1
0.03% 2.7891ms 14 199.22us 7.2650us 1.0794ms void cudnn::winograd::generateWinogradTilesKernel<int=0, __half, float>(cudnn::winograd::GenerateWinogradTilesParams<__half, float>)
0.01% 1.0252ms 2 512.62us 44.034us 981.20us void DSE::regular_fft_clip<int=1, int=2, int=256, int=16, int=16, int=1, __half, float, float2>(__half*, float2*, int, int3, float2*, int, float2*, float2*, int, int, int, int, int, float, float, bool, int, __half, __half)
0.01% 724.35us 1 724.35us 724.35us 724.35us void fft1d_r2c_256<__half, float, float2, bool=0, bool=0>(float2*, __half const *, int3, int3, int2, int2)
0.01% 650.75us 1 650.75us 650.75us 650.75us void cudnn::winograd::winograd3x3Kernel<__half, float, int=1, int=4, int=8, bool=1>(cudnn::maxwell::winograd::KernelParams)
0.01% 645.31us 2 322.65us 23.073us 622.24us void DSE::vector_fft<int=1, int=2, int=256, int=16, int=16, int=1, __half, float, float2>(float2*, float2, int, int3, float2*)
0.01% 493.43us 4 123.36us 15.681us 211.59us compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, float2 const *, int, int)
0.01% 489.33us 1 489.33us 489.33us 489.33us void DSE::regular_fft_clip<int=1, int=2, int=128, int=16, int=32, int=1, __half, float, float2>(__half*, float2*, int, int3, float2*, int, float2*, float2*, int, int, int, int, int, float, float, bool, int, __half, __half)
0.00% 363.09us 906 400ns 256ns 1.4400us [CUDA memset]
0.00% 296.62us 1 296.62us 296.62us 296.62us volta_fp16_scudnn_fp16_128x32_relu_small_nn_v1
0.00% 291.18us 1 291.18us 291.18us 291.18us void DSE::vector_fft<int=1, int=2, int=128, int=8, int=8, int=1, __half, float, float2>(float2*, float2, int, int3, float2*)
0.00% 263.02us 1 263.02us 263.02us 263.02us volta_fp16_scudnn_fp16_128x32_relu_interior_nn_v1
0.00% 251.98us 3 83.993us 79.172us 87.268us void fft2d_c2r_32x32<__half, bool=0, bool=0, unsigned int=1, bool=0, bool=0>(__half*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, __half*, __half*, int2, int, int)
0.00% 180.39us 1 180.39us 180.39us 180.39us void fft2d_c2r_64x64<__half, bool=0>(__half*, float2*, int, int, int, int, int, int, int, int, int, int, float, float, int, __half*, __half*)
0.00% 42.401us 1 42.401us 42.401us 42.401us void fft1d_c2r_256<float2, float, __half, bool=0, bool=1, bool=0, bool=0>(__half*, float2 const *, int3, int3, int2, int, float, float, __half*, __half*)
0.00% 29.377us 1 29.377us 29.377us 29.377us void fft1d_r2c_256<__half, float, float2, bool=1, bool=0>(float2*, __half const *, int3, int3, int2, int2)
API calls: 43.29% 8.73867s 97 90.089ms 14.657us 4.91466s cudaMalloc
19.74% 3.98450s 161 24.748ms 282.54us 631.21ms cudaEventSynchronize
15.78% 3.18612s 8 398.27ms 30.882us 3.18585s cudaStreamCreateWithFlags
9.94% 2.00560s 300 6.6853ms 26.049us 15.733ms cudaDeviceSynchronize
4.14% 834.88ms 27023 30.895us 20.257us 1.5463ms cudaLaunchKernel
3.00% 606.42ms 55 11.026ms 3.2320us 92.019ms cudaFree
1.53% 308.71ms 158516 1.9470us 1.3760us 939.72us cudaGetDevice
0.92% 185.80ms 1040 178.65us 21.825us 6.0409ms cudaMemcpyAsync
0.49% 98.127ms 45544 2.1540us 1.5040us 938.53us cudaSetDevice
0.32% 64.440ms 906 71.125us 20.385us 944.10us cudaMemsetAsync
0.25% 51.422ms 47 1.0941ms 21.281us 31.602ms cudaMemcpy
0.22% 44.631ms 4590 9.7230us 2.4000us 203.82us cudaEventRecord
0.10% 20.989ms 35812 586ns 224ns 870.88us cudaGetLastError
0.05% 10.620ms 440 24.137us 5.5360us 111.37us cudaStreamSynchronize
0.05% 9.8539ms 1293 7.6200us 2.1120us 87.075us cudaEventQuery
0.05% 9.8371ms 1164 8.4510us 6.3040us 128.10us cudaBindTexture
0.03% 5.8189ms 666 8.7370us 2.3040us 64.579us cudaEventCreateWithFlags
0.02% 4.9241ms 655 7.5170us 2.0160us 814.69us cudaEventDestroy
0.02% 4.9055ms 818 5.9960us 2.3040us 87.363us cudaStreamWaitEvent
0.02% 3.5482ms 1164 3.0480us 2.0800us 28.033us cudaUnbindTexture
0.01% 1.9065ms 161 11.841us 4.5440us 71.843us cudaEventElapsedTime
0.00% 935.33us 3 311.78us 114.34us 576.92us cudaHostAlloc
0.00% 703.68us 19 37.035us 27.169us 70.915us cudaMemGetInfo
0.00% 570.30us 282 2.0220us 800ns 54.114us cuDeviceGetAttribute
0.00% 265.48us 19 13.972us 8.5130us 45.729us cudaEventCreate
0.00% 215.14us 243 885ns 256ns 26.657us cudaGetDeviceCount
0.00% 185.26us 30 6.1750us 3.3280us 36.065us cudaFuncSetAttribute
0.00% 152.55us 4 38.137us 32.033us 54.498us cudaStreamCreateWithPriority
0.00% 67.874us 27 2.5130us 1.5040us 23.681us cudaDeviceGetAttribute
0.00% 41.986us 3 13.995us 11.041us 18.016us cuDeviceTotalMem
0.00% 37.090us 76 488ns 256ns 1.5040us cudaCreateChannelDesc
0.00% 35.489us 1 35.489us 35.489us 35.489us cudaGetDeviceProperties
0.00% 12.609us 5 2.5210us 1.1200us 4.6090us cuDeviceGetCount
0.00% 11.424us 3 3.8080us 2.7840us 4.6720us cuDeviceGetUuid
0.00% 7.1040us 3 2.3680us 1.7920us 3.0400us cuDeviceGetName
0.00% 6.9770us 4 1.7440us 960ns 2.6560us cuDeviceGet
0.00% 6.9120us 1 6.9120us 6.9120us 6.9120us cudaHostGetDevicePointer
0.00% 5.8240us 1 5.8240us 5.8240us 5.8240us cudaDeviceGetStreamPriorityRange
0.00% 5.7920us 2 2.8960us 2.8800us 2.9120us cuInit
0.00% 5.6960us 2 2.8480us 1.9520us 3.7440us cuDriverGetVersion
The good thing is that the parser works perfectly and I can use the information.
The bad thing is that I cannot see the PyTorch operations, and the tensor core usage is NA.
python -m apex.pyprof.parse my_output_file.sql > my_output_file.dict
python -m apex.pyprof.prof -w 150 -c kernel,op,sil,tc,flops
Output example:
Kernel Op Sil(ns) TC FLOPs
kernelPointwiseApply2 3680 na 0
kernelPointwiseApply2 1504 na 0
kernelPointwiseApply2 17953 na 0
cudnn::detail::implicit_convolve_sgemm 410066 na 0
cudnn::detail::implicit_convolve_sgemm 395121 na 0
cudnn::gemm::computeOffsetsKernel 10208 na 0
volta_fp16_scudnn_fp16_128x32_relu_small_nn_v1 292813 na 0
nchwToNhwcKernel 83459 na 0
nchwToNhwcKernel 4544 na 0
cudnn::gemm::computeOffsetsKernel 8160 na 0
volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1 407186 na 0
im2col4d_kernel 59427 na 0
cudnn::detail::explicit_convolve_sgemm 428275 na 0
fft2d_r2c_32x32 29345 na 0
fft2d_r2c_32x32 19520 na 0
volta_gcgemm_32x32_nt 126342 na 0
fft2d_c2r_32x32 22593 na 0
fft2d_r2c_32x32 7424 na 0
volta_gcgemm_32x32_nt 119045 na 0
fft2d_c2r_32x32 15169 na 0
fft2d_r2c_32x32 15425 na 0
volta_gcgemm_32x32_nt 118981 na 0
fft2d_c2r_32x32 15777 na 0
fft2d_r2c_32x32 11648 na 0
volta_gcgemm_32x32_nt 115205 na 0
fft2d_c2r_32x32 14624 na 0
fft2d_r2c_32x32 12097 na 0
volta_gcgemm_32x32_nt 117701 na 0
fft2d_c2r_32x32 15809 na 0
fft2d_r2c_32x32 11904 na 0
volta_gcgemm_32x32_nt 121318 na 0
fft2d_c2r_32x32 15169 na 0
fft2d_r2c_32x32 11936 na 0
volta_gcgemm_32x32_nt 116229 na 0
fft2d_c2r_32x32 15809 na 0
fft2d_r2c_32x32 13185 na 0
- Why does the TC column say NA? (I expected 1 when tensor cores are being used and 0 when they are not.)
- Why are all the FLOPs 0?
- Is there a way to know which operations would be able to run on tensor cores but are not currently doing so? Since I see some fp16 kernels, I guess those should run on TC.
I will also post this information on GitHub under apex/pyprof, since it could be an issue with the parser.
Thanks for your help