About the ncu profiling

I used ncu to analyze cdist_backward. I have some questions about the output below.

What do sm__throughput.avg.pct_of_peak_sustained_elapsed and gpu__dram_throughput.avg.pct_of_peak_sustained_elapsed mean?

sm__throughput.avg.pct_of_peak_sustained_elapsed = compute_efficiency?
gpu__dram_throughput.avg.pct_of_peak_sustained_elapsed = io_efficiency?

How are they calculated?
This is my GPU information:

GPU Name = Tesla V100-SXM2-16GB
Compute Capability = 7.0
GPU SMs = 80
GPU CUDA cores = 5120
GPU SM clock rate = 1.530 GHz
GPU Mem clock rate = 0.877 GHz
FP32 Peak Performance = 15667.200 GFLOPS
FP16 Peak Performance = 31334.400 GFLOPS
INT8 Peak Performance = 62668.800 GFLOPS
Tensor Core FP16 Peak Performance = 125337.600 GFLOPS
root@sh815:/work/operator# ncu --print-summary per-kernel --section SpeedOfLight python cdist_backward.py
==PROF== Connected to process 489 (/opt/conda/bin/python3.7)
==PROF== Profiling "cdist_kernel_cuda_impl" - 1: 0%....50%....100% - 19 passes
torch.Size([900, 31])
==PROF== Profiling "cdist_backward_kernel_cuda_impl" - 2: 0%....50%....100% - 19 passes
==PROF== Profiling "reduce_kernel" - 3: 0%....50%....100% - 19 passes
torch.Size([900, 8])
==PROF== Disconnected from process 489
[489] python3.7@127.0.0.1
  void at::native::<unnamed>::cdist_backward_kernel_cuda_impl<float, at::native::<unnamed>::dists<float>::one>(T1 *, const T1 *, const T1 *, const T1 *, const T1 *, long, T1, long, long, long, long, long, long, long), Block Size 1, Grid Size 1, Device 0, 1 invocations 
    Section: GPU Speed Of Light Throughput
    Metric Name                                                      Metric Unit   Minimum      Maximum      Average     
    ---------------------------------------------------------------- ------------- ------------ ------------ ------------
    dram__cycles_elapsed.avg.per_second                              cycle/usecond 802.114804   802.114804   802.114804  
    gpc__cycles_elapsed.avg.per_second                               cycle/nsecond 1.185659     1.185659     1.185659    
    gpc__cycles_elapsed.max                                          cycle         25192.000000 25192.000000 25192.000000
    gpu__compute_memory_throughput.avg.pct_of_peak_sustained_elapsed %             13.941638    13.941638    13.941638   
    gpu__dram_throughput.avg.pct_of_peak_sustained_elapsed           %             0.813250     0.813250     0.813250    
    gpu__time_duration.sum                                           usecond       21.184000    21.184000    21.184000   
    l1tex__throughput.avg.pct_of_peak_sustained_active               %             15.368853    15.368853    15.368853   
    lts__throughput.avg.pct_of_peak_sustained_elapsed                %             2.568912     2.568912     2.568912    
    sm__cycles_active.avg                                            cycle         22784.962500 22784.962500 22784.962500
    sm__throughput.avg.pct_of_peak_sustained_elapsed                 %             60.943825    60.943825    60.943825   

  void at::native::<unnamed>::cdist_kernel_cuda_impl<float, at::native::<unnamed>::dists<float>::one>(T1 *, const T1 *, const T1 *, T1, long, long, long, long, long, long), Block Size 1, Grid Size 1, Device 0, 1 invocations 
    Section: GPU Speed Of Light Throughput
    Metric Name                                                      Metric Unit   Minimum       Maximum       Average      
    ---------------------------------------------------------------- ------------- ------------- ------------- -------------
    dram__cycles_elapsed.avg.per_second                              cycle/usecond 870.075509    870.075509    870.075509   
    gpc__cycles_elapsed.avg.per_second                               cycle/nsecond 1.297918      1.297918      1.297918     
    gpc__cycles_elapsed.max                                          cycle         253287.000000 253287.000000 253287.000000
    gpu__compute_memory_throughput.avg.pct_of_peak_sustained_elapsed %             12.712454     12.712454     12.712454    
    gpu__dram_throughput.avg.pct_of_peak_sustained_elapsed           %             0.017300      0.017300      0.017300     
    gpu__time_duration.sum                                           usecond       194.944000    194.944000    194.944000   
    l1tex__throughput.avg.pct_of_peak_sustained_active               %             12.822529     12.822529     12.822529    
    lts__throughput.avg.pct_of_peak_sustained_elapsed                %             0.575559      0.575559      0.575559     
    sm__cycles_active.avg                                            cycle         250853.300000 250853.300000 250853.300000
    sm__throughput.avg.pct_of_peak_sustained_elapsed                 %             76.590542     76.590542     76.590542    

  void at::native::reduce_kernel<(int)128, (int)4, at::native::ReduceOp<float, at::native::func_wrapper_t<float, at::native::sum_functor<float, float, float>::operator ()(at::TensorIterator &)::[lambda(float, float) (instance 1)]>, unsigned int, float, (int)4>>(T3), Block Size 1, Grid Size 1, Device 0, 1 invocations 
    Section: GPU Speed Of Light Throughput
    Metric Name                                                      Metric Unit   Minimum     Maximum     Average    
    ---------------------------------------------------------------- ------------- ----------- ----------- -----------
    dram__cycles_elapsed.avg.per_second                              cycle/usecond 610.294118  610.294118  610.294118 
    gpc__cycles_elapsed.avg.per_second                               cycle/usecond 912.055760  912.055760  912.055760 
    gpc__cycles_elapsed.max                                          cycle         9933.000000 9933.000000 9933.000000
    gpu__compute_memory_throughput.avg.pct_of_peak_sustained_elapsed %             13.133001   13.133001   13.133001  
    gpu__dram_throughput.avg.pct_of_peak_sustained_elapsed           %             13.133001   13.133001   13.133001  
    gpu__time_duration.sum                                           usecond       10.880000   10.880000   10.880000  
    l1tex__throughput.avg.pct_of_peak_sustained_active               %             23.517313   23.517313   23.517313  
    lts__throughput.avg.pct_of_peak_sustained_elapsed                %             5.245606    5.245606    5.245606   
    sm__cycles_active.avg                                            cycle         1482.950000 1482.950000 1482.950000
    sm__throughput.avg.pct_of_peak_sustained_elapsed                 %             0.998429    0.998429    0.998429