==PROF== Connected to process 3008613 (/usr/local/bin/cutlass_profiler) ==PROF== Profiling "BlockForEach" - 1: 0%....50%....100% - 3 passes ==PROF== Profiling "BlockForEach" - 2: 0%....50%....100% - 3 passes ==PROF== Profiling "BlockForEach" - 3: 0%....50%....100% - 3 passes ==PROF== Profiling "Kernel" - 4: 0%....50%....100% - 3 passes ==PROF== Profiling "Kernel" - 5: 0%....50%....100% - 3 passes ==PROF== Profiling "BlockCompareRelativelyEqual" - 6: 0%....50%....100% - 3 passes ==PROF== Profiling "GemmComplex" - 7: 0%....50%....100% - 3 passes ==PROF== Profiling "BlockCompareRelativelyEqual" - 8: 0%....50%....100% - 3 passes ==PROF== Profiling "Kernel" - 9: 0%....50%....100% - 3 passes ==PROF== Profiling "Kernel" - 10: 0%....50%....100% - 3 passes ==PROF== Profiling "Kernel" - 11: 0%....50%....100% - 3 passes ==PROF== Profiling "Kernel" - 12: 0%....50%....100% - 3 passes ==PROF== Profiling "Kernel" - 13: 0%....50%....100% - 3 passes ==PROF== Profiling "Kernel" - 14: 0%....50%....100% - 3 passes ==PROF== Profiling "Kernel" - 15: 0%....50%....100% - 3 passes ==PROF== Profiling "Kernel" - 16: 0%....50%....100% - 3 passes ==PROF== Profiling "Kernel" - 17: 0%....50%....100% - 3 passes ==PROF== Profiling "Kernel" - 18: 0%....50%....100% - 3 passes ==PROF== Profiling "Kernel" - 19: 0%....50%....100% - 3 passes ============================= Problem ID: 1 Provider: CUTLASS OperationKind: gemm Operation: cutlass_tensorop_h16816gemm_256x128_32x3_nn_align8 Status: Success Verification: ON Disposition: Passed reference_device: Passed cuBLAS: Passed Arguments: --gemm_kind=universal --m=224 --n=224 --k=224 --A=f16:column --B=f16:column --C=f16:column --alpha=1 \ --beta=0 --split_k_slices=1 --batch_count=1 --op_class=tensorop --accum=f16 --cta_m=256 --cta_n=128 \ --cta_k=32 --stages=3 --warps_m=4 --warps_n=2 --warps_k=1 --inst_m=16 --inst_n=8 --inst_k=16 --min_cc=80 \ --max_cc=1024 Bytes: 301056 bytes FLOPs: 22579200 flops Runtime: 220.331 ms Memory: 0.00127254 GiB/s Math: 0.102479 GFLOP/s ============================= CSV Results: Problem,Provider,OperationKind,Operation,Disposition,Status,gemm_kind,m,n,k,A,B,C,alpha,beta,split_k_slices,batch_count,op_class,accum,cta_m,cta_n,cta_k,stages,warps_m,warps_n,warps_k,inst_m,inst_n,inst_k,min_cc,max_cc,Bytes,Flops,Runtime,GB/s,GFLOPs 1,CUTLASS,gemm,cutlass_tensorop_h16816gemm_256x128_32x3_nn_align8,passed,success,universal,224,224,224,f16:column,f16:column,f16:column,1,0,1,1,tensorop,f16,256,128,32,3,4,2,1,16,8,16,80,1024,301056,22579200,220.331,0.00127254,0.102479 ==PROF== Disconnected from process 3008613 [3008613] cutlass_profiler@127.0.0.1 void cutlass::reference::device::kernel::BlockForEach>(T1 *, unsigned long, T2::Params), 2022-Oct-04 22:14:25, Context 1, Stream 7 Section: Command line profiler metrics ---------------------------------------------------------------------- --------------- ------------------------------ dram__bytes.sum Mbyte 25.77 sm__cycles_elapsed.avg cycle 318225.04 sm__cycles_elapsed.avg.per_second cycle/nsecond 1.15 sm__inst_executed_pipe_tensor.sum inst 0 sm__sass_thread_inst_executed_op_ffma_pred_on.sum inst 41947136 sm__sass_thread_inst_executed_op_hfma_pred_on.sum inst 0 ---------------------------------------------------------------------- --------------- ------------------------------ void cutlass::reference::device::kernel::BlockForEach>(T1 *, unsigned long, T2::Params), 2022-Oct-04 22:14:25, Context 1, Stream 7 Section: Command line profiler metrics ---------------------------------------------------------------------- --------------- ------------------------------ dram__bytes.sum Mbyte 25.75 sm__cycles_elapsed.avg cycle 320914.26 sm__cycles_elapsed.avg.per_second cycle/nsecond 1.15 sm__inst_executed_pipe_tensor.sum inst 0 sm__sass_thread_inst_executed_op_ffma_pred_on.sum inst 41947136 sm__sass_thread_inst_executed_op_hfma_pred_on.sum inst 0 ---------------------------------------------------------------------- --------------- ------------------------------ void cutlass::reference::device::kernel::BlockForEach>(T1 *, unsigned long, T2::Params), 2022-Oct-04 22:14:25, Context 1, Stream 7 Section: Command line profiler metrics ---------------------------------------------------------------------- --------------- ------------------------------ dram__bytes.sum Mbyte 25.72 sm__cycles_elapsed.avg cycle 318814.52 sm__cycles_elapsed.avg.per_second cycle/nsecond 1.15 sm__inst_executed_pipe_tensor.sum inst 0 sm__sass_thread_inst_executed_op_ffma_pred_on.sum inst 41947136 sm__sass_thread_inst_executed_op_hfma_pred_on.sum inst 0 ---------------------------------------------------------------------- --------------- ------------------------------ void cutlass::Kernel(T1::Params), 2022-Oct-04 22:14:26, Context 1, Stream 7 Section: Command line profiler metrics ---------------------------------------------------------------------- --------------- ------------------------------ dram__bytes.sum Kbyte 225.28 sm__cycles_elapsed.avg cycle 18982.65 sm__cycles_elapsed.avg.per_second cycle/nsecond 1.15 sm__inst_executed_pipe_tensor.sum inst 7168 sm__sass_thread_inst_executed_op_ffma_pred_on.sum inst 0 sm__sass_thread_inst_executed_op_hfma_pred_on.sum inst 0 ---------------------------------------------------------------------- --------------- ------------------------------ void cutlass::Kernel(T1::Params), 2022-Oct-04 22:14:27, Context 1, Stream 7 Section: Command line profiler metrics ---------------------------------------------------------------------- --------------- ------------------------------ dram__bytes.sum Kbyte 215.81 sm__cycles_elapsed.avg cycle 9189.33 sm__cycles_elapsed.avg.per_second cycle/nsecond 1.14 sm__inst_executed_pipe_tensor.sum inst 6272 sm__sass_thread_inst_executed_op_ffma_pred_on.sum inst 0 sm__sass_thread_inst_executed_op_hfma_pred_on.sum inst 0 ---------------------------------------------------------------------- --------------- ------------------------------ void cutlass::reference::device::kernel::BlockCompareRelativelyEqual(int *, const T1 *, const T1 *, unsigned long, T1, T1), 2022-Oct-04 22:14:27, Context 1, Stream 7 Section: Command line profiler metrics ---------------------------------------------------------------------- --------------- ------------------------------ dram__bytes.sum Kbyte 204.80 sm__cycles_elapsed.avg cycle 6234.31 sm__cycles_elapsed.avg.per_second cycle/nsecond 1.14 sm__inst_executed_pipe_tensor.sum inst 0 sm__sass_thread_inst_executed_op_ffma_pred_on.sum inst 0 sm__sass_thread_inst_executed_op_hfma_pred_on.sum inst 0 ---------------------------------------------------------------------- --------------- ------------------------------ void cutlass::reference::device::kernel::GemmComplex, cutlass::multiply_add, (int)4, (int)4>(cutlass::gemm::GemmCoord, T7, cutlass::TensorRef, cutlass::ComplexTransform, cutlass::TensorRef, cutlass::ComplexTransform, T7, cutlass::TensorRef, cutlass::TensorRef, T8, int, long, long, long, long), 2022-Oct-04 22:14:28, Context 1, Stream 7 Section: Command line profiler metrics ---------------------------------------------------------------------- --------------- ------------------------------ dram__bytes.sum Kbyte 312.19 sm__cycles_elapsed.avg cycle 241570.65 sm__cycles_elapsed.avg.per_second cycle/nsecond 1.15 sm__inst_executed_pipe_tensor.sum inst 0 sm__sass_thread_inst_executed_op_ffma_pred_on.sum inst 0 sm__sass_thread_inst_executed_op_hfma_pred_on.sum inst 11289600 ---------------------------------------------------------------------- --------------- ------------------------------ void cutlass::reference::device::kernel::BlockCompareRelativelyEqual(int *, const T1 *, const T1 *, unsigned long, T1, T1), 2022-Oct-04 22:14:28, Context 1, Stream 7 Section: Command line profiler metrics ---------------------------------------------------------------------- --------------- ------------------------------ dram__bytes.sum Kbyte 204.80 sm__cycles_elapsed.avg cycle 6176.72 sm__cycles_elapsed.avg.per_second cycle/nsecond 1.14 sm__inst_executed_pipe_tensor.sum inst 0 sm__sass_thread_inst_executed_op_ffma_pred_on.sum inst 0 sm__sass_thread_inst_executed_op_hfma_pred_on.sum inst 0 ---------------------------------------------------------------------- --------------- ------------------------------ void cutlass::Kernel(T1::Params), 2022-Oct-04 22:14:28, Context 1, Stream 7 Section: Command line profiler metrics ---------------------------------------------------------------------- --------------- ------------------------------ dram__bytes.sum Kbyte 225.66 sm__cycles_elapsed.avg cycle 19489.52 sm__cycles_elapsed.avg.per_second cycle/nsecond 1.15 sm__inst_executed_pipe_tensor.sum inst 7168 sm__sass_thread_inst_executed_op_ffma_pred_on.sum inst 0 sm__sass_thread_inst_executed_op_hfma_pred_on.sum inst 0 ---------------------------------------------------------------------- --------------- ------------------------------ void cutlass::Kernel(T1::Params), 2022-Oct-04 22:14:28, Context 1, Stream 7 Section: Command line profiler metrics ---------------------------------------------------------------------- --------------- ------------------------------ dram__bytes.sum Kbyte 227.71 sm__cycles_elapsed.avg cycle 19570.13 sm__cycles_elapsed.avg.per_second cycle/nsecond 1.15 sm__inst_executed_pipe_tensor.sum inst 7168 sm__sass_thread_inst_executed_op_ffma_pred_on.sum inst 0 sm__sass_thread_inst_executed_op_hfma_pred_on.sum inst 0 ---------------------------------------------------------------------- --------------- ------------------------------ void cutlass::Kernel(T1::Params), 2022-Oct-04 22:14:29, Context 1, Stream 7 Section: Command line profiler metrics ---------------------------------------------------------------------- --------------- ------------------------------ dram__bytes.sum Kbyte 225.66 sm__cycles_elapsed.avg cycle 19465.74 sm__cycles_elapsed.avg.per_second cycle/nsecond 1.15 sm__inst_executed_pipe_tensor.sum inst 7168 sm__sass_thread_inst_executed_op_ffma_pred_on.sum inst 0 sm__sass_thread_inst_executed_op_hfma_pred_on.sum inst 0 ---------------------------------------------------------------------- --------------- ------------------------------ void cutlass::Kernel(T1::Params), 2022-Oct-04 22:14:29, Context 1, Stream 7 Section: Command line profiler metrics ---------------------------------------------------------------------- --------------- ------------------------------ dram__bytes.sum Kbyte 225.79 sm__cycles_elapsed.avg cycle 19537.72 sm__cycles_elapsed.avg.per_second cycle/nsecond 1.15 sm__inst_executed_pipe_tensor.sum inst 7168 sm__sass_thread_inst_executed_op_ffma_pred_on.sum inst 0 sm__sass_thread_inst_executed_op_hfma_pred_on.sum inst 0 ---------------------------------------------------------------------- --------------- ------------------------------ void cutlass::Kernel(T1::Params), 2022-Oct-04 22:14:29, Context 1, Stream 7 Section: Command line profiler metrics ---------------------------------------------------------------------- --------------- ------------------------------ dram__bytes.sum Kbyte 225.79 sm__cycles_elapsed.avg cycle 19462.37 sm__cycles_elapsed.avg.per_second cycle/nsecond 1.14 sm__inst_executed_pipe_tensor.sum inst 7168 sm__sass_thread_inst_executed_op_ffma_pred_on.sum inst 0 sm__sass_thread_inst_executed_op_hfma_pred_on.sum inst 0 ---------------------------------------------------------------------- --------------- ------------------------------ void cutlass::Kernel(T1::Params), 2022-Oct-04 22:14:29, Context 1, Stream 7 Section: Command line profiler metrics ---------------------------------------------------------------------- --------------- ------------------------------ dram__bytes.sum Kbyte 225.79 sm__cycles_elapsed.avg cycle 19640.63 sm__cycles_elapsed.avg.per_second cycle/nsecond 1.15 sm__inst_executed_pipe_tensor.sum inst 7168 sm__sass_thread_inst_executed_op_ffma_pred_on.sum inst 0 sm__sass_thread_inst_executed_op_hfma_pred_on.sum inst 0 ---------------------------------------------------------------------- --------------- ------------------------------ void cutlass::Kernel(T1::Params), 2022-Oct-04 22:14:30, Context 1, Stream 7 Section: Command line profiler metrics ---------------------------------------------------------------------- --------------- ------------------------------ dram__bytes.sum Kbyte 225.79 sm__cycles_elapsed.avg cycle 19555.70 sm__cycles_elapsed.avg.per_second cycle/nsecond 1.15 sm__inst_executed_pipe_tensor.sum inst 7168 sm__sass_thread_inst_executed_op_ffma_pred_on.sum inst 0 sm__sass_thread_inst_executed_op_hfma_pred_on.sum inst 0 ---------------------------------------------------------------------- --------------- ------------------------------ void cutlass::Kernel(T1::Params), 2022-Oct-04 22:14:30, Context 1, Stream 7 Section: Command line profiler metrics ---------------------------------------------------------------------- --------------- ------------------------------ dram__bytes.sum Kbyte 225.79 sm__cycles_elapsed.avg cycle 19453.83 sm__cycles_elapsed.avg.per_second cycle/nsecond 1.15 sm__inst_executed_pipe_tensor.sum inst 7168 sm__sass_thread_inst_executed_op_ffma_pred_on.sum inst 0 sm__sass_thread_inst_executed_op_hfma_pred_on.sum inst 0 ---------------------------------------------------------------------- --------------- ------------------------------ void cutlass::Kernel(T1::Params), 2022-Oct-04 22:14:30, Context 1, Stream 7 Section: Command line profiler metrics ---------------------------------------------------------------------- --------------- ------------------------------ dram__bytes.sum Kbyte 225.79 sm__cycles_elapsed.avg cycle 19502.76 sm__cycles_elapsed.avg.per_second cycle/nsecond 1.15 sm__inst_executed_pipe_tensor.sum inst 7168 sm__sass_thread_inst_executed_op_ffma_pred_on.sum inst 0 sm__sass_thread_inst_executed_op_hfma_pred_on.sum inst 0 ---------------------------------------------------------------------- --------------- ------------------------------ void cutlass::Kernel(T1::Params), 2022-Oct-04 22:14:30, Context 1, Stream 7 Section: Command line profiler metrics ---------------------------------------------------------------------- --------------- ------------------------------ dram__bytes.sum Kbyte 225.66 sm__cycles_elapsed.avg cycle 19668.22 sm__cycles_elapsed.avg.per_second cycle/nsecond 1.15 sm__inst_executed_pipe_tensor.sum inst 7168 sm__sass_thread_inst_executed_op_ffma_pred_on.sum inst 0 sm__sass_thread_inst_executed_op_hfma_pred_on.sum inst 0 ---------------------------------------------------------------------- --------------- ------------------------------ void cutlass::Kernel(T1::Params), 2022-Oct-04 22:14:30, Context 1, Stream 7 Section: Command line profiler metrics ---------------------------------------------------------------------- --------------- ------------------------------ dram__bytes.sum Kbyte 225.79 sm__cycles_elapsed.avg cycle 20080.44 sm__cycles_elapsed.avg.per_second cycle/nsecond 1.15 sm__inst_executed_pipe_tensor.sum inst 7168 sm__sass_thread_inst_executed_op_ffma_pred_on.sum inst 0 sm__sass_thread_inst_executed_op_hfma_pred_on.sum inst 0 ---------------------------------------------------------------------- --------------- ------------------------------