Allocating more share memory than needed resulted in bank conflict

sz16 · October 18, 2024, 3:11pm

Hello, I have a piece of code written in Cutlass 3 which loads a matrix from global memory to shared memory. The thread layout is very naive, where each thread loads a 2 byte element with no stride.

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <cute/layout.hpp>
#include <cute/tensor.hpp>

#include "cutlass/util/print_error.hpp"
#include "cutlass/util/GPU_Clock.hpp"
#include "cutlass/util/helper_cuda.hpp"


using Element = cutlass::half_t;

/// Vectorized copy kernel.
///
/// Uses `make_tiled_copy()` to perform a copy using vector instructions. This operation
/// has the precondition that pointers are aligned to the vector size.
///
template <typename TS, typename TD, class ThreadLayout, class VecLayout>
__global__ void copy_kernel_vectorized(__grid_constant__ const TS *const S, __grid_constant__ const TD *const D, ThreadLayout, VecLayout)
{
  using namespace cute;
  constexpr auto tensor_shape = Shape<_16, _32>{};
  __shared__ char smem_[size(tensor_shape) * sizeof(Element)];

  Tensor gS = make_tensor(
    make_gmem_ptr(const_cast<Element *>(reinterpret_cast<const Element *>(S))),
    make_layout(Shape<_16, _32>{}, GenRowMajor{}));
  Tensor gD = make_tensor(make_gmem_ptr(const_cast<Element *>(D)), make_layout(tensor_shape, GenRowMajor{}));

  Tensor sS = make_tensor(make_smem_ptr(const_cast<Element *>(reinterpret_cast<const Element *>(smem_))), make_layout(tensor_shape, GenRowMajor{}));

  // Define `AccessType` which controls the size of the actual memory access.
  using AccessType = cutlass::AlignedArray<Element, size(VecLayout{})>;

  // A copy atom corresponds to one hardware memory access.
  using Atom = Copy_Atom<DefaultCopy, Element>;

  auto tiled_copy =
    make_tiled_copy(
      Atom{},                       // access size
      ThreadLayout{},               // thread layout
      VecLayout{});                 // vector layout (e.g. 4x1)

  // Construct a Tensor corresponding to each thread's slice.
  auto thr_copy = tiled_copy.get_thread_slice(threadIdx.x);

  Tensor thr_tile_S = thr_copy.partition_S(gS);             // (CopyOp, CopyM, CopyN)
  Tensor thr_tile_D = thr_copy.partition_D(gD);             // (CopyOp, CopyM, CopyN)
  Tensor tSsSD = thr_copy.partition_D(sS); // (CopyOp, CopyM, CopyN)
  Tensor tSsSS = thr_copy.partition_S(sS); // (CopyOp, CopyM, CopyN)


  for (int i = 0; i < 65536 / 16; i++)
  {
    copy(tiled_copy, thr_tile_S, tSsSD);
    thr_tile_S.data() = thr_tile_S.data() + int(size(tensor_shape));
  }
}

/// Main function
int main(int argc, char** argv)
{

  using namespace cute;

  auto tensor_shape = cute::Shape<cute::_65536, cute::_32>{};

  thrust::host_vector<Element> h_S(size(tensor_shape));
  thrust::host_vector<Element> h_D(size(tensor_shape));

  for (size_t i = 0; i < h_S.size(); ++i) {
    h_S[i] = static_cast<Element>(static_cast<float>(i));
    h_D[i] = Element{};
  }

  thrust::device_vector<Element> d_S = h_S;
  thrust::device_vector<Element> d_D = h_D;

  //
  // Make tensors
  //

  Tensor tensor_S = make_tensor(make_gmem_ptr(thrust::raw_pointer_cast(d_S.data())), make_layout(tensor_shape, GenRowMajor{}));
  Tensor tensor_D = make_tensor(make_gmem_ptr(thrust::raw_pointer_cast(d_D.data())), make_layout(tensor_shape, GenRowMajor{}));

  // Thread arrangement
  Layout thr_layout = make_layout(make_shape(Int<8>{}, Int<32>{}), GenRowMajor{});

  // Vector dimensions
  Layout vec_layout = make_layout(make_shape(Int<1>{}, Int<1>{}), GenRowMajor{});

  //
  // Determine grid and block dimensions
  //

  dim3 gridDim (1024);   // Grid shape corresponds to modes m' and n'
  dim3 blockDim(size(thr_layout));

  //
  // Launch the kernel
  //
  copy_kernel_vectorized<<< gridDim, blockDim >>>(
    tensor_S.data().get(),
    tensor_D.data().get(),
    thr_layout,
    vec_layout);

  cudaError result = cudaDeviceSynchronize();
  if (result != cudaSuccess) {
    std::cerr << "CUDA Runtime error: " << cudaGetErrorString(result) << std::endl;
    return -1;
  }

  std::cout << "Success." << std::endl;

  return 0;
}

The above code compiles and runs with no bank conflict, as confirmed by nsight compute

cutlass/build$ make tiled_copy && ncu --section MemoryWorkloadAnalysis --metric l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum --launch-count 1 ./examples/cute/tutorial/tiled_copy
Building CUDA object examples/cute/tutorial/CMakeFiles/tiled_copy.dir/tiled_copy.cu.o
Linking CUDA executable tiled_copy
Built target tiled_copy
==PROF== Connected to process 3901463 (/home/sean/manifest2/packages/state_kernel/csrc/cutlass/build/examples/cute/tutorial/tiled_copy)
==PROF== Profiling "copy_kernel_vectorized" - 0 (1/1): 0%....50%....100% - 7 passes
Success.
==PROF== Disconnected from process 3901463
[3901463] tiled_copy@127.0.0.1
  void copy_kernel_vectorized<half_t, half_t, Layout<tuple<C<8>, C<32>>, tuple<C<32>, C<1>>>, Layout<tuple<C<1>, C<1>>, tuple<C<0>, C<0>>>>(const T1 *, const T2 *, T3, T4) (1024, 1, 1)x(256, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
    Section: Command line profiler metrics
    -------------------------------------------------------- ----------- ------------
    Metric Name                                              Metric Unit Metric Value
    -------------------------------------------------------- ----------- ------------
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum                        0
    -------------------------------------------------------- ----------- ------------

    Section: Memory Workload Analysis
    --------------------------- ----------- ------------
    Metric Name                 Metric Unit Metric Value
    --------------------------- ----------- ------------
    Memory Throughput               Mbyte/s       885.50
    Mem Busy                              %         9.11
    Max Bandwidth                         %        17.90
    L1/TEX Hit Rate                       %        91.80
    L2 Compression Success Rate           %            0
    L2 Compression Ratio                               0
    L2 Hit Rate                           %        99.85
    Mem Pipes Busy                        %        17.90
    --------------------------- ----------- ------------

However, if I add more share memory than necessary by changing this line

__shared__ char smem_[size(tensor_shape) * sizeof(Element)];

to

__shared__ char smem_[size(tensor_shape) * sizeof(Element) * 16];

running it again will give

cutlass/build$ make tiled_copy && ncu --section MemoryWorkloadAnalysis --metric l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum --launch-count 1 ./examples/cute/tutorial/tiled_copy
Building CUDA object examples/cute/tutorial/CMakeFiles/tiled_copy.dir/tiled_copy.cu.o
Linking CUDA executable tiled_copy
Built target tiled_copy
==PROF== Connected to process 3902130 (/home/sean/manifest2/packages/state_kernel/csrc/cutlass/build/examples/cute/tutorial/tiled_copy)
==PROF== Profiling "copy_kernel_vectorized" - 0 (1/1): 0%....50%....100% - 7 passes
Success.
==PROF== Disconnected from process 3902130
[3902130] tiled_copy@127.0.0.1
  void copy_kernel_vectorized<half_t, half_t, Layout<tuple<C<8>, C<32>>, tuple<C<32>, C<1>>>, Layout<tuple<C<1>, C<1>>, tuple<C<0>, C<0>>>>(const T1 *, const T2 *, T3, T4) (1024, 1, 1)x(256, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
    Section: Command line profiler metrics
    -------------------------------------------------------- ----------- ------------
    Metric Name                                              Metric Unit Metric Value
    -------------------------------------------------------- ----------- ------------
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum                       22
    -------------------------------------------------------- ----------- ------------

    Section: Memory Workload Analysis
    --------------------------- ----------- ------------
    Metric Name                 Metric Unit Metric Value
    --------------------------- ----------- ------------
    Memory Throughput               Mbyte/s       888.89
    Mem Busy                              %         8.87
    Max Bandwidth                         %        17.41
    L1/TEX Hit Rate                       %        91.80
    L2 Compression Success Rate           %            0
    L2 Compression Ratio                               0
    L2 Hit Rate                           %        97.11
    Mem Pipes Busy                        %        17.41
    --------------------------- ----------- ------------

which shows 22 bank conflict. I suspect this is the compiler doing some optimization (since it sees the large space to be used?) which breaks the synchronization of threads doing copy.

Then I added a __syncwarp after the copy, meaning instead of

  for (int i = 0; i < 65536 / 16; i++)
  {
    copy(tiled_copy, thr_tile_S, tSsSD);
    thr_tile_S.data() = thr_tile_S.data() + int(size(tensor_shape));
  }

I do

  for (int i = 0; i < 65536 / 16; i++)
  {
    copy(tiled_copy, thr_tile_S, tSsSD);
    __syncwarp();
    thr_tile_S.data() = thr_tile_S.data() + int(size(tensor_shape));
  }

Running it again gives 5,713,107 bank conflict.

cutlass/build$ make tiled_copy && ncu --section MemoryWorkloadAnalysis --metric l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum --launch-count 1 ./examples/cute/tutorial/tiled_copy
Building CUDA object examples/cute/tutorial/CMakeFiles/tiled_copy.dir/tiled_copy.cu.o
Linking CUDA executable tiled_copy
Built target tiled_copy
==PROF== Connected to process 3902926 (/home/sean/manifest2/packages/state_kernel/csrc/cutlass/build/examples/cute/tutorial/tiled_copy)
==PROF== Profiling "copy_kernel_vectorized" - 0 (1/1): 0%....50%....100% - 7 passes
Success.
==PROF== Disconnected from process 3902926
[3902926] tiled_copy@127.0.0.1
  void copy_kernel_vectorized<half_t, half_t, Layout<tuple<C<8>, C<32>>, tuple<C<32>, C<1>>>, Layout<tuple<C<1>, C<1>>, tuple<C<0>, C<0>>>>(const T1 *, const T2 *, T3, T4) (1024, 1, 1)x(256, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
    Section: Command line profiler metrics
    -------------------------------------------------------- ----------- ------------
    Metric Name                                              Metric Unit Metric Value
    -------------------------------------------------------- ----------- ------------
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum                5,713,107
    -------------------------------------------------------- ----------- ------------

    Section: Memory Workload Analysis
    --------------------------- ----------- ------------
    Metric Name                 Metric Unit Metric Value
    --------------------------- ----------- ------------
    Memory Throughput               Gbyte/s         1.85
    Mem Busy                              %        85.43
    Max Bandwidth                         %        92.53
    L1/TEX Hit Rate                       %         0.09
    L2 Compression Success Rate           %            0
    L2 Compression Ratio                               0
    L2 Hit Rate                           %        99.90
    Mem Pipes Busy                        %        92.53
    --------------------------- ----------- ------------

Could this be the compiler is reordering the copy in a way that I don’t quite understand?

Robert_Crovella · October 19, 2024, 1:01am

You can study what the compiler is doing by studying the SASS code. The cuobjdump tool can be used for that, or even godbolt

I’m of the strong opinion that bank-conflicted access (or not) can be deduced from the source code. The idea that the compiler could/would do something contrary to that is doubtful to me, although I have been wrong before.

I do not think it is strictly possible with 100% accuracy in all cases to determine the presence or absence of shared memory bank conflicts by studying a single metric. At least partly based on this, and partly based on the observations I have made myself from time to time in the past that clearly un-bank-conflicted code shows up as bank conflicts in one metric or another.

sz16 · October 19, 2024, 3:32am

@Robert_Crovella Thanks! I checked L1 wavefront excessive and there seems to be no excessive wavefronts, but does this mean there’s no bank conflict at all？If so, why does this contradict to the l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum metric? Is the description “# of shared memory data bank conflicts generated by STs, ST, 3D attribute stores, LDGSTS” of the metric not accurate?

sz16 · October 19, 2024, 4:05am

The full nsight compute result is attached

Robert_Crovella · October 19, 2024, 6:09am

I don’t really know if your code has bank conflicts or not. I haven’t studied the code. That is the only method I have ever used to be confident about answering such a question. I’m convinced the method is both possible and reliable in every case except a data-dependent load pattern, in which case I view the question as moot.

You don’t seem to want to follow either of the suggestions I gave you. I don’t have any other suggestions. I won’t be able to help with the metric interpretation. Sorry. Another possible resource for profiler questions is one of the profiler forums.

sz16 · October 19, 2024, 6:12pm

Sorry for the miscommunication, I tried to study the sass output (as shown below) of my code but wasn’t able to understand it


Fatbin ptx code:
================
arch = sm_70
code version = [8,2]
host = linux
compile_size = 64bit
compressed

Fatbin elf code:
================
arch = sm_70
code version = [1,7]
host = linux
compile_size = 64bit

	code for sm_70
		Function : _ZN3cub45CUB_200200_700_720_750_800_860_870_890_900_NS11EmptyKernelIvEEvv
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM70 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM70)"
        /*0000*/                   MOV R1, c[0x0][0x28] ;                     /* 0x00000a0000017a02 */
                                                                              /* 0x000fc60000000f00 */
        /*0010*/              @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;              /* 0x000000fffffff389 */
                                                                              /* 0x000fe200000e00ff */
        /*0020*/                   EXIT ;                                     /* 0x000000000000794d */
                                                                              /* 0x000fea0003800000 */
        /*0030*/                   BRA 0x30;                                  /* 0xfffffff000007947 */
                                                                              /* 0x000fc0000383ffff */
        /*0040*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0050*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0060*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0070*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
		..........


		Function : _Z22copy_kernel_vectorizedIN7cutlass6half_tES1_N4cute6LayoutINS2_5tupleIJNS2_1CILi8EEENS5_ILi32EEEEEENS4_IJS7_NS5_ILi1EEEEEEEENS3_INS4_IJS9_S9_EEENS4_IJNS5_ILi0EEESD_EEEEEEvPKT_PKT0_T1_T2_
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM70 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM70)"
        /*0000*/                   IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;    /* 0x00000a00ff017624 */
                                                                              /* 0x000fc400078e00ff */
        /*0010*/              @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;              /* 0x000000fffffff389 */
                                                                              /* 0x000fe200000e00ff */
        /*0020*/                   S2R R0, SR_TID.X ;                         /* 0x0000000000007919 */
                                                                              /* 0x000e220000002100 */
        /*0030*/                   IMAD.MOV.U32 R3, RZ, RZ, 0x2 ;             /* 0x00000002ff037424 */
                                                                              /* 0x000fe200078e00ff */
        /*0040*/                   MOV R4, RZ ;                               /* 0x000000ff00047202 */
                                                                              /* 0x000fc60000000f00 */
        /*0050*/                   IMAD.WIDE.U32 R2, R0, R3, c[0x0][0x160] ;  /* 0x0000580000027625 */
                                                                              /* 0x001fc800078e0003 */
        /*0060*/                   IMAD.SHL.U32 R0, R0, 0x2, RZ ;             /* 0x0000000200007824 */
                                                                              /* 0x000fe200078e00ff */
        /*0070*/                   IADD3 R2, P0, R2, -0x200, RZ ;             /* 0xfffffe0002027810 */
                                                                              /* 0x000fc80007f1e0ff */
        /*0080*/                   IADD3.X R3, R3, -0x1, RZ, P0, !PT ;        /* 0xffffffff03037810 */
                                                                              /* 0x000fc400007fe4ff */
        /*0090*/              @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;              /* 0x000000fffffff389 */
                                                                              /* 0x000fe200000e00ff */
        /*00a0*/                   IADD3 R4, R4, 0x800, RZ ;                  /* 0x0000080004047810 */
                                                                              /* 0x000fe40007ffe0ff */
        /*00b0*/                   IADD3 R2, P1, R2, 0x200000, RZ ;           /* 0x0020000002027810 */
                                                                              /* 0x000fe40007f3e0ff */
        /*00c0*/                   ISETP.NE.AND P0, PT, R4, 0x1000, PT ;      /* 0x000010000400780c */
                                                                              /* 0x000fe40003f05270 */
        /*00d0*/                   IADD3.X R3, RZ, R3, RZ, P1, !PT ;          /* 0x00000003ff037210 */
                                                                              /* 0x000fd40000ffe4ff */
        /*00e0*/               @P0 BRA 0x90 ;                                 /* 0xffffffa000000947 */
                                                                              /* 0x000fea000383ffff */
        /*00f0*/                   LDG.E.U16.CONSTANT.SYS R5, [R2+-0x200] ;   /* 0xfffe000002057381 */
                                                                              /* 0x000ea800001e6500 */
        /*0100*/                   LDG.E.U16.CONSTANT.SYS R7, [R2] ;          /* 0x0000000002077381 */
                                                                              /* 0x000ee800001e6500 */
        /*0110*/                   STS.U16 [R0], R5 ;                         /* 0x0000000500007388 */
                                                                              /* 0x004fe80000000400 */
        /*0120*/                   STS.U16 [R0+0x200], R7 ;                   /* 0x0002000700007388 */
                                                                              /* 0x008fe20000000400 */
        /*0130*/                   EXIT ;                                     /* 0x000000000000794d */
                                                                              /* 0x000fea0003800000 */
        /*0140*/                   BRA 0x140;                                 /* 0xfffffff000007947 */
                                                                              /* 0x000fc0000383ffff */
        /*0150*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0160*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0170*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
		..........



Fatbin ptx code:
================
arch = sm_72
code version = [8,2]
host = linux
compile_size = 64bit
compressed

Fatbin elf code:
================
arch = sm_72
code version = [1,7]
host = linux
compile_size = 64bit

	code for sm_72
		Function : _ZN3cub45CUB_200200_700_720_750_800_860_870_890_900_NS11EmptyKernelIvEEvv
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM72 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM72)"
        /*0000*/                   MOV R1, c[0x0][0x28] ;                     /* 0x00000a0000017a02 */
                                                                              /* 0x000fc60000000f00 */
        /*0010*/              @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;              /* 0x000000fffffff389 */
                                                                              /* 0x000fe200000e00ff */
        /*0020*/                   EXIT ;                                     /* 0x000000000000794d */
                                                                              /* 0x000fea0003800000 */
        /*0030*/                   BRA 0x30;                                  /* 0xfffffff000007947 */
                                                                              /* 0x000fc0000383ffff */
        /*0040*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0050*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0060*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0070*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
		..........


		Function : _Z22copy_kernel_vectorizedIN7cutlass6half_tES1_N4cute6LayoutINS2_5tupleIJNS2_1CILi8EEENS5_ILi32EEEEEENS4_IJS7_NS5_ILi1EEEEEEEENS3_INS4_IJS9_S9_EEENS4_IJNS5_ILi0EEESD_EEEEEEvPKT_PKT0_T1_T2_
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM72 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM72)"
        /*0000*/                   IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;    /* 0x00000a00ff017624 */
                                                                              /* 0x000fc400078e00ff */
        /*0010*/              @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;              /* 0x000000fffffff389 */
                                                                              /* 0x000fe200000e00ff */
        /*0020*/                   S2R R0, SR_TID.X ;                         /* 0x0000000000007919 */
                                                                              /* 0x000e220000002100 */
        /*0030*/                   IMAD.MOV.U32 R3, RZ, RZ, 0x2 ;             /* 0x00000002ff037424 */
                                                                              /* 0x000fe200078e00ff */
        /*0040*/                   MOV R4, RZ ;                               /* 0x000000ff00047202 */
                                                                              /* 0x000fc60000000f00 */
        /*0050*/                   IMAD.WIDE.U32 R2, R0, R3, c[0x0][0x160] ;  /* 0x0000580000027625 */
                                                                              /* 0x001fc800078e0003 */
        /*0060*/                   IMAD.SHL.U32 R0, R0, 0x2, RZ ;             /* 0x0000000200007824 */
                                                                              /* 0x000fe200078e00ff */
        /*0070*/                   IADD3 R2, P0, R2, -0x200, RZ ;             /* 0xfffffe0002027810 */
                                                                              /* 0x000fc80007f1e0ff */
        /*0080*/                   IADD3.X R3, R3, -0x1, RZ, P0, !PT ;        /* 0xffffffff03037810 */
                                                                              /* 0x000fc400007fe4ff */
        /*0090*/              @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;              /* 0x000000fffffff389 */
                                                                              /* 0x000fe200000e00ff */
        /*00a0*/                   IADD3 R4, R4, 0x800, RZ ;                  /* 0x0000080004047810 */
                                                                              /* 0x000fe40007ffe0ff */
        /*00b0*/                   IADD3 R2, P1, R2, 0x200000, RZ ;           /* 0x0020000002027810 */
                                                                              /* 0x000fe40007f3e0ff */
        /*00c0*/                   ISETP.NE.AND P0, PT, R4, 0x1000, PT ;      /* 0x000010000400780c */
                                                                              /* 0x000fe40003f05270 */
        /*00d0*/                   IADD3.X R3, RZ, R3, RZ, P1, !PT ;          /* 0x00000003ff037210 */
                                                                              /* 0x000fd40000ffe4ff */
        /*00e0*/               @P0 BRA 0x90 ;                                 /* 0xffffffa000000947 */
                                                                              /* 0x000fea000383ffff */
        /*00f0*/                   LDG.E.U16.CONSTANT.SYS R5, [R2+-0x200] ;   /* 0xfffe000002057381 */
                                                                              /* 0x000ea800001e6500 */
        /*0100*/                   LDG.E.U16.CONSTANT.SYS R7, [R2] ;          /* 0x0000000002077381 */
                                                                              /* 0x000ee800001e6500 */
        /*0110*/                   STS.U16 [R0], R5 ;                         /* 0x0000000500007388 */
                                                                              /* 0x004fe80000000400 */
        /*0120*/                   STS.U16 [R0+0x200], R7 ;                   /* 0x0002000700007388 */
                                                                              /* 0x008fe20000000400 */
        /*0130*/                   EXIT ;                                     /* 0x000000000000794d */
                                                                              /* 0x000fea0003800000 */
        /*0140*/                   BRA 0x140;                                 /* 0xfffffff000007947 */
                                                                              /* 0x000fc0000383ffff */
        /*0150*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0160*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0170*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
		..........



Fatbin ptx code:
================
arch = sm_75
code version = [8,2]
host = linux
compile_size = 64bit
compressed

Fatbin elf code:
================
arch = sm_75
code version = [1,7]
host = linux
compile_size = 64bit

	code for sm_75
		Function : _ZN3cub45CUB_200200_700_720_750_800_860_870_890_900_NS11EmptyKernelIvEEvv
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM75 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM75)"
        /*0000*/                   MOV R1, c[0x0][0x28] ;                     /* 0x00000a0000017a02 */
                                                                              /* 0x000fc60000000f00 */
        /*0010*/                   EXIT ;                                     /* 0x000000000000794d */
                                                                              /* 0x000fea0003800000 */
        /*0020*/                   BRA 0x20;                                  /* 0xfffffff000007947 */
                                                                              /* 0x000fc0000383ffff */
        /*0030*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0040*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0050*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0060*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0070*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
		..........


		Function : _Z22copy_kernel_vectorizedIN7cutlass6half_tES1_N4cute6LayoutINS2_5tupleIJNS2_1CILi8EEENS5_ILi32EEEEEENS4_IJS7_NS5_ILi1EEEEEEEENS3_INS4_IJS9_S9_EEENS4_IJNS5_ILi0EEESD_EEEEEEvPKT_PKT0_T1_T2_
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM75 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM75)"
        /*0000*/                   IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;    /* 0x00000a00ff017624 */
                                                                              /* 0x000fc400078e00ff */
        /*0010*/                   S2R R0, SR_TID.X ;                         /* 0x0000000000007919 */
                                                                              /* 0x000e220000002100 */
        /*0020*/                   IMAD.MOV.U32 R3, RZ, RZ, 0x2 ;             /* 0x00000002ff037424 */
                                                                              /* 0x000fe200078e00ff */
        /*0030*/                   MOV R4, RZ ;                               /* 0x000000ff00047202 */
                                                                              /* 0x000fc60000000f00 */
        /*0040*/                   IMAD.WIDE.U32 R2, R0, R3, c[0x0][0x160] ;  /* 0x0000580000027625 */
                                                                              /* 0x001fc800078e0003 */
        /*0050*/                   IMAD.SHL.U32 R0, R0, 0x2, RZ ;             /* 0x0000000200007824 */
                                                                              /* 0x000fe200078e00ff */
        /*0060*/                   IADD3 R2, P0, R2, -0x200, RZ ;             /* 0xfffffe0002027810 */
                                                                              /* 0x000fc80007f1e0ff */
        /*0070*/                   IADD3.X R3, R3, -0x1, RZ, P0, !PT ;        /* 0xffffffff03037810 */
                                                                              /* 0x000fc400007fe4ff */
        /*0080*/                   IADD3 R4, R4, 0x800, RZ ;                  /* 0x0000080004047810 */
                                                                              /* 0x000fe40007ffe0ff */
        /*0090*/                   IADD3 R2, P1, R2, 0x200000, RZ ;           /* 0x0020000002027810 */
                                                                              /* 0x000fe40007f3e0ff */
        /*00a0*/                   ISETP.NE.AND P0, PT, R4, 0x1000, PT ;      /* 0x000010000400780c */
                                                                              /* 0x000fe40003f05270 */
        /*00b0*/                   IADD3.X R3, RZ, R3, RZ, P1, !PT ;          /* 0x00000003ff037210 */
                                                                              /* 0x000fd40000ffe4ff */
        /*00c0*/               @P0 BRA 0x80 ;                                 /* 0xffffffb000000947 */
                                                                              /* 0x000fea000383ffff */
        /*00d0*/                   LDG.E.U16.CONSTANT.SYS R5, [R2+-0x200] ;   /* 0xfffe000002057381 */
                                                                              /* 0x000ea800001e6500 */
        /*00e0*/                   LDG.E.U16.CONSTANT.SYS R7, [R2] ;          /* 0x0000000002077381 */
                                                                              /* 0x000ee800001e6500 */
        /*00f0*/                   STS.U16 [R0], R5 ;                         /* 0x0000000500007388 */
                                                                              /* 0x004fe80000000400 */
        /*0100*/                   STS.U16 [R0+0x200], R7 ;                   /* 0x0002000700007388 */
                                                                              /* 0x008fe20000000400 */
        /*0110*/                   EXIT ;                                     /* 0x000000000000794d */
                                                                              /* 0x000fea0003800000 */
        /*0120*/                   BRA 0x120;                                 /* 0xfffffff000007947 */
                                                                              /* 0x000fc0000383ffff */
        /*0130*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0140*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0150*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0160*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0170*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
		..........



Fatbin ptx code:
================
arch = sm_80
code version = [8,2]
host = linux
compile_size = 64bit
compressed

Fatbin elf code:
================
arch = sm_80
code version = [1,7]
host = linux
compile_size = 64bit

	code for sm_80
		Function : _ZN3cub45CUB_200200_700_720_750_800_860_870_890_900_NS11EmptyKernelIvEEvv
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM80 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM80)"
        /*0000*/                   MOV R1, c[0x0][0x28] ;                     /* 0x00000a0000017a02 */
                                                                              /* 0x000fc40000000f00 */
        /*0010*/                   EXIT ;                                     /* 0x000000000000794d */
                                                                              /* 0x000fea0003800000 */
        /*0020*/                   BRA 0x20;                                  /* 0xfffffff000007947 */
                                                                              /* 0x000fc0000383ffff */
        /*0030*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0040*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0050*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0060*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0070*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0080*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0090*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*00a0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*00b0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*00c0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*00d0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*00e0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*00f0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
		..........


		Function : _Z22copy_kernel_vectorizedIN7cutlass6half_tES1_N4cute6LayoutINS2_5tupleIJNS2_1CILi8EEENS5_ILi32EEEEEENS4_IJS7_NS5_ILi1EEEEEEEENS3_INS4_IJS9_S9_EEENS4_IJNS5_ILi0EEESD_EEEEEEvPKT_PKT0_T1_T2_
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM80 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM80)"
        /*0000*/                   IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;    /* 0x00000a00ff017624 */
                                                                              /* 0x000fc400078e00ff */
        /*0010*/                   S2R R0, SR_TID.X ;                         /* 0x0000000000007919 */
                                                                              /* 0x000e220000002100 */
        /*0020*/                   IMAD.MOV.U32 R3, RZ, RZ, 0x2 ;             /* 0x00000002ff037424 */
                                                                              /* 0x000fe200078e00ff */
        /*0030*/                   HFMA2.MMA R4, -RZ, RZ, 0, 0 ;              /* 0x00000000ff047435 */
                                                                              /* 0x000fe200000001ff */
        /*0040*/                   ULDC.64 UR4, c[0x0][0x118] ;               /* 0x0000460000047ab9 */
                                                                              /* 0x000fe40000000a00 */
        /*0050*/                   IMAD.WIDE.U32 R2, R0, R3, c[0x0][0x160] ;  /* 0x0000580000027625 */
                                                                              /* 0x001fc800078e0003 */
        /*0060*/                   IMAD.SHL.U32 R0, R0, 0x2, RZ ;             /* 0x0000000200007824 */
                                                                              /* 0x000fe200078e00ff */
        /*0070*/                   IADD3 R2, P0, R2, -0x200, RZ ;             /* 0xfffffe0002027810 */
                                                                              /* 0x000fc80007f1e0ff */
        /*0080*/                   IADD3.X R3, R3, -0x1, RZ, P0, !PT ;        /* 0xffffffff03037810 */
                                                                              /* 0x000fc400007fe4ff */
        /*0090*/                   IADD3 R4, R4, 0x800, RZ ;                  /* 0x0000080004047810 */
                                                                              /* 0x000fe40007ffe0ff */
        /*00a0*/                   IADD3 R2, P1, R2, 0x200000, RZ ;           /* 0x0020000002027810 */
                                                                              /* 0x000fe40007f3e0ff */
        /*00b0*/                   ISETP.NE.AND P0, PT, R4, 0x1000, PT ;      /* 0x000010000400780c */
                                                                              /* 0x000fe40003f05270 */
        /*00c0*/                   IADD3.X R3, RZ, R3, RZ, P1, !PT ;          /* 0x00000003ff037210 */
                                                                              /* 0x000fd60000ffe4ff */
        /*00d0*/               @P0 BRA 0x90 ;                                 /* 0xffffffb000000947 */
                                                                              /* 0x000fea000383ffff */
        /*00e0*/                   LDG.E.U16.CONSTANT R5, [R2.64+-0x200] ;    /* 0xfffe000402057981 */
                                                                              /* 0x000ea8000c1e9500 */
        /*00f0*/                   LDG.E.U16.CONSTANT R7, [R2.64] ;           /* 0x0000000402077981 */
                                                                              /* 0x000ee8000c1e9500 */
        /*0100*/                   STS.U16 [R0], R5 ;                         /* 0x0000000500007388 */
                                                                              /* 0x004fe80000000400 */
        /*0110*/                   STS.U16 [R0+0x200], R7 ;                   /* 0x0002000700007388 */
                                                                              /* 0x008fe20000000400 */
        /*0120*/                   EXIT ;                                     /* 0x000000000000794d */
                                                                              /* 0x000fea0003800000 */
        /*0130*/                   BRA 0x130;                                 /* 0xfffffff000007947 */
                                                                              /* 0x000fc0000383ffff */
        /*0140*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0150*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0160*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0170*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0180*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0190*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01a0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01b0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01c0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01d0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01e0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01f0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
		..........



Fatbin ptx code:
================
arch = sm_86
code version = [8,2]
host = linux
compile_size = 64bit
compressed

Fatbin elf code:
================
arch = sm_86
code version = [1,7]
host = linux
compile_size = 64bit

	code for sm_86
		Function : _ZN3cub45CUB_200200_700_720_750_800_860_870_890_900_NS11EmptyKernelIvEEvv
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
        /*0000*/                   MOV R1, c[0x0][0x28] ;                     /* 0x00000a0000017a02 */
                                                                              /* 0x000fc40000000f00 */
        /*0010*/                   EXIT ;                                     /* 0x000000000000794d */
                                                                              /* 0x000fea0003800000 */
        /*0020*/                   BRA 0x20;                                  /* 0xfffffff000007947 */
                                                                              /* 0x000fc0000383ffff */
        /*0030*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0040*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0050*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0060*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0070*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0080*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0090*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*00a0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*00b0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*00c0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*00d0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*00e0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*00f0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
		..........


		Function : _Z22copy_kernel_vectorizedIN7cutlass6half_tES1_N4cute6LayoutINS2_5tupleIJNS2_1CILi8EEENS5_ILi32EEEEEENS4_IJS7_NS5_ILi1EEEEEEEENS3_INS4_IJS9_S9_EEENS4_IJNS5_ILi0EEESD_EEEEEEvPKT_PKT0_T1_T2_
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM86 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM86)"
        /*0000*/                   IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;    /* 0x00000a00ff017624 */
                                                                              /* 0x000fc400078e00ff */
        /*0010*/                   S2R R0, SR_TID.X ;                         /* 0x0000000000007919 */
                                                                              /* 0x000e220000002100 */
        /*0020*/                   IMAD.MOV.U32 R3, RZ, RZ, 0x2 ;             /* 0x00000002ff037424 */
                                                                              /* 0x000fe200078e00ff */
        /*0030*/                   MOV R4, RZ ;                               /* 0x000000ff00047202 */
                                                                              /* 0x000fe20000000f00 */
        /*0040*/                   ULDC.64 UR4, c[0x0][0x118] ;               /* 0x0000460000047ab9 */
                                                                              /* 0x000fe40000000a00 */
        /*0050*/                   IMAD.WIDE.U32 R2, R0, R3, c[0x0][0x160] ;  /* 0x0000580000027625 */
                                                                              /* 0x001fc800078e0003 */
        /*0060*/                   IMAD.SHL.U32 R0, R0, 0x2, RZ ;             /* 0x0000000200007824 */
                                                                              /* 0x000fe200078e00ff */
        /*0070*/                   IADD3 R2, P0, R2, -0x200, RZ ;             /* 0xfffffe0002027810 */
                                                                              /* 0x000fc80007f1e0ff */
        /*0080*/                   IADD3.X R3, R3, -0x1, RZ, P0, !PT ;        /* 0xffffffff03037810 */
                                                                              /* 0x000fc400007fe4ff */
        /*0090*/                   IADD3 R4, R4, 0x800, RZ ;                  /* 0x0000080004047810 */
                                                                              /* 0x000fe40007ffe0ff */
        /*00a0*/                   IADD3 R2, P1, R2, 0x200000, RZ ;           /* 0x0020000002027810 */
                                                                              /* 0x000fe40007f3e0ff */
        /*00b0*/                   ISETP.NE.AND P0, PT, R4, 0x1000, PT ;      /* 0x000010000400780c */
                                                                              /* 0x000fe40003f05270 */
        /*00c0*/                   IADD3.X R3, RZ, R3, RZ, P1, !PT ;          /* 0x00000003ff037210 */
                                                                              /* 0x000fd60000ffe4ff */
        /*00d0*/               @P0 BRA 0x90 ;                                 /* 0xffffffb000000947 */
                                                                              /* 0x000fea000383ffff */
        /*00e0*/                   LDG.E.U16.CONSTANT R5, [R2.64+-0x200] ;    /* 0xfffe000402057981 */
                                                                              /* 0x000ea8000c1e9500 */
        /*00f0*/                   LDG.E.U16.CONSTANT R7, [R2.64] ;           /* 0x0000000402077981 */
                                                                              /* 0x000ee8000c1e9500 */
        /*0100*/                   STS.U16 [R0], R5 ;                         /* 0x0000000500007388 */
                                                                              /* 0x004fe80000000400 */
        /*0110*/                   STS.U16 [R0+0x200], R7 ;                   /* 0x0002000700007388 */
                                                                              /* 0x008fe20000000400 */
        /*0120*/                   EXIT ;                                     /* 0x000000000000794d */
                                                                              /* 0x000fea0003800000 */
        /*0130*/                   BRA 0x130;                                 /* 0xfffffff000007947 */
                                                                              /* 0x000fc0000383ffff */
        /*0140*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0150*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0160*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0170*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0180*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0190*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01a0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01b0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01c0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01d0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01e0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01f0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
		..........



Fatbin ptx code:
================
arch = sm_87
code version = [8,2]
host = linux
compile_size = 64bit
compressed

Fatbin elf code:
================
arch = sm_87
code version = [1,7]
host = linux
compile_size = 64bit

	code for sm_87
		Function : _ZN3cub45CUB_200200_700_720_750_800_860_870_890_900_NS11EmptyKernelIvEEvv
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM87 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM87)"
        /*0000*/                   ISETP.NE.U32.AND P0, PT, RZ, UR2, PT ;     /* 0x00000002ff007c0c */
                                                                              /* 0x000fda000bf05070 */
        /*0010*/               @P0 BRA 0x140 ;                                /* 0x0000012000000947 */
                                                                              /* 0x000fea0003800000 */
        /*0020*/                   BMOV.32 B0, 0xffffffff ;                   /* 0xffffffff00007956 */
                                                                              /* 0x000fe80000000000 */
        /*0030*/                   BMOV.32.CLEAR B1, B0 ;                     /* 0x0000000000017f55 */
                                                                              /* 0x000fe80000100000 */
        /*0040*/                   BMOV.32.CLEAR B2, B1 ;                     /* 0x0000000001027f55 */
                                                                              /* 0x000fe80000100000 */
        /*0050*/                   BMOV.32.CLEAR B3, B2 ;                     /* 0x0000000002037f55 */
                                                                              /* 0x000fe80000100000 */
        /*0060*/                   BMOV.32.CLEAR B4, B3 ;                     /* 0x0000000003047f55 */
                                                                              /* 0x000fe80000100000 */
        /*0070*/                   BMOV.32.CLEAR B5, B4 ;                     /* 0x0000000004057f55 */
                                                                              /* 0x000fe80000100000 */
        /*0080*/                   BMOV.32.CLEAR B6, B5 ;                     /* 0x0000000005067f55 */
                                                                              /* 0x000fe80000100000 */
        /*0090*/                   BMOV.32.CLEAR B7, B6 ;                     /* 0x0000000006077f55 */
                                                                              /* 0x000fe80000100000 */
        /*00a0*/                   BMOV.32.CLEAR B8, B7 ;                     /* 0x0000000007087f55 */
                                                                              /* 0x000fe80000100000 */
        /*00b0*/                   BMOV.32.CLEAR B9, B8 ;                     /* 0x0000000008097f55 */
                                                                              /* 0x000fe80000100000 */
        /*00c0*/                   BMOV.32.CLEAR B10, B9 ;                    /* 0x00000000090a7f55 */
                                                                              /* 0x000fe80000100000 */
        /*00d0*/                   BMOV.32.CLEAR B11, B10 ;                   /* 0x000000000a0b7f55 */
                                                                              /* 0x000fe80000100000 */
        /*00e0*/                   BMOV.32.CLEAR B12, B11 ;                   /* 0x000000000b0c7f55 */
                                                                              /* 0x000fe80000100000 */
        /*00f0*/                   BMOV.32.CLEAR B13, B12 ;                   /* 0x000000000c0d7f55 */
                                                                              /* 0x000fe80000100000 */
        /*0100*/                   BMOV.32.CLEAR B14, B13 ;                   /* 0x000000000d0e7f55 */
                                                                              /* 0x000fe80000100000 */
        /*0110*/                   BMOV.32.CLEAR B15, B14 ;                   /* 0x000000000e0f7f55 */
                                                                              /* 0x000fe80000100000 */
        /*0120*/                   BMOV.32 B15, 0x0 ;                         /* 0x000000000f007956 */
                                                                              /* 0x000fe80000000000 */
        /*0130*/                   UMOV UR2, 0x1 ;                            /* 0x0000000100027882 */
                                                                              /* 0x000fe40000000000 */
        /*0140*/                   MOV R1, c[0x0][0x28] ;                     /* 0x00000a0000017a02 */
                                                                              /* 0x000fc40000000f00 */
        /*0150*/                   EXIT ;                                     /* 0x000000000000794d */
                                                                              /* 0x000fea0003800000 */
        /*0160*/                   BRA 0x160;                                 /* 0xfffffff000007947 */
                                                                              /* 0x000fc0000383ffff */
        /*0170*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0180*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0190*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01a0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01b0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01c0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01d0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01e0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01f0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
		..........


		Function : _Z22copy_kernel_vectorizedIN7cutlass6half_tES1_N4cute6LayoutINS2_5tupleIJNS2_1CILi8EEENS5_ILi32EEEEEENS4_IJS7_NS5_ILi1EEEEEEEENS3_INS4_IJS9_S9_EEENS4_IJNS5_ILi0EEESD_EEEEEEvPKT_PKT0_T1_T2_
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM87 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM87)"
        /*0000*/                   ISETP.NE.U32.AND P0, PT, RZ, UR2, PT ;     /* 0x00000002ff007c0c */
                                                                              /* 0x000fda000bf05070 */
        /*0010*/               @P0 BRA 0x140 ;                                /* 0x0000012000000947 */
                                                                              /* 0x000fea0003800000 */
        /*0020*/                   BMOV.32 B0, 0xffffffff ;                   /* 0xffffffff00007956 */
                                                                              /* 0x000fe80000000000 */
        /*0030*/                   BMOV.32.CLEAR B1, B0 ;                     /* 0x0000000000017f55 */
                                                                              /* 0x000fe80000100000 */
        /*0040*/                   BMOV.32.CLEAR B2, B1 ;                     /* 0x0000000001027f55 */
                                                                              /* 0x000fe80000100000 */
        /*0050*/                   BMOV.32.CLEAR B3, B2 ;                     /* 0x0000000002037f55 */
                                                                              /* 0x000fe80000100000 */
        /*0060*/                   BMOV.32.CLEAR B4, B3 ;                     /* 0x0000000003047f55 */
                                                                              /* 0x000fe80000100000 */
        /*0070*/                   BMOV.32.CLEAR B5, B4 ;                     /* 0x0000000004057f55 */
                                                                              /* 0x000fe80000100000 */
        /*0080*/                   BMOV.32.CLEAR B6, B5 ;                     /* 0x0000000005067f55 */
                                                                              /* 0x000fe80000100000 */
        /*0090*/                   BMOV.32.CLEAR B7, B6 ;                     /* 0x0000000006077f55 */
                                                                              /* 0x000fe80000100000 */
        /*00a0*/                   BMOV.32.CLEAR B8, B7 ;                     /* 0x0000000007087f55 */
                                                                              /* 0x000fe80000100000 */
        /*00b0*/                   BMOV.32.CLEAR B9, B8 ;                     /* 0x0000000008097f55 */
                                                                              /* 0x000fe80000100000 */
        /*00c0*/                   BMOV.32.CLEAR B10, B9 ;                    /* 0x00000000090a7f55 */
                                                                              /* 0x000fe80000100000 */
        /*00d0*/                   BMOV.32.CLEAR B11, B10 ;                   /* 0x000000000a0b7f55 */
                                                                              /* 0x000fe80000100000 */
        /*00e0*/                   BMOV.32.CLEAR B12, B11 ;                   /* 0x000000000b0c7f55 */
                                                                              /* 0x000fe80000100000 */
        /*00f0*/                   BMOV.32.CLEAR B13, B12 ;                   /* 0x000000000c0d7f55 */
                                                                              /* 0x000fe80000100000 */
        /*0100*/                   BMOV.32.CLEAR B14, B13 ;                   /* 0x000000000d0e7f55 */
                                                                              /* 0x000fe80000100000 */
        /*0110*/                   BMOV.32.CLEAR B15, B14 ;                   /* 0x000000000e0f7f55 */
                                                                              /* 0x000fe80000100000 */
        /*0120*/                   BMOV.32 B15, 0x0 ;                         /* 0x000000000f007956 */
                                                                              /* 0x000fe80000000000 */
        /*0130*/                   UMOV UR2, 0x1 ;                            /* 0x0000000100027882 */
                                                                              /* 0x000fe40000000000 */
        /*0140*/                   IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;    /* 0x00000a00ff017624 */
                                                                              /* 0x000fc400078e00ff */
        /*0150*/                   S2R R0, SR_TID.X ;                         /* 0x0000000000007919 */
                                                                              /* 0x000e220000002100 */
        /*0160*/                   IMAD.MOV.U32 R3, RZ, RZ, 0x2 ;             /* 0x00000002ff037424 */
                                                                              /* 0x000fe200078e00ff */
        /*0170*/                   HFMA2.MMA R4, -RZ, RZ, 0, 0 ;              /* 0x00000000ff047435 */
                                                                              /* 0x000fe200000001ff */
        /*0180*/                   ULDC.64 UR4, c[0x0][0x118] ;               /* 0x0000460000047ab9 */
                                                                              /* 0x000fe40000000a00 */
        /*0190*/                   IMAD.WIDE.U32 R2, R0, R3, c[0x0][0x160] ;  /* 0x0000580000027625 */
                                                                              /* 0x001fc800078e0003 */
        /*01a0*/                   IMAD.SHL.U32 R0, R0, 0x2, RZ ;             /* 0x0000000200007824 */
                                                                              /* 0x000fe200078e00ff */
        /*01b0*/                   IADD3 R2, P0, R2, -0x200, RZ ;             /* 0xfffffe0002027810 */
                                                                              /* 0x000fc80007f1e0ff */
        /*01c0*/                   IADD3.X R3, R3, -0x1, RZ, P0, !PT ;        /* 0xffffffff03037810 */
                                                                              /* 0x000fc400007fe4ff */
        /*01d0*/                   IADD3 R4, R4, 0x800, RZ ;                  /* 0x0000080004047810 */
                                                                              /* 0x000fe40007ffe0ff */
        /*01e0*/                   IADD3 R2, P1, R2, 0x200000, RZ ;           /* 0x0020000002027810 */
                                                                              /* 0x000fe40007f3e0ff */
        /*01f0*/                   ISETP.NE.AND P0, PT, R4, 0x1000, PT ;      /* 0x000010000400780c */
                                                                              /* 0x000fe40003f05270 */
        /*0200*/                   IADD3.X R3, RZ, R3, RZ, P1, !PT ;          /* 0x00000003ff037210 */
                                                                              /* 0x000fd60000ffe4ff */
        /*0210*/               @P0 BRA 0x1d0 ;                                /* 0xffffffb000000947 */
                                                                              /* 0x000fea000383ffff */
        /*0220*/                   LDG.E.U16.CONSTANT R5, [R2.64+-0x200] ;    /* 0xfffe000402057981 */
                                                                              /* 0x000ea8000c1e9500 */
        /*0230*/                   LDG.E.U16.CONSTANT R7, [R2.64] ;           /* 0x0000000402077981 */
                                                                              /* 0x000ee8000c1e9500 */
        /*0240*/                   STS.U16 [R0], R5 ;                         /* 0x0000000500007388 */
                                                                              /* 0x004fe80000000400 */
        /*0250*/                   STS.U16 [R0+0x200], R7 ;                   /* 0x0002000700007388 */
                                                                              /* 0x008fe20000000400 */
        /*0260*/                   EXIT ;                                     /* 0x000000000000794d */
                                                                              /* 0x000fea0003800000 */
        /*0270*/                   BRA 0x270;                                 /* 0xfffffff000007947 */
                                                                              /* 0x000fc0000383ffff */
        /*0280*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0290*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*02a0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*02b0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*02c0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*02d0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*02e0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*02f0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
		..........



Fatbin ptx code:
================
arch = sm_89
code version = [8,2]
host = linux
compile_size = 64bit
compressed

Fatbin elf code:
================
arch = sm_89
code version = [1,7]
host = linux
compile_size = 64bit

	code for sm_89
		Function : _ZN3cub45CUB_200200_700_720_750_800_860_870_890_900_NS11EmptyKernelIvEEvv
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM89 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM89)"
        /*0000*/                   MOV R1, c[0x0][0x28] ;                     /* 0x00000a0000017a02 */
                                                                              /* 0x000fc40000000f00 */
        /*0010*/                   EXIT ;                                     /* 0x000000000000794d */
                                                                              /* 0x000fea0003800000 */
        /*0020*/                   BRA 0x20;                                  /* 0xfffffff000007947 */
                                                                              /* 0x000fc0000383ffff */
        /*0030*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0040*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0050*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0060*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0070*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0080*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0090*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*00a0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*00b0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*00c0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*00d0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*00e0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*00f0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
		..........


		Function : _Z22copy_kernel_vectorizedIN7cutlass6half_tES1_N4cute6LayoutINS2_5tupleIJNS2_1CILi8EEENS5_ILi32EEEEEENS4_IJS7_NS5_ILi1EEEEEEEENS3_INS4_IJS9_S9_EEENS4_IJNS5_ILi0EEESD_EEEEEEvPKT_PKT0_T1_T2_
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM89 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM89)"
        /*0000*/                   IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;    /* 0x00000a00ff017624 */
                                                                              /* 0x000fc400078e00ff */
        /*0010*/                   S2R R0, SR_TID.X ;                         /* 0x0000000000007919 */
                                                                              /* 0x000e220000002100 */
        /*0020*/                   IMAD.MOV.U32 R3, RZ, RZ, 0x2 ;             /* 0x00000002ff037424 */
                                                                              /* 0x000fe200078e00ff */
        /*0030*/                   MOV R4, RZ ;                               /* 0x000000ff00047202 */
                                                                              /* 0x000fe20000000f00 */
        /*0040*/                   ULDC.64 UR4, c[0x0][0x118] ;               /* 0x0000460000047ab9 */
                                                                              /* 0x000fe40000000a00 */
        /*0050*/                   IMAD.WIDE.U32 R2, R0, R3, c[0x0][0x160] ;  /* 0x0000580000027625 */
                                                                              /* 0x001fc800078e0003 */
        /*0060*/                   IMAD.SHL.U32 R0, R0, 0x2, RZ ;             /* 0x0000000200007824 */
                                                                              /* 0x000fe200078e00ff */
        /*0070*/                   IADD3 R2, P0, R2, -0x200, RZ ;             /* 0xfffffe0002027810 */
                                                                              /* 0x000fc80007f1e0ff */
        /*0080*/                   IADD3.X R3, R3, -0x1, RZ, P0, !PT ;        /* 0xffffffff03037810 */
                                                                              /* 0x000fc400007fe4ff */
        /*0090*/                   IADD3 R4, R4, 0x800, RZ ;                  /* 0x0000080004047810 */
                                                                              /* 0x000fe40007ffe0ff */
        /*00a0*/                   IADD3 R2, P1, R2, 0x200000, RZ ;           /* 0x0020000002027810 */
                                                                              /* 0x000fe40007f3e0ff */
        /*00b0*/                   ISETP.NE.AND P0, PT, R4, 0x1000, PT ;      /* 0x000010000400780c */
                                                                              /* 0x000fe40003f05270 */
        /*00c0*/                   IADD3.X R3, RZ, R3, RZ, P1, !PT ;          /* 0x00000003ff037210 */
                                                                              /* 0x000fd60000ffe4ff */
        /*00d0*/               @P0 BRA 0x90 ;                                 /* 0xffffffb000000947 */
                                                                              /* 0x000fea000383ffff */
        /*00e0*/                   LDG.E.U16.CONSTANT R5, [R2.64+-0x200] ;    /* 0xfffe000402057981 */
                                                                              /* 0x000ea8000c1e9500 */
        /*00f0*/                   LDG.E.U16.CONSTANT R7, [R2.64] ;           /* 0x0000000402077981 */
                                                                              /* 0x000ee8000c1e9500 */
        /*0100*/                   STS.U16 [R0], R5 ;                         /* 0x0000000500007388 */
                                                                              /* 0x004fe80000000400 */
        /*0110*/                   STS.U16 [R0+0x200], R7 ;                   /* 0x0002000700007388 */
                                                                              /* 0x008fe20000000400 */
        /*0120*/                   EXIT ;                                     /* 0x000000000000794d */
                                                                              /* 0x000fea0003800000 */
        /*0130*/                   BRA 0x130;                                 /* 0xfffffff000007947 */
                                                                              /* 0x000fc0000383ffff */
        /*0140*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0150*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0160*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0170*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0180*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*0190*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01a0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01b0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01c0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01d0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01e0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
        /*01f0*/                   NOP;                                       /* 0x0000000000007918 */
                                                                              /* 0x000fc00000000000 */
		..........



Fatbin ptx code:
================
arch = sm_90
code version = [8,2]
host = linux
compile_size = 64bit
compressed

Fatbin elf code:
================
arch = sm_90
code version = [1,7]
host = linux
compile_size = 64bit

	code for sm_90
		Function : _ZN3cub45CUB_200200_700_720_750_800_860_870_890_900_NS11EmptyKernelIvEEvv
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM90 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM90)"
        /*0000*/                   LDC R1, c[0x0][0x28] ;                            /* 0x00000a00ff017b82 */
                                                                                     /* 0x000fe20000000800 */
        /*0010*/                   EXIT ;                                            /* 0x000000000000794d */
                                                                                     /* 0x000fea0003800000 */
        /*0020*/                   BRA 0x20;                                         /* 0xfffffffc00fc7947 */
                                                                                     /* 0x000fc0000383ffff */
        /*0030*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*0040*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*0050*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*0060*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*0070*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*0080*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*0090*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*00a0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*00b0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*00c0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*00d0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*00e0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*00f0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
		..........


		Function : _Z22copy_kernel_vectorizedIN7cutlass6half_tES1_N4cute6LayoutINS2_5tupleIJNS2_1CILi8EEENS5_ILi32EEEEEENS4_IJS7_NS5_ILi1EEEEEEEENS3_INS4_IJS9_S9_EEENS4_IJNS5_ILi0EEESD_EEEEEEvPKT_PKT0_T1_T2_
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM90 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM90)"
        /*0000*/                   LDC R1, c[0x0][0x28] ;                            /* 0x00000a00ff017b82 */
                                                                                     /* 0x000e240000000800 */
        /*0010*/                   S2R R5, SR_TID.X ;                                /* 0x0000000000057919 */
                                                                                     /* 0x000e620000002100 */
        /*0020*/                   LDC.64 R2, c[0x0][0x210] ;                        /* 0x00008400ff027b82 */
                                                                                     /* 0x000e620000000a00 */
        /*0030*/                   UMOV UR4, 0x400 ;                                 /* 0x0000040000047882 */
                                                                                     /* 0x000fe20000000000 */
        /*0040*/                   IMAD.MOV.U32 R4, RZ, RZ, RZ ;                     /* 0x000000ffff047224 */
                                                                                     /* 0x000fcc00078e00ff */
        /*0050*/                   S2UR UR5, SR_CgaCtaId ;                           /* 0x00000000000579c3 */
                                                                                     /* 0x000ea20000008800 */
        /*0060*/                   IMAD.WIDE.U32 R2, R5, 0x2, R2 ;                   /* 0x0000000205027825 */
                                                                                     /* 0x002fe200078e0002 */
        /*0070*/                   UPRMT UR4, UR5, 0x654, UR4 ;                      /* 0x0000065405047896 */
                                                                                     /* 0x004fe40008000004 */
        /*0080*/                   IADD3 R2, P0, R2, -0x200, RZ ;                    /* 0xfffffe0002027810 */
                                                                                     /* 0x000fc80007f1e0ff */
        /*0090*/                   LEA R0, R5, UR4, 0x1 ;                            /* 0x0000000405007c11 */
                                                                                     /* 0x000fe2000f8e08ff */
        /*00a0*/                   ULDC.64 UR4, c[0x0][0x208] ;                      /* 0x0000820000047ab9 */
                                                                                     /* 0x000fe20000000a00 */
        /*00b0*/                   IADD3.X R3, R3, -0x1, RZ, P0, !PT ;               /* 0xffffffff03037810 */
                                                                                     /* 0x000fce00007fe4ff */
        /*00c0*/                   IADD3 R4, R4, 0x800, RZ ;                         /* 0x0000080004047810 */
                                                                                     /* 0x000fe40007ffe0ff */
        /*00d0*/                   IADD3 R2, P1, R2, 0x200000, RZ ;                  /* 0x0020000002027810 */
                                                                                     /* 0x000fe40007f3e0ff */
        /*00e0*/                   ISETP.NE.AND P0, PT, R4, 0x1000, PT ;             /* 0x000010000400780c */
                                                                                     /* 0x000fc60003f05270 */
        /*00f0*/                   IMAD.X R3, RZ, RZ, R3, P1 ;                       /* 0x000000ffff037224 */
                                                                                     /* 0x000fd400008e0603 */
        /*0100*/               @P0 BRA 0xc0 ;                                        /* 0xfffffffc00ec0947 */
                                                                                     /* 0x000fea000383ffff */
        /*0110*/                   LDG.E.U16.CONSTANT R5, desc[UR4][R2.64+-0x200] ;  /* 0xfffe000402057981 */
                                                                                     /* 0x000ea8000c1e9500 */
        /*0120*/                   LDG.E.U16.CONSTANT R7, desc[UR4][R2.64] ;         /* 0x0000000402077981 */
                                                                                     /* 0x000ee8000c1e9500 */
        /*0130*/                   STS.U16 [R0], R5 ;                                /* 0x0000000500007388 */
                                                                                     /* 0x004fe80000000400 */
        /*0140*/                   STS.U16 [R0+0x200], R7 ;                          /* 0x0002000700007388 */
                                                                                     /* 0x008fe20000000400 */
        /*0150*/                   EXIT ;                                            /* 0x000000000000794d */
                                                                                     /* 0x000fea0003800000 */
        /*0160*/                   BRA 0x160;                                        /* 0xfffffffc00fc7947 */
                                                                                     /* 0x000fc0000383ffff */
        /*0170*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*0180*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*0190*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*01a0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*01b0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*01c0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*01d0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*01e0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*01f0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
		..........



Fatbin ptx code:
================
arch = sm_90a
code version = [8,2]
host = linux
compile_size = 64bit
compressed

Fatbin elf code:
================
arch = sm_90a
code version = [1,7]
host = linux
compile_size = 64bit

	code for sm_90a
		Function : _ZN3cub45CUB_200200_700_720_750_800_860_870_890_900_NS11EmptyKernelIvEEvv
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_ACCELERATORS EF_CUDA_SM90 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM90)"
        /*0000*/                   LDC R1, c[0x0][0x28] ;                            /* 0x00000a00ff017b82 */
                                                                                     /* 0x000fe20000000800 */
        /*0010*/                   EXIT ;                                            /* 0x000000000000794d */
                                                                                     /* 0x000fea0003800000 */
        /*0020*/                   BRA 0x20;                                         /* 0xfffffffc00fc7947 */
                                                                                     /* 0x000fc0000383ffff */
        /*0030*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*0040*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*0050*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*0060*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*0070*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*0080*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*0090*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*00a0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*00b0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*00c0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*00d0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*00e0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*00f0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
		..........


		Function : _Z22copy_kernel_vectorizedIN7cutlass6half_tES1_N4cute6LayoutINS2_5tupleIJNS2_1CILi8EEENS5_ILi32EEEEEENS4_IJS7_NS5_ILi1EEEEEEEENS3_INS4_IJS9_S9_EEENS4_IJNS5_ILi0EEESD_EEEEEEvPKT_PKT0_T1_T2_
	.headerflags	@"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_ACCELERATORS EF_CUDA_SM90 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM90)"
        /*0000*/                   LDC R1, c[0x0][0x28] ;                            /* 0x00000a00ff017b82 */
                                                                                     /* 0x000e240000000800 */
        /*0010*/                   S2R R5, SR_TID.X ;                                /* 0x0000000000057919 */
                                                                                     /* 0x000e620000002100 */
        /*0020*/                   LDC.64 R2, c[0x0][0x210] ;                        /* 0x00008400ff027b82 */
                                                                                     /* 0x000e620000000a00 */
        /*0030*/                   UMOV UR4, 0x400 ;                                 /* 0x0000040000047882 */
                                                                                     /* 0x000fe20000000000 */
        /*0040*/                   IMAD.MOV.U32 R4, RZ, RZ, RZ ;                     /* 0x000000ffff047224 */
                                                                                     /* 0x000fcc00078e00ff */
        /*0050*/                   S2UR UR5, SR_CgaCtaId ;                           /* 0x00000000000579c3 */
                                                                                     /* 0x000ea20000008800 */
        /*0060*/                   IMAD.WIDE.U32 R2, R5, 0x2, R2 ;                   /* 0x0000000205027825 */
                                                                                     /* 0x002fe200078e0002 */
        /*0070*/                   UPRMT UR4, UR5, 0x654, UR4 ;                      /* 0x0000065405047896 */
                                                                                     /* 0x004fe40008000004 */
        /*0080*/                   IADD3 R2, P0, R2, -0x200, RZ ;                    /* 0xfffffe0002027810 */
                                                                                     /* 0x000fc80007f1e0ff */
        /*0090*/                   LEA R0, R5, UR4, 0x1 ;                            /* 0x0000000405007c11 */
                                                                                     /* 0x000fe2000f8e08ff */
        /*00a0*/                   ULDC.64 UR4, c[0x0][0x208] ;                      /* 0x0000820000047ab9 */
                                                                                     /* 0x000fe20000000a00 */
        /*00b0*/                   IADD3.X R3, R3, -0x1, RZ, P0, !PT ;               /* 0xffffffff03037810 */
                                                                                     /* 0x000fce00007fe4ff */
        /*00c0*/                   IADD3 R4, R4, 0x800, RZ ;                         /* 0x0000080004047810 */
                                                                                     /* 0x000fe40007ffe0ff */
        /*00d0*/                   IADD3 R2, P1, R2, 0x200000, RZ ;                  /* 0x0020000002027810 */
                                                                                     /* 0x000fe40007f3e0ff */
        /*00e0*/                   ISETP.NE.AND P0, PT, R4, 0x1000, PT ;             /* 0x000010000400780c */
                                                                                     /* 0x000fc60003f05270 */
        /*00f0*/                   IMAD.X R3, RZ, RZ, R3, P1 ;                       /* 0x000000ffff037224 */
                                                                                     /* 0x000fd400008e0603 */
        /*0100*/               @P0 BRA 0xc0 ;                                        /* 0xfffffffc00ec0947 */
                                                                                     /* 0x000fea000383ffff */
        /*0110*/                   LDG.E.U16.CONSTANT R5, desc[UR4][R2.64+-0x200] ;  /* 0xfffe000402057981 */
                                                                                     /* 0x000ea8000c1e9500 */
        /*0120*/                   LDG.E.U16.CONSTANT R7, desc[UR4][R2.64] ;         /* 0x0000000402077981 */
                                                                                     /* 0x000ee8000c1e9500 */
        /*0130*/                   STS.U16 [R0], R5 ;                                /* 0x0000000500007388 */
                                                                                     /* 0x004fe80000000400 */
        /*0140*/                   STS.U16 [R0+0x200], R7 ;                          /* 0x0002000700007388 */
                                                                                     /* 0x008fe20000000400 */
        /*0150*/                   EXIT ;                                            /* 0x000000000000794d */
                                                                                     /* 0x000fea0003800000 */
        /*0160*/                   BRA 0x160;                                        /* 0xfffffffc00fc7947 */
                                                                                     /* 0x000fc0000383ffff */
        /*0170*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*0180*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*0190*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*01a0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*01b0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*01c0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*01d0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*01e0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
        /*01f0*/                   NOP;                                              /* 0x0000000000007918 */
                                                                                     /* 0x000fc00000000000 */
		..........

I also compared the sass code of my code with excessive share memory allocation, and the sass code without excessive share memory allocation ( bank conflict debug - Diffchecker, but there doesn’t seems to have any difference.

Although I’ve noticed that there’re ...... at the end of sass output, so perhaps cuobjdump doesn’t print everything? Is there a way I could make cuobjdump dump all sass code? I’m printing the sass code this way

cuobjdump --dump-sass ./examples/cute/tutorial/tiled_copy

Robert_Crovella · October 19, 2024, 7:01pm

The ..... don’t mean that it is not dumping all the code. Typically at the end of a routine (__device__ or __global__) there will be an EXIT opcode, sometimes after that a branch-to-self, and then sometimes a few more NOP. All of your routines end that way, from what I can see. You’re seeing all the code. I’ve never heard of nor witnessed a case where cuobjdump did not show all the code associated with any entry points you define.

I certainly would not expect a larger-than-necessary shared memory allocation to have any important effect on compiler code generation. Imagining the compiler is doing something different doesn’t seem very plausible to me.

I feel like you have something like 3 or 4 separate data points now that all suggest that there is no bank conflict. There is one “contrary” datapoint, the metric.

sz16 · October 19, 2024, 7:36pm

Thanks Robert! That makes the most sense to me as well (i.e., no bank conflict). How should one interpret the “l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum” metric in this case? Should I simply ignore it?

Topic		Replies	Views
The increase of the shared memory size leads to the bankconflict (from 9 KB shared memory) Nsight Compute	5	599	July 14, 2023
Very strange share memory bank conflicts CUDA-MEMCHECK cuda	1	980	October 15, 2021
How to understand the bank conflict of shared_mem CUDA Programming and Performance	16	14160	November 19, 2025
Why there is random bank conflicts? CUDA-MEMCHECK cuda	2	1255	September 19, 2023
Shared memory bank conflicts and nsight metric CUDA Programming and Performance	15	6003	October 19, 2024
Very strange share memory bank conflicts CUDA Programming and Performance cuda	4	574	November 2, 2021
Read bank conflict from SASS or PTX files CUDA Programming and Performance cuda , nsight , profiling	5	794	December 31, 2022
Optimizing bank conflicts - problem with occupancy CUDA Programming and Performance	12	2386	April 22, 2010
Requesting clarification for Shared Memory Bank Conflicts and Shared memory access? CUDA Programming and Performance hw , cuda	11	4981	January 23, 2024
Avoiding shared memory bank conflicts CUDA Programming and Performance	3	3117	October 12, 2010

Allocating more share memory than needed resulted in bank conflict

Related topics