How to understand the sass code on RTX3080GPU and cuda11.8

the source code is :

#include <cuda_runtime.h>

#include "cuda_helper/cuda_helper.h"
#include "gtest/gtest.h"
template <typename T>
__device__ T test_func(const T& lh) {
  return lh * lh + lh - lh / 2;
}

template <typename T, uint32_t task_num_per_thread = 1>
__global__ void ld_case1(T* __restrict__ out, const T* __restrict__ in) {
  uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;

  T tmp[task_num_per_thread];
  for (uint32_t t = tid * task_num_per_thread, i = 0;
       t < (tid + 1) * task_num_per_thread; ++t, ++i) {
    tmp[i] = in[t];
    tmp[i] = test_func(tmp[i]);
  }

  for (uint32_t t = tid * task_num_per_thread, i = 0;
       t < (tid + 1) * task_num_per_thread; ++t, ++i) {
    out[t] = tmp[i];
  }
}

TEST(CudaPerf, Load) {
  constexpr uint32_t task_num_per_thread = 16;
  using T = uint32_t;
  constexpr uint32_t block_dim = 128;
  constexpr uint32_t num_block = 40;
  cuda_helper::CudaHostMem<T> h_in(block_dim * num_block * task_num_per_thread);
  cuda_helper::CudaDeviceMem<T> d_in(block_dim * num_block *
                                     task_num_per_thread);
  cuda_helper::CudaDeviceMem<T> d_out(block_dim * num_block *
                                      task_num_per_thread);
  cuda_helper::CudaHostMem<T> h_out(block_dim * num_block *
                                    task_num_per_thread);

  for (size_t i = 0; i < task_num_per_thread * block_dim * num_block; ++i) {
    h_in.HPtr()[i] = rand() % 193;
  }

  cudaMemcpy(d_in.Ptr(), h_in.HPtr(), d_in.SizeInBytes(),
             cudaMemcpyHostToDevice);
  ld_case1<T, task_num_per_thread>
      <<<num_block, block_dim>>>(d_out.Ptr(), d_in.Ptr());
  cudaMemcpy(h_out.HPtr(), d_out.Ptr(), d_out.SizeInBytes(),
             cudaMemcpyDeviceToHost);
}

the sass code is:

It seems that the 43rd line of code instruction waits for all memory writes to be completed before executing
why? R19 is not the last data read.

L43 is waiting for at least L23 which is loading R19. The posted MVP is not a trivial compile and the screenshot does not have sufficient information. It is possible it is waiting for operations on the previous loop.

		ld_case1
1	00007fc0 26faf700	      IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] 
2	00007fc0 26faf710	      S2R R0, SR_CTAID.X 
3	00007fc0 26faf720	      ULDC.64 UR4, c[0x0][0x118] 
4	00007fc0 26faf730	      BSSY B0, 0x7fc026fafc00 
5	00007fc0 26faf740	      S2R R3, SR_TID.X 
6	00007fc0 26faf750	      IMAD R0, R0, c[0x0][0x0], R3 
7	00007fc0 26faf760	      IMAD.SHL.U32 R0, R0, 0x10, RZ 
8	00007fc0 26faf770	      ISETP.NE.AND P1, PT, R0, -0x10, PT 
9	00007fc0 26faf780	@!P1  BRA 0x7fc026fafbf0 
10	00007fc0 26faf790	      MOV R3, 0x4 
11	00007fc0 26faf7a0	      IMAD.WIDE.U32 R2, R0, R3, c[0x0][0x168] 
12	00007fc0 26faf7b0	      LDG.E.CONSTANT R4, [R2.64] 
13	00007fc0 26faf7c0	      IADD3 R5, R0, 0x1, RZ 
14	00007fc0 26faf7d0	      IADD3 R6, R0, 0x10, RZ 
15	00007fc0 26faf7e0	      ISETP.GE.U32.AND P0, PT, R5, R6, PT 
16	00007fc0 26faf7f0	      IMAD R5, R4, R4, R4 
17	00007fc0 26faf800	      SHF.R.U32.HI R4, RZ, 0x1, R4 
18	00007fc0 26faf810	      IMAD.IADD R4, R5, 0x1, -R4 
19	00007fc0 26faf820	@P0   BRA 0x7fc026fafbf0 
20	00007fc0 26faf830	      LDG.E.CONSTANT R13, [R2.64+0x4] 
21	00007fc0 26faf840	      LDG.E.CONSTANT R15, [R2.64+0x8] 
22	00007fc0 26faf850	      LDG.E.CONSTANT R17, [R2.64+0xc] 
23	00007fc0 26faf860	      LDG.E.CONSTANT R19, [R2.64+0x10] 
24	00007fc0 26faf870	      LDG.E.CONSTANT R21, [R2.64+0x18] 
25	00007fc0 26faf880	      LDG.E.CONSTANT R22, [R2.64+0x1c] 
26	00007fc0 26faf890	      LDG.E.CONSTANT R20, [R2.64+0x14] 
27	00007fc0 26faf8a0	      LDG.E.CONSTANT R8, [R2.64+0x20] 
28	00007fc0 26faf8b0	      LDG.E.CONSTANT R10, [R2.64+0x28] 
29	00007fc0 26faf8c0	      LDG.E.CONSTANT R9, [R2.64+0x24] 
30	00007fc0 26faf8d0	      LDG.E.CONSTANT R6, [R2.64+0x30] 
31	00007fc0 26faf8e0	      LDG.E.CONSTANT R11, [R2.64+0x2c] 
32	00007fc0 26faf8f0	      LDG.E.CONSTANT R5, [R2.64+0x34] 
33	00007fc0 26faf900	      LDG.E.CONSTANT R7, [R2.64+0x38] 
34	00007fc0 26faf910	      LDG.E.CONSTANT R12, [R2.64+0x3c] 
35	00007fc0 26faf920	      IMAD R14, R13, R13, R13 
36	00007fc0 26faf930	      SHF.R.U32.HI R13, RZ, 0x1, R13 
37	00007fc0 26faf940	      IMAD R16, R15, R15, R15 
38	00007fc0 26faf950	      SHF.R.U32.HI R15, RZ, 0x1, R15 
39	00007fc0 26faf960	      IMAD R18, R17, R17, R17 
40	00007fc0 26faf970	      SHF.R.U32.HI R17, RZ, 0x1, R17 
41	00007fc0 26faf980	      IMAD.IADD R13, R14, 0x1, -R13 
42	00007fc0 26faf990	      IADD3 R14, R16, -R15, RZ 
43	00007fc0 26faf9a0	      IMAD R16, R19, R19, R19 
44	00007fc0 26faf9b0	      IMAD.IADD R15, R18, 0x1, -R17 
45	00007fc0 26faf9c0	      SHF.R.U32.HI R19, RZ, 0x1, R19 
46	00007fc0 26faf9d0	      IMAD R17, R21, R21, R21 
47	00007fc0 26faf9e0	      SHF.R.U32.HI R18, RZ, 0x1, R21 
48	00007fc0 26faf9f0	      IMAD R21, R22, R22, R22 
49	00007fc0 26fafa00	      SHF.R.U32.HI R22, RZ, 0x1, R22 
50	00007fc0 26fafa10	      IMAD R3, R20, R20, R20 
51	00007fc0 26fafa20	      SHF.R.U32.HI R20, RZ, 0x1, R20 
52	00007fc0 26fafa30	      IADD3 R2, R16, -R19, RZ 
53	00007fc0 26fafa40	      IADD3 R16, R17, -R18, RZ 
54	00007fc0 26fafa50	      IMAD.IADD R17, R21, 0x1, -R22 
55	00007fc0 26fafa60	      SHF.R.U32.HI R19, RZ, 0x1, R8 
56	00007fc0 26fafa70	      IMAD R18, R8, R8, R8 
57	00007fc0 26fafa80	      IMAD R21, R10, R10, R10 
58	00007fc0 26fafa90	      SHF.R.U32.HI R10, RZ, 0x1, R10 
59	00007fc0 26fafaa0	      IMAD.IADD R3, R3, 0x1, -R20 
60	00007fc0 26fafab0	      IMAD R20, R9, R9, R9 
61	00007fc0 26fafac0	      SHF.R.U32.HI R9, RZ, 0x1, R9 
62	00007fc0 26fafad0	      IADD3 R8, R18, -R19, RZ 
63	00007fc0 26fafae0	      IADD3 R21, R21, -R10, RZ 
64	00007fc0 26fafaf0	      IMAD R10, R6, R6, R6 
65	00007fc0 26fafb00	      SHF.R.U32.HI R19, RZ, 0x1, R6 
66	00007fc0 26fafb10	      IMAD R22, R11, R11, R11 
67	00007fc0 26fafb20	      SHF.R.U32.HI R11, RZ, 0x1, R11 
68	00007fc0 26fafb30	      IMAD R6, R5, R5, R5 
69	00007fc0 26fafb40	      SHF.R.U32.HI R5, RZ, 0x1, R5 
70	00007fc0 26fafb50	      IMAD.IADD R9, R20, 0x1, -R9 
71	00007fc0 26fafb60	      IMAD R18, R7, R7, R7 
72	00007fc0 26fafb70	      SHF.R.U32.HI R7, RZ, 0x1, R7 
73	00007fc0 26fafb80	      SHF.R.U32.HI R25, RZ, 0x1, R12 
74	00007fc0 26fafb90	      IMAD R20, R12, R12, R12 
75	00007fc0 26fafba0	      IADD3 R19, R10, -R19, RZ 
76	00007fc0 26fafbb0	      IMAD.IADD R11, R22, 0x1, -R11 
77	00007fc0 26fafbc0	      IADD3 R23, R18, -R7, RZ 
78	00007fc0 26fafbd0	      IMAD.IADD R5, R6, 0x1, -R5 
79	00007fc0 26fafbe0	      IMAD.IADD R25, R20, 0x1, -R25 
80	00007fc0 26fafbf0	      BSYNC B0 
81	00007fc0 26fafc00	@!P1  EXIT 
82	00007fc0 26fafc10	      IADD3 R6, R0, 0x1, RZ 
83	00007fc0 26fafc20	      IADD3 R27, R0, 0x10, RZ 
84	00007fc0 26fafc30	      MOV R7, 0x4 
85	00007fc0 26fafc40	      ISETP.GE.U32.AND P0, PT, R6, R27, PT 
86	00007fc0 26fafc50	      IMAD.WIDE.U32 R6, R0, R7, c[0x0][0x160] 
87	00007fc0 26fafc60	      STG.E [R6.64], R4 
88	00007fc0 26fafc70	@P0   EXIT 
89	00007fc0 26fafc80	      STG.E [R6.64+0x4], R13 
90	00007fc0 26fafc90	      STG.E [R6.64+0x8], R14 
91	00007fc0 26fafca0	      STG.E [R6.64+0xc], R15 
92	00007fc0 26fafcb0	      STG.E [R6.64+0x10], R2 
93	00007fc0 26fafcc0	      STG.E [R6.64+0x14], R3 
94	00007fc0 26fafcd0	      STG.E [R6.64+0x18], R16 
95	00007fc0 26fafce0	      STG.E [R6.64+0x1c], R17 
96	00007fc0 26fafcf0	      STG.E [R6.64+0x20], R8 
97	00007fc0 26fafd00	      STG.E [R6.64+0x24], R9 
98	00007fc0 26fafd10	      STG.E [R6.64+0x28], R21 
99	00007fc0 26fafd20	      STG.E [R6.64+0x2c], R11 
100	00007fc0 26fafd30	      STG.E [R6.64+0x30], R19 
101	00007fc0 26fafd40	      STG.E [R6.64+0x34], R5 
102	00007fc0 26fafd50	      STG.E [R6.64+0x38], R23 
103	00007fc0 26fafd60	      STG.E [R6.64+0x3c], R25 
104	00007fc0 26fafd70	      EXIT 
105	00007fc0 26fafd80	      BRA 0x7fc026fafd80
106	00007fc0 26fafd90	      NOP
107	00007fc0 26fafda0	      NOP
108	00007fc0 26fafdb0	      NOP
109	00007fc0 26fafdc0	      NOP
110	00007fc0 26fafdd0	      NOP
111	00007fc0 26fafde0	      NOP
112	00007fc0 26fafdf0	      NOP
113	00007fc0 26fafe00	      NOP
114	00007fc0 26fafe10	      NOP
115	00007fc0 26fafe20	      NOP
116	00007fc0 26fafe30	      NOP
117	00007fc0 26fafe40	      NOP
118	00007fc0 26fafe50	      NOP
119	00007fc0 26fafe60	      NOP
120	00007fc0 26fafe70	      NOP

it is complete code, I don’t think it is previous loop. I think L43 should’t wait for so long time. Why don’t R13, or R15 wait for so long time?

What operation does this instruction represent? Why do shl and IMAD appear simultaneously?

Integer Multiply-And-Accumulate is often used for calculating indices. If the data structures or dimensions have a size, which is a multiple of 2, then a shift can be used. The ALU offers to use those operations at the same time as the hardware is chained and can be operated at the same time.

In this case it probably means

R0 := (R0 << 0x4) + RZ;

with RZ == 0;

or

R0 := R0 * 0x10;

Edit: @njuffa is right I think that with this instruction not both shift and multiplication are done, but the multiplication is used as an alternative way to do shifts, so it is a pseudo-instruction: It probably has the same opcode as the actual IMAD multiplication with the difference that it uses a power of 2 as factor.

Unfortunately, the SASS disassembly is hardware-centric. From looking at quite a bit of code, I am 99% certain that IMAD.SHL.U32 means this is a left shift of an unsigned 32-bit integer that is accomplished by using the integer multiply-add unit, as opposed to using the funnel shifter. Obviously multiplication by a power of two is equivalent to a left shift.

If you look at more SASS code, you will find that the compiler attempts some load balancing between functional units, and consequently you may also encounter things like IMAD.MOV for a register-to-register transfer, IMAD.IADD for an integer addition that does not use the three-input IADD3, etc.

One might now ask, why use IMAD.SHL instead of just a regular IMAD to affect the left shift? My guess is that this activates an energy-saving feature: multipliers are usually the most power-hungry general-purpose computational units.

Your response is very clear, thank you so much! 😊