Hi, I recently learned about asynchronous copy memcpy_async
in CUDA, and I cannot understand how it executes asynchronously. For example, I have the following code which copies global memory to shared memory:
__global__ void memcpy_async_block(volatile float* global) {
const int s = 4096;
__shared__ volatile float smem[s];
auto group = cooperative_groups::this_thread_block();
cooperative_groups::memcpy_async(group, (float*)smem,
(float*)&global[s], sizeof(float) * s);
}
Compile it with nvcc async_copy.cu -gencode=arch=compute_90,code=\"sm_90,compute_90\"
and use cuobjdump
to view the assembly code, we have
Function : _Z18memcpy_async_blockPVf
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM90 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM90)"
/*0000*/ LDC R1, c[0x0][0x28] ; /* 0x00000a00ff017b82 */
/* 0x000e640000000800 */
/*0010*/ S2R R0, SR_TID.Z ; /* 0x0000000000007919 */
/* 0x000ea20000002300 */
/*0020*/ ULDC UR4, c[0x0][0x4] ; /* 0x0000010000047ab9 */
/* 0x000fe20000000800 */
/*0030*/ ULDC UR5, c[0x0][0x0] ; /* 0x0000000000057ab9 */
/* 0x000fe20000000800 */
/*0040*/ ULDC.64 UR6, c[0x0][0x210] ; /* 0x0000840000067ab9 */
/* 0x000fe20000000a00 */
/*0050*/ S2R R3, SR_TID.Y ; /* 0x0000000000037919 */
/* 0x000ea20000002200 */
/*0060*/ BSSY B0, 0x260 ; /* 0x000001f000007945 */
/* 0x000fe20003800000 */
/*0070*/ S2R R5, SR_TID.X ; /* 0x0000000000057919 */
/* 0x000ee20000002100 */
/*0080*/ IMAD R0, R0, UR4, R3 ; /* 0x0000000400007c24 */
/* 0x004fe2000f8e0203 */
/*0090*/ UIMAD UR4, UR4, UR5, URZ ; /* 0x00000005040472a4 */
/* 0x000fc6000f8e023f */
/*00a0*/ IMAD R0, R0, UR5, R5 ; /* 0x0000000500007c24 */
/* 0x008fca000f8e0205 */
/*00b0*/ ISETP.GT.U32.AND P0, PT, R0, 0xfff, PT ; /* 0x00000fff0000780c */
/* 0x000fda0003f04070 */
/*00c0*/ @P0 BRA 0x250 ; /* 0x0000000000600947 */
/* 0x000fea0003800000 */
/*00d0*/ S2R R5, SR_CgaCtaId ; /* 0x0000000000057919 */
/* 0x000ea20000008800 */
/*00e0*/ LDC R3, c[0x0][0x8] ; /* 0x00000200ff037b82 */
/* 0x000ee20000000800 */
/*00f0*/ MOV R2, 0x400 ; /* 0x0000040000027802 */
/* 0x000fe20000000f00 */
/*0100*/ IMAD.MOV.U32 R7, RZ, RZ, R0 ; /* 0x000000ffff077224 */
/* 0x000fe400078e0000 */
/*0110*/ IMAD.MOV.U32 R6, RZ, RZ, RZ ; /* 0x000000ffff067224 */
/* 0x000fc800078e00ff */
/*0120*/ LDC.64 R8, c[0x0][0x208] ; /* 0x00008200ff087b82 */
/* 0x000f220000000a00 */
/*0130*/ IMAD R0, R3, UR4, RZ ; /* 0x0000000403007c24 */
/* 0x008fe2000f8e02ff */
/*0140*/ PRMT R4, R5, 0x654, R2 ; /* 0x0000065405047816 */
/* 0x004fce0000000002 */
/*0150*/ LEA R2, P0, R7.reuse, UR6, 0x2 ; /* 0x0000000607027c11 */
/* 0x044fe2000f8010ff */
/*0160*/ IMAD R5, R7.reuse, 0x4, R4 ; /* 0x0000000407057824 */
/* 0x040fe200078e0204 */
/*0170*/ R2UR UR8, R8 ; /* 0x00000000080872ca */
/* 0x010fe400000e0000 */
/*0180*/ LEA.HI.X R3, R7, UR7, R6.reuse, 0x2, P0 ; /* 0x0000000707037c11 */
/* 0x100fe400080f1406 */
/*0190*/ IADD3 R7, P0, R0, R7, RZ ; /* 0x0000000700077210 */
/* 0x000fe40007f1e0ff */
/*01a0*/ R2UR UR9, R9 ; /* 0x00000000090972ca */
/* 0x000fe400000e0000 */
/*01b0*/ IADD3 R2, P1, R2, 0x4000, RZ ; /* 0x0000400002027810 */
/* 0x000fe20007f3e0ff */
/*01c0*/ IMAD.X R6, RZ, RZ, R6, P0 ; /* 0x000000ffff067224 */
/* 0x000fe200000e0606 */
/*01d0*/ ISETP.GE.U32.AND P0, PT, R7, 0x1000, PT ; /* 0x000010000700780c */
/* 0x000fc60003f06070 */
/*01e0*/ IMAD.X R3, RZ, RZ, R3, P1 ; /* 0x000000ffff037224 */
/* 0x000fe200008e0603 */
/*01f0*/ ISETP.GE.U32.AND.EX P0, PT, R6, RZ, PT, P0 ; /* 0x000000ff0600720c */
/* 0x000fca0003f06100 */
/*0200*/ @!PT LDS RZ, [RZ] ; /* 0x00000000fffff984 */
/* 0x000fe20000000800 */
/*0210*/ @!PT LDS RZ, [RZ] ; /* 0x00000000fffff984 */
/* 0x000fe20000000800 */
/*0220*/ @!PT LDS RZ, [RZ] ; /* 0x00000000fffff984 */
/* 0x000fe20000000800 */
/*0230*/ LDGSTS.E [R5], desc[UR8][R2.64] ; /* 0x0000000002057fae */
/* 0x0005f0000b921848 */
/*0240*/ @!P0 BRA 0x150 ; /* 0xfffffffc00c08947 */
/* 0x000fea000383ffff */
/*0250*/ BSYNC B0 ; /* 0x0000000000007941 */
/* 0x000fea0003800000 */
/*0260*/ LDGDEPBAR ; /* 0x00000000000079af */
/* 0x000e220000000000 */
/*0270*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
/*0280*/ BRA 0x280; /* 0xfffffffc00fc7947 */
/* 0x000fc0000383ffff */
/*0290*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02a0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02b0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02c0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02d0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02e0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*02f0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0300*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0310*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0320*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0330*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0340*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0350*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0360*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0370*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
..........
I don’t know much about the SASS code, but it seems that the program loops the LDGSTS instruction. And we know that the execution of instructions is in order because there is only one instruction pointer, so the copy in the main program actually blocks the thread.
I want to know that how the asynchronization happens? In my imagination there should be a hardware that accepts an instruction and executes the whole copy process in parallel with the other pipelines (e.g. FMA pipeline), to achieve the asynchronization.