Why does performance deteriorate when inline PTX code written with the instruction pipeline in mind is used repeatedly?
Hello, I’m a student conducting a study using NVIDIA GPUs.
I rewrote the PTX emitted by the existing compiler with the instruction pipeline in mind and implemented it as inline PTX.
When this code is used repeatedly, performance degrades, so I am asking this question.
[Environment]
- RTX 3090 (Ampere architecture)
- Visual Studio 2022
- CUDA Runtime 12.1
- CUDA C language
An instruction can be divided into several stages. If the next instruction cannot start until the current instruction has been fully processed, then while one particular stage of an instruction is being processed, the hardware that handles the other stages sits idle. Pipelining improves speed by allowing different stages of successive instructions to be processed at the same time.
First of all, I confirmed that the code with the pipeline-aware scheduling was nearly three times faster than the PTX produced by the existing compiler.
However, I found that repeating (unrolling) it many times significantly reduces performance, and it eventually becomes slower than the compiler-generated code.
I wonder why this happens. (I could see that the compiler-generated code repeats the same pattern.)
Attached are the code implemented in plain CUDA C and the code implemented with inline PTX.
[CUDA C implementation]
{ h = (h + 0x428a2f98d728ae22 + w0_t); h = (h + ((((e) >> (14)) | ((e) << (64 - (14)))) ^ (((e) >> (18)) | ((e) << (64 - (18)))) ^ (((e) >> (41)) | ((e) << (64 - (41))))) + ((g) ^ ((e) & ((f) ^ (g)))));
d += h;
h = (h + (((((a) >> (28)) | ((a) << (64 - (28))))) ^ (((a) >> (34)) | ((a) << (64 - (34)))) ^ (((a) >> (39)) | ((a) << (64 - (39))))) + (((a) & (b)) | ((c) & ((a) ^ (b))))); }
[PTX extracted through Compiler]
"add.u64 rd0, h, 0x428a2f98d728ae22; \n\t"
"xor.b64 rd6, f, g; \n\t"
"shl.b64 lhs, e, 50; \n\t"
"shr.b64 rhs, e, 14; \n\t"
"add.u64 rd1, lhs, rhs; \n\t"
"shl.b64 lhs, e, 46; \n\t"
"shr.b64 rhs, e, 18; \n\t"
"add.u64 rd2, lhs, rhs; \n\t"
"and.b64 rd7, e, rd6; \n\t"
"shl.b64 lhs, e, 23; \n\t"
"shr.b64 rhs, e, 41; \n\t"
"add.u64 rd4, lhs, rhs; \n\t"
"xor.b64 rd3, rd1, rd2; \n\t"
"xor.b64 rd8, g, rd7; \n\t"
"add.u64 h, w0_t, rd0; \n\t"
"shl.b64 lhs, a, 36; \n\t"
"shr.b64 rhs, a, 28; \n\t"
"xor.b64 rd5, rd3, rd4; \n\t"
"add.u64 rd1, lhs, rhs; \n\t"
"shl.b64 lhs, a, 30; \n\t"
"shr.b64 rhs, a, 34; \n\t"
"add.u64 rd2, lhs, rhs; \n\t"
"add.u64 rd9, rd5, rd8; \n\t"
"shl.b64 lhs, a, 25; \n\t"
"shr.b64 rhs, a, 39; \n\t"
"xor.b64 rd3, rd1, rd2; \n\t"
"add.u64 rd4, lhs, rhs; \n\t"
"xor.b64 rd6, a, b; \n\t"
"and.b64 rd8, a, b; \n\t"
"add.u64 h, h, rd9; \n\t"
"xor.b64 rd5, rd3, rd4; \n\t"
"and.b64 rd7, c, rd6; \n\t"
"add.u64 d, d, h; \n\t"
"or.b64 rd9, rd7, rd8; \n\t"
"add.u64 rd10, rd5, rd9; \n\t"
"add.u64 h, h, rd10; \n\t"
[My inline ptx code considering pipeline]
"add.u64 rd0, h, 0x428a2f98d728ae22; \n\t"
"xor.b64 rd6, f, g; \n\t"
"shl.b64 lhs0, e, 50; \n\t"
"shr.b64 rhs0, e, 14; \n\t"
"add.u64 rd1, lhs0, rhs0; \n\t"
"shl.b64 lhs1, e, 46; \n\t"
"shr.b64 rhs1, e, 18; \n\t"
"add.u64 rd2, lhs1, rhs1; \n\t"
"and.b64 rd7, e, rd6; \n\t"
"shl.b64 lhs2, e, 23; \n\t"
"shr.b64 rhs2, e, 41; \n\t"
"add.u64 rd4, lhs2, rhs2; \n\t"
"xor.b64 rd3, rd1, rd2; \n\t"
"xor.b64 rd8, g, rd7; \n\t"
"add.u64 h, w0_t, rd0; \n\t"
"shl.b64 lhs3, a, 36; \n\t"
"shr.b64 rhs3, a, 28; \n\t"
"xor.b64 rd5, rd3, rd4; \n\t"
"add.u64 rd1, lhs3, rhs3; \n\t"
"shl.b64 lhs4, a, 30; \n\t"
"shr.b64 rhs4, a, 34; \n\t"
"add.u64 rd2, lhs4, rhs4; \n\t"
"add.u64 rd9, rd5, rd8; \n\t"
"shl.b64 lhs5, a, 25; \n\t"
"shr.b64 rhs5, a, 39; \n\t"
"xor.b64 rd3, rd1, rd2; \n\t"
"add.u64 rd4, lhs5, rhs5; \n\t"
"xor.b64 rd6, a, b; \n\t"
"and.b64 rd8, a, b; \n\t"
"add.u64 h, h, rd9; \n\t"
"xor.b64 rd5, rd3, rd4; \n\t"
"and.b64 rd7, c, rd6; \n\t"
"add.u64 d, d, h; \n\t"
"or.b64 rd9, rd7, rd8; \n\t"
"add.u64 rd10, rd5, rd9; \n\t"
"add.u64 h, h, rd10; \n\t"
[repeated CUDA C implementation]
{ h = (h + 0x428a2f98d728ae22 + w0_t);
h = (h + ((((e) >> (14)) | ((e) << (64 - (14)))) ^ (((e) >> (18)) | ((e) << (64 - (18)))) ^ (((e) >> (41)) | ((e) << (64 - (41))))) + ((g) ^ ((e) & ((f) ^ (g)))));
d += h;
h = (h + (((((a) >> (28)) | ((a) << (64 - (28))))) ^ (((a) >> (34)) | ((a) << (64 - (34)))) ^ (((a) >> (39)) | ((a) << (64 - (39))))) + (((a) & (b)) | ((c) & ((a) ^ (b))))); }
{ g = (g + 0x7137449123ef65cd + w1_t);
g = (g + ((((d) >> (14)) | ((d) << (64 - (14)))) ^ (((d) >> (18)) | ((d) << (64 - (18)))) ^ (((d) >> (41)) | ((d) << (64 - (41))))) + ((f) ^ ((d) & ((e) ^ (f)))));
c += g;
g = (g + (((((h) >> (28)) | ((h) << (64 - (28))))) ^ (((h) >> (34)) | ((h) << (64 - (34)))) ^ (((h) >> (39)) | ((h) << (64 - (39))))) + (((h) & (a)) | ((b) & ((h) ^ (a))))); }
{ f = (f + 0xb5c0fbcfec4d3b2f + w2_t);
f = (f + ((((c) >> (14)) | ((c) << (64 - (14)))) ^ (((c) >> (18)) | ((c) << (64 - (18)))) ^ (((c) >> (41)) | ((c) << (64 - (41))))) + ((e) ^ ((c) & ((d) ^ (e)))));
b += f;
f = (f + (((((g) >> (28)) | ((g) << (64 - (28))))) ^ (((g) >> (34)) | ((g) << (64 - (34)))) ^ (((g) >> (39)) | ((g) << (64 - (39))))) + (((g) & (h)) | ((a) & ((g) ^ (h))))); }
{ e = (e + 0xe9b5dba58189dbbc + w3_t);
e = (e + ((((b) >> (14)) | ((b) << (64 - (14)))) ^ (((b) >> (18)) | ((b) << (64 - (18)))) ^ (((b) >> (41)) | ((b) << (64 - (41))))) + ((d) ^ ((b) & ((c) ^ (d)))));
a += e;
e = (e + (((((f) >> (28)) | ((f) << (64 - (28))))) ^ (((f) >> (34)) | ((f) << (64 - (34)))) ^ (((f) >> (39)) | ((f) << (64 - (39))))) + (((f) & (g)) | ((h) & ((f) ^ (g))))); }
[repeated My inline ptx code considering pipeline]
"add.u64 rd0, h, 0x428a2f98d728ae22; \n\t"
"xor.b64 rd6, f, g; \n\t"
"shl.b64 lhs0, e, 50; \n\t"
"shr.b64 rhs0, e, 14; \n\t"
"add.u64 rd1, lhs0, rhs0; \n\t"
"shl.b64 lhs1, e, 46; \n\t"
"shr.b64 rhs1, e, 18; \n\t"
"add.u64 rd2, lhs1, rhs1; \n\t"
"and.b64 rd7, e, rd6; \n\t"
"shl.b64 lhs2, e, 23; \n\t"
"shr.b64 rhs2, e, 41; \n\t"
"add.u64 rd4, lhs2, rhs2; \n\t"
"xor.b64 rd3, rd1, rd2; \n\t"
"xor.b64 rd8, g, rd7; \n\t"
"add.u64 h, w0_t, rd0; \n\t"
"shl.b64 lhs3, a, 36; \n\t"
"shr.b64 rhs3, a, 28; \n\t"
"xor.b64 rd5, rd3, rd4; \n\t"
"add.u64 rd1, lhs3, rhs3; \n\t"
"shl.b64 lhs4, a, 30; \n\t"
"shr.b64 rhs4, a, 34; \n\t"
"add.u64 rd2, lhs4, rhs4; \n\t"
"add.u64 rd9, rd5, rd8; \n\t"
"shl.b64 lhs5, a, 25; \n\t"
"shr.b64 rhs5, a, 39; \n\t"
"xor.b64 rd3, rd1, rd2; \n\t"
"add.u64 rd4, lhs5, rhs5; \n\t"
"xor.b64 rd6, a, b; \n\t"
"and.b64 rd8, a, b; \n\t"
"add.u64 h, h, rd9; \n\t"
"xor.b64 rd5, rd3, rd4; \n\t"
"and.b64 rd7, c, rd6; \n\t"
"add.u64 d, d, h; \n\t"
"or.b64 rd9, rd7, rd8; \n\t"
"add.u64 rd10, rd5, rd9; \n\t"
"add.u64 h, h, rd10; \n\t"
"add.u64 rd20, g, 0x7137449123ef65cd; \n\t"
"xor.b64 rd26, e, f; \n\t"
"shl.b64 lhs10, d, 50; \n\t"
"shr.b64 rhs10, d, 14; \n\t"
"add.u64 rd21, lhs10, rhs10; \n\t"
"shl.b64 lhs11, d, 46; \n\t"
"shr.b64 rhs11, d, 18; \n\t"
"add.u64 rd22, lhs11, rhs11; \n\t"
"and.b64 rd27, d, rd26; \n\t"
"shl.b64 lhs12, d, 23; \n\t"
"shr.b64 rhs12, d, 41; \n\t"
"add.u64 rd24, lhs12, rhs12; \n\t"
"xor.b64 rd23, rd21, rd22; \n\t"
"xor.b64 rd28, f, rd27; \n\t"
"add.u64 g, w1_t, rd20; \n\t"
"shl.b64 lhs13, h, 36; \n\t"
"shr.b64 rhs13, h, 28; \n\t"
"xor.b64 rd25, rd23, rd24; \n\t"
"add.u64 rd21, lhs13, rhs13; \n\t"
"shl.b64 lhs14, h, 30; \n\t"
"shr.b64 rhs14, h, 34; \n\t"
"add.u64 rd22, lhs14, rhs14; \n\t"
"add.u64 rd29, rd25, rd28; \n\t"
"shl.b64 lhs15, h, 25; \n\t"
"shr.b64 rhs15, h, 39; \n\t"
"xor.b64 rd23, rd21, rd22; \n\t"
"add.u64 rd24, lhs15, rhs15; \n\t"
"xor.b64 rd26, h, a; \n\t"
"and.b64 rd28, h, a; \n\t"
"add.u64 g, g, rd29; \n\t"
"xor.b64 rd25, rd23, rd24; \n\t"
"and.b64 rd27, b, rd26; \n\t"
"add.u64 c, c, g; \n\t"
"or.b64 rd29, rd27, rd28; \n\t"
"add.u64 rd30, rd25, rd29; \n\t"
"add.u64 g, g, rd30; \n\t"
"add.u64 rd40, f, 0xb5c0fbcfec4d3b2f; \n\t"
"xor.b64 rd46, d, e; \n\t"
"shl.b64 lhs20, c, 50; \n\t"
"shr.b64 rhs20, c, 14; \n\t"
"add.u64 rd41, lhs20, rhs20; \n\t"
"shl.b64 lhs21, c, 46; \n\t"
"shr.b64 rhs21, c, 18; \n\t"
"add.u64 rd42, lhs21, rhs21; \n\t"
"and.b64 rd47, c, rd46; \n\t"
"shl.b64 lhs22, c, 23; \n\t"
"shr.b64 rhs22, c, 41; \n\t"
"add.u64 rd44, lhs22, rhs22; \n\t"
"xor.b64 rd43, rd41, rd42; \n\t"
"xor.b64 rd48, e, rd47; \n\t"
"add.u64 f, w2_t, rd40; \n\t"
"shl.b64 lhs23, g, 36; \n\t"
"shr.b64 rhs23, g, 28; \n\t"
"xor.b64 rd45, rd43, rd44; \n\t"
"add.u64 rd41, lhs23, rhs23; \n\t"
"shl.b64 lhs24, g, 30; \n\t"
"shr.b64 rhs24, g, 34; \n\t"
"add.u64 rd42, lhs24, rhs24; \n\t"
"add.u64 rd49, rd45, rd48; \n\t"
"shl.b64 lhs25, g, 25; \n\t"
"shr.b64 rhs25, g, 39; \n\t"
"xor.b64 rd43, rd41, rd42; \n\t"
"add.u64 rd44, lhs25, rhs25; \n\t"
"xor.b64 rd46, g, h; \n\t"
"and.b64 rd48, g, h; \n\t"
"add.u64 f, f, rd49; \n\t"
"xor.b64 rd45, rd43, rd44; \n\t"
"and.b64 rd47, a, rd46; \n\t"
"add.u64 b, b, f; \n\t"
"or.b64 rd49, rd47, rd48; \n\t"
"add.u64 rd50, rd45, rd49; \n\t"
"add.u64 f, f, rd50; \n\t"
"add.u64 rd60, e, 0xe9b5dba58189dbbc; \n\t"
"xor.b64 rd66, c, d; \n\t"
"shl.b64 lhs30, b, 50; \n\t"
"shr.b64 rhs30, b, 14; \n\t"
"add.u64 rd61, lhs30, rhs30; \n\t"
"shl.b64 lhs31, b, 46; \n\t"
"shr.b64 rhs31, b, 18; \n\t"
"add.u64 rd62, lhs31, rhs31; \n\t"
"and.b64 rd67, b, rd66; \n\t"
"shl.b64 lhs32, b, 23; \n\t"
"shr.b64 rhs32, b, 41; \n\t"
"add.u64 rd64, lhs32, rhs32; \n\t"
"xor.b64 rd63, rd61, rd62; \n\t"
"xor.b64 rd68, d, rd67; \n\t"
"add.u64 e, w3_t, rd60; \n\t"
"shl.b64 lhs33, f, 36; \n\t"
"shr.b64 rhs33, f, 28; \n\t"
"xor.b64 rd65, rd63, rd64; \n\t"
"add.u64 rd61, lhs33, rhs33; \n\t"
"shl.b64 lhs34, f, 30; \n\t"
"shr.b64 rhs34, f, 34; \n\t"
"add.u64 rd62, lhs34, rhs34; \n\t"
"add.u64 rd69, rd65, rd68; \n\t"
"shl.b64 lhs35, f, 25; \n\t"
"shr.b64 rhs35, f, 39; \n\t"
"xor.b64 rd63, rd61, rd62; \n\t"
"add.u64 rd64, lhs35, rhs35; \n\t"
"xor.b64 rd66, f, g; \n\t"
"and.b64 rd68, f, g; \n\t"
"add.u64 e, e, rd69; \n\t"
"xor.b64 rd65, rd63, rd64; \n\t"
"and.b64 rd67, h, rd66; \n\t"
"add.u64 a, a, e; \n\t"
"or.b64 rd69, rd67, rd68; \n\t"
"add.u64 rd70, rd65, rd69; \n\t"
"add.u64 e, e, rd70; \n\t"
For your information, the measured performance is the average over 130,000 runs of the operation.
In addition, when the experiment is repeated four times, the values left in the registers are different each time.
The metric I want to measure is the number of times the algorithm executes per second (number of outputs / seconds).
Please reply.
Thank you.