Hello!
During some tests, I found what appears to be a mismatch between the PTX specification and the code that ptxas generates for video instructions that use the ‘optional merge’ secondary operation.
In the PTX specification, optMerge is described as:
.s33 optMerge( Modifier dsel, .s33 tmp, .s33 c ) {
switch ( dsel ) {
case .h0: return (tmp & 0xffff) | (0xffff0000 & c);
case .h1: return ((tmp & 0xffff) << 16) | (0x0000ffff & c);
case .b0: return (tmp & 0xff) | (0xffffff00 & c);
case .b1: return ((tmp & 0xff) << 8) | (0xffff00ff & c);
case .b2: return ((tmp & 0xff) << 16) | (0xff00ffff & c);
case .b3: return ((tmp & 0xff) << 24) | (0x00ffffff & c);
default: return tmp;
}
}
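However, for the .h1 case, ptxas seems to generate the result as (tmp & 0xffff0000) | (0x0000ffff & c). In other words, it keeps the upper half of tmp in place instead of shifting the lower half of tmp into the upper 16 bits.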
You can check this on Godbolt (Compiler Explorer).
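The last instruction, PRMT R0, R0, 0x3254, R1, should according to the specification be PRMT R0, R0, 0x1054, R1: selector 0x3254 takes bytes 2 and 3 of R0 for the upper half of the result, whereas 0x1054 takes bytes 0 and 1 of R0, which is what ((tmp & 0xffff) << 16) requires.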