How to tell nvcc that some `if` must diverge and stop trying to fuse previous statements into it?

I’m trying to implement a “neighbor-thread-transpose” utility function, which takes T registers[N] from each thread, essentially forming [blockDim/N][N][N] total registers within a block, and transposes the last two axes (i.e., exchanges values among each group of N neighboring threads).
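
Concretely, for N = 4 the intended mapping is (illustration only, within each aligned group of 4 lanes):

// With y = threadIdx.y % 4 identifying a lane within its group of 4:
//
//   after:  lane y, registers[k]   ==   before: lane k, registers[y]
//
// e.g. lane 1's registers[3] ends up holding what lane 3's registers[1] held.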

Here is my first implementation:

template<int N, class T>__device__ __forceinline__ void shf(T (&registers)[N]) {
	for (int j = 1; j < N; j++) {
		registers[threadIdx.y % N ^ j] = __shfl_xor_sync(0xffffffff, registers[threadIdx.y % N], j);
	}
}

I found the resulting code uses local storage “for no reason”. After some thinking, I realized that this is necessary because registers aren’t addressable, so the compiler must use some kind of addressable space to dynamically select a register.
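
A minimal illustration of that effect (hypothetical helper, not from my actual kernel): any runtime-variable index into what would otherwise stay a register array typically forces the array into addressable local memory.

__device__ int pick(const int* __restrict__ src, int idx) {
	int r[4];
	for (int k = 0; k < 4; k++)	// statically indexed: r[] can stay in registers
		r[k] = src[k];
	return r[idx & 3];		// dynamically indexed: r[] typically spills to local memory
}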

Since N is usually small, my second plan is something like this:

template<int N, class T>__device__ __forceinline__ void shf(T (&registers)[N]) {
	for (int i = 0; i < N; i++) {
		if (i == threadIdx.y % N) {
			for (int j = 1; j < N; j++) {
				registers[i ^ j] = __shfl_xor_sync(0xffffffff, registers[i], j);
			}
			break;
		}
	}
}

In theory this should work. Each thread does a 128-bit vector load before transposing, and each group of four neighboring threads exchanges its 32-bit components. But I found some “weird” 32-bit loads in the generated assembly, even though I wrote none. After some thinking, I believe the compiler is trying to be “smart”: when e.g. i==0, registers[1…3] will never be read and will be overwritten by the transpose, so the compiler inlines the preceding vector load into each special case.

But since all branches must be present by design, the resulting kernel ends up loading the same address multiple times (with different sizes), which should be avoided.

Any ideas how I can stop this automatic “fusing” optimization? Or is there a better way to write this kind of neighbor-thread transpose without relying on shared/global/local memory or linear-time branching?

Yes, I’m looking at the Nsight benchmark results, which include the disassembly.

Good idea, I’ll try testing that. I’ve just written some manual asm and I’ll compare it with the compiler-optimized version.

But I found some “weird” 32-bit loads in the generated assembly, even though I wrote none.

You are looking at SASS (machine code), not PTX, correct? Since ptxas is an optimizing compiler, looking at PTX should be a rare occurrence.

Without seeing the entire context, it is impossible to determine whether this analysis is correct. But from what I have seen in terms of generated code, this is most likely what is happening. Memory traffic is expensive and vector loads increase register pressure, so the compiler tries to omit unnecessary loads. Programmers who use float4 to operate on what is essentially float3 data have noticed the anticipated 128-bit loads disappear for the same reason.
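
A sketch of the float4-as-float3 pattern meant here (hypothetical code, not from the question): the load is written as a vector, but one component is dead, so the compiler may narrow or split the anticipated 128-bit load.

__device__ float sum3(const float4* __restrict__ p) {
	float4 v = p[threadIdx.x];	// written as one 128-bit load
	return v.x + v.y + v.z;		// .w is unused, so the load may be split
}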

You might want to try a quick experiment by declaring the data as volatile uint4 to see whether that induces the compiler to generate a vector load. Based purely on standard C++ semantics, I doubt that will lead to different SASS. Even if it does give you the desired vector load, you may well find that the resulting code executes no faster than what you are getting now. My experience with the CUDA compiler in recent years has been that “the compiler knows best” the vast majority of the time.

A more fruitful line of exploration might be to re-examine whether your use case actually needs this “neighbor-thread-transpose” idiom, or whether there are alternative ways to go about whatever it is you need to compute.

I finished testing. Here’s my test code:

#include <cuda_fp16.h>

template<bool ASM, int N, class T>
__device__ __forceinline__ void shf(T (&_)[N]) {
	if constexpr(ASM) {
		asm("{.reg .u32 %ry;"
		    ".reg .pred %py;"
		    "mov.u32 %ry,%tid.y;"
		    "and.b32 %ry,%ry,3;"
		    "setp.ne.u32 %py,%ry,0;"
		    "@%py bra L1;"
		    "shfl.sync.bfly.b32 %1,%0,1,31,-1;"
		    "shfl.sync.bfly.b32 %2,%0,2,31,-1;"
		    "shfl.sync.bfly.b32 %3,%0,3,31,-1;"
		    "bra L0;"
		    "L1:"
		    "setp.ne.u32 %py,%ry,1;"
		    "@%py bra L2;"
		    "shfl.sync.bfly.b32 %0,%1,1,31,-1;"
		    "shfl.sync.bfly.b32 %3,%1,2,31,-1;"
		    "shfl.sync.bfly.b32 %2,%1,3,31,-1;"
		    "bra L0;"
		    "L2:"
		    "setp.ne.u32 %py,%ry,2;"
		    "@%py bra L3;"
		    "shfl.sync.bfly.b32 %3,%2,1,31,-1;"
		    "shfl.sync.bfly.b32 %0,%2,2,31,-1;"
		    "shfl.sync.bfly.b32 %1,%2,3,31,-1;"
		    "bra L0;"
		    "L3:"
		    "shfl.sync.bfly.b32 %2,%3,1,31,-1;"
		    "shfl.sync.bfly.b32 %1,%3,2,31,-1;"
		    "shfl.sync.bfly.b32 %0,%3,3,31,-1;"
		    "L0:}":"+r"(_[0]), "+r"(_[1]), "+r"(_[2]), "+r"(_[3]):);
	} else {
		for (int i = 0; i < N; i++) {
			if (i == threadIdx.y % N) {
				for (int j = 1; j < N; j++) {
					_[i ^ j] = __shfl_xor_sync(0xffffffff, _[i], j);
				}
				break;
			}
		}
	}
}

template<bool ASM>
struct e2828 {
	int _[4];
	__device__ __forceinline__ void ld(const half* __restrict__ A, int Y) {
		*(int4*)_ = ((int4*)A)[(threadIdx.y << 3 & 16 | threadIdx.y >> 1 & 14) * Y | threadIdx.y & 1];
		shf<ASM>(_);
	}
	__device__ __forceinline__ void st(half* __restrict__ A, int Y) {
		shf<ASM>(_);
		((int4*)A)[(threadIdx.y << 3 & 16 | threadIdx.y >> 1 & 14) * Y | threadIdx.y & 1] = *(int4*)_;
	}
	__device__ __forceinline__ void zero() {
		((long*)_)[0] = 0;
		((long*)_)[1] = 0;
	}
	__device__ __forceinline__ void mm(e2828 a, e2828 b) {
		asm("mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%0,%2},{%4,%5,%6,%7},{%8,%9},{%0,%2};"
		    "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%1,%3},{%4,%5,%6,%7},{%10,%11},{%1,%3};":
		    "=r"(_[0]), "=r"(_[1]), "=r"(_[2]), "=r"(_[3]):
		    "r"(a._[0]), "r"(a._[1]), "r"(a._[2]), "r"(a._[3]),
		    "r"(b._[0]), "r"(b._[1]), "r"(b._[2]), "r"(b._[3]));
	}
};

template<bool ASM>
__global__ void ker(const half * __restrict__ A, const half * __restrict__ B, half * __restrict__ O, int X, int Y, int Z) {
	e2828<ASM> a, b, o;
	for (int x = 0; x < X; x++) {
		for (int y = 0; y < Y; y++) {
			o.zero();
			for (int z = 0; z < Z; z++) {
				a.ld(A + (x * 16 * Z + z) * 16, Z);
				b.ld(B + (y * 16 * Z + z) * 16, Z);
				o.mm(a, b);
			}
			o.st(O + (x * 16 * Y + y) * 16, Y);
		}
	}
}

int main() {
	half *A, *B, *O;
	cudaMalloc(&A, 64 * 16 * 64 * 16 * 2);
	cudaMalloc(&B, 64 * 16 * 64 * 16 * 2);
	cudaMalloc(&O, 64 * 16 * 64 * 16 * 2);

	ker<false> <<< 1, 1>>>(A, B, O, 64, 64, 64);
	ker<true> <<< 1, 1>>>(A, B, O, 64, 64, 64);
}

It seems that using hand-written assembly, which prevents generation of the 32-bit loads, is indeed faster, according to Nsight Compute:


32-bit load in non-asm version

The salient difference seems to be that the handwritten PTX is using all four components of the vector load, so there is no longer any incentive for the compiler to split the vector load?

If so, can you not express at the C++ level what you expressed with the PTX inline assembly? C++ code will be more maintainable in the long term.

I think the PTX is essentially equivalent to the “else” for loop. I’ll try writing it out explicitly.

The innermost loop in the original C++ code is presumably too clever by half, XORing the indexes [*]. In the PTX version you removed that cleverness: you basically flattened that loop, explicitly enumerating the shuffles instead. At that point the use of all vector components is apparent, and the vector load remains as desired.

My working hypothesis is that if you translate the PTX code one-to-one back to C++ with intrinsics, it will result in the desired machine code.

[*] I admit that I did not interpret correctly what that XORing was accomplishing, only realizing it when comparing to the inline PTX code. Could a compiler find out that the loop + XOR basically delivers a “rotating enumeration” of all vector components? It seems like a difficult problem to tackle in general, but you could file an enhancement request against the compiler.
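
For reference, this is the enumeration that loop produces for N = 4 (plain host code, just to spell it out):

#include <cstdio>

int main() {
	for (int i = 0; i < 4; i++) {		// i plays the role of threadIdx.y % 4
		printf("i=%d writes slots:", i);
		for (int j = 1; j < 4; j++)
			printf(" %d", i ^ j);	// i ^ j visits every slot except i
		printf("\n");
	}
}
// prints:
//   i=0 writes slots: 1 2 3
//   i=1 writes slots: 0 3 2
//   i=2 writes slots: 3 0 1
//   i=3 writes slots: 2 1 0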

I modified the code to this, with MODE==0 => PTX, MODE==1 => exact replica of the PTX in C++:

template<int MODE, int N, class T>
__device__ __forceinline__ void shf(T (&_)[N]) {
	if constexpr(MODE == 0) {
		asm("{.reg .u32 %ry;"
		    ".reg .pred %py;"
		    "mov.u32 %ry,%tid.y;"
		    "and.b32 %ry,%ry,3;"
		    "setp.ne.u32 %py,%ry,0;"
		    "@%py bra L1;"
		    "shfl.sync.bfly.b32 %1,%0,1,31,-1;"
		    "shfl.sync.bfly.b32 %2,%0,2,31,-1;"
		    "shfl.sync.bfly.b32 %3,%0,3,31,-1;"
		    "bra L0;"
		    "L1:"
		    "setp.ne.u32 %py,%ry,1;"
		    "@%py bra L2;"
		    "shfl.sync.bfly.b32 %0,%1,1,31,-1;"
		    "shfl.sync.bfly.b32 %3,%1,2,31,-1;"
		    "shfl.sync.bfly.b32 %2,%1,3,31,-1;"
		    "bra L0;"
		    "L2:"
		    "setp.ne.u32 %py,%ry,2;"
		    "@%py bra L3;"
		    "shfl.sync.bfly.b32 %3,%2,1,31,-1;"
		    "shfl.sync.bfly.b32 %0,%2,2,31,-1;"
		    "shfl.sync.bfly.b32 %1,%2,3,31,-1;"
		    "bra L0;"
		    "L3:"
		    "shfl.sync.bfly.b32 %2,%3,1,31,-1;"
		    "shfl.sync.bfly.b32 %1,%3,2,31,-1;"
		    "shfl.sync.bfly.b32 %0,%3,3,31,-1;"
		    "L0:}":"+r"(_[0]), "+r"(_[1]), "+r"(_[2]), "+r"(_[3]):);
	} else if constexpr(MODE == 1) {
		switch (threadIdx.y & 3) {
		case 0:
			_[1] = __shfl_xor_sync(0xffffffff, _[0], 1);
			_[2] = __shfl_xor_sync(0xffffffff, _[0], 2);
			_[3] = __shfl_xor_sync(0xffffffff, _[0], 3);
			break;
		case 1:
			_[0] = __shfl_xor_sync(0xffffffff, _[1], 1);
			_[3] = __shfl_xor_sync(0xffffffff, _[1], 2);
			_[2] = __shfl_xor_sync(0xffffffff, _[1], 3);
			break;
		case 2:
			_[3] = __shfl_xor_sync(0xffffffff, _[2], 1);
			_[0] = __shfl_xor_sync(0xffffffff, _[2], 2);
			_[1] = __shfl_xor_sync(0xffffffff, _[2], 3);
			break;
		default:
			_[2] = __shfl_xor_sync(0xffffffff, _[3], 1);
			_[1] = __shfl_xor_sync(0xffffffff, _[3], 2);
			_[0] = __shfl_xor_sync(0xffffffff, _[3], 3);
		}
	} else {
		for (int i = 0; i < N; i++) {
			if (i == threadIdx.y % N) {
				for (int j = 1; j < N; j++) {
					_[i ^ j] = __shfl_xor_sync(0xffffffff, _[i], j);
				}
				break;
			}
		}
	}
}

EDIT: forgot to actually launch a whole warp last time.
MODE==1 did get rid of the non-vector load, but didn’t really gain anything compared to the pure PTX:

So either optimization of tricky code like this is “dark magic” in CUDA, or maybe I’m missing something that __shfl_xor_sync does, so that my PTX code ends up improperly faster?

You would need to analyze the generated SASS code in detail. At first glance the C++ code for MODE==1 seems to functionally match your hand-coded PTX.

I looked into how switch statements are translated recently and was mildly surprised that the generated code deviated from what I expected. There are multiple idioms that can be used to translate switch statements, and the choice can definitely have an impact on performance.

An even closer translation of your PTX code into C++ would actually use an if-then-else chain rather than a switch, so maybe give that a try as well.

“Dark magic” is not a thing in software; there is always a rational explanation for everything. Tracking down the relevant details that plausibly explain a particular observation can be a hard problem, though. On the plus side, these kinds of deep dives typically leave a programmer with a deeper understanding.

The if version becomes even slower 😂: 48.85 ms. I guess I need to dig into the SASS to find the actual reasons. But since the SASS is deeply optimized, it’s really hard to compare the two, I think.

At first glance, all SHFL.BFLYs are placed at a deeper position (#807~832) in the PTX version:

while they’re at a shallower position in the switch version:

They’re so different that I’m not even sure whether they’re doing the same thing. Maybe I’ll research this some day, I guess.

Since compiler optimization is more or less a mathematical optimization problem (I guess?), it would be nice if nvcc could report optimization statistics, like budget achievement rate, etc., so that I could at least compare the two versions statistically.

This is one of the rare cases where one might want to do a quick sanity check by looking at the generated PTX code first: how does the PTX code generated from the C++ compare to the hand-written code? This might already provide a clue. Also, if it were me, I would double-check that the hand-written PTX is in fact functionally identical to the C++ code derived from it.

I have on occasion spent hours analyzing at SASS level, only to realize belatedly that I had an apples vs oranges scenario in front of me, i.e. two sets of code with subtly different functionality.


Honestly I am not sure what this means. I am not a compiler engineer, but I have worked extensively with compiler engineers. Since many problems in compilers are NP hard, an optimizing compiler basically consists of layers upon layers of mechanical transformations controlled by heuristics. And the ordering of these phases matters as well, leading to a class of problems known as phase-ordering problems. Some compilers run some phases more than once because of that.

The CUDA compiler is especially complex, comprising two optimizing compilers: one that translates C++ to PTX, and one that translates PTX to SASS. Impedance mismatches are definitely possible; I recall historical issues with loop unrolling, as both compilers can perform loop unrolling but apply different heuristics.

Maybe I’ll research this some day, I guess.

This is where my general tendency is to dive in right away until I have a good mental model of what the heck is going on. The SASS shown with a BRA after every SHFL sure looks odd to me; it makes me wonder what the corresponding PTX code looked like (did it already contain these branches?).

The PTX looks similar to me. I’ve switched to a simpler kernel which does exactly one load, one exchange, and one store. I’m comparing the PTX version with the switch version.

Manual PTX:

{
	.reg .b32 	%r<14>;
	.reg .b64 	%rd<5>;
	ld.param.u64 	%rd1, [_Z3kerILi0EEvPi_param_0];
	cvta.to.global.u64 	%rd2, %rd1;
	mov.u32 	%r9, %tid.y;
	mul.wide.u32 	%rd3, %r9, 16;
	add.s64 	%rd4, %rd2, %rd3;
	ld.global.v4.u32 	{%r1, %r2, %r3, %r4}, [%rd4];
	// begin inline asm
	{.reg .u32 %ry;.reg .pred %py;mov.u32 %ry,%tid.y;and.b32 %ry,%ry,3;setp.ne.u32 %py,%ry,0;@%py bra L1;shfl.sync.bfly.b32 %r2,%r1,1,31,-1;shfl.sync.bfly.b32 %r3,%r1,2,31,-1;shfl.sync.bfly.b32 %r4,%r1,3,31,-1;bra L0;L1:setp.ne.u32 %py,%ry,1;@%py bra L2;shfl.sync.bfly.b32 %r1,%r2,1,31,-1;shfl.sync.bfly.b32 %r4,%r2,2,31,-1;shfl.sync.bfly.b32 %r3,%r2,3,31,-1;bra L0;L2:setp.ne.u32 %py,%ry,2;@%py bra L3;shfl.sync.bfly.b32 %r4,%r3,1,31,-1;shfl.sync.bfly.b32 %r1,%r3,2,31,-1;shfl.sync.bfly.b32 %r2,%r3,3,31,-1;bra L0;L3:shfl.sync.bfly.b32 %r3,%r4,1,31,-1;shfl.sync.bfly.b32 %r2,%r4,2,31,-1;shfl.sync.bfly.b32 %r1,%r4,3,31,-1;L0:}
	// end inline asm
	st.global.v4.u32 	[%rd4], {%r1, %r2, %r3, %r4};
	ret;
}

Switch:

{
	.reg .pred 	%p<16>;
	.reg .b16 	%rs<3>;
	.reg .b32 	%r<50>;
	.reg .b64 	%rd<5>;
	ld.param.u64 	%rd2, [_Z3kerILi1EEvPi_param_0];
	cvta.to.global.u64 	%rd3, %rd2;
	mov.u32 	%r21, %tid.y;
	mul.wide.u32 	%rd4, %r21, 16;
	add.s64 	%rd1, %rd3, %rd4;
	ld.global.v4.u32 	{%r49, %r48, %r47, %r46}, [%rd1];
	cvt.u16.u32 	%rs2, %r21;
	and.b16  	%rs1, %rs2, 3;
	setp.eq.s16 	%p1, %rs1, 0;
	@%p1 bra 	$L__BB1_5;
	setp.eq.s16 	%p2, %rs1, 1;
	@%p2 bra 	$L__BB1_4;
	setp.ne.s16 	%p3, %rs1, 2;
	@%p3 bra 	$L__BB1_6;
	mov.u32 	%r26, 3;
	mov.u32 	%r27, 31;
	mov.u32 	%r28, 1;
	mov.u32 	%r29, -1;
	shfl.sync.bfly.b32 	%r46|%p4, %r47, %r28, %r27, %r29;
	mov.u32 	%r30, 2;
	shfl.sync.bfly.b32 	%r49|%p5, %r47, %r30, %r27, %r29;
	shfl.sync.bfly.b32 	%r48|%p6, %r47, %r26, %r27, %r29;
	bra.uni 	$L__BB1_7;
$L__BB1_5:
	mov.u32 	%r36, 3;
	mov.u32 	%r37, 31;
	mov.u32 	%r38, 1;
	mov.u32 	%r39, -1;
	shfl.sync.bfly.b32 	%r48|%p10, %r49, %r38, %r37, %r39;
	mov.u32 	%r40, 2;
	shfl.sync.bfly.b32 	%r47|%p11, %r49, %r40, %r37, %r39;
	shfl.sync.bfly.b32 	%r46|%p12, %r49, %r36, %r37, %r39;
	bra.uni 	$L__BB1_7;
$L__BB1_4:
	mov.u32 	%r31, 3;
	mov.u32 	%r32, 31;
	mov.u32 	%r33, 1;
	mov.u32 	%r34, -1;
	shfl.sync.bfly.b32 	%r49|%p7, %r48, %r33, %r32, %r34;
	mov.u32 	%r35, 2;
	shfl.sync.bfly.b32 	%r46|%p8, %r48, %r35, %r32, %r34;
	shfl.sync.bfly.b32 	%r47|%p9, %r48, %r31, %r32, %r34;
	bra.uni 	$L__BB1_7;
$L__BB1_6:
	mov.u32 	%r41, 3;
	mov.u32 	%r42, 31;
	mov.u32 	%r43, 1;
	mov.u32 	%r44, -1;
	shfl.sync.bfly.b32 	%r47|%p13, %r46, %r43, %r42, %r44;
	mov.u32 	%r45, 2;
	shfl.sync.bfly.b32 	%r48|%p14, %r46, %r45, %r42, %r44;
	shfl.sync.bfly.b32 	%r49|%p15, %r46, %r41, %r42, %r44;
$L__BB1_7:
	st.global.v4.u32 	[%rd1], {%r49, %r48, %r47, %r46};
	ret;
}

Besides the positioning of the branch code and the holding of constants in registers, they don’t seem that different to me. But for the SASS results, one looks like:

The other looks like:

I don’t really understand what BRA.DIV does, since it’s actually @P... BRA 0x... that is used for branching, right?

I am as mystified as you are regarding the BRA.DIV. I have never seen that before, maybe because I don’t use architectures newer than CC 7.5. What architecture are you targeting? One could guess that .DIV stands for divergent, i.e. maybe a kind of counterpart to .UNI?

One difference in the PTX code is that the compiler-generated switch version uses a different flavor of shfl, namely the one that returns both a predicate and a register, while your hand-written code uses the variant without the predicate result. Presumably the predicate result allows for the possibility of an “early out”, which then triggers the insertion of the BRA.DIV instructions?

If, instead of using CUDA’s built-in shfl intrinsic, you created your own device function that wraps an inline-PTX shfl, that might provide the best compromise between readability and programmer control and get you the same performance as the completely hand-coded PTX version.
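
A sketch of what such a wrapper might look like (untested), using the predicate-less shfl.sync.bfly flavor from your hand-written PTX:

// Hypothetical wrapper: butterfly shuffle without the predicate destination.
__device__ __forceinline__ int shfl_bfly(int var, int lane_mask) {
	int result;
	asm volatile("shfl.sync.bfly.b32 %0, %1, %2, 31, -1;"
	             : "=r"(result)
	             : "r"(var), "r"(lane_mask));
	return result;
}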

I’m targeting native (8.9). I just tried wrapping the PTX shfl, and I got an “in-between” result:

The PTX looks fine:

{
	.reg .pred 	%p<4>;
	.reg .b16 	%rs<3>;
	.reg .b32 	%r<78>;
	.reg .b64 	%rd<5>;
	ld.param.u64 	%rd2, [_Z3kerILi1EEvPi_param_0];
	cvta.to.global.u64 	%rd3, %rd2;
	mov.u32 	%r21, %tid.y;
	mul.wide.u32 	%rd4, %r21, 16;
	add.s64 	%rd1, %rd3, %rd4;
	ld.global.v4.u32 	{%r77, %r76, %r75, %r74}, [%rd1];
	cvt.u16.u32 	%rs2, %r21;
	and.b16  	%rs1, %rs2, 3;
	setp.eq.s16 	%p1, %rs1, 0;
	@%p1 bra 	$L__BB1_5;
	setp.eq.s16 	%p2, %rs1, 1;
	@%p2 bra 	$L__BB1_4;
	setp.ne.s16 	%p3, %rs1, 2;
	@%p3 bra 	$L__BB1_6;
	mov.u32 	%r28, 1;
	mov.u32 	%r37, -1;
	// begin inline asm
	shfl.sync.bfly.b32 %r74,%r75,%r28,31,%r37;
	// end inline asm
	mov.u32 	%r32, 2;
	// begin inline asm
	shfl.sync.bfly.b32 %r77,%r75,%r32,31,%r37;
	// end inline asm
	mov.u32 	%r36, 3;
	// begin inline asm
	shfl.sync.bfly.b32 %r76,%r75,%r36,31,%r37;
	// end inline asm
	bra.uni 	$L__BB1_7;
$L__BB1_5:
	mov.u32 	%r52, 1;
	mov.u32 	%r61, -1;
	// begin inline asm
	shfl.sync.bfly.b32 %r76,%r77,%r52,31,%r61;
	// end inline asm
	mov.u32 	%r56, 2;
	// begin inline asm
	shfl.sync.bfly.b32 %r75,%r77,%r56,31,%r61;
	// end inline asm
	mov.u32 	%r60, 3;
	// begin inline asm
	shfl.sync.bfly.b32 %r74,%r77,%r60,31,%r61;
	// end inline asm
	bra.uni 	$L__BB1_7;
$L__BB1_4:
	mov.u32 	%r40, 1;
	mov.u32 	%r49, -1;
	// begin inline asm
	shfl.sync.bfly.b32 %r77,%r76,%r40,31,%r49;
	// end inline asm
	mov.u32 	%r44, 2;
	// begin inline asm
	shfl.sync.bfly.b32 %r74,%r76,%r44,31,%r49;
	// end inline asm
	mov.u32 	%r48, 3;
	// begin inline asm
	shfl.sync.bfly.b32 %r75,%r76,%r48,31,%r49;
	// end inline asm
	bra.uni 	$L__BB1_7;
$L__BB1_6:
	mov.u32 	%r64, 1;
	mov.u32 	%r73, -1;
	// begin inline asm
	shfl.sync.bfly.b32 %r75,%r74,%r64,31,%r73;
	// end inline asm
	mov.u32 	%r68, 2;
	// begin inline asm
	shfl.sync.bfly.b32 %r76,%r74,%r68,31,%r73;
	// end inline asm
	mov.u32 	%r72, 3;
	// begin inline asm
	shfl.sync.bfly.b32 %r77,%r74,%r72,31,%r73;
	// end inline asm
$L__BB1_7:
	st.global.v4.u32 	[%rd1], {%r77, %r76, %r75, %r74};
	ret;
}

SASS:

So now it has two BRA.DIVs per three SHFLs, better than the 3-per-3, but worse than the 1-per-3.

Is it possible to ask someone from the compiler team to explain what BRA.DIV even is? Since it has no variable input (I assume URZ means Something-Register-Zero, which is essentially zero), it at least cannot be an unconditional branch, otherwise this doesn’t make sense.

I am fresh out of ideas what else to try. Maybe other forum participants will have better ideas.

Given that CC 8.9 is a fairly new architecture: are you using the latest toolchain, that is, CUDA 12.3 update 2? NVIDIA’s compiler team is naturally reluctant to look at any issue that is not reproducible with the latest available toolchain.

NVIDIA documents the SASS instructions only in a very rudimentary manner, in the binary tools documentation. That is very much on purpose. The company is secretive about the hardware ISA. The people who make the decisions to be secretive regarding the hardware ISA are not in the compiler team. You could file an enhancement request to get the documentation expanded.

In practical terms, you could file a performance bug against the compiler, by submitting a minimal, self-contained, buildable, runnable piece of code that contrasts the performance of your hand-written PTX code with the best efforts of the compiler. Performance differences <=2% are considered noise level. If you can demonstrate a 10% or, better yet, a 20% performance gap, that might get some traction. Again, make sure that the two sets of code are functionally identical, so it is an apples-to-apples comparison.


The compiler has some defined hints that may be of interest. I don’t know if they would be useful or do anything here.
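
For example (sketch only; whether any of these change the code generation here is untested), the optimization-hint builtins and pragmas that nvcc documents:

// Hypothetical demo of compiler hints; none of this is from the code above.
__device__ void hint_demo(int* out) {
	int y = threadIdx.y & 3;
	__builtin_assume(y >= 0 && y < 4);	// value-range assumption for the compiler
	if (__builtin_expect(y == 0, 0)) {	// mark this branch as unlikely
		out[0] = 0;
	}
	#pragma unroll 1			// keep the following loop rolled
	for (int j = 1; j < 4; j++)
		out[j] = j;
}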

“Something” = Uniform. There is some detail on Uniform Registers on page 32 here.

So I fixed some problems in my code, including:

  • stop doing in-place shfl, since it would overwrite values that other warp lanes still need to read
  • do __syncwarp() to make sure the registers are actually written before the shfl

Here’s the full code, compiled and benchmarked on a 4090 / CC 8.9.

#include <cuda_fp16.h>

template<int MODE>
__device__ __forceinline__ void shf(int (&i)[4], int (&o)[4]) {
	__syncwarp();
	int y = threadIdx.y % 4;
	*(int4*)o = *(int4*)i;
	switch (MODE) {
	case 0:
		asm("{.reg .pred %p;"
		    "setp.ne.s32 %p,%8,0;"
		    "@%p bra L1;"
		    "shfl.sync.bfly.b32 %1,%4,1,31,-1;"
		    "shfl.sync.bfly.b32 %2,%4,2,31,-1;"
		    "shfl.sync.bfly.b32 %3,%4,3,31,-1;"
		    "bra L0;"
		    "L1:"
		    "setp.ne.s32 %p,%8,1;"
		    "@%p bra L2;"
		    "shfl.sync.bfly.b32 %0,%5,1,31,-1;"
		    "shfl.sync.bfly.b32 %3,%5,2,31,-1;"
		    "shfl.sync.bfly.b32 %2,%5,3,31,-1;"
		    "bra L0;"
		    "L2:"
		    "setp.ne.s32 %p,%8,2;"
		    "@%p bra L3;"
		    "shfl.sync.bfly.b32 %3,%6,1,31,-1;"
		    "shfl.sync.bfly.b32 %0,%6,2,31,-1;"
		    "shfl.sync.bfly.b32 %1,%6,3,31,-1;"
		    "bra L0;"
		    "L3:"
		    "shfl.sync.bfly.b32 %2,%7,1,31,-1;"
		    "shfl.sync.bfly.b32 %1,%7,2,31,-1;"
		    "shfl.sync.bfly.b32 %0,%7,3,31,-1;"
		    "L0:}"
		    :"=r"(o[0]), "=r"(o[1]), "=r"(o[2]), "=r"(o[3])
		    :"r"(i[0]), "r"(i[1]), "r"(i[2]), "r"(i[3]), "r"(y));
		break;
	case 1:
		switch (y) {
		case 0:
			o[1] = __shfl_xor_sync(0xffffffff, i[0], 1);
			o[2] = __shfl_xor_sync(0xffffffff, i[0], 2);
			o[3] = __shfl_xor_sync(0xffffffff, i[0], 3);
			break;
		case 1:
			o[0] = __shfl_xor_sync(0xffffffff, i[1], 1);
			o[3] = __shfl_xor_sync(0xffffffff, i[1], 2);
			o[2] = __shfl_xor_sync(0xffffffff, i[1], 3);
			break;
		case 2:
			o[3] = __shfl_xor_sync(0xffffffff, i[2], 1);
			o[0] = __shfl_xor_sync(0xffffffff, i[2], 2);
			o[1] = __shfl_xor_sync(0xffffffff, i[2], 3);
			break;
		default:
			o[2] = __shfl_xor_sync(0xffffffff, i[3], 1);
			o[1] = __shfl_xor_sync(0xffffffff, i[3], 2);
			o[0] = __shfl_xor_sync(0xffffffff, i[3], 3);
		}
		break;
	case 2:
		if (y == 0) {
			o[1] = __shfl_xor_sync(0xffffffff, i[0], 1);
			o[2] = __shfl_xor_sync(0xffffffff, i[0], 2);
			o[3] = __shfl_xor_sync(0xffffffff, i[0], 3);
		} else if (y == 1) {
			o[0] = __shfl_xor_sync(0xffffffff, i[1], 1);
			o[3] = __shfl_xor_sync(0xffffffff, i[1], 2);
			o[2] = __shfl_xor_sync(0xffffffff, i[1], 3);
		} else if (y == 2) {
			o[3] = __shfl_xor_sync(0xffffffff, i[2], 1);
			o[0] = __shfl_xor_sync(0xffffffff, i[2], 2);
			o[1] = __shfl_xor_sync(0xffffffff, i[2], 3);
		} else {
			o[2] = __shfl_xor_sync(0xffffffff, i[3], 1);
			o[1] = __shfl_xor_sync(0xffffffff, i[3], 2);
			o[0] = __shfl_xor_sync(0xffffffff, i[3], 3);
		}
		break;
	default:
		for (int x = 0; x < 4; x++) {
			if (x == y) {
				for (int j = 1; j < 4; j++) {
					o[x ^ j] = __shfl_xor_sync(0xffffffff, i[x], j);
				}
				break;
			}
		}
	}
}

template<int MODE>
struct e2828 {
	// (2x8)x(2x8) matrix where each warp lane holds 4x2 in (2x2)x(row-major 8x8)
	int _[4];
	__device__ __forceinline__ void ld(const half* __restrict__ A, int Y) {
		int _[4];
		*(int4*)_ = ((int4*)A)[(threadIdx.y << 3 & 16 | threadIdx.y >> 1 & 14) * Y | threadIdx.y & 1];
		shf<MODE>(_, this->_);
	}
	__device__ __forceinline__ void st(half* __restrict__ A, int Y) {
		int _[4];
		shf<MODE>(this->_, _);
		((int4*)A)[(threadIdx.y << 3 & 16 | threadIdx.y >> 1 & 14) * Y | threadIdx.y & 1] = *(int4*)_;
	}
	__device__ __forceinline__ void zero() {
		((long*)_)[0] = 0;
		((long*)_)[1] = 0;
	}
	__device__ __forceinline__ void mm(e2828 a, e2828 b) {
		asm("mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%0,%2},{%4,%5,%6,%7},{%8,%9},{%0,%2};"
		    "mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%1,%3},{%4,%5,%6,%7},{%10,%11},{%1,%3};":
		    "=r"(_[0]), "=r"(_[1]), "=r"(_[2]), "=r"(_[3]):
		    "r"(a._[0]), "r"(a._[1]), "r"(a._[2]), "r"(a._[3]),
		    "r"(b._[0]), "r"(b._[1]), "r"(b._[2]), "r"(b._[3]));
	}
};

template<int MODE, int X, int Y, int Z>
__global__ void ker(const half * __restrict__ A, const half * __restrict__ B, half * __restrict__ O) {
	e2828<MODE> a, b, o;
	for (int x = 0; x < X; x++) {
		for (int y = 0; y < Y; y++) {
			o.zero();
			for (int z = 0; z < Z; z++) {
				a.ld(A + (x * 16 * Z + z) * 16, Z);
				b.ld(B + (y * 16 * Z + z) * 16, Z);
				o.mm(a, b);
			}
			o.st(O + (x * 16 * Y + y) * 16, Y);
		}
	}
}

int main() {
	half *A, *B, *O;
	cudaMalloc(&A, 64 * 16 * 64 * 16 * 2);
	cudaMalloc(&B, 64 * 16 * 64 * 16 * 2);
	cudaMalloc(&O, 64 * 16 * 64 * 16 * 2);
	ker<0, 64, 64, 64> <<< 1, 32>>>(A, B, O);
	ker<1, 64, 64, 64> <<< 1, 32>>>(A, B, O);
	ker<2, 64, 64, 64> <<< 1, 32>>>(A, B, O);
	ker<3, 64, 64, 64> <<< 1, 32>>>(A, B, O);
}

Compile & bench command line:

nvcc test2.cu -std=c++20 -arch native && ncu --section SchedulerStats --section WarpStateStats --section SourceCounters --section Occupancy -fo test2 ./a.out

nvcc version:

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Fri_Jan__6_16:45:21_PST_2023
Cuda compilation tools, release 12.0, V12.0.140
Build cuda_12.0.r12.0/compiler.32267302_0

CUDA version:

Driver Version: 550.40.07      CUDA Version: 12.4

result:

Where should I file this problem? Here?

Just updated CUDA & nvcc; newest result:

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Wed_Nov_22_10:17:15_PST_2023
Cuda compilation tools, release 12.3, V12.3.107
Build cuda_12.3.r12.3/compiler.33567101_0

Driver Version: 550.54.14      CUDA Version: 12.4