In an attempt to write vector-addition without a branch, I tried predicated operations:
__global__ void add(const float* a, const float* b, float* c) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
const float aTmp = 0.0f;
const float bTmp = 0.0f;
__shared__ float tmp;
const bool condition = idx < 10000;
const float* aPtr = condition ? &a[idx] : &aTmp;
const float* bPtr = condition ? &b[idx] : &bTmp;
float* cPtr = condition ? &c[idx] : &tmp;
*cPtr = *aPtr + *bPtr;
}
output on godbolt for PTX:
.visible .entry add(float const*, float const*, float*)(
.param .u64 add(float const*, float const*, float*)_param_0,
.param .u64 add(float const*, float const*, float*)_param_1,
.param .u64 add(float const*, float const*, float*)_param_2
)
{
mov.u64 %SPL, __local_depot0;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd8, [add(float const*, float const*, float*)_param_0];
ld.param.u64 %rd9, [add(float const*, float const*, float*)_param_1];
ld.param.u64 %rd6, [add(float const*, float const*, float*)_param_2];
add.u64 %rd10, %SP, 0;
add.u64 %rd11, %SPL, 0;
add.u64 %rd12, %SP, 4;
add.u64 %rd13, %SPL, 4;
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r2, %r1, %r3;
mov.u32 %r5, 0;
st.local.u32 [%rd11], %r5;
st.local.u32 [%rd13], %r5;
setp.gt.s32 %p1, %r4, 9999;
setp.lt.s32 %p2, %r4, 10000;
cvt.s64.s32 %rd1, %r4;
mul.wide.s32 %rd14, %r4, 4;
add.s64 %rd15, %rd8, %rd14;
selp.b64 %rd2, %rd15, %rd10, %p2;
add.s64 %rd16, %rd9, %rd14;
selp.b64 %rd3, %rd16, %rd12, %p2;
mov.u32 %r6, add(float const*, float const*, float*)::tmp;
{ .reg .b64 %tmp;
cvt.u64.u32 %tmp, %r6;
cvta.shared.u64 %rd18, %tmp; }
@%p1 bra $L__BB0_2;
shl.b64 %rd17, %rd1, 2;
add.s64 %rd18, %rd6, %rd17;
$L__BB0_2:
ld.f32 %f1, [%rd3];
ld.f32 %f2, [%rd2];
add.f32 %f3, %f2, %f1;
st.f32 [%rd18], %f3;
ret;
}
but when I change the output predication to a register instead of shared memory:
__global__ void add(const float* a, const float* b, float* c) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
const float aTmp = 0.0f;
const float bTmp = 0.0f;
float tmp; // now a register
const bool condition = idx < 10000;
const float* aPtr = condition ? &a[idx] : &aTmp;
const float* bPtr = condition ? &b[idx] : &bTmp;
float* cPtr = condition ? &c[idx] : &tmp;
*cPtr = *aPtr + *bPtr;
}
PTX output becomes a ret
.visible .entry add(float const*, float const*, float*)(
.param .u64 add(float const*, float const*, float*)_param_0,
.param .u64 add(float const*, float const*, float*)_param_1,
.param .u64 add(float const*, float const*, float*)_param_2
)
{
ret;
}
and its SASS is this:
add(float const*, float const*, float*):
MOV R1, c[0x0][0x20]
NOP
NOP
NOP
EXIT
.L_x_0:
BRA `(.L_x_0)
.L_x_1:
is this a bug in NVCC 12.5.1 (godbolt uses this)?
Perhaps mixing global memory and private memory in same predicate is not legal?
Link to the compiler explorer: Compiler Explorer