It seems I never got around to implementing vsubss4 five years ago, as I could find no corresponding code in my collection. I did code up vsubus4, i.e. byte-wise subtraction with unsigned saturation, which results in an efficient 7-instruction sequence. Based on this and the previous post, I think it is safe to conclude that emulation-based SIMD intrinsics for saturated arithmetic are fast enough to be of practical utility. Any missed optimization opportunities in the emulation sequences should be brought to NVIDIA’s attention via an enhancement request (RFE).
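For concreteness, byte-wise subtraction with unsigned saturation computes max (x_byte - y_byte, 0) for each of the four bytes of a 32-bit word. A minimal scalar reference model, handy for spot-checking the SIMD-in-a-word code below (the name vsubus4_ref and the loop formulation are mine, purely for illustration):
#include <stdint.h>
/* illustrative scalar reference: each result byte is max (x_byte - y_byte, 0) */
uint32_t vsubus4_ref (uint32_t x, uint32_t y)
{
    uint32_t r = 0;
    for (int i = 0; i < 32; i += 8) {
        int d = (int)((x >> i) & 0xff) - (int)((y >> i) & 0xff);
        r |= (uint32_t)((d < 0) ? 0 : d) << i;
    }
    return r;
}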
/*
Copyright 2019, Norbert Juffa
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdint.h>
#if (__CUDACC__)
#define __HOST__ __host__
#define __DEVICE__ __device__
#else // __CUDACC__
#define __HOST__
#define __DEVICE__
#endif // __CUDACC__
__HOST__ __DEVICE__ uint32_t lop3_b2 (uint32_t a, uint32_t b, uint32_t c)
{
    uint32_t r;
#if (__CUDA_ARCH__ >= 500)
    asm ("lop3.b32 %0,%1,%2,%3,0xb2;\n\t" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else // __CUDA_ARCH__
    r = (a | c) & ((a & c) | ~b);
#endif // __CUDA_ARCH__
    return r;
}
__HOST__ __DEVICE__ uint32_t lop3_82 (uint32_t a, uint32_t b, uint32_t c)
{
    uint32_t r;
#if (__CUDA_ARCH__ >= 500)
    asm ("lop3.b32 %0,%1,%2,%3,0x82;\n\t" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else // __CUDA_ARCH__
    r = ~(a ^ b) & c;
#endif // __CUDA_ARCH__
    return r;
}
__HOST__ __DEVICE__ uint32_t lop3_28 (uint32_t a, uint32_t b, uint32_t c)
{
    uint32_t r;
#if (__CUDA_ARCH__ >= 500)
    asm ("lop3.b32 %0,%1,%2,%3,0x28;\n\t" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else // __CUDA_ARCH__
    r = (a ^ b) & c;
#endif // __CUDA_ARCH__
    return r;
}
#define MSB_MASK (0x80808080U) // mask for msb of each byte
__HOST__ __DEVICE__ uint32_t masked_sign_to_byte_mask (uint32_t a)
{
#if (__CUDA_ARCH__ >= 200)
    asm ("prmt.b32 %0,%0,0,0xba98;\n\t" : "+r"(a)); // convert MSBs to masks
#else
    a = a & MSB_MASK;
    a = a + a - (a >> 7); // extend MSBs to full byte to create mask
#endif
    return a;
}
__HOST__ __DEVICE__ uint32_t vsubus4 (uint32_t x, uint32_t y)
{
    uint32_t a, b, r, s, t;
    a = x | MSB_MASK;  // force msb of each byte of minuend
    b = y &~ MSB_MASK; // clear msb of each byte of subtrahend
    r = a - b;         // per-byte difference; each byte stays positive, so no borrows cross byte boundaries
    s = lop3_82 (x, y, MSB_MASK);     // msb set in bytes where x and y have equal msbs
    t = lop3_b2 (x, y, r);            // msb set in bytes whose subtraction did not underflow
    t = masked_sign_to_byte_mask (t); // expand msbs into full-byte masks
    r = lop3_28 (r, s, t);            // (r ^ s) & t: undo msb bias, clamp underflowed bytes to 0
    return r;
}
Compiled for an sm_80 target, this results in a 7-instruction sequence (as extracted from Compiler Explorer):
vsubus4(unsigned int, unsigned int):
LOP3.LUT R0, R4, 0x80808080, RZ, 0xfc, !PT
LOP3.LUT R3, R5, 0x7f7f7f7f, RZ, 0xc0, !PT
IMAD.IADD R0, R0, 0x1, -R3
LOP3.LUT R3, R4.reuse, R5, R0, 0xb2, !PT
LOP3.LUT R5, R4, 0x80808080, R5, 0x84, !PT
PRMT R4, R3, 0xba98, RZ
LOP3.LUT R4, R0, R5, R4, 0x28, !PT
RET.ABS.NODEC R20 0x0
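As a quick sanity check, one might compare the emulation against CUDA’s built-in intrinsic on the device. A minimal sketch (the kernel name check_vsubus4 is mine; it assumes the vsubus4 code above is compiled in the same translation unit with nvcc):
// hedged sketch: cross-check vsubus4() against the built-in __vsubus4()
__global__ void check_vsubus4 (const uint32_t *x, const uint32_t *y, int n,
                               int *mismatches)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        if (vsubus4 (x[i], y[i]) != __vsubus4 (x[i], y[i])) {
            atomicAdd (mismatches, 1); // count any disagreement
        }
    }
}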
[Later:]
I went ahead and programmed up vsubss4. This results in a 10-instruction sequence when compiled for an sm_80 target. I note that CUDA’s built-in intrinsics __vaddss4, __vsubss4, and __vsubus4 produce instruction sequences of 10, 10, and 7 instructions, respectively. While the code generated for the intrinsics differs a bit from what is generated for the source code presented here, it seems reasonable to conclude that the CUDA intrinsics enumerated above are “optimal”, given that the instruction counts of the corresponding emulation sequences match.
I further note that I did not run into issues with the CUDA compiler not delivering the desired LOP3s in the code for vsubss4, whereas in the code for vaddss4 and vsubus4 from five years ago I apparently had to resort to hard-coded LOP3 instances. One might see this as anecdotal evidence that the compiler’s ability to generate “optimal” sequences of LOP3s has improved.
/*
Copyright (c) 2024, Norbert Juffa
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdint.h>
#if (__CUDACC__)
#define __HOST__ __host__
#define __DEVICE__ __device__
#else // __CUDACC__
#define __HOST__
#define __DEVICE__
#endif // __CUDACC__
__HOST__ __DEVICE__ uint32_t lop3_e4 (uint32_t a, uint32_t b, uint32_t c)
{
    uint32_t r;
#if (__CUDA_ARCH__ >= 500)
    asm ("lop3.b32 %0,%1,%2,%3,0xe4;\n\t" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else // __CUDA_ARCH__
    r = (a & c) | (b & ~c);
#endif // __CUDA_ARCH__
    return r;
}
__HOST__ __DEVICE__ uint32_t masked_select (uint32_t a, uint32_t b, uint32_t m)
{
    // select bits of 'a' where mask 'm' is set, bits of 'b' where it is clear
#if (__CUDA_ARCH__ >= 500)
    return lop3_e4 (a, b, m);
#elif 0
    return (((a)&(m))|((b)&(~(m))));
#else
    return((((a)^(b))&(m))^(b));
#endif
}
#define MSB_MASK (0x80808080U) // mask for msb of each byte
__HOST__ __DEVICE__ uint32_t masked_sign_to_byte_mask (uint32_t a)
{
#if (__CUDA_ARCH__ >= 200)
    // prmt selector 0xba98 replicates the msb of each source byte across the
    // corresponding result byte (sign-replication mode), converting MSBs to masks
    asm ("prmt.b32 %0,%0,0,0xba98;\n\t" : "+r"(a));
#else
    a = a & MSB_MASK;
    a = a + a - (a >> 7); // extend MSBs to full byte to create mask
#endif
    return a;
}
__HOST__ __DEVICE__ uint32_t vsubss4 (uint32_t a, uint32_t b)
{
    uint32_t r, s, t;
    r = a | MSB_MASK;                     // LOP3
    s = b &~ MSB_MASK;                    // LOP3
    r = r - s;                            // IADD
    t = (b ^ a) & MSB_MASK;               // LOP3
    s = t & (r ^ a);                      // LOP3 // msb indicates overflow
    r = r ^ t ^ MSB_MASK;                 // LOP3 // regular result
    t = masked_sign_to_byte_mask (s);     // PRMT
    s = ~MSB_MASK + ((a & MSB_MASK) >> 7);// LOP3, LEA // potential special res.
    r = masked_select (s, r, t);          // LOP3 // select spec. or reg. result
    return r;
}
The source above turns into the following when compiled for an sm_80 target (extracted from Compiler Explorer):
vsubss4(unsigned int, unsigned int):
LOP3.LUT R3, R4.reuse, 0x80808080, RZ, 0xfc, !PT
LOP3.LUT R6, R5.reuse, 0x7f7f7f7f, RZ, 0xc0, !PT
LOP3.LUT R0, R5, 0x80808080, R4, 0x48, !PT
LOP3.LUT R5, R4, 0x80808080, RZ, 0xc0, !PT
IMAD.IADD R3, R3, 0x1, -R6
LEA.HI R5, R5, 0x7f7f7f7f, RZ, 0x19
LOP3.LUT R4, R0.reuse, R3, R4, 0x60, !PT
LOP3.LUT R0, R0, 0x80808080, R3, 0x96, !PT
PRMT R3, R4, 0xba98, RZ
LOP3.LUT R4, R5, R0, R3, 0xe4, !PT
RET.ABS.NODEC R20 0x0
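For the signed case, the reference semantics clamp each per-byte difference to the int8_t range [-128, 127]. A minimal scalar model for spot-checking the code above (the name vsubss4_ref is mine, purely for illustration):
#include <stdint.h>
/* illustrative scalar reference: per-byte signed saturating subtraction */
uint32_t vsubss4_ref (uint32_t a, uint32_t b)
{
    uint32_t r = 0;
    for (int i = 0; i < 32; i += 8) {
        int d = (int)(int8_t)(a >> i) - (int)(int8_t)(b >> i);
        if (d < -128) d = -128;   // clamp to most negative byte value
        if (d >  127) d =  127;   // clamp to most positive byte value
        r |= (uint32_t)(uint8_t)d << i;
    }
    return r;
}
Comparing vsubss4 (and, on the device, the built-in __vsubss4) against such a reference over random inputs is an easy way to gain confidence in the bit-twiddling above.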