After recent success with an accuracy-optimized implementation of acosf() I figured I should try my hand at an accuracy-optimized asinf() as well. The built-in implementation of CUDA 12.8 served as my baseline. It compiles to 26 instructions for the sm_89 architecture and has a maximum error < 1.35 ulps.
My implementation my_asinf() below reduces maximum error to < 1.24 ulps, and reduces the instruction count to 24 for an sm_89 target. Note that the helper function copysignf_pos() should compile to a single LOP3 instruction, but for some reason that did not happen here, so I had to force this by using PTX inline assembly.
my_asinf(float):
MOV R3, R4
MOV R0, 0x3f000000
MOV R6, 0x3d4f0000
FFMA R0, -|R3|, R0, 0.5
MUFU.RSQ R5, R0
FSETP.NEU.AND P0, PT, R0.reuse, RZ, PT
FMUL R7, R0, R5
FMUL R4, R5, 0.5
FFMA R5, R7, -R7, R0
@P0 FFMA R0, R4, R5, R7
FSETP.GT.AND P0, PT, |R3|, 0.5625, PT
FSEL R4, R0, |R3|, P0
FMUL R5, R4, R4
FFMA R6, R5, R6, 0.01894361153244972229
FFMA R6, R5, R6, 0.046598583459854125977
FFMA R6, R5, R6, 0.074859127402305603027
FFMA R6, R5, R6, 0.16666957736015319824
FMUL R5, R5, R6
FFMA R4, R4, R5, R4
@P0 MOV R5, 0x3fd774eb
@P0 FMUL R0, R4, -2
@P0 FFMA R4, R5, 0.93318945169448852539, R0
FSETP.GTU.AND P0, PT, |R4|, +INF , PT
@!P0 LOP3.LUT R4, R4, 0x80000000, R3, 0xf8, !PT
RET.ABS.NODEC R20 0x0
Here is my source code:
Copyright (c) 2026, Norbert Juffa
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
__forceinline__ __device__ float raw_rsqrt (float a)
{
float r;
asm ("rsqrt.approx.ftz.f32 %0,%1;" : "=f"(r) : "f"(a));
return r;
}
__forceinline__ __device__ float copysignf_pos (float x, float y)
{
int a, b, c, r;
a = __float_as_int (x);
b = __float_as_int (y);
c = 0x80000000; // sign bit
// a | (b & c)
asm ("lop3.b32 %0,%1,%2,%3,0xf8;\n\t" : "=r"(r) : "r"(a), "r"(b), "r"(c));
return __int_as_float (r);
}
// Compute sqrt(a) from rsqrt approximation using Schoenhage's coupled iteration
__forceinline__ __device__ float sqrtf_schoenhage (float a)
{
float ra = raw_rsqrt (a);
float s0 = a * ra;
float r0 = 0.5f * ra;
float s1 = fmaf (fmaf (s0, -s0, a), r0, s0);
return (a == 0) ? a : s1;
}
__device__ float my_asinf (float x)
{
const float SWITCHOVER = 0.5625f;
const float pio2_fac1 = 0.93318945f;
const float pio2_fac2 = 1.68325555f;
float ax, res, redx, sq, t;
ax = fabsf(x);
// asin (x) = pi/2 - 2 * asin (sqrt ((1 - x) / 2))
t = sqrtf_schoenhage (fmaf (0.5f, -ax, 0.5f));
redx = (ax > SWITCHOVER)? t : ax;
// approximate asin(redx) on [0,
sq = redx * redx;
res = 0x1.9e0000p-5f;
res = fmaf (res, sq, 0x1.365f44p-6f);
res = fmaf (res, sq, 0x1.7dbc50p-5f);
res = fmaf (res, sq, 0x1.329f7cp-4f);
res = fmaf (res, sq, 0x1.5556dcp-3f);
res = res * sq;
res = fmaf (res, redx, redx);
// map asin(x) from asin(redx) based on magnitude of |x|
if (ax > SWITCHOVER) res = fmaf (pio2_fac1, pio2_fac2, -2 * res);
// preserve "sign" of NaNs
if (!isnan (res)) res = copysignf_pos (res, x);
return res;
}