CUDA’s standard math library computes sine and cosine with a maximum error of 1.5 ulps, whether through individual calls to sin() and cos() or more efficiently through sincos() when both trig functions are required for the same argument, as is frequently the case. This level of accuracy is the best that can be achieved with an argument reduction accurate to double precision; a lower error bound would require the computation of the reduced argument with extended precision. This would introduce significant overhead, adding about 50% work to the fast path of these trig functions as I recall.
However, for some experimental work, I needed to compute sine and cosine as accurately as possible, yet efficiently, as sincos() would be invoked numerous times. For this particular use case the domain only needed to comprise one full revolution, so ±2π. As it turns out, the three least significant bits of the double representation of π/2 are zero, which allows the error-free computation of the product i * M_PI/2 for i = 0 … 7, which leads to a very efficient extended-precision argument reduction for the interval [-(15/4)*PI, +(15/4)*PI]. (15/4)*PI ≈ 11.78. One does not even need FMAs (fused multiply-adds) to do this, as I show in the code below (FOR_DEMONSTRATION_PURPOSES_ONLY), but of course use of FMA improves efficiency.
For the core approximations to sine and cosine on [-PI/4, +PI/4] I used polynomial minimax approximations whose coefficients I took from the literature for expediency. Since these were generated by a noted expert in the design of math libraries, they are likely very nearly optimal. The evaluation uses a mix of second-order and ordinary Horner schemes, which improves ILP while adding only a tiny bit of error. ILP of double-precision computation isn’t really a thing in the CUDA environment, but it may prove helpful when I re-use this code on other processor architectures.
Since the domain is restricted to a narrow interval, no special cases other than an argument of negative zero for sine need to be considered. This is easily addressed by use of copysign() to ensure same-signedness of argument and result, at the cost of only a cheap LOP3 operation at the SASS level.
My tests indicate that the maximum error in both the sine and cosine components is less than 0.8 ulp and that on average about 96.6% of the results are correctly rounded, compared to around 85% when using CUDA’s built-in sincos().
/*
Copyright 2025, Norbert Juffa
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Approximate sin (x + x_tail) on the primary interval [-PI/4, +PI/4].

   x       head of the reduced argument
   x_tail  tail of the reduced argument (extended-precision correction)
   x2      precomputed x * x
   x4      precomputed x2 * x2

   Coefficients from: Ping Tak Peter Tang, "Some Software Implementations
   of the Function Sine and Cosine." Report ANL-90/3, Argonne National
   Laboratory, April 1990
*/
__device__ double sin_poly (double x, double x_tail, double x2, double x4)
{
    const double S0 =  1.59108690260756780e-10;
    const double S1 = -2.50510254394993115e-8;
    const double S2 =  2.75573156035895542e-6;
    const double S3 = -1.98412698361250482e-4;
    const double S4 =  8.33333333333178931e-3;
    const double S5 = -1.66666666666666796e-1;
    double sum_even, sum_odd, xcube, res;
    /* Two interleaved sub-polynomials in x4 (second-order Horner scheme)
       shorten the dependency chain; they are merged below via x2. */
    sum_even = fma (S0, x4, S2);
    sum_odd  = fma (S1, x4, S3);
    sum_even = fma (sum_even, x4, S4);
    sum_even = fma (sum_odd, x2, sum_even);
    xcube = x * x2;
    /* Fold in the lowest-order polynomial term S5*x**3 along with the tail
       of the argument: the inner FMAs accumulate x_tail * (1 - x2/2), a
       first-order approximation of the derivative correction x_tail*cos(x). */
    res = x + fma (xcube, S5, fma (fma (xcube, sum_even, -0.5 * x_tail), x2, x_tail));
    return copysign (res, x); // ensure -0 is passed through
}
/* Approximate cos (x + x_tail) on the primary interval [-PI/4, +PI/4].

   x       head of the reduced argument
   x_tail  tail of the reduced argument (extended-precision correction)
   x2      precomputed x * x
   x4      precomputed x2 * x2

   Coefficients from: Ping Tak Peter Tang, "Some Software Implementations
   of the Function Sine and Cosine." Report ANL-90/3, Argonne National
   Laboratory, April 1990
*/
__device__ double cos_poly (double x, double x_tail, double x2, double x4)
{
    const double C0 = -1.13599319556004135e-11;
    const double C1 =  2.08757292566166689e-9;
    const double C2 = -2.75573144009911252e-7;
    const double C3 =  2.48015872896717822e-5;
    const double C4 = -1.38888888888744744e-3;
    const double C5 =  4.16666666666666019e-2;
    double sum_even, sum_odd, head, head_err;
    /* Two interleaved sub-polynomials in x4 (second-order Horner scheme)
       shorten the dependency chain; they are merged below via x2. */
    sum_even = fma (C0, x4, C2);
    sum_odd  = fma (C1, x4, C3);
    sum_even = fma (sum_even, x4, C4);
    sum_odd  = fma (sum_odd, x4, C5);
    sum_even = fma (sum_even, x2, sum_odd);
    /* Represent 1 - x2/2 as an unevaluated head:tail pair; the second FMA
       recovers the rounding error of the first. */
    head     = fma (-0.5, x2, 1.0);
    head_err = fma (-0.5, x2, 1.0 - head);
    /* -x * x_tail is the first-order derivative correction x_tail * (-sin x)
       accounting for the tail of the reduced argument. */
    return head + fma (x4, sum_even, fma (-x, x_tail, head_err));
}
/* Map the primary-interval approximations of sine and cosine into the target
   quadrant and store the results through *sp (sine) and *cp (cosine).

   x, x_tail  head:tail pair of the reduced argument in [-PI/4, +PI/4]
   quadrant   quotient from the argument reduction; only bits 0 and 1 are used
*/
__device__ void sc_core (double x, double x_tail, int quadrant, double *sp, double *cp)
{
    double cosine, sine, xsq, xquad;
    xsq = x * x;
    xquad = xsq * xsq;
    cosine = cos_poly (x, x_tail, xsq, xquad);
    sine = sin_poly (x, x_tail, xsq, xquad);
    /* Branch-free equivalent of the following quadrant adjustment:
       if (quadrant & 2) { sine = -sine; cosine = -cosine; }
       if (quadrant & 1) { double t = -sine; sine = cosine; cosine = t; }
    */
    unsigned long long int negate_mask, swap_mask;
    /* bit 1 of the quadrant, shifted into the sign-bit position (bit 63) */
    negate_mask = (unsigned long long int)(quadrant & 2) << 62;
    sine   = __longlong_as_double (__double_as_longlong (sine) ^ negate_mask);
    cosine = __longlong_as_double (__double_as_longlong (cosine) ^ negate_mask);
    /* bit 0 of the quadrant: sign flip for the swapped cosine ... */
    negate_mask = (unsigned long long int)(quadrant & 1) << 63;
    /* ... and an all-ones mask selecting the classic XOR-swap of s and c */
    swap_mask = 0ULL - (unsigned long long int)(quadrant & 1);
    swap_mask = (__double_as_longlong (sine) ^ __double_as_longlong (cosine)) & swap_mask;
    sine   = __longlong_as_double (__double_as_longlong (sine) ^ swap_mask);
    cosine = __longlong_as_double (__double_as_longlong (cosine) ^ swap_mask ^ negate_mask);
    *sp = sine;
    *cp = cosine;
}
/* Divide x in [-15/4*PI, +15/4*PI] by PI/2, rounding the quotient to nearest.
This results in a remainder in [-PI/4, +PI/4], returned as a head:tail pair.
The rounded integer quotient is stored through *quot (its two least
significant bits select the quadrant in sc_core).
*/
__device__ double trig_red_restricted_domain (double x, double *redx_tail, int *quot)
{
// Adding 2**52 + 2**51 to a double of small magnitude forces rounding to the
// nearest integer, which then sits in the low 32 bits of the significand.
const double cvt_magic = 6755399441055744.0; // 2**52 + 2**51
// Two-term split of pi/2 (Cody-Waite style). pio2_hi has three trailing zero
// bits in its significand, so j * pio2_hi is exact for the small integers j
// that arise on this restricted domain.
const double pio2_hi = 1.5707963267948966e+00; // (pi / 2)_head
const double pio2_lo = 6.1232339957367660e-17; // (pi / 2)_tail
const double two_over_pi = 6.3661977236758138e-1; // 2 / pi
double t, j, redx_head;
int i;
#if FOR_DEMONSTRATION_PURPOSES_ONLY
// FMA-free variant using explicit round-to-nearest intrinsics, to demonstrate
// that the extended-precision reduction does not require fused operations.
t = __dmul_rn (x, two_over_pi);
t = __dadd_rn (t, cvt_magic);
// rounded quotient as integer, read from the low word of the magic sum
i = (int)(unsigned int)__double2loint(t);
// rounded quotient as a double: subtracting the magic constant undoes the bias
j = __dsub_rn (t, cvt_magic);
double prod_hi, prod_lo;
prod_hi = __dmul_rn (j, pio2_hi); // exact
prod_lo = __dmul_rn (j, pio2_lo);
// subtract j*(pi/2) in two steps; x - prod_hi is exact (Sterbenz-style
// cancellation of nearby values), then capture the error of the second step
t = __dsub_rn (x, prod_hi);
redx_head = __dsub_rn (t, prod_lo);
*quot = i;
*redx_tail = __dsub_rn (__dsub_rn (t, redx_head), prod_lo);
#else // !FOR_DEMONSTRATION_PURPOSES_ONLY
// Preferred FMA-based path: round quotient to nearest via the magic constant
t = fma (x, two_over_pi, cvt_magic);
i = (int)(unsigned int)__double2loint(t);
j = t - cvt_magic;
// remove j*(pi/2) in head/tail order; each FMA keeps the full product
t = fma (j, -pio2_hi, x);
redx_head = fma (j, -pio2_lo, t);
// recover the rounding error of the head computation as the tail
t = t - redx_head;
*quot = i;
*redx_tail = fma (j, -pio2_lo, t);
#endif // FOR_DEMONSTRATION_PURPOSES_ONLY
return redx_head;
}
/* Compute sine and cosine simultaneously for x in [-15/4*PI, +15/4*PI],
   storing the results through *sp and *cp respectively.
   Largest observed errors in 1 billion random tests were as follows:
   sin: maxulperr = 0.78982594031559195 @ -3.9301709436669303
   cos: maxulperr = 0.77762313177140596 @ -10.195310145486687
*/
__device__ void sincos_restricted_domain (double x, double *sp, double *cp)
{
    double head, tail;
    int quad;
    /* reduce the argument to [-PI/4, +PI/4] as a head:tail pair, then
       evaluate the core approximations and apply the quadrant fix-up */
    head = trig_red_restricted_domain (x, &tail, &quad);
    sc_core (head, tail, quad, sp, cp);
}