A more accurate (and potentially faster) double-precision sincospi() implementation

This is the second part of potentially useful left-over work from my recent exploration of cross-platform math functions. What is sincospi(), you ask? It is part of a family of trigonometric functions sinpi(), cospi(), sincospi() that compute the sine ans cosine of πx, but without the user having to supply that multiplication explicitly. The advantages are improved accuracy, improved performance, smaller code footprint, and reduced register pressure compared to the conventional way of computing, such as sin (M_PIx). Variants of these have been around for year, I first became aware of them in Solaris more than a decade ago. The latest version of IEEE-754 lists sinpi() and cospi() in an appendix.

Once aware of their existence, you will probably discover that sin (xπ) and cos(xπ) occur in many computations with a mathematical or scientific context. If you want trigonometric functions with input in degrees (for geodesic computations) for example, these functions are your friend. My favorite use case is the Box-Muller method for generating normally distributed random numbers, which is a perfect fit for sincospi(). See this CUDA tech tip from a few years back: http://www.nvidia.com/content/newsletters/web/CUDA-Week-in-Review-sept-28-12-web.html#code

The code below, when compiled with CUDA 6.5 is 8.7% faster than CUDA’s built-in version on my Quadro 2000 (sm_21), which is not very surprising as this reflects the ratio of 64-bit operations (25/23) used. Speedup on other “DP lite” platforms should be similar, for “DP heavy” platforms it is probably a wash, i.e. performance neutral. The accuracy of this implementation is noticeably better than CUDA 6.5’s version, in that the maximum error is always below 1 ulp compared to the mathematical result, which means the function supplies faithfully-rounded results.

I am placing this code under a two-clause BSD license, so anybody is free to grab whatever parts they like for whatever purpose they like.

/*
  Copyright (c) 2015, Norbert Juffa
  All rights reserved.

  Redistribution and use in source and binary forms, with or without 
  modification, are permitted provided that the following conditions
  are met:

  1. Redistributions of source code must retain the above copyright 
     notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
  "AS IS" AND ANY SINCOSPIRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* approximate sin(pi*a) for a in [-0.25,0.25] */
__device__ __forceinline__
double my_sinpi_poly (double a, double s)
{
    double r;

    r =             4.6151442520157035e-4;
    r = fma (r, s, -7.3700183130883555e-3);
    r = fma (r, s,  8.2145868949323936e-2);
    r = fma (r, s, -5.9926452893214921e-1);
    r = fma (r, s,  2.5501640398732688e+0);
    r = fma (r, s, -5.1677127800499516e+0);
    s = s * a;
    r = r * s;
    r = fma (a, 3.1415926535897931e+0, r);
    return r;
}

/* approximate cos(pi*a) for a in [-0.25,0.25] */
__device__ __forceinline__
double my_cospi_poly (double s)
{
    double r;

    r =            -1.0369917389758117e-4;
    r = fma (r, s,  1.9294935641298806e-3);
    r = fma (r, s, -2.5806887942825395e-2);
    r = fma (r, s,  2.3533063028328211e-1);
    r = fma (r, s, -1.3352627688538006e+0);
    r = fma (r, s,  4.0587121264167623e+0);
    r = fma (r, s, -4.9348022005446790e+0);
    r = fma (r, s,  1.0000000000000000e+0);
    return r;
}

/* scale a by 2**i. No checks for underflow or overflow */
__device__ __forceinline__
double my_unchecked_scale (double a, int i)
{
    int ahi = __double2hiint (a);
    int alo = __double2loint (a);
    return __hiloint2double ((i << 20) + ahi, alo);
}

/* IEEE-754 change sign operation: flips sign bit, even for NaNs */
__device__ __forceinline__ 
double my_fast_fchs (double a)
{
    int ahi = __double2hiint (a);
    int alo = __double2loint (a);
    return __hiloint2double (ahi ^ 0x80000000U, alo);
}

/* Compute sin(a*pi) and cos(a*pi) simultaneously, based on quadrant */
__device__ __forceinline__
void my_sincospi_core (double a, int i, double *sp, double *cp)
{
    double c, s, t;

    s = a * a;
    c = my_cospi_poly (s);
    s = my_sinpi_poly (a, s);
    if (i & 2) {
        s = my_fast_fchs (s); // s = -s
        c = my_fast_fchs (c); // c = -c
    }
    t = fma (-1.0, s, 0.0); // t = -s; avoid cosine results of negative 0
    if (i & 1) {
        s = c;
        c = t;
    }
    *sp = s;
    *cp = c;
}

/* compute sin(a*pi) and cos(a*pi). In 1+ billion trials, maximum error found
   in sine component was < 0.96 ulps, maximum error found in cosine component
   was < 0.97 ulps. Implementation therefore appears to be faithfully rounded.
*/
__device__ __forceinline__
void my_sincospi (double a, double *sp, double *cp)
{
    double az, c, r, s;
    int i, ia, ic;
    
    ia = __double2hiint (a);
    ic = __double2hiint (9.0071992547409920e+15); // 2**53
    az = a * 0.0; // needed for handling corner cases
    /* for |a| >= 2**53, cospi(a) = 1.0, but cospi(Inf) = NaN */
#if ALTERNATE_RANGE_CHECK
    float fa = __int_as_float (ia);
    float fc = __int_as_float (ic);
    a = (fabsf (fa) < fc) ? a : az;
#else  // ALTERNATE_RANGE_CHECK    
    a = (((unsigned)ia + ia) < ((unsigned)ic + ic)) ? a : az;
#endif // ALTERNATE_RANGE_CHECK 
    /* reduce argument into primary approximation interval [-0.25,0.25] */
    r = my_unchecked_scale (a, 1); // 2*a
    r = rint (r);
    i = (int)(long long int)r;
    r = a - 0.5 * r;
    /* compute core approximation */
    my_sincospi_core (r, i, &s, &c);
    /* IEEE-754: sinPi(+n) is +0 and sinPi(-n) is -0 for positive integers n */
    if (a == trunc (a)) s = az;
    *sp = s;
    *cp = c;
}