Regular participants in these forums may have noticed that I am somewhat of a fan of the sinpi(), cospi(), sincospi() functions. These are applicable whenever an algorithm calls for the computation sin(π*[expr]) or cos(π*[expr]), which is surprisingly often. And what’s not to like: Compared with sin(), cos(), and sincos() performance and accuracy are improved, while code size and register pressure are reduced.

Recently I felt compelled to put some work in trying to find the ultimate single-precision implementation of sincospi(), and while I cannot be sure I achieved that I *was* able to get some meaningful improvements in speed and accuracy compared to CUDA 7.5.

On a Quadro K2200, I measured CUDA 7.5 throughput as 19.342e9 functions calls per second, while the code below clocks in at 20.728e9 function calls per second, for a 7% speedup. The accuracy is also improved, as the maximum error in sincospif() is 1.53358 ulps in CUDA 7.5 (for both the sine and cosine components), while the maximum error for my_sincospif() below is 0.96677 ulps for the sine component and 0.96563 ulp for the cosine component, meaning the results are faithfully rounded.

```
/*
Copyright (c) 2016, Norbert Juffa
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Argument reduction for sinpi, cospi, sincospi. Reduces to [-0.25, +0.25] */
__device__ __forceinline__ float trig_red_pi_f (float a, int *i)
{
float r;
r = rintf (a + a);
*i = (int)r;
r = a - 0.5f * r;
return r;
}
/* Approximate cos(pi*x) for x in [-0.25,0.25]. Maximum ulp error = 0.87440 */
__device__ __forceinline__ float cospif_poly (float s)
{
float r;
r = 2.31384277e-1f; // 0x1.d9e000p-3f
r = fmaf (r, s, -1.33502197e+0f); // -0x1.55c400p+0f
r = fmaf (r, s, 4.05870390e+0f); // 0x1.03c1cep+2f
r = fmaf (r, s, -4.93480206e+0f); // -0x1.3bd3ccp+2f
r = fmaf (r, s, 1.00000000e+0f); // 0x1.000000p+0f
return r;
}
/* Approximate sin(pi*x) for x in [-0.25,0.25]. Maximum ulp error = 0.96677 */
__device__ __forceinline__ float sinpif_poly (float a, float s)
{
float r;
r = -5.95703125e-1f; // -0x1.310000p-1f
r = fmaf (r, s, 2.55039954e+0f); // 0x1.46737ep+1f
r = fmaf (r, s, -5.16772413e+0f); // -0x1.4abbfep+2f
r = (a * s) * r;
r = fmaf (a, 3.14159274e+0f, r); // 0x1.921fb6p+1f
return r;
}
/* Compute sin(x*pi) and cos(x*pi) simultaneously, based on the quadrant
Maximum error for sinpi is 0.96677 ulp; for cospi it is 0.96563 ulp
*/
__device__ __forceinline__ void sincospif_core (float a, int i,
float * __restrict__ sp,
float * __restrict__ cp)
{
float c, s, t;
s = a * a;
c = cospif_poly (s);
s = sinpif_poly (a, s);
if (i & 2) {
s = 0.0f - s; // don't change "sign" of NaNs or create negative zeros
c = 0.0f - c; // don't change "sign" of NaNs or create negative zeros
}
if (i & 1) {
t = 0.0f - s; // don't change "sign" of NaNs or create negative zeros
s = c;
c = t;
}
*sp = s;
*cp = c;
}
__device__ void my_sincospif (float a, float *sp, float *cp)
{
float c, r, s;
int i;
/* for |a| > 2**24, cospi(a) = 1.0f, but cospi(Inf) = NaN */
a = (fabsf (a) < 16777216.0f) ? a : (a * 0.0f); // 0x1.0p24f
r = trig_red_pi_f (a, &i);
sincospif_core (r, i, &s, &c);
/* IEEE-754: sinPi(+n) is +0 and sinPi(-n) is -0 for positive integers n */
s = (a == floorf (a)) ? (a * 0.0f) : s;
*sp = s;
*cp = c;
}
```