A recent report evaluating and comparing the accuracy of math functions of various commonly-used libraries: Vincenzo Innocente and Paul Zimmermann, “Accuracy of Mathematical Functions in Single, Double, Extended Double and Quadruple Precision”, (HAL Preprint hal-03141101v2), 2022, highlights the fact that many of the math functions in CUDA are worthy of improvement in terms of accuracy. Looking at the details of some functions, it seems that this is possible without a negative impact on performance.
A case in point is the function tanf()
, where I was able to reduce the maximum error of 3.096 ulps in CUDA 11 to 1.955 ulps, with no negative performance impact observed on my Quadro RTX 4000 (compute capability 7.5).
/*
Copyright (c) 2022, Norbert Juffa
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Approximate tangent on [-PI/4,+PI/4]. Maximum ulp error = 0.99698 */
__forceinline__ __device__ float tanf_poly (float s)
{
float r;
r = 4.38117981e-3f; // 0x1.1f2000p-8
r = fmaf (r, s, 8.94600598e-5f); // 0x1.773902p-14
r = fmaf (r, s, 1.08341556e-2f); // 0x1.63037cp-7
r = fmaf (r, s, 2.12811474e-2f); // 0x1.5cab9ap-6
r = fmaf (r, s, 5.40602170e-2f); // 0x1.badc7ep-5
r = fmaf (r, s, 1.33326918e-1f); // 0x1.110db4p-3
r = fmaf (r, s, 3.33333433e-1f); // 0x1.110db4p-3
r = r * s;
return r;
}
/* GPU's built-in reciprocal approximation, MUFU.RCP */
__forceinline__ __device__ float rcp_gpu_raw (float a)
{
float r;
asm ("rcp.approx.ftz.f32 %0,%1;" : "=f"(r) : "f"(a)); // r = 1.0f / a
return r;
}
/* Map tangent value based on quadrant */
__forceinline__ __device__ float tanf_core (float a, int i)
{
float r, s, t;
s = a * a;
t = tanf_poly (s);
r = fmaf (t, a, a);
if (i & 1) {
/* -1/r = -1/(r_hi+r_lo) ~= -1/r_hi + r_lo/(r_hi*r_hi) */
s = a - r;
s = fmaf (t, a, s); // r_lo
t = rcp_gpu_raw (-r); // -1/r_hi
r = fmaf (r, t, 1.0f); // -1/r_hi = t+u
r = fmaf (s, t, r);
r = fmaf (r, t, t);
}
return r;
}
/* 190 bits of 2/PI for Payne-Hanek style argument reduction. */
__constant__ unsigned int two_over_pi_f [] =
{
0x28be60db,
0x9391054a,
0x7f09d5f4,
0x7d4d3770,
0x36d8a566,
0x4f10e410
};
__forceinline__ __device__ float trig_red_slowpath_f (float a, int *quadrant)
{
unsigned long long int p;
unsigned int ia, hi, mid, lo, tmp, i;
int e, q;
float r;
ia = ((((unsigned int)__float_as_int (a)) & 0x007fffff) << 8) | 0x80000000;
e = ((((unsigned int)__float_as_int (a)) >> 23) & 0xff) - 126;
/* compute product x * 2/pi in 2.62 fixed-point format */
i = (unsigned int)e >> 5;
e = (unsigned int)e & 31;
hi = i ? two_over_pi_f [i-1] : 0;
mid = two_over_pi_f [i+0];
lo = two_over_pi_f [i+1];
tmp = two_over_pi_f [i+2];
if (e) {
hi = (hi << e) | (mid >> (32 - e));
mid = (mid << e) | (lo >> (32 - e));
lo = (lo << e) | (tmp >> (32 - e));
}
p = (unsigned long long int)ia * lo;
p = (unsigned long long int)ia * mid + (p >> 32);
p = ((unsigned long long int)(ia * hi) << 32) + p;
q = (int)(p >> 62); // integral portion = quadrant<1:0>
p = p & 0x3fffffffffffffffULL; // fraction
if (p & 0x2000000000000000ULL) { // fraction >= 0.5
p = p - 0x4000000000000000ULL; // fraction - 1.0
q = q + 1;
}
/* compute remainder of x / (pi/2) */
double d = (double)(long long int)p;
d = d * 3.4061215800865545e-19; // 0x1.921fb54442d18p-62 // pi/2 * 0x1.p-62
r = (float)d;
if (a < 0.0f) {
r = -r;
q = -q;
}
*quadrant = q;
return r;
}
/* Argument reduction for trigonometric functions that reduces the argument
to the interval [-PI/4, +PI/4] and also returns the quadrant. It returns
-0.0f for an input of -0.0f
*/
__forceinline__ __device__ float trig_red_f (float a, float switch_over, int *q)
{
float j, r;
if (fabsf (a) > switch_over) {
/* Payne-Hanek style reduction. M. Payne and R. Hanek, Radian reduction
for trigonometric functions. SIGNUM Newsletter, 18:19-24, 1983
*/
r = trig_red_slowpath_f (a, q);
} else {
/* FMA-enhanced Cody-Waite style reduction. W. J. Cody and W. Waite,
"Software Manual for the Elementary Functions", Prentice-Hall 1980
*/
j = fmaf (a, 0.636619747f, 1.2582912e+7f);//0x1.45f306p-1,0x1.800000p+23
*q = __float_as_int (j);
j = j - 1.25829120e+7f; // 0x1.800000p+23
r = fmaf (j, -1.57079601e+00f, a); // -0x1.921fb0p+00 // pio2_high
r = fmaf (j, -3.13916473e-07f, r); // -0x1.5110b4p-22 // pio2_mid
r = fmaf (j, -5.39030253e-15f, r); // -0x1.846988p-48 // pio2_low
}
return r;
}
/* maximum ulp error = 1.95549 ulp1 = 669180566 */
__inline__ __device__ float my_tanf (float a)
{
float r;
int i;
a = fmaf (a, 0.0f, a); // inf -> NaN
r = trig_red_f (a, 252.898206f, &i);
r = tanf_core (r, i);
return r;
}