An accurate single-precision implementation of the Lambert W function

The Lambert W function is the solution w of w*exp(w)=z. It is named after the Swiss mathematician Johann Heinrich Lambert (1728-1777), although (in keeping with Stigler’s law of eponymy) it was originally examined by his more famous compatriot Leonard Euler, who, however, credited Lambert with some of the underlying ideas in the relevant publication:

Leonard Euler, “De serie Lambertina plurimisque eius insignibus proprietatibus,” (On Lambert’s series and its many distinctive properties) Acta Academiae Scientiarum Imperialis Petropolitanae pro Anno MDCCLXXIX, Tomus III, Pars II, (Proceedings of the Imperial Academy of Sciences of St. Petersburg for the Year 1779, volume 3, part 2, Jul. - Dec.), St. Petersburg: Academy of Sciences 1783, pp. 29-51 (scan by Bavarian State Library, Munich)

After a slumber of some 200 years, the function was rediscovered (and named after Lambert) in the 1990s for use in computer algebra packages, but there have since been various applications in the physical sciences, a useful overview of which can be found in Wikipedia. My own interest arose from using it as a building block for the computation of the inverse of the principal branch of the gamma function.

The Lambert W function is multi-branched, but when restricted to the real numbers, solving y*exp(y)=x, there are just two branches, commonly referred to as W0 (the principal branch) for x in [-exp(-1), ∞] and W-1 for x in [-exp(-1), 0). As in cases of other multi-branched functions such as the square root, the principal branch is usually the one of interest, and is the one computed by the single-precision implementation lambert_w0f below.

The numerical computation of the Lambert W function is an area of active research, with most of the focus either on piecewise rational approximations or on starting approximations of various kinds coupled with functional iteration schemes.

I created lambert_w0f as a starting point for further investigations. In its present form it provides results with good accuracy, with maximum error of 2.53995 ulps across the entire input domain. Performance is reasonably good, with the cost of logarithmic and exponential functions mitigated as best as possible, for example by using device intrinsics. I am using simple Newton iterations, as the iterations of higher order, like Halley’s (third order) and Schröder’s method (fifth order), often advocated in the literature, seem not to perform as well as one might expect when executed in finite-precision floating-point arithmetic. In particular three Newton iterations provide more accurate results than two Halley iterations.

[ Code below updated 7/21/2022, 7/25/2022, 7/26/2022, 7/27/2022, 7/29/2022, 8/3/2022 ]

/*
  Copyright (c) 2022, Norbert Juffa
  All rights reserved.

  Redistribution and use in source and binary forms, with or without 
  modification, are permitted provided that the following conditions
  are met:

  1. Redistributions of source code must retain the above copyright 
     notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

__forceinline__ __device__ float raw_rcp (float a)
{
    float r;
    asm ("rcp.approx.ftz.f32 %0,%1;" : "=f"(r) : "f"(a));
    return r;
}

__forceinline__ __device__ float raw_sqrt (float a)
{
    float r;
    asm ("sqrt.approx.ftz.f32 %0,%1;" : "=f"(r) : "f"(a));
    return r;
}

__forceinline__ __device__ float raw_ex2 (float a)
{
    float r;
    asm ("ex2.approx.ftz.f32 %0,%1;" : "=f"(r) : "f"(a));
    return r;
}

__forceinline__ __device__ float raw_lg2 (float a)
{
    float r;
    asm ("lg2.approx.ftz.f32 %0,%1;" : "=f"(r) : "f"(a));
    return r;
}

/* Compute exp(a) * 2**scale. Max ulp err = 0.86565 */
__forceinline__ __device__ float expf_scale (float a, int scale)
{
    const float flt_int_cvt = 12582912.0f; // 0x1.8p23
    float f, r, j, t;
    int i;

    // exp(a) = 2**i * exp(f); i = rintf (a / log(2))
    j = fmaf (1.442695f, a, flt_int_cvt); // // 0x1.715476p0 // log2(e)
    t = j - flt_int_cvt; 
    f = fmaf (t, -6.93145752e-1f, a); // -0x1.62e400p-1  // log_2_hi 
    f = fmaf (t, -1.42860677e-6f, f); // -0x1.7f7d1cp-20 // log_2_lo 
    i = __float_as_int (j);
    // approximate r = exp(f) on interval [-log(2)/2, +log(2)/2]
    r =             1.37805939e-3f;  // 0x1.694000p-10
    r = fmaf (r, f, 8.37312452e-3f); // 0x1.125edcp-7
    r = fmaf (r, f, 4.16695364e-2f); // 0x1.555b5ap-5
    r = fmaf (r, f, 1.66664720e-1f); // 0x1.555450p-3
    r = fmaf (r, f, 4.99999851e-1f); // 0x1.fffff6p-2
    r = fmaf (r, f, 1.00000000e+0f); // 0x1.000000p+0
    r = fmaf (r, f, 1.00000000e+0f); // 0x1.000000p+0
    // exp(a) = 2**(i+scale) * r;
    r = __int_as_float (((i + scale) << 23) + __float_as_int (r));
    return r;
}

/*
  Compute the principal branch of the Lambert W function, W_0. The maximum
  error in the positive half-plane is 1.49923 ulps and the maximum error in
  the negative half-plane is 2.53995 ulps; a total of 76456077 results are
  not correctly rounded.
*/
__device__ float lambert_w0f (float z) 
{
    const float MY_INF = __int_as_float (0x7f800000);
    const float em1_hi =  3.67879450e-1f; // exp(-1)_hi
    const float em1_lo = -9.14975473e-9f; // exp(-1)_lo
    const float qe1 = 2.71828183f / 4.0f; // exp(1)/4
    float e, w, num, den, rden, redz, y, r;

    if (isnan (z) || (z == MY_INF) || (z == 0.0f)) return z + z;
    if (fabsf (z) < 1.220703125e-4f) return fmaf (-z, z, z); // 0x1.0p-13
    redz = (z + em1_hi) + em1_lo;
    if (redz < 0.0244140625f) { // expansion at -(exp(-1))
        r = raw_sqrt (redz);
        w =             -2.28572245f;  // -0x1.24928ep+1
        w = fmaf (w, r,  2.77717149f); //  0x1.637a5ap+1
        w = fmaf (w, r, -2.33162966f); // -0x1.2a72d8p+1
        w = fmaf (w, r,  1.93585982f); //  0x1.ef9482p+0
        w = fmaf (w, r, -1.81217782f); // -0x1.cfeae2p+0
        w = fmaf (w, r,  2.33164396f); //  0x1.2a7350p+1
        w = fmaf (w, r, -1.00000000f); // -0x1.000000p+0
    } else {
        /* Compute initial approximation. Based on: Roberto Iacono and John 
           Philip Boyd, "New approximations to the principal real-valued branch
           of the Lambert W function", Advances in Computational Mathematics, 
           Vol. 43, No. 6, December 2017, pp. 1403-1436
        */
        y = 2.0f * raw_sqrt (fmaf (qe1, z, 0.25f));
        y = raw_lg2 (raw_rcp (fmaf (0.31819974f, raw_lg2 (1.0f + y), 1.0f)) * 
                     fmaf (1.15262585f, y, 1.0f));
        w = fmaf (1.41337042f, y, -1.0f);

        /* perform Newton iterations to refine approximation to full accuracy */
        const float l2e = 1.44269504f; // log2(exp(1))
        e = 0.125f * raw_ex2 (l2e * w);
        num = fmaf (w, e, -0.125f * z);
        den = fmaf (w, e, e);
        rden = raw_rcp (den);
        w = fmaf (-num, rden, w);

        e = 0.125f * raw_ex2 (l2e * w);
        num = fmaf (w, e, -0.125f * z);
        den = fmaf (w, e, e);
        rden = raw_rcp (den);
        w = fmaf (-num, rden, w);

        e = expf_scale (w, -3);
        num = fmaf (w, e, -0.125f * z);
        den = fmaf (w, e, e);
        rden = raw_rcp (den);
        w = fmaf (-num, rden, w);
    }
    return w;
}
1 Like

This is an interesting hobby of yours.

Ikr :-)

One of the things that brought me to computers decades ago was me wondering what happens under the hood after pressing the ex button on my calculator. And in the days before the internet, that was not trivial to figure out. As I recall it took me several years and access to a university library to discover all the details.

Now that I am retired and have plenty of time on my hands I am at times wondering much the same about slightly more complicated functions. Since the Lambert W function was named in the 1990s, after I was out of school, I did not know of its existence until the early 2000s. Recently, I discovered another useful function I had never heard of before: Owen’s T function. It came up in the context of some question on one of the Stackexchange sites.

When you find the right fit for an intellect like that, it is like striking gold. That’s probably true for most people and their work.

We were lucky to have njuffa during the time spent working at NVIDIA.

Here is a double-precision implementation of the principal branch of the Lambert W function W0. It is fully functional and accurate (maximum error found so far is 2.67268 ulp) but not yet fully optimized for performance.

/*
  Copyright (c) 2022, Norbert Juffa
  All rights reserved.

  Redistribution and use in source and binary forms, with or without 
  modification, are permitted provided that the following conditions
  are met:

  1. Redistributions of source code must retain the above copyright 
     notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Compute exp(a) * 2**scale. Maximum error found: 0.89028 ulp */
__device__ double exp_scale (double a, int scale)
{
    const double ln2_hi = 6.9314718055829871e-01;
    const double ln2_lo = 1.6465949582897082e-12;
    const double l2e = 1.4426950408889634; // log2(e)
    const double cvt = 6755399441055744.0; // 3 * 2**51
    double f, r;
    int i;

    // exp(a) = exp(i + f); i = rint (a / log(2))
    r = fma (l2e, a, cvt);
    i = __double2loint (r);
    r = r - cvt;
    f = fma (r, -ln2_hi, a);
    f = fma (r, -ln2_lo, f);

    // approximate r = exp(f) on interval [-log(2)/2,+log(2)/2]
    r =            2.5022018235176802e-8;  // 0x1.ade0000000000p-26
    r = fma (r, f, 2.7630903497145818e-7); // 0x1.28af3fcbbf09bp-22
    r = fma (r, f, 2.7557514543490574e-6); // 0x1.71dee623774fap-19
    r = fma (r, f, 2.4801491039409158e-5); // 0x1.a01997c8b50d7p-16
    r = fma (r, f, 1.9841269589068419e-4); // 0x1.a01a01475db8cp-13
    r = fma (r, f, 1.3888888945916566e-3); // 0x1.6c16c1852b805p-10
    r = fma (r, f, 8.3333333334557735e-3); // 0x1.11111111224c7p-7
    r = fma (r, f, 4.1666666666519782e-2); // 0x1.55555555502a5p-5
    r = fma (r, f, 1.6666666666666477e-1); // 0x1.5555555555511p-3
    r = fma (r, f, 5.0000000000000122e-1); // 0x1.000000000000bp-1
    r = fma (r, f, 1.0000000000000000e+0); // 0x1.0000000000000p+0
    r = fma (r, f, 1.0000000000000000e+0); // 0x1.0000000000000p+0

    // exp(a) = 2**(i+scale) * r;
    r = __hiloint2double (__double2hiint (r) + ((i + scale) << 20), 
                          __double2loint (r));
    return r;
}

/* Compute natural lograithm, log(a). Maximum error found: 0.86902 ulp */
__device__ double my_log (double a)
{
    const double LOG2_HI = 6.9314718055994529e-01; // 0x1.62e42fefa39efp-01
    const double LOG2_LO = 2.3190468138462996e-17; // 0x1.abc9e3b39803fp-56
    double m, r, i, s, t, p, q;
    int e, ec = 0;

    if (__double2hiint (a) < __double2hiint (2.2250738585072014e-308)) { // subnormal
        a *= 4503599627370496.0; // 0x1.0p52
        ec -= (52 << 20);
    }
    e = (__double2hiint (a) - __double2hiint (0.70703125)) & 0xfff00000;
    m = __hiloint2double (__double2hiint (a) - e, __double2loint (a));
    t = __hiloint2double (0x41f00000, 0x80000000 ^ (e + ec));
    i = t - (__hiloint2double (0x41f00000, 0x80000000));

    /* m now in [181/256, 362/256]. Compute q = (m-1) / (m+1) */
    p = m + 1.0;
    r = 1.0 / p;
    q = fma (m, r, -r);
    m = m - 1.0;

    /* Compute (2*atanh(q)/q-2*q) as p(q**2), q in [-75/437, 53/309] */
    s = q * q;
    r =            1.4794533702196025e-1;  // 0x1.2efdf700d7135p-3
    r = fma (r, s, 1.5314187748152339e-1); // 0x1.39a272db730f7p-3
    r = fma (r, s, 1.8183559141306990e-1); // 0x1.746637f2f191bp-3
    r = fma (r, s, 2.2222198669309609e-1); // 0x1.c71c522a64577p-3
    r = fma (r, s, 2.8571428741489319e-1); // 0x1.24924941c9a2fp-2
    r = fma (r, s, 3.9999999999418523e-1); // 0x1.999999998006cp-2
    r = fma (r, s, 6.6666666666667340e-1); // 0x1.5555555555592p-1
    r = r * s;

    /* log(a) = 2*atanh(q) + i*log(2) = LOG2_LO*i + p(q**2)*q + 2q + LOG2_HI*i.
       Use K.C. Ng's trick to improve the accuracy of the computation, like so:
       p(q**2)*q + 2q = p(q**2)*q + q*t - t + m, where t = m**2/2.
    */
    t = m * m * 0.5;
    r = fma (q, t, fma (q, r, LOG2_LO * i)) - t + m;
    r = fma (LOG2_HI, i, r);

    /* Handle special cases */
    if (!((a > 0.0) && (a < INFINITY))) {
        r = a + a;  // handle inputs of NaN, +Inf
        if (a  < 0.0) r = __hiloint2double (0xfff80000, 0x00000000); //  NaN
        if (a == 0.0) r = __hiloint2double (0xfff00000, 0x00000000); // -Inf
    }
    return r;
}

/* Compute the principal branch of the Lambert W function, W_0. Maximum error:
   positive half-plane: 1.48025 ulp @  2.4369698018556342e-4
   negative half-plane: 2.67268 ulp @ -3.5722892262707617e-1
*/
__device__ double lambert_w0 (double z) 
{
    const double qe1 = 2.7182818284590452 * 0.25; // exp(1)/4
    const double em1_hi =  3.6787944117144233e-1; // exp(-1)_hi
    const double em1_lo = -1.2428753672788363e-17;// exp(-1)_lo
    double e, r, t, w, y, num, den, rden, redz;
    int i;
    
    if (isnan (z) || (z == INFINITY) || (z == 0.0)) return z + z;
    if (fabs (z) < 1.9073486328125e-6) return fma (fma (1.5, z, -1.) * z, z, z);
    redz = (z + em1_hi) + em1_lo;
    if (redz < 0.01025390625) { // expansion at -(exp(-1))
        r = sqrt (redz);
        w =             12.2549150361339980;  //  0x1.88284393ee88ap+3
        w = fma (w, r, -15.0789952403151450); // -0x1.e2872106b62ecp+3
        w = fma (w, r,  11.8725001236137670); //  0x1.7beb856115aabp+3
        w = fma (w, r,  -8.3706388286143945); // -0x1.0bdc45f5f0d9bp+3
        w = fma (w, r,   5.8564285634808346); //  0x1.76cfb9bfe0ab2p+2
        w = fma (w, r,  -4.1752813239140965); // -0x1.0b37cf2873e15p+2
        w = fma (w, r,   3.0668577430022212); //  0x1.888ecb65d6e6ap+1
        w = fma (w, r,  -2.3535511874120569); // -0x1.2d412a51b2c8cp+1
        w = fma (w, r,   1.9366311143993924); //  0x1.efc70e84c2eccp+0
        w = fma (w, r,  -1.8121878856391300); // -0x1.cfeb8b9707071p+0
        w = fma (w, r,   2.3316439815971242); //  0x1.2a734f5b6ffbep+1
        w = fma (w, r,  -1.0000000000000000); // -0x1.0000000000000p+0
        return w;
    }
    /* Roberto Iacono and John Philip Boyd, "New approximations to the 
       principal real-valued branch of the Lambert W function", Advances 
       in Computational Mathematics, Vol. 43, No. 6, December 2017, 
       pp. 1403-1436
     */
    y = 2.0 * sqrt (fma (qe1, z, 0.25));
    y = my_log (fma (1.14956131, y, 1.) / fma (0.4549574, my_log (1. + y), 1.));
    w = fma (2.036, y, -1.0);

    /* Use iteration scheme w = (w / ( 1 + w)) * ( 1 + log (z / w) from
       Roberto Iacono and John Philip Boyd, "New approximations to the 
       principal real-valued branch of the Lambert W function", Advances
       in Computational Mathematics, Vol. 43, No. 6, December 2017, pp. 
       1403-1436
    */
    for (i = 0; i < 3; i++) {
        t = w / (1.0 + w);
        w = fma (my_log (z / w), t, t);
    }

    /* Fine tune approximation with a single Newton iteration */
    e = exp_scale (w, -1);
    num = fma (w, e, -0.5 *z);
    den = fma (w, e, e);
    rden = 1.0 / den;
    w = fma (-num, rden, w);

    return w;
}
1 Like