From d202b5c9613620550944741fc44a9b29299acbaf Mon Sep 17 00:00:00 2001
From: Sergey B Kirpichev <skirpichev@gmail.com>
Date: Wed, 8 Feb 2023 09:35:17 +0300
Subject: [PATCH 01/11] gh-101678: refactor the math module to use special
 functions from c11

---
 Modules/mathmodule.c | 165 +------------------------------------------
 1 file changed, 2 insertions(+), 163 deletions(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index 992957e675a7a3..8e5cdfb1f47231 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -101,10 +101,6 @@ get_math_module_state(PyObject *module)
 
 static const double pi = 3.141592653589793238462643383279502884197;
 static const double logpi = 1.144729885849400174143427351353058711647;
-#if !defined(HAVE_ERF) || !defined(HAVE_ERFC)
-static const double sqrtpi = 1.772453850905516027298167483341145182798;
-#endif /* !defined(HAVE_ERF) || !defined(HAVE_ERFC) */
-
 
 /* Version of PyFloat_AsDouble() with in-line fast paths
    for exact floats and integers.  Gives a substantial
@@ -458,163 +454,6 @@ m_lgamma(double x)
     return r;
 }
 
-#if !defined(HAVE_ERF) || !defined(HAVE_ERFC)
-
-/*
-   Implementations of the error function erf(x) and the complementary error
-   function erfc(x).
-
-   Method: we use a series approximation for erf for small x, and a continued
-   fraction approximation for erfc(x) for larger x;
-   combined with the relations erf(-x) = -erf(x) and erfc(x) = 1.0 - erf(x),
-   this gives us erf(x) and erfc(x) for all x.
-
-   The series expansion used is:
-
-      erf(x) = x*exp(-x*x)/sqrt(pi) * [
-                     2/1 + 4/3 x**2 + 8/15 x**4 + 16/105 x**6 + ...]
-
-   The coefficient of x**(2k-2) here is 4**k*factorial(k)/factorial(2*k).
-   This series converges well for smallish x, but slowly for larger x.
-
-   The continued fraction expansion used is:
-
-      erfc(x) = x*exp(-x*x)/sqrt(pi) * [1/(0.5 + x**2 -) 0.5/(2.5 + x**2 - )
-                              3.0/(4.5 + x**2 - ) 7.5/(6.5 + x**2 - ) ...]
-
-   after the first term, the general term has the form:
-
-      k*(k-0.5)/(2*k+0.5 + x**2 - ...).
-
-   This expansion converges fast for larger x, but convergence becomes
-   infinitely slow as x approaches 0.0.  The (somewhat naive) continued
-   fraction evaluation algorithm used below also risks overflow for large x;
-   but for large x, erfc(x) == 0.0 to within machine precision.  (For
-   example, erfc(30.0) is approximately 2.56e-393).
-
-   Parameters: use series expansion for abs(x) < ERF_SERIES_CUTOFF and
-   continued fraction expansion for ERF_SERIES_CUTOFF <= abs(x) <
-   ERFC_CONTFRAC_CUTOFF.  ERFC_SERIES_TERMS and ERFC_CONTFRAC_TERMS are the
-   numbers of terms to use for the relevant expansions.  */
-
-#define ERF_SERIES_CUTOFF 1.5
-#define ERF_SERIES_TERMS 25
-#define ERFC_CONTFRAC_CUTOFF 30.0
-#define ERFC_CONTFRAC_TERMS 50
-
-/*
-   Error function, via power series.
-
-   Given a finite float x, return an approximation to erf(x).
-   Converges reasonably fast for small x.
-*/
-
-static double
-m_erf_series(double x)
-{
-    double x2, acc, fk, result;
-    int i, saved_errno;
-
-    x2 = x * x;
-    acc = 0.0;
-    fk = (double)ERF_SERIES_TERMS + 0.5;
-    for (i = 0; i < ERF_SERIES_TERMS; i++) {
-        acc = 2.0 + x2 * acc / fk;
-        fk -= 1.0;
-    }
-    /* Make sure the exp call doesn't affect errno;
-       see m_erfc_contfrac for more. */
-    saved_errno = errno;
-    result = acc * x * exp(-x2) / sqrtpi;
-    errno = saved_errno;
-    return result;
-}
-
-/*
-   Complementary error function, via continued fraction expansion.
-
-   Given a positive float x, return an approximation to erfc(x).  Converges
-   reasonably fast for x large (say, x > 2.0), and should be safe from
-   overflow if x and nterms are not too large.  On an IEEE 754 machine, with x
-   <= 30.0, we're safe up to nterms = 100.  For x >= 30.0, erfc(x) is smaller
-   than the smallest representable nonzero float.  */
-
-static double
-m_erfc_contfrac(double x)
-{
-    double x2, a, da, p, p_last, q, q_last, b, result;
-    int i, saved_errno;
-
-    if (x >= ERFC_CONTFRAC_CUTOFF)
-        return 0.0;
-
-    x2 = x*x;
-    a = 0.0;
-    da = 0.5;
-    p = 1.0; p_last = 0.0;
-    q = da + x2; q_last = 1.0;
-    for (i = 0; i < ERFC_CONTFRAC_TERMS; i++) {
-        double temp;
-        a += da;
-        da += 2.0;
-        b = da + x2;
-        temp = p; p = b*p - a*p_last; p_last = temp;
-        temp = q; q = b*q - a*q_last; q_last = temp;
-    }
-    /* Issue #8986: On some platforms, exp sets errno on underflow to zero;
-       save the current errno value so that we can restore it later. */
-    saved_errno = errno;
-    result = p / q * x * exp(-x2) / sqrtpi;
-    errno = saved_errno;
-    return result;
-}
-
-#endif /* !defined(HAVE_ERF) || !defined(HAVE_ERFC) */
-
-/* Error function erf(x), for general x */
-
-static double
-m_erf(double x)
-{
-#ifdef HAVE_ERF
-    return erf(x);
-#else
-    double absx, cf;
-
-    if (Py_IS_NAN(x))
-        return x;
-    absx = fabs(x);
-    if (absx < ERF_SERIES_CUTOFF)
-        return m_erf_series(x);
-    else {
-        cf = m_erfc_contfrac(absx);
-        return x > 0.0 ? 1.0 - cf : cf - 1.0;
-    }
-#endif
-}
-
-/* Complementary error function erfc(x), for general x. */
-
-static double
-m_erfc(double x)
-{
-#ifdef HAVE_ERFC
-    return erfc(x);
-#else
-    double absx, cf;
-
-    if (Py_IS_NAN(x))
-        return x;
-    absx = fabs(x);
-    if (absx < ERF_SERIES_CUTOFF)
-        return 1.0 - m_erf_series(x);
-    else {
-        cf = m_erfc_contfrac(absx);
-        return x > 0.0 ? cf : 2.0 - cf;
-    }
-#endif
-}
-
 /*
    wrapper for atan2 that deals directly with special cases before
    delegating to the platform libm for the remaining cases.  This
@@ -1261,10 +1100,10 @@ FUNC1(cos, cos, 0,
 FUNC1(cosh, cosh, 1,
       "cosh($module, x, /)\n--\n\n"
       "Return the hyperbolic cosine of x.")
-FUNC1A(erf, m_erf,
+FUNC1A(erf, erf,
        "erf($module, x, /)\n--\n\n"
        "Error function at x.")
-FUNC1A(erfc, m_erfc,
+FUNC1A(erfc, erfc,
        "erfc($module, x, /)\n--\n\n"
        "Complementary error function at x.")
 FUNC1(exp, exp, 1,

From 7d14105b3e6847ac6f836584a8666eea54da504f Mon Sep 17 00:00:00 2001
From: Sergey B Kirpichev <skirpichev@gmail.com>
Date: Wed, 8 Feb 2023 10:33:16 +0300
Subject: [PATCH 02/11] Use libc's lgamma/tgamma instead of custom
 implementations

---
 Modules/mathmodule.c | 276 ++-----------------------------------------
 1 file changed, 12 insertions(+), 264 deletions(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index 8e5cdfb1f47231..84ef6663168058 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -92,16 +92,6 @@ get_math_module_state(PyObject *module)
     return (math_module_state *)state;
 }
 
-/*
-   sin(pi*x), giving accurate results for all finite x (especially x
-   integral or close to an integer).  This is here for use in the
-   reflection formula for the gamma function.  It conforms to IEEE
-   754-2008 for finite arguments, but not for infinities or nans.
-*/
-
-static const double pi = 3.141592653589793238462643383279502884197;
-static const double logpi = 1.144729885849400174143427351353058711647;
-
 /* Version of PyFloat_AsDouble() with in-line fast paths
    for exact floats and integers.  Gives a substantial
    speed improvement for extracting float arguments.
@@ -124,162 +114,6 @@ static const double logpi = 1.144729885849400174143427351353058711647;
         }                                                  \
     }
 
-static double
-m_sinpi(double x)
-{
-    double y, r;
-    int n;
-    /* this function should only ever be called for finite arguments */
-    assert(Py_IS_FINITE(x));
-    y = fmod(fabs(x), 2.0);
-    n = (int)round(2.0*y);
-    assert(0 <= n && n <= 4);
-    switch (n) {
-    case 0:
-        r = sin(pi*y);
-        break;
-    case 1:
-        r = cos(pi*(y-0.5));
-        break;
-    case 2:
-        /* N.B. -sin(pi*(y-1.0)) is *not* equivalent: it would give
-           -0.0 instead of 0.0 when y == 1.0. */
-        r = sin(pi*(1.0-y));
-        break;
-    case 3:
-        r = -cos(pi*(y-1.5));
-        break;
-    case 4:
-        r = sin(pi*(y-2.0));
-        break;
-    default:
-        Py_UNREACHABLE();
-    }
-    return copysign(1.0, x)*r;
-}
-
-/* Implementation of the real gamma function.  In extensive but non-exhaustive
-   random tests, this function proved accurate to within <= 10 ulps across the
-   entire float domain.  Note that accuracy may depend on the quality of the
-   system math functions, the pow function in particular.  Special cases
-   follow C99 annex F.  The parameters and method are tailored to platforms
-   whose double format is the IEEE 754 binary64 format.
-
-   Method: for x > 0.0 we use the Lanczos approximation with parameters N=13
-   and g=6.024680040776729583740234375; these parameters are amongst those
-   used by the Boost library.  Following Boost (again), we re-express the
-   Lanczos sum as a rational function, and compute it that way.  The
-   coefficients below were computed independently using MPFR, and have been
-   double-checked against the coefficients in the Boost source code.
-
-   For x < 0.0 we use the reflection formula.
-
-   There's one minor tweak that deserves explanation: Lanczos' formula for
-   Gamma(x) involves computing pow(x+g-0.5, x-0.5) / exp(x+g-0.5).  For many x
-   values, x+g-0.5 can be represented exactly.  However, in cases where it
-   can't be represented exactly the small error in x+g-0.5 can be magnified
-   significantly by the pow and exp calls, especially for large x.  A cheap
-   correction is to multiply by (1 + e*g/(x+g-0.5)), where e is the error
-   involved in the computation of x+g-0.5 (that is, e = computed value of
-   x+g-0.5 - exact value of x+g-0.5).  Here's the proof:
-
-   Correction factor
-   -----------------
-   Write x+g-0.5 = y-e, where y is exactly representable as an IEEE 754
-   double, and e is tiny.  Then:
-
-     pow(x+g-0.5,x-0.5)/exp(x+g-0.5) = pow(y-e, x-0.5)/exp(y-e)
-     = pow(y, x-0.5)/exp(y) * C,
-
-   where the correction_factor C is given by
-
-     C = pow(1-e/y, x-0.5) * exp(e)
-
-   Since e is tiny, pow(1-e/y, x-0.5) ~ 1-(x-0.5)*e/y, and exp(x) ~ 1+e, so:
-
-     C ~ (1-(x-0.5)*e/y) * (1+e) ~ 1 + e*(y-(x-0.5))/y
-
-   But y-(x-0.5) = g+e, and g+e ~ g.  So we get C ~ 1 + e*g/y, and
-
-     pow(x+g-0.5,x-0.5)/exp(x+g-0.5) ~ pow(y, x-0.5)/exp(y) * (1 + e*g/y),
-
-   Note that for accuracy, when computing r*C it's better to do
-
-     r + e*g/y*r;
-
-   than
-
-     r * (1 + e*g/y);
-
-   since the addition in the latter throws away most of the bits of
-   information in e*g/y.
-*/
-
-#define LANCZOS_N 13
-static const double lanczos_g = 6.024680040776729583740234375;
-static const double lanczos_g_minus_half = 5.524680040776729583740234375;
-static const double lanczos_num_coeffs[LANCZOS_N] = {
-    23531376880.410759688572007674451636754734846804940,
-    42919803642.649098768957899047001988850926355848959,
-    35711959237.355668049440185451547166705960488635843,
-    17921034426.037209699919755754458931112671403265390,
-    6039542586.3520280050642916443072979210699388420708,
-    1439720407.3117216736632230727949123939715485786772,
-    248874557.86205415651146038641322942321632125127801,
-    31426415.585400194380614231628318205362874684987640,
-    2876370.6289353724412254090516208496135991145378768,
-    186056.26539522349504029498971604569928220784236328,
-    8071.6720023658162106380029022722506138218516325024,
-    210.82427775157934587250973392071336271166969580291,
-    2.5066282746310002701649081771338373386264310793408
-};
-
-/* denominator is x*(x+1)*...*(x+LANCZOS_N-2) */
-static const double lanczos_den_coeffs[LANCZOS_N] = {
-    0.0, 39916800.0, 120543840.0, 150917976.0, 105258076.0, 45995730.0,
-    13339535.0, 2637558.0, 357423.0, 32670.0, 1925.0, 66.0, 1.0};
-
-/* gamma values for small positive integers, 1 though NGAMMA_INTEGRAL */
-#define NGAMMA_INTEGRAL 23
-static const double gamma_integral[NGAMMA_INTEGRAL] = {
-    1.0, 1.0, 2.0, 6.0, 24.0, 120.0, 720.0, 5040.0, 40320.0, 362880.0,
-    3628800.0, 39916800.0, 479001600.0, 6227020800.0, 87178291200.0,
-    1307674368000.0, 20922789888000.0, 355687428096000.0,
-    6402373705728000.0, 121645100408832000.0, 2432902008176640000.0,
-    51090942171709440000.0, 1124000727777607680000.0,
-};
-
-/* Lanczos' sum L_g(x), for positive x */
-
-static double
-lanczos_sum(double x)
-{
-    double num = 0.0, den = 0.0;
-    int i;
-    assert(x > 0.0);
-    /* evaluate the rational function lanczos_sum(x).  For large
-       x, the obvious algorithm risks overflow, so we instead
-       rescale the denominator and numerator of the rational
-       function by x**(1-LANCZOS_N) and treat this as a
-       rational function in 1/x.  This also reduces the error for
-       larger x values.  The choice of cutoff point (5.0 below) is
-       somewhat arbitrary; in tests, smaller cutoff values than
-       this resulted in lower accuracy. */
-    if (x < 5.0) {
-        for (i = LANCZOS_N; --i >= 0; ) {
-            num = num * x + lanczos_num_coeffs[i];
-            den = den * x + lanczos_den_coeffs[i];
-        }
-    }
-    else {
-        for (i = 0; i < LANCZOS_N; i++) {
-            num = num / x + lanczos_num_coeffs[i];
-            den = den / x + lanczos_den_coeffs[i];
-        }
-    }
-    return num/den;
-}
-
 /* Constant for +infinity, generated in the same way as float('inf'). */
 
 static double
@@ -309,113 +143,46 @@ m_nan(void)
 
 #endif
 
+/*
+   gamma: the real gamma function.
+ */
+
 static double
-m_tgamma(double x)
+m_gamma(double x)
 {
-    double absx, r, y, z, sqrtpow;
-
     /* special cases */
     if (!Py_IS_FINITE(x)) {
         if (Py_IS_NAN(x) || x > 0.0)
-            return x;  /* tgamma(nan) = nan, tgamma(inf) = inf */
+            return x;  /* gamma(nan) = nan, gamma(inf) = inf */
         else {
             errno = EDOM;
-            return Py_NAN;  /* tgamma(-inf) = nan, invalid */
+            return Py_NAN;  /* gamma(-inf) = nan, invalid */
         }
     }
     if (x == 0.0) {
         errno = EDOM;
-        /* tgamma(+-0.0) = +-inf, divide-by-zero */
+        /* gamma(+-0.0) = +-inf, divide-by-zero */
         return copysign(Py_HUGE_VAL, x);
     }
 
     /* integer arguments */
     if (x == floor(x)) {
         if (x < 0.0) {
-            errno = EDOM;  /* tgamma(n) = nan, invalid for */
+            errno = EDOM;  /* gamma(n) = nan, invalid for */
             return Py_NAN; /* negative integers n */
         }
-        if (x <= NGAMMA_INTEGRAL)
-            return gamma_integral[(int)x - 1];
-    }
-    absx = fabs(x);
-
-    /* tiny arguments:  tgamma(x) ~ 1/x for x near 0 */
-    if (absx < 1e-20) {
-        r = 1.0/x;
-        if (Py_IS_INFINITY(r))
-            errno = ERANGE;
-        return r;
-    }
-
-    /* large arguments: assuming IEEE 754 doubles, tgamma(x) overflows for
-       x > 200, and underflows to +-0.0 for x < -200, not a negative
-       integer. */
-    if (absx > 200.0) {
-        if (x < 0.0) {
-            return 0.0/m_sinpi(x);
-        }
-        else {
-            errno = ERANGE;
-            return Py_HUGE_VAL;
-        }
     }
 
-    y = absx + lanczos_g_minus_half;
-    /* compute error in sum */
-    if (absx > lanczos_g_minus_half) {
-        /* note: the correction can be foiled by an optimizing
-           compiler that (incorrectly) thinks that an expression like
-           a + b - a - b can be optimized to 0.0.  This shouldn't
-           happen in a standards-conforming compiler. */
-        double q = y - absx;
-        z = q - lanczos_g_minus_half;
-    }
-    else {
-        double q = y - lanczos_g_minus_half;
-        z = q - absx;
-    }
-    z = z * lanczos_g / y;
-    if (x < 0.0) {
-        r = -pi / m_sinpi(absx) / absx * exp(y) / lanczos_sum(absx);
-        r -= z * r;
-        if (absx < 140.0) {
-            r /= pow(y, absx - 0.5);
-        }
-        else {
-            sqrtpow = pow(y, absx / 2.0 - 0.25);
-            r /= sqrtpow;
-            r /= sqrtpow;
-        }
-    }
-    else {
-        r = lanczos_sum(absx) / exp(y);
-        r += z * r;
-        if (absx < 140.0) {
-            r *= pow(y, absx - 0.5);
-        }
-        else {
-            sqrtpow = pow(y, absx / 2.0 - 0.25);
-            r *= sqrtpow;
-            r *= sqrtpow;
-        }
-    }
-    if (Py_IS_INFINITY(r))
-        errno = ERANGE;
-    return r;
+    return tgamma(x);
 }
 
 /*
    lgamma:  natural log of the absolute value of the Gamma function.
-   For large arguments, Lanczos' formula works extremely well here.
 */
 
 static double
 m_lgamma(double x)
 {
-    double r;
-    double absx;
-
     /* special cases */
     if (!Py_IS_FINITE(x)) {
         if (Py_IS_NAN(x))
@@ -430,28 +197,9 @@ m_lgamma(double x)
             errno = EDOM;  /* lgamma(n) = inf, divide-by-zero for */
             return Py_HUGE_VAL; /* integers n <= 0 */
         }
-        else {
-            return 0.0; /* lgamma(1) = lgamma(2) = 0.0 */
-        }
     }
 
-    absx = fabs(x);
-    /* tiny arguments: lgamma(x) ~ -log(fabs(x)) for small x */
-    if (absx < 1e-20)
-        return -log(absx);
-
-    /* Lanczos' formula.  We could save a fraction of a ulp in accuracy by
-       having a second set of numerator coefficients for lanczos_sum that
-       absorbed the exp(-lanczos_g) term, and throwing out the lanczos_g
-       subtraction below; it's probably not worth it. */
-    r = log(lanczos_sum(absx)) - lanczos_g;
-    r += (absx - 0.5) * (log(absx + lanczos_g - 0.5) - 1);
-    if (x < 0.0)
-        /* Use reflection formula to get value for negative x. */
-        r = logpi - log(fabs(m_sinpi(absx))) - log(absx) - r;
-    if (Py_IS_INFINITY(r))
-        errno = ERANGE;
-    return r;
+    return lgamma(x);
 }
 
 /*
@@ -1159,7 +907,7 @@ math_floor(PyObject *module, PyObject *number)
     return PyLong_FromDouble(floor(x));
 }
 
-FUNC1A(gamma, m_tgamma,
+FUNC1A(gamma, m_gamma,
       "gamma($module, x, /)\n--\n\n"
       "Gamma function at x.")
 FUNC1A(lgamma, m_lgamma,

From 468096bc9a5bb6e5577a06fc760f76aa7983dd5d Mon Sep 17 00:00:00 2001
From: Sergey B Kirpichev <skirpichev@gmail.com>
Date: Wed, 8 Feb 2023 11:13:59 +0300
Subject: [PATCH 03/11] Revert "Use libc's lgamma/tgamma instead of custom
 implementations"

This reverts commit 7d14105b3e6847ac6f836584a8666eea54da504f.
---
 Modules/mathmodule.c | 276 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 264 insertions(+), 12 deletions(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index 84ef6663168058..8e5cdfb1f47231 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -92,6 +92,16 @@ get_math_module_state(PyObject *module)
     return (math_module_state *)state;
 }
 
+/*
+   sin(pi*x), giving accurate results for all finite x (especially x
+   integral or close to an integer).  This is here for use in the
+   reflection formula for the gamma function.  It conforms to IEEE
+   754-2008 for finite arguments, but not for infinities or nans.
+*/
+
+static const double pi = 3.141592653589793238462643383279502884197;
+static const double logpi = 1.144729885849400174143427351353058711647;
+
 /* Version of PyFloat_AsDouble() with in-line fast paths
    for exact floats and integers.  Gives a substantial
    speed improvement for extracting float arguments.
@@ -114,6 +124,162 @@ get_math_module_state(PyObject *module)
         }                                                  \
     }
 
+static double
+m_sinpi(double x)
+{
+    double y, r;
+    int n;
+    /* this function should only ever be called for finite arguments */
+    assert(Py_IS_FINITE(x));
+    y = fmod(fabs(x), 2.0);
+    n = (int)round(2.0*y);
+    assert(0 <= n && n <= 4);
+    switch (n) {
+    case 0:
+        r = sin(pi*y);
+        break;
+    case 1:
+        r = cos(pi*(y-0.5));
+        break;
+    case 2:
+        /* N.B. -sin(pi*(y-1.0)) is *not* equivalent: it would give
+           -0.0 instead of 0.0 when y == 1.0. */
+        r = sin(pi*(1.0-y));
+        break;
+    case 3:
+        r = -cos(pi*(y-1.5));
+        break;
+    case 4:
+        r = sin(pi*(y-2.0));
+        break;
+    default:
+        Py_UNREACHABLE();
+    }
+    return copysign(1.0, x)*r;
+}
+
+/* Implementation of the real gamma function.  In extensive but non-exhaustive
+   random tests, this function proved accurate to within <= 10 ulps across the
+   entire float domain.  Note that accuracy may depend on the quality of the
+   system math functions, the pow function in particular.  Special cases
+   follow C99 annex F.  The parameters and method are tailored to platforms
+   whose double format is the IEEE 754 binary64 format.
+
+   Method: for x > 0.0 we use the Lanczos approximation with parameters N=13
+   and g=6.024680040776729583740234375; these parameters are amongst those
+   used by the Boost library.  Following Boost (again), we re-express the
+   Lanczos sum as a rational function, and compute it that way.  The
+   coefficients below were computed independently using MPFR, and have been
+   double-checked against the coefficients in the Boost source code.
+
+   For x < 0.0 we use the reflection formula.
+
+   There's one minor tweak that deserves explanation: Lanczos' formula for
+   Gamma(x) involves computing pow(x+g-0.5, x-0.5) / exp(x+g-0.5).  For many x
+   values, x+g-0.5 can be represented exactly.  However, in cases where it
+   can't be represented exactly the small error in x+g-0.5 can be magnified
+   significantly by the pow and exp calls, especially for large x.  A cheap
+   correction is to multiply by (1 + e*g/(x+g-0.5)), where e is the error
+   involved in the computation of x+g-0.5 (that is, e = computed value of
+   x+g-0.5 - exact value of x+g-0.5).  Here's the proof:
+
+   Correction factor
+   -----------------
+   Write x+g-0.5 = y-e, where y is exactly representable as an IEEE 754
+   double, and e is tiny.  Then:
+
+     pow(x+g-0.5,x-0.5)/exp(x+g-0.5) = pow(y-e, x-0.5)/exp(y-e)
+     = pow(y, x-0.5)/exp(y) * C,
+
+   where the correction_factor C is given by
+
+     C = pow(1-e/y, x-0.5) * exp(e)
+
+   Since e is tiny, pow(1-e/y, x-0.5) ~ 1-(x-0.5)*e/y, and exp(x) ~ 1+e, so:
+
+     C ~ (1-(x-0.5)*e/y) * (1+e) ~ 1 + e*(y-(x-0.5))/y
+
+   But y-(x-0.5) = g+e, and g+e ~ g.  So we get C ~ 1 + e*g/y, and
+
+     pow(x+g-0.5,x-0.5)/exp(x+g-0.5) ~ pow(y, x-0.5)/exp(y) * (1 + e*g/y),
+
+   Note that for accuracy, when computing r*C it's better to do
+
+     r + e*g/y*r;
+
+   than
+
+     r * (1 + e*g/y);
+
+   since the addition in the latter throws away most of the bits of
+   information in e*g/y.
+*/
+
+#define LANCZOS_N 13
+static const double lanczos_g = 6.024680040776729583740234375;
+static const double lanczos_g_minus_half = 5.524680040776729583740234375;
+static const double lanczos_num_coeffs[LANCZOS_N] = {
+    23531376880.410759688572007674451636754734846804940,
+    42919803642.649098768957899047001988850926355848959,
+    35711959237.355668049440185451547166705960488635843,
+    17921034426.037209699919755754458931112671403265390,
+    6039542586.3520280050642916443072979210699388420708,
+    1439720407.3117216736632230727949123939715485786772,
+    248874557.86205415651146038641322942321632125127801,
+    31426415.585400194380614231628318205362874684987640,
+    2876370.6289353724412254090516208496135991145378768,
+    186056.26539522349504029498971604569928220784236328,
+    8071.6720023658162106380029022722506138218516325024,
+    210.82427775157934587250973392071336271166969580291,
+    2.5066282746310002701649081771338373386264310793408
+};
+
+/* denominator is x*(x+1)*...*(x+LANCZOS_N-2) */
+static const double lanczos_den_coeffs[LANCZOS_N] = {
+    0.0, 39916800.0, 120543840.0, 150917976.0, 105258076.0, 45995730.0,
+    13339535.0, 2637558.0, 357423.0, 32670.0, 1925.0, 66.0, 1.0};
+
+/* gamma values for small positive integers, 1 though NGAMMA_INTEGRAL */
+#define NGAMMA_INTEGRAL 23
+static const double gamma_integral[NGAMMA_INTEGRAL] = {
+    1.0, 1.0, 2.0, 6.0, 24.0, 120.0, 720.0, 5040.0, 40320.0, 362880.0,
+    3628800.0, 39916800.0, 479001600.0, 6227020800.0, 87178291200.0,
+    1307674368000.0, 20922789888000.0, 355687428096000.0,
+    6402373705728000.0, 121645100408832000.0, 2432902008176640000.0,
+    51090942171709440000.0, 1124000727777607680000.0,
+};
+
+/* Lanczos' sum L_g(x), for positive x */
+
+static double
+lanczos_sum(double x)
+{
+    double num = 0.0, den = 0.0;
+    int i;
+    assert(x > 0.0);
+    /* evaluate the rational function lanczos_sum(x).  For large
+       x, the obvious algorithm risks overflow, so we instead
+       rescale the denominator and numerator of the rational
+       function by x**(1-LANCZOS_N) and treat this as a
+       rational function in 1/x.  This also reduces the error for
+       larger x values.  The choice of cutoff point (5.0 below) is
+       somewhat arbitrary; in tests, smaller cutoff values than
+       this resulted in lower accuracy. */
+    if (x < 5.0) {
+        for (i = LANCZOS_N; --i >= 0; ) {
+            num = num * x + lanczos_num_coeffs[i];
+            den = den * x + lanczos_den_coeffs[i];
+        }
+    }
+    else {
+        for (i = 0; i < LANCZOS_N; i++) {
+            num = num / x + lanczos_num_coeffs[i];
+            den = den / x + lanczos_den_coeffs[i];
+        }
+    }
+    return num/den;
+}
+
 /* Constant for +infinity, generated in the same way as float('inf'). */
 
 static double
@@ -143,46 +309,113 @@ m_nan(void)
 
 #endif
 
-/*
-   gamma: the real gamma function.
- */
-
 static double
-m_gamma(double x)
+m_tgamma(double x)
 {
+    double absx, r, y, z, sqrtpow;
+
     /* special cases */
     if (!Py_IS_FINITE(x)) {
         if (Py_IS_NAN(x) || x > 0.0)
-            return x;  /* gamma(nan) = nan, gamma(inf) = inf */
+            return x;  /* tgamma(nan) = nan, tgamma(inf) = inf */
         else {
             errno = EDOM;
-            return Py_NAN;  /* gamma(-inf) = nan, invalid */
+            return Py_NAN;  /* tgamma(-inf) = nan, invalid */
         }
     }
     if (x == 0.0) {
         errno = EDOM;
-        /* gamma(+-0.0) = +-inf, divide-by-zero */
+        /* tgamma(+-0.0) = +-inf, divide-by-zero */
         return copysign(Py_HUGE_VAL, x);
     }
 
     /* integer arguments */
     if (x == floor(x)) {
         if (x < 0.0) {
-            errno = EDOM;  /* gamma(n) = nan, invalid for */
+            errno = EDOM;  /* tgamma(n) = nan, invalid for */
             return Py_NAN; /* negative integers n */
         }
+        if (x <= NGAMMA_INTEGRAL)
+            return gamma_integral[(int)x - 1];
+    }
+    absx = fabs(x);
+
+    /* tiny arguments:  tgamma(x) ~ 1/x for x near 0 */
+    if (absx < 1e-20) {
+        r = 1.0/x;
+        if (Py_IS_INFINITY(r))
+            errno = ERANGE;
+        return r;
+    }
+
+    /* large arguments: assuming IEEE 754 doubles, tgamma(x) overflows for
+       x > 200, and underflows to +-0.0 for x < -200, not a negative
+       integer. */
+    if (absx > 200.0) {
+        if (x < 0.0) {
+            return 0.0/m_sinpi(x);
+        }
+        else {
+            errno = ERANGE;
+            return Py_HUGE_VAL;
+        }
     }
 
-    return tgamma(x);
+    y = absx + lanczos_g_minus_half;
+    /* compute error in sum */
+    if (absx > lanczos_g_minus_half) {
+        /* note: the correction can be foiled by an optimizing
+           compiler that (incorrectly) thinks that an expression like
+           a + b - a - b can be optimized to 0.0.  This shouldn't
+           happen in a standards-conforming compiler. */
+        double q = y - absx;
+        z = q - lanczos_g_minus_half;
+    }
+    else {
+        double q = y - lanczos_g_minus_half;
+        z = q - absx;
+    }
+    z = z * lanczos_g / y;
+    if (x < 0.0) {
+        r = -pi / m_sinpi(absx) / absx * exp(y) / lanczos_sum(absx);
+        r -= z * r;
+        if (absx < 140.0) {
+            r /= pow(y, absx - 0.5);
+        }
+        else {
+            sqrtpow = pow(y, absx / 2.0 - 0.25);
+            r /= sqrtpow;
+            r /= sqrtpow;
+        }
+    }
+    else {
+        r = lanczos_sum(absx) / exp(y);
+        r += z * r;
+        if (absx < 140.0) {
+            r *= pow(y, absx - 0.5);
+        }
+        else {
+            sqrtpow = pow(y, absx / 2.0 - 0.25);
+            r *= sqrtpow;
+            r *= sqrtpow;
+        }
+    }
+    if (Py_IS_INFINITY(r))
+        errno = ERANGE;
+    return r;
 }
 
 /*
    lgamma:  natural log of the absolute value of the Gamma function.
+   For large arguments, Lanczos' formula works extremely well here.
 */
 
 static double
 m_lgamma(double x)
 {
+    double r;
+    double absx;
+
     /* special cases */
     if (!Py_IS_FINITE(x)) {
         if (Py_IS_NAN(x))
@@ -197,9 +430,28 @@ m_lgamma(double x)
             errno = EDOM;  /* lgamma(n) = inf, divide-by-zero for */
             return Py_HUGE_VAL; /* integers n <= 0 */
         }
+        else {
+            return 0.0; /* lgamma(1) = lgamma(2) = 0.0 */
+        }
     }
 
-    return lgamma(x);
+    absx = fabs(x);
+    /* tiny arguments: lgamma(x) ~ -log(fabs(x)) for small x */
+    if (absx < 1e-20)
+        return -log(absx);
+
+    /* Lanczos' formula.  We could save a fraction of a ulp in accuracy by
+       having a second set of numerator coefficients for lanczos_sum that
+       absorbed the exp(-lanczos_g) term, and throwing out the lanczos_g
+       subtraction below; it's probably not worth it. */
+    r = log(lanczos_sum(absx)) - lanczos_g;
+    r += (absx - 0.5) * (log(absx + lanczos_g - 0.5) - 1);
+    if (x < 0.0)
+        /* Use reflection formula to get value for negative x. */
+        r = logpi - log(fabs(m_sinpi(absx))) - log(absx) - r;
+    if (Py_IS_INFINITY(r))
+        errno = ERANGE;
+    return r;
 }
 
 /*
@@ -907,7 +1159,7 @@ math_floor(PyObject *module, PyObject *number)
     return PyLong_FromDouble(floor(x));
 }
 
-FUNC1A(gamma, m_gamma,
+FUNC1A(gamma, m_tgamma,
       "gamma($module, x, /)\n--\n\n"
       "Gamma function at x.")
 FUNC1A(lgamma, m_lgamma,

From 5b75ef88c3d6788ef76ab8ca895de964c80d02f1 Mon Sep 17 00:00:00 2001
From: Sergey B Kirpichev <skirpichev@gmail.com>
Date: Wed, 8 Feb 2023 12:30:12 +0300
Subject: [PATCH 04/11] Comment on custom implementations of tgamma/lgamma

---
 Modules/mathmodule.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index 8e5cdfb1f47231..05cd129ae4de03 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -158,7 +158,9 @@ m_sinpi(double x)
     return copysign(1.0, x)*r;
 }
 
-/* Implementation of the real gamma function.  In extensive but non-exhaustive
+/* Implementation of the real gamma function.  Kept here to workaround
+   issues (see e.g. #70309) with quality of libm's tgamma/lgamma implementations
+   on various platforms (Windows, MacOS).  In extensive but non-exhaustive
    random tests, this function proved accurate to within <= 10 ulps across the
    entire float domain.  Note that accuracy may depend on the quality of the
    system math functions, the pow function in particular.  Special cases

From 538810dd92b95e8973b0c621a957741fa7d73d2f Mon Sep 17 00:00:00 2001
From: Sergey B Kirpichev <skirpichev@gmail.com>
Date: Wed, 8 Feb 2023 12:34:05 +0300
Subject: [PATCH 05/11] UPD commit msg: gh-101678: refactor the math module to
 use erf/erfc/log2 from c11

---
 Modules/mathmodule.c | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index 05cd129ae4de03..7147c744fa941c 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -641,27 +641,8 @@ m_log2(double x)
         }
     }
 
-    if (x > 0.0) {
-#ifdef HAVE_LOG2
+    if (x > 0.0)
         return log2(x);
-#else
-        double m;
-        int e;
-        m = frexp(x, &e);
-        /* We want log2(m * 2**e) == log(m) / log(2) + e.  Care is needed when
-         * x is just greater than 1.0: in that case e is 1, log(m) is negative,
-         * and we get significant cancellation error from the addition of
-         * log(m) / log(2) to e.  The slight rewrite of the expression below
-         * avoids this problem.
-         */
-        if (x >= 1.0) {
-            return log(2.0 * m) / log(2.0) + (e - 1);
-        }
-        else {
-            return log(m) / log(2.0) + e;
-        }
-#endif
-    }
     else if (x == 0.0) {
         errno = EDOM;
         return -Py_HUGE_VAL; /* log2(0) = -inf, divide-by-zero */

From fa76552df13078c43c43fa9bf70b971efa465698 Mon Sep 17 00:00:00 2001
From: Sergey B Kirpichev <skirpichev@gmail.com>
Date: Thu, 9 Feb 2023 03:07:42 +0300
Subject: [PATCH 06/11] Grammar fix

Co-authored-by: Mark Dickinson <dickinsm@gmail.com>
---
 Modules/mathmodule.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index 7147c744fa941c..eb38af4b1ec3a3 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -158,7 +158,7 @@ m_sinpi(double x)
     return copysign(1.0, x)*r;
 }
 
-/* Implementation of the real gamma function.  Kept here to workaround
+/* Implementation of the real gamma function.  Kept here to work around
    issues (see e.g. #70309) with quality of libm's tgamma/lgamma implementations
    on various platforms (Windows, MacOS).  In extensive but non-exhaustive
    random tests, this function proved accurate to within <= 10 ulps across the

From 42829cb79fd0ee7840a498397f1e6e730a222614 Mon Sep 17 00:00:00 2001
From: Sergey B Kirpichev <skirpichev@gmail.com>
Date: Thu, 9 Feb 2023 03:19:35 +0300
Subject: [PATCH 07/11] Restore braces in m_log2 per PEP7

---
 Modules/mathmodule.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index eb38af4b1ec3a3..805864ecf80d5b 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -641,8 +641,9 @@ m_log2(double x)
         }
     }
 
-    if (x > 0.0)
+    if (x > 0.0) {
         return log2(x);
+    }
     else if (x == 0.0) {
         errno = EDOM;
         return -Py_HUGE_VAL; /* log2(0) = -inf, divide-by-zero */

From 47970e268331c486e23fd7d25174292d3a9a2038 Mon Sep 17 00:00:00 2001
From: Sergey B Kirpichev <skirpichev@gmail.com>
Date: Thu, 9 Feb 2023 05:00:12 +0300
Subject: [PATCH 08/11] Merge math_1_to_whatever() and math_1()

---
 Modules/mathmodule.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index 805864ecf80d5b..8ae109b27a41cd 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -875,9 +875,7 @@ is_error(double x)
 */
 
 static PyObject *
-math_1_to_whatever(PyObject *arg, double (*func) (double),
-                   PyObject *(*from_double_func) (double),
-                   int can_overflow)
+math_1(PyObject *arg, double (*func) (double), int can_overflow)
 {
     double x, r;
     x = PyFloat_AsDouble(arg);
@@ -903,7 +901,7 @@ math_1_to_whatever(PyObject *arg, double (*func) (double),
         /* this branch unnecessary on most platforms */
         return NULL;
 
-    return (*from_double_func)(r);
+    return PyFloat_FromDouble(r);
 }
 
 /* variant of math_1, to be used when the function being wrapped is known to
@@ -951,12 +949,6 @@ math_1a(PyObject *arg, double (*func) (double))
    OverflowError.
 */
 
-static PyObject *
-math_1(PyObject *arg, double (*func) (double), int can_overflow)
-{
-    return math_1_to_whatever(arg, func, PyFloat_FromDouble, can_overflow);
-}
-
 static PyObject *
 math_2(PyObject *const *args, Py_ssize_t nargs,
        double (*func) (double, double), const char *funcname)

From bebbc79adaa543113ec4016633984085ce81c1a9 Mon Sep 17 00:00:00 2001
From: Sergey B Kirpichev <skirpichev@gmail.com>
Date: Thu, 9 Feb 2023 05:22:14 +0300
Subject: [PATCH 09/11] Provide context for log1p workaround
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

BTW, C11 Annex F says: log1p(±0) returns ±0
---
 Modules/_math.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Modules/_math.h b/Modules/_math.h
index 4a6bc223ef5fb5..64cca138a89abe 100644
--- a/Modules/_math.h
+++ b/Modules/_math.h
@@ -7,8 +7,9 @@
 static double
 _Py_log1p(double x)
 {
-    /* Some platforms supply a log1p function but don't respect the sign of
-       zero:  log1p(-0.0) gives 0.0 instead of the correct result of -0.0.
+    /* Some platforms (e.g. MacOS X 10.8, see #59682) supply a log1p function
+       but don't respect the sign of zero:  log1p(-0.0) gives 0.0 instead of
+       the correct result of -0.0.
 
        To save fiddling with configure tests and platform checks, we handle the
        special case of zero input directly on all platforms.

From ee5e68929b9f94c4e5e5d7b0da43810b95d9a203 Mon Sep 17 00:00:00 2001
From: Sergey B Kirpichev <skirpichev@gmail.com>
Date: Thu, 9 Feb 2023 11:06:10 +0300
Subject: [PATCH 10/11] Revert "Merge math_1_to_whatever() and math_1()"

This reverts commit 47970e268331c486e23fd7d25174292d3a9a2038.
---
 Modules/mathmodule.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index 8ae109b27a41cd..805864ecf80d5b 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -875,7 +875,9 @@ is_error(double x)
 */
 
 static PyObject *
-math_1(PyObject *arg, double (*func) (double), int can_overflow)
+math_1_to_whatever(PyObject *arg, double (*func) (double),
+                   PyObject *(*from_double_func) (double),
+                   int can_overflow)
 {
     double x, r;
     x = PyFloat_AsDouble(arg);
@@ -901,7 +903,7 @@ math_1(PyObject *arg, double (*func) (double), int can_overflow)
         /* this branch unnecessary on most platforms */
         return NULL;
 
-    return PyFloat_FromDouble(r);
+    return (*from_double_func)(r);
 }
 
 /* variant of math_1, to be used when the function being wrapped is known to
@@ -949,6 +951,12 @@ math_1a(PyObject *arg, double (*func) (double))
    OverflowError.
 */
 
+static PyObject *
+math_1(PyObject *arg, double (*func) (double), int can_overflow)
+{
+    return math_1_to_whatever(arg, func, PyFloat_FromDouble, can_overflow);
+}
+
 static PyObject *
 math_2(PyObject *const *args, Py_ssize_t nargs,
        double (*func) (double, double), const char *funcname)

From 06e9c8c56fb92448a906ca5375e8c9424de4554f Mon Sep 17 00:00:00 2001
From: Sergey B Kirpichev <skirpichev@gmail.com>
Date: Thu, 9 Feb 2023 11:07:11 +0300
Subject: [PATCH 11/11] Fix issues prefixes

---
 Modules/_math.h      | 2 +-
 Modules/mathmodule.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Modules/_math.h b/Modules/_math.h
index 64cca138a89abe..2285b64747c0bd 100644
--- a/Modules/_math.h
+++ b/Modules/_math.h
@@ -7,7 +7,7 @@
 static double
 _Py_log1p(double x)
 {
-    /* Some platforms (e.g. MacOS X 10.8, see #59682) supply a log1p function
+    /* Some platforms (e.g. MacOS X 10.8, see gh-59682) supply a log1p function
        but don't respect the sign of zero:  log1p(-0.0) gives 0.0 instead of
        the correct result of -0.0.
 
diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c
index 805864ecf80d5b..939954c95d9ff2 100644
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -159,7 +159,7 @@ m_sinpi(double x)
 }
 
 /* Implementation of the real gamma function.  Kept here to work around
-   issues (see e.g. #70309) with quality of libm's tgamma/lgamma implementations
+   issues (see e.g. gh-70309) with quality of libm's tgamma/lgamma implementations
    on various platforms (Windows, MacOS).  In extensive but non-exhaustive
    random tests, this function proved accurate to within <= 10 ulps across the
    entire float domain.  Note that accuracy may depend on the quality of the