From 3f2fd8d53da13a0278a7540edc6ffa68baab74be Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 11:12:22 -0500 Subject: [PATCH 01/11] Use sumprod() to speed-up and improve accuracy of correlation() --- Lib/statistics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 7d5d750193a5ab..69dc33d931b4ff 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -134,7 +134,7 @@ from fractions import Fraction from decimal import Decimal -from itertools import count, groupby, repeat +from itertools import count, groupby, repeat, tee from bisect import bisect_left, bisect_right from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod from functools import reduce @@ -1076,9 +1076,9 @@ def correlation(x, y, /, *, method='linear'): y = _rank(y, start=start) xbar = fsum(x) / n ybar = fsum(y) / n - sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y)) - sxx = fsum((d := xi - xbar) * d for xi in x) - syy = fsum((d := yi - ybar) * d for yi in y) + sxy = sumprod((xi - xbar for xi in x), (yi - ybar for yi in y)) + sxx = sumprod(*tee(xi - xbar for xi in x)) + syy = sumprod(*tee(yi - ybar for yi in y)) try: return sxy / sqrt(sxx * syy) except ZeroDivisionError: From 3241045212b341ca3396eff48b5db1ddce75efed Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 11:30:00 -0500 Subject: [PATCH 02/11] Only compute differences once --- Lib/statistics.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 69dc33d931b4ff..8340829c7fc8f5 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -134,7 +134,7 @@ from fractions import Fraction from decimal import Decimal -from itertools import count, groupby, repeat, tee +from itertools import count, groupby, repeat from bisect import bisect_left, bisect_right from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod from functools import reduce @@ -1076,9 +1076,11 @@ def correlation(x, y, /, *, method='linear'): y = _rank(y, start=start) xbar = fsum(x) / n ybar = fsum(y) / n - sxy = sumprod((xi - xbar for xi in x), (yi - ybar for yi in y)) - sxx = sumprod(*tee(xi - xbar for xi in x)) - syy = sumprod(*tee(yi - ybar for yi in y)) + centered_x = [xi - xbar for xi in x] + centered_y = [yi - ybar for yi in y] + sxy = sumprod(centered_x, centered_y) + sxx = sumprod(centered_x, centered_x) + syy = sumprod(centered_y, centered_y) try: return sxy / sqrt(sxx * syy) except ZeroDivisionError: From 52c75847b952c91c948435969fc3dad79567e55a Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 11:40:36 -0500 Subject: [PATCH 03/11] Use sumprod() in covariance(). --- Lib/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 8340829c7fc8f5..bf182f0234cb93 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1036,7 +1036,7 @@ def covariance(x, y, /): raise StatisticsError('covariance requires at least two data points') xbar = fsum(x) / n ybar = fsum(y) / n - sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y)) + sxy = sumprod((xi - xbar for xi in x), (yi - ybar for yi in y)) return sxy / (n - 1) From 16b2745301a6212dd9eb0cce646255fc35bccb9d Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 12:02:41 -0500 Subject: [PATCH 04/11] Use sumprod() in linear_regression(). --- Lib/statistics.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index bf182f0234cb93..f153cbd4453676 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1134,13 +1134,15 @@ def linear_regression(x, y, /, *, proportional=False): if n < 2: raise StatisticsError('linear regression requires at least two data points') if proportional: - sxy = fsum(xi * yi for xi, yi in zip(x, y)) - sxx = fsum(xi * xi for xi in x) + sxy = sumprod(x, y) + sxx = sumprod(x, x) else: xbar = fsum(x) / n ybar = fsum(y) / n - sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y)) - sxx = fsum((d := xi - xbar) * d for xi in x) + centered_x = [xi - xbar for xi in x] + centered_y = (yi - ybar for yi in y) + sxy = sumprod(centered_x, centered_y) + sxx = sumprod(centered_x, centered_x) try: slope = sxy / sxx # equivalent to: covariance(x, y) / variance(x) except ZeroDivisionError: From b9eeee536d939d0b6c2165fb0ac92ce1a412a718 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 12:18:15 -0500 Subject: [PATCH 05/11] Can skip centering if xbar or ybar is already zero --- Lib/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index f153cbd4453676..29746e3d8db9c3 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1076,8 +1076,8 @@ def correlation(x, y, /, *, method='linear'): y = _rank(y, start=start) xbar = fsum(x) / n ybar = fsum(y) / n - centered_x = [xi - xbar for xi in x] - centered_y = [yi - ybar for yi in y] + centered_x = [xi - xbar for xi in x] if xbar else x + centered_y = [yi - ybar for yi in y] if ybar else y sxy = sumprod(centered_x, centered_y) sxx = sumprod(centered_x, centered_x) syy = sumprod(centered_y, centered_y) From c51e520b7c8f9f8e14ca6b2537de46f8e42cada2 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 12:22:04 -0500 Subject: [PATCH 06/11] Factor-out common logic in linear_regression --- Lib/statistics.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 29746e3d8db9c3..cfd0819b4e5f51 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1074,13 +1074,14 @@ def correlation(x, y, /, *, method='linear'): start = (n - 1) / -2 # Center rankings around zero x = _rank(x, start=start) y = _rank(y, start=start) - xbar = fsum(x) / n - ybar = fsum(y) / n - centered_x = [xi - xbar for xi in x] if xbar else x - centered_y = [yi - ybar for yi in y] if ybar else y - sxy = sumprod(centered_x, centered_y) - sxx = sumprod(centered_x, centered_x) - syy = sumprod(centered_y, centered_y) + else: + xbar = fsum(x) / n + ybar = fsum(y) / n + x = [xi - xbar for xi in x] + y = [yi - ybar for yi in y] + sxy = sumprod(x, y) + sxx = sumprod(x, x) + syy = sumprod(y, y) try: return sxy / sqrt(sxx * syy) except ZeroDivisionError: From 39be9f8ec8f576ddb61eb4b5fa5b7366edab5443 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 12:25:22 -0500 Subject: [PATCH 07/11] Factor-out common logic in linear_regression --- Lib/statistics.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index cfd0819b4e5f51..2571d5dd02cf3b 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1134,16 +1134,13 @@ def linear_regression(x, y, /, *, proportional=False): raise StatisticsError('linear regression requires that both inputs have same number of data points') if n < 2: raise StatisticsError('linear regression requires at least two data points') - if proportional: - sxy = sumprod(x, y) - sxx = sumprod(x, x) - else: + if not proportional: xbar = fsum(x) / n ybar = fsum(y) / n - centered_x = [xi - xbar for xi in x] - centered_y = (yi - ybar for yi in y) - sxy = sumprod(centered_x, centered_y) - sxx = sumprod(centered_x, centered_x) + x = [xi - xbar for xi in x] + y = [yi - ybar for yi in y] + sxy = sumprod(x, y) + sxx = sumprod(x, x) try: slope = sxy / sxx # equivalent to: covariance(x, y) / variance(x) except ZeroDivisionError: From a4486157c189a61d67cec6a25463bb60cb8f26d8 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 12:28:23 -0500 Subject: [PATCH 08/11] Use generator for y array in linear_regression() --- Lib/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 2571d5dd02cf3b..9cb76f5562387b 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1137,8 +1137,8 @@ def linear_regression(x, y, /, *, proportional=False): if not proportional: xbar = fsum(x) / n ybar = fsum(y) / n - x = [xi - xbar for xi in x] - y = [yi - ybar for yi in y] + x = [xi - xbar for xi in x] # List because used three times below + y = (yi - ybar for yi in y) # Generator because only used once below sxy = sumprod(x, y) sxx = sumprod(x, x) try: From c9a4e4d39acb2dd607b3ddffa3d539f772f5ab86 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 15:48:28 -0500 Subject: [PATCH 09/11] Coerce result to a float --- Lib/statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 9cb76f5562387b..6bd214bbfe2ff5 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -1139,7 +1139,7 @@ def linear_regression(x, y, /, *, proportional=False): ybar = fsum(y) / n x = [xi - xbar for xi in x] # List because used three times below y = (yi - ybar for yi in y) # Generator because only used once below - sxy = sumprod(x, y) + sxy = sumprod(x, y) + 0.0 # Add zero to coerce result to a float sxx = sumprod(x, x) try: slope = sxy / sxx # equivalent to: covariance(x, y) / variance(x) From 5fe1be5898a0e649be8bb4c8127c05ef5d642b30 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 17:42:18 -0500 Subject: [PATCH 10/11] Add test for the result type being a float. --- Lib/test/test_statistics.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 31a3cb6b53a6f2..f0fa6454b1f91a 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -1,4 +1,4 @@ -"""Test suite for statistics module, including helper NumericTestCase and +x = """Test suite for statistics module, including helper NumericTestCase and approx_equal function. """ @@ -2610,6 +2610,16 @@ def test_proportional(self): self.assertAlmostEqual(slope, 20 + 1/150) self.assertEqual(intercept, 0.0) + def test_float_output(self): + x = [Fraction(2, 3), Fraction(3, 4)] + y = [Fraction(4, 5), Fraction(5, 6)] + slope, intercept = statistics.linear_regression(x, y) + self.assertTrue(isinstance(slope, float)) + self.assertTrue(isinstance(intercept, float)) + slope, intercept = statistics.linear_regression(x, y, proportional=True) + self.assertTrue(isinstance(slope, float)) + self.assertTrue(isinstance(intercept, float)) + class TestNormalDist: # General note on precision: The pdf(), cdf(), and overlap() methods From 45ffe7e100011b92c0fb073cb4ca6b71f9b52de9 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Mon, 13 Mar 2023 18:27:20 -0500 Subject: [PATCH 11/11] Add blurb --- .../next/Library/2023-03-13-18-27-00.gh-issue-102670.GyoThv.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2023-03-13-18-27-00.gh-issue-102670.GyoThv.rst diff --git a/Misc/NEWS.d/next/Library/2023-03-13-18-27-00.gh-issue-102670.GyoThv.rst b/Misc/NEWS.d/next/Library/2023-03-13-18-27-00.gh-issue-102670.GyoThv.rst new file mode 100644 index 00000000000000..3de09f86754f3e --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-03-13-18-27-00.gh-issue-102670.GyoThv.rst @@ -0,0 +1,2 @@ +Optimized fmean(), correlation(), covariance(), and linear_regression() +using the new math.sumprod() function.