pola-rs · ritchie46 · Dec 7, 2024 · Dec 6, 2024 · Dec 6, 2024 · Dec 6, 2024
@@ -192,8 +192,7 @@ impl PearsonState {
         self.mean_y = new_mean_y;
     }
 
-    pub fn finalize(&self, _ddof: u8) -> f64 {
-        // The division by sample_weight - ddof on both sides cancels out.
+    pub fn finalize(&self) -> f64 {
         let denom = (self.dp_xx * self.dp_yy).sqrt();
         if denom == 0.0 {
             f64::NAN

@@ -15,7 +15,7 @@ fn test_pearson_corr() -> PolarsResult<()> {
         .lazy()
         .group_by_stable([col("uid")])
         // a double aggregation expression.
-        .agg([pearson_corr(col("day"), col("cumcases"), 1).alias("pearson_corr")])
+        .agg([pearson_corr(col("day"), col("cumcases")).alias("pearson_corr")])
         .collect()?;
     let s = out.column("pearson_corr")?.f64()?;
     assert!((s.get(0).unwrap() - 0.997176).abs() < 0.000001);
@@ -25,7 +25,7 @@ fn test_pearson_corr() -> PolarsResult<()> {
         .lazy()
         .group_by_stable([col("uid")])
         // a double aggregation expression.
-        .agg([pearson_corr(col("day"), col("cumcases"), 1)
+        .agg([pearson_corr(col("day"), col("cumcases"))
             .pow(2.0)
             .alias("pearson_corr")])
         .collect()

@@ -19,7 +19,7 @@ where
 }
 
 /// Compute the pearson correlation between two columns.
-pub fn pearson_corr<T>(a: &ChunkedArray<T>, b: &ChunkedArray<T>, ddof: u8) -> Option<f64>
+pub fn pearson_corr<T>(a: &ChunkedArray<T>, b: &ChunkedArray<T>) -> Option<f64>
 where
     T: PolarsNumericType,
     T::Native: AsPrimitive<f64>,
@@ -30,5 +30,5 @@ where
     for (a, b) in a.downcast_iter().zip(b.downcast_iter()) {
         out.combine(&polars_compute::var_cov::pearson_corr(a, b))
     }
-    Some(out.finalize(ddof))
+    Some(out.finalize())
 }
@@ -9,7 +9,7 @@ pub enum CorrelationMethod {
     Pearson,
     #[cfg(all(feature = "rank", feature = "propagate_nans"))]
     SpearmanRank(bool),
-    Covariance,
+    Covariance(u8),
 }
 
 impl Display for CorrelationMethod {
@@ -19,20 +19,18 @@ impl Display for CorrelationMethod {
             Pearson => "pearson",
             #[cfg(all(feature = "rank", feature = "propagate_nans"))]
             SpearmanRank(_) => "spearman_rank",
-            Covariance => return write!(f, "covariance"),
+            Covariance(_) => return write!(f, "covariance"),
         };
         write!(f, "{}_correlation", s)
     }
 }
 
-pub(super) fn corr(s: &[Column], ddof: u8, method: CorrelationMethod) -> PolarsResult<Column> {
+pub(super) fn corr(s: &[Column], method: CorrelationMethod) -> PolarsResult<Column> {
     match method {
-        CorrelationMethod::Pearson => pearson_corr(s, ddof),
+        CorrelationMethod::Pearson => pearson_corr(s),
         #[cfg(all(feature = "rank", feature = "propagate_nans"))]
-        CorrelationMethod::SpearmanRank(propagate_nans) => {
-            spearman_rank_corr(s, ddof, propagate_nans)
-        },
-        CorrelationMethod::Covariance => covariance(s, ddof),
+        CorrelationMethod::SpearmanRank(propagate_nans) => spearman_rank_corr(s, propagate_nans),
+        CorrelationMethod::Covariance(ddof) => covariance(s, ddof),
     }
 }
 
@@ -61,32 +59,32 @@ fn covariance(s: &[Column], ddof: u8) -> PolarsResult<Column> {
     Ok(Column::new(name, &[ret]))
 }
 
-fn pearson_corr(s: &[Column], ddof: u8) -> PolarsResult<Column> {
+fn pearson_corr(s: &[Column]) -> PolarsResult<Column> {
     let a = &s[0];
     let b = &s[1];
     let name = PlSmallStr::from_static("pearson_corr");
 
     use polars_ops::chunked_array::cov::pearson_corr;
     let ret = match a.dtype() {
         DataType::Float32 => {
-            let ret = pearson_corr(a.f32().unwrap(), b.f32().unwrap(), ddof).map(|v| v as f32);
+            let ret = pearson_corr(a.f32().unwrap(), b.f32().unwrap()).map(|v| v as f32);
             return Ok(Column::new(name.clone(), &[ret]));
         },
-        DataType::Float64 => pearson_corr(a.f64().unwrap(), b.f64().unwrap(), ddof),
-        DataType::Int32 => pearson_corr(a.i32().unwrap(), b.i32().unwrap(), ddof),
-        DataType::Int64 => pearson_corr(a.i64().unwrap(), b.i64().unwrap(), ddof),
-        DataType::UInt32 => pearson_corr(a.u32().unwrap(), b.u32().unwrap(), ddof),
+        DataType::Float64 => pearson_corr(a.f64().unwrap(), b.f64().unwrap()),
+        DataType::Int32 => pearson_corr(a.i32().unwrap(), b.i32().unwrap()),
+        DataType::Int64 => pearson_corr(a.i64().unwrap(), b.i64().unwrap()),
+        DataType::UInt32 => pearson_corr(a.u32().unwrap(), b.u32().unwrap()),
         _ => {
             let a = a.cast(&DataType::Float64)?;
             let b = b.cast(&DataType::Float64)?;
-            pearson_corr(a.f64().unwrap(), b.f64().unwrap(), ddof)
+            pearson_corr(a.f64().unwrap(), b.f64().unwrap())
         },
     };
     Ok(Column::new(name, &[ret]))
 }
 
 #[cfg(all(feature = "rank", feature = "propagate_nans"))]
-fn spearman_rank_corr(s: &[Column], ddof: u8, propagate_nans: bool) -> PolarsResult<Column> {
+fn spearman_rank_corr(s: &[Column], propagate_nans: bool) -> PolarsResult<Column> {
     use polars_core::utils::coalesce_nulls_columns;
     use polars_ops::chunked_array::nan_propagating_aggregate::nan_max_s;
     let a = &s[0];
@@ -134,5 +132,5 @@ fn spearman_rank_corr(s: &[Column], ddof: u8, propagate_nans: bool) -> PolarsRes
         )
         .into();
 
-    pearson_corr(&[a_rank, b_rank], ddof)
+    pearson_corr(&[a_rank, b_rank])
 }
@@ -282,7 +282,6 @@ pub enum FunctionExpr {
     #[cfg(feature = "cov")]
     Correlation {
         method: correlation::CorrelationMethod,
-        ddof: u8,
     },
     #[cfg(feature = "peaks")]
     PeakMin,
@@ -1092,7 +1091,7 @@ impl From<FunctionExpr> for SpecialEq<Arc<dyn ColumnsUdf>> {
             Fused(op) => map_as_slice!(fused::fused, op),
             ConcatExpr(rechunk) => map_as_slice!(concat::concat_expr, rechunk),
             #[cfg(feature = "cov")]
-            Correlation { method, ddof } => map_as_slice!(correlation::corr, ddof, method),
+            Correlation { method } => map_as_slice!(correlation::corr, method),
             #[cfg(feature = "peaks")]
             PeakMin => map!(peaks::peak_min),
             #[cfg(feature = "peaks")]

@@ -4,8 +4,7 @@ use super::*;
 pub fn cov(a: Expr, b: Expr, ddof: u8) -> Expr {
     let input = vec![a, b];
     let function = FunctionExpr::Correlation {
-        method: CorrelationMethod::Covariance,
-        ddof,
+        method: CorrelationMethod::Covariance(ddof),
     };
     Expr::Function {
         input,
@@ -20,15 +19,10 @@ pub fn cov(a: Expr, b: Expr, ddof: u8) -> Expr {
 }
 
 /// Compute the pearson correlation between two columns.
-///
-/// # Arguments
-/// * ddof
-///     Delta degrees of freedom
-pub fn pearson_corr(a: Expr, b: Expr, ddof: u8) -> Expr {
+pub fn pearson_corr(a: Expr, b: Expr) -> Expr {
     let input = vec![a, b];
     let function = FunctionExpr::Correlation {
         method: CorrelationMethod::Pearson,
-        ddof,
     };
     Expr::Function {
         input,
@@ -45,18 +39,15 @@ pub fn pearson_corr(a: Expr, b: Expr, ddof: u8) -> Expr {
 /// Compute the spearman rank correlation between two columns.
 /// Missing data will be excluded from the computation.
 /// # Arguments
-/// * ddof
-///     Delta degrees of freedom
 /// * propagate_nans
 ///     If `true` any `NaN` encountered will lead to `NaN` in the output.
 ///     If to `false` then `NaN` are regarded as larger than any finite number
 ///     and thus lead to the highest rank.
 #[cfg(all(feature = "rank", feature = "propagate_nans"))]
-pub fn spearman_rank_corr(a: Expr, b: Expr, ddof: u8, propagate_nans: bool) -> Expr {
+pub fn spearman_rank_corr(a: Expr, b: Expr, propagate_nans: bool) -> Expr {
     let input = vec![a, b];
     let function = FunctionExpr::Correlation {
         method: CorrelationMethod::SpearmanRank(propagate_nans),
-        ddof,
     };
     Expr::Function {
         input,

@@ -504,8 +504,8 @@ pub fn map_mul(
 }
 
 #[pyfunction]
-pub fn pearson_corr(a: PyExpr, b: PyExpr, ddof: u8) -> PyExpr {
-    dsl::pearson_corr(a.inner, b.inner, ddof).into()
+pub fn pearson_corr(a: PyExpr, b: PyExpr) -> PyExpr {
+    dsl::pearson_corr(a.inner, b.inner).into()
 }
 
 #[pyfunction]
@@ -537,10 +537,10 @@ pub fn repeat(value: PyExpr, n: PyExpr, dtype: Option<Wrap<DataType>>) -> PyResu
 }
 
 #[pyfunction]
-pub fn spearman_rank_corr(a: PyExpr, b: PyExpr, ddof: u8, propagate_nans: bool) -> PyExpr {
+pub fn spearman_rank_corr(a: PyExpr, b: PyExpr, propagate_nans: bool) -> PyExpr {
     #[cfg(feature = "propagate_nans")]
     {
-        dsl::spearman_rank_corr(a.inner, b.inner, ddof, propagate_nans).into()
+        dsl::spearman_rank_corr(a.inner, b.inner, propagate_nans).into()
     }
     #[cfg(not(feature = "propagate_nans"))]
     {

@@ -781,7 +781,7 @@ def corr(
     b: IntoExpr,
     *,
     method: CorrelationMethod = "pearson",
-    ddof: int = 1,
+    ddof: int | None = None,
     propagate_nans: bool = False,
 ) -> Expr:
     """
@@ -794,9 +794,10 @@ def corr(
     b
         Column name or Expression.
     ddof
-        "Delta Degrees of Freedom": the divisor used in the calculation is N - ddof,
-        where N represents the number of elements.
-        By default ddof is 1.
+        Has no effect, do not use.
+
+        .. deprecated:: 1.17.0
+
     method : {'pearson', 'spearman'}
         Correlation method.
     propagate_nans
@@ -844,13 +845,19 @@ def corr(
     │ 0.5 │
     └─────┘
     """
+    if ddof is not None:
+        issue_deprecation_warning(
+            "The `ddof` parameter has no effect. Do not use it.",
+            version="1.17.0",
+        )
+
     a = parse_into_expression(a)
     b = parse_into_expression(b)
 
     if method == "pearson":
-        return wrap_expr(plr.pearson_corr(a, b, ddof))
+        return wrap_expr(plr.pearson_corr(a, b))
     elif method == "spearman":
-        return wrap_expr(plr.spearman_rank_corr(a, b, ddof, propagate_nans))
+        return wrap_expr(plr.spearman_rank_corr(a, b, propagate_nans))
     else:
         msg = f"method must be one of {{'pearson', 'spearman'}}, got {method!r}"
         raise ValueError(msg)

@@ -946,7 +946,7 @@ def test_spearman_corr_ties() -> None:
     df = pl.DataFrame({"a": [1, 1, 1, 2, 3, 7, 4], "b": [4, 3, 2, 2, 4, 3, 1]})
 
     result = df.select(
-        pl.corr("a", "b", method="spearman", ddof=0).alias("a1"),
+        pl.corr("a", "b", method="spearman").alias("a1"),
         pl.corr(pl.col("a").rank("min"), pl.col("b").rank("min")).alias("a2"),
         pl.corr(pl.col("a").rank(), pl.col("b").rank()).alias("a3"),
     )
@@ -972,7 +972,9 @@ def test_pearson_corr() -> None:
     out = (
         ldf.group_by("era", maintain_order=True).agg(
             pl.corr(
-                pl.col("prediction"), pl.col("target"), method="pearson", ddof=0
+                pl.col("prediction"),
+                pl.col("target"),
+                method="pearson",
             ).alias("c"),
         )
     ).collect()["c"]
@@ -981,7 +983,7 @@ def test_pearson_corr() -> None:
     # we can also pass in column names directly
     out = (
         ldf.group_by("era", maintain_order=True).agg(
-            pl.corr("prediction", "target", method="pearson", ddof=0).alias("c"),
+            pl.corr("prediction", "target", method="pearson").alias("c"),
         )
     ).collect()["c"]
     assert out.to_list() == pytest.approx([0.6546536707079772, -5.477514993831792e-1])

@@ -33,7 +33,7 @@ def test_corr() -> None:
 
 def test_corr_nan() -> None:
     df = pl.DataFrame({"a": [1.0, 1.0], "b": [1.0, 2.0]})
-    assert str(df.select(pl.corr("a", "b", ddof=1))[0, 0]) == "nan"
+    assert str(df.select(pl.corr("a", "b"))[0, 0]) == "nan"
 
 
 def test_median_quantile_duration() -> None: