Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: Deprecate ddof parameter for correlation coefficient #20197

Merged
merged 5 commits into from
Dec 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions crates/polars-compute/src/var_cov.rs
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,7 @@ impl PearsonState {
self.mean_y = new_mean_y;
}

pub fn finalize(&self, _ddof: u8) -> f64 {
// The division by sample_weight - ddof on both sides cancels out.
pub fn finalize(&self) -> f64 {
let denom = (self.dp_xx * self.dp_yy).sqrt();
if denom == 0.0 {
f64::NAN
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-lazy/src/tests/arity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ fn test_pearson_corr() -> PolarsResult<()> {
.lazy()
.group_by_stable([col("uid")])
// a double aggregation expression.
.agg([pearson_corr(col("day"), col("cumcases"), 1).alias("pearson_corr")])
.agg([pearson_corr(col("day"), col("cumcases")).alias("pearson_corr")])
.collect()?;
let s = out.column("pearson_corr")?.f64()?;
assert!((s.get(0).unwrap() - 0.997176).abs() < 0.000001);
Expand All @@ -25,7 +25,7 @@ fn test_pearson_corr() -> PolarsResult<()> {
.lazy()
.group_by_stable([col("uid")])
// a double aggregation expression.
.agg([pearson_corr(col("day"), col("cumcases"), 1)
.agg([pearson_corr(col("day"), col("cumcases"))
.pow(2.0)
.alias("pearson_corr")])
.collect()
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-ops/src/chunked_array/cov.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ where
}

/// Compute the pearson correlation between two columns.
pub fn pearson_corr<T>(a: &ChunkedArray<T>, b: &ChunkedArray<T>, ddof: u8) -> Option<f64>
pub fn pearson_corr<T>(a: &ChunkedArray<T>, b: &ChunkedArray<T>) -> Option<f64>
where
T: PolarsNumericType,
T::Native: AsPrimitive<f64>,
Expand All @@ -30,5 +30,5 @@ where
for (a, b) in a.downcast_iter().zip(b.downcast_iter()) {
out.combine(&polars_compute::var_cov::pearson_corr(a, b))
}
Some(out.finalize(ddof))
Some(out.finalize())
}
32 changes: 15 additions & 17 deletions crates/polars-plan/src/dsl/function_expr/correlation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ pub enum CorrelationMethod {
Pearson,
#[cfg(all(feature = "rank", feature = "propagate_nans"))]
SpearmanRank(bool),
Covariance,
Covariance(u8),
}

impl Display for CorrelationMethod {
Expand All @@ -19,20 +19,18 @@ impl Display for CorrelationMethod {
Pearson => "pearson",
#[cfg(all(feature = "rank", feature = "propagate_nans"))]
SpearmanRank(_) => "spearman_rank",
Covariance => return write!(f, "covariance"),
Covariance(_) => return write!(f, "covariance"),
};
write!(f, "{}_correlation", s)
}
}

pub(super) fn corr(s: &[Column], ddof: u8, method: CorrelationMethod) -> PolarsResult<Column> {
pub(super) fn corr(s: &[Column], method: CorrelationMethod) -> PolarsResult<Column> {
match method {
CorrelationMethod::Pearson => pearson_corr(s, ddof),
CorrelationMethod::Pearson => pearson_corr(s),
#[cfg(all(feature = "rank", feature = "propagate_nans"))]
CorrelationMethod::SpearmanRank(propagate_nans) => {
spearman_rank_corr(s, ddof, propagate_nans)
},
CorrelationMethod::Covariance => covariance(s, ddof),
CorrelationMethod::SpearmanRank(propagate_nans) => spearman_rank_corr(s, propagate_nans),
CorrelationMethod::Covariance(ddof) => covariance(s, ddof),
}
}

Expand Down Expand Up @@ -61,32 +59,32 @@ fn covariance(s: &[Column], ddof: u8) -> PolarsResult<Column> {
Ok(Column::new(name, &[ret]))
}

fn pearson_corr(s: &[Column], ddof: u8) -> PolarsResult<Column> {
fn pearson_corr(s: &[Column]) -> PolarsResult<Column> {
let a = &s[0];
let b = &s[1];
let name = PlSmallStr::from_static("pearson_corr");

use polars_ops::chunked_array::cov::pearson_corr;
let ret = match a.dtype() {
DataType::Float32 => {
let ret = pearson_corr(a.f32().unwrap(), b.f32().unwrap(), ddof).map(|v| v as f32);
let ret = pearson_corr(a.f32().unwrap(), b.f32().unwrap()).map(|v| v as f32);
return Ok(Column::new(name.clone(), &[ret]));
},
DataType::Float64 => pearson_corr(a.f64().unwrap(), b.f64().unwrap(), ddof),
DataType::Int32 => pearson_corr(a.i32().unwrap(), b.i32().unwrap(), ddof),
DataType::Int64 => pearson_corr(a.i64().unwrap(), b.i64().unwrap(), ddof),
DataType::UInt32 => pearson_corr(a.u32().unwrap(), b.u32().unwrap(), ddof),
DataType::Float64 => pearson_corr(a.f64().unwrap(), b.f64().unwrap()),
DataType::Int32 => pearson_corr(a.i32().unwrap(), b.i32().unwrap()),
DataType::Int64 => pearson_corr(a.i64().unwrap(), b.i64().unwrap()),
DataType::UInt32 => pearson_corr(a.u32().unwrap(), b.u32().unwrap()),
_ => {
let a = a.cast(&DataType::Float64)?;
let b = b.cast(&DataType::Float64)?;
pearson_corr(a.f64().unwrap(), b.f64().unwrap(), ddof)
pearson_corr(a.f64().unwrap(), b.f64().unwrap())
},
};
Ok(Column::new(name, &[ret]))
}

#[cfg(all(feature = "rank", feature = "propagate_nans"))]
fn spearman_rank_corr(s: &[Column], ddof: u8, propagate_nans: bool) -> PolarsResult<Column> {
fn spearman_rank_corr(s: &[Column], propagate_nans: bool) -> PolarsResult<Column> {
use polars_core::utils::coalesce_nulls_columns;
use polars_ops::chunked_array::nan_propagating_aggregate::nan_max_s;
let a = &s[0];
Expand Down Expand Up @@ -134,5 +132,5 @@ fn spearman_rank_corr(s: &[Column], ddof: u8, propagate_nans: bool) -> PolarsRes
)
.into();

pearson_corr(&[a_rank, b_rank], ddof)
pearson_corr(&[a_rank, b_rank])
}
3 changes: 1 addition & 2 deletions crates/polars-plan/src/dsl/function_expr/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,6 @@ pub enum FunctionExpr {
#[cfg(feature = "cov")]
Correlation {
method: correlation::CorrelationMethod,
ddof: u8,
},
#[cfg(feature = "peaks")]
PeakMin,
Expand Down Expand Up @@ -1092,7 +1091,7 @@ impl From<FunctionExpr> for SpecialEq<Arc<dyn ColumnsUdf>> {
Fused(op) => map_as_slice!(fused::fused, op),
ConcatExpr(rechunk) => map_as_slice!(concat::concat_expr, rechunk),
#[cfg(feature = "cov")]
Correlation { method, ddof } => map_as_slice!(correlation::corr, ddof, method),
Correlation { method } => map_as_slice!(correlation::corr, method),
#[cfg(feature = "peaks")]
PeakMin => map!(peaks::peak_min),
#[cfg(feature = "peaks")]
Expand Down
15 changes: 3 additions & 12 deletions crates/polars-plan/src/dsl/functions/correlation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@ use super::*;
pub fn cov(a: Expr, b: Expr, ddof: u8) -> Expr {
let input = vec![a, b];
let function = FunctionExpr::Correlation {
method: CorrelationMethod::Covariance,
ddof,
method: CorrelationMethod::Covariance(ddof),
};
Expr::Function {
input,
Expand All @@ -20,15 +19,10 @@ pub fn cov(a: Expr, b: Expr, ddof: u8) -> Expr {
}

/// Compute the pearson correlation between two columns.
///
/// # Arguments
/// * ddof
/// Delta degrees of freedom
pub fn pearson_corr(a: Expr, b: Expr, ddof: u8) -> Expr {
pub fn pearson_corr(a: Expr, b: Expr) -> Expr {
let input = vec![a, b];
let function = FunctionExpr::Correlation {
method: CorrelationMethod::Pearson,
ddof,
};
Expr::Function {
input,
Expand All @@ -45,18 +39,15 @@ pub fn pearson_corr(a: Expr, b: Expr, ddof: u8) -> Expr {
/// Compute the spearman rank correlation between two columns.
/// Missing data will be excluded from the computation.
/// # Arguments
/// * ddof
/// Delta degrees of freedom
/// * propagate_nans
/// If `true` any `NaN` encountered will lead to `NaN` in the output.
/// If to `false` then `NaN` are regarded as larger than any finite number
/// and thus lead to the highest rank.
#[cfg(all(feature = "rank", feature = "propagate_nans"))]
pub fn spearman_rank_corr(a: Expr, b: Expr, ddof: u8, propagate_nans: bool) -> Expr {
pub fn spearman_rank_corr(a: Expr, b: Expr, propagate_nans: bool) -> Expr {
let input = vec![a, b];
let function = FunctionExpr::Correlation {
method: CorrelationMethod::SpearmanRank(propagate_nans),
ddof,
};
Expr::Function {
input,
Expand Down
8 changes: 4 additions & 4 deletions crates/polars-python/src/functions/lazy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -504,8 +504,8 @@ pub fn map_mul(
}

#[pyfunction]
pub fn pearson_corr(a: PyExpr, b: PyExpr, ddof: u8) -> PyExpr {
dsl::pearson_corr(a.inner, b.inner, ddof).into()
pub fn pearson_corr(a: PyExpr, b: PyExpr) -> PyExpr {
dsl::pearson_corr(a.inner, b.inner).into()
}

#[pyfunction]
Expand Down Expand Up @@ -537,10 +537,10 @@ pub fn repeat(value: PyExpr, n: PyExpr, dtype: Option<Wrap<DataType>>) -> PyResu
}

#[pyfunction]
pub fn spearman_rank_corr(a: PyExpr, b: PyExpr, ddof: u8, propagate_nans: bool) -> PyExpr {
pub fn spearman_rank_corr(a: PyExpr, b: PyExpr, propagate_nans: bool) -> PyExpr {
#[cfg(feature = "propagate_nans")]
{
dsl::spearman_rank_corr(a.inner, b.inner, ddof, propagate_nans).into()
dsl::spearman_rank_corr(a.inner, b.inner, propagate_nans).into()
}
#[cfg(not(feature = "propagate_nans"))]
{
Expand Down
19 changes: 13 additions & 6 deletions py-polars/polars/functions/lazy.py
Original file line number Diff line number Diff line change
Expand Up @@ -781,7 +781,7 @@ def corr(
b: IntoExpr,
*,
method: CorrelationMethod = "pearson",
ddof: int = 1,
ddof: int | None = None,
propagate_nans: bool = False,
) -> Expr:
"""
Expand All @@ -794,9 +794,10 @@ def corr(
b
Column name or Expression.
ddof
"Delta Degrees of Freedom": the divisor used in the calculation is N - ddof,
where N represents the number of elements.
By default ddof is 1.
Has no effect, do not use.

.. deprecated:: 1.17.0

ritchie46 marked this conversation as resolved.
Show resolved Hide resolved
method : {'pearson', 'spearman'}
Correlation method.
propagate_nans
Expand Down Expand Up @@ -844,13 +845,19 @@ def corr(
│ 0.5 │
└─────┘
"""
if ddof is not None:
issue_deprecation_warning(
"The `ddof` parameter has no effect. Do not use it.",
version="1.17.0",
)

a = parse_into_expression(a)
b = parse_into_expression(b)

if method == "pearson":
return wrap_expr(plr.pearson_corr(a, b, ddof))
return wrap_expr(plr.pearson_corr(a, b))
elif method == "spearman":
return wrap_expr(plr.spearman_rank_corr(a, b, ddof, propagate_nans))
return wrap_expr(plr.spearman_rank_corr(a, b, propagate_nans))
else:
msg = f"method must be one of {{'pearson', 'spearman'}}, got {method!r}"
raise ValueError(msg)
Expand Down
8 changes: 5 additions & 3 deletions py-polars/tests/unit/lazyframe/test_lazyframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -946,7 +946,7 @@ def test_spearman_corr_ties() -> None:
df = pl.DataFrame({"a": [1, 1, 1, 2, 3, 7, 4], "b": [4, 3, 2, 2, 4, 3, 1]})

result = df.select(
pl.corr("a", "b", method="spearman", ddof=0).alias("a1"),
pl.corr("a", "b", method="spearman").alias("a1"),
pl.corr(pl.col("a").rank("min"), pl.col("b").rank("min")).alias("a2"),
pl.corr(pl.col("a").rank(), pl.col("b").rank()).alias("a3"),
)
Expand All @@ -972,7 +972,9 @@ def test_pearson_corr() -> None:
out = (
ldf.group_by("era", maintain_order=True).agg(
pl.corr(
pl.col("prediction"), pl.col("target"), method="pearson", ddof=0
pl.col("prediction"),
pl.col("target"),
method="pearson",
).alias("c"),
)
).collect()["c"]
Expand All @@ -981,7 +983,7 @@ def test_pearson_corr() -> None:
# we can also pass in column names directly
out = (
ldf.group_by("era", maintain_order=True).agg(
pl.corr("prediction", "target", method="pearson", ddof=0).alias("c"),
pl.corr("prediction", "target", method="pearson").alias("c"),
)
).collect()["c"]
assert out.to_list() == pytest.approx([0.6546536707079772, -5.477514993831792e-1])
Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/unit/operations/test_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def test_corr() -> None:

def test_corr_nan() -> None:
df = pl.DataFrame({"a": [1.0, 1.0], "b": [1.0, 2.0]})
assert str(df.select(pl.corr("a", "b", ddof=1))[0, 0]) == "nan"
assert str(df.select(pl.corr("a", "b"))[0, 0]) == "nan"


def test_median_quantile_duration() -> None:
Expand Down
Loading