Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

function to remove items from freq and freq_meta #582

Merged
merged 15 commits into from
Aug 28, 2023
Merged
72 changes: 71 additions & 1 deletion gnomad/utils/filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import functools
import logging
import operator
from typing import Callable, Dict, List, Optional, Union
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import hail as hl

Expand Down Expand Up @@ -531,3 +531,73 @@ def split_vds_by_strata(
return {
strata: hl.vds.filter_samples(vds, list(s)) for strata, s in s_by_strata.items()
}


def filter_freq_by_meta(
    freq_expr: hl.expr.ArrayExpression,
    freq_meta_expr: hl.expr.ArrayExpression,
    items_to_filter: Union[Dict[str, List[Any]], List[Any]],
    keep: bool = True,
    combine_operator: str = "and",
) -> Tuple[hl.expr.ArrayExpression, hl.expr.ArrayExpression]:
    """
    Filter frequency and frequency meta expressions specified by `items_to_filter`.

    The `items_to_filter` can be used to filter in the following ways based on
    `freq_meta_expr` items:

        - By a list of keys, e.g. ["sex", "downsampling"].
        - By specific key: value pairs, e.g. to filter where 'pop' is 'han' or
          'papuan' {"pop": ["han", "papuan"]}, or where 'pop' is 'afr' and/or 'sex'
          is 'XX' {"pop": ["afr"], "sex": ["XX"]}.

    When a dictionary is given, the values listed under a single key are always
    alternatives (any one of them matches that key); `combine_operator` only
    controls how the different keys are combined.

    The items can be kept or removed from `freq_expr` and `freq_meta_expr` based on
    the value of `keep`.

    The filtering can also be applied such that all criteria must be met
    (`combine_operator` = "and") by the `freq_meta_expr` item in order to be
    filtered, or at least one of the specified criteria must be met
    (`combine_operator` = "or") by the `freq_meta_expr` item in order to be
    filtered.

    :param freq_expr: Frequency expression.
    :param freq_meta_expr: Frequency meta expression.
    :param items_to_filter: Items to filter by, either a list or a dictionary.
    :param keep: Whether to keep (True) or remove (False) the items that match
        the criteria in `items_to_filter`.
    :param combine_operator: Whether to use "and" or "or" to combine the items
        specified by `items_to_filter`.
    :return: Tuple of the filtered frequency and frequency meta expressions.
    :raises ValueError: If `combine_operator` is neither "and" nor "or".
    :raises TypeError: If `items_to_filter` is neither a list nor a dictionary.
    """
    # Validate the plain-Python arguments up front so that bad input fails before
    # any Hail expression is built.
    if combine_operator not in ("and", "or"):
        raise ValueError(
            "combine_operator must be one of 'and' or 'or', but found"
            f" {combine_operator}"
        )
    if not isinstance(items_to_filter, (list, dict)):
        raise TypeError("items_to_filter must be a list or a dictionary")

    operator_func = hl.all if combine_operator == "and" else hl.any

    # Normalize both input forms to a list of criterion groups. Alternatives
    # inside one group are OR-ed; the groups are combined with `operator_func`.
    if isinstance(items_to_filter, list):

        def _criterion_met(meta, key):
            # A key criterion is met when the meta dictionary contains the key.
            return meta.contains(key)

        criteria_groups = [[key] for key in items_to_filter]
    else:

        def _criterion_met(meta, key_value):
            # A missing key falls back to "" so the comparison is a plain boolean.
            # NOTE(review): this assumes "" is never a requested value -- confirm.
            return meta.get(key_value[0], "") == key_value[1]

        criteria_groups = [
            [(key, value) for value in values]
            for key, values in items_to_filter.items()
        ]

    # NOTE(review): presumably `freq_meta_expr` is a table-level (global) field
    # and this turns it into a standalone array expression -- confirm with caller.
    freq_meta_expr = freq_meta_expr.collect(_localize=False)[0]

    def _retain(indexed_meta):
        # `indexed_meta` is an (index, meta dictionary) pair from `hl.enumerate`.
        meets_criteria = operator_func(
            [
                hl.any([_criterion_met(indexed_meta[1], c) for c in group])
                for group in criteria_groups
            ]
        )
        # Invert the match when the caller asked to remove rather than keep.
        return hl.bind(lambda x: hl.if_else(keep, x, ~x), meets_criteria)

    # Filter the enumerated meta array so the surviving indices can be used to
    # pull the corresponding entries out of `freq_expr`.
    indexed_meta_expr = hl.enumerate(freq_meta_expr).filter(_retain)
    filtered_freq_expr = indexed_meta_expr.map(lambda x: freq_expr[x[0]])
    filtered_freq_meta_expr = indexed_meta_expr.map(lambda x: x[1])

    return filtered_freq_expr, filtered_freq_meta_expr