Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge freq array function and new frequency dictionary builder #551

Merged
merged 24 commits into from
Jun 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
14819af
Merge N freq arrays by sum only
mike-w-wilson Jun 7, 2023
1f58568
Add subtraction
mike-w-wilson Jun 8, 2023
87fd2fd
Update diff dictionary and add comments
mike-w-wilson Jun 8, 2023
634ab95
Switch to Julia's merge
mike-w-wilson Jun 9, 2023
3499e40
Add diff back with options for negative values
mike-w-wilson Jun 14, 2023
4d374d2
Add description of function
mike-w-wilson Jun 14, 2023
3aa7f68
Drop TODOs
mike-w-wilson Jun 14, 2023
55340de
Update gnomad/utils/annotations.py
mike-w-wilson Jun 14, 2023
2d40b72
Update gnomad/utils/annotations.py
mike-w-wilson Jun 14, 2023
ac50e04
Move to same module as original function
mike-w-wilson Jun 14, 2023
8c29fb8
Update imports
mike-w-wilson Jun 14, 2023
ce76d83
Remove unnecessary imports
mike-w-wilson Jun 14, 2023
93be0d3
Apply suggestions from code review
mike-w-wilson Jun 14, 2023
0d4a67a
update merge docs
mike-w-wilson Jun 14, 2023
19f3c26
update merge docs
mike-w-wilson Jun 14, 2023
50c2b22
Black is misformating these, hail errors without them
mike-w-wilson Jun 14, 2023
c4d65b8
Apply suggestions from code review
mike-w-wilson Jun 21, 2023
e6ad87f
Add AF to correction for when AC < 0
mike-w-wilson Jun 21, 2023
95bf58c
Set default to error
mike-w-wilson Jun 21, 2023
980b1e8
Remove optional sort_order, add warning and docs
mike-w-wilson Jun 21, 2023
589a2fc
Apply suggestions from code review
mike-w-wilson Jun 23, 2023
65de216
Fix return doc and prevent formatting that breaks hail
mike-w-wilson Jun 23, 2023
8fd0ef9
Make sort_order optional
mike-w-wilson Jun 23, 2023
00418c6
Handle diff error when sort_order None is passed
mike-w-wilson Jun 23, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 137 additions & 4 deletions gnomad/utils/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def project_max_expr(
> 0
),
# order the callstats computed by AF in decreasing order
lambda x: -x[1].AF[ai]
lambda x: -x[1].AF[ai],
mike-w-wilson marked this conversation as resolved.
Show resolved Hide resolved
mike-w-wilson marked this conversation as resolved.
Show resolved Hide resolved
mike-w-wilson marked this conversation as resolved.
Show resolved Hide resolved
mike-w-wilson marked this conversation as resolved.
Show resolved Hide resolved
# take the n_projects projects with largest AF
)[:n_projects].map(
# add the project in the callstats struct
Expand Down Expand Up @@ -219,8 +219,10 @@ def faf_expr(
~locus.in_autosome_or_par(),
hl.struct(
**{
f"faf{str(threshold)[2:]}": hl.experimental.filtering_allele_frequency(
freq[i].AC, freq[i].AN, threshold
f"faf{str(threshold)[2:]}": (
hl.experimental.filtering_allele_frequency(
freq[i].AC, freq[i].AN, threshold
)
)
for threshold in faf_thresholds
}
Expand Down Expand Up @@ -1106,7 +1108,8 @@ def region_flag_expr(
:return: `region_flag` struct row annotation
"""
prob_flags_expr = (
{"non_par": (t.locus.in_x_nonpar() | t.locus.in_y_nonpar())} if non_par else {}
{"non_par": (t.locus.in_x_nonpar() | t.locus.in_y_nonpar())
} if non_par else {} # fmt: skip
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ha the extra note made the line long enough that black wants parentheses there now...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh boy. I'm surprised it was even removing the parentheses, mine doesn't do that

)

if prob_regions is not None:
Expand Down Expand Up @@ -1188,3 +1191,133 @@ def hemi_expr(
# mt.GT[0] is alternate allele
gt.is_haploid() & (sex_expr == male_str) & (gt[0] == 1),
)


def merge_freq_arrays(
farrays: List[hl.expr.ArrayExpression],
fmeta: List[List[Dict[str, str]]],
operation: str = "sum",
set_negatives_to_zero: bool = False,
) -> Tuple[hl.expr.ArrayExpression, List[Dict[str, int]]]:
"""
Merge a list of frequency arrays based on the supplied `operation`.

.. warning::
Arrays must be on the same Table.

jkgoodrich marked this conversation as resolved.
Show resolved Hide resolved
.. note::

Arrays do not have to contain the same groupings or order of groupings but
the array indices for a freq array in `farrays` must be the same as its associated
frequency metadata index in `fmeta` i.e., `farrays = [freq1, freq2]` then `fmeta`
must equal `[fmeta1, fmeta2]` where fmeta1 contains the metadata information
for freq1.

If `operation` is set to "sum", groups in the merged array
will be the union of groupings found within the arrays' metadata and all arrays
with be summed by grouping. If `operation` is set to "diff", the merged array
will contain groups only found in the first array of `fmeta`. Any array containing
any of these groups will have thier values subtracted from the values of the first array.

:param farrays: List of frequency arrays to merge. First entry in the list is the primary array to which other arrays will be added or subtracted. All arrays must be on the same Table.
:param fmeta: List of frequency metadata for arrays being merged.
:param operation: Merge operation to perform. Options are "sum" and "diff". If "diff" is passed, the first freq array in the list will have the other arrays subtracted from it.
:param set_negatives_to_zero: If True, set negative array values to 0 for AC, AN, AF, and homozygote_count. If False, raise a ValueError. Default is True.
:return: Tuple of merged frequency array and its frequency metadata list.
"""
if len(farrays) < 2:
raise ValueError("Must provide at least two frequency arrays to merge!")
if len(farrays) != len(fmeta):
raise ValueError("Length of farrays and fmeta must be equal!")
if operation not in ["sum", "diff"]:
raise ValueError("Operation must be either 'sum' or 'diff'!")

# Create a list where each entry is a dictionary whose key is an aggregation
# group and the value is the corresponding index in the freq array.
fmeta = [hl.dict(hl.enumerate(f).map(lambda x: (x[1], [x[0]]))) for f in fmeta]

# Merge dictionaries in the list into a single dictionary where key is aggregation
# group and the value is a list of the group's index in each of the freq arrays, if
# it exists. For "sum" operation, use keys, aka groups, found in all freq dictionaries.
# For "diff" operations, only use key_set from the first entry.
jkgoodrich marked this conversation as resolved.
Show resolved Hide resolved
fmeta = hl.fold(
lambda i, j: hl.dict(
(
hl.if_else(operation == "sum", (i.key_set() | j.key_set()), i.key_set())
).map(
lambda k: (
k,
i.get(k, [hl.missing(hl.tint32)]).extend(
j.get(k, [hl.missing(hl.tint32)])
),
)
)
),
fmeta[0],
fmeta[1:],
)

# Create a list of tuples from the dictionary, sorted by the list of indices for
# each aggregation group.
fmeta = hl.sorted(fmeta.items(), key=lambda f: f[1])

# Create a list of the aggregation groups, maintaining the sorted order.
new_freq_meta = fmeta.map(lambda x: x[0])

# Create array for each aggregation group of arrays containing the group's freq
# values from each freq array.
freq_meta_idx = fmeta.map(lambda x: hl.zip(farrays, x[1]).map(lambda i: i[0][i[1]]))

def _sum_or_diff_fields(
field_1_expr: str, field_2_expr: str
) -> hl.expr.Int32Expression:
"""
Sum or subtract fields in call statistics struct.

:param field_1_expr: First field to sum or diff.
:param field_2_expr: Second field to sum or diff.
:return: Merged field value.
"""
return hl.if_else(
operation == "sum",
hl.or_else(field_1_expr, 0) + hl.or_else(field_2_expr, 0),
hl.or_else(field_1_expr, 0) - hl.or_else(field_2_expr, 0),
)

# Iterate through the groups and their freq lists to merge callstats.
callstat_ann = ["AC", "AN", "homozygote_count"]
new_freq = freq_meta_idx.map(
mike-w-wilson marked this conversation as resolved.
Show resolved Hide resolved
lambda x: hl.bind(
lambda y: y.annotate(AF=hl.if_else(y.AN > 0, y.AC / y.AN, 0)),
hl.fold(
lambda i, j: hl.struct(
**{ann: _sum_or_diff_fields(i[ann], j[ann]) for ann in callstat_ann}
),
x[0].select("AC", "AN", "homozygote_count"),
x[1:],
),
)
)
# Check and see if any annotation within the merged array is negative. If so,
# raise an error if set_negatives_to_zero is False or set the value to 0 if
# set_negatives_to_zero is True.
if operation == "diff":
negative_value_error_msg = (
"Negative values found in merged frequency array. Review data or set"
" `set_negatives_to_zero` to True to set negative values to 0."
)
callstat_ann.append("AF")
new_freq = new_freq.map(
jkgoodrich marked this conversation as resolved.
Show resolved Hide resolved
lambda x: x.annotate(
**{
ann: (
hl.case()
.when(set_negatives_to_zero, hl.max(x[ann], 0))
.or_error(negative_value_error_msg)
)
for ann in callstat_ann
}
)
)

return new_freq, new_freq_meta
61 changes: 60 additions & 1 deletion gnomad/utils/release.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,25 @@
# noqa: D100

import logging
from typing import Dict, List, Optional

import hail as hl

from gnomad.resources.grch38.gnomad import (
CURRENT_MAJOR_RELEASE,
GROUPS,
POPS,
SEXES,
SUBSETS,
)
from gnomad.utils.vcf import index_globals
from gnomad.utils.vcf import SORT_ORDER, index_globals

logging.basicConfig(
format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
datefmt="%m/%d/%Y %I:%M:%S %p",
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def make_faf_index_dict(
Expand Down Expand Up @@ -92,3 +102,52 @@ def _get_index(label_groups):
)

return index_dict


def make_freq_index_dict_from_meta(
freq_meta: List[Dict[str, str]],
label_delimiter: str = "_",
sort_order: Optional[List[str]] = SORT_ORDER,
) -> Dict[str, int]:
"""
Create a dictionary for accessing frequency array.

jkgoodrich marked this conversation as resolved.
Show resolved Hide resolved
The dictionary is keyed by the grouping combinations found in the frequency metadata
array, where values are the corresponding 0-based indices for the groupings in the
frequency array. For example, if the `freq_meta` entry [{'pop': 'nfe'}, {'sex': 'XX'}]
corresponds to the 5th entry in the frequency array, the returned dictionary entry
would be {'nfe_XX': 4}.

:param freq_meta: List of dictionaries containing frequency metadata.
:param label_delimiter: Delimiter to use when joining frequency metadata labels.
:param sort_order: List of frequency metadata labels to use when sorting the dictionary.
:return: Dictionary of frequency metadata.
"""
# Confirm all groups in freq_meta are in sort_order. Warn user if not.
if sort_order is not None:
diff = hl.eval(hl.set(freq_meta.flatmap(lambda i: i.keys()))) - set(sort_order)
if diff:
logger.warning(
"Found unexpected frequency metadata groupings: %s. These groupings"
" are not present in the provided sort_order: %s. These groupings"
" will not be included in the returned dictionary.",
diff,
sort_order,
)

index_dict = {}
jkgoodrich marked this conversation as resolved.
Show resolved Hide resolved
for i, f in enumerate(hl.eval(freq_meta)):
if sort_order is None or len(set(f.keys()) - set(sort_order)) < 1:
index_dict[
label_delimiter.join(
[
f[g]
for g in sorted(
f.keys(),
key=(lambda x: sort_order.index(x)) if sort_order else None,
)
]
)
] = i

return index_dict