broadinstitute · mike-w-wilson · Jun 23, 2023 · Jun 7, 2023 · Jun 8, 2023 · Jun 8, 2023
diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py
@@ -125,7 +125,7 @@ def project_max_expr(
                     > 0
                 ),
                 # order the callstats computed by AF in decreasing order
-                lambda x: -x[1].AF[ai]
+                lambda x: -x[1].AF[ai],
                 # take the n_projects projects with largest AF
             )[:n_projects].map(
                 # add the project in the callstats struct
@@ -219,8 +219,10 @@ def faf_expr(
                 ~locus.in_autosome_or_par(),
                 hl.struct(
                     **{
-                        f"faf{str(threshold)[2:]}": hl.experimental.filtering_allele_frequency(
-                            freq[i].AC, freq[i].AN, threshold
+                        f"faf{str(threshold)[2:]}": (
+                            hl.experimental.filtering_allele_frequency(
+                                freq[i].AC, freq[i].AN, threshold
+                            )
                         )
                         for threshold in faf_thresholds
                     }
@@ -1106,7 +1108,8 @@ def region_flag_expr(
     :return: `region_flag` struct row annotation
     """
     prob_flags_expr = (
-        {"non_par": (t.locus.in_x_nonpar() | t.locus.in_y_nonpar())} if non_par else {}
+        {"non_par": (t.locus.in_x_nonpar() | t.locus.in_y_nonpar())
+         } if non_par else {}  # fmt: skip
     )
 
     if prob_regions is not None:
@@ -1188,3 +1191,133 @@ def hemi_expr(
         # mt.GT[0] is alternate allele
         gt.is_haploid() & (sex_expr == male_str) & (gt[0] == 1),
     )
+
+
+def merge_freq_arrays(
+    farrays: List[hl.expr.ArrayExpression],
+    fmeta: List[List[Dict[str, str]]],
+    operation: str = "sum",
+    set_negatives_to_zero: bool = False,
+) -> Tuple[hl.expr.ArrayExpression, List[Dict[str, int]]]:
+    """
+    Merge a list of frequency arrays based on the supplied `operation`.
+
+    .. warning::
+        Arrays must be on the same Table.
+
+    .. note::
+
+        Arrays do not have to contain the same groupings or order of groupings but
+        the array indices for a freq array in `farrays` must be the same as its associated
+        frequency metadata index in `fmeta` i.e., `farrays = [freq1, freq2]` then `fmeta`
+        must equal `[fmeta1, fmeta2]` where fmeta1 contains the metadata information
+        for freq1.
+
+        If `operation` is set to "sum", groups in the merged array
+        will be the union of groupings found within the arrays' metadata and all arrays
+        with be summed by grouping. If `operation` is set to "diff", the merged array
+        will contain groups only found in the first array of `fmeta`. Any array containing
+        any of these groups will have thier values subtracted from the values of the first array.
+
+    :param farrays: List of frequency arrays to merge. First entry in the list is the primary array to which other arrays will be added or subtracted. All arrays must be on the same Table.
+    :param fmeta: List of frequency metadata for arrays being merged.
+    :param operation: Merge operation to perform. Options are "sum" and "diff". If "diff" is passed, the first freq array in the list will have the other arrays subtracted from it.
+    :param set_negatives_to_zero: If True, set negative array values to 0 for AC, AN, AF, and homozygote_count. If False, raise a ValueError. Default is True.
+    :return: Tuple of merged frequency array and its frequency metadata list.
+    """
+    if len(farrays) < 2:
+        raise ValueError("Must provide at least two frequency arrays to merge!")
+    if len(farrays) != len(fmeta):
+        raise ValueError("Length of farrays and fmeta must be equal!")
+    if operation not in ["sum", "diff"]:
+        raise ValueError("Operation must be either 'sum' or 'diff'!")
+
+    # Create a list where each entry is a dictionary whose key is an aggregation
+    # group and the value is the corresponding index in the freq array.
+    fmeta = [hl.dict(hl.enumerate(f).map(lambda x: (x[1], [x[0]]))) for f in fmeta]
+
+    # Merge dictionaries in the list into a single dictionary where key is aggregation
+    # group and the value is a list of the group's index in each of the freq arrays, if
+    # it exists. For "sum" operation, use keys, aka groups, found in all freq dictionaries.
+    # For "diff" operations, only use key_set from the first entry.
+    fmeta = hl.fold(
+        lambda i, j: hl.dict(
+            (
+                hl.if_else(operation == "sum", (i.key_set() | j.key_set()), i.key_set())
+            ).map(
+                lambda k: (
+                    k,
+                    i.get(k, [hl.missing(hl.tint32)]).extend(
+                        j.get(k, [hl.missing(hl.tint32)])
+                    ),
+                )
+            )
+        ),
+        fmeta[0],
+        fmeta[1:],
+    )
+
+    # Create a list of tuples from the dictionary, sorted by the list of indices for
+    # each aggregation group.
+    fmeta = hl.sorted(fmeta.items(), key=lambda f: f[1])
+
+    # Create a list of the aggregation groups, maintaining the sorted order.
+    new_freq_meta = fmeta.map(lambda x: x[0])
+
+    # Create array for each aggregation group of arrays containing the group's freq
+    # values from each freq array.
+    freq_meta_idx = fmeta.map(lambda x: hl.zip(farrays, x[1]).map(lambda i: i[0][i[1]]))
+
+    def _sum_or_diff_fields(
+        field_1_expr: str, field_2_expr: str
+    ) -> hl.expr.Int32Expression:
+        """
+        Sum or subtract fields in call statistics struct.
+
+        :param field_1_expr: First field to sum or diff.
+        :param field_2_expr: Second field to sum or diff.
+        :return: Merged field value.
+        """
+        return hl.if_else(
+            operation == "sum",
+            hl.or_else(field_1_expr, 0) + hl.or_else(field_2_expr, 0),
+            hl.or_else(field_1_expr, 0) - hl.or_else(field_2_expr, 0),
+        )
+
+    # Iterate through the groups and their freq lists to merge callstats.
+    callstat_ann = ["AC", "AN", "homozygote_count"]
+    new_freq = freq_meta_idx.map(
+        lambda x: hl.bind(
+            lambda y: y.annotate(AF=hl.if_else(y.AN > 0, y.AC / y.AN, 0)),
+            hl.fold(
+                lambda i, j: hl.struct(
+                    **{ann: _sum_or_diff_fields(i[ann], j[ann]) for ann in callstat_ann}
+                ),
+                x[0].select("AC", "AN", "homozygote_count"),
+                x[1:],
+            ),
+        )
+    )
+    # Check and see if any annotation within the merged array is negative. If so,
+    # raise an error if set_negatives_to_zero is False or set the value to 0 if
+    # set_negatives_to_zero is True.
+    if operation == "diff":
+        negative_value_error_msg = (
+            "Negative values found in merged frequency array. Review data or set"
+            " `set_negatives_to_zero` to True to set negative values to 0."
+        )
+        callstat_ann.append("AF")
+        new_freq = new_freq.map(
+            lambda x: x.annotate(
+                **{
+                    ann: (
+                        hl.case()
+                        .when(set_negatives_to_zero, hl.max(x[ann], 0))
+                        .or_error(negative_value_error_msg)
+                    )
+                    for ann in callstat_ann
+                }
+            )
+        )
+
+    return new_freq, new_freq_meta
diff --git a/gnomad/utils/release.py b/gnomad/utils/release.py
@@ -1,15 +1,25 @@
 # noqa: D100
 
+import logging
 from typing import Dict, List, Optional
 
+import hail as hl
+
 from gnomad.resources.grch38.gnomad import (
     CURRENT_MAJOR_RELEASE,
     GROUPS,
     POPS,
     SEXES,
     SUBSETS,
 )
-from gnomad.utils.vcf import index_globals
+from gnomad.utils.vcf import SORT_ORDER, index_globals
+
+logging.basicConfig(
+    format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
+    datefmt="%m/%d/%Y %I:%M:%S %p",
+)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
 
 
 def make_faf_index_dict(
@@ -92,3 +102,52 @@ def _get_index(label_groups):
         )
 
     return index_dict
+
+
+def make_freq_index_dict_from_meta(
+    freq_meta: List[Dict[str, str]],
+    label_delimiter: str = "_",
+    sort_order: Optional[List[str]] = SORT_ORDER,
+) -> Dict[str, int]:
+    """
+    Create a dictionary for accessing frequency array.
+
+    The dictionary is keyed by the grouping combinations found in the frequency metadata
+    array, where values are the corresponding 0-based indices for the groupings in the
+    frequency array. For example, if the `freq_meta` entry [{'pop': 'nfe'}, {'sex': 'XX'}]
+    corresponds to the 5th entry in the frequency array, the returned dictionary entry
+    would be {'nfe_XX': 4}.
+
+    :param freq_meta: List of dictionaries containing frequency metadata.
+    :param label_delimiter: Delimiter to use when joining frequency metadata labels.
+    :param sort_order: List of frequency metadata labels to use when sorting the dictionary.
+    :return: Dictionary of frequency metadata.
+    """
+    # Confirm all groups in freq_meta are in sort_order. Warn user if not.
+    if sort_order is not None:
+        diff = hl.eval(hl.set(freq_meta.flatmap(lambda i: i.keys()))) - set(sort_order)
+        if diff:
+            logger.warning(
+                "Found unexpected frequency metadata groupings: %s. These groupings"
+                " are not present in the provided sort_order: %s. These groupings"
+                " will not be included in the returned dictionary.",
+                diff,
+                sort_order,
+            )
+
+    index_dict = {}
+    for i, f in enumerate(hl.eval(freq_meta)):
+        if sort_order is None or len(set(f.keys()) - set(sort_order)) < 1:
+            index_dict[
+                label_delimiter.join(
+                    [
+                        f[g]
+                        for g in sorted(
+                            f.keys(),
+                            key=(lambda x: sort_order.index(x)) if sort_order else None,
+                        )
+                    ]
+                )
+            ] = i
+
+    return index_dict