Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add generic constraint functions - annotate_mutation_type(), trimer_from_heptamer(), collapse_strand(), add_most_severe_csq_to_tc_within_vep_root() #474

Merged
merged 15 commits into from
Sep 22, 2022
127 changes: 127 additions & 0 deletions gnomad/utils/constraint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# noqa: D100

import logging
from typing import Union

import hail as hl

logging.basicConfig(
format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
datefmt="%m/%d/%Y %I:%M:%S %p",
)
logger = logging.getLogger("constraint_utils")
logger.setLevel(logging.INFO)


def annotate_mutation_type(
t: Union[hl.MatrixTable, hl.Table]
) -> Union[hl.MatrixTable, hl.Table]:
"""
Annotate mutation types.

The following annotations are added to the output Table:
- cpg
- transition
- mutation_type - one of "CpG", "non-CpG transition", or "transversion"
- mutation_type_model

:param t: Input Table or MatrixTable.
:return: Table with mutation type annotations added.
"""
# Determine the middle index of context by sampling the first 100 values of 'context'
context_lengths = list(filter(None, set(hl.len(t.context).take(100))))
if len(context_lengths) > 1:
raise ValueError(
"More than one length was found among the first 100 'context' values. Length of 'context' should be consistent."
)
else:
context_length = context_lengths[0]
logger.info("Detected a length of %d for context length", context_length)

if context_length == 3:
mid_index = 1
elif context_length == 7:
mid_index = 3
else:
raise ValueError(
f"The length of context should be either 3 or 7, instead of {context_length}."
)

transition_expr = hl.is_transition(t.ref, t.alt)
cpg_expr = (
(t.ref == "G") & (t.alt == "A") & (t.context[mid_index - 1 : mid_index] == "C")
) | (
(t.ref == "C")
& (t.alt == "T")
& (t.context[mid_index + 1 : mid_index + 2] == "G")
)
if isinstance(t, hl.MatrixTable):
t = t.annotate_rows(transition=transition_expr, cpg=cpg_expr)
else:
t = t.annotate(transition=transition_expr, cpg=cpg_expr)
mutation_type_expr = (
hl.case()
.when(t.cpg, "CpG")
.when(t.transition, "non-CpG transition")
.default("transversion")
)
mutation_type_model_expr = hl.if_else(t.cpg, t.context, "non-CpG")
if isinstance(t, hl.MatrixTable):
return t.annotate_rows(
mutation_type=mutation_type_expr,
mutation_type_model=mutation_type_model_expr,
)
else:
return t.annotate(
mutation_type=mutation_type_expr,
mutation_type_model=mutation_type_model_expr,
)


def trimer_from_heptamer(
t: Union[hl.MatrixTable, hl.Table]
) -> Union[hl.MatrixTable, hl.Table]:
"""
Trim heptamer context to create trimer context.

:param t: Input MatrixTable or Table with context annotation.
:return: MatrixTable or Table with trimer context annotated.
"""
trimer_expr = hl.if_else(hl.len(t.context) == 7, t.context[2:5], t.context)
return (
t.annotate_rows(context=trimer_expr)
if isinstance(t, hl.MatrixTable)
else t.annotate(context=trimer_expr)
)


def collapse_strand(
t: Union[hl.Table, hl.MatrixTable]
) -> Union[hl.Table, hl.MatrixTable]:
"""
Return the deduplicated context by collapsing DNA strands.

Function returns the reverse complement for 'ref, 'alt', and 'context' if the reference allele is either 'G' or 'T'.

The following annotations are added to the output Table:
- was_flipped - whether the 'ref, 'alt', and 'context' were flipped (reverse complement taken)

:param ht: Input Table.
:return: Table with deduplicated context annotation (ref, alt, context, was_flipped).
"""
ref_g_or_t_expr = (t.ref == "G") | (t.ref == "T")
collapse_expr = {
"ref": hl.if_else(ref_g_or_t_expr, hl.reverse_complement(t.ref), t.ref),
"alt": hl.if_else(ref_g_or_t_expr, hl.reverse_complement(t.alt), t.alt),
"context": hl.if_else(
ref_g_or_t_expr,
hl.reverse_complement(t.context),
t.context,
),
"was_flipped": ref_g_or_t_expr,
}
return (
t.annotate(**collapse_expr)
if isinstance(t, hl.Table)
else t.annotate_rows(**collapse_expr)
)
22 changes: 22 additions & 0 deletions gnomad/utils/vep.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,3 +556,25 @@ def _get_most_severe_csq(
)
.default(_get_most_severe_csq(ht.vep.intergenic_consequences, False))
)


def add_most_severe_csq_to_tc_within_vep_root(
t: Union[hl.Table, hl.MatrixTable], vep_root: str = "vep"
) -> Union[hl.Table, hl.MatrixTable]:
"""
Add most_severe_consequence annotation to 'transcript_consequences' within the vep root annotation.

:param t: Input Table or MatrixTable.
:param vep_root: Root for vep annotation (probably vep).
:return: Table or MatrixTable with most_severe_consequence annotation added.
"""
annotation = t[vep_root].annotate(
transcript_consequences=t[vep_root].transcript_consequences.map(
add_most_severe_consequence_to_consequence
)
)
return (
t.annotate_rows(**{vep_root: annotation})
if isinstance(t, hl.MatrixTable)
else t.annotate(**{vep_root: annotation})
)