Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bug in process_consequences that was introduced when adding support for VEP without polyphen #710

Merged
merged 4 commits into from
Jun 14, 2024
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 119 additions & 42 deletions gnomad/utils/vep.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,24 +270,60 @@ def vep_or_lookup_vep(
return vep_ht.union(revep_ht)


def add_most_severe_consequence_to_consequence(
tc: hl.expr.StructExpression,
) -> hl.expr.StructExpression:
def get_most_severe_consequence_expr(
csq_expr: hl.expr.ArrayExpression,
csq_order: Optional[List[str]] = None,
) -> hl.expr.StringExpression:
"""
Add most_severe_consequence annotation to transcript consequences.
Get the most severe consequence from a collection of consequences.

This is for a given transcript, as there are often multiple annotations for a single
transcript: e.g. splice_region_variant&intron_variant -> splice_region_variant

This is for a given transcript, as there are often multiple annotations for a single transcript:
e.g. splice_region_variant&intron_variant -> splice_region_variant
:param csq_expr: ArrayExpression of consequences.
:param csq_order: Optional list indicating the order of VEP consequences, sorted
from high to low impact. Default is None, which uses the value of the
`CSQ_ORDER` global.
:return: Most severe consequence in `csq_expr`.
"""
csqs = hl.literal(CSQ_ORDER)
if csq_order is None:
csq_order = CSQ_ORDER
csqs = hl.literal(csq_order)

return tc.annotate(
most_severe_consequence=csqs.find(lambda c: tc.consequence_terms.contains(c))
)
return csqs.find(lambda c: csq_expr.contains(c))


def add_most_severe_consequence_to_consequence(
tc: Union[hl.expr.StructExpression, hl.expr.ArrayExpression],
csq_order: Optional[List[str]] = None,
most_severe_csq_field: str = "most_severe_consequence",
) -> Union[hl.expr.StructExpression, hl.expr.ArrayExpression]:
"""
Add a `most_severe_consequence` field to a transcript consequence or array of transcript consequences.

For a single transcript consequence, `tc` should be a StructExpression with a
`consequence_terms` field, e.g. Struct(consequence_terms=['missense_variant']).
For an array of transcript consequences, `tc` should be an ArrayExpression of
StructExpressions with a `consequence_terms` field.

:param tc: Transcript consequence or array of transcript consequences to annotate.
:param csq_order: Optional list indicating the order of VEP consequences, sorted
from high to low impact. Default is None, which uses the value of the
`CSQ_ORDER` global.
:param most_severe_csq_field: Field name to use for most severe consequence. Default
is 'most_severe_consequence'.
:return: Transcript consequence or array of transcript consequences annotated with
the most severe consequence.
"""
csq = lambda x: get_most_severe_consequence_expr(x.consequence_terms, csq_order)
if isinstance(tc, hl.expr.StructExpression):
return tc.annotate(**{most_severe_csq_field: csq(tc)})
else:
return tc.map(lambda x: x.annotate(**{most_severe_csq_field: csq(x)}))


def process_consequences(
mt: Union[hl.MatrixTable, hl.Table],
t: Union[hl.MatrixTable, hl.Table],
vep_root: str = "vep",
penalize_flags: bool = True,
csq_order: Optional[List[str]] = None,
Expand All @@ -298,12 +334,25 @@ def process_consequences(

`most_severe_consequence` is the worst consequence for a transcript.

Each transcript consequence is annotated with a `csq_score` which is a combination
of the index of the consequence's `most_severe_consequence` in `csq_order` and a
boost for loss-of-function consequences, and polyphen predictions if `has_polyphen`
is True.
jkgoodrich marked this conversation as resolved.
Show resolved Hide resolved

The score adjustment is as follows:
- lof == 'HC' & NO lof_flags (-1000 if penalize_flags, -500 if not)
- lof == 'HC' & lof_flags (-500)
- lof == 'OS' (-20)
- lof == 'LC' (-10)
- everything else (0)

.. note::

From gnomAD v4.0 on, the PolyPhen annotation was removed from the VEP Struct
in the release HTs. When using this function with gnomAD v4.0 or later,
set `has_polyphen` to False.

:param mt: Input Table or MatrixTable.
:param t: Input Table or MatrixTable.
:param vep_root: Root for VEP annotation (probably "vep").
:param penalize_flags: Whether to penalize LOFTEE flagged variants, or treat them
as equal to HC.
Expand All @@ -317,56 +366,84 @@ def process_consequences(
if csq_order is None:
csq_order = CSQ_ORDER
csqs = hl.literal(csq_order)

# Assign a score to each consequence based on the order in csq_order.
csq_dict = hl.literal(dict(zip(csq_order, range(len(csq_order)))))

def _csq_score(tc: hl.expr.StructExpression) -> int:
return csq_dict[tc.most_severe_consequence]
def _find_worst_transcript_consequence(
tcl: hl.expr.ArrayExpression,
) -> hl.expr.StructExpression:
"""
Find the worst transcript consequence in an array of transcript consequences.

flag_score = 500
no_flag_score = flag_score * (1 + penalize_flags)
:param tcl: Array of transcript consequences.
:return: Worst transcript consequence.
"""
flag = 500
no_flag = flag * (1 + penalize_flags)
Comment on lines +382 to +383
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also I know this is from before but since were in the process of revamping this module, I think the language around and then how we handle the logic in relation to those flags is counterintuitive, i.e the penalize_flag logic is actually a no_flag booster. These values seem arbitrary and I cant imagine anyone is actually using these score values themselves since LOF is deducted so far down. It would be more clear if the scores were no_flag=500, flag = 500/(1+penalize_flags).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I'm actually completely removing the scores in my next PR


def _csq_score_modifier(tc: hl.expr.StructExpression) -> float:
modifier = _csq_score(tc)
flag_condition = (tc.lof == "HC") & (tc.lof_flags != "")
modifier -= hl.if_else(flag_condition, flag_score, no_flag_score)
modifier -= hl.if_else(tc.lof == "OS", 20, 0)
modifier -= hl.if_else(tc.lof == "LC", 10, 0)
if has_polyphen:
modifier -= (
hl.case()
.when(tc.polyphen_prediction == "probably_damaging", 0.5)
.when(tc.polyphen_prediction == "possibly_damaging", 0.25)
.when(tc.polyphen_prediction == "benign", 0.1)
# Score each consequence based on the order in csq_order.
score_expr = tcl.map(
lambda tc: csq_dict[csqs.find(lambda x: x == tc.most_severe_consequence)]
)

# Determine the score adjustment based on the consequence's LOF and LOF flags.
sub_expr = tcl.map(
lambda tc: (
hl.case(missing_false=True)
.when((tc.lof == "HC") & hl.or_else(tc.lof_flags == "", True), no_flag)
.when((tc.lof == "HC") & (tc.lof_flags != ""), flag)
Comment on lines +394 to +395
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know this is unchanged from before but since no_flag and flag are only used once and you cant pass a value for flag, I think its more difficult to read with the variables, because you need to go back up to the assignment. I'd just do 500, 500 /(1 + penalize_flags). IF you agree with the value adjustment above.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just going to leave as is for now since my next PR will completely remove the use of the scores

.when(tc.lof == "OS", 20)
.when(tc.lof == "LC", 10)
.default(0)
)
return modifier
)

def find_worst_transcript_consequence(
tcl: hl.expr.ArrayExpression,
) -> hl.expr.StructExpression:
tcl = tcl.map(
lambda tc: tc.annotate(csq_score=_csq_score(tc) - _csq_score_modifier(tc))
# If requested, determine the score adjustment based on the consequence's
# PolyPhen prediction.
if has_polyphen:
polyphen_sub_expr = tcl.map(
lambda tc: (
hl.case(missing_false=True)
.when(tc.polyphen_prediction == "probably_damaging", 0.5)
.when(tc.polyphen_prediction == "possibly_damaging", 0.25)
.when(tc.polyphen_prediction == "benign", 0.1)
.default(0)
)
)
sub_expr = hl.map(lambda s, ps: s + ps, sub_expr, polyphen_sub_expr)

# Calculate the final consequence score.
tcl = hl.map(
lambda tc, s, ss: tc.annotate(csq_score=s - ss), tcl, score_expr, sub_expr
)

# Return the worst consequence based on the calculated score.
return hl.or_missing(hl.len(tcl) > 0, hl.sorted(tcl, lambda x: x.csq_score)[0])

transcript_csqs = mt[vep_root].transcript_consequences.map(
add_most_severe_consequence_to_consequence
# Annotate each transcript consequence with the 'most_severe_consequence'.
transcript_csqs = add_most_severe_consequence_to_consequence(
t[vep_root].transcript_consequences, csq_order
)

# Group transcript consequences by gene and find the worst consequence for each.
gene_dict = transcript_csqs.group_by(lambda tc: tc.gene_symbol)
worst_csq_gene = gene_dict.map_values(find_worst_transcript_consequence).values()
worst_csq_gene = gene_dict.map_values(_find_worst_transcript_consequence).values()
sorted_scores = hl.sorted(worst_csq_gene, key=lambda tc: tc.csq_score)

# Filter transcript consequences to only include canonical transcripts and find the
# worst consequence for each gene.
canonical = transcript_csqs.filter(lambda csq: csq.canonical == 1)
gene_canonical_dict = canonical.group_by(lambda tc: tc.gene_symbol)
worst_csq_gene_canonical = gene_canonical_dict.map_values(
find_worst_transcript_consequence
_find_worst_transcript_consequence
).values()
sorted_canonical_scores = hl.sorted(
worst_csq_gene_canonical, key=lambda tc: tc.csq_score
)

vep_data = mt[vep_root].annotate(
# Annotate the HT/MT with the worst consequence for each gene and variant.
vep_data = t[vep_root].annotate(
transcript_consequences=transcript_csqs,
worst_consequence_term=csqs.find(
lambda c: transcript_csqs.map(
Expand All @@ -384,9 +461,9 @@ def find_worst_transcript_consequence(
)

return (
mt.annotate_rows(**{vep_root: vep_data})
if isinstance(mt, hl.MatrixTable)
else mt.annotate(**{vep_root: vep_data})
t.annotate_rows(**{vep_root: vep_data})
if isinstance(t, hl.MatrixTable)
else t.annotate(**{vep_root: vep_data})
)


Expand Down
Loading