From 77279e871c560b074b1b294540031904b00f90ff Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Fri, 8 Dec 2023 09:43:55 -0500 Subject: [PATCH 01/31] function to annotate variant with transcript expression --- gnomad/utils/transcript_annotation.py | 109 +++++++++++++++++++++++++- 1 file changed, 105 insertions(+), 4 deletions(-) diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py index b151cf4d6..94749e3e5 100644 --- a/gnomad/utils/transcript_annotation.py +++ b/gnomad/utils/transcript_annotation.py @@ -3,6 +3,8 @@ import hail as hl +from gnomad.utils.vep import add_most_severe_consequence_to_consequence + def summarize_rsem_mt( rsem_mt: hl.MatrixTable, @@ -82,13 +84,13 @@ def get_expression_proportion( """ Calculate the proportion of expression of transcript to gene per tissue. - :param transcript_ht: Table of summarized transcript expression by tissue - :param gene_ht: Table of summarized gene expression by tissue - :param tissues_to_filter: Optional list of tissues to filter out + :param transcript_ht: Table of summarized transcript expression by tissue. + :param gene_ht: Table of summarized gene expression by tissue. + :param tissues_to_filter: Optional list of tissues to filter out. :param tissue_as_row: If True, the input Table is with a row annotation for each tissue instead of an array of values. Default is False. :return: Table with expression proportion of transcript to gene per tissue - and mean expression proportion across tissues + and mean expression proportion across tissues. """ if tissue_as_row: tissues1 = list(gene_ht.row) @@ -145,3 +147,102 @@ def get_expression_proportion( ) return transcript_ht + + +def tx_annotate_variants( + variant_ht: hl.Table, + tx_ht: hl.Table, + tx_annotation_type: str = "expression", + filter_to_protein_coding: bool = True, + filter_to_genes: List[str] = None, + filter_to_csqs: List[str] = None, + filter_to_homs: bool = False, +) -> hl.Table: + """ + Annotate variants with transcript-based expression values or expression proportion from GTEx. + + :param variant_ht: Table of variants to annotate, it should contain at + least the following nested fields: `vep.transcript_consequences`, + `freq`. + :param tx_ht: Table of transcript expression information. + :param tx_annotation_type: Type of transcript annotation to add. Options are + 'expression' or 'expression_proportion'. Default is 'expression'. + :param filter_to_protein_coding: If True, filter to protein coding + transcripts. Default is True. + :param filter_to_genes: List of genes to filter to. + :param filter_to_csqs: List of consequence terms to filter to. + :param filter_to_homs: If True, filter to variants with at least one + homozygote in `freq` field. Default is False. + :return: MatrixTable with transcript expression information annotated + """ + # GTEx data has transcript IDs without version numbers, so we need to + # remove the version numbers from the transcript IDs in the variant table + tx_ht = tx_ht.key_by() + tx_ht = tx_ht.annotate(transcript_id=tx_ht.transcript_id.split("\\.")[0]) + tx_ht = tx_ht.key_by(tx_ht.transcript_id) + + variant_ht = variant_ht.annotate( + vep=variant_ht.vep.annotate( + transcript_consequences=variant_ht.vep.transcript_consequences.map( + add_most_severe_consequence_to_consequence + ) + ) + ) + + # Explode the transcript consequences to be able to key by transcript ID + variant_ht = variant_ht.explode(variant_ht.vep.transcript_consequences) + + if filter_to_protein_coding: + variant_ht = variant_ht.filter( + variant_ht.vep.transcript_consequences.biotype == "protein_coding" + ) + + if filter_to_genes: + variant_ht = variant_ht.filter( + hl.literal(filter_to_genes).contains( + variant_ht.vep.transcript_consequences.gene_id + ) + ) + + if filter_to_csqs: + variant_ht = variant_ht.filter( + hl.literal(filter_to_csqs).contains( + variant_ht.vep.transcript_consequences.most_severe_consequence + ) + ) + + if filter_to_homs: + variant_ht = variant_ht.filter(variant_ht.freq[0].homozygote_count > 0) + + # Annotate the variant table with the transcript expression information + variant_ht = variant_ht.annotate( + tx_data=tx_ht[variant_ht.vep.transcript_consequences.transcript_id] + ) + + # Aggregate the transcript expression information by gene, csq, etc. + tx_annot_ht = variant_ht.group_by( + locus=variant_ht.locus, + alleles=variant_ht.alleles, + gene_id=variant_ht.vep.transcript_consequences.gene_id, + gene_symbol=variant_ht.vep.transcript_consequences.gene_symbol, + csq=variant_ht.vep.transcript_consequences.most_severe_consequence, + lof=variant_ht.vep.transcript_consequences.lof, + lof_flags=variant_ht.vep.transcript_consequences.lof_flags, + ).aggregate( + tx_annotation=hl.agg.array_sum( + variant_ht.tx_data.transcript_expression + if tx_annotation_type == "expression" + else variant_ht.tx_data.exp_prop + ) + ) + + # Remove unnecessary global annotations and add tissue and + # expression_type in global annotations + tx_annot_ht = tx_annot_ht.select_globals().annotate_globals( + tissues=tx_ht.index_globals().tissues, + tx_annotation_type=tx_annotation_type, + ) + + tx_annot_ht = tx_annot_ht.key_by(tx_annot_ht.locus, tx_annot_ht.alleles) + + return tx_annot_ht From e9cfeb30e209fe1f2847de5b8608439c5f193c25 Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Fri, 8 Dec 2023 11:08:34 -0500 Subject: [PATCH 02/31] Aggregate mean_proportion --- gnomad/utils/transcript_annotation.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py index 94749e3e5..9e60011b5 100644 --- a/gnomad/utils/transcript_annotation.py +++ b/gnomad/utils/transcript_annotation.py @@ -152,7 +152,6 @@ def get_expression_proportion( def tx_annotate_variants( variant_ht: hl.Table, tx_ht: hl.Table, - tx_annotation_type: str = "expression", filter_to_protein_coding: bool = True, filter_to_genes: List[str] = None, filter_to_csqs: List[str] = None, @@ -165,8 +164,6 @@ def tx_annotate_variants( least the following nested fields: `vep.transcript_consequences`, `freq`. :param tx_ht: Table of transcript expression information. - :param tx_annotation_type: Type of transcript annotation to add. Options are - 'expression' or 'expression_proportion'. Default is 'expression'. :param filter_to_protein_coding: If True, filter to protein coding transcripts. Default is True. :param filter_to_genes: List of genes to filter to. @@ -220,27 +217,25 @@ def tx_annotate_variants( ) # Aggregate the transcript expression information by gene, csq, etc. + # TODO: Should we filter to only exonic variants since it's mostly exomes + # data? tx_annot_ht = variant_ht.group_by( + gene_id=variant_ht.vep.transcript_consequences.gene_id, locus=variant_ht.locus, alleles=variant_ht.alleles, - gene_id=variant_ht.vep.transcript_consequences.gene_id, gene_symbol=variant_ht.vep.transcript_consequences.gene_symbol, csq=variant_ht.vep.transcript_consequences.most_severe_consequence, lof=variant_ht.vep.transcript_consequences.lof, lof_flags=variant_ht.vep.transcript_consequences.lof_flags, ).aggregate( - tx_annotation=hl.agg.array_sum( - variant_ht.tx_data.transcript_expression - if tx_annotation_type == "expression" - else variant_ht.tx_data.exp_prop - ) + tx_annotation=hl.agg.array_sum(variant_ht.tx_data.transcript_expression), + mean_proportion=hl.agg.sum(variant_ht.tx_data.exp_prop_mean), ) # Remove unnecessary global annotations and add tissue and # expression_type in global annotations tx_annot_ht = tx_annot_ht.select_globals().annotate_globals( tissues=tx_ht.index_globals().tissues, - tx_annotation_type=tx_annotation_type, ) tx_annot_ht = tx_annot_ht.key_by(tx_annot_ht.locus, tx_annot_ht.alleles) From 373c83e28642b36bf2293a4643f93fe36b305a52 Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Mon, 18 Dec 2023 17:56:24 -0500 Subject: [PATCH 03/31] integrate pull_out_worst_from_tx_annotatate into the main function --- gnomad/utils/transcript_annotation.py | 45 ++++++++++++++------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py index 5ec120ae3..bf7300dc8 100644 --- a/gnomad/utils/transcript_annotation.py +++ b/gnomad/utils/transcript_annotation.py @@ -4,7 +4,7 @@ import hail as hl -from gnomad.utils.vep import add_most_severe_consequence_to_consequence +from gnomad.utils.vep import process_consequences logging.basicConfig( format="%(asctime)s (%(name)s %(lineno)s): %(message)s", @@ -144,6 +144,7 @@ def tx_annotate_variants( filter_to_genes: List[str] = None, filter_to_csqs: List[str] = None, filter_to_homs: bool = False, + vep_annotation: str = "transcript_consequences", ) -> hl.Table: """ Annotate variants with transcript-based expression values or expression proportion from GTEx. @@ -158,7 +159,14 @@ def tx_annotate_variants( :param filter_to_csqs: List of consequence terms to filter to. :param filter_to_homs: If True, filter to variants with at least one homozygote in `freq` field. Default is False. - :return: MatrixTable with transcript expression information annotated + :param vep_annotation: Name of annotation in 'vep' annotation, + one of the processed consequences: ["transcript_consequences", + "worst_csq_by_gene", "worst_csq_for_variant", + "worst_csq_by_gene_canonical", "worst_csq_for_variant_canonical"]. + For exapmle, if you want to annotate each variant with the worst + consequence per gene and the transcript expression, you would use + "worst_csq_by_gene". Default is "transcript_consequences". + :return: Table with transcript expression information annotated """ # GTEx data has transcript IDs without version numbers, so we need to # remove the version numbers from the transcript IDs in the variant table @@ -166,33 +174,26 @@ def tx_annotate_variants( tx_ht = tx_ht.annotate(transcript_id=tx_ht.transcript_id.split("\\.")[0]) tx_ht = tx_ht.key_by(tx_ht.transcript_id) - variant_ht = variant_ht.annotate( - vep=variant_ht.vep.annotate( - transcript_consequences=variant_ht.vep.transcript_consequences.map( - add_most_severe_consequence_to_consequence - ) - ) - ) + variant_ht = process_consequences(variant_ht) - # Explode the transcript consequences to be able to key by transcript ID - variant_ht = variant_ht.explode(variant_ht.vep.transcript_consequences) + # Explode the processed transcript consequences to be able to key by + # transcript ID + variant_ht = variant_ht.explode(variant_ht.vep[vep_annotation]) if filter_to_protein_coding: variant_ht = variant_ht.filter( - variant_ht.vep.transcript_consequences.biotype == "protein_coding" + variant_ht.vep[vep_annotation].biotype == "protein_coding" ) if filter_to_genes: variant_ht = variant_ht.filter( - hl.literal(filter_to_genes).contains( - variant_ht.vep.transcript_consequences.gene_id - ) + hl.literal(filter_to_genes).contains(variant_ht.vep[vep_annotation].gene_id) ) if filter_to_csqs: variant_ht = variant_ht.filter( hl.literal(filter_to_csqs).contains( - variant_ht.vep.transcript_consequences.most_severe_consequence + variant_ht.vep[vep_annotation].most_severe_consequence ) ) @@ -201,20 +202,20 @@ def tx_annotate_variants( # Annotate the variant table with the transcript expression information variant_ht = variant_ht.annotate( - tx_data=tx_ht[variant_ht.vep.transcript_consequences.transcript_id] + tx_data=tx_ht[variant_ht.vep[vep_annotation].transcript_id] ) # Aggregate the transcript expression information by gene, csq, etc. # TODO: Should we filter to only exonic variants since it's mostly exomes # data? tx_annot_ht = variant_ht.group_by( - gene_id=variant_ht.vep.transcript_consequences.gene_id, + gene_id=variant_ht.vep[vep_annotation].gene_id, locus=variant_ht.locus, alleles=variant_ht.alleles, - gene_symbol=variant_ht.vep.transcript_consequences.gene_symbol, - csq=variant_ht.vep.transcript_consequences.most_severe_consequence, - lof=variant_ht.vep.transcript_consequences.lof, - lof_flags=variant_ht.vep.transcript_consequences.lof_flags, + gene_symbol=variant_ht.vep[vep_annotation].gene_symbol, + csq=variant_ht.vep[vep_annotation].most_severe_consequence, + lof=variant_ht.vep[vep_annotation].lof, + lof_flags=variant_ht.vep[vep_annotation].lof_flags, ).aggregate( tx_annotation=hl.agg.array_sum(variant_ht.tx_data.transcript_expression), mean_proportion=hl.agg.sum(variant_ht.tx_data.exp_prop_mean), From 21b551993967ab633eda89d328d590079eba85a6 Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Mon, 18 Dec 2023 20:34:37 -0500 Subject: [PATCH 04/31] Edit docstring --- gnomad/utils/transcript_annotation.py | 6 +++--- gnomad/utils/vep.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py index bf7300dc8..1ecfa89f5 100644 --- a/gnomad/utils/transcript_annotation.py +++ b/gnomad/utils/transcript_annotation.py @@ -163,9 +163,9 @@ def tx_annotate_variants( one of the processed consequences: ["transcript_consequences", "worst_csq_by_gene", "worst_csq_for_variant", "worst_csq_by_gene_canonical", "worst_csq_for_variant_canonical"]. - For exapmle, if you want to annotate each variant with the worst - consequence per gene and the transcript expression, you would use - "worst_csq_by_gene". Default is "transcript_consequences". + For example, if you want to annotate each variant with the worst + consequence in each gene it falls on and the transcript expression, + you would use "worst_csq_by_gene". Default is "transcript_consequences". :return: Table with transcript expression information annotated """ # GTEx data has transcript IDs without version numbers, so we need to diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 0563ef6a4..166dabaab 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -292,7 +292,7 @@ def process_consequences( def find_worst_transcript_consequence( tcl: hl.expr.ArrayExpression, ) -> hl.expr.StructExpression: - """Get worst transcript_consequence from an array of em.""" + """Get the worst transcript_consequence from an array of all annotated transcript consequences.""" flag_score = 500 no_flag_score = flag_score * (1 + penalize_flags) From 0f898e5c875f5eabba2506ce2f9834ddb6ffc500 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 19 Dec 2023 09:40:04 -0700 Subject: [PATCH 05/31] Suggested changes to `tx_annotate_variants` --- gnomad/utils/transcript_annotation.py | 103 +++++++++++--------------- 1 file changed, 43 insertions(+), 60 deletions(-) diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py index 5ec120ae3..6c75cf487 100644 --- a/gnomad/utils/transcript_annotation.py +++ b/gnomad/utils/transcript_annotation.py @@ -141,9 +141,13 @@ def tx_annotate_variants( variant_ht: hl.Table, tx_ht: hl.Table, filter_to_protein_coding: bool = True, - filter_to_genes: List[str] = None, - filter_to_csqs: List[str] = None, - filter_to_homs: bool = False, + tissues_to_filter: Optional[List[str]] = None, + additional_group_by: Optional[Union[Tuple[str], List[str]]] = ( + "gene_symbol", + "most_severe_consequence", + "lof", + "lof_flags", + ), ) -> hl.Table: """ Annotate variants with transcript-based expression values or expression proportion from GTEx. @@ -154,76 +158,55 @@ def tx_annotate_variants( :param tx_ht: Table of transcript expression information. :param filter_to_protein_coding: If True, filter to protein coding transcripts. Default is True. - :param filter_to_genes: List of genes to filter to. - :param filter_to_csqs: List of consequence terms to filter to. - :param filter_to_homs: If True, filter to variants with at least one - homozygote in `freq` field. Default is False. - :return: MatrixTable with transcript expression information annotated + :param tissues_to_filter: Optional list of tissues to exclude from the output. + :param additional_group_by: Optional list of additional fields to group by before + sum aggregation. + :return: Input Table with transcript expression information annotated. """ - # GTEx data has transcript IDs without version numbers, so we need to - # remove the version numbers from the transcript IDs in the variant table - tx_ht = tx_ht.key_by() - tx_ht = tx_ht.annotate(transcript_id=tx_ht.transcript_id.split("\\.")[0]) - tx_ht = tx_ht.key_by(tx_ht.transcript_id) - - variant_ht = variant_ht.annotate( - vep=variant_ht.vep.annotate( - transcript_consequences=variant_ht.vep.transcript_consequences.map( - add_most_severe_consequence_to_consequence - ) + # Filter to tissues of interest and convert to arrays for easy aggregation. + tx_ht = tissue_expression_ht_to_array(tx_ht, tissues_to_filter=tissues_to_filter) + agg_annotations = list(tx_ht.row_value) + tissues = hl.eval(tx_ht.tissues) + + # Calculate the mean expression proportion across all tissues. + tx_ht = tx_ht.annotate(exp_prop_mean=hl.mean(tx_ht.expression_proportion)) + + # Add the most severe consequence to the transcript consequences. + variant_ht = variant_ht.select( + transcript_consequences=variant_ht.vep.transcript_consequences.map( + add_most_severe_consequence_to_consequence ) ) - # Explode the transcript consequences to be able to key by transcript ID - variant_ht = variant_ht.explode(variant_ht.vep.transcript_consequences) + # Explode the transcript consequences to be able to key by transcript ID. + variant_ht = variant_ht.explode(variant_ht.transcript_consequences) if filter_to_protein_coding: variant_ht = variant_ht.filter( - variant_ht.vep.transcript_consequences.biotype == "protein_coding" + variant_ht.transcript_consequences.biotype == "protein_coding" ) - if filter_to_genes: - variant_ht = variant_ht.filter( - hl.literal(filter_to_genes).contains( - variant_ht.vep.transcript_consequences.gene_id - ) - ) - - if filter_to_csqs: - variant_ht = variant_ht.filter( - hl.literal(filter_to_csqs).contains( - variant_ht.vep.transcript_consequences.most_severe_consequence - ) - ) - - if filter_to_homs: - variant_ht = variant_ht.filter(variant_ht.freq[0].homozygote_count > 0) - - # Annotate the variant table with the transcript expression information - variant_ht = variant_ht.annotate( - tx_data=tx_ht[variant_ht.vep.transcript_consequences.transcript_id] + grouping = ["transcript_id", "gene_id"] + list(additional_group_by) + variant_ht = variant_ht.select( + **{a: variant_ht.transcript_consequences[a] for a in grouping} ) - # Aggregate the transcript expression information by gene, csq, etc. - # TODO: Should we filter to only exonic variants since it's mostly exomes - # data? - tx_annot_ht = variant_ht.group_by( - gene_id=variant_ht.vep.transcript_consequences.gene_id, - locus=variant_ht.locus, - alleles=variant_ht.alleles, - gene_symbol=variant_ht.vep.transcript_consequences.gene_symbol, - csq=variant_ht.vep.transcript_consequences.most_severe_consequence, - lof=variant_ht.vep.transcript_consequences.lof, - lof_flags=variant_ht.vep.transcript_consequences.lof_flags, - ).aggregate( - tx_annotation=hl.agg.array_sum(variant_ht.tx_data.transcript_expression), - mean_proportion=hl.agg.sum(variant_ht.tx_data.exp_prop_mean), + # Aggregate the transcript expression information by gene_id and annotation in + # additional_group_by. + variant_to_tx = tx_ht[variant_ht.transcript_id, variant_ht.gene_id] + grouping = ["locus", "alleles", "gene_id"] + list(additional_group_by) + tx_annot_ht = variant_ht.group_by(*grouping).aggregate( + **{a: hl.agg.array_sum(variant_to_tx[a]) for a in agg_annotations}, + exp_prop_mean=hl.agg.sum(variant_to_tx.exp_prop_mean), ) - # Remove unnecessary global annotations and add tissue and - # expression_type in global annotations - tx_annot_ht = tx_annot_ht.select_globals().annotate_globals( - tissues=tx_ht.index_globals().tissues, + # Reformat the transcript expression information to be a struct per tissue. + tx_annot_ht = tx_annot_ht.select( + "exp_prop_mean", + **{ + t: hl.struct(**{a: tx_annot_ht[a][i] for a in agg_annotations}) + for i, t in enumerate(tissues) + }, ) tx_annot_ht = tx_annot_ht.key_by(tx_annot_ht.locus, tx_annot_ht.alleles) From 9e9a2835f893478300812bc6e8d63893fcf8fedd Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 19 Dec 2023 09:53:24 -0700 Subject: [PATCH 06/31] A little more clean up --- gnomad/utils/transcript_annotation.py | 31 +++++++++++---------------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py index a835306fb..2b5eae7d3 100644 --- a/gnomad/utils/transcript_annotation.py +++ b/gnomad/utils/transcript_annotation.py @@ -138,7 +138,7 @@ def get_expression_proportion( def tx_annotate_variants( - variant_ht: hl.Table, + ht: hl.Table, tx_ht: hl.Table, filter_to_protein_coding: bool = True, tissues_to_filter: Optional[List[str]] = None, @@ -153,9 +153,8 @@ def tx_annotate_variants( """ Annotate variants with transcript-based expression values or expression proportion from GTEx. - :param variant_ht: Table of variants to annotate, it should contain at - least the following nested fields: `vep.transcript_consequences`, - `freq`. + :param ht: Table of variants to annotate, it should contain at least the following + nested fields: `vep.transcript_consequences`, `freq`. :param tx_ht: Table of transcript expression information. :param filter_to_protein_coding: If True, filter to protein coding transcripts. Default is True. @@ -179,40 +178,36 @@ def tx_annotate_variants( # Calculate the mean expression proportion across all tissues. tx_ht = tx_ht.annotate(exp_prop_mean=hl.mean(tx_ht.expression_proportion)) - variant_ht = process_consequences(variant_ht) + ht = process_consequences(ht) # Explode the processed transcript consequences to be able to key by # transcript ID - variant_ht = variant_ht.explode(variant_ht.vep[vep_annotation]) + ht = ht.explode(ht.vep[vep_annotation]) if filter_to_protein_coding: - variant_ht = variant_ht.filter( - variant_ht.vep[vep_annotation].biotype == "protein_coding" - ) + ht = ht.filter(ht.vep[vep_annotation].biotype == "protein_coding") grouping = ["transcript_id", "gene_id"] + list(additional_group_by) - variant_ht = variant_ht.select( - **{a: variant_ht.transcript_consequences[a] for a in grouping} - ) + ht = ht.select(**{a: ht.vep[vep_annotation][a] for a in grouping}) # Aggregate the transcript expression information by gene_id and annotation in # additional_group_by. - variant_to_tx = tx_ht[variant_ht.transcript_id, variant_ht.gene_id] + variant_to_tx = tx_ht[ht.transcript_id, ht.gene_id] grouping = ["locus", "alleles", "gene_id"] + list(additional_group_by) - tx_annot_ht = variant_ht.group_by(*grouping).aggregate( + ht = ht.group_by(*grouping).aggregate( **{a: hl.agg.array_sum(variant_to_tx[a]) for a in agg_annotations}, exp_prop_mean=hl.agg.sum(variant_to_tx.exp_prop_mean), ) # Reformat the transcript expression information to be a struct per tissue. - tx_annot_ht = tx_annot_ht.select( + ht = ht.select( "exp_prop_mean", **{ - t: hl.struct(**{a: tx_annot_ht[a][i] for a in agg_annotations}) + t: hl.struct(**{a: ht[a][i] for a in agg_annotations}) for i, t in enumerate(tissues) }, ) - tx_annot_ht = tx_annot_ht.key_by(tx_annot_ht.locus, tx_annot_ht.alleles) + ht = ht.key_by(ht.locus, ht.alleles) - return tx_annot_ht + return ht From 42714b8aba1da4f496f3e570c5eec2db61b75531 Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Wed, 20 Dec 2023 12:30:55 -0500 Subject: [PATCH 07/31] remove redundant content --- gnomad/utils/transcript_annotation.py | 43 --------------------------- 1 file changed, 43 deletions(-) diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py index 9a37a2698..6de624898 100644 --- a/gnomad/utils/transcript_annotation.py +++ b/gnomad/utils/transcript_annotation.py @@ -175,49 +175,6 @@ def tissue_expression_ht_to_array( return ht -def get_expression_proportion( - transcript_ht: hl.Table, - gene_ht: hl.Table, - tissues_to_filter: Optional[List[str]] = None, -) -> hl.Table: - """ - Calculate the proportion of expression of transcript to gene per tissue. - - :param transcript_ht: Table of summarized transcript expression by tissue. - :param gene_ht: Table of summarized gene expression by tissue. - :param tissues_to_filter: Optional list of tissues to filter out - :return: Table with expression proportion of transcript to gene per tissue - and mean expression proportion across tissues. - """ - transcript_ht = tissue_expression_ht_to_array( - transcript_ht, tissues_to_filter=tissues_to_filter - ) - gene_ht = tissue_expression_ht_to_array( - gene_ht, tissues=hl.eval(transcript_ht.tissues) - ) - - # Join the transcript expression table and gene expression table. - transcript_ht = transcript_ht.annotate( - gene_expression=gene_ht[transcript_ht.gene_id].tissue_expression - ) - - # Calculate the proportion of expression of transcript to gene per tissue. - transcript_ht = transcript_ht.annotate( - exp_prop=hl.or_else( - transcript_ht.transcript_expression / transcript_ht.gene_expression, - hl.empty_array(hl.tfloat64), - ), - ) - # Calculate the mean expression proportion across tissues. - transcript_ht = transcript_ht.annotate( - exp_prop_mean=hl.mean( - hl.filter(lambda e: ~hl.is_nan(e), transcript_ht.exp_prop), - ) - ) - - return transcript_ht - - def tx_annotate_variants( ht: hl.Table, tx_ht: hl.Table, From 5a096170d8d2dae166f648870f0c4d04f699259b Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Thu, 21 Dec 2023 12:50:52 -0500 Subject: [PATCH 08/31] split into small functions: preprocess, annotate and aggregate --- gnomad/utils/transcript_annotation.py | 130 +++++++++++++++++++++----- 1 file changed, 105 insertions(+), 25 deletions(-) diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py index 6de624898..a53a90614 100644 --- a/gnomad/utils/transcript_annotation.py +++ b/gnomad/utils/transcript_annotation.py @@ -175,10 +175,80 @@ def tissue_expression_ht_to_array( return ht +def preprocess_variants_for_tx( + ht: hl.Table, + filter_to_genes: Optional[Union[Tuple[str], List[str]]] = None, + filter_to_csqs: Optional[Union[Tuple[str], List[str]]] = None, + ignore_splicing: bool = True, + filter_to_homs: bool = False, + filter_to_protein_coding: bool = True, + vep_annotation: str = "transcript_consequences", +) -> hl.Table: + """ + Filter variants to those that fall on transcripts of interest. + + :param ht: Table of variants to filter, it should contain at least the + following nested fields: `vep.transcript_consequences`, `freq`. + :param filter_to_genes: List of genes to filter to. + :param filter_to_csqs: List of consequence terms to filter to. + :param ignore_splicing: If True, ignore splice variants. Default is True. + :param filter_to_homs: If True, filter to variants with at least one + homozygote in `freq` field. Default is False. + :param filter_to_protein_coding: If True, filter to protein coding + transcripts. Default is True. + :param vep_annotation: Name of annotation in 'vep' annotation, + one of the processed consequences: ["transcript_consequences", + "worst_csq_by_gene", "worst_csq_for_variant", + "worst_csq_by_gene_canonical", "worst_csq_for_variant_canonical"]. + For example, if you want to annotate each variant with the worst + consequence in each gene it falls on and the transcript expression, + you would use "worst_csq_by_gene". Default is "transcript_consequences". + :return: Table of variants that fall on transcripts of interest. + """ + # TODO: Filter to only CDS regions? + ht = ht.select(freq=ht.freq, vep=ht.vep) + + ht = process_consequences(ht) + ht = ht.select(freq=ht.freq, vep_processed_csqs=ht.vep[vep_annotation]) + + # Explode the processed transcript consequences to be able to key by + # transcript ID + ht = ht.explode(ht.vep_processed_csqs) + + if filter_to_genes: + ht = ht.filter( + hl.literal(filter_to_genes).contains(ht.vep_processed_csqs.gene_id) + ) + + if filter_to_csqs: + ht = ht.filter( + hl.literal(filter_to_csqs).contains( + ht.vep_processed_csqs.most_severe_consequence + ) + ) + + # TODO: Need to modify process consequences to ignore splice variants, + # because these can occur on intronic regions? + splicing = hl.literal( + ["splice_acceptor_variant", "splice_donor_variant", "splice_region_variant"] + ) + if ignore_splicing: + ht = ht.filter( + ~splicing.contains(ht.vep_processed_csqs.most_severe_consequence) + ) + + if filter_to_homs: + ht = ht.filter(ht.freq[0].homozygote_count > 0) + + if filter_to_protein_coding: + ht = ht.filter(ht.vep_processed_csqs.biotype == "protein_coding") + + return ht + + def tx_annotate_variants( ht: hl.Table, tx_ht: hl.Table, - filter_to_protein_coding: bool = True, tissues_to_filter: Optional[List[str]] = None, additional_group_by: Optional[Union[Tuple[str], List[str]]] = ( "gene_symbol", @@ -186,7 +256,6 @@ def tx_annotate_variants( "lof", "lof_flags", ), - vep_annotation: str = "transcript_consequences", ) -> hl.Table: """ Annotate variants with transcript-based expression values or expression proportion from GTEx. @@ -194,47 +263,58 @@ def tx_annotate_variants( :param ht: Table of variants to annotate, it should contain at least the following nested fields: `vep.transcript_consequences`, `freq`. :param tx_ht: Table of transcript expression information. - :param filter_to_protein_coding: If True, filter to protein coding - transcripts. Default is True. :param tissues_to_filter: Optional list of tissues to exclude from the output. :param additional_group_by: Optional list of additional fields to group by before sum aggregation. - :param vep_annotation: Name of annotation in 'vep' annotation, - one of the processed consequences: ["transcript_consequences", - "worst_csq_by_gene", "worst_csq_for_variant", - "worst_csq_by_gene_canonical", "worst_csq_for_variant_canonical"]. - For example, if you want to annotate each variant with the worst - consequence in each gene it falls on and the transcript expression, - you would use "worst_csq_by_gene". Default is "transcript_consequences". :return: Input Table with transcript expression information annotated. """ # Filter to tissues of interest and convert to arrays for easy aggregation. tx_ht = tissue_expression_ht_to_array(tx_ht, tissues_to_filter=tissues_to_filter) - agg_annotations = list(tx_ht.row_value) - tissues = hl.eval(tx_ht.tissues) # Calculate the mean expression proportion across all tissues. tx_ht = tx_ht.annotate(exp_prop_mean=hl.mean(tx_ht.expression_proportion)) - ht = process_consequences(ht) + grouping = ["transcript_id", "gene_id"] + list(additional_group_by) + ht = ht.select(**{a: ht.vep_processed_csqs[a] for a in grouping}) - # Explode the processed transcript consequences to be able to key by - # transcript ID - ht = ht.explode(ht.vep[vep_annotation]) + ht = ht.annotate(**tx_ht[ht.transcript_id, ht.gene_id]) + ht = ht.annotate_globals(tissues=tx_ht.index_globals().tissues) - if filter_to_protein_coding: - ht = ht.filter(ht.vep[vep_annotation].biotype == "protein_coding") + return ht - grouping = ["transcript_id", "gene_id"] + list(additional_group_by) - ht = ht.select(**{a: ht.vep[vep_annotation][a] for a in grouping}) + +def tx_aggregate_variants( + ht: hl.Table, + agg_annotations: Optional[List[str]] = ( + "transcript_expression", + "expression_proportion", + ), + additional_group_by: Optional[Union[Tuple[str], List[str]]] = ( + "gene_symbol", + "most_severe_consequence", + "lof", + "lof_flags", + ), +) -> hl.Table: + """ + Aggregate transcript-based expression values or expression proportion from GTEx. + + :param ht: Table of variants annotated with transcript expression information. + :param agg_annotations: Optional list of annotations to aggregate. Default is + ['transcript_expression', 'expression_proportion', 'exp_prop_mean']. + :param additional_group_by: Optional list of additional fields to group by before + sum aggregation. + :return: Table of variants with transcript expression information aggregated. + """ + tissues = hl.eval(ht.tissues) + # TODO: to key by only locus to get base-level annotation + grouping = ["locus", "alleles", "gene_id"] + list(additional_group_by) # Aggregate the transcript expression information by gene_id and annotation in # additional_group_by. - variant_to_tx = tx_ht[ht.transcript_id, ht.gene_id] - grouping = ["locus", "alleles", "gene_id"] + list(additional_group_by) ht = ht.group_by(*grouping).aggregate( - **{a: hl.agg.array_sum(variant_to_tx[a]) for a in agg_annotations}, - exp_prop_mean=hl.agg.sum(variant_to_tx.exp_prop_mean), + **{a: hl.agg.array_sum(ht[a]) for a in agg_annotations}, + exp_prop_mean=hl.agg.sum(ht.exp_prop_mean), ) # Reformat the transcript expression information to be a struct per tissue. From df86088afa816a6e4c3f5c29182e90e3aabb77d2 Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Thu, 21 Dec 2023 13:50:17 -0500 Subject: [PATCH 09/31] add TODOs --- gnomad/utils/transcript_annotation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py index a53a90614..fb80a9e9d 100644 --- a/gnomad/utils/transcript_annotation.py +++ b/gnomad/utils/transcript_annotation.py @@ -4,7 +4,7 @@ import hail as hl -from gnomad.utils.vep import process_consequences +from gnomad.utils.vep import CSQ_CODING_HIGH_IMPACT, process_consequences logging.basicConfig( format="%(asctime)s (%(name)s %(lineno)s): %(message)s", @@ -307,7 +307,7 @@ def tx_aggregate_variants( :return: Table of variants with transcript expression information aggregated. """ tissues = hl.eval(ht.tissues) - # TODO: to key by only locus to get base-level annotation + # TODO: group_by and key_by locus to get base-level annotation grouping = ["locus", "alleles", "gene_id"] + list(additional_group_by) # Aggregate the transcript expression information by gene_id and annotation in From 86852b1e39eed19731bd9888911e305534b006e8 Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Thu, 21 Dec 2023 15:19:51 -0500 Subject: [PATCH 10/31] add option when 'freq' is not in Table --- gnomad/utils/transcript_annotation.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py index fb80a9e9d..75bdab7c7 100644 --- a/gnomad/utils/transcript_annotation.py +++ b/gnomad/utils/transcript_annotation.py @@ -4,7 +4,7 @@ import hail as hl -from gnomad.utils.vep import CSQ_CODING_HIGH_IMPACT, process_consequences +from gnomad.utils.vep import process_consequences logging.basicConfig( format="%(asctime)s (%(name)s %(lineno)s): %(message)s", @@ -206,10 +206,18 @@ def preprocess_variants_for_tx( :return: Table of variants that fall on transcripts of interest. """ # TODO: Filter to only CDS regions? - ht = ht.select(freq=ht.freq, vep=ht.vep) - - ht = process_consequences(ht) - ht = ht.select(freq=ht.freq, vep_processed_csqs=ht.vep[vep_annotation]) + if "freq" not in list(ht.row_value): + logger.info( + "No 'freq' field found in input Table. You cannot use the filter_to_homs" + " option." + ) + ht = ht.select(vep=ht.vep) + ht = process_consequences(ht) + ht = ht.select(vep_processed_csqs=ht.vep[vep_annotation]) + else: + ht = ht.select(freq=ht.freq, vep=ht.vep) + ht = process_consequences(ht) + ht = ht.select(freq=ht.freq, vep_processed_csqs=ht.vep[vep_annotation]) # Explode the processed transcript consequences to be able to key by # transcript ID From 0cf1829e8f45eed99a8fffbed3e8e2058d92834a Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 26 Dec 2023 11:36:33 -0700 Subject: [PATCH 11/31] Suggestions to tx_annotate_mt --- gnomad/utils/transcript_annotation.py | 131 ++++++++++++++++++++------ gnomad/utils/vep.py | 117 ++++++++++++++++++++--- 2 files changed, 204 insertions(+), 44 deletions(-) diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py index 6de624898..829109bfa 100644 --- a/gnomad/utils/transcript_annotation.py +++ b/gnomad/utils/transcript_annotation.py @@ -4,7 +4,11 @@ import hail as hl -from gnomad.utils.vep import process_consequences +from gnomad.utils.vep import ( + explode_by_vep_annotation, + filter_vep_transcript_csqs, + process_consequences, +) logging.basicConfig( format="%(asctime)s (%(name)s %(lineno)s): %(message)s", @@ -175,17 +179,66 @@ def tissue_expression_ht_to_array( return ht +def preprocess_variants_for_tx( + ht: hl.Table, + filter_to_genes: Optional[List[str]] = None, + match_by_gene_symbol: bool = False, + filter_to_csqs: Optional[List[str]] = None, + ignore_splicing: bool = True, + filter_to_protein_coding: bool = True, + vep_root: str = "vep", +) -> hl.Table: + """ + Prepare a Table of transcript expression information for annotation. + + :param ht: Table of transcript expression information. + :param filter_to_genes: Optional list of genes to filter to. Default is None. + :param match_by_gene_symbol: Whether to match by gene symbol instead of gene ID. + Default is False. + :param filter_to_csqs: Optional list of consequences to filter to. Default is None. + :param ignore_splicing: If True, ignore splice variants. Default is True. + :param filter_to_protein_coding: Whether to filter to protein coding transcripts. + Default is True. + :param vep_root: Name used for root VEP annotation. Default is 'vep'. + :return: Table of transcript expression information prepared for annotation. + """ + # TODO: Filter to only CDS regions? + + splice_csqs = [ + "splice_acceptor_variant", + "splice_donor_variant", + "splice_region_variant", + ] + if filter_to_csqs is not None: + logger.info("Adding most severe consequence to VEP transcript consequences...") + ht = process_consequences(ht, vep_root=vep_root) + keep_csqs = True + if ignore_splicing: + filter_to_csqs = [csq for csq in filter_to_csqs if csq not in splice_csqs] + elif ignore_splicing: + # TODO: Need to modify process consequences to ignore splice variants, + # because these can occur on intronic regions? + filter_to_csqs = splice_csqs + keep_csqs = False + + return filter_vep_transcript_csqs( + ht, + vep_root=vep_root, + synonymous=False, + canonical=False, + protein_coding=filter_to_protein_coding, + csqs=filter_to_csqs, + keep_csqs=keep_csqs, + genes=filter_to_genes, + match_by_gene_symbol=match_by_gene_symbol, + ) + + def tx_annotate_variants( ht: hl.Table, tx_ht: hl.Table, - filter_to_protein_coding: bool = True, tissues_to_filter: Optional[List[str]] = None, - additional_group_by: Optional[Union[Tuple[str], List[str]]] = ( - "gene_symbol", - "most_severe_consequence", - "lof", - "lof_flags", - ), + vep_root: str = "vep", vep_annotation: str = "transcript_consequences", ) -> hl.Table: """ @@ -194,11 +247,8 @@ def tx_annotate_variants( :param ht: Table of variants to annotate, it should contain at least the following nested fields: `vep.transcript_consequences`, `freq`. :param tx_ht: Table of transcript expression information. - :param filter_to_protein_coding: If True, filter to protein coding - transcripts. Default is True. :param tissues_to_filter: Optional list of tissues to exclude from the output. - :param additional_group_by: Optional list of additional fields to group by before - sum aggregation. + :param vep_root: Name used for root VEP annotation. Default is 'vep'. :param vep_annotation: Name of annotation in 'vep' annotation, one of the processed consequences: ["transcript_consequences", "worst_csq_by_gene", "worst_csq_for_variant", @@ -208,33 +258,56 @@ def tx_annotate_variants( you would use "worst_csq_by_gene". Default is "transcript_consequences". :return: Input Table with transcript expression information annotated. """ - # Filter to tissues of interest and convert to arrays for easy aggregation. - tx_ht = tissue_expression_ht_to_array(tx_ht, tissues_to_filter=tissues_to_filter) - agg_annotations = list(tx_ht.row_value) - tissues = hl.eval(tx_ht.tissues) + # Filter to tissues of interest. + tx_ht = filter_expression_ht_by_tissues(tx_ht, tissues_to_filter=tissues_to_filter) + tissues = list(tx_ht.row_value) # Calculate the mean expression proportion across all tissues. - tx_ht = tx_ht.annotate(exp_prop_mean=hl.mean(tx_ht.expression_proportion)) - - ht = process_consequences(ht) + tx_ht = tx_ht.annotate( + exp_prop_mean=hl.mean([tx_ht[t].expression_proportion for t in tissues]) + ) # Explode the processed transcript consequences to be able to key by - # transcript ID - ht = ht.explode(ht.vep[vep_annotation]) + # transcript ID. + ht = explode_by_vep_annotation(ht, vep_annotation=vep_annotation, vep_root=vep_root) + ht = ht.annotate(**tx_ht[ht.transcript_id, ht.gene_id]) + ht = ht.annotate_globals(tissues=tissues) - if filter_to_protein_coding: - ht = ht.filter(ht.vep[vep_annotation].biotype == "protein_coding") + return ht + + +def tx_aggregate_variants( + ht: hl.Table, + agg_annotations: Optional[List[str]] = ( + "transcript_expression", + "expression_proportion", + ), + additional_group_by: Optional[Union[Tuple[str], List[str]]] = ( + "gene_symbol", + "most_severe_consequence", + "lof", + "lof_flags", + ), +) -> hl.Table: + """ + Aggregate transcript-based expression values or expression proportion from GTEx. - grouping = ["transcript_id", "gene_id"] + list(additional_group_by) - ht = ht.select(**{a: ht.vep[vep_annotation][a] for a in grouping}) + :param ht: Table of variants annotated with transcript expression information. + :param agg_annotations: Optional list of annotations to aggregate. Default is + ['transcript_expression', 'expression_proportion', 'exp_prop_mean']. + :param additional_group_by: Optional list of additional fields to group by before + sum aggregation. + :return: Table of variants with transcript expression information aggregated. + """ + tissues = hl.eval(ht.tissues) + # TODO: group_by and key_by locus to get base-level annotation + grouping = ["locus", "alleles", "gene_id"] + list(additional_group_by) # Aggregate the transcript expression information by gene_id and annotation in # additional_group_by. - variant_to_tx = tx_ht[ht.transcript_id, ht.gene_id] - grouping = ["locus", "alleles", "gene_id"] + list(additional_group_by) ht = ht.group_by(*grouping).aggregate( - **{a: hl.agg.array_sum(variant_to_tx[a]) for a in agg_annotations}, - exp_prop_mean=hl.agg.sum(variant_to_tx.exp_prop_mean), + **{a: hl.agg.array_sum(ht[a]) for a in agg_annotations}, + exp_prop_mean=hl.agg.sum(ht.exp_prop_mean), ) # Reformat the transcript expression information to be a struct per tissue. diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index f30494de1..df9f5fe9e 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -275,19 +275,26 @@ def process_consequences( mt: Union[hl.MatrixTable, hl.Table], vep_root: str = "vep", penalize_flags: bool = True, + csq_order: Optional[List[str]] = None, ) -> Union[hl.MatrixTable, hl.Table]: """ Add most_severe_consequence into [vep_root].transcript_consequences, and worst_csq_by_gene, any_lof into [vep_root]. `most_severe_consequence` is the worst consequence for a transcript. - :param mt: Input MT - :param vep_root: Root for vep annotation (probably vep) - :param penalize_flags: Whether to penalize LOFTEE flagged variants, or treat them as equal to HC - :return: MT with better formatted consequences + :param mt: Input MT. + :param vep_root: Root for vep annotation (probably vep). + :param penalize_flags: Whether to penalize LOFTEE flagged variants, or treat them + as equal to HC. + :param csq_order: Optional list indicating the order of VEP consequences, sorted + from high to low impact. Default is None, which uses the value of the + `CSQ_ORDER` global. + :return: MT with better formatted consequences. """ - csqs = hl.literal(CSQ_ORDER) - csq_dict = hl.literal(dict(zip(CSQ_ORDER, range(len(CSQ_ORDER))))) + if csq_order is None: + csq_order = CSQ_ORDER + csqs = hl.literal(csq_order) + csq_dict = hl.literal(dict(zip(csq_order, range(len(csq_order))))) def find_worst_transcript_consequence( tcl: hl.expr.ArrayExpression, @@ -423,6 +430,36 @@ def filter_vep_to_synonymous_variants( ) +def filter_vep_to_gene_list( + t: Union[hl.MatrixTable, hl.Table], + genes: List[str], + match_by_gene_symbol: bool = False, + vep_root: str = "vep", + filter_empty_csq: bool = False, +): + """ + Filter VEP transcript consequences to those in a set of genes. + + :param t: Input Table or MatrixTable. + :param genes: Genes of interest to filter VEP transcript consequences to. + :param match_by_gene_symbol: Whether to match values in `genes` to VEP transcript + consequences by 'gene_symbol' instead of 'gene_id'. Default is False. + :param vep_root: Name used for VEP annotation. Default is 'vep'. + :param filter_empty_csq: Whether to filter out rows where 'transcript_consequences' + is empty. Default is False. + :return: Table or MatrixTable with VEP transcript consequences filtered. + """ + return filter_vep_transcript_csqs( + t, + vep_root, + synonymous=False, + canonical=False, + filter_empty_csq=filter_empty_csq, + genes=genes, + match_by_gene_symbol=match_by_gene_symbol, + ) + + def vep_struct_to_csq( vep_expr: hl.expr.StructExpression, csq_fields: str = VEP_CSQ_FIELDS[CURRENT_VEP_VERSION], @@ -654,23 +691,52 @@ def filter_vep_transcript_csqs( mane_select: bool = False, filter_empty_csq: bool = True, ensembl_only: bool = True, + protein_coding: bool = False, + csqs: List[str] = None, + keep_csqs: bool = True, + genes: Optional[List[str]] = None, + keep_genes: bool = True, + match_by_gene_symbol: bool = False, ) -> Union[hl.Table, hl.MatrixTable]: """ Filter VEP transcript consequences based on specified criteria, and optionally filter to variants where transcript consequences is not empty after filtering. - Transcript consequences can be filtered to those where 'most_severe_consequence' is 'synonymous_variant' and/or the transcript is the canonical transcript, - if the `synonymous` and `canonical` parameter are set to True, respectively. + Transcript consequences can be filtered to those where 'most_severe_consequence' is + 'synonymous_variant' and/or the transcript is the canonical transcript, if the + `synonymous` and `canonical` parameter are set to True, respectively. - If `filter_empty_csq` parameter is set to True, the Table/MatrixTable is filtered to variants where 'transcript_consequences' within the VEP annotation - is not empty after the specified filtering criteria is applied. + If `filter_empty_csq` parameter is set to True, the Table/MatrixTable is filtered + to variants where 'transcript_consequences' within the VEP annotation is not empty + after the specified filtering criteria is applied. :param t: Input Table or MatrixTable. :param vep_root: Name used for VEP annotation. Default is 'vep'. - :param synonymous: Whether to filter to variants where the most severe consequence is 'synonymous_variant'. Default is True. + :param synonymous: Whether to filter to variants where the most severe consequence + is 'synonymous_variant'. Default is True. :param canonical: Whether to filter to only canonical transcripts. Default is True. - :param mane_select: Whether to filter to only MANE Select transcripts. Default is False. - :param filter_empty_csq: Whether to filter out rows where 'transcript_consequences' is empty, after filtering 'transcript_consequences' to the specified criteria. Default is True. - :param ensembl_only: Whether to filter to only Ensembl transcipts. This option is useful for deduplicating transcripts that are the same between RefSeq and Emsembl. Default is True. + :param mane_select: Whether to filter to only MANE Select transcripts. Default is + False. + :param filter_empty_csq: Whether to filter out rows where 'transcript_consequences' + is empty, after filtering 'transcript_consequences' to the specified criteria. + Default is True. + :param ensembl_only: Whether to filter to only Ensembl transcipts. This option is + useful for deduplicating transcripts that are the same between RefSeq and + Emsembl. Default is True. + :param protein_coding: Whether to filter to only protein-coding transcripts. + Default is False. + :param csqs: Optional list of consequence terms to filter to. Transcript + consequences are filtered to those where 'most_severe_consequence' is in the + list of consequence terms `csqs`. Default is None. + :param keep_csqs: Whether to keep transcript consequences that are in `csqs`. If + set to False, transcript consequences that are in `csqs` will be removed. + Default is True. + :param genes: Optional list of genes to filter VEP transcript consequences to. + Default is None. + :param keep_genes: Whether to keep transcript consequences that are in `genes`. If + set to False, transcript consequences that are in `genes` will be removed. + Default is True. + :param match_by_gene_symbol: Whether to match values in `genes` to VEP transcript + consequences by 'gene_symbol' instead of 'gene_id'. Default is False. :return: Table or MatrixTable filtered to specified criteria. """ if not synonymous and not (canonical or mane_select) and not filter_empty_csq: @@ -680,13 +746,34 @@ def filter_vep_transcript_csqs( transcript_csqs = t[vep_root].transcript_consequences criteria = [lambda csq: True] if synonymous: - criteria.append(lambda csq: csq.most_severe_consequence == "synonymous_variant") + logger.info("Filtering to most severe consequence of synonymous_variant...") + csqs = ["synonymous_variant"] + if csqs is not None: + csqs = hl.literal(csqs) + if keep_csqs: + criteria.append(lambda csq: csqs.contains(csq.most_severe_consequence)) + else: + criteria.append(lambda csq: ~csqs.contains(csq.most_severe_consequence)) if canonical: + logger.info("Filtering to canonical transcripts") criteria.append(lambda csq: csq.canonical == 1) if mane_select: + logger.info("Filtering to MANE Select transcripts...") criteria.append(lambda csq: hl.is_defined(csq.mane_select)) if ensembl_only: + logger.info("Filtering to Ensembl transcripts...") criteria.append(lambda csq: csq.transcript_id.startswith("ENST")) + if protein_coding: + logger.info("Filtering to protein coding transcripts...") + criteria.append(lambda csq: csq.biotype == "protein_coding") + if genes is not None: + logger.info("Filtering to genes of interest...") + genes = hl.literal(genes) + gene_field = "gene_symbol" if match_by_gene_symbol else "gene_id" + if keep_csqs: + criteria.append(lambda csq: genes.contains(csq[gene_field])) + else: + criteria.append(lambda csq: ~genes.contains(csq[gene_field])) transcript_csqs = transcript_csqs.filter(lambda x: combine_functions(criteria, x)) is_mt = isinstance(t, hl.MatrixTable) From a98fc1fca43398a495fb509af7859e316097bb73 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 2 Jan 2024 12:33:51 -0700 Subject: [PATCH 12/31] Use `process_consequences` if ignore_splicing is True --- gnomad/utils/transcript_annotation.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py index 829109bfa..287b7297e 100644 --- a/gnomad/utils/transcript_annotation.py +++ b/gnomad/utils/transcript_annotation.py @@ -209,17 +209,19 @@ def preprocess_variants_for_tx( "splice_donor_variant", "splice_region_variant", ] + keep_csqs = True + if ignore_splicing: + if filter_to_csqs is not None: + filter_to_csqs = [csq for csq in filter_to_csqs if csq not in splice_csqs] + else: + # TODO: Need to modify process consequences to ignore splice variants, + # because these can occur on intronic regions? + filter_to_csqs = splice_csqs + keep_csqs = False + if filter_to_csqs is not None: logger.info("Adding most severe consequence to VEP transcript consequences...") ht = process_consequences(ht, vep_root=vep_root) - keep_csqs = True - if ignore_splicing: - filter_to_csqs = [csq for csq in filter_to_csqs if csq not in splice_csqs] - elif ignore_splicing: - # TODO: Need to modify process consequences to ignore splice variants, - # because these can occur on intronic regions? - filter_to_csqs = splice_csqs - keep_csqs = False return filter_vep_transcript_csqs( ht, From ff268794fc2a8895d2c48af231c8511069954468 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 2 Jan 2024 12:34:14 -0700 Subject: [PATCH 13/31] Fix keep_csqs -> keep_genes --- gnomad/utils/vep.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index df9f5fe9e..ff45494c8 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -770,7 +770,7 @@ def filter_vep_transcript_csqs( logger.info("Filtering to genes of interest...") genes = hl.literal(genes) gene_field = "gene_symbol" if match_by_gene_symbol else "gene_id" - if keep_csqs: + if keep_genes: criteria.append(lambda csq: genes.contains(csq[gene_field])) else: criteria.append(lambda csq: ~genes.contains(csq[gene_field])) From 0cac3554566579ab897255d2411df6940c973246 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 2 Jan 2024 13:55:30 -0700 Subject: [PATCH 14/31] Changes from testing --- gnomad/utils/transcript_annotation.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py index 287b7297e..4ae1da934 100644 --- a/gnomad/utils/transcript_annotation.py +++ b/gnomad/utils/transcript_annotation.py @@ -272,7 +272,10 @@ def tx_annotate_variants( # Explode the processed transcript consequences to be able to key by # transcript ID. ht = explode_by_vep_annotation(ht, vep_annotation=vep_annotation, vep_root=vep_root) - ht = ht.annotate(**tx_ht[ht.transcript_id, ht.gene_id]) + ht = ht.transpose( + **ht[vep_annotation], + **tx_ht[ht[vep_annotation].transcript_id, ht[vep_annotation].gene_id], + ) ht = ht.annotate_globals(tissues=tissues) return ht @@ -280,10 +283,6 @@ def tx_annotate_variants( def tx_aggregate_variants( ht: hl.Table, - agg_annotations: Optional[List[str]] = ( - "transcript_expression", - "expression_proportion", - ), additional_group_by: Optional[Union[Tuple[str], List[str]]] = ( "gene_symbol", "most_severe_consequence", @@ -295,8 +294,6 @@ def tx_aggregate_variants( Aggregate transcript-based expression values or expression proportion from GTEx. :param ht: Table of variants annotated with transcript expression information. - :param agg_annotations: Optional list of annotations to aggregate. Default is - ['transcript_expression', 'expression_proportion', 'exp_prop_mean']. :param additional_group_by: Optional list of additional fields to group by before sum aggregation. :return: Table of variants with transcript expression information aggregated. @@ -308,17 +305,8 @@ def tx_aggregate_variants( # Aggregate the transcript expression information by gene_id and annotation in # additional_group_by. ht = ht.group_by(*grouping).aggregate( - **{a: hl.agg.array_sum(ht[a]) for a in agg_annotations}, exp_prop_mean=hl.agg.sum(ht.exp_prop_mean), - ) - - # Reformat the transcript expression information to be a struct per tissue. - ht = ht.select( - "exp_prop_mean", - **{ - t: hl.struct(**{a: ht[a][i] for a in agg_annotations}) - for i, t in enumerate(tissues) - }, + **{t: hl.struct(**{a: hl.agg.sum(ht[t][a]) for a in ht[t]}) for t in tissues}, ) ht = ht.key_by(ht.locus, ht.alleles) From 1fdd3703b0deefb34e36d9507c61525e45417343 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 2 Jan 2024 16:06:57 -0700 Subject: [PATCH 15/31] Remove extra tab in docstring --- gnomad/utils/vep.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index ff45494c8..320acb401 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -718,7 +718,7 @@ def filter_vep_transcript_csqs( False. :param filter_empty_csq: Whether to filter out rows where 'transcript_consequences' is empty, after filtering 'transcript_consequences' to the specified criteria. - Default is True. + Default is True. :param ensembl_only: Whether to filter to only Ensembl transcipts. This option is useful for deduplicating transcripts that are the same between RefSeq and Emsembl. Default is True. From 3f0ca816905a4c2f2686968926631bb8f4c5bda2 Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Mon, 8 Jan 2024 13:11:38 -0500 Subject: [PATCH 16/31] filter to CDS & redefine grouping --- gnomad/utils/transcript_annotation.py | 39 ++++++++++++++++----------- gnomad/utils/vep.py | 10 +++++-- 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py index 4ae1da934..f2c640057 100644 --- a/gnomad/utils/transcript_annotation.py +++ b/gnomad/utils/transcript_annotation.py @@ -5,6 +5,7 @@ import hail as hl from gnomad.utils.vep import ( + SPLICE_CSQS, explode_by_vep_annotation, filter_vep_transcript_csqs, process_consequences, @@ -181,6 +182,8 @@ def tissue_expression_ht_to_array( def preprocess_variants_for_tx( ht: hl.Table, + cds_intervals: Optional[hl.Table] = None, + filter_to_cds: bool = True, filter_to_genes: Optional[List[str]] = None, match_by_gene_symbol: bool = False, filter_to_csqs: Optional[List[str]] = None, @@ -189,9 +192,11 @@ def preprocess_variants_for_tx( vep_root: str = "vep", ) -> hl.Table: """ - Prepare a Table of transcript expression information for annotation. + Prepare a Table of variants with vep transcript consequences for annotation. - :param ht: Table of transcript expression information. + :param ht: Table of variants with 'vep' annotations. + :param cds_intervals: Optional Table of CDS intervals. Default is None. + :param filter_to_cds: Whether to filter to CDS regions. Default is True. :param filter_to_genes: Optional list of genes to filter to. Default is None. :param match_by_gene_symbol: Whether to match by gene symbol instead of gene ID. Default is False. @@ -200,23 +205,18 @@ def preprocess_variants_for_tx( :param filter_to_protein_coding: Whether to filter to protein coding transcripts. Default is True. :param vep_root: Name used for root VEP annotation. Default is 'vep'. - :return: Table of transcript expression information prepared for annotation. + :return: Table of variants with preprocessed/filtered transcript consequences + prepared for annotation. """ - # TODO: Filter to only CDS regions? + if filter_to_cds: + ht = ht.filter(hl.is_defined(cds_intervals[ht.locus])) - splice_csqs = [ - "splice_acceptor_variant", - "splice_donor_variant", - "splice_region_variant", - ] keep_csqs = True if ignore_splicing: if filter_to_csqs is not None: - filter_to_csqs = [csq for csq in filter_to_csqs if csq not in splice_csqs] + filter_to_csqs = [csq for csq in filter_to_csqs if csq not in SPLICE_CSQS] else: - # TODO: Need to modify process consequences to ignore splice variants, - # because these can occur on intronic regions? - filter_to_csqs = splice_csqs + filter_to_csqs = SPLICE_CSQS keep_csqs = False if filter_to_csqs is not None: @@ -283,7 +283,9 @@ def tx_annotate_variants( def tx_aggregate_variants( ht: hl.Table, + additional_grouping: bool = False, additional_group_by: Optional[Union[Tuple[str], List[str]]] = ( + "alleles", "gene_symbol", "most_severe_consequence", "lof", @@ -294,15 +296,20 @@ def tx_aggregate_variants( Aggregate transcript-based expression values or expression proportion from GTEx. :param ht: Table of variants annotated with transcript expression information. + :param additional_grouping: Whether to group by additional fields before sum + aggregation. Default is False. :param additional_group_by: Optional list of additional fields to group by before sum aggregation. :return: Table of variants with transcript expression information aggregated. """ tissues = hl.eval(ht.tissues) - # TODO: group_by and key_by locus to get base-level annotation - grouping = ["locus", "alleles", "gene_id"] + list(additional_group_by) - # Aggregate the transcript expression information by gene_id and annotation in + if additional_grouping: + grouping = ["locus", "gene_id"] + list(additional_group_by) + else: + grouping = ["locus", "gene_id"] + + # Aggregate the transcript expression information by locus, gene_id and annotation in # additional_group_by. ht = ht.group_by(*grouping).aggregate( exp_prop_mean=hl.agg.sum(ht.exp_prop_mean), diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 320acb401..efb1664a1 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -81,6 +81,12 @@ + CSQ_NON_CODING ) +SPLICE_CSQS = [ + "splice_acceptor_variant", + "splice_donor_variant", + "splice_region_variant", +] + POSSIBLE_REFS = ("GRCh37", "GRCh38") """ Constant containing supported references @@ -282,7 +288,7 @@ def process_consequences( `most_severe_consequence` is the worst consequence for a transcript. - :param mt: Input MT. + :param mt: Input Table or MatrixTable. :param vep_root: Root for vep annotation (probably vep). :param penalize_flags: Whether to penalize LOFTEE flagged variants, or treat them as equal to HC. @@ -719,7 +725,7 @@ def filter_vep_transcript_csqs( :param filter_empty_csq: Whether to filter out rows where 'transcript_consequences' is empty, after filtering 'transcript_consequences' to the specified criteria. Default is True. - :param ensembl_only: Whether to filter to only Ensembl transcipts. This option is + :param ensembl_only: Whether to filter to only Ensembl transcripts. This option is useful for deduplicating transcripts that are the same between RefSeq and Emsembl. Default is True. :param protein_coding: Whether to filter to only protein-coding transcripts. From a7e791d4db81f1cb2ee33882478e4d72a5979190 Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Mon, 8 Jan 2024 17:05:43 -0500 Subject: [PATCH 17/31] add CDS filtering in preprocess --- gnomad/utils/transcript_annotation.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py index f2c640057..39e9254bf 100644 --- a/gnomad/utils/transcript_annotation.py +++ b/gnomad/utils/transcript_annotation.py @@ -4,6 +4,8 @@ import hail as hl +from gnomad.resources.grch37 import gencode +from gnomad.utils.filtering import filter_gencode_to_cds from gnomad.utils.vep import ( SPLICE_CSQS, explode_by_vep_annotation, @@ -182,7 +184,6 @@ def tissue_expression_ht_to_array( def preprocess_variants_for_tx( ht: hl.Table, - cds_intervals: Optional[hl.Table] = None, filter_to_cds: bool = True, filter_to_genes: Optional[List[str]] = None, match_by_gene_symbol: bool = False, @@ -209,7 +210,9 @@ def preprocess_variants_for_tx( prepared for annotation. """ if filter_to_cds: - ht = ht.filter(hl.is_defined(cds_intervals[ht.locus])) + logger.info("Filtering to CDS regions...") + cds = filter_gencode_to_cds(gencode) + ht = ht.filter(hl.is_defined(cds[ht.locus])) keep_csqs = True if ignore_splicing: @@ -272,7 +275,7 @@ def tx_annotate_variants( # Explode the processed transcript consequences to be able to key by # transcript ID. ht = explode_by_vep_annotation(ht, vep_annotation=vep_annotation, vep_root=vep_root) - ht = ht.transpose( + ht = ht.transmute( **ht[vep_annotation], **tx_ht[ht[vep_annotation].transcript_id, ht[vep_annotation].gene_id], ) @@ -304,18 +307,19 @@ def tx_aggregate_variants( """ tissues = hl.eval(ht.tissues) - if additional_grouping: - grouping = ["locus", "gene_id"] + list(additional_group_by) - else: - grouping = ["locus", "gene_id"] + grouping = ( + ["locus", "gene_id"] + list(additional_group_by) + if additional_grouping + else ["locus", "gene_id"] + ) - # Aggregate the transcript expression information by locus, gene_id and annotation in - # additional_group_by. + # Aggregate the transcript expression information by locus, gene_id and + # annotations in additional_group_by. ht = ht.group_by(*grouping).aggregate( exp_prop_mean=hl.agg.sum(ht.exp_prop_mean), **{t: hl.struct(**{a: hl.agg.sum(ht[t][a]) for a in ht[t]}) for t in tissues}, ) - ht = ht.key_by(ht.locus, ht.alleles) + ht = ht.key_by(ht.locus, ht.alleles) if additional_grouping else ht.key_by(ht.locus) return ht From 203f8ad80ea9005037b6946ef4bc6b149b7faa5d Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Mon, 8 Jan 2024 17:18:29 -0500 Subject: [PATCH 18/31] add filter to CDS function --- gnomad/utils/filtering.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index b4a004ddb..4b024d076 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -383,6 +383,20 @@ def filter_to_clinvar_pathogenic( return t +def filter_gencode_to_cds(ht: hl.Table) -> hl.Table: + """ + Filter Gencode Table to only CDS regions. + + :param ht: Input Gencode Table that is converted from a GTF file. + :return: Table filtered to CDS intervals with necessary fields. + """ + return ( + ht.filter((ht.feature == "CDS") & (ht.transcript_type == "protein_coding")) + .select("gene_id", "transcript_id") + .distinct() + ) + + def remove_fields_from_constant( constant: List[str], fields_to_remove: List[str] ) -> List[str]: From 559c5d8ec28c44409ca866a5ad0b5d92ac991a07 Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Sat, 13 Jan 2024 17:36:04 -0500 Subject: [PATCH 19/31] apply review suggestions --- gnomad/utils/filtering.py | 53 +++++++++++++--- gnomad/utils/transcript_annotation.py | 88 +++++++++++++++++++++------ gnomad/utils/vep.py | 10 ++- 3 files changed, 124 insertions(+), 27 deletions(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index 4b024d076..8d8197151 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -383,18 +383,55 @@ def filter_to_clinvar_pathogenic( return t -def filter_gencode_to_cds(ht: hl.Table) -> hl.Table: +def filter_to_gencode_cds( + t: Union[hl.MatrixTable, hl.Table], gencode_ht: Optional[hl.Table] = None +) -> hl.Table: """ - Filter Gencode Table to only CDS regions. + Filter a Table/MatrixTable to only Gencode CDS regions in protein coding transcripts. + + Example use: + + .. code-block:: python - :param ht: Input Gencode Table that is converted from a GTF file. - :return: Table filtered to CDS intervals with necessary fields. + from gnomad.resources.grch37.reference_data import gencode + gencode_ht = gencode.ht() + gencode_ht = filter_gencode_to_cds(gencode_ht) + + .. note:: + + If no Gencode Table is provided, the default version of the Gencode Table + resource for the genome build of the input Table/MatrixTable will be used. + + :param t: Input Table/MatrixTable to filter. + :param gencode_ht: Gencode Table to use for filtering the input Table/MatrixTable + to CDS regions. Default is None, which will use the default version of the + Gencode Table resource. + :return: Table/MatrixTable filtered to loci in Gencode CDS intervals. """ - return ( - ht.filter((ht.feature == "CDS") & (ht.transcript_type == "protein_coding")) - .select("gene_id", "transcript_id") - .distinct() + if gencode_ht is None: + build = get_reference_genome(t.locus).name + if build == "GRCh37": + from gnomad.resources.grch37.reference_data import gencode + elif build == "GRCh38": + from gnomad.resources.grch38.reference_data import gencode + + logger.info( + "No Gencode Table was supplied, using Gencode version %s", + gencode.default_version, + ) + gencode_ht = gencode.ht() + + gencode_ht = gencode_ht.filter( + (gencode_ht.feature == "CDS") & (gencode_ht.transcript_type == "protein_coding") ) + filter_expr = hl.is_defined(gencode_ht[t.locus]) + + if isinstance(t, hl.MatrixTable): + t = t.filter_rows(filter_expr) + else: + t = t.filter(filter_expr) + + return t def remove_fields_from_constant( diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py index 5eeac8b8a..ddfa49929 100644 --- a/gnomad/utils/transcript_annotation.py +++ b/gnomad/utils/transcript_annotation.py @@ -4,10 +4,10 @@ import hail as hl -from gnomad.resources.grch37 import gencode -from gnomad.utils.filtering import filter_gencode_to_cds +from gnomad.utils.filtering import filter_to_gencode_cds from gnomad.utils.vep import ( - SPLICE_CSQS, + CSQ_CODING, + CSQ_SPLICE, explode_by_vep_annotation, filter_vep_transcript_csqs, process_consequences, @@ -219,6 +219,7 @@ def tissue_expression_ht_to_array( def preprocess_variants_for_tx( ht: hl.Table, filter_to_cds: bool = True, + gencode_ht: Optional[hl.Table] = None, filter_to_genes: Optional[List[str]] = None, match_by_gene_symbol: bool = False, filter_to_csqs: Optional[List[str]] = None, @@ -230,7 +231,9 @@ def preprocess_variants_for_tx( Prepare a Table of variants with vep transcript consequences for annotation. :param ht: Table of variants with 'vep' annotations. - :param cds_intervals: Optional Table of CDS intervals. Default is None. + :param gencode_ht: Optional Gencode resource Table containing CDS interval + information. Default is None, which will use the default version of the Gencode + Table resource for the reference build of the input Table `ht`. :param filter_to_cds: Whether to filter to CDS regions. Default is True. :param filter_to_genes: Optional list of genes to filter to. Default is None. :param match_by_gene_symbol: Whether to match by gene symbol instead of gene ID. @@ -245,15 +248,14 @@ def preprocess_variants_for_tx( """ if filter_to_cds: logger.info("Filtering to CDS regions...") - cds = filter_gencode_to_cds(gencode) - ht = ht.filter(hl.is_defined(cds[ht.locus])) + ht = filter_to_gencode_cds(ht, gencode_ht=gencode_ht) keep_csqs = True if ignore_splicing: if filter_to_csqs is not None: - filter_to_csqs = [csq for csq in filter_to_csqs if csq not in SPLICE_CSQS] + filter_to_csqs = [csq for csq in filter_to_csqs if csq not in CSQ_SPLICE] else: - filter_to_csqs = SPLICE_CSQS + filter_to_csqs = CSQ_SPLICE keep_csqs = False if filter_to_csqs is not None: @@ -320,7 +322,6 @@ def tx_annotate_variants( def tx_aggregate_variants( ht: hl.Table, - additional_grouping: bool = False, additional_group_by: Optional[Union[Tuple[str], List[str]]] = ( "alleles", "gene_symbol", @@ -333,19 +334,16 @@ def tx_aggregate_variants( Aggregate transcript-based expression values or expression proportion from GTEx. :param ht: Table of variants annotated with transcript expression information. - :param additional_grouping: Whether to group by additional fields before sum - aggregation. Default is False. :param additional_group_by: Optional list of additional fields to group by before - sum aggregation. + sum aggregation. If None, the returned Table will be grouped by only "locus" + and "gene_id" before the sum aggregation. :return: Table of variants with transcript expression information aggregated. """ tissues = hl.eval(ht.tissues) - grouping = ( - ["locus", "gene_id"] + list(additional_group_by) - if additional_grouping - else ["locus", "gene_id"] - ) + grouping = ["locus", "gene_id"] + if additional_group_by is not None: + grouping = grouping + list(additional_group_by) # Aggregate the transcript expression information by locus, gene_id and # annotations in additional_group_by. @@ -354,6 +352,60 @@ def tx_aggregate_variants( **{t: hl.struct(**{a: hl.agg.sum(ht[t][a]) for a in ht[t]}) for t in tissues}, ) - ht = ht.key_by(ht.locus, ht.alleles) if additional_grouping else ht.key_by(ht.locus) + # If 'alleles' is in the Table, key by 'locus' and 'alleles'. + keys = ["locus"] + if "alleles" in ht.row: + keys.append("alleles") + + ht = ht.key_by(*keys) + + return ht + + +def process_annotate_aggregate_variants( + ht: hl.Table, + tx_ht: hl.Table, + tissues_to_filter: Optional[List[str]] = None, + vep_root: str = "vep", + vep_annotation: str = "transcript_consequences", + filter_to_csqs: Optional[List[str]] = CSQ_CODING, + additional_group_by: Optional[Union[Tuple[str], List[str]]] = ( + "alleles", + "gene_symbol", + "most_severe_consequence", + "lof", + "lof_flags", + ), + **kwargs, +) -> hl.Table: + """ + One-stop usage of preprocess_variants_for_tx, tx_annotate_variants and tx_aggregate_variants. + + :param ht: Table of variants to annotate, it should contain at least the + following nested fields: `vep.transcript_consequences`, `freq`. + :param tx_ht: Table of transcript expression information. + :param tissues_to_filter: Optional list of tissues to exclude from the output. + :param vep_root: Name used for root VEP annotation. Default is 'vep'. + :param vep_annotation: Name of annotation in 'vep' annotation, refer to the + function where it is used for more details. + :param filter_to_csqs: Optional list of consequences to filter to. Default is None. + :param additional_group_by: Optional list of additional fields to group by before + sum aggregation. If None, the returned Table will be grouped by only "locus" + and "gene_id" before the sum aggregation. + :return: Table of variants with transcript expression information aggregated. + """ + tx_ht = tx_annotate_variants( + preprocess_variants_for_tx( + ht, vep_root=vep_root, filter_to_csqs=filter_to_csqs, **kwargs + ), + tx_ht, + tissues_to_filter=tissues_to_filter, + vep_root=vep_root, + vep_annotation=vep_annotation, + ) + + tx_ht = tx_aggregate_variants(tx_ht, additional_group_by=additional_group_by) + tx_ht = tx_ht.collect_by_key("tx_annotation") + ht = ht.annotate(**tx_ht[ht.key]) return ht diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index efb1664a1..1abe81733 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -81,11 +81,19 @@ + CSQ_NON_CODING ) -SPLICE_CSQS = [ +CSQ_CODING = CSQ_CODING_HIGH_IMPACT + CSQ_CODING_MEDIUM_IMPACT + CSQ_CODING_LOW_IMPACT +""" +Constant containing all coding consequences. +""" + +CSQ_SPLICE = [ "splice_acceptor_variant", "splice_donor_variant", "splice_region_variant", ] +""" +Constant containing all splice consequences. +""" POSSIBLE_REFS = ("GRCh37", "GRCh38") """ From 6f7cdcbf3a09087f982f24e41e73080fc138763c Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Sat, 13 Jan 2024 17:46:31 -0500 Subject: [PATCH 20/31] add gencode v39 resource --- gnomad/resources/grch38/reference_data.py | 35 +++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/gnomad/resources/grch38/reference_data.py b/gnomad/resources/grch38/reference_data.py index 798a1d26e..f76267496 100644 --- a/gnomad/resources/grch38/reference_data.py +++ b/gnomad/resources/grch38/reference_data.py @@ -383,3 +383,38 @@ def get_truth_ht() -> Table: .repartition(200, shuffle=False) .persist() ) + + +def _import_gencode(gtf_path: str, **kwargs) -> hl.Table: + """ + Import GENCODE annotations GTF file as a Hail Table. + + :param gtf_path: Path to GENCODE GTF file. + :return: Table with GENCODE annotation information. + """ + ht = hl.experimental.import_gtf(gtf_path, **kwargs) + + # Only get gene and transcript stable IDs (without version numbers if they + # exist), early versions of GENCODE have no version numbers but later ones do. + ht = ht.annotate( + gene_id=ht.gene_id.split("\\.")[0], + transcript_id=ht.transcript_id.split("\\.")[0], + ) + return ht + + +gencode = VersionedTableResource( + default_version="v39", + versions={ + "v39": GnomadPublicTableResource( + path="gs://gnomad-public-requester-pays/resources/grch38/gencode/gencode.v39.annotation.ht", + import_func=_import_gencode, + import_args={ + "gtf_path": "gs://gcp-public-data--gnomad/resources/grch38/gencode/gencode.v39.annotation.gtf.gz", + "reference_genome": "GRCh38", + "force_bgz": True, + "min_partitions": 10, + }, + ), + }, +) From e2366ad465a44ff48af0260e8c0a60bff4363b7a Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Sat, 13 Jan 2024 20:54:39 -0500 Subject: [PATCH 21/31] add tissues to global --- gnomad/utils/transcript_annotation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py index ddfa49929..b849ca374 100644 --- a/gnomad/utils/transcript_annotation.py +++ b/gnomad/utils/transcript_annotation.py @@ -406,6 +406,8 @@ def process_annotate_aggregate_variants( tx_ht = tx_aggregate_variants(tx_ht, additional_group_by=additional_group_by) tx_ht = tx_ht.collect_by_key("tx_annotation") - ht = ht.annotate(**tx_ht[ht.key]) + ht = ht.annotate(**tx_ht[ht.key]).annotate_globals( + tissues=tx_ht.index_globals().tissues + ) return ht From 705be2fb716ba5c3ee364949933cfa1d0db80db0 Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Tue, 16 Jan 2024 12:33:24 -0500 Subject: [PATCH 22/31] delete annotate tx_ht back to variant ht because key changed --- gnomad/utils/transcript_annotation.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py index b849ca374..256731745 100644 --- a/gnomad/utils/transcript_annotation.py +++ b/gnomad/utils/transcript_annotation.py @@ -406,8 +406,5 @@ def process_annotate_aggregate_variants( tx_ht = tx_aggregate_variants(tx_ht, additional_group_by=additional_group_by) tx_ht = tx_ht.collect_by_key("tx_annotation") - ht = ht.annotate(**tx_ht[ht.key]).annotate_globals( - tissues=tx_ht.index_globals().tissues - ) - return ht + return tx_ht From 6f2a4fba2061255d89d57656c6ae03c6ccca834e Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Tue, 16 Jan 2024 15:05:46 -0500 Subject: [PATCH 23/31] address review comments --- gnomad/resources/grch37/reference_data.py | 21 ++------------------- gnomad/resources/grch38/reference_data.py | 21 ++------------------- gnomad/resources/resource_utils.py | 18 ++++++++++++++++++ gnomad/utils/filtering.py | 2 ++ 4 files changed, 24 insertions(+), 38 deletions(-) diff --git a/gnomad/resources/grch37/reference_data.py b/gnomad/resources/grch37/reference_data.py index 10a7265f6..4f3a9eb7b 100644 --- a/gnomad/resources/grch37/reference_data.py +++ b/gnomad/resources/grch37/reference_data.py @@ -7,28 +7,11 @@ GnomadPublicTableResource, VersionedMatrixTableResource, VersionedTableResource, + import_gencode, import_sites_vcf, ) -def _import_gencode(gtf_path: str, **kwargs) -> hl.Table: - """ - Import GENCODE annotations GTF file as a Hail Table. - - :param gtf_path: Path to GENCODE GTF file. - :return: Table with GENCODE annotation information. - """ - ht = hl.experimental.import_gtf(gtf_path, **kwargs) - - # Only get gene and transcript stable IDs (without version numbers if they - # exist), early versions of GENCODE have no version numbers but later ones do. - ht = ht.annotate( - gene_id=ht.gene_id.split("\\.")[0], - transcript_id=ht.transcript_id.split("\\.")[0], - ) - return ht - - def _import_gtex_rsem(gtex_path: str, meta_path: str, **kwargs) -> hl.MatrixTable: """ Import GTEx RSEM data from expression data and sample attributes file. @@ -377,7 +360,7 @@ def get_truth_ht() -> hl.Table: versions={ "v19": GnomadPublicTableResource( path="gs://gnomad-public-requester-pays/resources/grch37/gencode/gencode.v19.annotation.ht", - import_func=_import_gencode, + import_func=import_gencode, import_args={ "gtf_path": "gs://gcp-public-data--gnomad/resources/grch37/gencode/gencode.v19.annotation.gtf.gz", "reference_genome": "GRCh37", diff --git a/gnomad/resources/grch38/reference_data.py b/gnomad/resources/grch38/reference_data.py index f76267496..f6e28668b 100644 --- a/gnomad/resources/grch38/reference_data.py +++ b/gnomad/resources/grch38/reference_data.py @@ -10,6 +10,7 @@ GnomadPublicTableResource, VersionedMatrixTableResource, VersionedTableResource, + import_gencode, import_sites_vcf, ) from gnomad.utils.vep import vep_or_lookup_vep @@ -385,30 +386,12 @@ def get_truth_ht() -> Table: ) -def _import_gencode(gtf_path: str, **kwargs) -> hl.Table: - """ - Import GENCODE annotations GTF file as a Hail Table. - - :param gtf_path: Path to GENCODE GTF file. - :return: Table with GENCODE annotation information. - """ - ht = hl.experimental.import_gtf(gtf_path, **kwargs) - - # Only get gene and transcript stable IDs (without version numbers if they - # exist), early versions of GENCODE have no version numbers but later ones do. - ht = ht.annotate( - gene_id=ht.gene_id.split("\\.")[0], - transcript_id=ht.transcript_id.split("\\.")[0], - ) - return ht - - gencode = VersionedTableResource( default_version="v39", versions={ "v39": GnomadPublicTableResource( path="gs://gnomad-public-requester-pays/resources/grch38/gencode/gencode.v39.annotation.ht", - import_func=_import_gencode, + import_func=import_gencode, import_args={ "gtf_path": "gs://gcp-public-data--gnomad/resources/grch38/gencode/gencode.v39.annotation.gtf.gz", "reference_genome": "GRCh38", diff --git a/gnomad/resources/resource_utils.py b/gnomad/resources/resource_utils.py index 04688edfd..f245ce70d 100644 --- a/gnomad/resources/resource_utils.py +++ b/gnomad/resources/resource_utils.py @@ -687,3 +687,21 @@ class DataException(Exception): # noqa: D101 def import_sites_vcf(**kwargs) -> hl.Table: """Import site-level data from a VCF into a Hail Table.""" return hl.import_vcf(**kwargs).rows() + + +def import_gencode(gtf_path: str, **kwargs) -> hl.Table: + """ + Import GENCODE annotations GTF file as a Hail Table. + + :param gtf_path: Path to GENCODE GTF file. + :return: Table with GENCODE annotation information. + """ + ht = hl.experimental.import_gtf(gtf_path, **kwargs) + + # Only get gene and transcript stable IDs (without version numbers if they + # exist), early versions of GENCODE have no version numbers but later ones do. + ht = ht.annotate( + gene_id=ht.gene_id.split("\\.")[0], + transcript_id=ht.transcript_id.split("\\.")[0], + ) + return ht diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index 8d8197151..a329d9df5 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -414,6 +414,8 @@ def filter_to_gencode_cds( from gnomad.resources.grch37.reference_data import gencode elif build == "GRCh38": from gnomad.resources.grch38.reference_data import gencode + else: + raise ValueError(f"Unsupported reference genome build: {build}") logger.info( "No Gencode Table was supplied, using Gencode version %s", From 7ce7eaabf93442260ae861d5ddc285b83d29ca4b Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Fri, 19 Jan 2024 16:16:00 -0500 Subject: [PATCH 24/31] add warnings to filter_to_cds and option to filter by amino_acids --- gnomad/utils/filtering.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index a329d9df5..ccb1404ac 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -426,6 +426,12 @@ def filter_to_gencode_cds( gencode_ht = gencode_ht.filter( (gencode_ht.feature == "CDS") & (gencode_ht.transcript_type == "protein_coding") ) + logger.warning( + "When filtering to Gencode CDS intervals, it's not filtering by " + "transcript, variants that are not in CDS regions but are " + "annotated as a coding variants by VEP might be filtered out by " + "this." + ) filter_expr = hl.is_defined(gencode_ht[t.locus]) if isinstance(t, hl.MatrixTable): From 2274f6a5858812ea79309b7ec52a83b6a0786efc Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Fri, 19 Jan 2024 16:17:40 -0500 Subject: [PATCH 25/31] option to filter by valid amino_acids in transcript_consequences --- gnomad/utils/transcript_annotation.py | 2 ++ gnomad/utils/vep.py | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py index 256731745..d2f54c486 100644 --- a/gnomad/utils/transcript_annotation.py +++ b/gnomad/utils/transcript_annotation.py @@ -219,6 +219,7 @@ def tissue_expression_ht_to_array( def preprocess_variants_for_tx( ht: hl.Table, filter_to_cds: bool = True, + filter_by_amino_acids: bool = True, gencode_ht: Optional[hl.Table] = None, filter_to_genes: Optional[List[str]] = None, match_by_gene_symbol: bool = False, @@ -272,6 +273,7 @@ def preprocess_variants_for_tx( keep_csqs=keep_csqs, genes=filter_to_genes, match_by_gene_symbol=match_by_gene_symbol, + amio_acids=filter_by_amino_acids, ) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 1abe81733..d17f6ecae 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -711,6 +711,7 @@ def filter_vep_transcript_csqs( genes: Optional[List[str]] = None, keep_genes: bool = True, match_by_gene_symbol: bool = False, + amio_acids: bool = False, ) -> Union[hl.Table, hl.MatrixTable]: """ Filter VEP transcript consequences based on specified criteria, and optionally filter to variants where transcript consequences is not empty after filtering. @@ -751,6 +752,8 @@ def filter_vep_transcript_csqs( Default is True. :param match_by_gene_symbol: Whether to match values in `genes` to VEP transcript consequences by 'gene_symbol' instead of 'gene_id'. Default is False. + :param amio_acids: Whether to filter to variants whose amic_acids annotation is + defined (not None or '*'). Default is False. :return: Table or MatrixTable filtered to specified criteria. """ if not synonymous and not (canonical or mane_select) and not filter_empty_csq: @@ -788,6 +791,11 @@ def filter_vep_transcript_csqs( criteria.append(lambda csq: genes.contains(csq[gene_field])) else: criteria.append(lambda csq: ~genes.contains(csq[gene_field])) + if amio_acids: + logger.info("Filtering to variants with defined amio_acids annotation...") + criteria.append( + lambda csq: (csq.amino_acids is not None) & (csq.amino_acids != "*") + ) transcript_csqs = transcript_csqs.filter(lambda x: combine_functions(criteria, x)) is_mt = isinstance(t, hl.MatrixTable) From aa67ca16c25e987547c87c59f1848cfdd3dd035f Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Fri, 19 Jan 2024 18:54:43 -0500 Subject: [PATCH 26/31] address review comments --- gnomad/resources/grch38/reference_data.py | 2 +- gnomad/utils/filtering.py | 16 ++++++--- gnomad/utils/transcript_annotation.py | 40 ++++++++++++----------- gnomad/utils/vep.py | 26 +++++++++------ 4 files changed, 50 insertions(+), 34 deletions(-) diff --git a/gnomad/resources/grch38/reference_data.py b/gnomad/resources/grch38/reference_data.py index f6e28668b..0870fa195 100644 --- a/gnomad/resources/grch38/reference_data.py +++ b/gnomad/resources/grch38/reference_data.py @@ -396,7 +396,7 @@ def get_truth_ht() -> Table: "gtf_path": "gs://gcp-public-data--gnomad/resources/grch38/gencode/gencode.v39.annotation.gtf.gz", "reference_genome": "GRCh38", "force_bgz": True, - "min_partitions": 10, + "min_partitions": 100, }, ), }, diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index ccb1404ac..aae50a70b 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -402,6 +402,16 @@ def filter_to_gencode_cds( If no Gencode Table is provided, the default version of the Gencode Table resource for the genome build of the input Table/MatrixTable will be used. + .. warning:: + + This Gencode CDS interval filter does not take into account the + transcript_id, it filters to any locus that is found in a CDS interval for + any protein coding transcript. Therefore, if downstream analyses require + filtering to CDS intervals by transcript, an additional step must be taken. + For example, when filtering VEP transcript consequences, there may be cases + where a variant is retained with this filter, but is considered outside the + CDS intervals of some of the transcripts in the VEP annotation. + :param t: Input Table/MatrixTable to filter. :param gencode_ht: Gencode Table to use for filtering the input Table/MatrixTable to CDS regions. Default is None, which will use the default version of the @@ -427,10 +437,8 @@ def filter_to_gencode_cds( (gencode_ht.feature == "CDS") & (gencode_ht.transcript_type == "protein_coding") ) logger.warning( - "When filtering to Gencode CDS intervals, it's not filtering by " - "transcript, variants that are not in CDS regions but are " - "annotated as a coding variants by VEP might be filtered out by " - "this." + "This Gencode CDS interval filter does not filter by transcript! Please see the" + " documentation for more details to confirm it's being used as intended." ) filter_expr = hl.is_defined(gencode_ht[t.locus]) diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py index d2f54c486..0a2ec837f 100644 --- a/gnomad/utils/transcript_annotation.py +++ b/gnomad/utils/transcript_annotation.py @@ -216,10 +216,9 @@ def tissue_expression_ht_to_array( return ht -def preprocess_variants_for_tx( +def tx_filter_variants_by_csqs( ht: hl.Table, filter_to_cds: bool = True, - filter_by_amino_acids: bool = True, gencode_ht: Optional[hl.Table] = None, filter_to_genes: Optional[List[str]] = None, match_by_gene_symbol: bool = False, @@ -229,18 +228,19 @@ def preprocess_variants_for_tx( vep_root: str = "vep", ) -> hl.Table: """ - Prepare a Table of variants with vep transcript consequences for annotation. + Prepare a Table of variants with VEP transcript consequences for annotation. :param ht: Table of variants with 'vep' annotations. :param gencode_ht: Optional Gencode resource Table containing CDS interval - information. Default is None, which will use the default version of the Gencode - Table resource for the reference build of the input Table `ht`. + information. This is only used when `filter_to_cds` is set to True. Default is + None, which will use the default version of the Gencode Table resource for + the reference build of the input Table `ht`. :param filter_to_cds: Whether to filter to CDS regions. Default is True. :param filter_to_genes: Optional list of genes to filter to. Default is None. :param match_by_gene_symbol: Whether to match by gene symbol instead of gene ID. Default is False. :param filter_to_csqs: Optional list of consequences to filter to. Default is None. - :param ignore_splicing: If True, ignore splice variants. Default is True. + :param ignore_splicing: If True, ignore splice consequences. Default is True. :param filter_to_protein_coding: Whether to filter to protein coding transcripts. Default is True. :param vep_root: Name used for root VEP annotation. Default is 'vep'. @@ -273,7 +273,9 @@ def preprocess_variants_for_tx( keep_csqs=keep_csqs, genes=filter_to_genes, match_by_gene_symbol=match_by_gene_symbol, - amio_acids=filter_by_amino_acids, + additional_filtering_criteria=( + lambda csq: (csq.amino_acids is not None) & (csq.amino_acids != "*") + ), ) @@ -287,18 +289,18 @@ def tx_annotate_variants( """ Annotate variants with transcript-based expression values or expression proportion from GTEx. - :param ht: Table of variants to annotate, it should contain at least the following - nested fields: `vep.transcript_consequences`, `freq`. + :param ht: Table of variants to annotate, it should contain the nested fields: + `vep.transcript_consequences`. :param tx_ht: Table of transcript expression information. :param tissues_to_filter: Optional list of tissues to exclude from the output. :param vep_root: Name used for root VEP annotation. Default is 'vep'. - :param vep_annotation: Name of annotation in 'vep' annotation, - one of the processed consequences: ["transcript_consequences", - "worst_csq_by_gene", "worst_csq_for_variant", - "worst_csq_by_gene_canonical", "worst_csq_for_variant_canonical"]. - For example, if you want to annotate each variant with the worst - consequence in each gene it falls on and the transcript expression, - you would use "worst_csq_by_gene". Default is "transcript_consequences". + :param vep_annotation: Name of annotation in `vep_root`, one of the processed + consequences: ["transcript_consequences", "worst_csq_by_gene", + "worst_csq_for_variant", "worst_csq_by_gene_canonical", + "worst_csq_for_variant_canonical"]. For example, if you want to annotate + each variant with the worst consequence in each gene it falls on and the + transcript expression, you would use "worst_csq_by_gene". Default is + "transcript_consequences". :return: Input Table with transcript expression information annotated. """ # Filter to tissues of interest. @@ -364,7 +366,7 @@ def tx_aggregate_variants( return ht -def process_annotate_aggregate_variants( +def combined_tx_pipeline( ht: hl.Table, tx_ht: hl.Table, tissues_to_filter: Optional[List[str]] = None, @@ -381,7 +383,7 @@ def process_annotate_aggregate_variants( **kwargs, ) -> hl.Table: """ - One-stop usage of preprocess_variants_for_tx, tx_annotate_variants and tx_aggregate_variants. + One-stop usage of `tx_filter_variants_by_csqs`, `tx_annotate_variants` and `tx_aggregate_variants`. :param ht: Table of variants to annotate, it should contain at least the following nested fields: `vep.transcript_consequences`, `freq`. @@ -397,7 +399,7 @@ def process_annotate_aggregate_variants( :return: Table of variants with transcript expression information aggregated. """ tx_ht = tx_annotate_variants( - preprocess_variants_for_tx( + tx_filter_variants_by_csqs( ht, vep_root=vep_root, filter_to_csqs=filter_to_csqs, **kwargs ), tx_ht, diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index d17f6ecae..8f242520a 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -4,7 +4,7 @@ import logging import os import subprocess -from typing import List, Optional, Union +from typing import Callable, List, Optional, Union import hail as hl @@ -297,7 +297,7 @@ def process_consequences( `most_severe_consequence` is the worst consequence for a transcript. :param mt: Input Table or MatrixTable. - :param vep_root: Root for vep annotation (probably vep). + :param vep_root: Root for VEP annotation (probably "vep"). :param penalize_flags: Whether to penalize LOFTEE flagged variants, or treat them as equal to HC. :param csq_order: Optional list indicating the order of VEP consequences, sorted @@ -454,6 +454,15 @@ def filter_vep_to_gene_list( """ Filter VEP transcript consequences to those in a set of genes. + .. note:: + + Filtering to a list of genes by their gene_id or gene_symbol will filter to + all variants that are annotated to the gene, including + ['upstream_gene_variant', 'downstream_gene_variant'], which will not be the + same as if you filter to a gene interval. If you only want variants inside + certain gene boundaries and filter faster, you can filter the variant first to an + interval list and further filter to specific genes. + :param t: Input Table or MatrixTable. :param genes: Genes of interest to filter VEP transcript consequences to. :param match_by_gene_symbol: Whether to match values in `genes` to VEP transcript @@ -711,7 +720,7 @@ def filter_vep_transcript_csqs( genes: Optional[List[str]] = None, keep_genes: bool = True, match_by_gene_symbol: bool = False, - amio_acids: bool = False, + additional_filtering_criteria: Optional[Callable] = None, ) -> Union[hl.Table, hl.MatrixTable]: """ Filter VEP transcript consequences based on specified criteria, and optionally filter to variants where transcript consequences is not empty after filtering. @@ -752,8 +761,7 @@ def filter_vep_transcript_csqs( Default is True. :param match_by_gene_symbol: Whether to match values in `genes` to VEP transcript consequences by 'gene_symbol' instead of 'gene_id'. Default is False. - :param amio_acids: Whether to filter to variants whose amic_acids annotation is - defined (not None or '*'). Default is False. + :param additional_filtering_criteria: Optional filtering criteria to apply to. :return: Table or MatrixTable filtered to specified criteria. """ if not synonymous and not (canonical or mane_select) and not filter_empty_csq: @@ -791,11 +799,9 @@ def filter_vep_transcript_csqs( criteria.append(lambda csq: genes.contains(csq[gene_field])) else: criteria.append(lambda csq: ~genes.contains(csq[gene_field])) - if amio_acids: - logger.info("Filtering to variants with defined amio_acids annotation...") - criteria.append( - lambda csq: (csq.amino_acids is not None) & (csq.amino_acids != "*") - ) + if additional_filtering_criteria is not None: + logger.info("Filtering to variants with additional criteria...") + criteria = criteria.append(additional_filtering_criteria) transcript_csqs = transcript_csqs.filter(lambda x: combine_functions(criteria, x)) is_mt = isinstance(t, hl.MatrixTable) From 09c3a8e854f8da702af11faea6838ca2d42781b7 Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Fri, 19 Jan 2024 19:07:33 -0500 Subject: [PATCH 27/31] combine filtering criteria with concatenation --- gnomad/utils/vep.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 8f242520a..ab14b926e 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -801,7 +801,7 @@ def filter_vep_transcript_csqs( criteria.append(lambda csq: ~genes.contains(csq[gene_field])) if additional_filtering_criteria is not None: logger.info("Filtering to variants with additional criteria...") - criteria = criteria.append(additional_filtering_criteria) + criteria = criteria + [additional_filtering_criteria] transcript_csqs = transcript_csqs.filter(lambda x: combine_functions(criteria, x)) is_mt = isinstance(t, hl.MatrixTable) From f3764a88baf24785c19c1b51377b12fac7806e0e Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Fri, 19 Jan 2024 20:07:39 -0500 Subject: [PATCH 28/31] rename pipeline function --- gnomad/utils/transcript_annotation.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py index 0a2ec837f..9743c0c36 100644 --- a/gnomad/utils/transcript_annotation.py +++ b/gnomad/utils/transcript_annotation.py @@ -366,7 +366,7 @@ def tx_aggregate_variants( return ht -def combined_tx_pipeline( +def perform_tx_annotation_pipeline( ht: hl.Table, tx_ht: hl.Table, tissues_to_filter: Optional[List[str]] = None, @@ -409,6 +409,5 @@ def combined_tx_pipeline( ) tx_ht = tx_aggregate_variants(tx_ht, additional_group_by=additional_group_by) - tx_ht = tx_ht.collect_by_key("tx_annotation") return tx_ht From baff6638e0223ebf69f68f90accc36ae067d7532 Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Fri, 19 Jan 2024 21:44:46 -0500 Subject: [PATCH 29/31] changes requested by reviewer --- gnomad/utils/transcript_annotation.py | 31 ++++++++++++++++++--------- gnomad/utils/vep.py | 11 +++++----- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py index 9743c0c36..8300455ed 100644 --- a/gnomad/utils/transcript_annotation.py +++ b/gnomad/utils/transcript_annotation.py @@ -230,12 +230,20 @@ def tx_filter_variants_by_csqs( """ Prepare a Table of variants with VEP transcript consequences for annotation. + .. note:: + + When `filter_to_cds` is set to True, the returned Table will be further + filtered by defined 'amino_acids' annotation, which is to filter out certain + consequences, such as 'stop_retained_variant', that are kept by all CDS + intervals but don't belong to CDS of the transcript they fall on. + :param ht: Table of variants with 'vep' annotations. :param gencode_ht: Optional Gencode resource Table containing CDS interval information. This is only used when `filter_to_cds` is set to True. Default is None, which will use the default version of the Gencode Table resource for the reference build of the input Table `ht`. - :param filter_to_cds: Whether to filter to CDS regions. Default is True. + :param filter_to_cds: Whether to filter to CDS regions. Default is True. And it + will be further filtered by defined 'amino_acids' annotation. :param filter_to_genes: Optional list of genes to filter to. Default is None. :param match_by_gene_symbol: Whether to match by gene symbol instead of gene ID. Default is False. @@ -247,9 +255,13 @@ def tx_filter_variants_by_csqs( :return: Table of variants with preprocessed/filtered transcript consequences prepared for annotation. """ + additional_filtering_criteria = None if filter_to_cds: logger.info("Filtering to CDS regions...") ht = filter_to_gencode_cds(ht, gencode_ht=gencode_ht) + additional_filtering_criteria = [ + lambda csq: hl.is_defined(csq.amino_acids) & (csq.amino_acids != "*") + ] keep_csqs = True if ignore_splicing: @@ -273,9 +285,7 @@ def tx_filter_variants_by_csqs( keep_csqs=keep_csqs, genes=filter_to_genes, match_by_gene_symbol=match_by_gene_symbol, - additional_filtering_criteria=( - lambda csq: (csq.amino_acids is not None) & (csq.amino_acids != "*") - ), + additional_filtering_criteria=additional_filtering_criteria, ) @@ -290,11 +300,12 @@ def tx_annotate_variants( Annotate variants with transcript-based expression values or expression proportion from GTEx. :param ht: Table of variants to annotate, it should contain the nested fields: - `vep.transcript_consequences`. + `{vep_root}.{vep_annotation}`. :param tx_ht: Table of transcript expression information. :param tissues_to_filter: Optional list of tissues to exclude from the output. + Default is None. :param vep_root: Name used for root VEP annotation. Default is 'vep'. - :param vep_annotation: Name of annotation in `vep_root`, one of the processed + :param vep_annotation: Name of annotation under vep_root, one of the processed consequences: ["transcript_consequences", "worst_csq_by_gene", "worst_csq_for_variant", "worst_csq_by_gene_canonical", "worst_csq_for_variant_canonical"]. For example, if you want to annotate @@ -385,13 +396,13 @@ def perform_tx_annotation_pipeline( """ One-stop usage of `tx_filter_variants_by_csqs`, `tx_annotate_variants` and `tx_aggregate_variants`. - :param ht: Table of variants to annotate, it should contain at least the - following nested fields: `vep.transcript_consequences`, `freq`. + :param ht: Table of variants to annotate, it should contain the nested fields: + `{vep_root}.{vep_annotation}`. :param tx_ht: Table of transcript expression information. :param tissues_to_filter: Optional list of tissues to exclude from the output. :param vep_root: Name used for root VEP annotation. Default is 'vep'. - :param vep_annotation: Name of annotation in 'vep' annotation, refer to the - function where it is used for more details. + :param vep_annotation: Name of annotation under vep_root. Default is + 'transcript_consequences'. :param filter_to_csqs: Optional list of consequences to filter to. Default is None. :param additional_group_by: Optional list of additional fields to group by before sum aggregation. If None, the returned Table will be grouped by only "locus" diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index ab14b926e..4802c3f2f 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -456,12 +456,11 @@ def filter_vep_to_gene_list( .. note:: - Filtering to a list of genes by their gene_id or gene_symbol will filter to + Filtering to a list of genes by their 'gene_id' or 'gene_symbol' will filter to all variants that are annotated to the gene, including ['upstream_gene_variant', 'downstream_gene_variant'], which will not be the same as if you filter to a gene interval. If you only want variants inside - certain gene boundaries and filter faster, you can filter the variant first to an - interval list and further filter to specific genes. + certain gene boundaries and a faster filter, you can first filter `t` to an interval list and then apply this filter. :param t: Input Table or MatrixTable. :param genes: Genes of interest to filter VEP transcript consequences to. @@ -720,7 +719,7 @@ def filter_vep_transcript_csqs( genes: Optional[List[str]] = None, keep_genes: bool = True, match_by_gene_symbol: bool = False, - additional_filtering_criteria: Optional[Callable] = None, + additional_filtering_criteria: Optional[List[Callable]] = None, ) -> Union[hl.Table, hl.MatrixTable]: """ Filter VEP transcript consequences based on specified criteria, and optionally filter to variants where transcript consequences is not empty after filtering. @@ -761,7 +760,7 @@ def filter_vep_transcript_csqs( Default is True. :param match_by_gene_symbol: Whether to match values in `genes` to VEP transcript consequences by 'gene_symbol' instead of 'gene_id'. Default is False. - :param additional_filtering_criteria: Optional filtering criteria to apply to. + :param additional_filtering_criteria: Optional list of additional filtering criteria to apply to the VEP transcript consequences. :return: Table or MatrixTable filtered to specified criteria. """ if not synonymous and not (canonical or mane_select) and not filter_empty_csq: @@ -801,7 +800,7 @@ def filter_vep_transcript_csqs( criteria.append(lambda csq: ~genes.contains(csq[gene_field])) if additional_filtering_criteria is not None: logger.info("Filtering to variants with additional criteria...") - criteria = criteria + [additional_filtering_criteria] + criteria = criteria + additional_filtering_criteria transcript_csqs = transcript_csqs.filter(lambda x: combine_functions(criteria, x)) is_mt = isinstance(t, hl.MatrixTable) From 7462733e7b23f4191c90c4262ac728f317c79c4c Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Fri, 19 Jan 2024 21:52:27 -0500 Subject: [PATCH 30/31] small formatting --- gnomad/utils/filtering.py | 3 ++- gnomad/utils/vep.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py index aae50a70b..21b52dffe 100644 --- a/gnomad/utils/filtering.py +++ b/gnomad/utils/filtering.py @@ -410,7 +410,8 @@ def filter_to_gencode_cds( filtering to CDS intervals by transcript, an additional step must be taken. For example, when filtering VEP transcript consequences, there may be cases where a variant is retained with this filter, but is considered outside the - CDS intervals of some of the transcripts in the VEP annotation. + CDS intervals of the transcript per the VEP predicted consequence of the + variant. :param t: Input Table/MatrixTable to filter. :param gencode_ht: Gencode Table to use for filtering the input Table/MatrixTable diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 4802c3f2f..53cc44ff0 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -460,7 +460,8 @@ def filter_vep_to_gene_list( all variants that are annotated to the gene, including ['upstream_gene_variant', 'downstream_gene_variant'], which will not be the same as if you filter to a gene interval. If you only want variants inside - certain gene boundaries and a faster filter, you can first filter `t` to an interval list and then apply this filter. + certain gene boundaries and a faster filter, you can first filter `t` to an + interval list and then apply this filter. :param t: Input Table or MatrixTable. :param genes: Genes of interest to filter VEP transcript consequences to. From bcbc43325d92e1dbaa998782b23a2cb75c127ebd Mon Sep 17 00:00:00 2001 From: KoalaQin Date: Fri, 19 Jan 2024 22:20:40 -0500 Subject: [PATCH 31/31] small formatting * 2 --- gnomad/utils/vep.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 53cc44ff0..ed1d767cf 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -761,7 +761,8 @@ def filter_vep_transcript_csqs( Default is True. :param match_by_gene_symbol: Whether to match values in `genes` to VEP transcript consequences by 'gene_symbol' instead of 'gene_id'. Default is False. - :param additional_filtering_criteria: Optional list of additional filtering criteria to apply to the VEP transcript consequences. + :param additional_filtering_criteria: Optional list of additional filtering + criteria to apply to the VEP transcript consequences. :return: Table or MatrixTable filtered to specified criteria. """ if not synonymous and not (canonical or mane_select) and not filter_empty_csq: