broadinstitute · KoalaQin · Jan 20, 2024 · Dec 8, 2023 · Dec 8, 2023 · Dec 18, 2023
diff --git a/gnomad/resources/grch38/reference_data.py b/gnomad/resources/grch38/reference_data.py
@@ -383,3 +383,38 @@ def get_truth_ht() -> Table:
         .repartition(200, shuffle=False)
         .persist()
     )
+
+
+def _import_gencode(gtf_path: str, **kwargs) -> hl.Table:
+    """
+    Import GENCODE annotations GTF file as a Hail Table.
+
+    :param gtf_path: Path to GENCODE GTF file.
+    :return: Table with GENCODE annotation information.
+    """
+    ht = hl.experimental.import_gtf(gtf_path, **kwargs)
+
+    # Only get gene and transcript stable IDs (without version numbers if they
+    # exist), early versions of GENCODE have no version numbers but later ones do.
+    ht = ht.annotate(
+        gene_id=ht.gene_id.split("\\.")[0],
+        transcript_id=ht.transcript_id.split("\\.")[0],
+    )
+    return ht
+
+
+gencode = VersionedTableResource(
+    default_version="v39",
+    versions={
+        "v39": GnomadPublicTableResource(
+            path="gs://gnomad-public-requester-pays/resources/grch38/gencode/gencode.v39.annotation.ht",
+            import_func=_import_gencode,
+            import_args={
+                "gtf_path": "gs://gcp-public-data--gnomad/resources/grch38/gencode/gencode.v39.annotation.gtf.gz",
+                "reference_genome": "GRCh38",
+                "force_bgz": True,
+                "min_partitions": 10,
+            },
+        ),
+    },
+)
diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py
@@ -383,6 +383,57 @@ def filter_to_clinvar_pathogenic(
     return t
 
 
+def filter_to_gencode_cds(
+    t: Union[hl.MatrixTable, hl.Table], gencode_ht: Optional[hl.Table] = None
+) -> hl.Table:
+    """
+    Filter a Table/MatrixTable to only Gencode CDS regions in protein coding transcripts.
+
+    Example use:
+
+    .. code-block:: python
+
+        from gnomad.resources.grch37.reference_data import gencode
+        gencode_ht = gencode.ht()
+        gencode_ht = filter_gencode_to_cds(gencode_ht)
+
+    .. note::
+
+        If no Gencode Table is provided, the default version of the Gencode Table
+        resource for the genome build of the input Table/MatrixTable will be used.
+
+    :param t: Input Table/MatrixTable to filter.
+    :param gencode_ht: Gencode Table to use for filtering the input Table/MatrixTable
+        to CDS regions. Default is None, which will use the default version of the
+        Gencode Table resource.
+    :return: Table/MatrixTable filtered to loci in Gencode CDS intervals.
+    """
+    if gencode_ht is None:
+        build = get_reference_genome(t.locus).name
+        if build == "GRCh37":
+            from gnomad.resources.grch37.reference_data import gencode
+        elif build == "GRCh38":
+            from gnomad.resources.grch38.reference_data import gencode
+
+        logger.info(
+            "No Gencode Table was supplied, using Gencode version %s",
+            gencode.default_version,
+        )
+        gencode_ht = gencode.ht()
+
+    gencode_ht = gencode_ht.filter(
+        (gencode_ht.feature == "CDS") & (gencode_ht.transcript_type == "protein_coding")
+    )
+    filter_expr = hl.is_defined(gencode_ht[t.locus])
+
+    if isinstance(t, hl.MatrixTable):
+        t = t.filter_rows(filter_expr)
+    else:
+        t = t.filter(filter_expr)
+
+    return t
+
+
 def remove_fields_from_constant(
     constant: List[str], fields_to_remove: List[str]
 ) -> List[str]:

diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py
@@ -4,6 +4,15 @@
 
 import hail as hl
 
+from gnomad.utils.filtering import filter_to_gencode_cds
+from gnomad.utils.vep import (
+    CSQ_CODING,
+    CSQ_SPLICE,
+    explode_by_vep_annotation,
+    filter_vep_transcript_csqs,
+    process_consequences,
+)
+
 logging.basicConfig(
     format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
     datefmt="%m/%d/%Y %I:%M:%S %p",
@@ -205,3 +214,197 @@ def tissue_expression_ht_to_array(
         )
 
     return ht
+
+
+def preprocess_variants_for_tx(
+    ht: hl.Table,
+    filter_to_cds: bool = True,
+    gencode_ht: Optional[hl.Table] = None,
+    filter_to_genes: Optional[List[str]] = None,
+    match_by_gene_symbol: bool = False,
+    filter_to_csqs: Optional[List[str]] = None,
+    ignore_splicing: bool = True,
+    filter_to_protein_coding: bool = True,
+    vep_root: str = "vep",
+) -> hl.Table:
+    """
+    Prepare a Table of variants with vep transcript consequences for annotation.
+
+    :param ht: Table of variants with 'vep' annotations.
+    :param gencode_ht: Optional Gencode resource Table containing CDS interval
+        information. Default is None, which will use the default version of the Gencode
+        Table resource for the reference build of the input Table `ht`.
+    :param filter_to_cds: Whether to filter to CDS regions. Default is True.
+    :param filter_to_genes: Optional list of genes to filter to. Default is None.
+    :param match_by_gene_symbol: Whether to match by gene symbol instead of gene ID.
+        Default is False.
+    :param filter_to_csqs: Optional list of consequences to filter to. Default is None.
+    :param ignore_splicing: If True, ignore splice variants. Default is True.
+    :param filter_to_protein_coding: Whether to filter to protein coding transcripts.
+        Default is True.
+    :param vep_root: Name used for root VEP annotation. Default is 'vep'.
+    :return: Table of variants with preprocessed/filtered transcript consequences
+        prepared for annotation.
+    """
+    if filter_to_cds:
+        logger.info("Filtering to CDS regions...")
+        ht = filter_to_gencode_cds(ht, gencode_ht=gencode_ht)
+
+    keep_csqs = True
+    if ignore_splicing:
+        if filter_to_csqs is not None:
+            filter_to_csqs = [csq for csq in filter_to_csqs if csq not in CSQ_SPLICE]
+        else:
+            filter_to_csqs = CSQ_SPLICE
+            keep_csqs = False
+
+    if filter_to_csqs is not None:
+        logger.info("Adding most severe consequence to VEP transcript consequences...")
+        ht = process_consequences(ht, vep_root=vep_root)
+
+    return filter_vep_transcript_csqs(
+        ht,
+        vep_root=vep_root,
+        synonymous=False,
+        canonical=False,
+        protein_coding=filter_to_protein_coding,
+        csqs=filter_to_csqs,
+        keep_csqs=keep_csqs,
+        genes=filter_to_genes,
+        match_by_gene_symbol=match_by_gene_symbol,
+    )
+
+
+def tx_annotate_variants(
+    ht: hl.Table,
+    tx_ht: hl.Table,
+    tissues_to_filter: Optional[List[str]] = None,
+    vep_root: str = "vep",
+    vep_annotation: str = "transcript_consequences",
+) -> hl.Table:
+    """
+    Annotate variants with transcript-based expression values or expression proportion from GTEx.
+
+    :param ht: Table of variants to annotate, it should contain at least the following
+        nested fields: `vep.transcript_consequences`, `freq`.
+    :param tx_ht: Table of transcript expression information.
+    :param tissues_to_filter: Optional list of tissues to exclude from the output.
+    :param vep_root: Name used for root VEP annotation. Default is 'vep'.
+    :param vep_annotation: Name of annotation in 'vep' annotation,
+        one of the processed consequences: ["transcript_consequences",
+        "worst_csq_by_gene", "worst_csq_for_variant",
+        "worst_csq_by_gene_canonical", "worst_csq_for_variant_canonical"].
+        For example, if you want to annotate each variant with the worst
+        consequence in each gene it falls on and the transcript expression,
+        you would use "worst_csq_by_gene". Default is "transcript_consequences".
+    :return: Input Table with transcript expression information annotated.
+    """
+    # Filter to tissues of interest.
+    tx_ht = filter_expression_ht_by_tissues(tx_ht, tissues_to_filter=tissues_to_filter)
+    tissues = list(tx_ht.row_value)
+
+    # Calculate the mean expression proportion across all tissues.
+    tx_ht = tx_ht.annotate(
+        exp_prop_mean=hl.mean([tx_ht[t].expression_proportion for t in tissues])
+    )
+
+    # Explode the processed transcript consequences to be able to key by
+    # transcript ID.
+    ht = explode_by_vep_annotation(ht, vep_annotation=vep_annotation, vep_root=vep_root)
+    ht = ht.transmute(
+        **ht[vep_annotation],
+        **tx_ht[ht[vep_annotation].transcript_id, ht[vep_annotation].gene_id],
+    )
+    ht = ht.annotate_globals(tissues=tissues)
+
+    return ht
+
+
+def tx_aggregate_variants(
+    ht: hl.Table,
+    additional_group_by: Optional[Union[Tuple[str], List[str]]] = (
+        "alleles",
+        "gene_symbol",
+        "most_severe_consequence",
+        "lof",
+        "lof_flags",
+    ),
+) -> hl.Table:
+    """
+    Aggregate transcript-based expression values or expression proportion from GTEx.
+
+    :param ht: Table of variants annotated with transcript expression information.
+    :param additional_group_by: Optional list of additional fields to group by before
+        sum aggregation. If None, the returned Table will be grouped by only "locus"
+        and "gene_id" before the sum aggregation.
+    :return: Table of variants with transcript expression information aggregated.
+    """
+    tissues = hl.eval(ht.tissues)
+
+    grouping = ["locus", "gene_id"]
+    if additional_group_by is not None:
+        grouping = grouping + list(additional_group_by)
+
+    # Aggregate the transcript expression information by locus, gene_id and
+    # annotations in additional_group_by.
+    ht = ht.group_by(*grouping).aggregate(
+        exp_prop_mean=hl.agg.sum(ht.exp_prop_mean),
+        **{t: hl.struct(**{a: hl.agg.sum(ht[t][a]) for a in ht[t]}) for t in tissues},
+    )
+
+    # If 'alleles' is in the Table, key by 'locus' and 'alleles'.
+    keys = ["locus"]
+    if "alleles" in ht.row:
+        keys.append("alleles")
+
+    ht = ht.key_by(*keys)
+
+    return ht
+
+
+def process_annotate_aggregate_variants(
+    ht: hl.Table,
+    tx_ht: hl.Table,
+    tissues_to_filter: Optional[List[str]] = None,
+    vep_root: str = "vep",
+    vep_annotation: str = "transcript_consequences",
+    filter_to_csqs: Optional[List[str]] = CSQ_CODING,
+    additional_group_by: Optional[Union[Tuple[str], List[str]]] = (
+        "alleles",
+        "gene_symbol",
+        "most_severe_consequence",
+        "lof",
+        "lof_flags",
+    ),
+    **kwargs,
+) -> hl.Table:
+    """
+    One-stop usage of preprocess_variants_for_tx, tx_annotate_variants and tx_aggregate_variants.
+
+    :param ht: Table of variants to annotate, it should contain at least the
+         following nested fields: `vep.transcript_consequences`, `freq`.
+    :param tx_ht: Table of transcript expression information.
+    :param tissues_to_filter: Optional list of tissues to exclude from the output.
+    :param vep_root: Name used for root VEP annotation. Default is 'vep'.
+    :param vep_annotation: Name of annotation in 'vep' annotation, refer to the
+        function where it is used for more details.
+    :param filter_to_csqs: Optional list of consequences to filter to. Default is None.
+    :param additional_group_by: Optional list of additional fields to group by before
+        sum aggregation. If None, the returned Table will be grouped by only "locus"
+        and "gene_id" before the sum aggregation.
+    :return: Table of variants with transcript expression information aggregated.
+    """
+    tx_ht = tx_annotate_variants(
+        preprocess_variants_for_tx(
+            ht, vep_root=vep_root, filter_to_csqs=filter_to_csqs, **kwargs
+        ),
+        tx_ht,
+        tissues_to_filter=tissues_to_filter,
+        vep_root=vep_root,
+        vep_annotation=vep_annotation,
+    )
+
+    tx_ht = tx_aggregate_variants(tx_ht, additional_group_by=additional_group_by)
+    tx_ht = tx_ht.collect_by_key("tx_annotation")
+
+    return tx_ht