Merge pull request #651 from broadinstitute/qh/tx_annotate_mt

Functions to process, filter, annotate and aggregate variants by transcript expression (get the pext scores per variant)
broadinstitute · Jan 20, 2024 · bea49f8 · bea49f8
2 parents c310157 + bcbc433
commit bea49f8
Show file tree

Hide file tree

Showing 6 changed files with 456 additions and 36 deletions.
diff --git a/gnomad/resources/grch37/reference_data.py b/gnomad/resources/grch37/reference_data.py
@@ -7,28 +7,11 @@
     GnomadPublicTableResource,
     VersionedMatrixTableResource,
     VersionedTableResource,
+    import_gencode,
     import_sites_vcf,
 )
 
 
-def _import_gencode(gtf_path: str, **kwargs) -> hl.Table:
-    """
-    Import GENCODE annotations GTF file as a Hail Table.
-
-    :param gtf_path: Path to GENCODE GTF file.
-    :return: Table with GENCODE annotation information.
-    """
-    ht = hl.experimental.import_gtf(gtf_path, **kwargs)
-
-    # Only get gene and transcript stable IDs (without version numbers if they
-    # exist), early versions of GENCODE have no version numbers but later ones do.
-    ht = ht.annotate(
-        gene_id=ht.gene_id.split("\\.")[0],
-        transcript_id=ht.transcript_id.split("\\.")[0],
-    )
-    return ht
-
-
 def _import_gtex_rsem(gtex_path: str, meta_path: str, **kwargs) -> hl.MatrixTable:
     """
     Import GTEx RSEM data from expression data and sample attributes file.
@@ -377,7 +360,7 @@ def get_truth_ht() -> hl.Table:
     versions={
         "v19": GnomadPublicTableResource(
             path="gs://gnomad-public-requester-pays/resources/grch37/gencode/gencode.v19.annotation.ht",
-            import_func=_import_gencode,
+            import_func=import_gencode,
             import_args={
                 "gtf_path": "gs://gcp-public-data--gnomad/resources/grch37/gencode/gencode.v19.annotation.gtf.gz",
                 "reference_genome": "GRCh37",

diff --git a/gnomad/resources/grch38/reference_data.py b/gnomad/resources/grch38/reference_data.py
@@ -10,6 +10,7 @@
     GnomadPublicTableResource,
     VersionedMatrixTableResource,
     VersionedTableResource,
+    import_gencode,
     import_sites_vcf,
 )
 from gnomad.utils.vep import vep_or_lookup_vep
@@ -383,3 +384,20 @@ def get_truth_ht() -> Table:
         .repartition(200, shuffle=False)
         .persist()
     )
+
+
+gencode = VersionedTableResource(
+    default_version="v39",
+    versions={
+        "v39": GnomadPublicTableResource(
+            path="gs://gnomad-public-requester-pays/resources/grch38/gencode/gencode.v39.annotation.ht",
+            import_func=import_gencode,
+            import_args={
+                "gtf_path": "gs://gcp-public-data--gnomad/resources/grch38/gencode/gencode.v39.annotation.gtf.gz",
+                "reference_genome": "GRCh38",
+                "force_bgz": True,
+                "min_partitions": 100,
+            },
+        ),
+    },
+)
diff --git a/gnomad/resources/resource_utils.py b/gnomad/resources/resource_utils.py
@@ -687,3 +687,21 @@ class DataException(Exception):  # noqa: D101
 def import_sites_vcf(**kwargs) -> hl.Table:
     """Import site-level data from a VCF into a Hail Table."""
     return hl.import_vcf(**kwargs).rows()
+
+
+def import_gencode(gtf_path: str, **kwargs) -> hl.Table:
+    """
+    Import GENCODE annotations GTF file as a Hail Table.
+
+    :param gtf_path: Path to GENCODE GTF file.
+    :return: Table with GENCODE annotation information.
+    """
+    ht = hl.experimental.import_gtf(gtf_path, **kwargs)
+
+    # Only get gene and transcript stable IDs (without version numbers if they
+    # exist), early versions of GENCODE have no version numbers but later ones do.
+    ht = ht.annotate(
+        gene_id=ht.gene_id.split("\\.")[0],
+        transcript_id=ht.transcript_id.split("\\.")[0],
+    )
+    return ht
diff --git a/gnomad/utils/filtering.py b/gnomad/utils/filtering.py
@@ -383,6 +383,74 @@ def filter_to_clinvar_pathogenic(
     return t
 
 
+def filter_to_gencode_cds(
+    t: Union[hl.MatrixTable, hl.Table], gencode_ht: Optional[hl.Table] = None
+) -> hl.Table:
+    """
+    Filter a Table/MatrixTable to only Gencode CDS regions in protein coding transcripts.
+
+    Example use:
+
+    .. code-block:: python
+
+        from gnomad.resources.grch37.reference_data import gencode
+        gencode_ht = gencode.ht()
+        gencode_ht = filter_gencode_to_cds(gencode_ht)
+
+    .. note::
+
+        If no Gencode Table is provided, the default version of the Gencode Table
+        resource for the genome build of the input Table/MatrixTable will be used.
+
+    .. warning::
+
+        This Gencode CDS interval filter does not take into account the
+        transcript_id, it filters to any locus that is found in a CDS interval for
+        any protein coding transcript. Therefore, if downstream analyses require
+        filtering to CDS intervals by transcript, an additional step must be taken.
+        For example, when filtering VEP transcript consequences, there may be cases
+        where a variant is retained with this filter, but is considered outside the
+        CDS intervals of the transcript per the VEP predicted consequence of the
+        variant.
+
+    :param t: Input Table/MatrixTable to filter.
+    :param gencode_ht: Gencode Table to use for filtering the input Table/MatrixTable
+        to CDS regions. Default is None, which will use the default version of the
+        Gencode Table resource.
+    :return: Table/MatrixTable filtered to loci in Gencode CDS intervals.
+    """
+    if gencode_ht is None:
+        build = get_reference_genome(t.locus).name
+        if build == "GRCh37":
+            from gnomad.resources.grch37.reference_data import gencode
+        elif build == "GRCh38":
+            from gnomad.resources.grch38.reference_data import gencode
+        else:
+            raise ValueError(f"Unsupported reference genome build: {build}")
+
+        logger.info(
+            "No Gencode Table was supplied, using Gencode version %s",
+            gencode.default_version,
+        )
+        gencode_ht = gencode.ht()
+
+    gencode_ht = gencode_ht.filter(
+        (gencode_ht.feature == "CDS") & (gencode_ht.transcript_type == "protein_coding")
+    )
+    logger.warning(
+        "This Gencode CDS interval filter does not filter by transcript! Please see the"
+        " documentation for more details to confirm it's being used as intended."
+    )
+    filter_expr = hl.is_defined(gencode_ht[t.locus])
+
+    if isinstance(t, hl.MatrixTable):
+        t = t.filter_rows(filter_expr)
+    else:
+        t = t.filter(filter_expr)
+
+    return t
+
+
 def remove_fields_from_constant(
     constant: List[str], fields_to_remove: List[str]
 ) -> List[str]:

diff --git a/gnomad/utils/transcript_annotation.py b/gnomad/utils/transcript_annotation.py
@@ -4,6 +4,15 @@
 
 import hail as hl
 
+from gnomad.utils.filtering import filter_to_gencode_cds
+from gnomad.utils.vep import (
+    CSQ_CODING,
+    CSQ_SPLICE,
+    explode_by_vep_annotation,
+    filter_vep_transcript_csqs,
+    process_consequences,
+)
+
 logging.basicConfig(
     format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
     datefmt="%m/%d/%Y %I:%M:%S %p",
@@ -205,3 +214,211 @@ def tissue_expression_ht_to_array(
         )
 
     return ht
+
+
+def tx_filter_variants_by_csqs(
+    ht: hl.Table,
+    filter_to_cds: bool = True,
+    gencode_ht: Optional[hl.Table] = None,
+    filter_to_genes: Optional[List[str]] = None,
+    match_by_gene_symbol: bool = False,
+    filter_to_csqs: Optional[List[str]] = None,
+    ignore_splicing: bool = True,
+    filter_to_protein_coding: bool = True,
+    vep_root: str = "vep",
+) -> hl.Table:
+    """
+    Prepare a Table of variants with VEP transcript consequences for annotation.
+
+    .. note::
+
+        When `filter_to_cds` is set to True, the returned Table will be further
+        filtered by defined 'amino_acids' annotation, which is to filter out certain
+        consequences, such as 'stop_retained_variant', that are kept by all CDS
+        intervals but don't belong to CDS of the transcript they fall on.
+
+    :param ht: Table of variants with 'vep' annotations.
+    :param gencode_ht: Optional Gencode resource Table containing CDS interval
+        information. This is only used when `filter_to_cds` is set to True. Default is
+        None, which will use the default version of the Gencode Table resource for
+        the reference build of the input Table `ht`.
+    :param filter_to_cds: Whether to filter to CDS regions. Default is True. And it
+        will be further filtered by defined 'amino_acids' annotation.
+    :param filter_to_genes: Optional list of genes to filter to. Default is None.
+    :param match_by_gene_symbol: Whether to match by gene symbol instead of gene ID.
+        Default is False.
+    :param filter_to_csqs: Optional list of consequences to filter to. Default is None.
+    :param ignore_splicing: If True, ignore splice consequences. Default is True.
+    :param filter_to_protein_coding: Whether to filter to protein coding transcripts.
+        Default is True.
+    :param vep_root: Name used for root VEP annotation. Default is 'vep'.
+    :return: Table of variants with preprocessed/filtered transcript consequences
+        prepared for annotation.
+    """
+    additional_filtering_criteria = None
+    if filter_to_cds:
+        logger.info("Filtering to CDS regions...")
+        ht = filter_to_gencode_cds(ht, gencode_ht=gencode_ht)
+        additional_filtering_criteria = [
+            lambda csq: hl.is_defined(csq.amino_acids) & (csq.amino_acids != "*")
+        ]
+
+    keep_csqs = True
+    if ignore_splicing:
+        if filter_to_csqs is not None:
+            filter_to_csqs = [csq for csq in filter_to_csqs if csq not in CSQ_SPLICE]
+        else:
+            filter_to_csqs = CSQ_SPLICE
+            keep_csqs = False
+
+    if filter_to_csqs is not None:
+        logger.info("Adding most severe consequence to VEP transcript consequences...")
+        ht = process_consequences(ht, vep_root=vep_root)
+
+    return filter_vep_transcript_csqs(
+        ht,
+        vep_root=vep_root,
+        synonymous=False,
+        canonical=False,
+        protein_coding=filter_to_protein_coding,
+        csqs=filter_to_csqs,
+        keep_csqs=keep_csqs,
+        genes=filter_to_genes,
+        match_by_gene_symbol=match_by_gene_symbol,
+        additional_filtering_criteria=additional_filtering_criteria,
+    )
+
+
+def tx_annotate_variants(
+    ht: hl.Table,
+    tx_ht: hl.Table,
+    tissues_to_filter: Optional[List[str]] = None,
+    vep_root: str = "vep",
+    vep_annotation: str = "transcript_consequences",
+) -> hl.Table:
+    """
+    Annotate variants with transcript-based expression values or expression proportion from GTEx.
+
+    :param ht: Table of variants to annotate, it should contain the nested fields:
+        `{vep_root}.{vep_annotation}`.
+    :param tx_ht: Table of transcript expression information.
+    :param tissues_to_filter: Optional list of tissues to exclude from the output.
+        Default is None.
+    :param vep_root: Name used for root VEP annotation. Default is 'vep'.
+    :param vep_annotation: Name of annotation under vep_root, one of the processed
+        consequences: ["transcript_consequences", "worst_csq_by_gene",
+        "worst_csq_for_variant", "worst_csq_by_gene_canonical",
+        "worst_csq_for_variant_canonical"]. For example, if you want to annotate
+        each variant with the worst consequence in each gene it falls on and the
+        transcript expression, you would use "worst_csq_by_gene". Default is
+        "transcript_consequences".
+    :return: Input Table with transcript expression information annotated.
+    """
+    # Filter to tissues of interest.
+    tx_ht = filter_expression_ht_by_tissues(tx_ht, tissues_to_filter=tissues_to_filter)
+    tissues = list(tx_ht.row_value)
+
+    # Calculate the mean expression proportion across all tissues.
+    tx_ht = tx_ht.annotate(
+        exp_prop_mean=hl.mean([tx_ht[t].expression_proportion for t in tissues])
+    )
+
+    # Explode the processed transcript consequences to be able to key by
+    # transcript ID.
+    ht = explode_by_vep_annotation(ht, vep_annotation=vep_annotation, vep_root=vep_root)
+    ht = ht.transmute(
+        **ht[vep_annotation],
+        **tx_ht[ht[vep_annotation].transcript_id, ht[vep_annotation].gene_id],
+    )
+    ht = ht.annotate_globals(tissues=tissues)
+
+    return ht
+
+
+def tx_aggregate_variants(
+    ht: hl.Table,
+    additional_group_by: Optional[Union[Tuple[str], List[str]]] = (
+        "alleles",
+        "gene_symbol",
+        "most_severe_consequence",
+        "lof",
+        "lof_flags",
+    ),
+) -> hl.Table:
+    """
+    Aggregate transcript-based expression values or expression proportion from GTEx.
+
+    :param ht: Table of variants annotated with transcript expression information.
+    :param additional_group_by: Optional list of additional fields to group by before
+        sum aggregation. If None, the returned Table will be grouped by only "locus"
+        and "gene_id" before the sum aggregation.
+    :return: Table of variants with transcript expression information aggregated.
+    """
+    tissues = hl.eval(ht.tissues)
+
+    grouping = ["locus", "gene_id"]
+    if additional_group_by is not None:
+        grouping = grouping + list(additional_group_by)
+
+    # Aggregate the transcript expression information by locus, gene_id and
+    # annotations in additional_group_by.
+    ht = ht.group_by(*grouping).aggregate(
+        exp_prop_mean=hl.agg.sum(ht.exp_prop_mean),
+        **{t: hl.struct(**{a: hl.agg.sum(ht[t][a]) for a in ht[t]}) for t in tissues},
+    )
+
+    # If 'alleles' is in the Table, key by 'locus' and 'alleles'.
+    keys = ["locus"]
+    if "alleles" in ht.row:
+        keys.append("alleles")
+
+    ht = ht.key_by(*keys)
+
+    return ht
+
+
+def perform_tx_annotation_pipeline(
+    ht: hl.Table,
+    tx_ht: hl.Table,
+    tissues_to_filter: Optional[List[str]] = None,
+    vep_root: str = "vep",
+    vep_annotation: str = "transcript_consequences",
+    filter_to_csqs: Optional[List[str]] = CSQ_CODING,
+    additional_group_by: Optional[Union[Tuple[str], List[str]]] = (
+        "alleles",
+        "gene_symbol",
+        "most_severe_consequence",
+        "lof",
+        "lof_flags",
+    ),
+    **kwargs,
+) -> hl.Table:
+    """
+    One-stop usage of `tx_filter_variants_by_csqs`, `tx_annotate_variants` and `tx_aggregate_variants`.
+
+    :param ht: Table of variants to annotate, it should contain the nested fields:
+        `{vep_root}.{vep_annotation}`.
+    :param tx_ht: Table of transcript expression information.
+    :param tissues_to_filter: Optional list of tissues to exclude from the output.
+    :param vep_root: Name used for root VEP annotation. Default is 'vep'.
+    :param vep_annotation: Name of annotation under vep_root. Default is
+        'transcript_consequences'.
+    :param filter_to_csqs: Optional list of consequences to filter to. Default is None.
+    :param additional_group_by: Optional list of additional fields to group by before
+        sum aggregation. If None, the returned Table will be grouped by only "locus"
+        and "gene_id" before the sum aggregation.
+    :return: Table of variants with transcript expression information aggregated.
+    """
+    tx_ht = tx_annotate_variants(
+        tx_filter_variants_by_csqs(
+            ht, vep_root=vep_root, filter_to_csqs=filter_to_csqs, **kwargs
+        ),
+        tx_ht,
+        tissues_to_filter=tissues_to_filter,
+        vep_root=vep_root,
+        vep_annotation=vep_annotation,
+    )
+
+    tx_ht = tx_aggregate_variants(tx_ht, additional_group_by=additional_group_by)
+
+    return tx_ht