Skip to content

Commit

Permalink
Merge pull request #651 from broadinstitute/qh/tx_annotate_mt
Browse files Browse the repository at this point in the history
Functions to process, filter, annotate and aggregate variants by transcript expression (get the pext scores per variant)
  • Loading branch information
KoalaQin authored Jan 20, 2024
2 parents c310157 + bcbc433 commit bea49f8
Show file tree
Hide file tree
Showing 6 changed files with 456 additions and 36 deletions.
21 changes: 2 additions & 19 deletions gnomad/resources/grch37/reference_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,11 @@
GnomadPublicTableResource,
VersionedMatrixTableResource,
VersionedTableResource,
import_gencode,
import_sites_vcf,
)


def _import_gencode(gtf_path: str, **kwargs) -> hl.Table:
"""
Import GENCODE annotations GTF file as a Hail Table.
:param gtf_path: Path to GENCODE GTF file.
:return: Table with GENCODE annotation information.
"""
ht = hl.experimental.import_gtf(gtf_path, **kwargs)

# Only get gene and transcript stable IDs (without version numbers if they
# exist), early versions of GENCODE have no version numbers but later ones do.
ht = ht.annotate(
gene_id=ht.gene_id.split("\\.")[0],
transcript_id=ht.transcript_id.split("\\.")[0],
)
return ht


def _import_gtex_rsem(gtex_path: str, meta_path: str, **kwargs) -> hl.MatrixTable:
"""
Import GTEx RSEM data from expression data and sample attributes file.
Expand Down Expand Up @@ -377,7 +360,7 @@ def get_truth_ht() -> hl.Table:
versions={
"v19": GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/gencode/gencode.v19.annotation.ht",
import_func=_import_gencode,
import_func=import_gencode,
import_args={
"gtf_path": "gs://gcp-public-data--gnomad/resources/grch37/gencode/gencode.v19.annotation.gtf.gz",
"reference_genome": "GRCh37",
Expand Down
18 changes: 18 additions & 0 deletions gnomad/resources/grch38/reference_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
GnomadPublicTableResource,
VersionedMatrixTableResource,
VersionedTableResource,
import_gencode,
import_sites_vcf,
)
from gnomad.utils.vep import vep_or_lookup_vep
Expand Down Expand Up @@ -383,3 +384,20 @@ def get_truth_ht() -> Table:
.repartition(200, shuffle=False)
.persist()
)


gencode = VersionedTableResource(
default_version="v39",
versions={
"v39": GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch38/gencode/gencode.v39.annotation.ht",
import_func=import_gencode,
import_args={
"gtf_path": "gs://gcp-public-data--gnomad/resources/grch38/gencode/gencode.v39.annotation.gtf.gz",
"reference_genome": "GRCh38",
"force_bgz": True,
"min_partitions": 100,
},
),
},
)
18 changes: 18 additions & 0 deletions gnomad/resources/resource_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -687,3 +687,21 @@ class DataException(Exception): # noqa: D101
def import_sites_vcf(**kwargs) -> hl.Table:
"""Import site-level data from a VCF into a Hail Table."""
return hl.import_vcf(**kwargs).rows()


def import_gencode(gtf_path: str, **kwargs) -> hl.Table:
"""
Import GENCODE annotations GTF file as a Hail Table.
:param gtf_path: Path to GENCODE GTF file.
:return: Table with GENCODE annotation information.
"""
ht = hl.experimental.import_gtf(gtf_path, **kwargs)

# Only get gene and transcript stable IDs (without version numbers if they
# exist), early versions of GENCODE have no version numbers but later ones do.
ht = ht.annotate(
gene_id=ht.gene_id.split("\\.")[0],
transcript_id=ht.transcript_id.split("\\.")[0],
)
return ht
68 changes: 68 additions & 0 deletions gnomad/utils/filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,74 @@ def filter_to_clinvar_pathogenic(
return t


def filter_to_gencode_cds(
t: Union[hl.MatrixTable, hl.Table], gencode_ht: Optional[hl.Table] = None
) -> hl.Table:
"""
Filter a Table/MatrixTable to only Gencode CDS regions in protein coding transcripts.
Example use:
.. code-block:: python
from gnomad.resources.grch37.reference_data import gencode
gencode_ht = gencode.ht()
gencode_ht = filter_gencode_to_cds(gencode_ht)
.. note::
If no Gencode Table is provided, the default version of the Gencode Table
resource for the genome build of the input Table/MatrixTable will be used.
.. warning::
This Gencode CDS interval filter does not take into account the
transcript_id, it filters to any locus that is found in a CDS interval for
any protein coding transcript. Therefore, if downstream analyses require
filtering to CDS intervals by transcript, an additional step must be taken.
For example, when filtering VEP transcript consequences, there may be cases
where a variant is retained with this filter, but is considered outside the
CDS intervals of the transcript per the VEP predicted consequence of the
variant.
:param t: Input Table/MatrixTable to filter.
:param gencode_ht: Gencode Table to use for filtering the input Table/MatrixTable
to CDS regions. Default is None, which will use the default version of the
Gencode Table resource.
:return: Table/MatrixTable filtered to loci in Gencode CDS intervals.
"""
if gencode_ht is None:
build = get_reference_genome(t.locus).name
if build == "GRCh37":
from gnomad.resources.grch37.reference_data import gencode
elif build == "GRCh38":
from gnomad.resources.grch38.reference_data import gencode
else:
raise ValueError(f"Unsupported reference genome build: {build}")

logger.info(
"No Gencode Table was supplied, using Gencode version %s",
gencode.default_version,
)
gencode_ht = gencode.ht()

gencode_ht = gencode_ht.filter(
(gencode_ht.feature == "CDS") & (gencode_ht.transcript_type == "protein_coding")
)
logger.warning(
"This Gencode CDS interval filter does not filter by transcript! Please see the"
" documentation for more details to confirm it's being used as intended."
)
filter_expr = hl.is_defined(gencode_ht[t.locus])

if isinstance(t, hl.MatrixTable):
t = t.filter_rows(filter_expr)
else:
t = t.filter(filter_expr)

return t


def remove_fields_from_constant(
constant: List[str], fields_to_remove: List[str]
) -> List[str]:
Expand Down
217 changes: 217 additions & 0 deletions gnomad/utils/transcript_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,15 @@

import hail as hl

from gnomad.utils.filtering import filter_to_gencode_cds
from gnomad.utils.vep import (
CSQ_CODING,
CSQ_SPLICE,
explode_by_vep_annotation,
filter_vep_transcript_csqs,
process_consequences,
)

logging.basicConfig(
format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
datefmt="%m/%d/%Y %I:%M:%S %p",
Expand Down Expand Up @@ -205,3 +214,211 @@ def tissue_expression_ht_to_array(
)

return ht


def tx_filter_variants_by_csqs(
ht: hl.Table,
filter_to_cds: bool = True,
gencode_ht: Optional[hl.Table] = None,
filter_to_genes: Optional[List[str]] = None,
match_by_gene_symbol: bool = False,
filter_to_csqs: Optional[List[str]] = None,
ignore_splicing: bool = True,
filter_to_protein_coding: bool = True,
vep_root: str = "vep",
) -> hl.Table:
"""
Prepare a Table of variants with VEP transcript consequences for annotation.
.. note::
When `filter_to_cds` is set to True, the returned Table will be further
filtered by defined 'amino_acids' annotation, which is to filter out certain
consequences, such as 'stop_retained_variant', that are kept by all CDS
intervals but don't belong to CDS of the transcript they fall on.
:param ht: Table of variants with 'vep' annotations.
:param gencode_ht: Optional Gencode resource Table containing CDS interval
information. This is only used when `filter_to_cds` is set to True. Default is
None, which will use the default version of the Gencode Table resource for
the reference build of the input Table `ht`.
:param filter_to_cds: Whether to filter to CDS regions. Default is True. And it
will be further filtered by defined 'amino_acids' annotation.
:param filter_to_genes: Optional list of genes to filter to. Default is None.
:param match_by_gene_symbol: Whether to match by gene symbol instead of gene ID.
Default is False.
:param filter_to_csqs: Optional list of consequences to filter to. Default is None.
:param ignore_splicing: If True, ignore splice consequences. Default is True.
:param filter_to_protein_coding: Whether to filter to protein coding transcripts.
Default is True.
:param vep_root: Name used for root VEP annotation. Default is 'vep'.
:return: Table of variants with preprocessed/filtered transcript consequences
prepared for annotation.
"""
additional_filtering_criteria = None
if filter_to_cds:
logger.info("Filtering to CDS regions...")
ht = filter_to_gencode_cds(ht, gencode_ht=gencode_ht)
additional_filtering_criteria = [
lambda csq: hl.is_defined(csq.amino_acids) & (csq.amino_acids != "*")
]

keep_csqs = True
if ignore_splicing:
if filter_to_csqs is not None:
filter_to_csqs = [csq for csq in filter_to_csqs if csq not in CSQ_SPLICE]
else:
filter_to_csqs = CSQ_SPLICE
keep_csqs = False

if filter_to_csqs is not None:
logger.info("Adding most severe consequence to VEP transcript consequences...")
ht = process_consequences(ht, vep_root=vep_root)

return filter_vep_transcript_csqs(
ht,
vep_root=vep_root,
synonymous=False,
canonical=False,
protein_coding=filter_to_protein_coding,
csqs=filter_to_csqs,
keep_csqs=keep_csqs,
genes=filter_to_genes,
match_by_gene_symbol=match_by_gene_symbol,
additional_filtering_criteria=additional_filtering_criteria,
)


def tx_annotate_variants(
ht: hl.Table,
tx_ht: hl.Table,
tissues_to_filter: Optional[List[str]] = None,
vep_root: str = "vep",
vep_annotation: str = "transcript_consequences",
) -> hl.Table:
"""
Annotate variants with transcript-based expression values or expression proportion from GTEx.
:param ht: Table of variants to annotate, it should contain the nested fields:
`{vep_root}.{vep_annotation}`.
:param tx_ht: Table of transcript expression information.
:param tissues_to_filter: Optional list of tissues to exclude from the output.
Default is None.
:param vep_root: Name used for root VEP annotation. Default is 'vep'.
:param vep_annotation: Name of annotation under vep_root, one of the processed
consequences: ["transcript_consequences", "worst_csq_by_gene",
"worst_csq_for_variant", "worst_csq_by_gene_canonical",
"worst_csq_for_variant_canonical"]. For example, if you want to annotate
each variant with the worst consequence in each gene it falls on and the
transcript expression, you would use "worst_csq_by_gene". Default is
"transcript_consequences".
:return: Input Table with transcript expression information annotated.
"""
# Filter to tissues of interest.
tx_ht = filter_expression_ht_by_tissues(tx_ht, tissues_to_filter=tissues_to_filter)
tissues = list(tx_ht.row_value)

# Calculate the mean expression proportion across all tissues.
tx_ht = tx_ht.annotate(
exp_prop_mean=hl.mean([tx_ht[t].expression_proportion for t in tissues])
)

# Explode the processed transcript consequences to be able to key by
# transcript ID.
ht = explode_by_vep_annotation(ht, vep_annotation=vep_annotation, vep_root=vep_root)
ht = ht.transmute(
**ht[vep_annotation],
**tx_ht[ht[vep_annotation].transcript_id, ht[vep_annotation].gene_id],
)
ht = ht.annotate_globals(tissues=tissues)

return ht


def tx_aggregate_variants(
ht: hl.Table,
additional_group_by: Optional[Union[Tuple[str], List[str]]] = (
"alleles",
"gene_symbol",
"most_severe_consequence",
"lof",
"lof_flags",
),
) -> hl.Table:
"""
Aggregate transcript-based expression values or expression proportion from GTEx.
:param ht: Table of variants annotated with transcript expression information.
:param additional_group_by: Optional list of additional fields to group by before
sum aggregation. If None, the returned Table will be grouped by only "locus"
and "gene_id" before the sum aggregation.
:return: Table of variants with transcript expression information aggregated.
"""
tissues = hl.eval(ht.tissues)

grouping = ["locus", "gene_id"]
if additional_group_by is not None:
grouping = grouping + list(additional_group_by)

# Aggregate the transcript expression information by locus, gene_id and
# annotations in additional_group_by.
ht = ht.group_by(*grouping).aggregate(
exp_prop_mean=hl.agg.sum(ht.exp_prop_mean),
**{t: hl.struct(**{a: hl.agg.sum(ht[t][a]) for a in ht[t]}) for t in tissues},
)

# If 'alleles' is in the Table, key by 'locus' and 'alleles'.
keys = ["locus"]
if "alleles" in ht.row:
keys.append("alleles")

ht = ht.key_by(*keys)

return ht


def perform_tx_annotation_pipeline(
ht: hl.Table,
tx_ht: hl.Table,
tissues_to_filter: Optional[List[str]] = None,
vep_root: str = "vep",
vep_annotation: str = "transcript_consequences",
filter_to_csqs: Optional[List[str]] = CSQ_CODING,
additional_group_by: Optional[Union[Tuple[str], List[str]]] = (
"alleles",
"gene_symbol",
"most_severe_consequence",
"lof",
"lof_flags",
),
**kwargs,
) -> hl.Table:
"""
One-stop usage of `tx_filter_variants_by_csqs`, `tx_annotate_variants` and `tx_aggregate_variants`.
:param ht: Table of variants to annotate, it should contain the nested fields:
`{vep_root}.{vep_annotation}`.
:param tx_ht: Table of transcript expression information.
:param tissues_to_filter: Optional list of tissues to exclude from the output.
:param vep_root: Name used for root VEP annotation. Default is 'vep'.
:param vep_annotation: Name of annotation under vep_root. Default is
'transcript_consequences'.
:param filter_to_csqs: Optional list of consequences to filter to. Default is None.
:param additional_group_by: Optional list of additional fields to group by before
sum aggregation. If None, the returned Table will be grouped by only "locus"
and "gene_id" before the sum aggregation.
:return: Table of variants with transcript expression information aggregated.
"""
tx_ht = tx_annotate_variants(
tx_filter_variants_by_csqs(
ht, vep_root=vep_root, filter_to_csqs=filter_to_csqs, **kwargs
),
tx_ht,
tissues_to_filter=tissues_to_filter,
vep_root=vep_root,
vep_annotation=vep_annotation,
)

tx_ht = tx_aggregate_variants(tx_ht, additional_group_by=additional_group_by)

return tx_ht
Loading

0 comments on commit bea49f8

Please sign in to comment.