Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Functions to process, filter, annotate and aggregate variants by transcript expression (get the pext scores per variant) #651

Merged
merged 41 commits into from
Jan 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
77279e8
function to annotate variant with transcript expression
KoalaQin Dec 8, 2023
e9cfeb3
Aggregate mean_proportion
KoalaQin Dec 8, 2023
223e130
merge from base branch
KoalaQin Dec 18, 2023
373c83e
integrate pull_out_worst_from_tx_annotatate into the main function
KoalaQin Dec 18, 2023
21b5519
Edit docstring
KoalaQin Dec 19, 2023
0f898e5
Suggested changes to `tx_annotate_variants`
jkgoodrich Dec 19, 2023
ba3a1c4
Merge branch 'qh/tx_annotate_mt' of /~https://github.com/broadinstitute…
jkgoodrich Dec 19, 2023
9e9a283
A little more clean up
jkgoodrich Dec 19, 2023
9e9e60f
Merge pull request #654 from broadinstitute/jg/tx_annotate_mt_suggest…
KoalaQin Dec 20, 2023
1fb32ea
merge changes from base branch
KoalaQin Dec 20, 2023
42714b8
remove redundant content
KoalaQin Dec 20, 2023
5a09617
split into small functions: preprocess, annotate and aggregate
KoalaQin Dec 21, 2023
df86088
add TODOs
KoalaQin Dec 21, 2023
86852b1
add option when 'freq' is not in Table
KoalaQin Dec 21, 2023
81f9106
Merge branch 'qh/get_expression_proportion' of /~https://github.com/bro…
jkgoodrich Dec 22, 2023
0cf1829
Suggestions to tx_annotate_mt
jkgoodrich Dec 26, 2023
a98fc1f
Use `process_consequences` if ignore_splicing is True
jkgoodrich Jan 2, 2024
ff26879
Fix keep_csqs -> keep_genes
jkgoodrich Jan 2, 2024
0cac355
Changes from testing
jkgoodrich Jan 2, 2024
6957027
Merge branch 'qh/get_expression_proportion' of /~https://github.com/bro…
jkgoodrich Jan 2, 2024
1a1fadb
Merge branch 'qh/tx_annotate_mt' of /~https://github.com/broadinstitute…
jkgoodrich Jan 2, 2024
1fdd370
Remove extra tab in docstring
jkgoodrich Jan 2, 2024
0e2aca0
Merge pull request #655 from broadinstitute/jg/tx_annotate_mt_suggest…
KoalaQin Jan 4, 2024
3f0ca81
filter to CDS & redefine grouping
KoalaQin Jan 8, 2024
a7e791d
add CDS filtering in preprocess
KoalaQin Jan 8, 2024
c3e6cf3
merge from base branch
KoalaQin Jan 8, 2024
203f8ad
add filter to CDS function
KoalaQin Jan 8, 2024
259af2b
Merge branch 'main' into qh/tx_annotate_mt
KoalaQin Jan 8, 2024
559c5d8
apply review suggestions
KoalaQin Jan 13, 2024
6f7cdcb
add gencode v39 resource
KoalaQin Jan 13, 2024
e2366ad
add tissues to global
KoalaQin Jan 14, 2024
705be2f
delete annotate tx_ht back to variant ht because key changed
KoalaQin Jan 16, 2024
6f2a4fb
address review comments
KoalaQin Jan 16, 2024
7ce7eaa
add warnings to filter_to_cds and option to filter by amino_acids
KoalaQin Jan 19, 2024
2274f6a
option to filter by valid amino_acids in transcript_consequences
KoalaQin Jan 19, 2024
aa67ca1
address review comments
KoalaQin Jan 19, 2024
09c3a8e
combine filtering criteria with concatenation
KoalaQin Jan 20, 2024
f3764a8
rename pipeline function
KoalaQin Jan 20, 2024
baff663
changes requested by reviewer
KoalaQin Jan 20, 2024
7462733
small formatting
KoalaQin Jan 20, 2024
bcbc433
small formatting * 2
KoalaQin Jan 20, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 2 additions & 19 deletions gnomad/resources/grch37/reference_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,11 @@
GnomadPublicTableResource,
VersionedMatrixTableResource,
VersionedTableResource,
import_gencode,
import_sites_vcf,
)


def _import_gencode(gtf_path: str, **kwargs) -> hl.Table:
"""
Import GENCODE annotations GTF file as a Hail Table.

:param gtf_path: Path to GENCODE GTF file.
:return: Table with GENCODE annotation information.
"""
ht = hl.experimental.import_gtf(gtf_path, **kwargs)

# Only get gene and transcript stable IDs (without version numbers if they
# exist), early versions of GENCODE have no version numbers but later ones do.
ht = ht.annotate(
gene_id=ht.gene_id.split("\\.")[0],
transcript_id=ht.transcript_id.split("\\.")[0],
)
return ht


def _import_gtex_rsem(gtex_path: str, meta_path: str, **kwargs) -> hl.MatrixTable:
"""
Import GTEx RSEM data from expression data and sample attributes file.
Expand Down Expand Up @@ -377,7 +360,7 @@ def get_truth_ht() -> hl.Table:
versions={
"v19": GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/gencode/gencode.v19.annotation.ht",
import_func=_import_gencode,
import_func=import_gencode,
import_args={
"gtf_path": "gs://gcp-public-data--gnomad/resources/grch37/gencode/gencode.v19.annotation.gtf.gz",
"reference_genome": "GRCh37",
Expand Down
18 changes: 18 additions & 0 deletions gnomad/resources/grch38/reference_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
GnomadPublicTableResource,
VersionedMatrixTableResource,
VersionedTableResource,
import_gencode,
import_sites_vcf,
)
from gnomad.utils.vep import vep_or_lookup_vep
Expand Down Expand Up @@ -383,3 +384,20 @@ def get_truth_ht() -> Table:
.repartition(200, shuffle=False)
.persist()
)


gencode = VersionedTableResource(
default_version="v39",
versions={
"v39": GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch38/gencode/gencode.v39.annotation.ht",
import_func=import_gencode,
import_args={
"gtf_path": "gs://gcp-public-data--gnomad/resources/grch38/gencode/gencode.v39.annotation.gtf.gz",
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
"reference_genome": "GRCh38",
"force_bgz": True,
"min_partitions": 100,
},
),
},
)
18 changes: 18 additions & 0 deletions gnomad/resources/resource_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -687,3 +687,21 @@ class DataException(Exception): # noqa: D101
def import_sites_vcf(**kwargs) -> hl.Table:
"""Import site-level data from a VCF into a Hail Table."""
return hl.import_vcf(**kwargs).rows()


def import_gencode(gtf_path: str, **kwargs) -> hl.Table:
"""
Import GENCODE annotations GTF file as a Hail Table.

:param gtf_path: Path to GENCODE GTF file.
:return: Table with GENCODE annotation information.
"""
ht = hl.experimental.import_gtf(gtf_path, **kwargs)

# Only get gene and transcript stable IDs (without version numbers if they
# exist), early versions of GENCODE have no version numbers but later ones do.
ht = ht.annotate(
gene_id=ht.gene_id.split("\\.")[0],
transcript_id=ht.transcript_id.split("\\.")[0],
)
return ht
68 changes: 68 additions & 0 deletions gnomad/utils/filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,74 @@ def filter_to_clinvar_pathogenic(
return t


def filter_to_gencode_cds(
t: Union[hl.MatrixTable, hl.Table], gencode_ht: Optional[hl.Table] = None
) -> hl.Table:
"""
Filter a Table/MatrixTable to only Gencode CDS regions in protein coding transcripts.
Example use:
.. code-block:: python
from gnomad.resources.grch37.reference_data import gencode
gencode_ht = gencode.ht()
gencode_ht = filter_gencode_to_cds(gencode_ht)
.. note::
If no Gencode Table is provided, the default version of the Gencode Table
resource for the genome build of the input Table/MatrixTable will be used.
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
.. warning::
This Gencode CDS interval filter does not take into account the
transcript_id, it filters to any locus that is found in a CDS interval for
any protein coding transcript. Therefore, if downstream analyses require
filtering to CDS intervals by transcript, an additional step must be taken.
For example, when filtering VEP transcript consequences, there may be cases
where a variant is retained with this filter, but is considered outside the
CDS intervals of the transcript per the VEP predicted consequence of the
variant.
:param t: Input Table/MatrixTable to filter.
:param gencode_ht: Gencode Table to use for filtering the input Table/MatrixTable
to CDS regions. Default is None, which will use the default version of the
Gencode Table resource.
:return: Table/MatrixTable filtered to loci in Gencode CDS intervals.
"""
jkgoodrich marked this conversation as resolved.
Show resolved Hide resolved
if gencode_ht is None:
build = get_reference_genome(t.locus).name
if build == "GRCh37":
from gnomad.resources.grch37.reference_data import gencode
elif build == "GRCh38":
jkgoodrich marked this conversation as resolved.
Show resolved Hide resolved
from gnomad.resources.grch38.reference_data import gencode
else:
raise ValueError(f"Unsupported reference genome build: {build}")

logger.info(
"No Gencode Table was supplied, using Gencode version %s",
gencode.default_version,
)
gencode_ht = gencode.ht()

gencode_ht = gencode_ht.filter(
(gencode_ht.feature == "CDS") & (gencode_ht.transcript_type == "protein_coding")
)
logger.warning(
"This Gencode CDS interval filter does not filter by transcript! Please see the"
" documentation for more details to confirm it's being used as intended."
)
filter_expr = hl.is_defined(gencode_ht[t.locus])

if isinstance(t, hl.MatrixTable):
t = t.filter_rows(filter_expr)
else:
t = t.filter(filter_expr)

return t


def remove_fields_from_constant(
constant: List[str], fields_to_remove: List[str]
) -> List[str]:
Expand Down
217 changes: 217 additions & 0 deletions gnomad/utils/transcript_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,15 @@

import hail as hl

from gnomad.utils.filtering import filter_to_gencode_cds
from gnomad.utils.vep import (
CSQ_CODING,
CSQ_SPLICE,
explode_by_vep_annotation,
filter_vep_transcript_csqs,
process_consequences,
)

logging.basicConfig(
format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
datefmt="%m/%d/%Y %I:%M:%S %p",
Expand Down Expand Up @@ -205,3 +214,211 @@ def tissue_expression_ht_to_array(
)

return ht


def tx_filter_variants_by_csqs(
ht: hl.Table,
filter_to_cds: bool = True,
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
gencode_ht: Optional[hl.Table] = None,
filter_to_genes: Optional[List[str]] = None,
match_by_gene_symbol: bool = False,
filter_to_csqs: Optional[List[str]] = None,
ignore_splicing: bool = True,
jkgoodrich marked this conversation as resolved.
Show resolved Hide resolved
filter_to_protein_coding: bool = True,
vep_root: str = "vep",
) -> hl.Table:
"""
Prepare a Table of variants with VEP transcript consequences for annotation.

.. note::

When `filter_to_cds` is set to True, the returned Table will be further
filtered by defined 'amino_acids' annotation, which is to filter out certain
consequences, such as 'stop_retained_variant', that are kept by all CDS
intervals but don't belong to CDS of the transcript they fall on.

:param ht: Table of variants with 'vep' annotations.
:param gencode_ht: Optional Gencode resource Table containing CDS interval
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
information. This is only used when `filter_to_cds` is set to True. Default is
None, which will use the default version of the Gencode Table resource for
the reference build of the input Table `ht`.
:param filter_to_cds: Whether to filter to CDS regions. Default is True. And it
will be further filtered by defined 'amino_acids' annotation.
:param filter_to_genes: Optional list of genes to filter to. Default is None.
:param match_by_gene_symbol: Whether to match by gene symbol instead of gene ID.
Default is False.
:param filter_to_csqs: Optional list of consequences to filter to. Default is None.
:param ignore_splicing: If True, ignore splice consequences. Default is True.
:param filter_to_protein_coding: Whether to filter to protein coding transcripts.
Default is True.
:param vep_root: Name used for root VEP annotation. Default is 'vep'.
:return: Table of variants with preprocessed/filtered transcript consequences
prepared for annotation.
"""
additional_filtering_criteria = None
if filter_to_cds:
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
logger.info("Filtering to CDS regions...")
ht = filter_to_gencode_cds(ht, gencode_ht=gencode_ht)
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
additional_filtering_criteria = [
lambda csq: hl.is_defined(csq.amino_acids) & (csq.amino_acids != "*")
]

keep_csqs = True
if ignore_splicing:
if filter_to_csqs is not None:
filter_to_csqs = [csq for csq in filter_to_csqs if csq not in CSQ_SPLICE]
else:
filter_to_csqs = CSQ_SPLICE
keep_csqs = False

if filter_to_csqs is not None:
logger.info("Adding most severe consequence to VEP transcript consequences...")
ht = process_consequences(ht, vep_root=vep_root)

return filter_vep_transcript_csqs(
ht,
vep_root=vep_root,
synonymous=False,
canonical=False,
protein_coding=filter_to_protein_coding,
csqs=filter_to_csqs,
keep_csqs=keep_csqs,
genes=filter_to_genes,
match_by_gene_symbol=match_by_gene_symbol,
additional_filtering_criteria=additional_filtering_criteria,
)


def tx_annotate_variants(
ht: hl.Table,
tx_ht: hl.Table,
tissues_to_filter: Optional[List[str]] = None,
vep_root: str = "vep",
vep_annotation: str = "transcript_consequences",
) -> hl.Table:
"""
Annotate variants with transcript-based expression values or expression proportion from GTEx.

:param ht: Table of variants to annotate, it should contain the nested fields:
`{vep_root}.{vep_annotation}`.
:param tx_ht: Table of transcript expression information.
:param tissues_to_filter: Optional list of tissues to exclude from the output.
Default is None.
:param vep_root: Name used for root VEP annotation. Default is 'vep'.
:param vep_annotation: Name of annotation under vep_root, one of the processed
consequences: ["transcript_consequences", "worst_csq_by_gene",
"worst_csq_for_variant", "worst_csq_by_gene_canonical",
"worst_csq_for_variant_canonical"]. For example, if you want to annotate
each variant with the worst consequence in each gene it falls on and the
transcript expression, you would use "worst_csq_by_gene". Default is
"transcript_consequences".
:return: Input Table with transcript expression information annotated.
"""
# Filter to tissues of interest.
tx_ht = filter_expression_ht_by_tissues(tx_ht, tissues_to_filter=tissues_to_filter)
tissues = list(tx_ht.row_value)

# Calculate the mean expression proportion across all tissues.
tx_ht = tx_ht.annotate(
exp_prop_mean=hl.mean([tx_ht[t].expression_proportion for t in tissues])
)

# Explode the processed transcript consequences to be able to key by
# transcript ID.
ht = explode_by_vep_annotation(ht, vep_annotation=vep_annotation, vep_root=vep_root)
ht = ht.transmute(
**ht[vep_annotation],
**tx_ht[ht[vep_annotation].transcript_id, ht[vep_annotation].gene_id],
)
ht = ht.annotate_globals(tissues=tissues)

return ht


def tx_aggregate_variants(
ht: hl.Table,
additional_group_by: Optional[Union[Tuple[str], List[str]]] = (
"alleles",
"gene_symbol",
"most_severe_consequence",
"lof",
"lof_flags",
),
) -> hl.Table:
"""
Aggregate transcript-based expression values or expression proportion from GTEx.

:param ht: Table of variants annotated with transcript expression information.
:param additional_group_by: Optional list of additional fields to group by before
sum aggregation. If None, the returned Table will be grouped by only "locus"
and "gene_id" before the sum aggregation.
:return: Table of variants with transcript expression information aggregated.
"""
tissues = hl.eval(ht.tissues)

grouping = ["locus", "gene_id"]
if additional_group_by is not None:
grouping = grouping + list(additional_group_by)

# Aggregate the transcript expression information by locus, gene_id and
# annotations in additional_group_by.
ht = ht.group_by(*grouping).aggregate(
exp_prop_mean=hl.agg.sum(ht.exp_prop_mean),
**{t: hl.struct(**{a: hl.agg.sum(ht[t][a]) for a in ht[t]}) for t in tissues},
)

# If 'alleles' is in the Table, key by 'locus' and 'alleles'.
keys = ["locus"]
if "alleles" in ht.row:
keys.append("alleles")

ht = ht.key_by(*keys)

return ht
jkgoodrich marked this conversation as resolved.
Show resolved Hide resolved


def perform_tx_annotation_pipeline(
ht: hl.Table,
tx_ht: hl.Table,
tissues_to_filter: Optional[List[str]] = None,
vep_root: str = "vep",
vep_annotation: str = "transcript_consequences",
filter_to_csqs: Optional[List[str]] = CSQ_CODING,
additional_group_by: Optional[Union[Tuple[str], List[str]]] = (
"alleles",
"gene_symbol",
"most_severe_consequence",
"lof",
"lof_flags",
),
**kwargs,
) -> hl.Table:
"""
One-stop usage of `tx_filter_variants_by_csqs`, `tx_annotate_variants` and `tx_aggregate_variants`.

:param ht: Table of variants to annotate, it should contain the nested fields:
`{vep_root}.{vep_annotation}`.
:param tx_ht: Table of transcript expression information.
:param tissues_to_filter: Optional list of tissues to exclude from the output.
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
:param vep_root: Name used for root VEP annotation. Default is 'vep'.
:param vep_annotation: Name of annotation under vep_root. Default is
'transcript_consequences'.
:param filter_to_csqs: Optional list of consequences to filter to. Default is None.
:param additional_group_by: Optional list of additional fields to group by before
sum aggregation. If None, the returned Table will be grouped by only "locus"
and "gene_id" before the sum aggregation.
:return: Table of variants with transcript expression information aggregated.
"""
tx_ht = tx_annotate_variants(
tx_filter_variants_by_csqs(
ht, vep_root=vep_root, filter_to_csqs=filter_to_csqs, **kwargs
),
tx_ht,
tissues_to_filter=tissues_to_filter,
vep_root=vep_root,
vep_annotation=vep_annotation,
)

tx_ht = tx_aggregate_variants(tx_ht, additional_group_by=additional_group_by)

return tx_ht
Loading