Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add vep versions #282

Merged
merged 10 commits into from
Jan 20, 2021
9 changes: 9 additions & 0 deletions gnomad/resources/grch37/reference_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,15 @@
)

# Versioned resources: versions should be listed from most recent to oldest
vep_context = VersionedTableResource(
default_version="85",
versions={
"85": TableResource(
path="gs://gnomad-public-requester-pays/resources/context/grch37_context_vep_annotated.ht",
)
},
)

dbsnp = VersionedTableResource(
default_version="20180423",
versions={
Expand Down
9 changes: 9 additions & 0 deletions gnomad/resources/grch38/reference_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,15 @@ def _import_clinvar(**kwargs) -> hl.Table:
)

# Versioned resources: versions should be listed from most recent to oldest
vep_context = VersionedTableResource(
default_version="95",
versions={
"95": TableResource(
path="gs://gnomad-public-requester-pays/resources/context/grch38_context_vep_annotated.ht",
)
},
)

syndip = VersionedMatrixTableResource(
default_version="20180222",
versions={
Expand Down
116 changes: 83 additions & 33 deletions gnomad/utils/vep.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,16 @@
import json
import logging
import os
import subprocess
from typing import Union

import hail as hl
from gnomad.resources.resource_utils import DataException

from gnomad.resources.resource_utils import VersionedTableResource

logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Note that this is the current as of v81 with some included for backwards compatibility (VEP <= 75)
CSQ_CODING_HIGH_IMPACT = [
Expand Down Expand Up @@ -62,17 +71,10 @@
+ CSQ_NON_CODING
)

VEP_REFERENCE_DATA = {
"GRCh37": {
"vep_config": "file:///vep_data/vep-gcloud.json",
"all_possible": "gs://gnomad-public-requester-pays/resources/context/grch37_context_vep_annotated.ht",
},
"GRCh38": {
"vep_config": "file:///vep_data/vep-gcloud.json",
"all_possible": "gs://gnomad-public-requester-pays/resources/context/grch38_context_vep_annotated.ht",
},
}

VEP_CONFIG_PATH = "file:///vep_data/vep-gcloud.json"
"""
Constant that contains the local path to the VEP config file
"""

VEP_CSQ_FIELDS = "Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|ALLELE_NUM|DISTANCE|STRAND|VARIANT_CLASS|MINIMISED|SYMBOL_SOURCE|HGNC_ID|CANONICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|GENE_PHENO|SIFT|PolyPhen|DOMAINS|HGVS_OFFSET|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|LoF|LoF_filter|LoF_flags|LoF_info"
"""
Expand All @@ -85,34 +87,59 @@
"""


def vep_context_ht_path(ref: str = "GRCh37"):
if ref not in VEP_REFERENCE_DATA.keys():
raise DataException(
"Select reference as one of: {}".format(",".join(VEP_REFERENCE_DATA.keys()))
)
return VEP_REFERENCE_DATA[ref]["all_possible"]
def get_vep_help():
"""
Returns the output of vep --help which includes the VEP version.

.. warning::
This function will work for Dataproc clusters created with `hailctl dataproc start --vep`.
It assumes that the command is `/path/to/vep`.
"""
with hl.hadoop_open(os.environ["VEP_CONFIG_URI"]) as vep_config_file:
vep_config = json.load(vep_config_file)
vep_command = vep_config["command"]
vep_help = subprocess.check_output([vep_command[0]]).decode("utf-8")
return vep_help

def vep_config_path(ref: str = "GRCh37"):
if ref not in VEP_REFERENCE_DATA.keys():
raise DataException(
"Select reference as one of: {}".format(",".join(VEP_REFERENCE_DATA.keys()))
)
return VEP_REFERENCE_DATA[ref]["vep_config"]

def get_vep_context(ref: str = "GRCh37") -> VersionedTableResource:
jkgoodrich marked this conversation as resolved.
Show resolved Hide resolved
"""
Get VEP context resource for the genome build `ref`

def vep_or_lookup_vep(ht, reference_vep_ht=None, reference=None, vep_config=None):
:param ref: Genome build
:return: VEPed context resource
"""
import gnomad.resources.grch37.reference_data as grch37
import gnomad.resources.grch38.reference_data as grch38

vep_context = grch37.vep_context if ref == "GRCh37" else grch38.vep_context
jkgoodrich marked this conversation as resolved.
Show resolved Hide resolved
return vep_context


def vep_or_lookup_vep(
ht, reference_vep_ht=None, reference=None, vep_config_path=None, vep_version=None
):
"""
VEP a table, or lookup variants in a reference database

:param ht: Input Table
:param reference_vep_ht: A reference database with VEP annotations (must be in top-level `vep`)
:param reference: If reference_vep_ht is not specified, find a suitable one in reference (if None, grabs from hl.default_reference)
:param vep_config: vep_config to pass to hl.vep (if None, a suitable one for `reference` is chosen)
:return: VEPped Table
:param vep_config_path: vep_config to pass to hl.vep (if None, a suitable one for `reference` is chosen)
:param vep_version: Version of VEPed context Table to use (if None, the default `vep_context` resource will be used)
:return: VEPed Table
"""
vep_help = get_vep_help()
jkgoodrich marked this conversation as resolved.
Show resolved Hide resolved

if reference is None:
reference = hl.default_reference().name

if vep_config_path is None:
vep_config_path = VEP_CONFIG_PATH

with hl.hadoop_open(vep_config_path) as vep_config_file:
vep_config = vep_config_file.read()

if reference_vep_ht is None:

possible_refs = ("GRCh37", "GRCh38")
Expand All @@ -121,17 +148,40 @@ def vep_or_lookup_vep(ht, reference_vep_ht=None, reference=None, vep_config=None
f'vep_or_lookup_vep got {reference}. Expected one of {", ".join(possible_refs)}'
)

reference_vep_ht = hl.read_table(vep_context_ht_path(reference))
vep_context = get_vep_context(reference)
if vep_version is None:
vep_version = vep_context.default_version

if vep_version not in vep_context.versions:
logger.warning(
f"No VEPed context Table available for genome build {reference} and VEP version {vep_version}, "
f"all variants will be VEPed using the following VEP:\n{vep_help}"
)
return hl.vep(ht, vep_config_path)

logger.info(
f"Using VEPed context Table from genome build {reference} and VEP version {vep_version}"
)

reference_vep_ht = vep_context.versions[vep_version].ht()
vep_context_help = hl.eval(reference_vep_ht.vep_help)
vep_context_config = hl.eval(reference_vep_ht.vep_config)
jkgoodrich marked this conversation as resolved.
Show resolved Hide resolved

assert vep_help == vep_context_help, (
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should these also be checked if a reference_vep_ht is passed? Or do we assume anyone doing so is aware of the requirements?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wasn't going to because then is requires them to add those globals, but happy to force users to add them if you think it would be helpful

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I could go either way. It would be nice to have the guard rails, but it's not obvious how to make a Table compatible with these functions. No objection to leaving them out, though it might be worth a note in the docstring to make sure the reference_vep_ht was generated with the same version of VEP / VEP configuration.

f"The VEP context HT version does not match the version referenced in the VEP config file."
f"\nVEP context:\n{vep_context_help}\n\n VEP config:\n{vep_help}"
)

assert vep_config == vep_context_config, (
f"The VEP context HT configuration does not match the configuration in {vep_config_path}."
f"\nVEP context:\n{vep_context_config}\n\n Current config:\n{vep_config}"
)

ht = ht.annotate(vep=reference_vep_ht[ht.key].vep)

vep_ht = ht.filter(hl.is_defined(ht.vep))
revep_ht = ht.filter(hl.is_missing(ht.vep))

if vep_config is None:
vep_config = vep_config_path(reference)

revep_ht = hl.vep(revep_ht, vep_config)
revep_ht = hl.vep(revep_ht, vep_config_path)

return vep_ht.union(revep_ht)

Expand Down