diff --git a/notebooks/gnomad-gks-v1.ipynb b/notebooks/gnomad-gks-v1.ipynb new file mode 100644 index 00000000..2bc5a32a --- /dev/null +++ b/notebooks/gnomad-gks-v1.ipynb @@ -0,0 +1,1364 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5b5ee5c4", + "metadata": {}, + "source": [ + "This can read from a locally-staged hail table directory, or use one in GCS. The table must have a pre-computed `.info.vrs` field. This is computed using `tgg_methods` `vrs_annotation_batch.py`, which uses the `vrs-python` `vcf_annotation.py`\n", + "\n", + "https://github.com/broadinstitute/tgg_methods/blob/master/vrs/vrs_annotation_batch.py (last checked at `a0002f02fbd5dd25487b261e94081a3daec29c64`)\n", + "\n", + "This uses the gnomad_methods functions `gnomad_gks`, `add_gks_vrs`, and `add_gks_va` to return GKS structures for hail tables that are in the gnomad v3 schema, off the branch in this pull request: https://github.com/broadinstitute/gnomad_methods/pull/556\n", + "\n", + "The JSON schema is in `ga4gh/va-spec` on the `gk-pilot` branch" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "afc240ba", + "metadata": {}, + "outputs": [], + "source": [ + "import hail as hl\n", + "import json\n", + "from typing import List, Tuple\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e4985d6", + "metadata": {}, + "outputs": [], + "source": [ + "# The clingen-public bucket will only be available during the week of 2023-09-18\n", + "# The clingen-public-requesterpays bucket will remain available but the client will need to \n", + "# be authenticated with a Google Cloud account with billing enabled to pay network transfer fees\n", + "\n", + "# configuration for data outputs\n", + "bucket = \"clingen-public-requesterpays\"\n", + "bucket = \"clingen-public\"\n", + "\n", + "# Writes inputs array as a hail table to this destination, if not None.\n", + "# This can be useful for other testing, using this hail table as input without 
reconstructing it\n", + "# inputs_ht_destination_url = f\"gs://{bucket}/gnomad-gks-qc/gnomad-filtered-inputs.ht\"\n", + "inputs_ht_destination_url = None\n", + "\n", + "# Copies output annotations as newline delimited json to this url, if not None\n", + "outputs_destination_file = f\"gs://{bucket}/gnomad-gks-qc/outputs.ndjson\"\n", + "\n", + "\n", + "# ht_url can be a gs:// path, or a file:// local path\n", + "\n", + "# Publicly readable, but doesn't have all gnomad variants in it\n", + "ht_url = f\"gs://{bucket}/gnomad-gks-downsampled-100k.ht\"\n", + "\n", + "# Can refer to a local hail table\n", + "# ht_url = \"../downsample_to_100k_full_release.ht\"\n", + "\n", + "# ht_url = \"gs://clingen-gnomad-mirror/gnomad.genomes.v3.1.2.sites_vrs.ht\"\n", + "\n", + "# ht_url = \"gs://gcp-public-data--gnomad/release/3.1.2/ht/genomes/gnomad.genomes.v3.1.2.sites.ht\"\n", + "\n", + "print(ht_url)\n", + "ht = hl.read_table(ht_url)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f8bd96f1", + "metadata": {}, + "outputs": [], + "source": [ + "# ht.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "5b26b21a", + "metadata": {}, + "outputs": [], + "source": [ + "def loci_alleles_to_pandas(loci: List[hl.Locus], alleles: List[List[str]]) -> pd.DataFrame:\n", + " return pd.DataFrame({\n", + " \"locus\": loci,\n", + " \"alleles\": alleles\n", + " })\n", + "\n", + "def filter_table_to_loci_alleles(ht: hl.Table, loci: List[hl.Locus], alleles: List[List[str]]) -> hl.Table:\n", + " left = hl.Table.from_pandas(loci_alleles_to_pandas(loci, alleles))\n", + " left = left.key_by(left.locus, left.alleles)\n", + " return left.join(ht)\n", + "\n", + "def make_loci(inputs: List[Tuple[str,int]], reference_genome=\"GRCh38\") -> List[hl.Locus]:\n", + " return list(\n", + " map(lambda contig, pos: hl.locus(contig, pos, reference_genome=reference_genome),\n", + " map(lambda x: x[0], inputs),\n", + " map(lambda x: x[1], inputs)))\n", + "\n", + "# 
hl.Table.from_pandas(pd.DataFrame({\n", + "# \"locus\": [hl.locus(\"chr1\", 629844, reference_genome=\"GRCh38\")],\n", + "# \"alleles\": [[\"A\", \"G\"]]\n", + "# })).key_by(\"locus\", \"alleles\").join(ht).show()\n", + "\n", + "# filter_table_to_loci_alleles(\n", + "# ht, \n", + "# make_loci([[\"chr1\", 629844]]), \n", + "# [[\"A\", \"G\"]]).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "edac5b0a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/kferrite/dev/gnomad_methods/venv/lib/python3.11/site-packages/python_jsonschema_objects/__init__.py:49: UserWarning: Schema version http://json-schema.org/draft-07/schema not recognized. Some keywords and features may not be supported.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "import gnomad\n", + "import gnomad.utils.annotations\n", + "import gnomad.resources.grch38.gnomad\n", + "\n", + "# reload (re-running this cell will reload modifications to these modules on disk)\n", + "import importlib\n", + "importlib.reload(gnomad.utils.annotations)\n", + "importlib.reload(gnomad.resources.grch38.gnomad)\n", + "\n", + "from gnomad.utils.annotations import add_gks_va, add_gks_vrs\n", + "from gnomad.resources.grch38.gnomad import gnomad_gks" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "94145a04", + "metadata": {}, + "outputs": [], + "source": [ + "# GnomAD 3.1.2\n", + "# GRCh38 expressions\n", + "\n", + "inputs = [\n", + " {\"gnomad\": \"1-55051215-G-GA\"},\n", + " {\"gnomad\": \"1-629844-A-G\"},\n", + " {\"gnomad\": \"1-633440-A-ATCCC\"},\n", + " {\"gnomad\": \"1-37917308-TTATATA-T\"},\n", + " {\"gnomad\": \"1-4807634-TAAAA-T\"}\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f35b7333", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 0:> (0 + 1) / 1]\r" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + 
"\n", + "\n", + "\n", + "
locus
alleles
locus<GRCh38>array<str>
chr1:629844["A","G"]
chr1:633440["A","ATCCC"]
chr1:4807634["TAAAA","T"]
chr1:37917308["TTATATA","T"]
chr1:55051215["G","GA"]
" + ], + "text/plain": [ + "+---------------+-----------------+\n", + "| locus | alleles |\n", + "+---------------+-----------------+\n", + "| locus | array |\n", + "+---------------+-----------------+\n", + "| chr1:629844 | [\"A\",\"G\"] |\n", + "| chr1:633440 | [\"A\",\"ATCCC\"] |\n", + "| chr1:4807634 | [\"TAAAA\",\"T\"] |\n", + "| chr1:37917308 | [\"TTATATA\",\"T\"] |\n", + "| chr1:55051215 | [\"G\",\"GA\"] |\n", + "+---------------+-----------------+" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# # This cell constructs an indexed hail table from the input alleles\n", + "# # so the full gnomad dataset can be rapidly filtered by left joining it onto this.\n", + "# # Using a .filter() to check membership in the input set is slow because it needs\n", + "# # to do a table scan.\n", + "\n", + "input_gnomad_expressions = hl.literal([x[\"gnomad\"] for x in inputs])\n", + "input_terms = [x[\"gnomad\"].split(\"-\") for x in inputs]\n", + "\n", + "df = pd.DataFrame(\n", + " {\n", + " \"contig\": [str(\"chr\" + i[0]) for i in input_terms],\n", + " \"position\": [int(i[1]) for i in input_terms],\n", + " \"ref\": [i[2] for i in input_terms],\n", + " \"alt\": [i[3] for i in input_terms]\n", + " }\n", + ")\n", + "input_ht = hl.Table.from_pandas(df)\n", + "input_ht = (input_ht\n", + " .annotate(\n", + " locus=hl.locus(input_ht.contig, input_ht.position, reference_genome=\"GRCh38\"),\n", + " alleles=hl.array([input_ht.ref, input_ht.alt]))\n", + " .drop(\"contig\", \"position\", \"ref\", \"alt\")\n", + " .key_by(\"locus\", \"alleles\"))\n", + "\n", + "input_ht.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "79f65c41", + "metadata": {}, + "outputs": [], + "source": [ + "ht_filtered = ht\n", + "\n", + "# NOTE: To keep table filtered to only the set of input variants, keep this line uncommented\n", + "ht_filtered = input_ht.join(ht)\n", + "\n", + "\n", + "ht_filtered = ht_filtered.annotate(\n", + " 
genomic_coordinates = hl.format(\"%s-%s-%s-%s\",\n", + " ht_filtered.locus.contig[3:], # Remove 'chr'\n", + " hl.str(ht_filtered.locus.position),\n", + " ht_filtered.alleles[0],\n", + " ht_filtered.alleles[1]\n", + " )\n", + ")\n", + "\n", + "# # Write the filtered gnomad table to storage\n", + "# if inputs_ht_destination_url:\n", + "# ht_filtered.write(inputs_ht_destination_url, overwrite=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "05a6b9b2", + "metadata": {}, + "outputs": [], + "source": [ + "# Parameters for gnomad_gks/get_gks\n", + "ancestry_group_short_names = gnomad.resources.grch38.gnomad.POPS[\"v3\"]\n", + "ancestry_groups_full_name_map = gnomad.sample_qc.ancestry.POP_NAMES\n", + "gnomad_version_label = \"3.1.4\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2d6092b2", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-10-05 14:00:20.150 Hail: INFO: Coerced sorted dataset (11 + 5) / 16]\n", + "2023-10-05 14:00:20.153 Hail: INFO: Coerced dataset with out-of-order partitions.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "gnomAD allele strings: ['1-629844-A-G', '1-633440-A-ATCCC', '1-4807634-TAAAA-T', '1-37917308-TTATATA-T']\n", + "calling add_gks_vrs on: 1-629844-A-G\n", + "calling add_gks_vrs on: 1-633440-A-ATCCC\n", + "calling add_gks_vrs on: 1-4807634-TAAAA-T\n", + "calling add_gks_vrs on: 1-37917308-TTATATA-T\n" + ] + } + ], + "source": [ + "vrs_variants = []\n", + "records = ht_filtered.select(\n", + " ht_filtered.freq,\n", + " ht_filtered.popmax,\n", + " ht_filtered.info,\n", + " ht_filtered.genomic_coordinates,\n", + " vrs=ht_filtered.info.vrs\n", + ")\n", + "records = records.take(5)\n", + "\n", + "variant_strs = [r.genomic_coordinates for r in records]\n", + "loci = [\n", + " hl.locus(contig=str(\"chr\" + v0), pos=int(v1), reference_genome=\"GRCh38\")\n", + " for [v0, v1, *_] in \n", + " [v.split(\"-\") for v 
in variant_strs]\n", + "]\n", + "\n", + "print(f\"gnomAD allele strings: {variant_strs}\")\n", + "for record in records:\n", + " print(\"calling add_gks_vrs on: \" + record.genomic_coordinates)\n", + " vrs_variant = add_gks_vrs(\n", + " input_vrs=record.info.vrs,\n", + " input_locus=record.locus\n", + " )\n", + " # print(json.dumps(vrs_variant, indent=2))\n", + " vrs_variants.append(vrs_variant)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4f66c7db", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-10-05 14:00:28.364 Hail: INFO: Coerced sorted dataset\n", + "2023-10-05 14:00:28.366 Hail: INFO: Coerced dataset with out-of-order partitions.\n", + "[Stage 6:===============================================> (4 + 1) / 5]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Annotated 4 records in 12.39543604850769 seconds (3.098859 sec/rec)\n" + ] + } + ], + "source": [ + "# This cell builds single-locus intervals for the first two input loci, then runs VRS and VA annotation (gnomad_gks) on the filtered table over the full chr1 interval and prints the record count and timing.\n", + "\n", + "# For performance on larger datasets, the interval should be much larger, at least a few thousand.\n", + "\n", + "ivl_0 =hl.locus_interval(loci[0].contig, loci[0].position, loci[0].position+1, reference_genome=\"GRCh38\")\n", + "ht_locus_0 = (hl.filter_intervals(ht_filtered, [ivl_0]))\n", + "\n", + "ivl_1 =hl.locus_interval(loci[1].contig, loci[1].position, loci[1].position+1, reference_genome=\"GRCh38\")\n", + "ht_locus_1 = (hl.filter_intervals(ht_filtered, [ivl_1]))\n", + "\n", + "ivl_full_chr1 = hl.locus_interval(\"chr1\", 1, 248956422, reference_genome=\"GRCh38\")\n", + "\n", + "import time\n", + "t0 = time.time()\n", + "gks_annotations = gnomad_gks(\n", + " locus_interval=ivl_full_chr1,\n", + " custom_ht=ht_filtered,\n", + " version=\"3.1.4\",\n", + " data_type=\"genomes\",\n", + " by_ancestry_group=True,\n", + " by_sex=True,\n", + " vrs_only=False,\n", + " 
skip_checkpoint=True,\n", + " skip_coverage=False\n", + ")\n", + "t1 = time.time()\n", + "l = len(gks_annotations)\n", + "td = t1 - t0\n", + "print(f\"Annotated {l} records in {td} seconds ({td/l:.6f} sec/rec)\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "fe4cb80a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4\n", + "{\n", + " \"locus\": {\n", + " \"contig\": \"chr1\",\n", + " \"position\": 4807634,\n", + " \"reference_genome\": \"GRCh38\"\n", + " },\n", + " \"alleles\": [\n", + " \"TAAAA\",\n", + " \"T\"\n", + " ],\n", + " \"gks_vrs_variant\": {\n", + " \"_id\": \"ga4gh:VA.daK3dsUv6m3WLMDpdSYcWUw5Spo-jwgB\",\n", + " \"type\": \"Allele\",\n", + " \"location\": {\n", + " \"type\": \"SequenceLocation\",\n", + " \"sequence_id\": \"ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO\",\n", + " \"interval\": {\n", + " \"start\": {\n", + " \"type\": \"Number\",\n", + " \"value\": 4807634\n", + " },\n", + " \"end\": {\n", + " \"type\": \"Number\",\n", + " \"value\": 4807659\n", + " },\n", + " \"type\": \"SequenceInterval\"\n", + " },\n", + " \"_id\": \"ga4gh:VSL.hcL2_ScoNOE8rHdZdHRZjRS22464N1eV\"\n", + " },\n", + " \"state\": {\n", + " \"type\": \"LiteralSequenceExpression\",\n", + " \"sequence\": \"AAAAAAAAAAAAAAAAAAAAA\"\n", + " }\n", + " },\n", + " \"gks_va_freq\": {\n", + " \"id\": \"gnomAD-3.1.4-chr1-4807634-TAAAA-T\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"Overall Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"derivedFrom\": {\n", + " \"id\": \"gnomAD3.1.4\",\n", + " \"type\": \"DataSet\",\n", + " \"label\": \"gnomAD v3.1.4\",\n", + " \"version\": \"3.1.4\"\n", + " },\n", + " \"focusAllele\": {\n", + " \"_id\": \"ga4gh:VA.daK3dsUv6m3WLMDpdSYcWUw5Spo-jwgB\",\n", + " \"type\": \"Allele\",\n", + " \"location\": {\n", + " \"type\": \"SequenceLocation\",\n", + " \"sequence_id\": \"ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO\",\n", + " \"interval\": {\n", + 
" \"start\": {\n", + " \"type\": \"Number\",\n", + " \"value\": 4807634\n", + " },\n", + " \"end\": {\n", + " \"type\": \"Number\",\n", + " \"value\": 4807659\n", + " },\n", + " \"type\": \"SequenceInterval\"\n", + " },\n", + " \"_id\": \"ga4gh:VSL.hcL2_ScoNOE8rHdZdHRZjRS22464N1eV\"\n", + " },\n", + " \"state\": {\n", + " \"type\": \"LiteralSequenceExpression\",\n", + " \"sequence\": \"AAAAAAAAAAAAAAAAAAAAA\"\n", + " }\n", + " },\n", + " \"focusAlleleCount\": 51467,\n", + " \"locusAlleleCount\": 90458,\n", + " \"alleleFrequency\": 0.5689601804152203,\n", + " \"cohort\": {\n", + " \"id\": \"ALL\"\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 13498,\n", + " \"popMaxFAF95\": {\n", + " \"frequency\": 0.6498521500000005,\n", + " \"confidenceInterval\": 0.95,\n", + " \"popFreqId\": \"chr1-4807634-TAAAA-T.NFE\"\n", + " },\n", + " \"qcFilters\": [\n", + " \"AS_VQSR\"\n", + " ],\n", + " \"lowComplexityRegion\": true,\n", + " \"heterozygousAlleleBalanceFlagged\": 65,\n", + " \"meanDepth\": 12.622911494797913\n", + " },\n", + " \"subcohortFrequency\": [\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.AFR\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"African/African-American Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 9212,\n", + " \"locusAlleleCount\": 22798,\n", + " \"alleleFrequency\": 0.40407053250285113,\n", + " \"cohort\": {\n", + " \"id\": \"AFR\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"African/African-American\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 1440\n", + " },\n", + " \"subcohortFrequency\": [\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.AFR.XX\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"African/African-American Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": 
\"#/focusAllele\",\n", + " \"focusAlleleCount\": 4972,\n", + " \"locusAlleleCount\": 12210,\n", + " \"alleleFrequency\": 0.4072072072072072,\n", + " \"cohort\": {\n", + " \"id\": \"AFR.XX\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"African/African-American\"\n", + " },\n", + " {\n", + " \"name\": \"biological sex\",\n", + " \"value\": \"XX\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 781\n", + " }\n", + " },\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.AFR.XY\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"African/African-American Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 4240,\n", + " \"locusAlleleCount\": 10588,\n", + " \"alleleFrequency\": 0.4004533434076313,\n", + " \"cohort\": {\n", + " \"id\": \"AFR.XY\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"African/African-American\"\n", + " },\n", + " {\n", + " \"name\": \"biological sex\",\n", + " \"value\": \"XY\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 659\n", + " }\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.AMI\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"Amish Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 414,\n", + " \"locusAlleleCount\": 708,\n", + " \"alleleFrequency\": 0.5847457627118644,\n", + " \"cohort\": {\n", + " \"id\": \"AMI\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"Amish\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 103\n", + " },\n", + " \"subcohortFrequency\": [\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.AMI.XX\",\n", + " \"type\": 
\"CohortAlleleFrequency\",\n", + " \"label\": \"Amish Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 218,\n", + " \"locusAlleleCount\": 362,\n", + " \"alleleFrequency\": 0.6022099447513812,\n", + " \"cohort\": {\n", + " \"id\": \"AMI.XX\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"Amish\"\n", + " },\n", + " {\n", + " \"name\": \"biological sex\",\n", + " \"value\": \"XX\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 56\n", + " }\n", + " },\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.AMI.XY\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"Amish Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 196,\n", + " \"locusAlleleCount\": 346,\n", + " \"alleleFrequency\": 0.5664739884393064,\n", + " \"cohort\": {\n", + " \"id\": \"AMI.XY\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"Amish\"\n", + " },\n", + " {\n", + " \"name\": \"biological sex\",\n", + " \"value\": \"XY\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 47\n", + " }\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.AMR\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"Latino Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 5044,\n", + " \"locusAlleleCount\": 8076,\n", + " \"alleleFrequency\": 0.6245666171371966,\n", + " \"cohort\": {\n", + " \"id\": \"AMR\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"Latino\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 1426\n", + " },\n", + " \"subcohortFrequency\": [\n", + " 
{\n", + " \"id\": \"chr1-4807634-TAAAA-T.AMR.XX\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"Latino Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 2274,\n", + " \"locusAlleleCount\": 3722,\n", + " \"alleleFrequency\": 0.6109618484685653,\n", + " \"cohort\": {\n", + " \"id\": \"AMR.XX\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"Latino\"\n", + " },\n", + " {\n", + " \"name\": \"biological sex\",\n", + " \"value\": \"XX\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 618\n", + " }\n", + " },\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.AMR.XY\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"Latino Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 2770,\n", + " \"locusAlleleCount\": 4354,\n", + " \"alleleFrequency\": 0.6361966008268259,\n", + " \"cohort\": {\n", + " \"id\": \"AMR.XY\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"Latino\"\n", + " },\n", + " {\n", + " \"name\": \"biological sex\",\n", + " \"value\": \"XY\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 808\n", + " }\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.ASJ\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"Ashkenazi Jewish Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 1602,\n", + " \"locusAlleleCount\": 2628,\n", + " \"alleleFrequency\": 0.6095890410958904,\n", + " \"cohort\": {\n", + " \"id\": \"ASJ\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"Ashkenazi Jewish\"\n", + " }\n", + " ]\n", + " },\n", + " 
\"ancillaryResults\": {\n", + " \"homozygotes\": 425\n", + " },\n", + " \"subcohortFrequency\": [\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.ASJ.XX\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"Ashkenazi Jewish Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 865,\n", + " \"locusAlleleCount\": 1402,\n", + " \"alleleFrequency\": 0.6169757489300999,\n", + " \"cohort\": {\n", + " \"id\": \"ASJ.XX\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"Ashkenazi Jewish\"\n", + " },\n", + " {\n", + " \"name\": \"biological sex\",\n", + " \"value\": \"XX\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 237\n", + " }\n", + " },\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.ASJ.XY\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"Ashkenazi Jewish Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 737,\n", + " \"locusAlleleCount\": 1226,\n", + " \"alleleFrequency\": 0.6011419249592169,\n", + " \"cohort\": {\n", + " \"id\": \"ASJ.XY\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"Ashkenazi Jewish\"\n", + " },\n", + " {\n", + " \"name\": \"biological sex\",\n", + " \"value\": \"XY\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 188\n", + " }\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.EAS\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"East Asian Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 391,\n", + " \"locusAlleleCount\": 2488,\n", + " \"alleleFrequency\": 0.15715434083601287,\n", + " \"cohort\": {\n", + " \"id\": \"EAS\",\n", + " \"characteristics\": 
[\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"East Asian\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 22\n", + " },\n", + " \"subcohortFrequency\": [\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.EAS.XX\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"East Asian Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 208,\n", + " \"locusAlleleCount\": 1214,\n", + " \"alleleFrequency\": 0.171334431630972,\n", + " \"cohort\": {\n", + " \"id\": \"EAS.XX\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"East Asian\"\n", + " },\n", + " {\n", + " \"name\": \"biological sex\",\n", + " \"value\": \"XX\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 10\n", + " }\n", + " },\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.EAS.XY\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"East Asian Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 183,\n", + " \"locusAlleleCount\": 1274,\n", + " \"alleleFrequency\": 0.14364207221350078,\n", + " \"cohort\": {\n", + " \"id\": \"EAS.XY\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"East Asian\"\n", + " },\n", + " {\n", + " \"name\": \"biological sex\",\n", + " \"value\": \"XY\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 12\n", + " }\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.FIN\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"Finnish Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 1460,\n", + " \"locusAlleleCount\": 2186,\n", + " \"alleleFrequency\": 
0.6678865507776761,\n", + " \"cohort\": {\n", + " \"id\": \"FIN\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"Finnish\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 438\n", + " },\n", + " \"subcohortFrequency\": [\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.FIN.XX\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"Finnish Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 392,\n", + " \"locusAlleleCount\": 574,\n", + " \"alleleFrequency\": 0.6829268292682927,\n", + " \"cohort\": {\n", + " \"id\": \"FIN.XX\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"Finnish\"\n", + " },\n", + " {\n", + " \"name\": \"biological sex\",\n", + " \"value\": \"XX\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 127\n", + " }\n", + " },\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.FIN.XY\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"Finnish Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 1068,\n", + " \"locusAlleleCount\": 1612,\n", + " \"alleleFrequency\": 0.6625310173697271,\n", + " \"cohort\": {\n", + " \"id\": \"FIN.XY\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"Finnish\"\n", + " },\n", + " {\n", + " \"name\": \"biological sex\",\n", + " \"value\": \"XY\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 311\n", + " }\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.NFE\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"Non-Finnish European Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + 
" \"focusAlleleCount\": 31415,\n", + " \"locusAlleleCount\": 47894,\n", + " \"alleleFrequency\": 0.6559276736125611,\n", + " \"cohort\": {\n", + " \"id\": \"NFE\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"Non-Finnish European\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 9190\n", + " },\n", + " \"subcohortFrequency\": [\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.NFE.XX\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"Non-Finnish European Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 18763,\n", + " \"locusAlleleCount\": 28542,\n", + " \"alleleFrequency\": 0.6573821035666737,\n", + " \"cohort\": {\n", + " \"id\": \"NFE.XX\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"Non-Finnish European\"\n", + " },\n", + " {\n", + " \"name\": \"biological sex\",\n", + " \"value\": \"XX\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 5525\n", + " }\n", + " },\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.NFE.XY\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"Non-Finnish European Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 12652,\n", + " \"locusAlleleCount\": 19352,\n", + " \"alleleFrequency\": 0.6537825547747003,\n", + " \"cohort\": {\n", + " \"id\": \"NFE.XY\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"Non-Finnish European\"\n", + " },\n", + " {\n", + " \"name\": \"biological sex\",\n", + " \"value\": \"XY\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 3665\n", + " }\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.OTH\",\n", + " \"type\": 
\"CohortAlleleFrequency\",\n", + " \"label\": \"Other Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 726,\n", + " \"locusAlleleCount\": 1216,\n", + " \"alleleFrequency\": 0.5970394736842105,\n", + " \"cohort\": {\n", + " \"id\": \"OTH\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"Other\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 197\n", + " },\n", + " \"subcohortFrequency\": [\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.OTH.XX\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"Other Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 342,\n", + " \"locusAlleleCount\": 594,\n", + " \"alleleFrequency\": 0.5757575757575758,\n", + " \"cohort\": {\n", + " \"id\": \"OTH.XX\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"Other\"\n", + " },\n", + " {\n", + " \"name\": \"biological sex\",\n", + " \"value\": \"XX\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 87\n", + " }\n", + " },\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.OTH.XY\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"Other Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 384,\n", + " \"locusAlleleCount\": 622,\n", + " \"alleleFrequency\": 0.617363344051447,\n", + " \"cohort\": {\n", + " \"id\": \"OTH.XY\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"Other\"\n", + " },\n", + " {\n", + " \"name\": \"biological sex\",\n", + " \"value\": \"XY\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 110\n", + " }\n", + " }\n", + " ]\n", + " },\n", + " 
{\n", + " \"id\": \"chr1-4807634-TAAAA-T.SAS\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"South Asian Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 1114,\n", + " \"locusAlleleCount\": 2326,\n", + " \"alleleFrequency\": 0.47893379191745483,\n", + " \"cohort\": {\n", + " \"id\": \"SAS\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"South Asian\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 232\n", + " },\n", + " \"subcohortFrequency\": [\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.SAS.XX\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"South Asian Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 287,\n", + " \"locusAlleleCount\": 556,\n", + " \"alleleFrequency\": 0.5161870503597122,\n", + " \"cohort\": {\n", + " \"id\": \"SAS.XX\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"South Asian\"\n", + " },\n", + " {\n", + " \"name\": \"biological sex\",\n", + " \"value\": \"XX\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 65\n", + " }\n", + " },\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.SAS.XY\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"South Asian Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 827,\n", + " \"locusAlleleCount\": 1770,\n", + " \"alleleFrequency\": 0.4672316384180791,\n", + " \"cohort\": {\n", + " \"id\": \"SAS.XY\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"South Asian\"\n", + " },\n", + " {\n", + " \"name\": \"biological sex\",\n", + " \"value\": \"XY\"\n", + " }\n", + " ]\n", + " },\n", + 
" \"ancillaryResults\": {\n", + " \"homozygotes\": 167\n", + " }\n", + " }\n", + " ]\n", + " },\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.MID\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"Middle Eastern Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 89,\n", + " \"locusAlleleCount\": 138,\n", + " \"alleleFrequency\": 0.644927536231884,\n", + " \"cohort\": {\n", + " \"id\": \"MID\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"Middle Eastern\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 25\n", + " },\n", + " \"subcohortFrequency\": [\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.MID.XX\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"Middle Eastern Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 45,\n", + " \"locusAlleleCount\": 72,\n", + " \"alleleFrequency\": 0.625,\n", + " \"cohort\": {\n", + " \"id\": \"MID.XX\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"Middle Eastern\"\n", + " },\n", + " {\n", + " \"name\": \"biological sex\",\n", + " \"value\": \"XX\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 13\n", + " }\n", + " },\n", + " {\n", + " \"id\": \"chr1-4807634-TAAAA-T.MID.XY\",\n", + " \"type\": \"CohortAlleleFrequency\",\n", + " \"label\": \"Middle Eastern Cohort Allele Frequency for chr1-4807634-TAAAA-T\",\n", + " \"focusAllele\": \"#/focusAllele\",\n", + " \"focusAlleleCount\": 44,\n", + " \"locusAlleleCount\": 66,\n", + " \"alleleFrequency\": 0.6666666666666666,\n", + " \"cohort\": {\n", + " \"id\": \"MID.XY\",\n", + " \"characteristics\": [\n", + " {\n", + " \"name\": \"genetic ancestry\",\n", + " \"value\": \"Middle Eastern\"\n", + " },\n", + " 
{\n", + " \"name\": \"biological sex\",\n", + " \"value\": \"XY\"\n", + " }\n", + " ]\n", + " },\n", + " \"ancillaryResults\": {\n", + " \"homozygotes\": 12\n", + " }\n", + " }\n", + " ]\n", + " }\n", + " ]\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "print(len(gks_annotations))\n", + "print(json.dumps(gks_annotations[2], indent=2))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "6cc1fe66", + "metadata": {}, + "outputs": [], + "source": [ + "# # This cell does a larger annotation on a chromosome interval (e.g. first 20k bases of APC gene)\n", + "\n", + "# # APC: Chromosome: 5; NC_000005.10 (112707498..112846239)\n", + "# apc_start = 112707498\n", + "# apc_end = 112846239\n", + "# larger_interval = hl.locus_interval(\n", + "# contig=\"chr5\",\n", + "# start=apc_start,\n", + "# end=apc_start + 20000,\n", + "# reference_genome=\"GRCh38\")\n", + "\n", + "# import time\n", + "# t0 = time.time()\n", + "# gks_annotations = gnomad_gks(\n", + "# locus_interval=larger_interval,\n", + "# custom_ht=ht_filtered,\n", + "# version=\"3.1.4\",\n", + "# data_type=\"genomes\",\n", + "# by_ancestry_group=True,\n", + "# by_sex=True,\n", + "# vrs_only=False,\n", + "# skip_checkpoint=True,\n", + "# skip_coverage=False\n", + "# )\n", + "# t1 = time.time()\n", + "# l = len(gks_annotations)\n", + "# td = t1 - t0\n", + "# print(f\"Annotated {l} records in {td} seconds ({td/l:.6f} sec/rec)\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "c38f2cb2", + "metadata": {}, + "outputs": [], + "source": [ + "# print(json.dumps(gks_annotations[2], indent=2))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "43cf10a1", + "metadata": {}, + "outputs": [], + "source": [ + "# # Load the jsonschema that can be used for validating objects\n", + "# import subprocess\n", + "# import shutil\n", + "# import json\n", + "# va_spec_clone = \"gnomad-gks-v1_va-spec\"\n", + "# va_spec_branch = \"gk-pilot\"\n", + "# shutil.rmtree(va_spec_clone, 
ignore_errors=True)\n", + "# p = subprocess.run([\"git\", \"clone\", \"/~https://github.com/ga4gh/va-spec\", \"gnomad-gks-v1_va-spec\"],\n", + "# check=True)\n", + "# p = subprocess.run([\"bash\", \"-c\",\n", + "# f\"cd {va_spec_clone} && git checkout {va_spec_branch}\"],\n", + "# check=True)\n", + "# with open(f\"{va_spec_clone}/schema/cohortAlleleFreq.json\") as f:\n", + "# schema = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0cf009ca", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import requests\n", + "import yaml\n", + "import jsonschema\n", + "\n", + "def get_json_http(url):\n", + " r = requests.get(url)\n", + " if r.status_code != 200:\n", + " raise RuntimeError(f\"Request failed:\\n{r.status_code} {r.content}\")\n", + " return json.loads(r.content.decode(\"utf-8\"))\n", + "\n", + "schema = get_json_http(\"https://raw.githubusercontent.com/ga4gh/va-spec/gk-pilot/schema/cohortAlleleFreq.json\")\n", + "\n", + "# Local schema\n", + "# with open(os.path.expanduser(\"~/dev/va-spec/schema/cohortAlleleFreq.yaml\")) as f:\n", + "# schema = yaml.safe_load(f)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "13d53b65", + "metadata": {}, + "outputs": [], + "source": [ + "instance = gks_annotations[2][\"gks_va_freq\"]\n", + "\n", + "jsonschema.validate(\n", + " instance=instance, \n", + " schema=schema)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d3c052d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff 
--git a/schema/cohortAlleleFreq.yaml b/schema/cohortAlleleFreq.yaml index afadcd17..785d04c2 100644 --- a/schema/cohortAlleleFreq.yaml +++ b/schema/cohortAlleleFreq.yaml @@ -256,10 +256,18 @@ properties: type: integer meanDepth: type: number - additonalProperties: false + qcFilters: + type: array + items: + type: string + lowComplexityRegion: + type: boolean + heterozygousAlleleBalanceFlagged: + type: integer + additionalProperties: false subcohortFrequency: type: array items: $ref: "#" required: [id, type, focusAllele, focusAlleleCount, locusAlleleCount, alleleFrequency, cohort] -additionalProperties: false \ No newline at end of file +additionalProperties: false