From 79e2b1cc91a3d3dc78ad81c9ddbc224574ff9e5f Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Tue, 3 Oct 2023 23:31:49 -0400 Subject: [PATCH 01/35] Add qcFilters to gks annotation --- gnomad/resources/grch38/gnomad.py | 2 +- gnomad/utils/annotations.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index d7b4535c2..e598c7f08 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -585,7 +585,7 @@ def gnomad_gks( ) ) - keep_fields = [ht.freq, ht.info.vrs, ht.faf95] + keep_fields = [ht.freq, ht.info.vrs, ht.faf95, ht.filters] if not skip_coverage: keep_fields.append(ht.mean_depth) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index f64b45d4e..084c3ef4d 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2639,6 +2639,7 @@ def _create_group_dicts( "frequency": input_struct.faf95.popmax, "confidenceInterval": 0.95, }, + "qcFilters": list(input_struct.filters), } if input_struct.faf95.popmax_population is not None: From aecfa273c0735889b5df49f3ac56b74f58e10a6c Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Tue, 3 Oct 2023 23:53:18 -0400 Subject: [PATCH 02/35] Add lowComplexityRegion to gks annotations --- gnomad/resources/grch38/gnomad.py | 2 +- gnomad/utils/annotations.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index e598c7f08..4406d01b6 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -585,7 +585,7 @@ def gnomad_gks( ) ) - keep_fields = [ht.freq, ht.info.vrs, ht.faf95, ht.filters] + keep_fields = [ht.freq, ht.info.vrs, ht.faf95, ht.filters, ht.region_flag] if not skip_coverage: keep_fields.append(ht.mean_depth) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 084c3ef4d..cca08ffd2 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2640,6 +2640,7 @@ def _create_group_dicts( "confidenceInterval": 0.95, }, "qcFilters": list(input_struct.filters), + "lowComplexityRegion": input_struct.region_flag.lcr, } if input_struct.faf95.popmax_population is not None: From aabb709c7d85ffd740abc4642e77c0a2285ad6d1 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Wed, 4 Oct 2023 17:22:05 -0400 Subject: [PATCH 03/35] Add heterozygous allele balance flag count to gks annotations --- gnomad/resources/grch38/gnomad.py | 23 ++++++++++++++++++++++- gnomad/utils/annotations.py | 12 ++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index 4406d01b6..9674a6c8f 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -585,7 +585,28 @@ def gnomad_gks( ) ) - keep_fields = [ht.freq, ht.info.vrs, ht.faf95, ht.filters, ht.region_flag] + # Add >0.9 allele balance for heterozygotes count flag + # The bins are each 5pct so the last 2 are .9-.95, 0.95-1 + ht = ht.annotate(ab_hist_alt_bin_freq=ht.qual_hists.ab_hist_alt.bin_freq) + # ht = ht.annotate( + # allele_balance_qual_flag=hl.sum( + # # ht.qual_hists.ab_hist_alt.bin_freq[-2:] + # [ + # ht.qual_hists.ab_hist_alt.bin_freq[18], + # ht.qual_hists.ab_hist_alt.bin_freq[19], + # ] + # ) + # ) + + keep_fields = [ + ht.freq, + ht.info.vrs, + ht.faf95, + ht.filters, + ht.region_flag, + # ht.allele_balance_qual_flag, + ht.ab_hist_alt_bin_freq, + ] if not skip_coverage: keep_fields.append(ht.mean_depth) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index cca08ffd2..14d11eac0 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2632,6 +2632,12 @@ def _create_group_dicts( "cohort": {"id": "ALL"}, } + if len(input_struct.ab_hist_alt_bin_freq) != 20: + logger.error( + "ab_hist_alt.bin_freq had %s items, expected 20", + len(input_struct.ab_hist_alt_bin_freq), + ) + # Create ancillaryResults for additional frequency and popMaxFAF95 information ancillaryResults = { "homozygotes": overall_freq["homozygote_count"], @@ -2641,6 +2647,12 @@ def _create_group_dicts( }, "qcFilters": list(input_struct.filters), "lowComplexityRegion": input_struct.region_flag.lcr, + "heterozygousAlleleBalanceFlagged": sum( + [ + input_struct.ab_hist_alt_bin_freq[18], + input_struct.ab_hist_alt_bin_freq[19], + ] + ), } if input_struct.faf95.popmax_population is not None: From f55b0e4f781ba241d5dddd81113b7e939c0cbd3d Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Wed, 4 Oct 2023 17:24:08 -0400 Subject: [PATCH 04/35] Remove prior method of summing ab_hist_alt.bin_freq in hail expression --- gnomad/resources/grch38/gnomad.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index 9674a6c8f..901d9afdc 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -588,15 +588,6 @@ def gnomad_gks( # Add >0.9 allele balance for heterozygotes count flag # The bins are each 5pct so the last 2 are .9-.95, 0.95-1 ht = ht.annotate(ab_hist_alt_bin_freq=ht.qual_hists.ab_hist_alt.bin_freq) - # ht = ht.annotate( - # allele_balance_qual_flag=hl.sum( - # # ht.qual_hists.ab_hist_alt.bin_freq[-2:] - # [ - # ht.qual_hists.ab_hist_alt.bin_freq[18], - # ht.qual_hists.ab_hist_alt.bin_freq[19], - # ] - # ) - # ) keep_fields = [ ht.freq, @@ -604,7 +595,6 @@ def gnomad_gks( ht.faf95, ht.filters, ht.region_flag, - # ht.allele_balance_qual_flag, ht.ab_hist_alt_bin_freq, ] From 222c22ab1b04013a112a0eef9d5e48b35e2fc3bd Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Thu, 5 Oct 2023 10:57:11 -0400 Subject: [PATCH 05/35] Make popFreqId casing match schema. And make single string, not tuple. --- gnomad/utils/annotations.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 14d11eac0..620055781 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2656,11 +2656,10 @@ def _create_group_dicts( } if input_struct.faf95.popmax_population is not None: - ancillaryResults["popMaxFAF95"][ - "popFreqID" - ] = f"{gnomad_id}.{input_struct.faf95.popmax_population.upper()}" + popFreqId = f"{gnomad_id}.{input_struct.faf95.popmax_population.upper()}" + ancillaryResults["popMaxFAF95"]["popFreqId"] = popFreqId else: - ancillaryResults["popMaxFAF95"]["popFreqID"] = None + ancillaryResults["popMaxFAF95"]["popFreqId"] = None # Add mean coverage depth statistics if the input was annotated # with coverage information. From c3b1744817f673d04cead5ec85ccdd0292771545 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Thu, 5 Oct 2023 11:09:20 -0400 Subject: [PATCH 06/35] Make group ids match the dot syntax of the top level popmax pop id --- gnomad/utils/annotations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 620055781..48d70c7d9 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2561,7 +2561,7 @@ def _create_group_dicts( # Dictionary to be returned containing information for a specified group. freq_record = { - "id": f"{gnomad_id},{group_id.upper()}", + "id": f"{gnomad_id}.{group_id.upper()}", "type": "CohortAlleleFrequency", "label": f"{group_label} Cohort Allele Frequency for {gnomad_id}", "focusAllele": "#/focusAllele", From d94f3a7a2f492101a2b7788e7d1dabbf0297de28 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Thu, 5 Oct 2023 11:55:02 -0400 Subject: [PATCH 07/35] Better validation for allele balance flag in gks annotation --- gnomad/resources/grch38/gnomad.py | 6 +----- gnomad/utils/annotations.py | 30 ++++++++++++++++++++---------- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index 901d9afdc..f6ad8c625 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -585,17 +585,13 @@ def gnomad_gks( ) ) - # Add >0.9 allele balance for heterozygotes count flag - # The bins are each 5pct so the last 2 are .9-.95, 0.95-1 - ht = ht.annotate(ab_hist_alt_bin_freq=ht.qual_hists.ab_hist_alt.bin_freq) - keep_fields = [ ht.freq, ht.info.vrs, ht.faf95, ht.filters, ht.region_flag, - ht.ab_hist_alt_bin_freq, + ht.qual_hists.ab_hist_alt, ] if not skip_coverage: diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 48d70c7d9..2368f7bd0 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2632,11 +2632,26 @@ def _create_group_dicts( "cohort": {"id": "ALL"}, } - if len(input_struct.ab_hist_alt_bin_freq) != 20: - logger.error( - "ab_hist_alt.bin_freq had %s items, expected 20", - len(input_struct.ab_hist_alt_bin_freq), + # Check allele balance for heterozygotes values. + # Flagged values are those in bins >0.9. Each is 0.05, so last 2. + if len(input_struct.ab_hist_alt.bin_freq) != 20: + raise ValueError( + f"{gnomad_id} ab_hist_alt.bin_freq had " + f"{len(input_struct.ab_hist_alt.bin_freq)} items, expected 20" + ) + # The bin_freq should be in order but we can verify the order from bin_edges + ab_bin_freq = list( + map( + lambda tpl: tpl[1], + sorted( + zip( + input_struct.ab_hist_alt.bin_edges, + input_struct.ab_hist_alt.bin_freq, + ), + key=lambda tpl: tpl[0], + ), ) + ) # Create ancillaryResults for additional frequency and popMaxFAF95 information ancillaryResults = { @@ -2647,12 +2662,7 @@ def _create_group_dicts( }, "qcFilters": list(input_struct.filters), "lowComplexityRegion": input_struct.region_flag.lcr, - "heterozygousAlleleBalanceFlagged": sum( - [ - input_struct.ab_hist_alt_bin_freq[18], - input_struct.ab_hist_alt_bin_freq[19], - ] - ), + "heterozygousAlleleBalanceFlagged": sum(ab_bin_freq[-2:]), } if input_struct.faf95.popmax_population is not None: From a07ab9986c759cd8670cca810c82e3741290c084 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Wed, 18 Oct 2023 11:30:04 -0400 Subject: [PATCH 08/35] Add gnomad-id.NO_COHORTS as popFreqId when no popmaxFAF95 --- gnomad/utils/annotations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 2368f7bd0..819557be2 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2667,9 +2667,9 @@ def _create_group_dicts( if input_struct.faf95.popmax_population is not None: popFreqId = f"{gnomad_id}.{input_struct.faf95.popmax_population.upper()}" - ancillaryResults["popMaxFAF95"]["popFreqId"] = popFreqId else: - ancillaryResults["popMaxFAF95"]["popFreqId"] = None + popFreqId = f"{gnomad_id}.NO_COHORTS" + ancillaryResults["popMaxFAF95"]["popFreqId"] = popFreqId # Add mean coverage depth statistics if the input was annotated # with coverage information. From e14c28a34ee185738cd3200575dfcb3495ec3d1f Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Wed, 18 Oct 2023 11:41:39 -0400 Subject: [PATCH 09/35] Comment changes from review --- gnomad/utils/annotations.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 819557be2..1a903daab 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2633,13 +2633,14 @@ def _create_group_dicts( } # Check allele balance for heterozygotes values. - # Flagged values are those in bins >0.9. Each is 0.05, so last 2. + # Flagged allele balance values are those in bins > 0.90. + # Each bin is 0.05, so flagged values are in the last 2 bins. if len(input_struct.ab_hist_alt.bin_freq) != 20: raise ValueError( f"{gnomad_id} ab_hist_alt.bin_freq had " f"{len(input_struct.ab_hist_alt.bin_freq)} items, expected 20" ) - # The bin_freq should be in order but we can verify the order from bin_edges + # The bin_freq should be in order but we can verify the order from bin_edges. ab_bin_freq = list( map( lambda tpl: tpl[1], @@ -2653,7 +2654,7 @@ def _create_group_dicts( ) ) - # Create ancillaryResults for additional frequency and popMaxFAF95 information + # Create ancillaryResults for additional frequency and popMaxFAF95 information. ancillaryResults = { "homozygotes": overall_freq["homozygote_count"], "popMaxFAF95": { From bc2cb8342b0075883c5e58f88d70d1f0675fffc3 Mon Sep 17 00:00:00 2001 From: Daniel Marten Date: Fri, 20 Oct 2023 11:17:08 -0400 Subject: [PATCH 10/35] Nullable popMaxFAF95 & pop->grp --- gnomad/utils/annotations.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 1a903daab..431318cc5 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2657,20 +2657,25 @@ def _create_group_dicts( # Create ancillaryResults for additional frequency and popMaxFAF95 information. ancillaryResults = { "homozygotes": overall_freq["homozygote_count"], - "popMaxFAF95": { - "frequency": input_struct.faf95.popmax, - "confidenceInterval": 0.95, - }, "qcFilters": list(input_struct.filters), "lowComplexityRegion": input_struct.region_flag.lcr, "heterozygousAlleleBalanceFlagged": sum(ab_bin_freq[-2:]), } if input_struct.faf95.popmax_population is not None: - popFreqId = f"{gnomad_id}.{input_struct.faf95.popmax_population.upper()}" + grpMaxFAF95 = ( + { + "frequency": input_struct.faf95.popmax, + "confidenceInterval": 0.95, + "grpFreqId": ( + f"{gnomad_id}.{input_struct.faf95.popmax_population.upper()}" + ), + }, + ) else: - popFreqId = f"{gnomad_id}.NO_COHORTS" - ancillaryResults["popMaxFAF95"]["popFreqId"] = popFreqId + grpMaxFAF95 = None + + ancillaryResults["grpMaxFAF95"] = grpMaxFAF95 # Add mean coverage depth statistics if the input was annotated # with coverage information. From d5d89c4c761039271903346b7c62b7ed6b522b2f Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Thu, 19 Oct 2023 10:52:47 -0400 Subject: [PATCH 11/35] Rename heterozygousAlleleBalanceFlagged to heterozygousSkewedAlleleCount --- gnomad/utils/annotations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 431318cc5..43aa811f2 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2659,7 +2659,7 @@ def _create_group_dicts( "homozygotes": overall_freq["homozygote_count"], "qcFilters": list(input_struct.filters), "lowComplexityRegion": input_struct.region_flag.lcr, - "heterozygousAlleleBalanceFlagged": sum(ab_bin_freq[-2:]), + "heterozygousSkewedAlleleCount": sum(ab_bin_freq[-2:]), } if input_struct.faf95.popmax_population is not None: From 0a48f11e8282aabacce029e317822078411dc1b5 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Thu, 19 Oct 2023 13:59:54 -0400 Subject: [PATCH 12/35] Add hemizygote count to overall and by_sex subcohorts --- gnomad/resources/grch38/gnomad.py | 4 +++- gnomad/utils/annotations.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index f6ad8c625..a8d8a8cfa 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -582,7 +582,8 @@ def gnomad_gks( popmax=hl.missing(hl.tfloat), popmax_population=hl.missing(hl.tstr) ), ), - ) + ), + in_autosome_or_par=ht.locus.in_autosome_or_par(), ) keep_fields = [ @@ -592,6 +593,7 @@ def gnomad_gks( ht.filters, ht.region_flag, ht.qual_hists.ab_hist_alt, + ht.in_autosome_or_par, ] if not skip_coverage: diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 43aa811f2..cb1548037 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2352,6 +2352,24 @@ def _update_struct( return ht.annotate(**updated_rows) +def freq_index_key(subset=None, pop=None, sex=None, raw=False): + """ + Return the index into freq_index_dict for a given subset, population, and sex. + If raw, return the raw index, else the adj. + + e.g. (example values only) + + freq_index_key(subset=None, pop="afr", sex="XX") + + -> freq_index_dict["afr-XX-adj"] + + -> 20 + """ + parts = [s for s in [subset, pop, sex] if s is not None] + parts.append("raw" if raw else "adj") + return "-".join(parts) + + def gks_compute_seqloc_digest( ht: hl.Table, export_tmpfile: Optional[str] = None, @@ -2572,6 +2590,11 @@ def _create_group_dicts( "ancillaryResults": {"homozygotes": group_freq["homozygote_count"]}, } + # Add hemizygote allele count if variant is non-autosomal/non-PAR. + # Only XY groups can be hemizygous. Other group AC is mixed homo/hetero. + if not input_struct.in_autosome_or_par and group_sex == "XY": + freq_record["ancillaryResults"]["hemizygotes"] = group_freq.AC + return freq_record # Create a list to then add the dictionaries for frequency reports for @@ -2662,6 +2685,11 @@ def _create_group_dicts( "heterozygousSkewedAlleleCount": sum(ab_bin_freq[-2:]), } + # Add hemizygote count if not autosomal or PAR. + if not input_struct.in_autosome_or_par: + hemizygote_count = input_struct.freq[freq_index_dict["XY-adj"]].AC + ancillaryResults["hemizygotes"] = hemizygote_count + if input_struct.faf95.popmax_population is not None: grpMaxFAF95 = ( { From f402b30cfcd3a7860e921fa75b6348807944e2d2 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Thu, 19 Oct 2023 14:54:58 -0400 Subject: [PATCH 13/35] Include hemizygote count for unsexed subcohorts --- gnomad/utils/annotations.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index cb1548037..7687ae6e2 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2592,8 +2592,16 @@ def _create_group_dicts( # Add hemizygote allele count if variant is non-autosomal/non-PAR. # Only XY groups can be hemizygous. Other group AC is mixed homo/hetero. - if not input_struct.in_autosome_or_par and group_sex == "XY": - freq_record["ancillaryResults"]["hemizygotes"] = group_freq.AC + # If not a by_sex group, include the XY hemizygote count for XY subgroup. + if not input_struct.in_autosome_or_par: + if group_sex == "XY": + freq_record["ancillaryResults"]["hemizygotes"] = group_freq.AC + elif group_sex is None: + # Group is not by_sex, but still need to report hemizygotes. + hemi_group_freq = input_struct.freq[ + freq_index_dict[freq_index_key(pop=group_id, sex="XY")] + ] + freq_record["ancillaryResults"]["hemizygotes"] = hemi_group_freq.AC return freq_record From bd173feafbc65dfb636bff9d514fa759e986ff3e Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Thu, 19 Oct 2023 17:46:06 -0400 Subject: [PATCH 14/35] Now that we need to sometimes retrieve other group freq inside _create_group_dicts, refactor to remove idx as param, construct ids and get freq idx locally --- gnomad/utils/annotations.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 7687ae6e2..ef55e3b44 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2550,7 +2550,6 @@ def add_gks_va( # Define function to return a frequency report dictionary for a given group def _create_group_dicts( - group_index: int, group_id: str, group_label: str, group_sex: str = None, @@ -2568,8 +2567,16 @@ def _create_group_dicts( :return: Dictionary containing variant frequency information, - (by genetic ancestry group and sex if desired) for specified variant. """ + if group_sex: + cohort_id = f"{group_id.upper()}.{group_sex}" + else: + cohort_id = f"{group_id.upper()}" + record_id = f"{gnomad_id}.{cohort_id}" + # Obtain frequency information for the specified variant. - group_freq = input_struct.freq[group_index] + group_freq = input_struct.freq[ + freq_index_dict[freq_index_key(pop=group_id, sex=group_sex)] + ] # Cohort characteristics. characteristics = [] @@ -2579,14 +2586,14 @@ def _create_group_dicts( # Dictionary to be returned containing information for a specified group. freq_record = { - "id": f"{gnomad_id}.{group_id.upper()}", + "id": record_id, "type": "CohortAlleleFrequency", "label": f"{group_label} Cohort Allele Frequency for {gnomad_id}", "focusAllele": "#/focusAllele", "focusAlleleCount": group_freq["AC"], "locusAlleleCount": group_freq["AN"], "alleleFrequency": group_freq["AF"], - "cohort": {"id": group_id.upper(), "characteristics": characteristics}, + "cohort": {"id": cohort_id, "characteristics": characteristics}, "ancillaryResults": {"homozygotes": group_freq["homozygote_count"]}, } @@ -2612,10 +2619,7 @@ def _create_group_dicts( # Iterate through provided groups and generate dictionaries. if ancestry_groups: for group in ancestry_groups: - key = f"{group}-adj" - index_value = freq_index_dict.get(key) group_result = _create_group_dicts( - group_index=index_value, group_id=group, group_label=ancestry_groups_dict[group], ) @@ -2624,12 +2628,8 @@ def _create_group_dicts( if by_sex: sex_list = [] for sex in ["XX", "XY"]: - sex_key = f"{group}-{sex}-adj" - sex_index_value = freq_index_dict.get(sex_key) - sex_id = f"{group}.{sex}" sex_result = _create_group_dicts( - group_index=sex_index_value, - group_id=sex_id, + group_id=group, group_label=ancestry_groups_dict[group], group_sex=sex, ) From 9c2f6ed85588afd91a6c862c598757b6a115cd78 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Fri, 20 Oct 2023 15:43:53 -0400 Subject: [PATCH 15/35] Remove freq_index_key, not worth it for only a couple values --- gnomad/utils/annotations.py | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index ef55e3b44..3d9701184 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2352,24 +2352,6 @@ def _update_struct( return ht.annotate(**updated_rows) -def freq_index_key(subset=None, pop=None, sex=None, raw=False): - """ - Return the index into freq_index_dict for a given subset, population, and sex. - If raw, return the raw index, else the adj. - - e.g. (example values only) - - freq_index_key(subset=None, pop="afr", sex="XX") - - -> freq_index_dict["afr-XX-adj"] - - -> 20 - """ - parts = [s for s in [subset, pop, sex] if s is not None] - parts.append("raw" if raw else "adj") - return "-".join(parts) - - def gks_compute_seqloc_digest( ht: hl.Table, export_tmpfile: Optional[str] = None, @@ -2569,14 +2551,14 @@ def _create_group_dicts( """ if group_sex: cohort_id = f"{group_id.upper()}.{group_sex}" + freq_index_key = f"{group_id}-{group_sex}-adj" else: cohort_id = f"{group_id.upper()}" + freq_index_key = f"{group_id}-adj" record_id = f"{gnomad_id}.{cohort_id}" # Obtain frequency information for the specified variant. - group_freq = input_struct.freq[ - freq_index_dict[freq_index_key(pop=group_id, sex=group_sex)] - ] + group_freq = input_struct.freq[freq_index_dict[freq_index_key]] # Cohort characteristics. characteristics = [] @@ -2606,7 +2588,7 @@ def _create_group_dicts( elif group_sex is None: # Group is not by_sex, but still need to report hemizygotes. hemi_group_freq = input_struct.freq[ - freq_index_dict[freq_index_key(pop=group_id, sex="XY")] + freq_index_dict[f"{group_id}-XY-adj"] ] freq_record["ancillaryResults"]["hemizygotes"] = hemi_group_freq.AC From 06c172b53fe363a9e2f7b7c48654980760f497a4 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Fri, 20 Oct 2023 15:47:49 -0400 Subject: [PATCH 16/35] Rename tpl->x --- gnomad/utils/annotations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 3d9701184..24b7d84c9 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2656,13 +2656,13 @@ def _create_group_dicts( # The bin_freq should be in order but we can verify the order from bin_edges. ab_bin_freq = list( map( - lambda tpl: tpl[1], + lambda x: x[1], sorted( zip( input_struct.ab_hist_alt.bin_edges, input_struct.ab_hist_alt.bin_freq, ), - key=lambda tpl: tpl[0], + key=lambda x: x[0], ), ) ) From 2b9d2a917eab2a54e1fc41501f2d0c49650781db Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Tue, 31 Oct 2023 19:48:43 -0400 Subject: [PATCH 17/35] Add handling for joint groupmax faf in v4 tables. Add handling for lcr and allele balance structures in v3/v4 --- gnomad/resources/grch38/gnomad.py | 120 +++++++++++++++++++++--------- gnomad/utils/annotations.py | 65 +++++++++------- 2 files changed, 123 insertions(+), 62 deletions(-) diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index a8d8a8cfa..be41b28a2 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -479,6 +479,63 @@ def release_vcf_path(data_type: str, version: str, contig: str) -> str: return f"gs://gcp-public-data--gnomad/release/{version}/vcf/{data_type}/gnomad.{data_type}.{version_prefix}{version}.sites{contig}.vcf.bgz" +def add_grpMaxFAF95_v3(ht: hl.Table) -> hl.Table: + ht = ht.annotate( + grpMaxFAF95=hl.rbind( + hl.sorted( + hl.array( + [ + hl.struct( + faf=ht.faf[ht.faf_index_dict[f"{pop}_adj"]].faf95, + population=pop, + ) + for pop in FAF_POPS + ] + ), + key=lambda f: (-f.faf, f.population), + ), + lambda fafs: hl.if_else( + hl.len(fafs) > 0, + hl.struct( + popmax=fafs[0].faf, + popmax_population=hl.if_else( + fafs[0].faf == 0, hl.missing(hl.tstr), fafs[0].population + ), + ), + hl.struct( + popmax=hl.missing(hl.tfloat), popmax_population=hl.missing(hl.tstr) + ), + ), + ), + ) + return ht + + +def add_grpMaxFAF95_v4(ht: hl.Table) -> hl.Table: + ht = ht.annotate( + grpMaxFAF95=hl.struct( + popmax=ht.fafmax.faf95_max, popmax_population=ht.fafmax.faf95_max_gen_anc + ), + jointGrpMaxFAF95=hl.struct( + popmax=ht.joint_fafmax.faf95_max, + popmax_population=ht.joint_fafmax.faf95_max_gen_anc, + ), + ) + return ht + + +def add_grpMaxFAF95(ht: hl.Table) -> hl.Table: + """ + Adds a grpMaxFAF95 struct with 'popmax' and 'popmax_population'. + + Accepts tables with popmax FAF fields in v3 and v4 formats. + """ + if "fafmax" in ht.row and "faf95_max" in ht.fafmax: + return add_grpMaxFAF95_v4(ht) + else: + return add_grpMaxFAF95_v3(ht) + + def gnomad_gks( locus_interval: hl.IntervalExpression, version: str, @@ -521,9 +578,11 @@ def gnomad_gks( # Read coverage statistics if requested if high_level_version == "v3": coverage_version = "3.0.1" + elif high_level_version == "v4": + coverage_version = "3.0.1" else: raise NotImplementedError( - "gnomad_gks() is currently only implemented for gnomAD v3." + "gnomad_gks() is currently only implemented for gnomAD v3 and v4." ) coverage_ht = None @@ -552,53 +611,41 @@ def gnomad_gks( " please also specify 'by_ancestry_group' to stratify by." ) - # Call and return add_gks_vrs and add_gks_va for chosen arguments. - # Select relevant fields, checkpoint, and filter to interval before adding # annotations - ht = ht.annotate( - faf95=hl.rbind( - hl.sorted( - hl.array( - [ - hl.struct( - faf=ht.faf[ht.faf_index_dict[f"{pop}-adj"]].faf95, - population=pop, - ) - for pop in FAF_POPS - ] - ), - key=lambda f: (-f.faf, f.population), - ), - lambda fafs: hl.if_else( - hl.len(fafs) > 0, - hl.struct( - popmax=fafs[0].faf, - popmax_population=hl.if_else( - fafs[0].faf == 0, hl.missing(hl.tstr), fafs[0].population - ), - ), - hl.struct( - popmax=hl.missing(hl.tfloat), popmax_population=hl.missing(hl.tstr) - ), - ), - ), - in_autosome_or_par=ht.locus.in_autosome_or_par(), - ) + + # Pull up LCR flag and make referrable in the same field + if high_level_version == "v3": # v3 + ht = ht.annotate(lcr=ht.region_flag.lcr) + else: # v4 + ht = ht.annotate(lcr=ht.region_flags.lcr) + + # Pull up v3 / v4 allele balance histogram arrays + if high_level_version == "v3": # v3 + ht = ht.annotate(ab_hist_alt=ht.qual_hists.ab_hist_alt) + else: # v4 + ht = ht.annotate(ab_hist_alt=ht.histograms.qual_hists.ab_hist_alt) + + ht = add_grpMaxFAF95(ht) + + ht = ht.annotate(in_autosome_or_par=ht.locus.in_autosome_or_par()) keep_fields = [ ht.freq, ht.info.vrs, - ht.faf95, + ht.grpMaxFAF95, ht.filters, - ht.region_flag, - ht.qual_hists.ab_hist_alt, + ht.lcr, + ht.ab_hist_alt, ht.in_autosome_or_par, ] if not skip_coverage: keep_fields.append(ht.mean_depth) + if "jointGrpMaxFAF95" in ht.row: + keep_fields.append(ht.jointGrpMaxFAF95) + ht = ht.select(*keep_fields) # Checkpoint narrower set of columns if not skipped. @@ -610,6 +657,9 @@ def gnomad_gks( # done in native Python variant_list = ht.collect() ht_freq_index_dict = ht.freq_index_dict.collect()[0] + # gnomad v4 renamed freq_index_dict keys to use underscores instead of dashes. + # Use underscores for v3 as well. + ht_freq_index_dict = {k.replace("-", "_"): v for k, v in ht_freq_index_dict.items()} # Assemble output dicts with VRS and optionally frequency, append to list, # then return list diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 24b7d84c9..fd1b03589 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2551,10 +2551,10 @@ def _create_group_dicts( """ if group_sex: cohort_id = f"{group_id.upper()}.{group_sex}" - freq_index_key = f"{group_id}-{group_sex}-adj" + freq_index_key = f"{group_id}_{group_sex}_adj" else: cohort_id = f"{group_id.upper()}" - freq_index_key = f"{group_id}-adj" + freq_index_key = f"{group_id}_adj" record_id = f"{gnomad_id}.{cohort_id}" # Obtain frequency information for the specified variant. @@ -2588,7 +2588,7 @@ def _create_group_dicts( elif group_sex is None: # Group is not by_sex, but still need to report hemizygotes. hemi_group_freq = input_struct.freq[ - freq_index_dict[f"{group_id}-XY-adj"] + freq_index_dict[f"{group_id}_XY_adj"] ] freq_record["ancillaryResults"]["hemizygotes"] = hemi_group_freq.AC @@ -2645,6 +2645,38 @@ def _create_group_dicts( "cohort": {"id": "ALL"}, } + # Create ancillaryResults for additional frequency and popMaxFAF95 information. + ancillaryResults = { + "homozygotes": overall_freq["homozygote_count"], + } + + # Add hemizygote count if not autosomal or PAR. + if not input_struct.in_autosome_or_par: + hemizygote_count = input_struct.freq[freq_index_dict["XY_adj"]].AC + ancillaryResults["hemizygotes"] = hemizygote_count + + if input_struct.faf95.popmax_population is not None: + grpMaxFAF95 = { + "frequency": input_struct.grpMaxFAF95.popmax, + "confidenceInterval": 0.95, + "groupId": ( + f"{gnomad_id}.{input_struct.grpMaxFAF95.popmax_population.upper()}" + ), + } + else: + grpMaxFAF95 = None + ancillaryResults["grpMaxFAF95"] = grpMaxFAF95 + + # Add joint group max FAF if it exists + if "jointGrpMaxFAF95" in input_struct: + ancillaryResults["jointGrpMaxFAF95"] = { + "frequency": input_struct.jointGrpMaxFAF95.popmax, + "confidenceInterval": 0.95, + "groupId": ( + f"{gnomad_id}.{input_struct.jointGrpMaxFAF95.popmax_population.upper()}" + ), + } + # Check allele balance for heterozygotes values. # Flagged allele balance values are those in bins > 0.90. # Each bin is 0.05, so flagged values are in the last 2 bins. @@ -2667,33 +2699,12 @@ def _create_group_dicts( ) ) - # Create ancillaryResults for additional frequency and popMaxFAF95 information. - ancillaryResults = { - "homozygotes": overall_freq["homozygote_count"], + qualityMeasures = { "qcFilters": list(input_struct.filters), - "lowComplexityRegion": input_struct.region_flag.lcr, + "lowComplexityRegion": input_struct.lcr, "heterozygousSkewedAlleleCount": sum(ab_bin_freq[-2:]), } - - # Add hemizygote count if not autosomal or PAR. - if not input_struct.in_autosome_or_par: - hemizygote_count = input_struct.freq[freq_index_dict["XY-adj"]].AC - ancillaryResults["hemizygotes"] = hemizygote_count - - if input_struct.faf95.popmax_population is not None: - grpMaxFAF95 = ( - { - "frequency": input_struct.faf95.popmax, - "confidenceInterval": 0.95, - "grpFreqId": ( - f"{gnomad_id}.{input_struct.faf95.popmax_population.upper()}" - ), - }, - ) - else: - grpMaxFAF95 = None - - ancillaryResults["grpMaxFAF95"] = grpMaxFAF95 + ancillaryResults["qualityMeasures"] = qualityMeasures # Add mean coverage depth statistics if the input was annotated # with coverage information. From 6247036b2b3603bf861cde9e2d83a17fd6bd1713 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Tue, 31 Oct 2023 20:53:57 -0400 Subject: [PATCH 18/35] Add 'remaining' to POP_NAMES and POP_COLORS --- gnomad/sample_qc/ancestry.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gnomad/sample_qc/ancestry.py b/gnomad/sample_qc/ancestry.py index 54f5280ac..8dc827107 100644 --- a/gnomad/sample_qc/ancestry.py +++ b/gnomad/sample_qc/ancestry.py @@ -51,6 +51,7 @@ "oeu": "Other European", "onf": "Other Non-Finnish European", "unk": "Unknown", + "remaining": "Remaining", } POP_COLORS = { @@ -82,6 +83,7 @@ "oeu": "#6AA5CD", "onf": "#6AA5CD", "unk": "#ABB9B9", + "remaining": "#ABB9B9", "": "#ABB9B9", } From a07b71f43bc4b118d89d0002e5983384ab6f3578 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Tue, 31 Oct 2023 21:24:30 -0400 Subject: [PATCH 19/35] Fix FAF key for v3. Fix when jointGrpMaxFAF95 is null --- gnomad/resources/grch38/gnomad.py | 2 +- gnomad/utils/annotations.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index be41b28a2..47be35a7d 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -486,7 +486,7 @@ def add_grpMaxFAF95_v3(ht: hl.Table) -> hl.Table: hl.array( [ hl.struct( - faf=ht.faf[ht.faf_index_dict[f"{pop}_adj"]].faf95, + faf=ht.faf[ht.faf_index_dict[f"{pop}-adj"]].faf95, population=pop, ) for pop in FAF_POPS diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index fd1b03589..ba0bf192a 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2655,7 +2655,7 @@ def _create_group_dicts( hemizygote_count = input_struct.freq[freq_index_dict["XY_adj"]].AC ancillaryResults["hemizygotes"] = hemizygote_count - if input_struct.faf95.popmax_population is not None: + if input_struct.grpMaxFAF95.popmax_population is not None: grpMaxFAF95 = { "frequency": input_struct.grpMaxFAF95.popmax, "confidenceInterval": 0.95, @@ -2668,7 +2668,10 @@ def _create_group_dicts( ancillaryResults["grpMaxFAF95"] = grpMaxFAF95 # Add joint group max FAF if it exists - if "jointGrpMaxFAF95" in input_struct: + if ( + "jointGrpMaxFAF95" in input_struct + and input_struct.jointGrpMaxFAF95.popmax_population is not None + ): ancillaryResults["jointGrpMaxFAF95"] = { "frequency": input_struct.jointGrpMaxFAF95.popmax, "confidenceInterval": 0.95, From 35f18ae6f418bd4799751ebaa97bdbb08fac3eee Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Tue, 31 Oct 2023 21:35:50 -0400 Subject: [PATCH 20/35] Add docstrings to add_grpMaxFAF95* functions --- gnomad/resources/grch38/gnomad.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index 47be35a7d..1b0d09840 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -480,6 +480,12 @@ def release_vcf_path(data_type: str, version: str, contig: str) -> str: def add_grpMaxFAF95_v3(ht: hl.Table) -> hl.Table: + """ + Add a grpMaxFAF95 annotation using the v3 style faf and faf_index_dict structures. + + :param ht: Input hail table. + :return: Annotated hail table. + """ ht = ht.annotate( grpMaxFAF95=hl.rbind( hl.sorted( @@ -512,6 +518,12 @@ def add_grpMaxFAF95_v3(ht: hl.Table) -> hl.Table: def add_grpMaxFAF95_v4(ht: hl.Table) -> hl.Table: + """ + Add a grpMaxFAF95 and jointGrpMaxFAF95 annotation using the v4 fafmax and joint_fafmax structures. + + :param ht: Input hail table. + :return: Annotated hail table. + """ ht = ht.annotate( grpMaxFAF95=hl.struct( popmax=ht.fafmax.faf95_max, popmax_population=ht.fafmax.faf95_max_gen_anc @@ -526,9 +538,13 @@ def add_grpMaxFAF95_v4(ht: hl.Table) -> hl.Table: def add_grpMaxFAF95(ht: hl.Table) -> hl.Table: """ - Adds a grpMaxFAF95 struct with 'popmax' and 'popmax_population'. + Add a grpMaxFAF95 struct with 'popmax' and 'popmax_population'. Accepts tables with popmax FAF fields in v3 and v4 formats. + If the table includes the v4 style joint_fafmax, also adds jointGrpMaxFAF95. + + :param ht: Input hail table. Can be v3 or v4. + :return: Annotated hail table. """ if "fafmax" in ht.row and "faf95_max" in ht.fafmax: return add_grpMaxFAF95_v4(ht) From 3c86062ba11588a2fc14a5f6722f419c0447c1d7 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Tue, 31 Oct 2023 22:01:31 -0400 Subject: [PATCH 21/35] Move qualityMeasures to top level of cohort freq statement. Omit grpMaxFAF95 if null to conform with latest schema. Move meanDepth to qualityMeasures --- gnomad/utils/annotations.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index ba0bf192a..9ae05e049 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2655,17 +2655,15 @@ def _create_group_dicts( hemizygote_count = input_struct.freq[freq_index_dict["XY_adj"]].AC ancillaryResults["hemizygotes"] = hemizygote_count + # Add group max FAF if it exists if input_struct.grpMaxFAF95.popmax_population is not None: - grpMaxFAF95 = { + ancillaryResults["grpMaxFAF95"] = { "frequency": input_struct.grpMaxFAF95.popmax, "confidenceInterval": 0.95, "groupId": ( f"{gnomad_id}.{input_struct.grpMaxFAF95.popmax_population.upper()}" ), } - else: - grpMaxFAF95 = None - ancillaryResults["grpMaxFAF95"] = grpMaxFAF95 # Add joint group max FAF if it exists if ( @@ -2680,6 +2678,8 @@ def _create_group_dicts( ), } + final_freq_dict["ancillaryResults"] = ancillaryResults + # Check allele balance for heterozygotes values. # Flagged allele balance values are those in bins > 0.90. # Each bin is 0.05, so flagged values are in the last 2 bins. @@ -2707,14 +2707,13 @@ def _create_group_dicts( "lowComplexityRegion": input_struct.lcr, "heterozygousSkewedAlleleCount": sum(ab_bin_freq[-2:]), } - ancillaryResults["qualityMeasures"] = qualityMeasures # Add mean coverage depth statistics if the input was annotated # with coverage information. if "mean_depth" in input_struct: - ancillaryResults["meanDepth"] = input_struct.mean_depth + qualityMeasures["meanDepth"] = input_struct.mean_depth - final_freq_dict["ancillaryResults"] = ancillaryResults + final_freq_dict["qualityMeasures"] = qualityMeasures # If ancestry_groups were passed, add the ancestry group dictionary to the # final frequency dictionary to be returned. From 0261dbd4681a3412e79c1056f6a72154191aa747 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Tue, 31 Oct 2023 23:54:05 -0400 Subject: [PATCH 22/35] Add monoallelic flag using AC/AN --- gnomad/utils/annotations.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 9ae05e049..3ac24cc4c 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2713,6 +2713,10 @@ def _create_group_dicts( if "mean_depth" in input_struct: qualityMeasures["meanDepth"] = input_struct.mean_depth + # Add monoallelic flag (all samples homozygous for alternate allele) + if final_freq_dict["focusAlleleCount"] == final_freq_dict["locusAlleleCount"]: + qualityMeasures["monoallelic"] = True + final_freq_dict["qualityMeasures"] = qualityMeasures # If ancestry_groups were passed, add the ancestry group dictionary to the From 9dbd4188113c115788fbe6e49d17153d2bc68992 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Thu, 2 Nov 2023 12:28:50 -0400 Subject: [PATCH 23/35] Use monoallelic flag from table, which is based on raw AF --- gnomad/resources/grch38/gnomad.py | 1 + gnomad/utils/annotations.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index 1b0d09840..66b3d74bc 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -649,6 +649,7 @@ def gnomad_gks( keep_fields = [ ht.freq, ht.info.vrs, + ht.info.monoallelic, ht.grpMaxFAF95, ht.filters, ht.lcr, diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 3ac24cc4c..3cf0d3ee0 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2714,8 +2714,7 @@ def _create_group_dicts( qualityMeasures["meanDepth"] = input_struct.mean_depth # Add monoallelic flag (all samples homozygous for alternate allele) - if final_freq_dict["focusAlleleCount"] == final_freq_dict["locusAlleleCount"]: - qualityMeasures["monoallelic"] = True + qualityMeasures["monoallelic"] = input_struct.monoallelic final_freq_dict["qualityMeasures"] = qualityMeasures From 13a0ed2c4955bf490802ae6cae5514f2dab69eb2 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Tue, 14 Nov 2023 01:09:00 -0500 Subject: [PATCH 24/35] Use version field to determine fafmax struct. Handle fafmax subsets in exomes table --- gnomad/resources/grch38/gnomad.py | 32 +++++++++++++------------------ 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index 66b3d74bc..134aef4a9 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -519,14 +519,21 @@ def add_grpMaxFAF95_v3(ht: hl.Table) -> hl.Table: def add_grpMaxFAF95_v4(ht: hl.Table) -> hl.Table: """ - Add a grpMaxFAF95 and jointGrpMaxFAF95 annotation using the v4 fafmax and joint_fafmax structures. + Add a grpMaxFAF95 struct with 'popmax' and 'popmax_population'. + + Also includes a jointGrpMaxFAF95 annotation using the v4 fafmax and joint_fafmax structures. :param ht: Input hail table. :return: Annotated hail table. """ + if "gnomad" in ht.fafmax: + fafmax_field = ht.fafmax.gnomad + else: + fafmax_field = ht.fafmax ht = ht.annotate( grpMaxFAF95=hl.struct( - popmax=ht.fafmax.faf95_max, popmax_population=ht.fafmax.faf95_max_gen_anc + popmax=fafmax_field.faf95_max, + popmax_population=fafmax_field.faf95_max_gen_anc, ), jointGrpMaxFAF95=hl.struct( popmax=ht.joint_fafmax.faf95_max, @@ -536,22 +543,6 @@ def add_grpMaxFAF95_v4(ht: hl.Table) -> hl.Table: return ht -def add_grpMaxFAF95(ht: hl.Table) -> hl.Table: - """ - Add a grpMaxFAF95 struct with 'popmax' and 'popmax_population'. - - Accepts tables with popmax FAF fields in v3 and v4 formats. - If the table includes the v4 style joint_fafmax, also adds jointGrpMaxFAF95. - - :param ht: Input hail table. Can be v3 or v4. - :return: Annotated hail table. - """ - if "fafmax" in ht.row and "faf95_max" in ht.fafmax: - return add_grpMaxFAF95_v4(ht) - else: - return add_grpMaxFAF95_v3(ht) - - def gnomad_gks( locus_interval: hl.IntervalExpression, version: str, @@ -642,7 +633,10 @@ def gnomad_gks( else: # v4 ht = ht.annotate(ab_hist_alt=ht.histograms.qual_hists.ab_hist_alt) - ht = add_grpMaxFAF95(ht) + if high_level_version == "v3": + ht = add_grpMaxFAF95_v3(ht) + else: + ht = add_grpMaxFAF95_v4(ht) ht = ht.annotate(in_autosome_or_par=ht.locus.in_autosome_or_par()) From 50797cd06189da0785284933bee67af45b2372a0 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Tue, 14 Nov 2023 01:14:26 -0500 Subject: [PATCH 25/35] Replace None alleleFrequency due to 0/0, with 0.0 --- gnomad/utils/annotations.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 3cf0d3ee0..32a44d042 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2574,7 +2574,9 @@ def _create_group_dicts( "focusAllele": "#/focusAllele", "focusAlleleCount": group_freq["AC"], "locusAlleleCount": group_freq["AN"], - "alleleFrequency": group_freq["AF"], + "alleleFrequency": ( + group_freq["AF"] if group_freq["AF"] is not None else 0.0 + ), "cohort": {"id": cohort_id, "characteristics": characteristics}, "ancillaryResults": {"homozygotes": group_freq["homozygote_count"]}, } @@ -2641,7 +2643,9 @@ def _create_group_dicts( ), # Information can be populated with the result of add_gks_vrs() "focusAlleleCount": overall_freq["AC"], "locusAlleleCount": overall_freq["AN"], - "alleleFrequency": overall_freq["AF"], + "alleleFrequency": ( + overall_freq["AF"] if overall_freq["AF"] is not None else 0.0 + ), "cohort": {"id": "ALL"}, } From 6c60dca9b3b34b83699eaf7795e62bb822540230 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Tue, 14 Nov 2023 15:06:31 -0500 Subject: [PATCH 26/35] Use v4 exomes coverage if table passed is v4. --- gnomad/resources/grch38/gnomad.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index 134aef4a9..1eb322451 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -586,7 +586,7 @@ def gnomad_gks( if high_level_version == "v3": coverage_version = "3.0.1" elif high_level_version == "v4": - coverage_version = "3.0.1" + coverage_version = "4.0" else: raise NotImplementedError( "gnomad_gks() is currently only implemented for gnomAD v3 and v4." @@ -598,9 +598,16 @@ def gnomad_gks( if custom_coverage_ht: coverage_ht = custom_coverage_ht else: - coverage_ht = hl.read_table( - coverage("genomes").versions[coverage_version].path - ) + # in v4, only exomes have coverage + # in v3, only genomes have coverage (only genomes exist) + if high_level_version == "v3": + coverage_ht = hl.read_table( + coverage("genomes").versions[coverage_version].path + ) + else: + coverage_ht = hl.read_table( + coverage("exomes").versions[coverage_version].path + ) ht = ht.annotate(mean_depth=coverage_ht[ht.locus].mean) # Retrieve ancestry groups from the imported POPS dictionary. From ca9cffb5c1e75c76c8e3756c630765610385edd3 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Wed, 10 Jan 2024 16:22:59 -0500 Subject: [PATCH 27/35] Update coverage table determination for gks annotations --- gnomad/resources/grch38/gnomad.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index 1eb322451..20f466f91 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -583,10 +583,19 @@ def gnomad_gks( high_level_version = f"v{version.split('.')[0]}" # Read coverage statistics if requested + coverage_version_3_0_1 = "3.0.1" # v3/v4 genomes coverage + coverage_version_4_0 = "4.0" # v4 exomes coverage + + # In v4, exomes have coverage in v4 coverage table, + # genomes have coverage in v3 coverage table. + # In v3, only genomes have coverage (only genomes exist). if high_level_version == "v3": - coverage_version = "3.0.1" + coverage_version = coverage_version_3_0_1 elif high_level_version == "v4": - coverage_version = "4.0" + if data_type == "exomes": + coverage_version = coverage_version_4_0 + elif data_type == "genomes": + coverage_version = coverage_version_3_0_1 else: raise NotImplementedError( "gnomad_gks() is currently only implemented for gnomAD v3 and v4." @@ -598,16 +607,9 @@ def gnomad_gks( if custom_coverage_ht: coverage_ht = custom_coverage_ht else: - # in v4, only exomes have coverage - # in v3, only genomes have coverage (only genomes exist) - if high_level_version == "v3": - coverage_ht = hl.read_table( - coverage("genomes").versions[coverage_version].path - ) - else: - coverage_ht = hl.read_table( - coverage("exomes").versions[coverage_version].path - ) + coverage_ht = hl.read_table( + coverage(data_type).versions[coverage_version].path + ) ht = ht.annotate(mean_depth=coverage_ht[ht.locus].mean) # Retrieve ancestry groups from the imported POPS dictionary. From 91b67f71364599854205961170ed0c21dea7f802 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Thu, 11 Jan 2024 12:48:46 -0500 Subject: [PATCH 28/35] Add fractionCoverage20x annotation to qualityMeasures --- gnomad/resources/grch38/gnomad.py | 2 ++ gnomad/utils/annotations.py | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index 20f466f91..a7a604c9d 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -611,6 +611,7 @@ def gnomad_gks( coverage(data_type).versions[coverage_version].path ) ht = ht.annotate(mean_depth=coverage_ht[ht.locus].mean) + ht = ht.annotate(fraction_cov_over_20=coverage_ht[ht.locus].over_20) # Retrieve ancestry groups from the imported POPS dictionary. pops_list = list(POPS[high_level_version]) if by_ancestry_group else None @@ -662,6 +663,7 @@ def gnomad_gks( if not skip_coverage: keep_fields.append(ht.mean_depth) + keep_fields.append(ht.fraction_cov_over_20) if "jointGrpMaxFAF95" in ht.row: keep_fields.append(ht.jointGrpMaxFAF95) diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 32a44d042..526144bc1 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2712,11 +2712,14 @@ def _create_group_dicts( "heterozygousSkewedAlleleCount": sum(ab_bin_freq[-2:]), } - # Add mean coverage depth statistics if the input was annotated + # Add coverage depth statistics if the input was annotated # with coverage information. if "mean_depth" in input_struct: qualityMeasures["meanDepth"] = input_struct.mean_depth + if "fraction_cov_over_20" in input_struct: + qualityMeasures["fractionCoverage20x"] = input_struct.fraction_cov_over_20 + # Add monoallelic flag (all samples homozygous for alternate allele) qualityMeasures["monoallelic"] = input_struct.monoallelic From 4350015112c70b8eb8ff1dde8fcfdcf7f0195ca2 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Thu, 11 Jan 2024 13:12:54 -0500 Subject: [PATCH 29/35] Remove duplicate 'remaining' key from ancestry.py --- gnomad/sample_qc/ancestry.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gnomad/sample_qc/ancestry.py b/gnomad/sample_qc/ancestry.py index 8dc827107..1227cefa6 100644 --- a/gnomad/sample_qc/ancestry.py +++ b/gnomad/sample_qc/ancestry.py @@ -51,7 +51,6 @@ "oeu": "Other European", "onf": "Other Non-Finnish European", "unk": "Unknown", - "remaining": "Remaining", } POP_COLORS = { From cd668741831626c8f1fbfe57494210861c2d0246 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Wed, 7 Feb 2024 13:35:56 -0500 Subject: [PATCH 30/35] Apply suggestions from code review Co-authored-by: Daniel Marten <78616802+matren395@users.noreply.github.com> Co-authored-by: klaricch --- gnomad/resources/grch38/gnomad.py | 44 +++++++++++-------------------- gnomad/utils/annotations.py | 2 +- 2 files changed, 17 insertions(+), 29 deletions(-) diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index a7a604c9d..c96f6ca81 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -583,22 +583,19 @@ def gnomad_gks( high_level_version = f"v{version.split('.')[0]}" # Read coverage statistics if requested - coverage_version_3_0_1 = "3.0.1" # v3/v4 genomes coverage + coverage_version_3_0_1 = "3.0.1" # v4 genomes coverage coverage_version_4_0 = "4.0" # v4 exomes coverage # In v4, exomes have coverage in v4 coverage table, # genomes have coverage in v3 coverage table. - # In v3, only genomes have coverage (only genomes exist). - if high_level_version == "v3": + if data_type == "genomes": coverage_version = coverage_version_3_0_1 - elif high_level_version == "v4": - if data_type == "exomes": - coverage_version = coverage_version_4_0 - elif data_type == "genomes": - coverage_version = coverage_version_3_0_1 else: + coverage_version = coverage_version_4_0 + + if high_level_version != "v4": raise NotImplementedError( - "gnomad_gks() is currently only implemented for gnomAD v3 and v4." + "gnomad_gks() is currently only implemented for gnomAD v4." ) coverage_ht = None @@ -629,24 +626,15 @@ def gnomad_gks( ) # Select relevant fields, checkpoint, and filter to interval before adding - # annotations - - # Pull up LCR flag and make referrable in the same field - if high_level_version == "v3": # v3 - ht = ht.annotate(lcr=ht.region_flag.lcr) - else: # v4 - ht = ht.annotate(lcr=ht.region_flags.lcr) - - # Pull up v3 / v4 allele balance histogram arrays - if high_level_version == "v3": # v3 - ht = ht.annotate(ab_hist_alt=ht.qual_hists.ab_hist_alt) - else: # v4 - ht = ht.annotate(ab_hist_alt=ht.histograms.qual_hists.ab_hist_alt) - - if high_level_version == "v3": - ht = add_grpMaxFAF95_v3(ht) - else: - ht = add_grpMaxFAF95_v4(ht) + # annotations. + + # Pull up LCR flag and make referrable in the same field. + ht = ht.annotate(lcr=ht.region_flags.lcr) + + # Pull up v3 / v4 allele balance histogram arrays. + ht = ht.annotate(ab_hist_alt=ht.histograms.qual_hists.ab_hist_alt) + + ht = add_grpMaxFAF95_v4(ht) ht = ht.annotate(in_autosome_or_par=ht.locus.in_autosome_or_par()) @@ -676,7 +664,7 @@ def gnomad_gks( ht = ht.checkpoint(hl.utils.new_temp_file("vrs_checkpoint", extension="ht")) # Collect all variants as structs, so all dictionary construction can be - # done in native Python + # done in native Python. variant_list = ht.collect() ht_freq_index_dict = ht.freq_index_dict.collect()[0] # gnomad v4 renamed freq_index_dict keys to use underscores instead of dashes. diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index 526144bc1..d91451200 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2669,7 +2669,7 @@ def _create_group_dicts( ), } - # Add joint group max FAF if it exists + # Add joint group max FAF if it exists. if ( "jointGrpMaxFAF95" in input_struct and input_struct.jointGrpMaxFAF95.popmax_population is not None From ed2312c58670f424de38d14473f1b9dce2e652a2 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Wed, 7 Feb 2024 13:38:26 -0500 Subject: [PATCH 31/35] Apply formatting --- gnomad/resources/grch38/gnomad.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index c96f6ca81..95324ba28 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -591,8 +591,8 @@ def gnomad_gks( if data_type == "genomes": coverage_version = coverage_version_3_0_1 else: - coverage_version = coverage_version_4_0 - + coverage_version = coverage_version_4_0 + if high_level_version != "v4": raise NotImplementedError( "gnomad_gks() is currently only implemented for gnomAD v4." From b6c5101f6e40f4185ebc63e597a2b0157019ede5 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Wed, 7 Feb 2024 14:55:45 -0500 Subject: [PATCH 32/35] Delete add_grpMaxFAF95_v3 and gks_compute_seqloc_digest. --- gnomad/resources/grch38/gnomad.py | 38 --------------- gnomad/utils/annotations.py | 81 ------------------------------- 2 files changed, 119 deletions(-) diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index 95324ba28..68bc9463b 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -479,44 +479,6 @@ def release_vcf_path(data_type: str, version: str, contig: str) -> str: return f"gs://gcp-public-data--gnomad/release/{version}/vcf/{data_type}/gnomad.{data_type}.{version_prefix}{version}.sites{contig}.vcf.bgz" -def add_grpMaxFAF95_v3(ht: hl.Table) -> hl.Table: - """ - Add a grpMaxFAF95 annotation using the v3 style faf and faf_index_dict structures. - - :param ht: Input hail table. - :return: Annotated hail table. - """ - ht = ht.annotate( - grpMaxFAF95=hl.rbind( - hl.sorted( - hl.array( - [ - hl.struct( - faf=ht.faf[ht.faf_index_dict[f"{pop}-adj"]].faf95, - population=pop, - ) - for pop in FAF_POPS - ] - ), - key=lambda f: (-f.faf, f.population), - ), - lambda fafs: hl.if_else( - hl.len(fafs) > 0, - hl.struct( - popmax=fafs[0].faf, - popmax_population=hl.if_else( - fafs[0].faf == 0, hl.missing(hl.tstr), fafs[0].population - ), - ), - hl.struct( - popmax=hl.missing(hl.tfloat), popmax_population=hl.missing(hl.tstr) - ), - ), - ), - ) - return ht - - def add_grpMaxFAF95_v4(ht: hl.Table) -> hl.Table: """ Add a grpMaxFAF95 struct with 'popmax' and 'popmax_population'. diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index d91451200..ab60cec39 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -2352,87 +2352,6 @@ def _update_struct( return ht.annotate(**updated_rows) -def gks_compute_seqloc_digest( - ht: hl.Table, - export_tmpfile: Optional[str] = None, - computed_tmpfile: Optional[str] = None, -): - """ - Compute sequence location digest-based id for a hail variant Table. - - Exports table to tsv, computes SequenceLocation digests, reimports and replaces - the vrs_json field with the result. Input table must have a .vrs field, like the - one added by add_gks_vrs, that can be used to construct ga4gh.vrs models. - - :param ht: hail table with VRS annotation - :param export_tmpfile: Optional file path to export the table to. - :param computed_tmpfile: Optional file path to write the updated rows to, - which is then imported as a hail table - :return: a hail table with the VRS annotation updated with the new SequenceLocations - """ - # NOTE: The pinned ga4gh.vrs module breaks logging when this annotations module is - # imported. Importing ga4gh here to avoid this issue. - import ga4gh.core as ga4gh_core - import ga4gh.vrs as ga4gh_vrs - - if export_tmpfile is None: - export_tmpfile = new_temp_file("gks-seqloc-pre.tsv") - if computed_tmpfile is None: - computed_tmpfile = new_temp_file("gks-seqloc-post.tsv") - - logger.info("Exporting ht to %s", export_tmpfile) - ht.select("vrs_json").export(export_tmpfile, header=True) - - logger.info( - "Computing SequenceLocation digests and writing to %s", computed_tmpfile - ) - start = timer() - counter = 0 - with open(computed_tmpfile, "w", encoding="utf-8") as f_out: - with open(export_tmpfile, "r", encoding="utf-8") as f: - reader = csv.reader(f, delimiter="\t") - header = None - for line in reader: - if header is None: - header = line - f_out.write("\t".join(header)) - f_out.write("\n") - continue - else: - locus, alleles, vrs_json = line - vrs_variant = json.loads(vrs_json) - location = vrs_variant["location"] - location.pop("_id") - location_id = ga4gh_core._internal.identifiers.ga4gh_identify( - ga4gh_vrs.models.SequenceLocation(**location) - ) - vrs_variant["location"]["_id"] = location_id - # serialize outputs to JSON and write to TSV - vrs_json = json.dumps(vrs_variant) - alleles = json.dumps(json.loads(alleles)) - f_out.write("\t".join([locus, alleles, vrs_json])) - f_out.write("\n") - counter += 1 - end = timer() - logger.info( - "Computed %s SequenceLocation digests in %s seconds", counter, (end - start) - ) - logger.info("Importing VRS records with computed SequenceLocation digests") - ht_with_location = hl.import_table( - computed_tmpfile, types={"locus": "tstr", "alleles": "tstr", "vrs_json": "tstr"} - ) - ht_with_location_parsed = ht_with_location.annotate( - locus=hl.locus( - contig=ht_with_location.locus.split(":")[0], - pos=hl.int32(ht_with_location.locus.split(":")[1]), - reference_genome="GRCh38", - ), - alleles=hl.parse_json(ht_with_location.alleles, dtype=hl.tarray(hl.tstr)), - ).key_by("locus", "alleles") - - return ht.drop("vrs_json").join(ht_with_location_parsed, how="left") - - def add_gks_vrs( input_locus: hl.locus, input_vrs: hl.struct, From 2c8c99e69ce5ad19c97de5722e6e021a8c63e64a Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Wed, 7 Feb 2024 15:23:10 -0500 Subject: [PATCH 33/35] lint --- gnomad/resources/grch38/gnomad.py | 1 - gnomad/utils/annotations.py | 4 ---- 2 files changed, 5 deletions(-) diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index 68bc9463b..838285ba4 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -14,7 +14,6 @@ ) from gnomad.sample_qc.ancestry import POP_NAMES from gnomad.utils.annotations import add_gks_va, add_gks_vrs -from gnomad.utils.vcf import FAF_POPS logging.basicConfig( format="%(asctime)s (%(name)s %(lineno)s): %(message)s", diff --git a/gnomad/utils/annotations.py b/gnomad/utils/annotations.py index ab60cec39..26f23438e 100644 --- a/gnomad/utils/annotations.py +++ b/gnomad/utils/annotations.py @@ -1,14 +1,10 @@ # noqa: D100 -import csv import itertools -import json import logging -from timeit import default_timer as timer from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union import hail as hl -from hail.utils.misc import new_temp_file import gnomad.utils.filtering as filter_utils from gnomad.utils.gen_stats import to_phred From 0ae91f264beba495cb46411e91e7321ad127f149 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Tue, 27 Feb 2024 14:38:39 -0500 Subject: [PATCH 34/35] Apply suggestions from code review Co-authored-by: klaricch --- gnomad/resources/grch38/gnomad.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index 838285ba4..96d68fd5d 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -572,7 +572,8 @@ def gnomad_gks( ht = ht.annotate(fraction_cov_over_20=coverage_ht[ht.locus].over_20) # Retrieve ancestry groups from the imported POPS dictionary. - pops_list = list(POPS[high_level_version]) if by_ancestry_group else None + pops_list = list(POPS["v4" if data_type=="exomes" else "v3"]) if by_ancestry_group else None + # Throw warnings if contradictory arguments are passed. if by_ancestry_group and vrs_only: @@ -592,7 +593,7 @@ def gnomad_gks( # Pull up LCR flag and make referrable in the same field. ht = ht.annotate(lcr=ht.region_flags.lcr) - # Pull up v3 / v4 allele balance histogram arrays. + # Pull up allele balance histogram arrays. ht = ht.annotate(ab_hist_alt=ht.histograms.qual_hists.ab_hist_alt) ht = add_grpMaxFAF95_v4(ht) From caabf24bf3fbfe99d7823b5330d2b1b123e7c223 Mon Sep 17 00:00:00 2001 From: Kyle Ferriter Date: Tue, 27 Feb 2024 15:14:11 -0500 Subject: [PATCH 35/35] Move high level version check above attempt to read table --- gnomad/resources/grch38/gnomad.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index 96d68fd5d..e417ca750 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -535,15 +535,20 @@ def gnomad_gks( :return: List of dictionaries containing VRS information (and freq info split by ancestry groups and sex if desired) for specified variant. """ - # Read public_release table if no custom table provided + # Obtain the high level version number and verify that it is 4. + high_level_version = f"v{version.split('.')[0]}" + if high_level_version != "v4": + raise NotImplementedError( + "gnomad_gks() is currently only implemented for gnomAD v4." + ) + + # Read public_release table if no custom table provided. if custom_ht: ht = custom_ht else: ht = hl.read_table(public_release(data_type).versions[version].path) - high_level_version = f"v{version.split('.')[0]}" - - # Read coverage statistics if requested + # Read coverage statistics if requested. coverage_version_3_0_1 = "3.0.1" # v4 genomes coverage coverage_version_4_0 = "4.0" # v4 exomes coverage @@ -554,11 +559,6 @@ def gnomad_gks( else: coverage_version = coverage_version_4_0 - if high_level_version != "v4": - raise NotImplementedError( - "gnomad_gks() is currently only implemented for gnomAD v4." - ) - coverage_ht = None if not skip_coverage: @@ -572,8 +572,11 @@ def gnomad_gks( ht = ht.annotate(fraction_cov_over_20=coverage_ht[ht.locus].over_20) # Retrieve ancestry groups from the imported POPS dictionary. - pops_list = list(POPS["v4" if data_type=="exomes" else "v3"]) if by_ancestry_group else None - + pops_list = ( + list(POPS["v4" if data_type == "exomes" else "v3"]) + if by_ancestry_group + else None + ) # Throw warnings if contradictory arguments are passed. if by_ancestry_group and vrs_only: