-
Notifications
You must be signed in to change notification settings - Fork 29
/
Copy pathvcf.py
1579 lines (1444 loc) · 58.4 KB
/
vcf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# noqa: D100
import copy
import itertools
import logging
from typing import Dict, List, Optional, Union
import hail as hl
from gnomad.sample_qc.ancestry import POP_NAMES
logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
SORT_ORDER = [
"subset",
"downsampling",
"popmax",
"grpmax",
"pop",
"gen_anc",
"subpop",
"sex",
"group",
]
"""
Order to sort subgroupings during VCF export.
Ensures that INFO labels in VCF are in desired order (e.g., raw_AC_afr_female).
"""
GROUPS = ["adj", "raw"]
"""
Group names used to generate labels for high quality genotypes and all raw genotypes. Used in VCF export.
"""
HISTS = ["gq_hist_alt", "gq_hist_all", "dp_hist_alt", "dp_hist_all", "ab_hist_alt"]
"""
Quality histograms used in VCF export.
"""
FAF_POPS = {
"v3": ["afr", "amr", "eas", "nfe", "sas"],
"v4": ["afr", "amr", "eas", "mid", "nfe", "sas"],
}
"""
Global populations that are included in filtering allele frequency (faf) calculations. Used in VCF export.
"""
SEXES = ["XX", "XY"]
"""
Sample sexes used in VCF export.
Used to stratify frequency annotations (AC, AN, AF) for each sex.
Note that sample sexes in gnomAD v3 and earlier were 'male' and 'female'.
"""
AS_FIELDS = [
"AS_FS",
"AS_MQ",
"AS_MQRankSum",
"AS_pab_max",
"AS_QUALapprox",
"AS_QD",
"AS_ReadPosRankSum",
"AS_SB_TABLE",
"AS_SOR",
"AS_VarDP",
"InbreedingCoeff",
]
"""
Allele-specific variant annotations.
"""
SITE_FIELDS = [
"FS",
"MQ",
"MQRankSum",
"QUALapprox",
"QD",
"ReadPosRankSum",
"SB",
"SOR",
"VarDP",
]
"""
Site level variant annotations.
"""
ALLELE_TYPE_FIELDS = [
"allele_type",
"has_star",
"n_alt_alleles",
"original_alleles",
"variant_type",
"was_mixed",
]
"""
Allele-type annotations.
"""
REGION_FLAG_FIELDS = ["decoy", "lcr", "nonpar", "non_par", "segdup"]
"""
Annotations about variant region type.
.. note::
decoy resource files do not currently exist for GRCh38/hg38.
"""
JOINT_REGION_FLAG_FIELDS = [
"fail_interval_qc",
"outside_broad_capture_region",
"outside_ukb_capture_region",
"outside_broad_calling_region",
"outside_ukb_calling_region",
"not_called_in_exomes",
"not_called_in_genomes",
]
"""
Annotations about variant region type that are specifically created for joint dataset of exomes and genomes from gnomAD v4.1.
"""
RF_FIELDS = [
"rf_positive_label",
"rf_negative_label",
"rf_label",
"rf_train",
"rf_tp_probability",
]
"""
Annotations specific to the variant QC using a random forest model.
"""
AS_VQSR_FIELDS = ["AS_culprit", "AS_VQSLOD"]
"""
Allele-specific VQSR annotations.
"""
VQSR_FIELDS = AS_VQSR_FIELDS + ["NEGATIVE_TRAIN_SITE", "POSITIVE_TRAIN_SITE"]
"""
Annotations specific to VQSR.
"""
INFO_VCF_AS_PIPE_DELIMITED_FIELDS = [
"AS_QUALapprox",
"AS_VarDP",
"AS_MQ_DP",
"AS_RAW_MQ",
"AS_SB_TABLE",
]
INFO_DICT = {
"FS": {
"Description": "Phred-scaled p-value of Fisher's exact test for strand bias"
},
"InbreedingCoeff": {
"Number": "A",
"Description": (
"Inbreeding coefficient, the excess heterozygosity at a variant site,"
" computed as 1 - (the number of heterozygous genotypes)/(the number of"
" heterozygous genotypes expected under Hardy-Weinberg equilibrium)"
),
},
"inbreeding_coeff": {
"Number": "A",
"Description": (
"Inbreeding coefficient, the excess heterozygosity at a variant site,"
" computed as 1 - (the number of heterozygous genotypes)/(the number of"
" heterozygous genotypes expected under Hardy-Weinberg equilibrium)"
),
},
"MQ": {
"Description": (
"Root mean square of the mapping quality of reads across all samples"
)
},
"MQRankSum": {
"Description": (
"Z-score from Wilcoxon rank sum test of alternate vs. reference read"
" mapping qualities"
)
},
"QD": {
"Description": (
"Variant call confidence normalized by depth of sample reads supporting a"
" variant"
)
},
"ReadPosRankSum": {
"Description": (
"Z-score from Wilcoxon rank sum test of alternate vs. reference read"
" position bias"
)
},
"SOR": {"Description": "Strand bias estimated by the symmetric odds ratio test"},
"POSITIVE_TRAIN_SITE": {
"Description": (
"Variant was used to build the positive training set of high-quality"
" variants for VQSR"
)
},
"NEGATIVE_TRAIN_SITE": {
"Description": (
"Variant was used to build the negative training set of low-quality"
" variants for VQSR"
)
},
"positive_train_site": {
"Description": (
"Variant was used to build the positive training set of high-quality"
" variants for VQSR"
)
},
"negative_train_site": {
"Description": (
"Variant was used to build the negative training set of low-quality"
" variants for VQSR"
)
},
"BaseQRankSum": {
"Description": (
"Z-score from Wilcoxon rank sum test of alternate vs. reference base"
" qualities"
),
},
"VarDP": {
"Description": (
"Depth over variant genotypes (does not include depth of reference samples)"
)
},
"VQSLOD": {
"Description": (
"Log-odds ratio of being a true variant versus being a false positive under"
" the trained VQSR Gaussian mixture model"
),
},
"culprit": {
"Description": "Worst-performing annotation in the VQSR Gaussian mixture model",
},
"decoy": {"Description": "Variant falls within a reference decoy region"},
"lcr": {"Description": "Variant falls within a low complexity region"},
"nonpar": {
"Description": (
"Variant (on sex chromosome) falls outside a pseudoautosomal region"
)
},
"non_par": {
"Description": (
"Variant (on sex chromosome) falls outside a pseudoautosomal region"
)
},
"segdup": {"Description": "Variant falls within a segmental duplication region"},
"fail_interval_qc": {
"Description": (
"Less than 85 percent of samples meet 20X coverage if variant is in"
" autosomal or PAR regions or 10X coverage for non-PAR regions of"
" chromosomes X and Y."
)
},
"outside_ukb_capture_region": {
"Description": "Variant falls outside of UK Biobank exome capture regions."
},
"outside_broad_capture_region": {
"Description": "Variant falls outside of Broad exome capture regions."
},
"rf_positive_label": {
"Description": (
"Variant was labelled as a positive example for training of random forest"
" model"
)
},
"rf_negative_label": {
"Description": (
"Variant was labelled as a negative example for training of random forest"
" model"
)
},
"rf_label": {"Description": "Random forest training label"},
"rf_train": {"Description": "Variant was used in training random forest model"},
"rf_tp_probability": {
"Description": (
"Probability of a called variant being a true variant as determined by"
" random forest model"
)
},
"transmitted_singleton": {
"Description": (
"Variant was a callset-wide doubleton that was transmitted within a family"
" from a parent to a child (i.e., a singleton amongst unrelated samples in"
" cohort)"
)
},
"sibling_singleton": {
"Description": (
"Variant was a callset-wide doubleton that was present only in two siblings"
" (i.e., a singleton amongst unrelated samples in cohort)."
)
},
"original_alleles": {"Description": "Alleles before splitting multiallelics"},
"variant_type": {
"Description": "Variant type (snv, indel, multi-snv, multi-indel, or mixed)"
},
"allele_type": {
"Description": "Allele type (snv, insertion, deletion, or mixed)",
},
"n_alt_alleles": {
"Number": "1",
"Description": "Total number of alternate alleles observed at variant locus",
},
"was_mixed": {"Description": "Variant type was mixed"},
"has_star": {
"Description": (
"Variant locus coincides with a spanning deletion (represented by a star)"
" observed elsewhere in the callset"
)
},
"AS_pab_max": {
"Number": "A",
"Description": (
"Maximum p-value over callset for binomial test of observed allele balance"
" for a heterozygous genotype, given expectation of 0.5"
),
},
"monoallelic": {
"Description": "All samples are homozygous alternate for the variant"
},
"only_het": {"Description": "All samples are heterozygous for the variant"},
"QUALapprox": {
"Number": "1",
"Description": "Sum of PL[0] values; used to approximate the QUAL score",
},
"AS_SB_TABLE": {
"Number": ".",
"Description": (
"Allele-specific forward/reverse read counts for strand bias tests"
),
},
}
"""
Dictionary used during VCF export to export row (variant) annotations.
"""
JOINT_REGION_FLAGS_INFO_DICT = {
"fail_interval_qc": {
"Description": (
"Less than 85 percent of samples meet 20X coverage if variant is in"
" autosomal or PAR regions or 10X coverage for non-PAR regions of"
" chromosomes X and Y."
)
},
"outside_ukb_capture_region": {
"Description": "Variant falls outside of the UK Biobank exome capture regions."
},
"outside_broad_capture_region": {
"Description": "Variant falls outside of the Broad exome capture regions."
},
"outside_ukb_calling_region": {
"Description": (
"Variant falls outside of the UK Biobank exome capture regions plus 150 bp"
" padding."
)
},
"outside_broad_calling_region": {
"Description": (
"Variant falls outside of the Broad exome capture regions plus 150 bp"
" padding."
)
},
"not_called_in_exomes": {
"Description": "Variant was not called in the gnomAD exomes."
},
"not_called_in_genomes": {
"Description": "Variant was not called in the gnomAD genomes."
},
}
IN_SILICO_ANNOTATIONS_INFO_DICT = {
"cadd_raw_score": {
"Number": "1",
"Description": (
"Raw CADD scores are interpretable as the extent to which the annotation"
" profile for a given variant suggests that the variant is likely to be"
" 'observed' (negative values) vs 'simulated' (positive values). Larger"
" values are more deleterious."
),
},
"cadd_phred": {
"Number": "1",
"Description": (
"Cadd Phred-like scores ('scaled C-scores') ranging from 1 to 99, based on"
" the rank of each variant relative to all possible 8.6 billion"
" substitutions in the human reference genome. Larger values are more"
" deleterious."
),
},
"revel_max": {
"Number": "1",
"Description": (
"The maximum REVEL score at a site's MANE Select or canonical"
" transcript. It's an ensemble score for predicting the pathogenicity of"
" missense variants (based on 13 other variant predictors). Scores ranges"
" from 0 to 1. Variants with higher scores are predicted to be more likely"
" to be deleterious."
),
},
"spliceai_ds_max": {
"Number": "1",
"Description": (
"Illumina's SpliceAI max delta score; interpreted as the probability of the"
" variant being splice-altering."
),
},
"pangolin_largest_ds": {
"Number": "1",
"Description": (
"Pangolin's largest delta score across 2 splicing consequences, which"
" reflects the probability of the variant being splice-altering"
),
},
"phylop": {
"Number": "1",
"Description": (
"Base-wise conservation score across the 241 placental mammals in the"
" Zoonomia project. Score ranges from -20 to 9.28, and reflects"
" acceleration (faster evolution than expected under neutral drift,"
" assigned negative scores) as well as conservation (slower than expected"
" evolution, assigned positive scores)."
),
},
"sift_max": {
"Number": "1",
"Description": (
"Score reflecting the scaled probability of the amino acid substitution"
" being tolerated, ranging from 0 to 1. Scores below 0.05 are predicted to"
" impact protein function. We prioritize max scores for MANE Select"
" transcripts where possible and otherwise report a score for the canonical"
" transcript."
),
},
"polyphen_max": {
"Number": "1",
"Description": (
"Score that predicts the possible impact of an amino acid substitution on"
" the structure and function of a human protein, ranging from 0.0"
" (tolerated) to 1.0 (deleterious). We prioritize max scores for MANE"
" Select transcripts where possible and otherwise report a score for the"
" canonical transcript."
),
},
}
"""
Dictionary with in silico score descriptions to include in the VCF INFO header.
"""
VRS_FIELDS_DICT = {
"VRS_Allele_IDs": {
"Number": "R",
"Description": (
"The computed identifiers for the GA4GH VRS Alleles corresponding to the"
" values in the REF and ALT fields"
),
},
"VRS_Starts": {
"Number": "R",
"Description": (
"Interresidue coordinates used as the location starts for the GA4GH VRS"
" Alleles corresponding to the values in the REF and ALT fields"
),
},
"VRS_Ends": {
"Number": "R",
"Description": (
"Interresidue coordinates used as the location ends for the GA4GH VRS"
" Alleles corresponding to the values in the REF and ALT fields"
),
},
"VRS_States": {
"Number": ".",
"Description": (
"The literal sequence states used for the GA4GH VRS Alleles corresponding"
" to the values in the REF and ALT fields"
),
},
}
"""
Dictionary with VRS annotations to include in the VCF INFO field and VCF header.
"""
ENTRIES = ["GT", "GQ", "DP", "AD", "MIN_DP", "PGT", "PID", "PL", "SB"]
"""
Densified entries to be selected during VCF export.
"""
SPARSE_ENTRIES = [
"END",
"DP",
"GQ",
"LA",
"LAD",
"LGT",
"LPGT",
"LPL",
"MIN_DP",
"PID",
"RGQ",
"SB",
]
"""
Sparse entries to be selected and densified during VCF export.
"""
FORMAT_DICT = {
"GT": {"Description": "Genotype", "Number": "1", "Type": "String"},
"AD": {
"Description": "Allelic depths for the ref and alt alleles in the order listed",
"Number": "R",
"Type": "Integer",
},
"DP": {
"Description": (
"Approximate read depth (reads with MQ=255 or with bad mates are filtered)"
),
"Number": "1",
"Type": "Integer",
},
"GQ": {
"Description": (
"Phred-scaled confidence that the genotype assignment is correct. Value is"
" the difference between the second lowest PL and the lowest PL (always"
" normalized to 0)."
),
"Number": "1",
"Type": "Integer",
},
"MIN_DP": {
"Description": "Minimum DP observed within the GVCF block",
"Number": "1",
"Type": "Integer",
},
"PGT": {
"Description": (
"Physical phasing haplotype information, describing how the alternate"
" alleles are phased in relation to one another"
),
"Number": "1",
"Type": "String",
},
"PID": {
"Description": (
"Physical phasing ID information, where each unique ID within a given"
" sample (but not across samples) connects records within a phasing group"
),
"Number": "1",
"Type": "String",
},
"PL": {
"Description": (
"Normalized, phred-scaled likelihoods for genotypes as defined in the VCF"
" specification"
),
"Number": "G",
"Type": "Integer",
},
"SB": {
"Description": (
"Per-sample component statistics which comprise the Fisher's exact test to"
" detect strand bias. Values are: depth of reference allele on forward"
" strand, depth of reference allele on reverse strand, depth of alternate"
" allele on forward strand, depth of alternate allele on reverse strand."
),
"Number": "4",
"Type": "Integer",
},
}
"""
Dictionary used during VCF export to export MatrixTable entries.
"""
JOINT_FILTERS_INFO_DICT = {
"exomes_filters": {"Description": "Filters' values from the exomes dataset."},
"genomes_filters": {"Description": "Filters' values from the genomes dataset."},
}
def adjust_vcf_incompatible_types(
ht: hl.Table,
pipe_delimited_annotations: List[str] = INFO_VCF_AS_PIPE_DELIMITED_FIELDS,
) -> hl.Table:
"""
Create a Table ready for vcf export.
In particular, the following conversions are done:
- All int64 are coerced to int32
- Fields specified by `pipe_delimited_annotations` are converted from arrays to pipe-delimited strings
:param ht: Input Table.
:param pipe_delimited_annotations: List of info fields (they must be fields of the ht.info Struct).
:return: Table ready for VCF export.
"""
def get_pipe_expr(array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpression:
return hl.delimit(array_expr.map(lambda x: hl.or_else(hl.str(x), "")), "|")
# Make sure the HT is keyed by locus, alleles
ht = ht.key_by("locus", "alleles")
info_type_convert_expr = {}
# Convert int64 fields to int32 (int64 isn't supported by VCF)
for f, ft in ht.info.dtype.items():
if ft == hl.dtype("int64"):
logger.warning(
"Coercing field info.%s from int64 to int32 for VCF output. Value"
" will be capped at int32 max value.",
f,
)
info_type_convert_expr.update(
{
f: hl.or_missing(
hl.is_defined(ht.info[f]),
hl.int32(hl.min(2**31 - 1, ht.info[f])),
)
}
)
elif ft == hl.dtype("array<int64>"):
logger.warning(
"Coercing field info.%s from array<int64> to array<int32> for VCF"
" output. Array values will be capped at int32 max value.",
f,
)
info_type_convert_expr.update(
{
f: ht.info[f].map(
lambda x: hl.or_missing(
hl.is_defined(x), hl.int32(hl.min(2**31 - 1, x))
)
)
}
)
ht = ht.annotate(info=ht.info.annotate(**info_type_convert_expr))
info_expr = {}
# Make sure to pipe-delimit fields that need to.
# Note: the expr needs to be prefixed by "|" because GATK expect one value for the ref (always empty)
# Note2: this doesn't produce the correct annotation for AS_SB_TABLE, it
# is handled below
for f in pipe_delimited_annotations:
if f in ht.info and f != "AS_SB_TABLE":
info_expr[f] = "|" + get_pipe_expr(ht.info[f])
# Flatten SB if it is an array of arrays
if "SB" in ht.info and not isinstance(ht.info.SB, hl.expr.ArrayNumericExpression):
info_expr["SB"] = ht.info.SB[0].extend(ht.info.SB[1])
if "AS_SB_TABLE" in ht.info:
info_expr["AS_SB_TABLE"] = get_pipe_expr(
ht.info.AS_SB_TABLE.map(lambda x: hl.delimit(x, ","))
)
# Annotate with new expression
ht = ht.annotate(info=ht.info.annotate(**info_expr))
return ht
def make_label_combos(
label_groups: Dict[str, List[str]],
sort_order: List[str] = SORT_ORDER,
label_delimiter: str = "_",
) -> List[str]:
"""
Make combinations of all possible labels for a supplied dictionary of label groups.
For example, if label_groups is `{"sex": ["male", "female"], "pop": ["afr", "nfe", "amr"]}`,
this function will return `["afr_male", "afr_female", "nfe_male", "nfe_female", "amr_male", "amr_female']`
:param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]).
:param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER.
:param label_delimiter: String to use as delimiter when making group label combinations.
:return: list of all possible combinations of values for the supplied label groupings.
"""
copy_label_groups = copy.deepcopy(label_groups)
if len(copy_label_groups) == 1:
return [item for sublist in copy_label_groups.values() for item in sublist]
anchor_group = sorted(copy_label_groups.keys(), key=lambda x: sort_order.index(x))[
0
]
anchor_val = copy_label_groups.pop(anchor_group)
combos = []
for x, y in itertools.product(
anchor_val,
make_label_combos(copy_label_groups, label_delimiter=label_delimiter),
):
combos.append(f"{x}{label_delimiter}{y}")
return combos
def index_globals(
globals_array: List[Dict[str, str]],
label_groups: Dict[str, List[str]],
label_delimiter: str = "_",
) -> Dict[str, int]:
"""
Create a dictionary keyed by the specified label groupings with values describing the corresponding index of each grouping entry in the meta_array annotation.
:param globals_array: Ordered list containing dictionary entries describing all the grouping combinations contained in the globals_array annotation.
Keys are the grouping type (e.g., 'group', 'pop', 'sex') and values are the grouping attribute (e.g., 'adj', 'eas', 'XY').
:param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"])
:param label_delimiter: String used as delimiter when making group label combinations.
:return: Dictionary keyed by specified label grouping combinations, with values describing the corresponding index
of each grouping entry in the globals
"""
combos = make_label_combos(label_groups, label_delimiter=label_delimiter)
index_dict = {}
for combo in combos:
combo_fields = combo.split(label_delimiter)
for i, v in enumerate(globals_array):
if set(v.values()) == set(combo_fields):
index_dict.update({f"{combo}": i})
return index_dict
def make_combo_header_text(
preposition: str,
combo_dict: Dict[str, str],
pop_names: Dict[str, str],
) -> str:
"""
Programmatically generate text to populate the VCF header description for a given variant annotation with specific groupings and subset.
For example, if preposition is "for", group_types is ["group", "pop", "sex"], and combo_fields is ["adj", "afr", "female"],
this function will return the string " for female samples in the African-American/African genetic ancestry group".
:param preposition: Relevant preposition to precede automatically generated text.
:param combo_dict: Dict with grouping types as keys and values for grouping type as values. This function generates text for these values.
Possible grouping types are: "group", "pop", "sex", and "subpop".
Example input: {"pop": "afr", "sex": "female"}
:param pop_names: Dict with global population names (keys) and population descriptions (values).
:return: String with automatically generated description text for a given set of combo fields.
"""
header_text = " " + preposition
if len(combo_dict) == 1:
if combo_dict["group"] == "adj":
return ""
if "sex" in combo_dict:
header_text = header_text + " " + combo_dict["sex"]
header_text = header_text + " samples"
if "subpop" in combo_dict or "pop" in combo_dict:
if "subpop" in combo_dict:
header_text = (
header_text
+ f" in the {pop_names[combo_dict['subpop']]} genetic ancestry subgroup"
)
else:
header_text = (
header_text
+ f" in the {pop_names[combo_dict['pop']]} genetic ancestry group"
)
if "group" in combo_dict:
if combo_dict["group"] == "raw":
header_text = header_text + ", before removing low-confidence genotypes"
return header_text
def create_label_groups(
pops: List[str],
sexes: List[str] = SEXES,
all_groups: List[str] = GROUPS,
pop_sex_groups: List[str] = ["adj"],
) -> List[Dict[str, List[str]]]:
"""
Generate a list of label group dictionaries needed to populate info dictionary.
Label dictionaries are passed as input to `make_info_dict`.
:param pops: List of population names.
:param sexes: List of sample sexes.
:param all_groups: List of data types (raw, adj). Default is `GROUPS`, which is ["raw", "adj"].
:param pop_sex_groups: List of data types (raw, adj) to populate with pops and sexes. Default is ["adj"].
:return: List of label group dictionaries.
"""
return [
# This is to capture raw frequency fields, which are
# not stratified by sex or population (e.g., only AC_raw exists, not AC_XX_raw)
dict(group=all_groups),
dict(group=pop_sex_groups, sex=sexes),
dict(group=pop_sex_groups, pop=pops),
dict(group=pop_sex_groups, pop=pops, sex=sexes),
]
def make_info_dict(
prefix: str = "",
suffix: str = "",
prefix_before_metric: bool = True,
pop_names: Dict[str, str] = POP_NAMES,
label_groups: Dict[str, List[str]] = None,
label_delimiter: str = "_",
bin_edges: Dict[str, str] = None,
faf: bool = False,
popmax: bool = False,
grpmax: bool = False,
fafmax: bool = False,
callstats: bool = False,
freq_ctt: bool = False,
freq_cmh: bool = False,
freq_stat_union: bool = False,
description_text: str = "",
age_hist_distribution: str = None,
sort_order: List[str] = SORT_ORDER,
) -> Dict[str, Dict[str, str]]:
"""
Generate dictionary of Number and Description attributes of VCF INFO fields.
Used to populate the INFO fields of the VCF header during export.
Creates:
- INFO fields for age histograms (bin freq, n_smaller, and n_larger for heterozygous and homozygous variant carriers)
- INFO fields for popmax AC, AN, AF, nhomalt, and popmax population
- INFO fields for AC, AN, AF, nhomalt for each combination of sample population, sex, and subpopulation, both for adj and raw data
- INFO fields for filtering allele frequency (faf) annotations
:param prefix: Prefix string for data, e.g. "gnomAD". Default is empty string.
:param suffix: Suffix string for data, e.g. "gnomAD". Default is empty string.
:param prefix_before_metric: Whether prefix should be added before the metric (AC, AN, AF, nhomalt, faf95, faf99) in INFO field. Default is True.
:param pop_names: Dict with global population names (keys) and population descriptions (values). Default is POP_NAMES.
:param label_groups: Dictionary containing an entry for each label group, where key is the name of the grouping,
e.g. "sex" or "pop", and value is a list of all possible values for that grouping (e.g. ["male", "female"] or ["afr", "nfe", "amr"]).
:param label_delimiter: String to use as delimiter when making group label combinations.
:param bin_edges: Dictionary keyed by annotation type, with values that reflect the bin edges corresponding to the annotation.
:param faf: If True, use alternate logic to auto-populate dictionary values associated with filter allele frequency annotations.
:param popmax: If True, use alternate logic to auto-populate dictionary values associated with popmax annotations.
:param grpmax: If True, use alternate logic to auto-populate dictionary values associated with grpmax annotations.
:param fafmax: If True, use alternate logic to auto-populate dictionary values associated with fafmax annotations.
:param callstats: If True, use alternate logic to auto-populate dictionary values associated with callstats annotations.
:param freq_ctt: If True, use alternate logic to auto-populate dictionary values associated with frequency contingency table test (CTT) annotations.
:param freq_cmh: If True, use alternate logic to auto-populate dictionary values associated with frequency Cochran-Mantel-Haenszel (CMH) annotations.
:param freq_stat_union: If True, use alternate logic to auto-populate dictionary values associated with the union of the contingency table and Cochran-Mantel-Haenszel tests.
:param description_text: Optional text to append to the end of descriptions. Needs to start with a space if specified.
:param str age_hist_distribution: Pipe-delimited string of overall age distribution.
:param sort_order: List containing order to sort label group combinations. Default is SORT_ORDER.
:return: Dictionary keyed by VCF INFO annotations, where values are dictionaries of Number and Description attributes.
"""
if prefix != "":
prefix = f"{prefix}{label_delimiter}"
if suffix != "":
suffix = f"{label_delimiter}{suffix}"
info_dict = dict()
if age_hist_distribution:
age_hist_dict = {
f"{prefix}age_hist_het_bin_freq{suffix}": {
"Number": "A",
"Description": (
f"Histogram of ages of heterozygous individuals{description_text};"
f" bin edges are: {bin_edges['het']}; total number of individuals"
f" of any genotype bin: {age_hist_distribution}"
),
},
f"{prefix}age_hist_het_n_smaller{suffix}": {
"Number": "A",
"Description": (
"Count of age values falling below lowest histogram bin edge for"
f" heterozygous individuals{description_text}"
),
},
f"{prefix}age_hist_het_n_larger{suffix}": {
"Number": "A",
"Description": (
"Count of age values falling above highest histogram bin edge for"
f" heterozygous individuals{description_text}"
),
},
f"{prefix}age_hist_hom_bin_freq{suffix}": {
"Number": "A",
"Description": (
"Histogram of ages of homozygous alternate"
f" individuals{description_text}; bin edges are:"
f" {bin_edges['hom']}; total number of individuals of any genotype"
f" bin: {age_hist_distribution}"
),
},
f"{prefix}age_hist_hom_n_smaller{suffix}": {
"Number": "A",
"Description": (
"Count of age values falling below lowest histogram bin edge for"
f" homozygous alternate individuals{description_text}"
),
},
f"{prefix}age_hist_hom_n_larger{suffix}": {
"Number": "A",
"Description": (
"Count of age values falling above highest histogram bin edge for"
f" homozygous alternate individuals{description_text}"
),
},
}
info_dict.update(age_hist_dict)
if popmax:
popmax_dict = {
f"{prefix}popmax{suffix}": {
"Number": "A",
"Description": (
f"Population with the maximum allele frequency{description_text}"
),
},
f"{prefix}AC{label_delimiter}popmax{suffix}": {
"Number": "A",
"Description": (
"Allele count in the population with the maximum allele"
f" frequency{description_text}"
),
},
f"{prefix}AN{label_delimiter}popmax{suffix}": {
"Number": "A",
"Description": (
"Total number of alleles in the population with the maximum allele"
f" frequency{description_text}"
),
},
f"{prefix}AF{label_delimiter}popmax{suffix}": {
"Number": "A",
"Description": (
f"Maximum allele frequency across populations{description_text}"
),
},
f"{prefix}nhomalt{label_delimiter}popmax{suffix}": {
"Number": "A",
"Description": (
"Count of homozygous individuals in the population with the"
f" maximum allele frequency{description_text}"
),
},
f"{prefix}faf95{label_delimiter}popmax{suffix}": {
"Number": "A",
"Description": (
"Filtering allele frequency (using Poisson 95% CI) for the"
f" population with the maximum allele frequency{description_text}"
),
},
}
info_dict.update(popmax_dict)
if grpmax:
grpmax_dict = {
f"{prefix}grpmax{suffix}": {
"Number": "A",
"Description": (
"Genetic ancestry group with the maximum allele"
f" frequency{description_text}"
),
},
f"{prefix}AC{label_delimiter}grpmax{suffix}": {
"Number": "A",
"Description": (
"Allele count in the genetic ancestry group with the maximum allele"
f" frequency{description_text}"
),
},
f"{prefix}AN{label_delimiter}grpmax{suffix}": {
"Number": "A",
"Description": (
"Total number of alleles in the genetic ancestry group with the"
f" maximum allele frequency{description_text}"
),
},
f"{prefix}AF{label_delimiter}grpmax{suffix}": {
"Number": "A",
"Description": (
"Maximum allele frequency across genetic ancestry"
f" groups{description_text}"
),
},
f"{prefix}nhomalt{label_delimiter}grpmax{suffix}": {
"Number": "A",
"Description": (
"Count of homozygous individuals in the genetic ancestry group"
f" with the maximum allele frequency{description_text}"
),
},
}
info_dict.update(grpmax_dict)
if fafmax:
fafmax_dict = {