From 56d7f67cbb07c3c6504aa2d6640991a031ccf37f Mon Sep 17 00:00:00 2001 From: wlu04 Date: Tue, 14 Dec 2021 17:02:57 -0500 Subject: [PATCH 01/11] Modified annotate_sex() --- gnomad/sample_qc/pipeline.py | 55 ++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/gnomad/sample_qc/pipeline.py b/gnomad/sample_qc/pipeline.py index 31bc0992e..587efe21c 100644 --- a/gnomad/sample_qc/pipeline.py +++ b/gnomad/sample_qc/pipeline.py @@ -11,7 +11,7 @@ from gnomad.utils.reference_genome import get_reference_genome from gnomad.utils.sparse_mt import impute_sex_ploidy -from typing import List, Optional +from typing import List, Optional, Union logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") logger = logging.getLogger(__name__) @@ -209,7 +209,7 @@ def get_qc_mt( def annotate_sex( - mt: hl.MatrixTable, + mtds: Union[hl.MatrixTable, hl.vds.VariantDataset], is_sparse: bool = True, excluded_intervals: Optional[hl.Table] = None, included_intervals: Optional[hl.Table] = None, @@ -238,7 +238,7 @@ def annotate_sex( - Y_karyotype (str): Sample's chromosome Y karyotype. - sex_karyotype (str): Sample's sex karyotype. - :param mt: Input MatrixTable + :param mtds: Input MatrixTable or VariantDataset :param bool is_sparse: Whether input MatrixTable is in sparse data format :param excluded_intervals: Optional table of intervals to exclude from the computation. :param included_intervals: Optional table of intervals to use in the computation. REQUIRED for exomes. @@ -252,14 +252,29 @@ def annotate_sex( :return: Table of samples and their imputed sex karyotypes. """ logger.info("Imputing sex chromosome ploidies...") - if is_sparse: - ploidy_ht = impute_sex_ploidy( - mt, excluded_intervals, included_intervals, normalization_contig + + is_vds = isinstance(mtds, hl.vds.VariantDataset) + if is_vds: + ploidy_ht = hl.vds.impute_sex_chromosome_ploidy( + mtds, + calling_intervals=included_intervals, + normalization_contig=normalization_contig, ) - else: - raise NotImplementedError( - "Imputing sex ploidy does not exist yet for dense data." + ploidy_ht = ploidy_ht.transmute( + chrX_ploidy=ploidy_ht.x_ploidy, chrY_ploidy=ploidy_ht.y_ploidy ) + mtds = hl.vds.split_multi(mtds, filter_changed_loci=True) + mt = mtds.variant_data + else: + mt = mtds + if is_sparse: + ploidy_ht = impute_sex_ploidy( + mt, excluded_intervals, included_intervals, normalization_contig + ) + else: + raise NotImplementedError( + "Imputing sex ploidy does not exist yet for dense data." + ) x_contigs = get_reference_genome(mt.locus).x_contigs logger.info("Filtering mt to biallelic SNPs in X contigs: %s", x_contigs) @@ -269,9 +284,22 @@ def annotate_sex( mt = mt.filter_rows( (hl.len(mt.alleles) == 2) & hl.is_snp(mt.alleles[0], mt.alleles[1]) ) - mt = hl.filter_intervals( - mt, [hl.parse_locus_interval(contig) for contig in x_contigs] - ) + + if is_vds: + mtds = hl.vds.filter_variants(mtds, mt.rows(), keep=True) + mtds = hl.vds.filter_intervals( + mtds, + [ + hl.parse_locus_interval(contig, reference_genome="GRCh38") + for contig in x_contigs + ], + keep=True, + ) + mt = mtds.variant_data + else: + mt = hl.filter_intervals( + mt, [hl.parse_locus_interval(contig) for contig in x_contigs], keep=True + ) if sites_ht is not None: if aaf_expr == None: @@ -282,6 +310,9 @@ def annotate_sex( logger.info("Filtering to provided sites") mt = mt.annotate_rows(**sites_ht[mt.row_key]) mt = mt.filter_rows(hl.is_defined(mt[aaf_expr])) + if is_vds: + mtds = hl.vds.filter_variants(mtds, mt.rows()) + mt = mtds.variant_data logger.info("Calculating inbreeding coefficient on chrX") sex_ht = hl.impute_sex( From c3718fc844280dc0c34adae133493e8120b911e9 Mon Sep 17 00:00:00 2001 From: Wenhan Lu <68076796+wlu04@users.noreply.github.com> Date: Wed, 15 Dec 2021 11:55:23 -0500 Subject: [PATCH 02/11] Update gnomad/sample_qc/pipeline.py Co-authored-by: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> --- gnomad/sample_qc/pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gnomad/sample_qc/pipeline.py b/gnomad/sample_qc/pipeline.py index 587efe21c..b2106a92f 100644 --- a/gnomad/sample_qc/pipeline.py +++ b/gnomad/sample_qc/pipeline.py @@ -260,8 +260,8 @@ def annotate_sex( calling_intervals=included_intervals, normalization_contig=normalization_contig, ) - ploidy_ht = ploidy_ht.transmute( - chrX_ploidy=ploidy_ht.x_ploidy, chrY_ploidy=ploidy_ht.y_ploidy +.rename({'C1' : 'col1', 'C2' : 'col2'}) + ploidy_ht = ploidy_ht.rename({'x_ploidy' : 'chrX_ploidy', 'y_ploidy' : 'chrY_ploidy'}) ) mtds = hl.vds.split_multi(mtds, filter_changed_loci=True) mt = mtds.variant_data From e8b3c1e1b07c030eaca5191232d9cca33be9dc66 Mon Sep 17 00:00:00 2001 From: wlu04 Date: Wed, 15 Dec 2021 12:28:37 -0500 Subject: [PATCH 03/11] Removed redundant steps in annotate_sex() --- gnomad/sample_qc/pipeline.py | 37 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/gnomad/sample_qc/pipeline.py b/gnomad/sample_qc/pipeline.py index b2106a92f..c1fe566f1 100644 --- a/gnomad/sample_qc/pipeline.py +++ b/gnomad/sample_qc/pipeline.py @@ -214,6 +214,7 @@ def annotate_sex( excluded_intervals: Optional[hl.Table] = None, included_intervals: Optional[hl.Table] = None, normalization_contig: str = "chr20", + reference_genome: str = "GRCh38", sites_ht: Optional[hl.Table] = None, aaf_expr: Optional[str] = None, gt_expr: str = "GT", @@ -243,6 +244,7 @@ def annotate_sex( :param excluded_intervals: Optional table of intervals to exclude from the computation. :param included_intervals: Optional table of intervals to use in the computation. REQUIRED for exomes. :param normalization_contig: Which chromosome to use to normalize sex chromosome coverage. Used in determining sex chromosome ploidies. + :param reference_genome: Reference genome used for constructing interval list. Default: 'GRCh38' :param sites_ht: Optional Table to use. If present, filters input MatrixTable to sites in this Table prior to imputing sex, and pulls alternate allele frequency from this Table. :param aaf_expr: Optional. Name of field in input MatrixTable with alternate allele frequency. @@ -255,15 +257,18 @@ def annotate_sex( is_vds = isinstance(mtds, hl.vds.VariantDataset) if is_vds: + if excluded_intervals is not None: + raise NotImplementedError( + "excluded_intervals is not used when imputing sex chromosome ploidy for VDS" + ) ploidy_ht = hl.vds.impute_sex_chromosome_ploidy( mtds, calling_intervals=included_intervals, normalization_contig=normalization_contig, ) -.rename({'C1' : 'col1', 'C2' : 'col2'}) - ploidy_ht = ploidy_ht.rename({'x_ploidy' : 'chrX_ploidy', 'y_ploidy' : 'chrY_ploidy'}) + ploidy_ht = ploidy_ht.rename( + {"x_ploidy": "chrX_ploidy", "y_ploidy": "chrY_ploidy"} ) - mtds = hl.vds.split_multi(mtds, filter_changed_loci=True) mt = mtds.variant_data else: mt = mtds @@ -285,21 +290,14 @@ def annotate_sex( (hl.len(mt.alleles) == 2) & hl.is_snp(mt.alleles[0], mt.alleles[1]) ) - if is_vds: - mtds = hl.vds.filter_variants(mtds, mt.rows(), keep=True) - mtds = hl.vds.filter_intervals( - mtds, - [ - hl.parse_locus_interval(contig, reference_genome="GRCh38") - for contig in x_contigs - ], - keep=True, - ) - mt = mtds.variant_data - else: - mt = hl.filter_intervals( - mt, [hl.parse_locus_interval(contig) for contig in x_contigs], keep=True - ) + mt = hl.filter_intervals( + mt, + [ + hl.parse_locus_interval(contig, reference_genome=reference_genome) + for contig in x_contigs + ], + keep=True, + ) if sites_ht is not None: if aaf_expr == None: @@ -310,9 +308,6 @@ def annotate_sex( logger.info("Filtering to provided sites") mt = mt.annotate_rows(**sites_ht[mt.row_key]) mt = mt.filter_rows(hl.is_defined(mt[aaf_expr])) - if is_vds: - mtds = hl.vds.filter_variants(mtds, mt.rows()) - mt = mtds.variant_data logger.info("Calculating inbreeding coefficient on chrX") sex_ht = hl.impute_sex( From 6e1e8a92b0b8bc7bbeb4e6b27a3ced0add56edb1 Mon Sep 17 00:00:00 2001 From: Wenhan Lu <68076796+wlu04@users.noreply.github.com> Date: Wed, 15 Dec 2021 14:17:22 -0500 Subject: [PATCH 04/11] Update gnomad/sample_qc/pipeline.py Co-authored-by: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> --- gnomad/sample_qc/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/sample_qc/pipeline.py b/gnomad/sample_qc/pipeline.py index c1fe566f1..1e9219a80 100644 --- a/gnomad/sample_qc/pipeline.py +++ b/gnomad/sample_qc/pipeline.py @@ -259,7 +259,7 @@ def annotate_sex( if is_vds: if excluded_intervals is not None: raise NotImplementedError( - "excluded_intervals is not used when imputing sex chromosome ploidy for VDS" + "The use of the parameter excluded_intervals is currently not implemented for imputing sex chromosome ploidy on a VDS" ) ploidy_ht = hl.vds.impute_sex_chromosome_ploidy( mtds, From 554a2ec37e64a9d707a638e9882f1da1ba88849f Mon Sep 17 00:00:00 2001 From: wlu04 Date: Wed, 15 Dec 2021 14:25:45 -0500 Subject: [PATCH 05/11] Minor fixes --- gnomad/sample_qc/pipeline.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/gnomad/sample_qc/pipeline.py b/gnomad/sample_qc/pipeline.py index 1e9219a80..c07596a58 100644 --- a/gnomad/sample_qc/pipeline.py +++ b/gnomad/sample_qc/pipeline.py @@ -214,7 +214,6 @@ def annotate_sex( excluded_intervals: Optional[hl.Table] = None, included_intervals: Optional[hl.Table] = None, normalization_contig: str = "chr20", - reference_genome: str = "GRCh38", sites_ht: Optional[hl.Table] = None, aaf_expr: Optional[str] = None, gt_expr: str = "GT", @@ -226,7 +225,7 @@ def annotate_sex( Return Table with the following fields: - s (str): Sample - - chr20_mean_dp (float32): Sample's mean coverage over chromosome 20. + - chr20_mean_dp/autosomal_mean_dp (float32): Sample's mean coverage over chromosome 20. - chrX_mean_dp (float32): Sample's mean coverage over chromosome X. - chrY_mean_dp (float32): Sample's mean coverage over chromosome Y. - chrX_ploidy (float32): Sample's imputed ploidy over chromosome X. @@ -244,7 +243,6 @@ def annotate_sex( :param excluded_intervals: Optional table of intervals to exclude from the computation. :param included_intervals: Optional table of intervals to use in the computation. REQUIRED for exomes. :param normalization_contig: Which chromosome to use to normalize sex chromosome coverage. Used in determining sex chromosome ploidies. - :param reference_genome: Reference genome used for constructing interval list. Default: 'GRCh38' :param sites_ht: Optional Table to use. If present, filters input MatrixTable to sites in this Table prior to imputing sex, and pulls alternate allele frequency from this Table. :param aaf_expr: Optional. Name of field in input MatrixTable with alternate allele frequency. @@ -267,7 +265,12 @@ def annotate_sex( normalization_contig=normalization_contig, ) ploidy_ht = ploidy_ht.rename( - {"x_ploidy": "chrX_ploidy", "y_ploidy": "chrY_ploidy"} + { + "x_ploidy": "chrX_ploidy", + "y_ploidy": "chrY_ploidy", + "x_mean_dp": "chrX_mean_dp", + "y_mean_dp": "chrY_mean_dp", + } ) mt = mtds.variant_data else: @@ -290,10 +293,11 @@ def annotate_sex( (hl.len(mt.alleles) == 2) & hl.is_snp(mt.alleles[0], mt.alleles[1]) ) + build = get_reference_genome(mt.locus).name mt = hl.filter_intervals( mt, [ - hl.parse_locus_interval(contig, reference_genome=reference_genome) + hl.parse_locus_interval(contig, reference_genome=build) for contig in x_contigs ], keep=True, From 218a4f6d0c7698301f817f26915909327a906b38 Mon Sep 17 00:00:00 2001 From: Wenhan Lu <68076796+wlu04@users.noreply.github.com> Date: Fri, 17 Dec 2021 10:00:42 -0500 Subject: [PATCH 06/11] Update gnomad/sample_qc/pipeline.py Co-authored-by: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> --- gnomad/sample_qc/pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gnomad/sample_qc/pipeline.py b/gnomad/sample_qc/pipeline.py index c07596a58..8422e43d8 100644 --- a/gnomad/sample_qc/pipeline.py +++ b/gnomad/sample_qc/pipeline.py @@ -270,6 +270,7 @@ def annotate_sex( "y_ploidy": "chrY_ploidy", "x_mean_dp": "chrX_mean_dp", "y_mean_dp": "chrY_mean_dp", + "autosomal_mean_dp": f"{normalization_contig}_mean_dp", } ) mt = mtds.variant_data From 47a841b739c1011999d74a25a5b9be7027c2c7d9 Mon Sep 17 00:00:00 2001 From: Wenhan Lu <68076796+wlu04@users.noreply.github.com> Date: Fri, 17 Dec 2021 10:00:53 -0500 Subject: [PATCH 07/11] Update gnomad/sample_qc/pipeline.py Co-authored-by: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> --- gnomad/sample_qc/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/sample_qc/pipeline.py b/gnomad/sample_qc/pipeline.py index 8422e43d8..2845c5f6b 100644 --- a/gnomad/sample_qc/pipeline.py +++ b/gnomad/sample_qc/pipeline.py @@ -257,7 +257,7 @@ def annotate_sex( if is_vds: if excluded_intervals is not None: raise NotImplementedError( - "The use of the parameter excluded_intervals is currently not implemented for imputing sex chromosome ploidy on a VDS" + "The use of the parameter 'excluded_intervals' is currently not implemented for imputing sex chromosome ploidy on a VDS!" ) ploidy_ht = hl.vds.impute_sex_chromosome_ploidy( mtds, From 8b60b9279262afb4baeb130815d5cf196e95363f Mon Sep 17 00:00:00 2001 From: Wenhan Lu <68076796+wlu04@users.noreply.github.com> Date: Fri, 17 Dec 2021 10:01:07 -0500 Subject: [PATCH 08/11] Update gnomad/sample_qc/pipeline.py Co-authored-by: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> --- gnomad/sample_qc/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/sample_qc/pipeline.py b/gnomad/sample_qc/pipeline.py index 2845c5f6b..22252f34b 100644 --- a/gnomad/sample_qc/pipeline.py +++ b/gnomad/sample_qc/pipeline.py @@ -225,7 +225,7 @@ def annotate_sex( Return Table with the following fields: - s (str): Sample - - chr20_mean_dp/autosomal_mean_dp (float32): Sample's mean coverage over chromosome 20. + - {normalization_contig}_mean_dp (float32): Sample's mean coverage over the specified `normalization_contig`. - chrX_mean_dp (float32): Sample's mean coverage over chromosome X. - chrY_mean_dp (float32): Sample's mean coverage over chromosome Y. - chrX_ploidy (float32): Sample's imputed ploidy over chromosome X. From 4053ef45964537e1f5bd57bfba887d5a4f654d62 Mon Sep 17 00:00:00 2001 From: wlu04 Date: Fri, 17 Dec 2021 10:07:55 -0500 Subject: [PATCH 09/11] Re-commit to include hl.vds.impute_sex_chromosome_ploidy() From 700638518e072067336869b15749bd5e1d099c14 Mon Sep 17 00:00:00 2001 From: wlu04 Date: Fri, 17 Dec 2021 12:05:28 -0500 Subject: [PATCH 10/11] Minor fixes --- gnomad/sample_qc/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/sample_qc/pipeline.py b/gnomad/sample_qc/pipeline.py index 22252f34b..362b7a53d 100644 --- a/gnomad/sample_qc/pipeline.py +++ b/gnomad/sample_qc/pipeline.py @@ -225,7 +225,7 @@ def annotate_sex( Return Table with the following fields: - s (str): Sample - - {normalization_contig}_mean_dp (float32): Sample's mean coverage over the specified `normalization_contig`. + - `normalization_contig`_mean_dp (float32): Sample's mean coverage over the specified `normalization_contig`. - chrX_mean_dp (float32): Sample's mean coverage over chromosome X. - chrY_mean_dp (float32): Sample's mean coverage over chromosome Y. - chrX_ploidy (float32): Sample's imputed ploidy over chromosome X. From 5a3b89dc79616092fcf8ba3d622c8c27aff51b39 Mon Sep 17 00:00:00 2001 From: wlu04 Date: Mon, 3 Jan 2022 10:19:24 -0500 Subject: [PATCH 11/11] temporary(?) fix --- gnomad/utils/file_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/file_utils.py b/gnomad/utils/file_utils.py index a40a18b1e..988cf5090 100644 --- a/gnomad/utils/file_utils.py +++ b/gnomad/utils/file_utils.py @@ -11,7 +11,8 @@ from typing import Callable, List, Dict, Optional, Tuple, Union import hail as hl -from hailtop.aiotools import LocalAsyncFS, RouterAsyncFS, AsyncFS +from hailtop.aiotools import LocalAsyncFS, AsyncFS +from hailtop.aiotools.router_fs import RouterAsyncFS from hailtop.aiogoogle import GoogleStorageAsyncFS from hailtop.utils import bounded_gather, tqdm