Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add function get_chr_x_hom_alt_cutoffs, add arguments to infer_sex_karyotype and get_sex_expr to use the new function and it's output. #492

Merged
merged 8 commits into from
Oct 12, 2022
61 changes: 55 additions & 6 deletions gnomad/sample_qc/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from gnomad.sample_qc.sex import (
gaussian_mixture_model_karyotype_assignment,
get_chr_x_hom_alt_cutoffs,
get_ploidy_cutoffs,
get_sex_expr,
)
Expand Down Expand Up @@ -252,6 +253,8 @@ def infer_sex_karyotype(
use_gaussian_mixture_model: bool = False,
normal_ploidy_cutoff: int = 5,
aneuploidy_cutoff: int = 6,
chr_x_frac_hom_alt_expr: Optional[hl.expr.NumericExpression] = None,
normal_chr_x_hom_alt_cutoff: int = 5,
jkgoodrich marked this conversation as resolved.
Show resolved Hide resolved
) -> hl.Table:
"""
Create a Table with X_karyotype, Y_karyotype, and sex_karyotype.
Expand All @@ -265,15 +268,22 @@ def infer_sex_karyotype(

:param ploidy_ht: Input Table with chromosome X and chromosome Y ploidy values and optionally f-stat.
:param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY
are above cutoff. Default is 0.5
are above cutoff. Default is 0.5.
:param use_gaussian_mixture_model: Use gaussian mixture model to split samples into 'XX' and 'XY' instead of f-stat.
:param normal_ploidy_cutoff: Number of standard deviations to use when determining sex chromosome ploidy cutoffs.
:param normal_ploidy_cutoff: Number of standard deviations to use when determining sex chromosome ploidy cutoffs
for XX, XY karyotypes.
:param aneuploidy_cutoff: Number of standard deviations to use when determining sex chromosome ploidy cutoffs for
aneuploidies.
:param chr_x_frac_hom_alt_expr: Fraction of homozygous alternate genotypes (hom-alt/(hom-alt + het)) on chromosome X.
:param normal_chr_x_hom_alt_cutoff: Number of standard deviations to use when determining cutoffs for the fraction
of homozygous alternate genotypes (hom-alt/(hom-alt + het)) on chromosome X for for XX and XY karyotypes. Only
used if `chr_x_frac_hom_alt_expr` is supplied.
:return: Table of samples imputed sex karyotype.
"""
logger.info("Inferring sex karyotype")
if chr_x_frac_hom_alt_expr is not None:
ploidy_ht = ploidy_ht.annotate(_chr_x_frac_hom_alt=chr_x_frac_hom_alt_expr)

if use_gaussian_mixture_model:
logger.info("Using Gaussian Mixture Model for karyotype assignment")
gmm_sex_ht = gaussian_mixture_model_karyotype_assignment(ploidy_ht)
Expand All @@ -283,6 +293,11 @@ def infer_sex_karyotype(
normal_ploidy_cutoff=normal_ploidy_cutoff,
aneuploidy_cutoff=aneuploidy_cutoff,
)
ploidy_ht = ploidy_ht.annotate(
gmm_karyotype=gmm_sex_ht[ploidy_ht.key].gmm_karyotype
)
group_by_expr = ploidy_ht.gmm_karyotype
f_stat_cutoff = None
else:
logger.info("Using f-stat for karyotype assignment")
x_ploidy_cutoffs, y_ploidy_cutoffs = get_ploidy_cutoffs(
Expand All @@ -291,16 +306,40 @@ def infer_sex_karyotype(
normal_ploidy_cutoff=normal_ploidy_cutoff,
aneuploidy_cutoff=aneuploidy_cutoff,
)
group_by_expr = None

if chr_x_frac_hom_alt_expr is not None:
logger.info(
"Including cutoffs for the fraction of homozygous alternate genotypes (hom-alt/(hom-alt + het)) on "
"chromosome X. Using %d standard deviations to determine cutoffs.",
normal_chr_x_hom_alt_cutoff,
)
chr_x_frac_hom_alt_expr = ploidy_ht._chr_x_frac_hom_alt
chr_x_frac_hom_alt_cutoffs = get_chr_x_hom_alt_cutoffs(
ploidy_ht,
chr_x_frac_hom_alt_expr,
f_stat_cutoff=f_stat_cutoff,
group_by_expr=group_by_expr,
cutoff_stdev=normal_chr_x_hom_alt_cutoff,
)

else:
chr_x_frac_hom_alt_cutoffs = None

karyotype_ht = ploidy_ht.select(
**get_sex_expr(
ploidy_ht.chrX_ploidy,
ploidy_ht.chrY_ploidy,
x_ploidy_cutoffs,
y_ploidy_cutoffs,
chr_x_frac_hom_alt_expr=chr_x_frac_hom_alt_expr,
chr_x_frac_hom_alt_cutoffs=chr_x_frac_hom_alt_cutoffs,
)
)
karyotype_ht = karyotype_ht.annotate_globals(
use_gaussian_mixture_model=use_gaussian_mixture_model,
normal_ploidy_cutoff=normal_ploidy_cutoff,
aneuploidy_cutoff=aneuploidy_cutoff,
x_ploidy_cutoffs=hl.struct(
upper_cutoff_X=x_ploidy_cutoffs[0],
lower_cutoff_XX=x_ploidy_cutoffs[1][0],
Expand All @@ -312,11 +351,21 @@ def infer_sex_karyotype(
upper_cutoff_Y=y_ploidy_cutoffs[0][1],
lower_cutoff_YY=y_ploidy_cutoffs[1],
),
use_gaussian_mixture_model=use_gaussian_mixture_model,
normal_ploidy_cutoff=normal_ploidy_cutoff,
aneuploidy_cutoff=aneuploidy_cutoff,
)
if not use_gaussian_mixture_model:
if chr_x_frac_hom_alt_expr is not None:
karyotype_ht = karyotype_ht.annotate_globals(
x_frac_hom_alt_cutoffs=hl.struct(
lower_cutoff_more_than_one_X=chr_x_frac_hom_alt_cutoffs[0][0],
upper_cutoff_more_than_one_X=chr_x_frac_hom_alt_cutoffs[0][1],
lower_cutoff_single_X=chr_x_frac_hom_alt_cutoffs[1],
)
)

if use_gaussian_mixture_model:
karyotype_ht = karyotype_ht.annotate(
gmm_sex_karyotype=ploidy_ht[karyotype_ht.key].gmm_karyotype
)
else:
karyotype_ht = karyotype_ht.annotate_globals(f_stat_cutoff=f_stat_cutoff)

return karyotype_ht
Expand Down
145 changes: 130 additions & 15 deletions gnomad/sample_qc/sex.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# noqa: D100

import logging
from typing import List, Tuple, Union
from typing import List, Optional, Tuple, Union

import hail as hl
import numpy as np
Expand Down Expand Up @@ -214,7 +214,7 @@ def get_ploidy_cutoffs(
)
if "XX" not in sex_stats:
raise ValueError("No samples are grouped as XX!")
if "XX" not in sex_stats:
if "XY" not in sex_stats:
raise ValueError("No samples are grouped as XY!")
logger.info("XX stats: %s", sex_stats["XX"])
logger.info("XY stats: %s", sex_stats["XY"])
Expand Down Expand Up @@ -246,50 +246,165 @@ def get_ploidy_cutoffs(
return cutoffs


def get_chr_x_hom_alt_cutoffs(
ht: hl.Table,
chr_x_frac_hom_alt_expr: hl.expr.NumericExpression,
f_stat_cutoff: float = None,
group_by_expr: hl.expr.StringExpression = None,
cutoff_stdev: int = 5,
) -> Tuple[Tuple[float, float], float]:
"""
Get cutoffs for the fraction homozygous alternate genotypes on chromosome X in 'XY' and 'XX' samples.

.. note::

This assumes the input hail Table has the field 'f_stat' if `f_stat_cutoff` is set.

Return a tuple of cutoffs for the fraction of homozygous alternate genotypes (hom-alt/(hom-alt + het)) on
chromosome X: ((lower cutoff for more than one X, upper cutoff for more than one X), lower cutoff for single X).

Uses the `cutoff_stdev` parameter to determine the fraction of homozygous alternate genotypes
(hom-alt/(hom-alt + het)) on chromosome X cutoffs for 'XX' and 'XY' karyotypes.

.. note::

`f_stat_cutoff` or `group_by_expr` must be supplied. If `f_stat_cutoff` is supplied then f-stat is used to
split the samples into roughly 'XX' and 'XY'. If `group_by_expr` is supplied instead, then it must include an
annotation grouping samples by 'XX' and 'XY'. These are both only used to divide samples into XX and XY to
determine means and standard deviations for these categories and are not used in the final karyotype annotation.

:param ht: Table with f_stat and fraction of homozygous alternate genotypes on chromosome X.
:param chr_x_frac_hom_alt_expr: Fraction of homozygous alternate genotypes (hom-alt/(hom-alt + het)) on chromosome X.
:param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY
are above cutoff.
:param group_by_expr: Expression grouping samples into 'XX' and 'XY'. Can be used instead of `f_stat_cutoff`.
:param cutoff_stdev: Number of standard deviations to use when determining sex chromosome ploidy cutoffs
for XX, XY karyotypes.
:return: Tuple of cutoffs: ((lower cutoff for more than one X, upper cutoff for more than one X), lower cutoff for
single X).
"""
if (f_stat_cutoff is None and group_by_expr is None) or (
f_stat_cutoff is not None and group_by_expr is not None
):
raise ValueError(
"One and only one of 'f_stat_cutoff' or 'group_by_expr' must be supplied!"
)

# If 'f_stat_cutoff' is supplied, group the input Table by f_stat cutoff
if f_stat_cutoff is not None:
group_by_expr = hl.if_else(ht.f_stat < f_stat_cutoff, "XX", "XY")

# Get mean/stdev based on 'group_by_expr'
sex_stats = ht.aggregate(
hl.agg.group_by(
group_by_expr,
hl.struct(chrx_homalt=hl.agg.stats(chr_x_frac_hom_alt_expr)),
)
)
if "XX" not in sex_stats:
raise ValueError("No samples are grouped as XX!")
if "XY" not in sex_stats:
raise ValueError("No samples are grouped as XY!")

logger.info("XX stats: %s", sex_stats["XX"])
logger.info("XY stats: %s", sex_stats["XY"])

cutoffs = (
(
sex_stats["XX"].chrx_homalt.mean
- (cutoff_stdev * sex_stats["XX"].chrx_homalt.stdev),
sex_stats["XX"].chrx_homalt.mean
+ (cutoff_stdev * sex_stats["XX"].chrx_homalt.stdev),
),
sex_stats["XY"].chrx_homalt.mean
- (cutoff_stdev * sex_stats["XY"].chrx_homalt.stdev),
)

logger.info("chrx_homalt cutoffs: %s", cutoffs)

return cutoffs


def get_sex_expr(
chr_x_ploidy: hl.expr.NumericExpression,
chr_y_ploidy: hl.expr.NumericExpression,
x_ploidy_cutoffs: Tuple[float, Tuple[float, float], float],
y_ploidy_cutoffs: Tuple[Tuple[float, float], float],
chr_x_frac_hom_alt_expr: Optional[hl.expr.NumericExpression] = None,
chr_x_frac_hom_alt_cutoffs: Optional[Tuple[Tuple[float, float], float]] = None,
) -> hl.expr.StructExpression:
"""
Create a struct with X_karyotype, Y_karyotype, and sex_karyotype.

Note that X0 is currently returned as 'X'.

:param chr_x_ploidy: Chromosome X ploidy (or relative ploidy)
:param chr_y_ploidy: Chromosome Y ploidy (or relative ploidy)
:param chr_x_ploidy: Chromosome X ploidy (or relative ploidy).
:param chr_y_ploidy: Chromosome Y ploidy (or relative ploidy).
:param x_ploidy_cutoffs: Tuple of X chromosome ploidy cutoffs: (upper cutoff for single X, (lower cutoff for
double X, upper cutoff for double X), lower cutoff for triple X)
double X, upper cutoff for double X), lower cutoff for triple X).
:param y_ploidy_cutoffs: Tuple of Y chromosome ploidy cutoffs: ((lower cutoff for single Y, upper cutoff for
single Y), lower cutoff for double Y)
:return: Struct containing X_karyotype, Y_karyotype, and sex_karyotype
single Y), lower cutoff for double Y).
:param chr_x_frac_hom_alt_expr: Fraction of homozygous alternate genotypes (hom-alt/(hom-alt + het)) on chromosome X.
:param chr_x_frac_hom_alt_cutoffs: Tuple of cutoffs for the fraction of homozygous alternate genotypes
(hom-alt/(hom-alt + het)) on chromosome X: ((lower cutoff for more than one X, upper cutoff for more than one X),
lower cutoff for single X).
:return: Struct containing X_karyotype, Y_karyotype, and sex_karyotype.
"""
if sum([chr_x_frac_hom_alt_expr is None, chr_x_frac_hom_alt_cutoffs is None]) == 1:
raise ValueError(
"None or both of `chr_x_frac_hom_alt_expr` and `chr_x_frac_hom_alt_cutoffs` must be set!"
)

if chr_x_frac_hom_alt_expr is not None:
lower_cutoff_for_single_x = chr_x_frac_hom_alt_cutoffs[1]
lower_cutoff_for_multiple_x = chr_x_frac_hom_alt_cutoffs[0][0]
upper_cutoff_for_multiple_x = chr_x_frac_hom_alt_cutoffs[0][1]

add_x_condition = chr_x_frac_hom_alt_expr > lower_cutoff_for_single_x
add_xx_condition = (chr_x_frac_hom_alt_expr > lower_cutoff_for_multiple_x) & (
chr_x_frac_hom_alt_expr < upper_cutoff_for_multiple_x
)
add_xxx_condition = chr_x_frac_hom_alt_expr < upper_cutoff_for_multiple_x
else:
add_x_condition = add_xx_condition = add_xxx_condition = True

upper_ploidy_cutoff_for_x = x_ploidy_cutoffs[0]
lower_ploidy_cutoff_for_xx = x_ploidy_cutoffs[1][0]
upper_ploidy_cutoff_for_xx = x_ploidy_cutoffs[1][1]
lower_ploidy_cutoff_for_xxx = x_ploidy_cutoffs[2]

lower_ploidy_cutoff_for_y = y_ploidy_cutoffs[0][0]
upper_ploidy_cutoff_for_y = y_ploidy_cutoffs[0][1]
lower_ploidy_cutoff_for_yy = y_ploidy_cutoffs[1]

sex_expr = hl.struct(
X_karyotype=(
hl.case()
.when(chr_x_ploidy < x_ploidy_cutoffs[0], "X")
.when((chr_x_ploidy < upper_ploidy_cutoff_for_x) & add_x_condition, "X")
.when(
(
(chr_x_ploidy > x_ploidy_cutoffs[1][0])
& (chr_x_ploidy < x_ploidy_cutoffs[1][1])
(chr_x_ploidy > lower_ploidy_cutoff_for_xx)
& (chr_x_ploidy < upper_ploidy_cutoff_for_xx)
& add_xx_condition
),
"XX",
)
.when((chr_x_ploidy >= x_ploidy_cutoffs[2]), "XXX")
.when(
(chr_x_ploidy >= lower_ploidy_cutoff_for_xxx) & add_xxx_condition, "XXX"
)
.default("ambiguous")
),
Y_karyotype=(
hl.case()
.when(chr_y_ploidy < y_ploidy_cutoffs[0][0], "")
.when(chr_y_ploidy < lower_ploidy_cutoff_for_y, "")
.when(
(
(chr_y_ploidy > y_ploidy_cutoffs[0][0])
& (chr_y_ploidy < y_ploidy_cutoffs[0][1])
(chr_y_ploidy > lower_ploidy_cutoff_for_y)
& (chr_y_ploidy < upper_ploidy_cutoff_for_y)
),
"Y",
)
.when(chr_y_ploidy >= y_ploidy_cutoffs[1], "YY")
.when(chr_y_ploidy >= lower_ploidy_cutoff_for_yy, "YY")
.default("ambiguous")
),
)
Expand Down