Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix multiple problems with mishandling of missing entries in compute_… #242

Merged
merged 8 commits into from
Sep 15, 2020
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
* Changed quality histograms to label histograms calculated on raw and not adj data [(#247)](/~https://github.com/broadinstitute/gnomad_methods/pull/247)
* Updated some VCF export constants [(#249)](/~https://github.com/broadinstitute/gnomad_methods/pull/249)
* Changed default DP threshold to 5 for hemi genotype calls in `annotate_adj` and `get_adj_expr` [(#252)](/~https://github.com/broadinstitute/gnomad_methods/pull/252)
* Updated coverage resources to version 3.0.1 [(#242)](/~https://github.com/broadinstitute/gnomad_methods/pull/242)
* Fixed handling of missing entries (not within a ref block / alt site) when computing `coverage_stats` in `sparse_mt.py` [(#242)](/~https://github.com/broadinstitute/gnomad_methods/pull/242)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

might be helpful to include a note about the resource changes here too


## Version 0.4.0 - July 9th, 2020

Expand Down
10 changes: 6 additions & 4 deletions gnomad/resources/grch38/gnomad.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@

CURRENT_EXOME_RELEASE = ""
CURRENT_GENOME_RELEASE = "3.0"
CURRENT_GENOME_COVERAGE_RELEASE = "3.0.1"
EXOME_RELEASES = []
GENOME_RELEASES = ["3.0"]
GENOME_COVERAGE_RELEASES = GENOME_RELEASES + ["3.0.1"]
DATA_TYPES = ["genomes"]

GENOME_POPS = ["AFR", "AMI", "AMR", "ASJ", "EAS", "FIN", "NFE", "SAS", "OTH"]
Expand Down Expand Up @@ -101,8 +103,8 @@ def coverage(data_type: str) -> VersionedTableResource:
current_release = CURRENT_EXOME_RELEASE
releases = EXOME_RELEASES
else:
current_release = CURRENT_GENOME_RELEASE
releases = GENOME_RELEASES
current_release = CURRENT_GENOME_COVERAGE_RELEASE
releases = GENOME_COVERAGE_RELEASES

return VersionedTableResource(
current_release,
Expand Down Expand Up @@ -134,8 +136,8 @@ def coverage_tsv_path(data_type: str, version: Optional[str] = None) -> str:
)
else:
if version is None:
version = CURRENT_GENOME_RELEASE
elif version not in GENOME_RELEASES:
version = CURRENT_GENOME_COVERAGE_RELEASE
elif version not in GENOME_COVERAGE_RELEASES:
raise DataException(
f"Version {version} of gnomAD genomes for GRCh38 does not exist"
)
Expand Down
16 changes: 8 additions & 8 deletions gnomad/utils/reference_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,14 @@ def get_reference_ht(
}
)

context = None
context = []
for contig in contigs:
n_partitions = max(1, int(ref.contig_length(contig) / 5000000))
logger.info(
f"Creating reference contig {contig} with {n_partitions} partitions."
)
_context = hl.utils.range_table(
ref.contig_length(contig),
n_partitions=int(ref.contig_length(contig) / 5000000),
ref.contig_length(contig), n_partitions=n_partitions
)

locus_expr = hl.locus(contig=contig, pos=_context.idx + 1, reference_genome=ref)
Expand All @@ -77,12 +80,9 @@ def get_reference_ht(
if filter_n:
_context = _context.filter(_context.alleles[0] == "n", keep=False)

if context is None:
context = _context
else:
context = context.union(_context)
context.append(_context)

return context
return context.pop().union(*context)


def add_reference_sequence(ref: hl.ReferenceGenome) -> hl.ReferenceGenome:
Expand Down
13 changes: 8 additions & 5 deletions gnomad/utils/sparse_mt.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,10 +651,10 @@ def compute_coverage_stats(
print(f"Computing coverage stats on {n_samples} samples.")

# Create an outer join with the reference Table
mt = mt.select_entries("END", "DP")
mt = mt.select_entries("END", "DP").select_cols().select_rows()
col_key_fields = list(mt.col_key)
t = mt._localize_entries("__entries", "__cols")
t = t.join(reference_ht.annotate(_in_ref=True), how="outer")
t = t.join(reference_ht.key_by(*mt.row_key).select(_in_ref=True), how="outer")
t = t.annotate(
__entries=hl.or_else(
t.__entries,
Expand All @@ -669,14 +669,17 @@ def compute_coverage_stats(
# Filter rows where the reference is missing
mt = mt.filter_rows(mt._in_ref)

# Unfilter entries so that entries with no ref block overlap aren't null
mt = mt.unfilter_entries()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ohh, great catch!! I didn't realize you needed this and tested it out in a notebook to confirm.


# Compute coverage stats
coverage_over_x_bins = sorted(coverage_over_x_bins)
max_coverage_bin = coverage_over_x_bins[-1]
hl_coverage_over_x_bins = hl.array(coverage_over_x_bins)

# This expression creates a counter DP -> number of samples for DP between 0 and max_coverage_bin
coverage_counter_expr = hl.agg.counter(
hl.or_else(hl.min(max_coverage_bin, mt.DP), 0)
hl.min(max_coverage_bin, hl.or_else(mt.DP, 0))
)

# This expression aggregates the DP counter in reverse order of the coverage_over_x_bins
Expand All @@ -697,12 +700,12 @@ def compute_coverage_stats(
)
)
)
mean_expr = hl.agg.mean(mt.DP)
mean_expr = hl.agg.mean(hl.or_else(mt.DP, 0))

# Annotate rows now
return mt.select_rows(
mean=hl.cond(hl.is_nan(mean_expr), 0, mean_expr),
median=hl.or_else(hl.agg.approx_median(mt.DP), 0),
median_approx=hl.or_else(hl.agg.approx_median(hl.or_else(mt.DP, 0)), 0),
total_DP=hl.agg.sum(mt.DP),
**{
f"over_{x}": count_array_expr[i] / n_samples
Expand Down