Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix multiple problems with mishandling of missing entries in compute_… #242

Merged
merged 8 commits into from
Sep 15, 2020
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
* Changed quality histograms to label histograms calculated on raw and not adj data [(#247)](/~https://github.com/broadinstitute/gnomad_methods/pull/247)
* Updated some VCF export constants [(#249)](/~https://github.com/broadinstitute/gnomad_methods/pull/249)
* Changed default DP threshold to 5 for hemi genotype calls in `annotate_adj` and `get_adj_expr` [(#252)](/~https://github.com/broadinstitute/gnomad_methods/pull/252)
* Updated coverage resources to version 3.0.1 [(#242)](/~https://github.com/broadinstitute/gnomad_methods/pull/242)
* Fixed handling of missing entries (not within a ref block / alt site) when computing `coverage_stats` in `sparse_mt.py` [(#242)](/~https://github.com/broadinstitute/gnomad_methods/pull/242)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

might be helpful to include a note about the resource changes here too


## Version 0.4.0 - July 9th, 2020

Expand Down
10 changes: 6 additions & 4 deletions gnomad/resources/grch38/gnomad.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@

CURRENT_EXOME_RELEASE = ""
CURRENT_GENOME_RELEASE = "3.0"
CURRENT_GENOME_COVERAGE_RELEASE = "3.0.1"
EXOME_RELEASES = []
GENOME_RELEASES = ["3.0"]
GENOME_COVERAGE_RELEASES = GENOME_RELEASES + ["3.0.1"]
DATA_TYPES = ["genomes"]

GENOME_POPS = ["AFR", "AMI", "AMR", "ASJ", "EAS", "FIN", "NFE", "SAS", "OTH"]
Expand Down Expand Up @@ -101,8 +103,8 @@ def coverage(data_type: str) -> VersionedTableResource:
current_release = CURRENT_EXOME_RELEASE
releases = EXOME_RELEASES
else:
current_release = CURRENT_GENOME_RELEASE
releases = GENOME_RELEASES
current_release = CURRENT_GENOME_COVERAGE_RELEASE
releases = GENOME_COVERAGE_RELEASES

return VersionedTableResource(
current_release,
Expand Down Expand Up @@ -134,8 +136,8 @@ def coverage_tsv_path(data_type: str, version: Optional[str] = None) -> str:
)
else:
if version is None:
version = CURRENT_GENOME_RELEASE
elif version not in GENOME_RELEASES:
version = CURRENT_GENOME_COVERAGE_RELEASE
elif version not in GENOME_COVERAGE_RELEASES:
raise DataException(
f"Version {version} of gnomAD genomes for GRCh38 does not exist"
)
Expand Down
16 changes: 8 additions & 8 deletions gnomad/utils/reference_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,14 @@ def get_reference_ht(
}
)

context = None
context = []
for contig in contigs:
n_partitions = max(1, int(ref.contig_length(contig) / 5000000))
logger.info(
f"Creating reference contig {contig} with {n_partitions} partitions."
)
_context = hl.utils.range_table(
ref.contig_length(contig),
n_partitions=int(ref.contig_length(contig) / 5000000),
ref.contig_length(contig), n_partitions=n_partitions
)

locus_expr = hl.locus(contig=contig, pos=_context.idx + 1, reference_genome=ref)
Expand All @@ -77,12 +80,9 @@ def get_reference_ht(
if filter_n:
_context = _context.filter(_context.alleles[0] == "n", keep=False)

if context is None:
context = _context
else:
context = context.union(_context)
context.append(_context)

return context
return context.pop().union(*context)


def add_reference_sequence(ref: hl.ReferenceGenome) -> hl.ReferenceGenome:
Expand Down
13 changes: 8 additions & 5 deletions gnomad/utils/sparse_mt.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,10 +651,10 @@ def compute_coverage_stats(
print(f"Computing coverage stats on {n_samples} samples.")

# Create an outer join with the reference Table
mt = mt.select_entries("END", "DP")
mt = mt.select_entries("END", "DP").select_cols().select_rows()
col_key_fields = list(mt.col_key)
t = mt._localize_entries("__entries", "__cols")
t = t.join(reference_ht.annotate(_in_ref=True), how="outer")
t = t.join(reference_ht.key_by(*mt.row_key).select(_in_ref=True), how="outer")
t = t.annotate(
__entries=hl.or_else(
t.__entries,
Expand All @@ -669,14 +669,17 @@ def compute_coverage_stats(
# Filter rows where the reference is missing
mt = mt.filter_rows(mt._in_ref)

# Unfilter entries so that entries with no ref block overlap aren't null
mt = mt.unfilter_entries()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ohh, great catch!! I didn't realize you needed this and tested it out in a notebook to confirm.


# Compute coverage stats
coverage_over_x_bins = sorted(coverage_over_x_bins)
max_coverage_bin = coverage_over_x_bins[-1]
hl_coverage_over_x_bins = hl.array(coverage_over_x_bins)

# This expression creates a counter DP -> number of samples for DP between 0 and max_coverage_bin
coverage_counter_expr = hl.agg.counter(
hl.or_else(hl.min(max_coverage_bin, mt.DP), 0)
hl.min(max_coverage_bin, hl.or_else(mt.DP, 0))
)

# This expression aggregates the DP counter in reverse order of the coverage_over_x_bins
Expand All @@ -697,12 +700,12 @@ def compute_coverage_stats(
)
)
)
mean_expr = hl.agg.mean(mt.DP)
mean_expr = hl.agg.mean(hl.or_else(mt.DP, 0))

# Annotate rows now
return mt.select_rows(
mean=hl.cond(hl.is_nan(mean_expr), 0, mean_expr),
median=hl.or_else(hl.agg.approx_median(mt.DP), 0),
median_approx=hl.or_else(hl.agg.approx_median(hl.or_else(mt.DP, 0)), 0),
total_DP=hl.agg.sum(mt.DP),
**{
f"over_{x}": count_array_expr[i] / n_samples
Expand Down