Skip to content

Commit

Permalink
Merge pull request #352 from Clinical-Genomics/separate_coverage_stat…
Browse files Browse the repository at this point in the history
…s_handler

Separate coverage stats handler
  • Loading branch information
northwestwitch authored Sep 27, 2024
2 parents 000f53e + 5e1c193 commit e2e5486
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 63 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
- Updated the Dockerfile base image so it contains the latest d4tools (master branch)
- Updated tests workflow to cargo install the latest d4tools from git (master branch)
- Computing coverage completeness stats using d4tools `perc_cov` stat function (much quicker reports)
- Moved functions computing the coverage stats to a separate `meta/handle_coverage_stats.py` module
### Fixed
- Updated dependencies including `certifi` to address dependabot alert
- Update pytest to v.7.4.4 to address a `ReDoS` vulnerability
Expand Down
4 changes: 2 additions & 2 deletions src/chanjo2/endpoints/coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@
from chanjo2.dbutil import get_session
from chanjo2.meta.handle_bed import bed_file_interval_id_coords
from chanjo2.meta.handle_completeness_stats import get_completeness_stats
from chanjo2.meta.handle_d4 import (
from chanjo2.meta.handle_coverage_stats import (
get_d4tools_chromosome_mean_coverage,
get_d4tools_intervals_coverage,
get_d4tools_intervals_mean_coverage,
get_samples_sex_metrics,
)
from chanjo2.meta.handle_d4 import get_samples_sex_metrics
from chanjo2.meta.handle_report_contents import INTERVAL_TYPE_SQL_TYPE, get_mean
from chanjo2.models import SQLGene
from chanjo2.models.pydantic_models import (
Expand Down
63 changes: 63 additions & 0 deletions src/chanjo2/meta/handle_coverage_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import subprocess
import tempfile
from typing import List, Tuple

from chanjo2.constants import CHROMOSOMES

# Zero-based column positions within a tab-separated line of `d4tools stat` output.
CHROM_INDEX = 0  # chromosome name
START_INDEX = 1  # presumably the interval start coordinate (unused in the visible code) - confirm
STOP_INDEX = 2  # presumably the interval stop coordinate (unused in the visible code) - confirm
STATS_MEAN_COVERAGE_INDEX = 3  # value of the computed statistic (mean coverage)


def get_d4tools_intervals_mean_coverage(
    d4_file_path: str, intervals: List[str]
) -> List[float]:
    """Return the mean coverage of a d4 file over intervals or, lacking intervals, chromosomes.

    Args:
        d4_file_path: Path to the d4 file, handed over to d4tools.
        intervals: Lines that are written as-is, newline-separated, to a temporary
            bed file used as the d4tools region file. When empty, the mean coverage
            of the chromosomes listed in CHROMOSOMES is returned instead.

    Returns:
        One mean coverage value per interval, or one per chromosome reported by d4tools.
    """
    if not intervals:
        chromosomes_mean_cov = get_d4tools_chromosome_mean_coverage(
            d4_file_path=d4_file_path, chromosomes=CHROMOSOMES
        )
        return [mean_cov for _, mean_cov in chromosomes_mean_cov]

    # The context manager closes and removes the temporary file deterministically,
    # also when the d4tools call raises.
    with tempfile.NamedTemporaryFile(mode="w") as bed_file:
        bed_file.write("\n".join(intervals))
        # Flush buffered content so d4tools sees the complete file on disk.
        bed_file.flush()
        return get_d4tools_intervals_coverage(
            d4_file_path=d4_file_path, bed_file_path=bed_file.name
        )


def get_d4tools_intervals_coverage(
    d4_file_path: str, bed_file_path: str
) -> List[float]:
    """Return the mean coverage of each interval of a bed file, computed on a d4 file.

    Runs `d4tools stat --region <bed> <d4> --stat mean` and parses its
    tab-separated output.

    Args:
        d4_file_path: Path to the d4 file.
        bed_file_path: Path to the bed file passed to d4tools as region file.

    Returns:
        The statistic column of every non-blank output line, converted to float.

    Raises:
        subprocess.CalledProcessError: If d4tools exits with a non-zero status.
    """
    d4tools_output: str = subprocess.check_output(
        ["d4tools", "stat", "--region", bed_file_path, d4_file_path, "--stat", "mean"],
        text=True,
    )
    intervals_coverage: List[float] = []
    for line in d4tools_output.splitlines():
        # A blank line carries no interval and would fail on the column lookup.
        if not line.strip():
            continue
        columns: List[str] = line.rstrip().split("\t")
        intervals_coverage.append(float(columns[STATS_MEAN_COVERAGE_INDEX]))
    return intervals_coverage


def get_d4tools_chromosome_mean_coverage(
    d4_file_path: str, chromosomes: List[str]
) -> List[Tuple[str, float]]:
    """Return the mean coverage over entire chromosomes of a d4 file.

    Runs `d4tools stat --stat mean <d4>` without a region file and keeps only
    the output lines whose first column is one of the requested chromosomes.

    Args:
        d4_file_path: Path to the d4 file.
        chromosomes: Names of the chromosomes to keep; they are compared
            verbatim with the first column of the d4tools output.

    Returns:
        (chromosome, mean coverage) tuples, in the order d4tools reports them.

    Raises:
        subprocess.CalledProcessError: If d4tools exits with a non-zero status.
    """
    stats_lines: List[str] = subprocess.check_output(
        ["d4tools", "stat", "--stat", "mean", d4_file_path],
        text=True,
    ).splitlines()

    # Build the lookup once instead of scanning the list for every output line.
    requested_chromosomes = set(chromosomes)
    chromosomes_coverage: List[Tuple[str, float]] = []
    for line in stats_lines:
        stats_data: List[str] = line.split("\t")
        chromosome: str = stats_data[CHROM_INDEX]
        if chromosome not in requested_chromosomes:
            continue
        chromosomes_coverage.append(
            (chromosome, float(stats_data[STATS_MEAN_COVERAGE_INDEX]))
        )
    return chromosomes_coverage
65 changes: 4 additions & 61 deletions src/chanjo2/meta/handle_d4.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import subprocess
import tempfile
from statistics import mean
from typing import Dict, List, Optional, Tuple, Union

from sqlalchemy.orm import Session

from chanjo2.constants import CHROMOSOMES
from chanjo2.crud.intervals import get_gene_intervals, set_sql_intervals
from chanjo2.meta.handle_completeness_stats import get_completeness_stats
from chanjo2.meta.handle_coverage_stats import (
get_d4tools_chromosome_mean_coverage,
get_d4tools_intervals_mean_coverage,
)
from chanjo2.models import SQLExon, SQLGene, SQLTranscript
from chanjo2.models.pydantic_models import (
GeneCoverage,
Expand All @@ -17,64 +18,6 @@
TranscriptTag,
)

# Zero-based column positions within a tab-separated line of `d4tools stat` output.
CHROM_INDEX = 0  # chromosome name
START_INDEX = 1  # presumably the interval start coordinate (unused in the visible code) - confirm
STOP_INDEX = 2  # presumably the interval stop coordinate (unused in the visible code) - confirm
STATS_MEAN_COVERAGE_INDEX = 3  # value of the computed statistic (mean coverage)


def get_d4tools_chromosome_mean_coverage(
    d4_file_path: str, chromosomes: List[str]
) -> List[Tuple[str, float]]:
    """Return the mean coverage over entire chromosomes of a d4 file.

    Runs `d4tools stat --stat mean <d4>` without a region file and keeps only
    the output lines whose first column is one of the requested chromosomes.

    Args:
        d4_file_path: Path to the d4 file.
        chromosomes: Names of the chromosomes to keep; they are compared
            verbatim with the first column of the d4tools output.

    Returns:
        (chromosome, mean coverage) tuples, in the order d4tools reports them.

    Raises:
        subprocess.CalledProcessError: If d4tools exits with a non-zero status.
    """
    stats_lines: List[str] = subprocess.check_output(
        ["d4tools", "stat", "--stat", "mean", d4_file_path],
        text=True,
    ).splitlines()

    # Build the lookup once instead of scanning the list for every output line.
    requested_chromosomes = set(chromosomes)
    chromosomes_coverage: List[Tuple[str, float]] = []
    for line in stats_lines:
        stats_data: List[str] = line.split("\t")
        chromosome: str = stats_data[CHROM_INDEX]
        if chromosome not in requested_chromosomes:
            continue
        chromosomes_coverage.append(
            (chromosome, float(stats_data[STATS_MEAN_COVERAGE_INDEX]))
        )
    return chromosomes_coverage


def get_d4tools_intervals_mean_coverage(
    d4_file_path: str, intervals: List[str]
) -> List[float]:
    """Return the mean coverage of a d4 file over intervals or, lacking intervals, chromosomes.

    Args:
        d4_file_path: Path to the d4 file, handed over to d4tools.
        intervals: Lines that are written as-is, newline-separated, to a temporary
            bed file used as the d4tools region file. When empty, the mean coverage
            of the chromosomes listed in CHROMOSOMES is returned instead.

    Returns:
        One mean coverage value per interval, or one per chromosome reported by d4tools.
    """
    if not intervals:
        chromosomes_mean_cov = get_d4tools_chromosome_mean_coverage(
            d4_file_path=d4_file_path, chromosomes=CHROMOSOMES
        )
        return [mean_cov for _, mean_cov in chromosomes_mean_cov]

    # The context manager closes and removes the temporary file deterministically,
    # also when the d4tools call raises.
    with tempfile.NamedTemporaryFile(mode="w") as bed_file:
        bed_file.write("\n".join(intervals))
        # Flush buffered content so d4tools sees the complete file on disk.
        bed_file.flush()
        return get_d4tools_intervals_coverage(
            d4_file_path=d4_file_path, bed_file_path=bed_file.name
        )


def get_d4tools_intervals_coverage(
    d4_file_path: str, bed_file_path: str
) -> List[float]:
    """Return the mean coverage of each interval of a bed file, computed on a d4 file.

    Runs `d4tools stat --region <bed> <d4> --stat mean` and parses its
    tab-separated output.

    Args:
        d4_file_path: Path to the d4 file.
        bed_file_path: Path to the bed file passed to d4tools as region file.

    Returns:
        The statistic column of every non-blank output line, converted to float.

    Raises:
        subprocess.CalledProcessError: If d4tools exits with a non-zero status.
    """
    d4tools_output: str = subprocess.check_output(
        ["d4tools", "stat", "--region", bed_file_path, d4_file_path, "--stat", "mean"],
        text=True,
    )
    intervals_coverage: List[float] = []
    for line in d4tools_output.splitlines():
        # A blank line carries no interval and would fail on the column lookup.
        if not line.strip():
            continue
        columns: List[str] = line.rstrip().split("\t")
        intervals_coverage.append(float(columns[STATS_MEAN_COVERAGE_INDEX]))
    return intervals_coverage


def get_report_sample_interval_coverage(
d4_file_path: str,
Expand Down

0 comments on commit e2e5486

Please sign in to comment.