Skip to content

Commit

Permalink
Merge pull request #229 from Clinical-Genomics/refactor_predict_sex
Browse files Browse the repository at this point in the history
Refactor report's sex rows and coverage.get_samples_predicted_sex to use d4tools instead of pyd4
  • Loading branch information
northwestwitch authored Feb 20, 2024
2 parents 30c2214 + e559c81 commit f3b7c12
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 30 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
- Speed up response of `coverage.d4_intervals_coverage` by replacing the pyd4 lib with direct calls to d4tools and multiprocessing
- Removed 2 redundant functions in `meta.handle.bed.py`
- `coverage.d4_interval_coverage` is using direct calls to d4tools to retrieve stats over an entire chromosome or a genomic interval
- Reformat report samples' sex rows and `coverage.get_samples_predicted_sex` endpoint to use d4tools instead of pyd4 for evaluating sample sex
### Fixed
- `coverage.d4_interval_coverage` endpoint crashing when trying to compute coverage completeness over an entire chromosome

Expand Down
16 changes: 8 additions & 8 deletions src/chanjo2/endpoints/coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,12 @@
from chanjo2.dbutil import get_session
from chanjo2.meta.handle_bed import bed_file_interval_id_coords
from chanjo2.meta.handle_d4 import (
get_d4_file,
get_d4tools_chromosome_mean_coverage,
get_d4tools_coverage_completeness,
get_d4tools_intervals_coverage,
get_d4tools_intervals_mean_coverage,
get_sample_interval_coverage,
get_samples_sex_metrics,
set_interval,
)
from chanjo2.meta.handle_tasks import coverage_completeness_multitasker
from chanjo2.models import SQLExon, SQLGene, SQLTranscript
Expand Down Expand Up @@ -55,8 +53,8 @@ def d4_interval_coverage(query: FileCoverageQuery):
if None in [query.start, query.end]: # Coverage over an entire chromosome
return IntervalCoverage(
mean_coverage=get_d4tools_chromosome_mean_coverage(
d4_file_path=query.coverage_file_path, chromosome=query.chromosome
),
d4_file_path=query.coverage_file_path, chromosomes=[query.chromosome]
)[0][1],
completeness={},
interval_id=interval,
)
Expand Down Expand Up @@ -135,14 +133,16 @@ def d4_intervals_coverage(query: FileCoverageIntervalsFileQuery):

@router.get("/coverage/samples/predicted_sex", response_model=Dict)
async def get_samples_predicted_sex(coverage_file_path: str):
try:
d4_file: D4File = get_d4_file(coverage_file_path=coverage_file_path)
except Exception:
"""Return predicted sex for a sample given the coverage over its sex chromosomes."""
if (
isfile(coverage_file_path) is False
or validators.url(coverage_file_path) is False
):
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=WRONG_COVERAGE_FILE_MSG,
)
return get_samples_sex_metrics(d4_file=d4_file)
return get_samples_sex_metrics(d4_file_path=coverage_file_path)


@router.post(
Expand Down
28 changes: 17 additions & 11 deletions src/chanjo2/meta/handle_d4.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,18 +46,23 @@ def get_intervals_coords_list(
return interval_coords


def get_d4tools_chromosome_mean_coverage(d4_file_path: str, chromosome=str) -> float:
"""Return mean coverage over one entire chromosome."""
def get_d4tools_chromosome_mean_coverage(
d4_file_path: str, chromosomes=List[str]
) -> List[Tuple[str, float]]:
"""Return mean coverage over entire chromosomes."""

chromosomes_stats_mean_cmd: List[str] = subprocess.check_output(
["d4tools", "stat", "-s" "mean", d4_file_path],
text=True,
).splitlines()

chromosomes_coverage: List[Tuple[str, float]] = []
for line in chromosomes_stats_mean_cmd:
stats_data: List[str] = line.split("\t")
if chromosome == stats_data[CHROM_INDEX]:
return stats_data[STATS_MEAN_COVERAGE_INDEX]
if stats_data[CHROM_INDEX] in chromosomes:
chromosomes_coverage.append(
(stats_data[CHROM_INDEX], float(stats_data[STATS_MEAN_COVERAGE_INDEX]))
)
return chromosomes_coverage


def get_d4tools_intervals_mean_coverage(
Expand Down Expand Up @@ -342,16 +347,17 @@ def predict_sex(x_cov: float, y_cov: float) -> str:
return Sex.FEMALE.value


def get_samples_sex_metrics(d4_file: D4File) -> Dict:
def get_samples_sex_metrics(d4_file_path: str) -> Dict:
"""Compute coverage over sex chromosomes and predicted sex."""

sex_chroms_coverage: List[float] = get_intervals_mean_coverage(
d4_file=d4_file, intervals=[("X"), ("Y")]
sex_chroms_coverage: List[Tuple[str, float]] = get_d4tools_chromosome_mean_coverage(
d4_file_path=d4_file_path, chromosomes=["X", "Y"]
)

return {
"x_coverage": round(sex_chroms_coverage[0], 1),
"y_coverage": round(sex_chroms_coverage[1], 1),
"x_coverage": round(sex_chroms_coverage[0][1], 1),
"y_coverage": round(sex_chroms_coverage[1][1], 1),
"predicted_sex": predict_sex(
x_cov=sex_chroms_coverage[0], y_cov=sex_chroms_coverage[1]
x_cov=sex_chroms_coverage[0][1], y_cov=sex_chroms_coverage[1][1]
),
}
16 changes: 5 additions & 11 deletions src/chanjo2/meta/handle_report_contents.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,7 @@ def get_report_data(
return data

# Add coverage_report - specific data
data["sex_rows"] = get_report_sex_rows(
samples=query.samples, samples_d4_files=samples_d4_files
)
data["sex_rows"] = get_report_sex_rows(samples=query.samples)
data["completeness_rows"] = get_report_completeness_rows(
samples_coverage_stats=samples_coverage_stats,
levels=query.completeness_thresholds,
Expand Down Expand Up @@ -231,17 +229,13 @@ def get_report_completeness_rows(
return completeness_rows


def get_report_sex_rows(
samples: List[ReportQuerySample], samples_d4_files: List[Tuple[str, D4File]]
) -> List[Dict]:
def get_report_sex_rows(samples: List[ReportQuerySample]) -> List[Dict]:
"""Create and return the contents for the sample sex lines in the coverage report."""
sample_sex_rows: D4FileList = []
for sample in samples:
for identifier, d4_file in samples_d4_files:
if identifier != sample.name:
continue

sample_sex_metrics: Dict = get_samples_sex_metrics(d4_file=d4_file)
sample_sex_metrics: Dict = get_samples_sex_metrics(
d4_file_path=sample.coverage_file_path
)

sample_sex_row: SampleSexRow = SampleSexRow(
**{
Expand Down

0 comments on commit f3b7c12

Please sign in to comment.