From aec9b697863ef06b4e86e248bebde6616f4eb54e Mon Sep 17 00:00:00 2001
From: akikuno
Date: Sat, 15 Jun 2024 15:51:56 +0900
Subject: [PATCH] Apply formatting with Ruff

---
 docs/RELEASE.md                               |  6 ++-
 pyproject.toml                                | 38 +++++++++++++++
 .../core/classification/allele_merger.py      |  3 +-
 src/DAJIN2/core/classification/classifier.py  |  4 +-
 src/DAJIN2/core/clustering/__init__.py        |  2 +-
 src/DAJIN2/core/clustering/clustering.py      |  8 ++--
 src/DAJIN2/core/clustering/label_extractor.py |  9 ++--
 src/DAJIN2/core/clustering/label_merger.py    |  1 +
 src/DAJIN2/core/clustering/score_handler.py   |  4 +-
 .../core/clustering/strand_bias_handler.py    | 15 +++---
 src/DAJIN2/core/consensus/__init__.py         |  6 +--
 src/DAJIN2/core/consensus/clust_formatter.py  |  2 +-
 src/DAJIN2/core/consensus/consensus.py        | 10 ++--
 .../core/consensus/mutation_extractor.py      |  8 ++--
 src/DAJIN2/core/consensus/name_handler.py     |  3 +-
 .../core/consensus/similarity_searcher.py     | 11 ++---
 src/DAJIN2/core/preprocess/__init__.py        | 12 ++---
 src/DAJIN2/core/preprocess/cache_checker.py   |  1 +
 .../core/preprocess/homopolymer_handler.py    |  2 +-
 src/DAJIN2/core/preprocess/input_formatter.py | 12 ++---
 .../core/preprocess/insertions_to_fasta.py    | 48 +++++++++++--------
 .../core/preprocess/inversions_to_fasta.py    | 48 +++++++++++--------
 src/DAJIN2/core/preprocess/knockin_handler.py |  1 -
 src/DAJIN2/core/preprocess/mapping.py         | 19 +++++---
 src/DAJIN2/core/preprocess/midsv_caller.py    | 15 +++---
 .../core/preprocess/mutation_extractor.py     | 22 +++++----
 src/DAJIN2/core/report/__init__.py            |  4 +-
 src/DAJIN2/core/report/bam_exporter.py        |  9 ++--
 src/DAJIN2/core/report/mutation_exporter.py   |  5 +-
 src/DAJIN2/core/report/sequence_exporter.py   |  7 +--
 src/DAJIN2/gui.py                             |  8 ++--
 src/DAJIN2/main.py                            | 32 +++++++------
 src/DAJIN2/utils/config.py                    |  5 +-
 src/DAJIN2/utils/cssplits_handler.py          |  6 ++-
 src/DAJIN2/utils/input_validator.py           |  9 ++--
 src/DAJIN2/utils/io.py                        | 22 ++++-----
 src/DAJIN2/utils/multiprocess.py              |  6 +--
 src/DAJIN2/utils/report_generator.py          |  1 -
 src/DAJIN2/utils/sam_handler.py               |  2 +-
 src/DAJIN2/view.py                            |  2 +-
 tests/src/clustering/test_appender.py         |  2 +-
 tests/src/clustering/test_kmer_generator.py   |  4 +-
 tests/src/clustering/test_label_merger.py     |  4 +-
 tests/src/consensus/test_consensus.py         |  3 +-
 tests/src/consensus/test_name_handler.py      | 13 +++--
 tests/src/preprocess/test_genome_fetcher.py   |  3 +-
 tests/src/preprocess/test_mapping.py          |  1 +
 tests/src/preprocess/test_midsv_caller.py     | 15 +++---
 tests/src/report/test_bam_exporter.py         |  4 +-
 tests/src/utils/test_cssplits_handler.py      |  2 +-
 tests/src/utils/test_fastx_handler.py         |  1 -
 tests/src/utils/test_input_validator.py       |  8 ++--
 tests/src/utils/test_io.py                    |  7 +--
 tests/src/utils/test_multiprocess.py          |  8 ++--
 tests/src/utils/test_sam_handler.py           | 18 ++++---
 55 files changed, 291 insertions(+), 230 deletions(-)

diff --git a/docs/RELEASE.md b/docs/RELEASE.md
index d04f7a89..f659605b 100644
--- a/docs/RELEASE.md
+++ b/docs/RELEASE.md
@@ -17,7 +17,9 @@
 
 ## 💥 Breaking
 
-+ Enable to accept FASTA files as an input #37 [[Commit Detail](/~https://github.com/akikuno/DAJIN2/commit/ee6d392cd51649c928bd604acafbab4b9d28feb1)]
++ Accept additional file formats as input #37
+  + FASTA [[Commit Detail](/~https://github.com/akikuno/DAJIN2/commit/ee6d392cd51649c928bd604acafbab4b9d28feb1)]
+  + BAM [[Commit Detail](/~https://github.com/akikuno/DAJIN2/commit/xxx)]
 
 ## 🔧 Maintenance
 
@@ -25,6 +27,8 @@
 + Change `mutation_exporter.report_mutations` to return list[list[str]]. Update the tests accordingly. [[Commit Detail](/~https://github.com/akikuno/DAJIN2/commit/7153cb143d621e136ca94bfe6b391f1d7b61d438)]
[[Commit Detail](/~https://github.com/akikuno/DAJIN2/commit/7153cb143d621e136ca94bfe6b391f1d7b61d438)] ++ Apply formatting with Ruff [[Commit Detail](/~https://github.com/akikuno/DAJIN2/commit/xxx)] + ## 🐛 Bug Fixes + Add `reallocate_insertion_within_deletion` into `report.mutation_exporter` and reflected it in the mutation info. [[Commit Detail](/~https://github.com/akikuno/DAJIN2/commit/ed6a96e01bb40c77df9cd3a17a4c29524684b6f1)] diff --git a/pyproject.toml b/pyproject.toml index 2dfce2be..2654223e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,3 +51,41 @@ ruptures = ">=1.1.8" [tool.poetry.scripts] DAJIN2 = "DAJIN2.main:execute" + +[tool.ruff] +line-length = 119 + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # pyflakes + "I", # isort + "B", # flake8-bugbear + "C4", # flake8-comprehensions + "UP", # pyupgrade +] +ignore = [ + "E501", # line too long, handled by black + "B008", # do not perform function calls in argument defaults + "C901", # too complex + "W191", # indentation contains tabs + "B904", # ignore errors for raise ... from ... not being used +] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["F401"] + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["musubi_restapi"] +section-order = ["future", "standard-library", "third-party", "first-party", "local-folder"] +split-on-trailing-comma = true + +[tool.ruff.format] +quote-style = "double" + + +[tool.ruff.lint.pyupgrade] +# Settings for Python 3.8 compatibility +keep-runtime-typing = true diff --git a/src/DAJIN2/core/classification/allele_merger.py b/src/DAJIN2/core/classification/allele_merger.py index 00b812d3..835128ea 100644 --- a/src/DAJIN2/core/classification/allele_merger.py +++ b/src/DAJIN2/core/classification/allele_merger.py @@ -1,8 +1,7 @@ from __future__ import annotations -from itertools import groupby from collections import defaultdict - +from itertools import groupby ########################################################## # merge minor alleles diff --git a/src/DAJIN2/core/classification/classifier.py b/src/DAJIN2/core/classification/classifier.py index a3074d17..dc618c17 100644 --- a/src/DAJIN2/core/classification/classifier.py +++ b/src/DAJIN2/core/classification/classifier.py @@ -1,10 +1,10 @@ from __future__ import annotations -from pathlib import Path from itertools import groupby +from pathlib import Path -from DAJIN2.utils import io from DAJIN2.core.classification.allele_merger import merge_minor_alleles +from DAJIN2.utils import io def calc_match(cssplit: str) -> float: diff --git a/src/DAJIN2/core/clustering/__init__.py b/src/DAJIN2/core/clustering/__init__.py index afaabe86..4ff17894 100644 --- a/src/DAJIN2/core/clustering/__init__.py +++ b/src/DAJIN2/core/clustering/__init__.py @@ -1,3 +1,3 @@ +from DAJIN2.core.clustering.appender import add_labels, add_percent, add_readnum from DAJIN2.core.clustering.label_extractor import extract_labels -from DAJIN2.core.clustering.appender import add_labels, add_readnum, add_percent from DAJIN2.core.clustering.label_updator import update_labels diff --git a/src/DAJIN2/core/clustering/clustering.py b/src/DAJIN2/core/clustering/clustering.py index c50646c2..a336336b 100644 --- a/src/DAJIN2/core/clustering/clustering.py +++ b/src/DAJIN2/core/clustering/clustering.py @@ -1,18 +1,18 @@ from __future__ import annotations -from pathlib import Path -from itertools import chain from collections import Counter +from itertools import chain +from pathlib import Path import numpy as np +from 
 from sklearn import metrics
 from sklearn.cluster import BisectingKMeans
-from scipy.sparse import csr_matrix, spmatrix
 
-from DAJIN2.utils import io, config
 from DAJIN2.core.clustering.label_merger import merge_labels
 from DAJIN2.core.clustering.score_handler import subset_scores
 from DAJIN2.core.clustering.strand_bias_handler import remove_biased_clusters
+from DAJIN2.utils import config, io
 
 config.set_warnings_ignore()
diff --git a/src/DAJIN2/core/clustering/label_extractor.py b/src/DAJIN2/core/clustering/label_extractor.py
index d74482fc..eb500f42 100644
--- a/src/DAJIN2/core/clustering/label_extractor.py
+++ b/src/DAJIN2/core/clustering/label_extractor.py
@@ -1,15 +1,14 @@
 from __future__ import annotations
 
 import uuid
-
-from pathlib import Path
 from itertools import groupby
+from pathlib import Path
 
-from DAJIN2.utils import io
-from DAJIN2.core.clustering.score_handler import make_score, annotate_score
+from DAJIN2.core.clustering.clustering import return_labels
 from DAJIN2.core.clustering.label_updator import relabel_with_consective_order
+from DAJIN2.core.clustering.score_handler import annotate_score, make_score
 from DAJIN2.core.clustering.strand_bias_handler import is_strand_bias
-from DAJIN2.core.clustering.clustering import return_labels
+from DAJIN2.utils import io
 
 
 def extract_labels(classif_sample, TEMPDIR, SAMPLE_NAME, CONTROL_NAME) -> list[dict[str]]:
diff --git a/src/DAJIN2/core/clustering/label_merger.py b/src/DAJIN2/core/clustering/label_merger.py
index 42cd17c1..26bf8e4f 100644
--- a/src/DAJIN2/core/clustering/label_merger.py
+++ b/src/DAJIN2/core/clustering/label_merger.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from collections import Counter
+
 import numpy as np
diff --git a/src/DAJIN2/core/clustering/score_handler.py b/src/DAJIN2/core/clustering/score_handler.py
index db71036f..6e0db9ee 100644
--- a/src/DAJIN2/core/clustering/score_handler.py
+++ b/src/DAJIN2/core/clustering/score_handler.py
@@ -1,8 +1,8 @@
 from __future__ import annotations
 
-from typing import Generator
-from itertools import groupby
 from collections import Counter
+from itertools import groupby
+from typing import Generator
 
 from DAJIN2.core.clustering.kmer_generator import generate_mutation_kmers
diff --git a/src/DAJIN2/core/clustering/strand_bias_handler.py b/src/DAJIN2/core/clustering/strand_bias_handler.py
index f59832be..789d7fe3 100644
--- a/src/DAJIN2/core/clustering/strand_bias_handler.py
+++ b/src/DAJIN2/core/clustering/strand_bias_handler.py
@@ -1,5 +1,13 @@
 from __future__ import annotations
 
+from collections import defaultdict
+from pathlib import Path
+from typing import Generator
+
+from sklearn.tree import DecisionTreeClassifier
+
+from DAJIN2.utils import io
+
 """
 Nanopore sequencing results often results in strand specific mutations even though the mutation is not strand specific,
 thus they are considered as sequencing errors and should be removed.
 Re-allocates reads belonging to clusters with strand bias to clusters without strand bias.
""" -from pathlib import Path -from typing import Generator -from collections import defaultdict -from sklearn.tree import DecisionTreeClassifier - -from DAJIN2.utils import io - # Constants STRAND_BIAS_LOWER_LIMIT = 0.1 STRAND_BIAS_UPPER_LIMIT = 0.9 diff --git a/src/DAJIN2/core/consensus/__init__.py b/src/DAJIN2/core/consensus/__init__.py index 5a8496ac..3f2d5b2a 100644 --- a/src/DAJIN2/core/consensus/__init__.py +++ b/src/DAJIN2/core/consensus/__init__.py @@ -1,6 +1,4 @@ +from DAJIN2.core.consensus.clust_formatter import downsample_by_label, remove_minor_alleles from DAJIN2.core.consensus.consensus import call_consensus -from DAJIN2.core.consensus.name_handler import call_allele_name -from DAJIN2.core.consensus.name_handler import update_key_by_allele_name -from DAJIN2.core.consensus.name_handler import add_key_by_allele_name -from DAJIN2.core.consensus.clust_formatter import remove_minor_alleles, downsample_by_label from DAJIN2.core.consensus.mutation_extractor import cache_mutation_loci +from DAJIN2.core.consensus.name_handler import add_key_by_allele_name, call_allele_name, update_key_by_allele_name diff --git a/src/DAJIN2/core/consensus/clust_formatter.py b/src/DAJIN2/core/consensus/clust_formatter.py index 5c7c619f..47187cd4 100644 --- a/src/DAJIN2/core/consensus/clust_formatter.py +++ b/src/DAJIN2/core/consensus/clust_formatter.py @@ -1,8 +1,8 @@ from __future__ import annotations import random -from itertools import groupby from collections import defaultdict +from itertools import groupby def remove_minor_alleles(clust_sample: list[dict]) -> list[dict]: diff --git a/src/DAJIN2/core/consensus/consensus.py b/src/DAJIN2/core/consensus/consensus.py index 28eaba0f..7ff65e7b 100644 --- a/src/DAJIN2/core/consensus/consensus.py +++ b/src/DAJIN2/core/consensus/consensus.py @@ -1,14 +1,13 @@ from __future__ import annotations -from pathlib import Path +from collections import defaultdict from dataclasses import dataclass from itertools import groupby -from collections import defaultdict +from pathlib import Path from DAJIN2.utils import io from DAJIN2.utils.cssplits_handler import call_sequence - ########################################################### # call position weight matrix (cons_pergentage) ########################################################### @@ -98,11 +97,10 @@ class ConsensusKey: def call_consensus(tempdir: Path, sample_name: str, clust_sample: list[dict]) -> tuple[dict[list], dict[str]]: - clust_sample.sort(key=lambda x: [x["ALLELE"], x["LABEL"]]) - cons_percentages = dict() - cons_sequences = dict() + cons_percentages = {} + cons_sequences = {} for (allele, label), group in groupby(clust_sample, key=lambda x: [x["ALLELE"], x["LABEL"]]): clust = list(group) diff --git a/src/DAJIN2/core/consensus/mutation_extractor.py b/src/DAJIN2/core/consensus/mutation_extractor.py index c81d9adf..033b767b 100644 --- a/src/DAJIN2/core/consensus/mutation_extractor.py +++ b/src/DAJIN2/core/consensus/mutation_extractor.py @@ -1,14 +1,14 @@ from __future__ import annotations -from pathlib import Path from itertools import groupby +from pathlib import Path import numpy as np from sklearn.cluster import MiniBatchKMeans -from DAJIN2.utils import io -from DAJIN2.core.preprocess.mutation_extractor import summarize_indels, extract_mutation_loci, minimize_mutation_counts from DAJIN2.core.consensus.similarity_searcher import cache_selected_control_by_similarity +from DAJIN2.core.preprocess.mutation_extractor import extract_mutation_loci, minimize_mutation_counts, summarize_indels +from 
 
 """
 Most of the code reuses `preprocess.cache_mutation_loci`.
@@ -25,7 +25,7 @@ def get_thresholds(path_indels_normalized_sample, path_indels_normalized_control
     indels_normalized_sample = io.load_pickle(path_indels_normalized_sample)
     indels_normalized_control = io.load_pickle(path_indels_normalized_control)
     indels_normalized_minimize_control = minimize_mutation_counts(indels_normalized_control, indels_normalized_sample)
-    thresholds = dict()
+    thresholds = {}
     for mut in {"+", "-", "*"}:
         values_sample = indels_normalized_sample[mut]
         values_control = indels_normalized_minimize_control[mut]
diff --git a/src/DAJIN2/core/consensus/name_handler.py b/src/DAJIN2/core/consensus/name_handler.py
index 2a542450..9dfec876 100644
--- a/src/DAJIN2/core/consensus/name_handler.py
+++ b/src/DAJIN2/core/consensus/name_handler.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import re
+
 from DAJIN2.core.consensus.consensus import ConsensusKey
 
 
@@ -69,7 +70,7 @@ def call_allele_name(
 
 
 def update_key_by_allele_name(cons: dict, allele_names: dict[int, str]) -> dict:
-    cons_update = dict()
+    cons_update = {}
     for key in cons:
         old_allele = cons[key]
         new_allele = allele_names[key.label]
diff --git a/src/DAJIN2/core/consensus/similarity_searcher.py b/src/DAJIN2/core/consensus/similarity_searcher.py
index 698cbf63..5f3bee36 100644
--- a/src/DAJIN2/core/consensus/similarity_searcher.py
+++ b/src/DAJIN2/core/consensus/similarity_searcher.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
 
-from pathlib import Path
 from collections import defaultdict
+from pathlib import Path
 
 import numpy as np
-
 from sklearn.neighbors import LocalOutlierFactor
 
 from DAJIN2.utils import io
@@ -28,7 +27,7 @@ def onehot_by_mutations(midsv_sample: list[dict]) -> dict[str, np.ndarray]:
 def calculate_percentage(
     mut_onehot_sample: dict[str, np.ndarray], coverage_match: np.ndarray[int]
 ) -> dict[str, np.ndarray]:
-    mut_percentage = dict()
+    mut_percentage = {}
     for mut, onehot in mut_onehot_sample.items():
         x = np.sum(onehot, axis=0) / coverage_match
         mut_percentage[mut] = np.where(np.isnan(x), 0, x)
@@ -36,14 +35,14 @@ def calculate_percentage(
 
 
 def get_values_to_mask(mut_percentage_sample: dict[str, np.ndarray], threshold=0.5) -> dict[str, np.ndarray[float]]:
-    mask = dict()
+    mask = {}
     for mut, percentage in mut_percentage_sample.items():
         mask[mut] = np.where(percentage > threshold, 0, percentage)
     return mask
 
 
 def apply_mask(mut_onehot: dict[str, np.ndarray], mask_sample: dict[str, np.ndarray[float]]):
-    mut_onehot_masked = dict()
+    mut_onehot_masked = {}
     for mut, onehot in mut_onehot.items():
         mut_onehot_masked[mut] = onehot * mask_sample[mut]
     return mut_onehot_masked
@@ -52,7 +51,7 @@ def apply_mask(mut_onehot: dict[str, np.ndarray], mask_sample: dict[str, np.ndar
 def identify_normal_reads(
     mut_onehot_sample_masked: dict[str, np.ndarray], mut_onehot_control_masked: dict[str, np.ndarray]
 ) -> list[bool]:
-    mutation_comparisons = dict()
+    mutation_comparisons = {}
     for mut in {"+", "-", "*"}:
         values_sample = mut_onehot_sample_masked[mut]
         values_control = mut_onehot_control_masked[mut]
diff --git a/src/DAJIN2/core/preprocess/__init__.py b/src/DAJIN2/core/preprocess/__init__.py
index 73d7989d..51d791fe 100644
--- a/src/DAJIN2/core/preprocess/__init__.py
+++ b/src/DAJIN2/core/preprocess/__init__.py
@@ -1,9 +1,9 @@
-from DAJIN2.core.preprocess.cache_checker import exists_cached_hash, exists_cached_genome
-from DAJIN2.core.preprocess.genome_fetcher import fetch_coordinates, fetch_chromosome_size
-from DAJIN2.core.preprocess.mapping import generate_sam
-from DAJIN2.core.preprocess.directory_manager import create_temporal_directories, create_report_directories
+from DAJIN2.core.preprocess.cache_checker import exists_cached_genome, exists_cached_hash
+from DAJIN2.core.preprocess.directory_manager import create_report_directories, create_temporal_directories
+from DAJIN2.core.preprocess.genome_fetcher import fetch_chromosome_size, fetch_coordinates
 from DAJIN2.core.preprocess.input_formatter import format_inputs
-from DAJIN2.core.preprocess.midsv_caller import generate_midsv
+from DAJIN2.core.preprocess.insertions_to_fasta import generate_insertion_fasta
 from DAJIN2.core.preprocess.knockin_handler import extract_knockin_loci
+from DAJIN2.core.preprocess.mapping import generate_sam
+from DAJIN2.core.preprocess.midsv_caller import generate_midsv
 from DAJIN2.core.preprocess.mutation_extractor import cache_mutation_loci
-from DAJIN2.core.preprocess.insertions_to_fasta import generate_insertion_fasta
diff --git a/src/DAJIN2/core/preprocess/cache_checker.py b/src/DAJIN2/core/preprocess/cache_checker.py
index c6d42122..77403b60 100644
--- a/src/DAJIN2/core/preprocess/cache_checker.py
+++ b/src/DAJIN2/core/preprocess/cache_checker.py
@@ -2,6 +2,7 @@
 
 import hashlib
 from pathlib import Path
+
 from DAJIN2.utils import io
diff --git a/src/DAJIN2/core/preprocess/homopolymer_handler.py b/src/DAJIN2/core/preprocess/homopolymer_handler.py
index 1f5ab81c..4259b4dc 100644
--- a/src/DAJIN2/core/preprocess/homopolymer_handler.py
+++ b/src/DAJIN2/core/preprocess/homopolymer_handler.py
@@ -37,7 +37,7 @@ def extract_sequence_errors_in_homopolymer_loci(
     indels_normalized_control: dict[str, np.array],
     anomal_loci: dict[set],
 ) -> dict[str, set[int]]:
-    sequence_errors_in_homopolymer = dict()
+    sequence_errors_in_homopolymer = {}
     for mut in ["+", "-", "*"]:
         repeat_regions = get_repeat_regions(sequence, anomal_loci[mut])
         if len(repeat_regions) == 0:
diff --git a/src/DAJIN2/core/preprocess/input_formatter.py b/src/DAJIN2/core/preprocess/input_formatter.py
index 12885085..7ac93224 100644
--- a/src/DAJIN2/core/preprocess/input_formatter.py
+++ b/src/DAJIN2/core/preprocess/input_formatter.py
@@ -1,14 +1,12 @@
 from __future__ import annotations
 
 import uuid
-
-from pathlib import Path
-from dataclasses import dataclass
 from collections import defaultdict
-
-from DAJIN2.utils import io, config, fastx_handler
+from dataclasses import dataclass
+from pathlib import Path
 
 from DAJIN2.core import preprocess
+from DAJIN2.utils import config, fastx_handler, io
 
 
 def parse_arguments(arguments: dict) -> tuple:
@@ -64,7 +62,9 @@ def get_genome_coordinates(genome_urls: dict, fasta_alleles: dict, is_cache_geno
     if is_cache_genome:
         genome_coordinates = next(io.read_jsonl(Path(tempdir, "cache", "genome_coordinates.jsonl")))
     else:
-        genome_coordinates = preprocess.fetch_coordinates(genome_coordinates, genome_urls, fasta_alleles["control"])
+        genome_coordinates = preprocess.fetch_coordinates(
+            genome_coordinates, genome_urls, fasta_alleles["control"]
+        )
         genome_coordinates["chrom_size"] = preprocess.fetch_chromosome_size(genome_coordinates, genome_urls)
     io.write_jsonl([genome_coordinates], Path(tempdir, "cache", "genome_coordinates.jsonl"))
diff --git a/src/DAJIN2/core/preprocess/insertions_to_fasta.py b/src/DAJIN2/core/preprocess/insertions_to_fasta.py
index dbe95c03..38f572a6 100644
--- a/src/DAJIN2/core/preprocess/insertions_to_fasta.py
+++ b/src/DAJIN2/core/preprocess/insertions_to_fasta.py
@@ -1,20 +1,19 @@
 from __future__ import annotations
 
-import uuid
 import random
-from pathlib import Path
+import uuid
+from collections import Counter, defaultdict
 from itertools import groupby
-from collections import defaultdict, Counter
+from pathlib import Path
 from typing import Generator
 
 import numpy as np
 from rapidfuzz import process
 from rapidfuzz.distance import DamerauLevenshtein
-
 from sklearn.cluster import MeanShift
 
 from DAJIN2.core.preprocess.mapping import to_sam
-from DAJIN2.utils import io, config
+from DAJIN2.utils import config, io
 from DAJIN2.utils.cssplits_handler import convert_cssplits_to_cstag
 
 config.set_warnings_ignore()
@@ -65,7 +64,7 @@ def extract_all_insertions(midsv_sample: Generator, mutation_loci: list[set[str]
 def extract_enriched_insertions(
     insertions_sample: dict, insertions_control: dict, coverage_sample: int
 ) -> dict[int, dict[str, int]]:
-    enriched_insertions = dict()
+    enriched_insertions = {}
     threshold_sample = max(5, int(coverage_sample * 0.5 / 100))
     for i in insertions_sample:
         ins_sample: list[str] = insertions_sample[i]
@@ -94,7 +93,7 @@ def extract_enriched_insertions(
             if label in labels_count_sample:
                 del labels_count_sample[label]
 
-        if labels_count_sample == dict():
+        if labels_count_sample == {}:
             continue
 
         # Count the remaining insertion sequences.
@@ -149,7 +148,7 @@ def get_merged_insertion(insertion: dict[str, int], labels: np.ndarray) -> dict[
     insertion_label = [(label, {seq: count}) for label, (seq, count) in zip(labels, insertion.items())]
     insertion_label.sort(key=lambda x: x[0])
 
-    insertion_merged = dict()
+    insertion_merged = {}
     for _, group in groupby(insertion_label, key=lambda x: x[0]):
         group = [g[1] for g in group]
         sequences, counts = set(), 0
@@ -162,7 +161,9 @@ def get_merged_insertion(insertion: dict[str, int], labels: np.ndarray) -> dict[
     return insertion_merged
 
 
-def remove_minor_groups(insertions_merged: dict[tuple[int], dict[tuple[str], int]], coverage: int, percentage: float = 0.5) -> dict[tuple[int], dict[tuple[str], int]]:
+def remove_minor_groups(
+    insertions_merged: dict[tuple[int], dict[tuple[str], int]], coverage: int, percentage: float = 0.5
+) -> dict[tuple[int], dict[tuple[str], int]]:
     """Remove minor groups with less than {percentage} % coverage or less than 5 reads."""
     threshold = max(5, int(coverage * percentage // 100))
     for _, ins in insertions_merged.items():
@@ -184,11 +185,11 @@ def merge_similar_insertions(
 ) -> dict[tuple[int], dict[tuple[str], int]]:
     index_grouped = group_index_by_consecutive_insertions(mutation_loci)
     insertions_grouped = group_insertions(insertions, index_grouped)
-    insertions_merged = dict()
+    insertions_merged = {}
     for idx, insertion in insertions_grouped.items():
         if len(insertion) == 1:
             seq, count = next(iter(insertion.items()))
-            insertions_merged[idx] = {tuple([seq]): count}
+            insertions_merged[idx] = {(seq,): count}
             continue
 
         labels = clustering_insertions(insertion)
@@ -288,7 +289,7 @@ def extract_score_and_sequence(
             scores.append(score)
             sequences.append(",".join(sequence))
 
-    return [(score, sequence) for score, sequence in zip(scores, sequences)]
+    return list(zip(scores, sequences))
 
 
 def filter_minor_label(
@@ -411,7 +412,7 @@ def generate_consensus_insertions(TEMPDIR: Path, SAMPLE_NAME: str, cssplits: lis
 
 def remove_all_n(cons_sequence: dict[int, str]) -> dict[int, str]:
     """Remove all `N` sequences."""
-    cons_sequence_removed = dict()
+    cons_sequence_removed = {}
     for label, seq in cons_sequence.items():
         if all(True if s == "N" else False for s in seq.split(",")):
             continue
@@ -419,9 +420,11 @@ def remove_all_n(cons_sequence: dict[int, str]) -> dict[int, str]:
     return cons_sequence_removed
 
 
-def call_consensus_of_insertion(TEMPDIR: Path, SAMPLE_NAME: str, insertion_sequences_subset: list[dict]) -> dict[int, str]:
+def call_consensus_of_insertion(
+    TEMPDIR: Path, SAMPLE_NAME: str, insertion_sequences_subset: list[dict]
+) -> dict[int, str]:
     """Generate consensus cssplits."""
-    consensus_insertion_cssplits = dict()
+    consensus_insertion_cssplits = {}
     insertion_sequences_subset.sort(key=lambda x: x["LABEL"])
     for label, group in groupby(insertion_sequences_subset, key=lambda x: x["LABEL"]):
         cssplits = [cs["CSSPLIT"].split(",") for cs in group]
@@ -458,7 +461,7 @@ def extract_index_of_insertions(
 def generate_cstag(
     consensus_of_insertions: dict[str, str], index_of_insertions: list[int], sequence: str
 ) -> dict[str, str]:
-    cstag_insertions = dict()
+    cstag_insertions = {}
     for label, cons_seq in consensus_of_insertions.items():
         cons_seq = cons_seq.split(",")
         list_sequence = list(sequence)
@@ -472,7 +475,7 @@ def generate_cstag(
 
 
 def generate_fasta(cstag_insertions: dict[str, str]) -> dict[str, str]:
-    fasta_insertions = dict()
+    fasta_insertions = {}
     for label, cs_tag in cstag_insertions.items():
         fasta_insertions[label] = cstag.to_sequence(cs_tag)
 
@@ -497,7 +500,9 @@ def extract_unique_insertions(fasta_insertions: dict[str, str], FASTA_ALLELES: d
     for key, seq in fasta_insertions_unique.items():
         if key in to_delete:
             continue
-        _, distances, _ = zip(*process.extract_iter(seq, fasta_insertions_unique.values(), scorer=DamerauLevenshtein.distance))
+        _, distances, _ = zip(
+            *process.extract_iter(seq, fasta_insertions_unique.values(), scorer=DamerauLevenshtein.distance)
+        )
         similar_index = {i if d < 10 else None for i, d in enumerate(distances) if i != key}
         to_delete |= similar_index
 
@@ -538,6 +543,7 @@ def save_cstag(TEMPDIR: Path | str, SAMPLE_NAME: str, cstag_insertions: dict[str
     for header, cs_tag in cstag_insertions.items():
         Path(TEMPDIR, SAMPLE_NAME, "cstag", f"{header}.txt").write_text(cs_tag + "\n")
 
+
 ###########################################################
 # main
 ###########################################################
@@ -554,7 +560,7 @@ def generate_insertion_fasta(TEMPDIR, SAMPLE_NAME, CONTROL_NAME, FASTA_ALLELES)
     MUTATION_LOCI = io.load_pickle(Path(TEMPDIR, SAMPLE_NAME, "mutation_loci", "control", "mutation_loci.pickle"))
 
     insertions = extract_insertions(PATH_SAMPLE, PATH_CONTROL, MUTATION_LOCI)
-    if insertions == dict():
+    if insertions == {}:
         """If there is no insertion, return None"""
         return None
 
@@ -578,7 +584,7 @@ def generate_insertion_fasta(TEMPDIR, SAMPLE_NAME, CONTROL_NAME, FASTA_ALLELES)
         [seq for _, seq in insertion_scores_sequences_filtered], labels_filtered, num=1000
     )
     consensus_of_insertions = call_consensus_of_insertion(TEMPDIR, SAMPLE_NAME, insertion_sequences_subset)
-    if consensus_of_insertions == dict():
+    if consensus_of_insertions == {}:
         """
         If there is no insertion sequence, return None
         It is possible when all insertion sequence annotated as `N` that is filtered out
@@ -592,7 +598,7 @@ def generate_insertion_fasta(TEMPDIR, SAMPLE_NAME, CONTROL_NAME, FASTA_ALLELES)
     fasta_insertions = generate_fasta(cstag_insertions)
 
     fasta_insertions_unique = extract_unique_insertions(fasta_insertions, FASTA_ALLELES)
-    if fasta_insertions_unique == dict():
+    if fasta_insertions_unique == {}:
         remove_temporal_files(TEMPDIR, SAMPLE_NAME)
         return None
diff --git a/src/DAJIN2/core/preprocess/inversions_to_fasta.py b/src/DAJIN2/core/preprocess/inversions_to_fasta.py
index 5dcf03de..1337c77e 100644
--- a/src/DAJIN2/core/preprocess/inversions_to_fasta.py
+++ b/src/DAJIN2/core/preprocess/inversions_to_fasta.py
@@ -1,20 +1,19 @@
 from __future__ import annotations
 
-import uuid
 import random
-from pathlib import Path
+import uuid
+from collections import Counter, defaultdict
 from itertools import groupby
-from collections import defaultdict, Counter
+from pathlib import Path
 from typing import Generator
 
 import numpy as np
 from rapidfuzz import process
 from rapidfuzz.distance import DamerauLevenshtein
-
 from sklearn.cluster import MeanShift
 
 from DAJIN2.core.preprocess.mapping import to_sam
-from DAJIN2.utils import io, config
+from DAJIN2.utils import config, io
 from DAJIN2.utils.cssplits_handler import convert_cssplits_to_cstag
 
 config.set_warnings_ignore()
@@ -68,7 +67,9 @@ def extract_all_inversions(midsv_sample: Generator[list[str]]) -> list[list[int,
     return inversions_index_sequence
 
 
-def cluster_index_of_inversions(inversions_index_sequence: list[list[int, int, str]], coverage: int) -> list[tuple[int]]:
+def cluster_index_of_inversions(
+    inversions_index_sequence: list[list[int, int, str]], coverage: int
+) -> list[tuple[int]]:
     indexes = [(start, end) for start, end, _ in inversions_index_sequence]
 
     min_cluster_size = max(5, int(coverage * 0.5 / 100))
@@ -78,7 +79,7 @@
 def extract_enriched_inversions(
     insertions_sample: dict, insertions_control: dict, coverage_sample: int
 ) -> dict[int, dict[str, int]]:
-    enriched_insertions = dict()
+    enriched_insertions = {}
     threshold_sample = max(5, int(coverage_sample * 0.5 / 100))
     for i in insertions_sample:
         ins_sample: list[str] = insertions_sample[i]
@@ -107,7 +108,7 @@ def extract_enriched_inversions(
             if label in labels_count_sample:
                 del labels_count_sample[label]
 
-        if labels_count_sample == dict():
+        if labels_count_sample == {}:
             continue
 
         # Count the remaining insertion sequences.
@@ -163,7 +164,7 @@ def get_merged_insertion(insertion: dict[str, int], labels: np.ndarray) -> dict[
     insertion_label = [(label, {seq: count}) for label, (seq, count) in zip(labels, insertion.items())]
     insertion_label.sort(key=lambda x: x[0])
 
-    insertion_merged = dict()
+    insertion_merged = {}
     for _, group in groupby(insertion_label, key=lambda x: x[0]):
         group = [g[1] for g in group]
         sequences, counts = set(), 0
@@ -176,7 +177,9 @@ def get_merged_insertion(insertion: dict[str, int], labels: np.ndarray) -> dict[
     return insertion_merged
 
 
-def remove_minor_groups(insertions_merged: dict[tuple[int], dict[tuple[str], int]], coverage: int, threshold: float = 0.5) -> dict[tuple[int], dict[tuple[str], int]]:
+def remove_minor_groups(
+    insertions_merged: dict[tuple[int], dict[tuple[str], int]], coverage: int, threshold: float = 0.5
+) -> dict[tuple[int], dict[tuple[str], int]]:
     for _, ins in insertions_merged.items():
         # Create a list of elements to delete
         to_delete = []
@@ -196,7 +199,7 @@ def merge_similar_insertions(
 ) -> dict[tuple[int], dict[tuple[str], int]]:
     index_grouped = group_index_by_consecutive_insertions(mutation_loci)
     insertions_grouped = group_insertions(insertions, index_grouped)
-    insertions_merged = dict()
+    insertions_merged = {}
     for idx, insertion in insertions_grouped.items():
         if len(insertion) == 1:
             seq, count = next(iter(insertion.items()))
@@ -422,7 +425,7 @@ def generate_consensus_insertions(TEMPDIR: Path, SAMPLE_NAME: str, cssplits: lis
 
 def remove_all_n(cons_sequence: dict[int, str]) -> dict[int, str]:
     """Remove all `N` sequences."""
-    cons_sequence_removed = dict()
+    cons_sequence_removed = {}
     for label, seq in cons_sequence.items():
         if all(True if s == "N" else False for s in seq.split(",")):
             continue
@@ -430,9 +433,11 @@ def remove_all_n(cons_sequence: dict[int, str]) -> dict[int, str]:
     return cons_sequence_removed
 
 
-def call_consensus_of_insertion(TEMPDIR: Path, SAMPLE_NAME: str, insertion_sequences_subset: list[dict]) -> dict[int, str]:
+def call_consensus_of_insertion(
+    TEMPDIR: Path, SAMPLE_NAME: str, insertion_sequences_subset: list[dict]
+) -> dict[int, str]:
     """Generate consensus cssplits."""
-    consensus_insertion_cssplits = dict()
+    consensus_insertion_cssplits = {}
     insertion_sequences_subset.sort(key=lambda x: x["LABEL"])
     for label, group in groupby(insertion_sequences_subset, key=lambda x: x["LABEL"]):
         cssplits = [cs["CSSPLIT"].split(",") for cs in group]
@@ -469,7 +474,7 @@ def extract_index_of_insertions(
 def generate_cstag(
     consensus_of_insertions: dict[str, str], index_of_insertions: list[int], sequence: str
 ) -> dict[str, str]:
-    cstag_insertions = dict()
+    cstag_insertions = {}
     for label, cons_seq in consensus_of_insertions.items():
         cons_seq = cons_seq.split(",")
         list_sequence = list(sequence)
@@ -483,7 +488,7 @@ def generate_cstag(
 
 
 def generate_fasta(cstag_insertions: dict[str, str]) -> dict[str, str]:
-    fasta_insertions = dict()
+    fasta_insertions = {}
     for label, cs_tag in cstag_insertions.items():
         fasta_insertions[label] = cstag.to_sequence(cs_tag)
 
@@ -497,7 +502,9 @@ def extract_unique_insertions(FASTA_ALLELES: dict[str, str], fasta_insertions: d
     """
     to_keep = []
     for query_key, query_seq in fasta_insertions.items():
-        _, distances, _ = zip(*process.extract_iter(query_seq, FASTA_ALLELES.values(), scorer=DamerauLevenshtein.distance))
+        _, distances, _ = zip(
+            *process.extract_iter(query_seq, FASTA_ALLELES.values(), scorer=DamerauLevenshtein.distance)
+        )
     if all(d > 10 for d in distances):
             to_keep.append(query_key)
 
@@ -538,6 +545,7 @@ def save_cstag(TEMPDIR: Path | str, SAMPLE_NAME: str, cstag_insertions: dict[str
     for header, cs_tag in cstag_insertions.items():
         Path(TEMPDIR, SAMPLE_NAME, "cstag", f"{header}.txt").write_text(cs_tag + "\n")
 
+
 ###########################################################
 # main
 ###########################################################
@@ -554,7 +562,7 @@ def generate_insertion_fasta(TEMPDIR, SAMPLE_NAME, CONTROL_NAME, FASTA_ALLELES)
     MUTATION_LOCI = io.load_pickle(Path(TEMPDIR, SAMPLE_NAME, "mutation_loci", "control.pickle"))
 
     insertions = extract_inversions(PATH_SAMPLE, PATH_CONTROL, MUTATION_LOCI)
-    if insertions == dict():
+    if insertions == {}:
         """If there is no insertion, return None"""
         return None
 
@@ -578,7 +586,7 @@ def generate_insertion_fasta(TEMPDIR, SAMPLE_NAME, CONTROL_NAME, FASTA_ALLELES)
         [seq for _, seq in insertion_scores_sequences_filtered], labels_filtered, num=1000
     )
     consensus_of_insertions = call_consensus_of_insertion(TEMPDIR, SAMPLE_NAME, insertion_sequences_subset)
-    if consensus_of_insertions == dict():
+    if consensus_of_insertions == {}:
         """
         If there is no insertion sequence, return None
         It is possible when all insertion sequence annotated as `N` that is filtered out
@@ -592,7 +600,7 @@ def generate_insertion_fasta(TEMPDIR, SAMPLE_NAME, CONTROL_NAME, FASTA_ALLELES)
     fasta_insertions = generate_fasta(cstag_insertions)
 
     fasta_insertions_unique = extract_unique_insertions(FASTA_ALLELES, fasta_insertions)
-    if fasta_insertions_unique == dict():
+    if fasta_insertions_unique == {}:
         remove_temporal_files(TEMPDIR, SAMPLE_NAME)
         return None
diff --git a/src/DAJIN2/core/preprocess/knockin_handler.py b/src/DAJIN2/core/preprocess/knockin_handler.py
index d3b86a36..9e8c61f4 100644
--- a/src/DAJIN2/core/preprocess/knockin_handler.py
+++ b/src/DAJIN2/core/preprocess/knockin_handler.py
@@ -7,7 +7,6 @@
 
 from DAJIN2.core.preprocess import mapping
 
-
 ###########################################################
 # Consider all mutations are possible in the knockin region
 # For large deletion alleles, the deleted sequence becomes the knock-in region, so all mutations within this region are taken into consideration.
diff --git a/src/DAJIN2/core/preprocess/mapping.py b/src/DAJIN2/core/preprocess/mapping.py
index c74b9828..73d69924 100644
--- a/src/DAJIN2/core/preprocess/mapping.py
+++ b/src/DAJIN2/core/preprocess/mapping.py
@@ -1,11 +1,11 @@
 from __future__ import annotations
 
-import cstag
-import mappy
-
 from pathlib import Path
 from typing import Generator
 
+import cstag
+import mappy
+
 from DAJIN2.utils import dna_handler
 
 
@@ -14,7 +14,7 @@ def to_sam(
     path_query_fastx: Path,
     preset: str = "map-ont",
     threads: int = 1,
-    options: dict = {},
+    options: dict = None,
     cslong: bool = True,
 ) -> Generator[str]:
     """Align sequences using mappy and Convert PAF to SAM.
@@ -29,6 +29,9 @@ def to_sam(
 
     Yields:
         str: SAM formatted alignment.
""" + if options is None: + options = {} + path_reference_fasta = str(path_reference_fasta) path_query_fastx = str(path_query_fastx) @@ -90,8 +93,7 @@ def to_sam( SAM.append("\t".join(alignment)) - for record in SAM: - yield record + yield from SAM ######################################################################## @@ -100,8 +102,11 @@ def to_sam( def generate_sam( - ARGS, paths_fasta: list[str], mappy_options: dict = {}, is_control: bool = False, is_insertion: bool = False + ARGS, paths_fasta: list[str], mappy_options: dict = None, is_control: bool = False, is_insertion: bool = False ) -> None: + if mappy_options is None: + mappy_options = {} + if is_control: path_fastq = Path(ARGS.tempdir, ARGS.control_name, "fastq", f"{ARGS.control_name}.fastq.gz") name = ARGS.control_name diff --git a/src/DAJIN2/core/preprocess/midsv_caller.py b/src/DAJIN2/core/preprocess/midsv_caller.py index e0998ea2..c83b2bed 100644 --- a/src/DAJIN2/core/preprocess/midsv_caller.py +++ b/src/DAJIN2/core/preprocess/midsv_caller.py @@ -1,12 +1,12 @@ from __future__ import annotations -import midsv - +from collections import Counter, defaultdict from pathlib import Path from typing import Generator -from collections import Counter, defaultdict -from DAJIN2.utils import io, sam_handler, cssplits_handler +import midsv + +from DAJIN2.utils import cssplits_handler, io, sam_handler def has_inversion_in_splice(CIGAR: str) -> bool: @@ -88,8 +88,7 @@ def extract_best_alignment_length_from_sam( def transform_to_midsv_format(sam: Generator[list[str]]) -> Generator[list[dict]]: - for midsv_sample in midsv.transform(sam, midsv=False, cssplit=True, qscore=False, keep=set(["FLAG"])): - yield midsv_sample + yield from midsv.transform(sam, midsv=False, cssplit=True, qscore=False, keep={"FLAG"}) def replace_internal_n_to_d(midsv_sample: Generator[list[dict]], sequence: str) -> Generator[list[dict]]: @@ -178,7 +177,7 @@ def convert_consecutive_indels_to_match(cssplit: str) -> str: cssplit_reversed[i] = current_cs.split("|")[-1] # Format deletions - for k, insertion in enumerate(insertions, 1): + for k, _ in enumerate(insertions, 1): cssplit_reversed[i + k] = cssplit_reversed[i + k].replace("-", "=") i += len(insertions) + 1 @@ -221,7 +220,7 @@ def generate_midsv(ARGS, is_control: bool = False, is_insertion: bool = False) - """ Set the destination for midsv as `barcode02/midsv/insertion1.json` when the sample is barcode02 and the allele is insertion1. 
""" - path_sam_files = list(Path(ARGS.tempdir, name, "sam", allele).glob(f"*.sam")) + path_sam_files = list(Path(ARGS.tempdir, name, "sam", allele).glob("*.sam")) path_midsv_output = Path(ARGS.tempdir, name, "midsv", allele, f"{name}.jsonl") preset_cigar_by_qname = extract_preset_and_cigar_by_qname(path_sam_files) diff --git a/src/DAJIN2/core/preprocess/mutation_extractor.py b/src/DAJIN2/core/preprocess/mutation_extractor.py index cb5800d1..9ecadc1e 100644 --- a/src/DAJIN2/core/preprocess/mutation_extractor.py +++ b/src/DAJIN2/core/preprocess/mutation_extractor.py @@ -1,10 +1,10 @@ from __future__ import annotations -import re import bisect +import re +from collections import defaultdict from pathlib import Path from typing import Generator -from collections import defaultdict from DAJIN2.utils import config @@ -18,8 +18,8 @@ import numpy as np from sklearn.cluster import MiniBatchKMeans -from DAJIN2.utils import io from DAJIN2.core.preprocess.homopolymer_handler import extract_sequence_errors_in_homopolymer_loci +from DAJIN2.utils import io def count_indels(midsv_sample: Generator[dict], sequence: str) -> dict[str, list[int]]: @@ -41,7 +41,7 @@ def count_indels(midsv_sample: Generator[dict], sequence: str) -> dict[str, list def normalize_indels(count: dict[str, list[int]]) -> dict[str, np.array]: - count_normalized = dict() + count_normalized = {} match_count = np.array(count["="]) for mut, indel_count in count.items(): numerator = np.array(indel_count) @@ -56,7 +56,7 @@ def minimize_mutation_counts( """ In cases where control has a larger value than sample, adjust the value of sample to match that of control. """ - indels_control_minimized = dict() + indels_control_minimized = {} for mut in {"+", "-", "*"}: indels_control_minimized[mut] = np.minimum(indels_control[mut], indels_sample[mut]) return indels_control_minimized @@ -136,7 +136,7 @@ def extract_anomal_loci( is_consensus: bool = False, ) -> dict[str, set[int]]: """Extract outlier loci compareing indel counts between sample and control.""" - anomal_loci = dict() + anomal_loci = {} for mut in {"+", "-", "*"}: values_sample = indels_normalized_sample[mut] values_control = indels_normalized_control[mut] @@ -171,7 +171,7 @@ def count_elements_within_range(arr, lower_bound, upper_bound): def merge_index_of_consecutive_indel(mutation_loci: dict[str, set[int]]) -> dict[str, set[int]]: """Treat as contiguous indels if there are insertions/deletions within five bases of each other""" - mutation_loci_merged = dict() + mutation_loci_merged = {} """Reflect point mutations as they are""" mutation_loci_merged["*"] = mutation_loci["*"] @@ -232,14 +232,14 @@ def summarize_indels(path_midsv: Path, sequence: str) -> tuple: def merge_loci(dissimilar_loci: dict[str, set], anomal_loci: dict[str, set]) -> dict[str, set]: - mutation_loci = dict() + mutation_loci = {} for mut in {"+", "-", "*"}: mutation_loci[mut] = dissimilar_loci[mut] | anomal_loci[mut] return mutation_loci def add_knockin_loci(candidate_loci: dict[str, set], knockin_loci: set): - mutation_loci = dict() + mutation_loci = {} for mut in {"+", "-", "*"}: mutation_loci[mut] = candidate_loci[mut] | knockin_loci return mutation_loci @@ -289,9 +289,11 @@ def extract_mutation_loci( path_indels_normalized_sample: Path, path_indels_normalized_control: Path, path_knockin: Path, - thresholds: dict[str, float] = {"*": 0.5, "-": 0.5, "+": 0.5}, + thresholds: dict[str, float] = None, is_consensus: bool = False, ) -> list[set[str]]: + if thresholds is None: + thresholds = {"*": 0.5, "-": 0.5, "+": 0.5} 
     indels_normalized_sample = io.load_pickle(path_indels_normalized_sample)
     indels_normalized_control = io.load_pickle(path_indels_normalized_control)
diff --git a/src/DAJIN2/core/report/__init__.py b/src/DAJIN2/core/report/__init__.py
index 23934c6e..88a13eb3 100644
--- a/src/DAJIN2/core/report/__init__.py
+++ b/src/DAJIN2/core/report/__init__.py
@@ -1,3 +1 @@
-from DAJIN2.core.report import bam_exporter
-from DAJIN2.core.report import sequence_exporter
-from DAJIN2.core.report import mutation_exporter
+from DAJIN2.core.report import bam_exporter, mutation_exporter, sequence_exporter
diff --git a/src/DAJIN2/core/report/bam_exporter.py b/src/DAJIN2/core/report/bam_exporter.py
index 0d0eae38..78959003 100644
--- a/src/DAJIN2/core/report/bam_exporter.py
+++ b/src/DAJIN2/core/report/bam_exporter.py
@@ -98,7 +98,10 @@ def write_sam_to_bam(sam: list[list[str]], path_sam: str | Path, path_bam: str |
     pysam.index("-@", f"{threads}", str(path_bam))
 
 
-def update_sam(sam: list, GENOME_COODINATES: dict = {}) -> list:
+def update_sam(sam: list, GENOME_COODINATES: dict = None) -> list:
+    if GENOME_COODINATES is None:
+        GENOME_COODINATES = {}
+
     sam_records = sam.copy()
     sam_records = sam_handler.remove_microhomology(sam_records)
     if GENOME_COODINATES["genome"]:
@@ -126,8 +129,8 @@ def export_to_bam(TEMPDIR, NAME, GENOME_COODINATES, THREADS, UUID, RESULT_SAMPLE
     sam_headers = [s for s in sam_updated if s[0].startswith("@")]
     sam_contents = [s for s in sam_updated if not s[0].startswith("@")]
     if is_control:
-        qnames: set[str] = set(list(set(s[0] for s in sam_contents[:10000]))[:100])
-        sam_subset = [s for s in sam_updated if s[0] in qnames]
+        qnames_100reads: set[str] = set(list({s[0] for s in sam_contents[:10000]})[:100])  # subset 100 reads
+        sam_subset = [s for s in sam_updated if s[0] in qnames_100reads]
         path_sam_output = Path(TEMPDIR, "report", "BAM", f"temp_{UUID}_{NAME}_control_cache.sam")
         path_bam_output = Path(TEMPDIR, "cache", ".igvjs", NAME, "control.bam")
         write_sam_to_bam(sam_headers + sam_subset, path_sam_output, path_bam_output, THREADS)
diff --git a/src/DAJIN2/core/report/mutation_exporter.py b/src/DAJIN2/core/report/mutation_exporter.py
index 91cfdc54..e799eb55 100644
--- a/src/DAJIN2/core/report/mutation_exporter.py
+++ b/src/DAJIN2/core/report/mutation_exporter.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
-
-from pathlib import Path
 from itertools import groupby
-from DAJIN2.utils.cssplits_handler import revcomp_cssplits, reallocate_insertion_within_deletion
+from pathlib import Path
 
+from DAJIN2.utils.cssplits_handler import reallocate_insertion_within_deletion, revcomp_cssplits
 
 ###########################################################
 # group by mutation
diff --git a/src/DAJIN2/core/report/sequence_exporter.py b/src/DAJIN2/core/report/sequence_exporter.py
index a0518b5b..5bffbd8b 100644
--- a/src/DAJIN2/core/report/sequence_exporter.py
+++ b/src/DAJIN2/core/report/sequence_exporter.py
@@ -1,11 +1,12 @@
 from __future__ import annotations
 
-import cstag
 import textwrap
 from pathlib import Path
 
-from DAJIN2.utils.cssplits_handler import convert_cssplits_to_cstag, reallocate_insertion_within_deletion
+import cstag
+
 from DAJIN2.core.report.insertion_reflector import reflect_ref_insertion_to_query
+from DAJIN2.utils.cssplits_handler import convert_cssplits_to_cstag, reallocate_insertion_within_deletion
 
 
 def convert_to_fasta(header: str, sequence: str) -> str:
@@ -42,7 +43,7 @@ def export_to_fasta(TEMPDIR: Path, SAMPLE_NAME: str, cons_sequence: dict) -> Non
 
 def parse_fasta(file_path: Path) -> tuple[str, str]:
     """Parses a FASTA file and returns the header and concatenated sequence."""
-    with open(file_path, "r") as f:
+    with open(file_path) as f:
         lines = f.readlines()
 
     header = lines[0].strip().lstrip(">")
diff --git a/src/DAJIN2/gui.py b/src/DAJIN2/gui.py
index d00d0b7b..16dad368 100644
--- a/src/DAJIN2/gui.py
+++ b/src/DAJIN2/gui.py
@@ -4,16 +4,14 @@
 import shutil
 import socket
 import webbrowser
-
-import pandas as pd
-
+from contextlib import closing, redirect_stderr
 from pathlib import Path
 from threading import Timer
-from contextlib import closing, redirect_stderr
 
+import pandas as pd
+from flask import Flask, render_template, request
 from waitress import serve
 from werkzeug.utils import secure_filename
-from flask import Flask, render_template, request
 
 from DAJIN2 import main
 from DAJIN2.utils import config
diff --git a/src/DAJIN2/main.py b/src/DAJIN2/main.py
index 8ae4080e..fc2b0b5d 100644
--- a/src/DAJIN2/main.py
+++ b/src/DAJIN2/main.py
@@ -9,22 +9,22 @@
 
 os.environ["NUMEXPR_NUM_THREADS"] = "1"
 
-import sys
-import shutil
-import logging
 import argparse
-from pathlib import Path
+import logging
+import shutil
+import sys
 from copy import deepcopy
 from itertools import groupby
+from pathlib import Path
 
 from DAJIN2 import gui, view
 from DAJIN2.core import core
-from DAJIN2.utils import io, config, report_generator, input_validator, multiprocess
+from DAJIN2.utils import config, input_validator, io, multiprocess, report_generator
 
 
 def generate_report(name: str, logger: logging.Logger) -> None:
     report_generator.report(name)
-    logger.info(f"\N{party popper} Finished! Open {config.DAJIN_RESULTS_DIR}/{name} to see the report.")
+    logger.info(f"\N{PARTY POPPER} Finished! Open {config.DAJIN_RESULTS_DIR}/{name} to see the report.")
 
 
 ################################################################################
@@ -36,8 +36,8 @@ def execute_single_mode(arguments: dict[str]):
     # Set logging to export log to stderr and file
     path_logfile = config.get_logfile()
     logger = config.set_logging(path_logfile)
-    logger.info(f"\N{runner} Start running DAJIN2 version {config.DAJIN_VERSION}")
-    logger.info(f"\N{Personal Computer} {' '.join(sys.argv)}")
+    logger.info(f"\N{RUNNER} Start running DAJIN2 version {config.DAJIN_VERSION}")
+    logger.info(f"\N{PERSONAL COMPUTER} {' '.join(sys.argv)}")
 
     # Validate input files
     input_validator.validate_files(arguments["sample"], arguments["control"], arguments["allele"])
@@ -71,7 +71,9 @@ def validate_headers_of_batch_file(headers: set[str], filepath: str) -> None:
         raise ValueError(f'{filepath} must contain "sample", "control", "allele" and "name" in the header')
 
     if not headers.issubset(accepted_headers):
-        raise ValueError(f'Accepted header names of {filepath} are "sample", "control", "allele", "name", or "genome".')
+        raise ValueError(
+            f'Accepted header names of {filepath} are "sample", "control", "allele", "name", or "genome".'
+        )
 
 
 def create_argument_dict(args: dict, cache_urls_genome: dict, is_control: bool) -> dict[str, str]:
@@ -103,7 +105,7 @@ def run_DAJIN2(
         contents.append(args)
 
     # Return a list of unique dictionaries
-    contents_unique = [dict(item) for item in set(frozenset(d.items()) for d in contents)]
+    contents_unique = [dict(item) for item in {frozenset(d.items()) for d in contents}]
 
     contents_unique.sort(key=lambda x: x["sample"])
 
@@ -126,7 +128,7 @@ def execute_batch_mode(arguments: dict[str]):
     validate_headers_of_batch_file(headers, path_batchfile)
 
     # Validate contents and fetch genome urls
-    cache_urls_genome = dict()
+    cache_urls_genome = {}
     records.sort(key=lambda x: x["name"])
     for _, groups in groupby(records, key=lambda x: x["name"]):
         for args in groups:
@@ -144,8 +146,8 @@ def execute_batch_mode(arguments: dict[str]):
         config.reset_logging()
         path_logfile = config.get_logfile()
         logger = config.set_logging(path_logfile)
-        logger.info(f"\N{runner} Start running DAJIN2 version {config.DAJIN_VERSION}")
-        logger.info(f"\N{Personal Computer} {' '.join(sys.argv)}")
+        logger.info(f"\N{RUNNER} Start running DAJIN2 version {config.DAJIN_VERSION}")
+        logger.info(f"\N{PERSONAL COMPUTER} {' '.join(sys.argv)}")
 
         # Run DAJIN2
         run_DAJIN2(groups, cache_urls_genome, is_control=True, num_workers=arguments["threads"])
@@ -181,7 +183,7 @@ def execute():
 ###############################################################################
 
 def batchmode(args):
-    arguments = dict()
+    arguments = {}
     arguments["file"] = args.file
     arguments["threads"] = input_validator.update_threads(int(args.threads))
     arguments["debug"] = args.debug
@@ -232,7 +234,7 @@ def viewmode(args):
         raise ValueError("the following arguments are required: -a/--allele")
     if args.name is None:
         raise ValueError("the following arguments are required: -n/--name")
-    arguments = dict()
+    arguments = {}
     arguments["sample"] = args.sample
     arguments["control"] = args.control
     arguments["allele"] = args.allele
diff --git a/src/DAJIN2/utils/config.py b/src/DAJIN2/utils/config.py
index 7a1ae85b..f1469390 100644
--- a/src/DAJIN2/utils/config.py
+++ b/src/DAJIN2/utils/config.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
 
-import sys
+import datetime
 import logging
+import sys
 import warnings
-import datetime
-
 from pathlib import Path
 
 from sklearn.exceptions import ConvergenceWarning
diff --git a/src/DAJIN2/utils/cssplits_handler.py b/src/DAJIN2/utils/cssplits_handler.py
index 2ebd27f2..c7fbdd50 100644
--- a/src/DAJIN2/utils/cssplits_handler.py
+++ b/src/DAJIN2/utils/cssplits_handler.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
+
 import re
+
 import cstag
 import numpy as np
 import ruptures as rpt
@@ -220,10 +222,10 @@ def _extract_break_points_of_large_deletions(
         signal_subset = signal[max(0, approximate_bp - 50) : min(approximate_bp + 50, len(signal))]
         bp = -1
         for i, sig in enumerate(signal_subset):
-            if key == "start" and sig == False:
+            if key == "start" and not sig:
                 bp = i + max(0, approximate_bp - 50) + window_start
                 break
-            if key == "end" and sig == True:
+            if key == "end" and sig:
                 bp = i + max(0, approximate_bp - 50) + window_start - 1
                 break
         if bp == -1:
diff --git a/src/DAJIN2/utils/input_validator.py b/src/DAJIN2/utils/input_validator.py
index cb28e667..5957cf40 100644
--- a/src/DAJIN2/utils/input_validator.py
+++ b/src/DAJIN2/utils/input_validator.py
@@ -1,16 +1,15 @@
 from __future__ import annotations
 
+import hashlib
 import os
 import ssl
-import pysam
-
-import hashlib
+import xml.etree.ElementTree as ET
 from pathlib import Path
 from urllib.error import URLError
 from urllib.request import urlopen
-import xml.etree.ElementTree as ET
 
 import mappy
+import pysam
 
 ########################################################################
 # To accommodate cases where a user might input negative values or
@@ -98,7 +97,7 @@ def validate_files(SAMPLE: str, CONTROL: str, ALLELE: str) -> None:
     for path_directory in [CONTROL, SAMPLE]:
         extentions = {return_file_extension(path_fastx) for path_fastx in Path(path_directory).iterdir()}
         if len(extentions) == 1:
-            extention = next(iter((extentions)))
+            extention = next(iter(extentions))
         else:
             raise ValueError(
                 f"{path_directory} contains multiple extensions. Please check if there are any incorrect files."
diff --git a/src/DAJIN2/utils/io.py b/src/DAJIN2/utils/io.py
index cd52b613..42ce91dd 100644
--- a/src/DAJIN2/utils/io.py
+++ b/src/DAJIN2/utils/io.py
@@ -1,19 +1,16 @@
 from __future__ import annotations
 
-import re
 import csv
+import hashlib
 import json
 import pickle
-import hashlib
-
-import wslPath
-
-from pathlib import Path
-
+import re
 from io import BufferedReader
+from pathlib import Path
 from typing import Generator
 
-from openpyxl import load_workbook, Workbook
+import wslPath
+from openpyxl import Workbook, load_workbook
 
 ###########################################################
 # Input/Output
@@ -37,7 +34,7 @@ def save_pickle(data: object, file_path: Path) -> None:
 
 
 def read_jsonl(file_path: str | Path) -> Generator[dict]:
-    with open(file_path, "r") as f:
+    with open(file_path) as f:
         for line in f:
             yield json.loads(line)
 
@@ -100,7 +97,7 @@ def read_xlsx(file_path: str | Path) -> list[dict[str, str]]:
     wb = load_workbook(filename=file_path)
     ws = wb.active
 
-    headers = [cell for cell in next(ws.iter_rows(min_row=1, max_row=1, values_only=True))]
+    headers = list(next(ws.iter_rows(min_row=1, max_row=1, values_only=True)))
 
     records = []
     for row in ws.iter_rows(min_row=2, values_only=True):
@@ -114,8 +111,7 @@ def read_xlsx(file_path: str | Path) -> list[dict[str, str]]:
 
 def read_csv(file_path: str | Path) -> list[dict[str, str]]:
     """Load data from a CSV file."""
-    with open(file_path, "r") as csvfile:
-
+    with open(file_path) as csvfile:
         header = [field.strip() for field in next(csv.reader(csvfile))]
 
         records = []
@@ -125,7 +121,7 @@ def read_csv(file_path: str | Path) -> list[dict[str, str]]:
             if all(element is None for element in row):  # Skip rows with all None values
                 continue
             row_trimmed = [field.strip() for field in row]
-            row_data = {h: v for h, v in zip(header, row_trimmed)}
+            row_data = dict(zip(header, row_trimmed))
             records.append(row_data)
 
     return records
diff --git a/src/DAJIN2/utils/multiprocess.py b/src/DAJIN2/utils/multiprocess.py
index c8864fa6..0a2aecf7 100644
--- a/src/DAJIN2/utils/multiprocess.py
+++ b/src/DAJIN2/utils/multiprocess.py
@@ -1,15 +1,13 @@
 from __future__ import annotations
 
 import io
-import sys
 import logging
+import sys
 import traceback
-
+from collections.abc import Callable
 from itertools import islice
 from multiprocessing import Process, Queue
-
 from typing import Generator
-from collections.abc import Callable
 
 
 def get_error_message_prefix(arg: dict) -> str:
diff --git a/src/DAJIN2/utils/report_generator.py b/src/DAJIN2/utils/report_generator.py
index fd93dc88..20ea6da2 100644
--- a/src/DAJIN2/utils/report_generator.py
+++ b/src/DAJIN2/utils/report_generator.py
@@ -4,7 +4,6 @@
 from pathlib import Path
 
 import plotly.express as px
-
 from DAJIN2.utils import io
 from DAJIN2.utils.config import DAJIN_RESULTS_DIR, TEMP_ROOT_DIR
diff --git a/src/DAJIN2/utils/sam_handler.py b/src/DAJIN2/utils/sam_handler.py
index 5057d768..53237582 100644
--- a/src/DAJIN2/utils/sam_handler.py
+++ b/src/DAJIN2/utils/sam_handler.py
@@ -1,8 +1,8 @@
 from __future__ import annotations
 
 import re
-
 from itertools import groupby
+
 from DAJIN2.utils.dna_handler import revcomp
diff --git a/src/DAJIN2/view.py b/src/DAJIN2/view.py
index 3ea2ef69..aa8647a8 100644
--- a/src/DAJIN2/view.py
+++ b/src/DAJIN2/view.py
@@ -32,7 +32,7 @@ def execute(name: str):
     env = Environment(loader=FileSystemLoader(path_view, encoding="utf8"))
     template = env.get_template("template_igvjs.html")
     params_genome = {"genome": {"exist": False}}
-    params_reference = dict()
+    params_reference = {}
     path_genome = Path(DIR_IGVJS, "genome_symbol.txt")
     if path_genome.exists():
         path_coordinates = Path(DIR_IGVJS, "genome_coordinates.jsonl")
diff --git a/tests/src/clustering/test_appender.py b/tests/src/clustering/test_appender.py
index 8968ce06..a85bc3e3 100644
--- a/tests/src/clustering/test_appender.py
+++ b/tests/src/clustering/test_appender.py
@@ -1,4 +1,4 @@
-from src.DAJIN2.core.clustering.appender import add_readnum, add_percent
+from src.DAJIN2.core.clustering.appender import add_percent, add_readnum
 
 
 def test_add_readnum():
diff --git a/tests/src/clustering/test_kmer_generator.py b/tests/src/clustering/test_kmer_generator.py
index dce464d1..caff0d71 100644
--- a/tests/src/clustering/test_kmer_generator.py
+++ b/tests/src/clustering/test_kmer_generator.py
@@ -1,7 +1,7 @@
-import pytest
-
 from typing import Generator
 
+import pytest
+
 from src.DAJIN2.core.clustering.kmer_generator import generate_mutation_kmers
diff --git a/tests/src/clustering/test_label_merger.py b/tests/src/clustering/test_label_merger.py
index 98f4c7a8..1b970d3b 100644
--- a/tests/src/clustering/test_label_merger.py
+++ b/tests/src/clustering/test_label_merger.py
@@ -1,6 +1,4 @@
-from DAJIN2.core.clustering.label_merger import merge_mixed_cluster
-from DAJIN2.core.clustering.label_merger import map_clusters_to_previous
-from DAJIN2.core.clustering.label_merger import merge_minor_cluster
+from DAJIN2.core.clustering.label_merger import map_clusters_to_previous, merge_minor_cluster, merge_mixed_cluster
 from DAJIN2.core.clustering.label_updator import relabel_with_consective_order
diff --git a/tests/src/consensus/test_consensus.py b/tests/src/consensus/test_consensus.py
index 4d2b86cc..9f49a8eb 100644
--- a/tests/src/consensus/test_consensus.py
+++ b/tests/src/consensus/test_consensus.py
@@ -1,10 +1,9 @@
 from src.DAJIN2.core.consensus.consensus import (
-    replace_sequence_error,
     adjust_to_100_percent,
     call_percentage,
+    replace_sequence_error,
 )
 
-
 ###########################################################
 # replace_sequence
 ###########################################################
diff --git a/tests/src/consensus/test_name_handler.py b/tests/src/consensus/test_name_handler.py
index 81dd8081..84c0a562 100644
--- a/tests/src/consensus/test_name_handler.py
+++ b/tests/src/consensus/test_name_handler.py
@@ -1,15 +1,14 @@
-import pytest
-
 from collections import defaultdict
 from typing import NamedTuple
 
+import pytest
 from DAJIN2.core.consensus.name_handler import (
     _detect_sv,
-    _format_allele_label,
     _determine_suffix,
+    _format_allele_label,
+    add_key_by_allele_name,
     call_allele_name,
     update_key_by_allele_name,
-    add_key_by_allele_name,
 )
 
 ###########################################################
@@ -103,7 +102,11 @@ def test_call_allele_name(cons_sequences, cons_percentages, FASTA_ALLELES, thres
 @pytest.mark.parametrize(
     "cons, allele_names, expected_output",
expected_output", [ - ({ConsensusKey("control", 1, 100): "value1", ConsensusKey("control", 2, 100): "value2"}, {1: "name1", 2: "name2"}, {"name1": "value1", "name2": "value2"}), + ( + {ConsensusKey("control", 1, 100): "value1", ConsensusKey("control", 2, 100): "value2"}, + {1: "name1", 2: "name2"}, + {"name1": "value1", "name2": "value2"}, + ), ], ) def test_update_key_by_allele_name(cons, allele_names, expected_output): diff --git a/tests/src/preprocess/test_genome_fetcher.py b/tests/src/preprocess/test_genome_fetcher.py index 4312ed80..f0aa41b9 100644 --- a/tests/src/preprocess/test_genome_fetcher.py +++ b/tests/src/preprocess/test_genome_fetcher.py @@ -1,6 +1,7 @@ +from pathlib import Path + import pytest -from pathlib import Path from src.DAJIN2.core.preprocess import genome_fetcher from src.DAJIN2.utils.input_validator import validate_genome_and_fetch_urls diff --git a/tests/src/preprocess/test_mapping.py b/tests/src/preprocess/test_mapping.py index 1a3228b4..c2bb3866 100644 --- a/tests/src/preprocess/test_mapping.py +++ b/tests/src/preprocess/test_mapping.py @@ -1,6 +1,7 @@ from __future__ import annotations from pathlib import Path + from DAJIN2.core.preprocess import mapping diff --git a/tests/src/preprocess/test_midsv_caller.py b/tests/src/preprocess/test_midsv_caller.py index 0b945e59..56d44fbf 100644 --- a/tests/src/preprocess/test_midsv_caller.py +++ b/tests/src/preprocess/test_midsv_caller.py @@ -1,14 +1,15 @@ from __future__ import annotations -import pytest - -from DAJIN2.core.preprocess.midsv_caller import has_inversion_in_splice -from DAJIN2.core.preprocess.midsv_caller import replace_internal_n_to_d -from DAJIN2.core.preprocess.midsv_caller import convert_flag_to_strand -from DAJIN2.core.preprocess.midsv_caller import convert_consecutive_indels_to_match - from pathlib import Path + +import pytest from DAJIN2.core.preprocess.mapping import to_sam +from DAJIN2.core.preprocess.midsv_caller import ( + convert_consecutive_indels_to_match, + convert_flag_to_strand, + has_inversion_in_splice, + replace_internal_n_to_d, +) ########################################################### diff --git a/tests/src/report/test_bam_exporter.py b/tests/src/report/test_bam_exporter.py index fb26938d..fbbbaf47 100644 --- a/tests/src/report/test_bam_exporter.py +++ b/tests/src/report/test_bam_exporter.py @@ -1,8 +1,10 @@ from __future__ import annotations -import pytest from pathlib import Path from unittest.mock import patch + +import pytest + from src.DAJIN2.core.report import bam_exporter ############################################################################### diff --git a/tests/src/utils/test_cssplits_handler.py b/tests/src/utils/test_cssplits_handler.py index c8e9f3e5..cd05d183 100644 --- a/tests/src/utils/test_cssplits_handler.py +++ b/tests/src/utils/test_cssplits_handler.py @@ -3,7 +3,6 @@ import pytest from DAJIN2.utils import cssplits_handler - ########################################################### # find_n_boundaries ########################################################### @@ -124,6 +123,7 @@ def test_get_index_of_large_deletions(cssplits, expected): # reallocate_insertion_within_deletion ########################################################### + @pytest.mark.parametrize( "cs, expected", [ diff --git a/tests/src/utils/test_fastx_handler.py b/tests/src/utils/test_fastx_handler.py index 7975d5c1..a867e170 100644 --- a/tests/src/utils/test_fastx_handler.py +++ b/tests/src/utils/test_fastx_handler.py @@ -1,7 +1,6 @@ from __future__ import annotations import pytest - from 
 from DAJIN2.utils import fastx_handler
 
 ########################################################################
diff --git a/tests/src/utils/test_input_validator.py b/tests/src/utils/test_input_validator.py
index 046a09e4..edf789ae 100644
--- a/tests/src/utils/test_input_validator.py
+++ b/tests/src/utils/test_input_validator.py
@@ -1,9 +1,9 @@
 from __future__ import annotations
 
-import pytest
 from pathlib import Path
 
-from DAJIN2.utils import input_validator
+import pytest
+from DAJIN2.utils import input_validator
 
 ###############################################################################
 # validate File existance and the extentions
@@ -20,9 +20,7 @@ def test_exists():
 def test_return_file_extension():
     with pytest.raises(ValueError) as e:
         test = Path("test.fqq")
-        expected = (
-            f"{test} requires extensions either .fastq, .fastq.gz, .fq, .fq.gz, .fasta, .fasta.gz, .fa, .fa.gz, or .bam"
-        )
+        expected = f"{test} requires extensions either .fastq, .fastq.gz, .fq, .fq.gz, .fasta, .fasta.gz, .fa, .fa.gz, or .bam"
         input_validator.return_file_extension(test)
     assert str(e.value) == expected
 
diff --git a/tests/src/utils/test_io.py b/tests/src/utils/test_io.py
index 425ea312..0b348bf4 100644
--- a/tests/src/utils/test_io.py
+++ b/tests/src/utils/test_io.py
@@ -1,10 +1,11 @@
 from __future__ import annotations
 
-import os
 import json
-import pytest
+import os
 from pathlib import Path
 
+import pytest
+
 from src.DAJIN2.utils import io
 
 
@@ -71,7 +72,7 @@ def test_write_jsonl():
     test_filename = "test_output.json"
     io.write_jsonl(file_path=test_filename, data=data_list)
     # Verify if the file has been written correctly
-    with open(test_filename, "r") as f:
+    with open(test_filename) as f:
         lines = f.readlines()
     assert len(lines) == 2
     loaded_data1 = json.loads(lines[0].strip())
diff --git a/tests/src/utils/test_multiprocess.py b/tests/src/utils/test_multiprocess.py
index 0cccaba8..e36915b4 100644
--- a/tests/src/utils/test_multiprocess.py
+++ b/tests/src/utils/test_multiprocess.py
@@ -1,12 +1,12 @@
 from __future__ import annotations
 
+import tempfile
+from multiprocessing import Queue
+
 import pytest
-import tempfile
 
 from src.DAJIN2.utils import multiprocess
 
-from multiprocessing import Queue
-
 ###########################################################
 # generate_chunks
 ###########################################################
@@ -125,7 +125,7 @@ def test_multiprocessing_execution():
     multiprocess.run(write_value_to_file, values_to_write, num_workers=3)
 
     # Verify if all values are written correctly
-    with open(file_path, "r") as f:
+    with open(file_path) as f:
         written_values = set(map(int, map(str.strip, f.readlines())))
 
     assert written_values == set(range(1, 11))
diff --git a/tests/src/utils/test_sam_handler.py b/tests/src/utils/test_sam_handler.py
index 57f7b208..f24d5673 100644
--- a/tests/src/utils/test_sam_handler.py
+++ b/tests/src/utils/test_sam_handler.py
@@ -1,13 +1,19 @@
 from __future__ import annotations
 
-import midsv
 from pathlib import Path
 
-from DAJIN2.utils.sam_handler import split_cigar
-from DAJIN2.utils.sam_handler import calculate_alignment_length
-from DAJIN2.utils.sam_handler import is_header, is_mapped, is_overlapping, remove_overlapped_reads
-from DAJIN2.utils.sam_handler import remove_microhomology
-from DAJIN2.utils.sam_handler import reverse_flag, revcomp_sam
+import midsv
+from DAJIN2.utils.sam_handler import (
+    calculate_alignment_length,
+    is_header,
+    is_mapped,
+    is_overlapping,
+    remove_microhomology,
+    remove_overlapped_reads,
+    revcomp_sam,
+    reverse_flag,
+    split_cigar,
+)
 
 
 def test_split_cigar():