Skip to content

Commit

Permalink
Merge pull request #196 from GavinHuttley/develop
Browse files Browse the repository at this point in the history
BUG: fix CDS span calculations
  • Loading branch information
GavinHuttley authored Feb 27, 2025
2 parents 37f1d67 + d473c97 commit fc9b07c
Show file tree
Hide file tree
Showing 10 changed files with 136 additions and 46 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/linters.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
run: |
python -m pip install --upgrade pip
python -m pip install tomli
export ruff_version=$(python -c 'import tomli; print([line for line in tomli.load(open("pyproject.toml","rb"))["project"]["optional-dependencies"]["dev"] if "ruff" in line][0])')
export ruff_version=$(python -c 'import tomli; print([line for line in tomli.load(open("pyproject.toml","rb"))["project"]["optional-dependencies"]["test"] if "ruff" in line][0])')
echo "Click version: ruff_version"
python -m pip install $ruff_version
- name: Format code using ruff
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/testing_develop.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
python-version: ["3.10", "3.11", "3.12"]
python-version: ["3.10", "3.11", "3.12", "3.13"]

steps:
- uses: "actions/checkout@v4"
Expand All @@ -28,9 +28,9 @@ jobs:
uses: actions/cache@v3
with:
path: tests/data
key: ${{ runner.os }}-data-v1-${{ hashFiles('tests/data/small-113.zip') }}
key: ${{ runner.os }}-data-v2-${{ hashFiles('tests/data/small-113.zip') }}
restore-keys: |
${{ runner.os }}-data-v1-
${{ runner.os }}-data-v2-
- name: Check cache status
run: |
Expand All @@ -39,7 +39,7 @@ jobs:
- name: Download data file
if: steps.cache-data.outputs.cache-hit != 'true'
run: |
curl -o tests/data/small-113.zip https://zenodo.org/records/14625203/files/small-113.zip
curl -L -o tests/data/small-113.zip "https://www.dropbox.com/scl/fi/pfmwzz96gusdeqi0a9wax/small-113.zip?rlkey=r60l1eq9jk6p440tkqslmqihi&st=ud49fits&dl=1"
- name: Unzip data file
run: |
Expand Down
2 changes: 1 addition & 1 deletion noxfile.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import nox

_py_versions = range(10, 13)
_py_versions = range(10, 14)


@nox.session(python=[f"3.{v}" for v in _py_versions])
Expand Down
48 changes: 21 additions & 27 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ maintainers = [
keywords = ["biology", "genomics", "evolution", "bioinformatics"]
readme = "README.md"
license = { file = "LICENSE" }
requires-python = ">=3.10,<3.13"
requires-python = ">=3.10,<3.14"
dependencies = ["blosc2",
"click",
"cogent3>=2024.12.19a2",
Expand All @@ -44,6 +44,7 @@ classifiers = [
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
]
dynamic = ["version", "description"]

Expand All @@ -63,45 +64,38 @@ test = [
"pytest-cov",
"pytest-timeout",
"pytest-xdist",
"ruff==0.9.1",
"ruff==0.9.7",
"nox"]
doc = ["click",
"sphinx",
"sphinx-autobuild",
"sphinx>=1.6",
"sphinx_book_theme",
"sphinx_design",
"sphinxcontrib-bibtex"]
"sphinxcontrib-bibtex",
"ipykernel",
"ipython",
"ipywidgets",
"jupyter-sphinx",
"jupyter_client",
"jupyterlab",
"jupytext",
"kaleido",
"nbconvert>5.4",
"nbformat",
"nbsphinx",
"pillow",
"plotly",
]
dev = ["click",
"cogapp",
"flit",
"ipykernel",
"ipython",
"ipywidgets",
"jupyter-sphinx",
"jupyter_client",
"jupyterlab",
"jupytext",
"kaleido",
"nbconvert>5.4",
"nbformat",
"nbsphinx",
"nox",
"numpydoc",
"pandas",
"pillow",
"plotly",
"psutil",
"pytest",
"pytest-cov",
"pytest-xdist",
"ruff==0.9.1",
"scriv",
"sphinx",
"sphinx-autobuild",
"sphinx_book_theme",
"sphinx_design",
"sphinxcontrib-bibtex"]
"ensembl_tui[doc]",
"ensembl_tui[test]",
]

[tool.flit.sdist]
include = ["src/*", "tests/*", "pyproject.toml"]
Expand Down
2 changes: 1 addition & 1 deletion src/ensembl_tui/_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def get_features_matching(
symbol: OptStr = None,
description: OptStr = None,
**kwargs, # noqa: ANN003
) -> typing.Iterator[FeatureDataBase]:
) -> typing.Iterator[GeneData]:
# add support for querying by symbol and description
stable_id = stable_id or kwargs.pop("name", None)
limit = kwargs.pop("limit", None)
Expand Down
33 changes: 29 additions & 4 deletions src/ensembl_tui/_mysql_core_attr.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,10 @@ class LimitExons:
strand: int
transcript_id: int

@property
def single_exon(self) -> bool:
    """Whether the start and stop ranks are the same.

    Returns True when ``start_rank`` equals ``stop_rank``, which
    presumably means the translated region begins and ends within the
    same exon.
    """
    return self.start_rank == self.stop_rank


def get_all_limit_exons(
conn: duckdb.DuckDBPyConnection,
Expand Down Expand Up @@ -182,8 +186,7 @@ def get_limit_exons(records: list[tuple[int, ...]]) -> LimitExons:
return LimitExons(
start_rank=start_rank,
stop_rank=end_rank,
# subtract 1 to convert to 0-based
rel_start=rel_start - 1,
rel_start=rel_start,
rel_stop=rel_end,
strand=strand,
transcript_id=transcript_id,
Expand Down Expand Up @@ -232,6 +235,13 @@ def to_record(self, columns: tuple[str]) -> tuple:
return tuple(mapping[c] for c in columns)


def _adjust_single_exon(lex: LimitExons, cds_span: tuple[int, int]) -> tuple[int, int]:
    """Return the CDS coordinates when translation starts and stops in one exon.

    Parameters
    ----------
    lex
        limit exon record, only ``strand``, ``rel_start`` and ``rel_stop``
        are read
    cds_span
        the two coordinates of the exon. Element 0 anchors the offsets for
        the plus strand, element 1 anchors them for any other strand value.
        Presumably this is (start, stop) with start <= stop -- confirm
        against the caller.

    Returns
    -------
    The adjusted pair of coordinates, smaller value first provided
    ``rel_start <= rel_stop``.
    """
    if lex.strand == 1:
        # offsets are measured forward from the first coordinate
        anchor = cds_span[0]
        return anchor + lex.rel_start, anchor + lex.rel_stop

    # every other strand value is handled as the reverse strand: offsets are
    # measured backward from the second coordinate, so the relative stop
    # yields the smaller coordinate
    anchor = cds_span[1]
    return anchor - lex.rel_stop, anchor - lex.rel_start


def get_transcript_attr_records(
conn: duckdb.DuckDBPyConnection,
) -> typing.Iterator[TranscriptAttrRecord]:
Expand Down Expand Up @@ -269,6 +279,8 @@ def get_transcript_attr_records(
for i, rank in enumerate(ranks):
transcript_spans[rank - 1] = (starts[i], stops[i])

cds_spans = transcript_spans.copy()
transcript_spans = transcript_spans[numpy.lexsort(transcript_spans.T), :]
if transcript_id not in limit_exons:
# no translated exons
yield TranscriptAttrRecord(
Expand All @@ -291,7 +303,21 @@ def get_transcript_attr_records(
# 5' end of an exon
# so the start_exon coords become (exon_start + rel_start, exon_end)
# the end_exon coords become (exon_start, exon_start + rel_stop)
cds_spans = transcript_spans.copy()
if lex.single_exon:
cds_spans[0, :] = _adjust_single_exon(lex, cds_spans[0])

yield TranscriptAttrRecord(
seqid=seqid,
transcript_id=transcript_id,
gene_id=gene_id,
strand=strand,
transcript_spans=transcript_spans,
cds_spans=cds_spans,
transcript_stable_id=transcript_stable_id,
cds_stable_id=cds_stable_id,
)
continue

start_exon_coords = cds_spans[start_index]
stop_exon_coords = cds_spans[stop_index]
if lex.strand == 1:
Expand Down Expand Up @@ -321,7 +347,6 @@ def get_transcript_attr_records(
# sort all spans in ascending numerical order
# note that the lexsort returns the sorted indices
cds_spans = cds_spans[numpy.lexsort(cds_spans.T), :]
transcript_spans = transcript_spans[numpy.lexsort(transcript_spans.T), :]

yield TranscriptAttrRecord(
seqid=seqid,
Expand Down
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def namer():
return name_as_seqid


TEST_DATA_URL = "https://zenodo.org/records/14625203/files/small-113.zip"
TEST_DATA_URL = "https://www.dropbox.com/scl/fi/pfmwzz96gusdeqi0a9wax/small-113.zip?rlkey=r60l1eq9jk6p440tkqslmqihi&st=ud49fits&dl=1"
SMALL_DATA_DIRNAME = "small-113"


Expand Down
64 changes: 64 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,70 @@ def test_installed(installed):
assert len(list(path.glob("*attr.parquet"))) == 2


@pytest.mark.slow
def test_check_one_cds_seq(installed):
    """The CDS sequence of a single exon gene matches the expected DNA."""
    # checking a single exon sequence with a rel_start > 0
    from ensembl_tui import _genome as eti_genome

    config = eti_config.read_installed_cfg(installed)
    genome = eti_genome.load_genome(
        config=config,
        species="saccharomyces_cerevisiae",
    )
    cds = next(iter(genome.get_cds(stable_id="YMR242C")))
    # slice once and assert on that result, rather than slicing a second time
    seq = cds.get_slice()
    expect = (
        "GCTCACTTTAAAGAATACCAAGTTATTGGCCGTCGTTTGCCAACTGAATCTGTTCCAGAA"
        "CCAAAGTTGTTCAGAATGAGAATCTTTGCTTCAAATGAAGTTATTGCCAAGTCTCGTTAC"
        "TGGTATTTCTTGCAAAAGTTGCACAAGGTTAAGAAGGCTTCTGGTGAAATTGTTTCCATC"
        "AACCAAATCAACGAAGCTCATCCAACCAAGGTCAAGAACTTCGGTGTCTGGGTTAGATAC"
        "GACTCCAGATCTGGTACTCACAATATGTACAAGGAAATCAGAGACGTCTCCAGAGTTGCT"
        "GCCGTCGAAACCTTATACCAAGACATGGCTGCCAGACACAGAGCTAGATTTAGATCTATT"
        "CACATCTTGAAGGTTGCTGAAATTGAAAAGACTGCTGACGTCAAGAGACAATACGTTAAG"
        "CAATTTTTGACCAAGGACTTGAAATTCCCATTGCCTCACAGAGTCCAAAAATCCACCAAG"
        "ACTTTCTCCTACAAGAGACCTTCCACTTTCTACTGA"
    )
    assert str(seq) == expect


@pytest.mark.slow
def test_check_multi_exon_cds_seq_plus_strand(installed):
    # a CDS spanning several exons where rel_start > 0 and rel_end differs
    # from the exon length
    from ensembl_tui import _genome as eti_genome

    worm_genome = eti_genome.load_genome(
        config=eti_config.read_installed_cfg(installed),
        species="caenorhabditis_elegans",
    )
    matches = worm_genome.get_cds(stable_id="WBGene00185002")
    first_cds = next(iter(matches))
    dna = first_cds.get_slice()
    protein = str(dna.get_translation())
    # the expected protein values were obtained from ensembl
    expected_prefix = "MEMEDIDDDITVFYTDDRGTVQGPYGASTVLDWYQKGYFSDNHQMRFTDNGQRIGNLFTY"
    expected_suffix = "IEKVKTNCRDAPSPLPPAMDPVAPYHVRDKCTQS"
    assert protein.startswith(expected_prefix)
    assert protein.endswith(expected_suffix)
    assert len(protein) == 274


@pytest.mark.slow
def test_check_two_exon_cds_seq_rev_strand(installed):
    # a CDS spanning two exons where rel_start > 0 and rel_end differs from
    # the exon length
    from ensembl_tui import _genome as eti_genome

    worm_genome = eti_genome.load_genome(
        config=eti_config.read_installed_cfg(installed),
        species="caenorhabditis_elegans",
    )
    matches = worm_genome.get_cds(stable_id="WBGene00184990")
    first_cds = next(iter(matches))
    dna = first_cds.get_slice()
    protein = str(dna.get_translation())
    # the expected protein values were obtained from ensembl
    expected_prefix = "MSGVYNNSGSRMRSKNFEKHQVPSDMAFFQKFRKQSHSNETVDCKKKQEE"
    expected_suffix = "DGHYSDETVEEKHNREHRNKTKADNRTRRIAEIRRKHNINA"
    assert protein.startswith(expected_prefix)
    assert protein.endswith(expected_suffix)
    assert len(protein) == 161


@pytest.mark.slow
def test_species_summary(installed):
r = RUNNER.invoke(
Expand Down
7 changes: 7 additions & 0 deletions tests/test_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,3 +198,10 @@ def test_get_ids_for_biotype_seqid(yeast):
r.seqid for stable_id in stable_ids for r in yeast.get_features(name=stable_id)
}
assert got == seqids


def test_get_celegans_cds(worm):
    # the translation of the first matching CDS must equal the expected protein
    expected = "MIIPIRCFTCGKVIGDKWETYLGFLQSEYSEGDALDALGLRRYCCRRMLLAHVDLIEKLLNYHPLEK"
    matches = worm.get_cds(stable_id="WBGene00021347")
    first_cds = next(iter(matches))
    protein = first_cds.get_slice().get_translation()
    assert protein == expected
14 changes: 7 additions & 7 deletions tests/test_install.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def test_get_limiting_exons_one_exon(one_exon):
all_exons = eti_tables.get_all_limit_exons(one_exon)
lex = eti_tables.get_limit_exons(all_exons[1664])
assert lex.start_rank == lex.stop_rank == 1
assert lex.rel_start == 0
assert lex.rel_start == 1
assert lex.rel_stop == 1


Expand All @@ -249,7 +249,7 @@ def test_get_limiting_exons_two_exons(two_exon):
lex = eti_tables.get_limit_exons(all_exons[269944])
assert lex.start_rank == 4
assert lex.stop_rank == 12
assert lex.rel_start == 28
assert lex.rel_start == 29
assert lex.rel_stop == 54


Expand Down Expand Up @@ -343,7 +343,7 @@ def same_tr_cds(four_exons):
{
"transcript_id": 11,
"start_exon_id": 1,
"seq_start": 1,
"seq_start": 0,
"end_exon_id": 4,
"seq_end": 100,
"stable_id": "a1",
Expand Down Expand Up @@ -371,7 +371,7 @@ def diff_tr_cds(four_exons):
{
"transcript_id": 11,
"start_exon_id": 2,
"seq_start": 1,
"seq_start": 0,
"end_exon_id": 3,
"seq_end": 100,
},
Expand Down Expand Up @@ -399,7 +399,7 @@ def tr_cds_rel_pos(four_exons):
{
"transcript_id": 11,
"start_exon_id": 1,
"seq_start": 2,
"seq_start": 1,
"end_exon_id": 4,
"seq_end": 2,
},
Expand Down Expand Up @@ -486,7 +486,7 @@ def test_rel_start_ends_2(tr_cds_rel_pos_minus):
assert tr.start == 100
assert tr.stop == 400
assert numpy.array_equal(tr.transcript_spans, [(100, 200), (300, 400)])
assert numpy.array_equal(tr.cds_spans, [(200 - 10, 200), (300, 400 - 4)])
assert numpy.array_equal(tr.cds_spans, [(200 - 10, 200), (300, 400 - 5)])


def test_no_cds_spans(four_exons):
Expand Down Expand Up @@ -609,7 +609,7 @@ def mixed_data():
{
"transcript_id": 12,
"start_exon_id": 7,
"seq_start": 1,
"seq_start": 0,
"end_exon_id": 5,
"seq_end": 100,
"stable_id": "pr-01",
Expand Down

0 comments on commit fc9b07c

Please sign in to comment.