Skip to content

Commit

Permalink
Add DigitalMSA.identity_filter to filter the alignment sequences by…
Browse files Browse the repository at this point in the history
… identity
  • Loading branch information
althonos committed Feb 27, 2025
1 parent 4a91fab commit cd997eb
Show file tree
Hide file tree
Showing 4 changed files with 175 additions and 2 deletions.
15 changes: 14 additions & 1 deletion src/pyhmmer/easel.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

# --- C imports --------------------------------------------------------------

from libc.stdint cimport int64_t, uint8_t, uint16_t, uint32_t
from libc.stdint cimport int64_t, uint8_t, uint16_t, uint32_t, uint64_t
from posix.types cimport off_t

cimport libeasel.sq
Expand Down Expand Up @@ -170,6 +170,19 @@ cdef class DigitalMSA(MSA):
cpdef DigitalMSA copy(self)
cpdef TextMSA textize(self)

cpdef DigitalMSA identity_filter(
self,
float max_identity=*,
float fragment_threshold=*,
float consensus_fraction=*,
bint ignore_rf=*,
bint sample=*,
int sample_threshold=*,
int sample_count=*,
int max_fragments=*,
uint64_t seed=?,
str preference=*,
)

# --- MSA File ---------------------------------------------------------------

Expand Down
14 changes: 14 additions & 0 deletions src/pyhmmer/easel.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ except ImportError:
from typing_extensions import Literal # type: ignore

BUFFER = typing.Union[bytes, bytearray, memoryview]
IDENTITY_FILTER_PREFERENCE = Literal["conscover", "origorder", "random"]

# --- Alphabet ---------------------------------------------------------------

Expand Down Expand Up @@ -442,6 +443,19 @@ class DigitalMSA(MSA):
def textize(self) -> TextMSA: ...
@property
def sequences(self) -> _DigitalMSASequences: ...
def identity_filter(
self,
max_identity: float = 0.8,
fragment_threshold: float = ...,
consensus_fraction: float = ...,
ignore_rf: bool = ...,
sample: bool = ...,
sample_threshold: int = ...,
sample_count: int = ...,
max_fragments: int = ...,
seed: int = ...,
preference: IDENTITY_FILTER_PREFERENCE = "conscover",
) -> DigitalMSA: ...

# --- MSA File ---------------------------------------------------------------

Expand Down
116 changes: 116 additions & 0 deletions src/pyhmmer/easel.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ cimport libeasel.keyhash
cimport libeasel.matrixops
cimport libeasel.msa
cimport libeasel.msafile
cimport libeasel.msaweight
cimport libeasel.random
cimport libeasel.sq
cimport libeasel.sqio
Expand All @@ -50,6 +51,7 @@ cimport libeasel.vec
from libeasel cimport ESL_DSQ, esl_pos_t, eslERRBUFSIZE
from libeasel.buffer cimport ESL_BUFFER
from libeasel.gencode cimport ESL_GENCODE
from libeasel.msaweight cimport ESL_MSAWEIGHT_CFG
from libeasel.sq cimport ESL_SQ
from libeasel.sqio cimport ESL_SQFILE, ESL_SQASCII_DATA
from libeasel.random cimport ESL_RANDOMNESS
Expand Down Expand Up @@ -149,6 +151,12 @@ cdef dict SEQUENCE_FILE_FORMATS_INDEX = {
v:k for k,v in SEQUENCE_FILE_FORMATS.items()
}

cdef dict MSA_WEIGHT_PREFERENCES = {
"conscover": libeasel.msaweight.eslMSAWEIGHT_FILT_CONSCOVER,
"origorder": libeasel.msaweight.eslMSAWEIGHT_FILT_ORIGORDER,
"random": libeasel.msaweight.eslMSAWEIGHT_FILT_RANDOM,
}

# --- Alphabet ---------------------------------------------------------------

cdef class Alphabet:
Expand Down Expand Up @@ -3988,6 +3996,114 @@ cdef class DigitalMSA(MSA):
else:
raise UnexpectedError(status, "esl_msa_Textize")

cpdef DigitalMSA identity_filter(
self,
float max_identity=0.8,
float fragment_threshold=libeasel.msaweight.eslMSAWEIGHT_FRAGTHRESH,
float consensus_fraction=libeasel.msaweight.eslMSAWEIGHT_SYMFRAC,
bint ignore_rf=libeasel.msaweight.eslMSAWEIGHT_IGNORE_RF,
bint sample=libeasel.msaweight.eslMSAWEIGHT_ALLOW_SAMP,
int sample_threshold=libeasel.msaweight.eslMSAWEIGHT_SAMPTHRESH,
int sample_count=libeasel.msaweight.eslMSAWEIGHT_NSAMP,
int max_fragments=libeasel.msaweight.eslMSAWEIGHT_MAXFRAG,
uint64_t seed=libeasel.msaweight.eslMSAWEIGHT_RNGSEED,
str preference="conscover",
):
r"""Filter the alignment sequences by percent identity.
Arguments:
max_identity (`float`): The maximum fractional identity
allowed between two alignment sequences. Sequences with
fractional identity :math:`\geq` this number will be
removed, with the remaining one selected according to
the given ``preference``.
Keyword Arguments:
fragment_threshold (`float`): The threshold for determining
which sequences of the alignment are fragments. An
sequence spanning columns :math:`i` to :math:`j` of an
alignment of width :math:`W` will be flagged as a fragment
if :math:`\frac{j - i}{ W } < \text{fragment_threshold}`,
consensus_fraction (`float`): The parameter for determining
with columns of the alignment are consensus columns.
A column containing :math:`n` symbols and :math:`m` gaps
will be marked a consensus column if
:math:`\frac{n}{n + m} \ge \text{consensus_fraction}`.
ignore_rf (`bool`): Set to `True` to ignore the *RF* line
of the alignment (if present) and to force building the
consensus.
sample (`bool`): Whether or not to enable consensus determination
by subsampling for large alignments. Set to `False` to force
using all sequences.
sample_threshold (`int`): The minimum number of sequences the
alignment must contain to use subsampling for consensus
determination (when ``sample`` is `True`).
sample_count (`int`): The number of sequences to use when
determining consensus by random subsampling.
max_fragments (`int`): The maximum number of allowed fragments
in the sample used for determining consensus. If the sample
contains more than ``max_fragments`` fragments, the
consensus determination is done with all sequences instead.
seed (`int`): The seed to use for initializing the random
number generator (used when ``preference`` is ``random``
or when ``sample`` is `True`). If ``0`` or `None` is given,
an arbitrary seed will be chosen using the system clock.
preference (`str`): The strategy to use for selecting the
representative sequence in case of duplicates. Supported
strategies are ``conscover`` (the default), which prefers
the sequence with an alignment span that covers more
consensus columns; ``origorder`` to use the first sequence
in the original alignment order; and ``random`` to select
the sequence at random.
Returns:
`~pyhmmer.easel.MSA`: The multiple sequence alignments with
duplicate sequence removed. Unparsed Sotckholm markup is not
propagated.
"""
assert self._msa != NULL

cdef int status
cdef int filterpref
cdef ESL_MSAWEIGHT_CFG cfg
cdef DigitalMSA msa = DigitalMSA.__new__(DigitalMSA, self.alphabet)

# validate arguments
if fragment_threshold < 0 or fragment_threshold > 1:
raise InvalidParameter("fragment_threshold", fragment_threshold, hint="real number between 0 and 1")
if consensus_fraction < 0 or consensus_fraction > 1:
raise InvalidParameter("consensus_fraction", consensus_fraction, hint="real number between 0 and 1")
if sample_threshold < 0:
raise InvalidParameter("sample_threshold", sample_threshold, hint="positive integer")
if sample_count < 0:
raise InvalidParameter("sample_count", sample_count, hint="positive integer")
if max_fragments < 0:
raise InvalidParameter("max_fragments", max_fragments, hint="positive integer")
if preference not in MSA_WEIGHT_PREFERENCES:
raise InvalidParameter("preference", preference, choices=list(MSA_WEIGHT_PREFERENCES))

# prepare configuration
cfg.fragthresh = fragment_threshold
cfg.symfrac = consensus_fraction
cfg.sampthresh = sample_threshold
cfg.ignore_rf = ignore_rf
cfg.allow_samp = sample
cfg.sampthresh = sample_threshold
cfg.nsamp = sample_count
cfg.maxfrag = max_fragments
cfg.seed = seed
cfg.filterpref = MSA_WEIGHT_PREFERENCES[preference]

# run filtering
with nogil:
status = libeasel.msaweight.esl_msaweight_IDFilter_adv(&cfg, self._msa, max_identity, &msa._msa)
if status != libeasel.eslOK:
raise UnexpectedError(status, "esl_msaweight_IDFilter_adv")

return msa



# --- MSA File ---------------------------------------------------------------

Expand Down
32 changes: 31 additions & 1 deletion src/pyhmmer/tests/test_easel/test_msa.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ def test_digitize_invalid(self):
class TestDigitalMSA(TestMSA, unittest.TestCase):

alphabet = easel.Alphabet.dna()
MSA = functools.partial(easel.DigitalMSA, alphabet)
MSA = staticmethod(functools.partial(easel.DigitalMSA, alphabet))

@staticmethod
def read_msa(sto):
Expand Down Expand Up @@ -284,3 +284,33 @@ def test_textize(self):
self.assertTrue(msa_t)
self.assertEqual(msa_t.sequences[0].name, b"seq1")
self.assertEqual(msa_t.sequences[0].sequence, "ACGT")


def test_identity_filter(self):
# adapted from `utest_idfilter` in `esl_msaweight.c`
dna = easel.Alphabet.dna()
s1 = easel.DigitalSequence

buffer = io.BytesIO(
b"# STOCKHOLM 1.0\n\n"
b"seq1 ..AAAAAAAA\n"
b"seq2 AAAAAAAAAA\n"
b"seq3 CCCCCCCCCC\n"
b"seq4 GGGGGGGGGG\n"
b"//\n"
)

with easel.MSAFile(buffer, format="stockholm", digital=True, alphabet=dna) as msa_file:
msa = msa_file.read()

msa2 = msa.identity_filter(1.0)
self.assertEqual(len(msa2.sequences), 3)
self.assertEqual(msa2.names[0], b"seq2")

msa3 = msa.identity_filter(1.0, preference="origorder")
self.assertEqual(len(msa3.sequences), 3)
self.assertEqual(msa3.names[0], b"seq1")

msa3 = msa.identity_filter(1.0, preference="random")
self.assertEqual(len(msa3.sequences), 3)
self.assertIn(msa3.names[0], [b"seq1", b"seq2"])

0 comments on commit cd997eb

Please sign in to comment.