Skip to content

Commit

Permalink
[MRG] fix pickle protocol to properly adjust ksize in __getstate__ (
Browse files Browse the repository at this point in the history
#2265)

* add tests to reproduce bug #2262

* fix in __getstate__ rather than __setstate__

* add hp and dayhoff tests

* fix comment
  • Loading branch information
ctb authored Sep 6, 2022
1 parent 2e0175a commit beb96a3
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 6 deletions.
11 changes: 5 additions & 6 deletions src/sourmash/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,9 +275,13 @@ def __copy__(self):

def __getstate__(self):
"support pickling via __getstate__/__setstate__"

# note: we multiple ksize by 3 here so that
# pickle protocols that bypass __setstate__ <coff numpy coff>
# get a ksize that makes sense to the Rust layer. See #2262.
return (
self.num,
self.ksize,
self.ksize if self.is_dna else self.ksize*3,
self.is_protein,
self.dayhoff,
self.hp,
Expand Down Expand Up @@ -1107,11 +1111,6 @@ def to_mutable(self):
mut = MinHash.__new__(MinHash)
state_tup = self.__getstate__()

# is protein/hp/dayhoff?
if state_tup[2] or state_tup[3] or state_tup[4]:
state_tup = list(state_tup)
# adjust ksize.
state_tup[1] = state_tup[1] * 3
mut.__setstate__(state_tup)
return mut

Expand Down
63 changes: 63 additions & 0 deletions tests/test_minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -1481,6 +1481,69 @@ def test_scaled_property(track_abundance):
assert a.scaled == scaled


def test_pickle_protein(track_abundance):
# check that protein/etc ksize is handled properly during serialization.
a = MinHash(0, 10, track_abundance=track_abundance, is_protein=True,
scaled=_get_scaled_for_max_hash(20))
for i in range(0, 40, 2):
a.add_hash(i)

b = pickle.loads(pickle.dumps(a))
assert a.ksize == b.ksize
assert b.num == a.num
assert b._max_hash == a._max_hash
assert b._max_hash == 20
assert b.is_protein
assert b.track_abundance == track_abundance
assert b.seed == a.seed
assert len(b.hashes) == len(a.hashes)
assert len(b.hashes) == 11
assert a.scaled == b.scaled
assert b.scaled != 0


def test_pickle_dayhoff(track_abundance):
# check that dayhoff ksize is handled properly during serialization.
a = MinHash(0, 10, track_abundance=track_abundance, dayhoff=True,
scaled=_get_scaled_for_max_hash(20))
for i in range(0, 40, 2):
a.add_hash(i)

b = pickle.loads(pickle.dumps(a))
assert a.ksize == b.ksize
assert b.num == a.num
assert b._max_hash == a._max_hash
assert b._max_hash == 20
assert b.dayhoff
assert b.track_abundance == track_abundance
assert b.seed == a.seed
assert len(b.hashes) == len(a.hashes)
assert len(b.hashes) == 11
assert a.scaled == b.scaled
assert b.scaled != 0


def test_pickle_hp(track_abundance):
# check that hp ksize is handled properly during serialization.
a = MinHash(0, 10, track_abundance=track_abundance, hp=True,
scaled=_get_scaled_for_max_hash(20))
for i in range(0, 40, 2):
a.add_hash(i)

b = pickle.loads(pickle.dumps(a))
assert a.ksize == b.ksize
assert b.num == a.num
assert b._max_hash == a._max_hash
assert b._max_hash == 20
assert b.hp
assert b.track_abundance == track_abundance
assert b.seed == a.seed
assert len(b.hashes) == len(a.hashes)
assert len(b.hashes) == 11
assert a.scaled == b.scaled
assert b.scaled != 0


def test_pickle_max_hash(track_abundance):
a = MinHash(0, 10, track_abundance=track_abundance,
scaled=_get_scaled_for_max_hash(20))
Expand Down
14 changes: 14 additions & 0 deletions tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -6317,6 +6317,20 @@ def test_compare_jaccard_ani(runtmp):
print(c.last_result.out)


def test_compare_jaccard_protein_parallel_ani_bug(runtmp):
# this checks a bug that occurred with serialization of protein minhash
# in parallel situations. See #2262.
c = runtmp

sigfile = utils.get_test_data("prot/protein.zip")

c.run_sourmash('compare', '--ani', '-p', '2', '--csv', 'output.csv',
sigfile)

print(c.last_result.err)
print(c.last_result.out)


def test_compare_containment_ani_asymmetry_distance(runtmp):
# very specifically test asymmetry of ANI in containment matrices ;)
# ...calculated with --distance
Expand Down

0 comments on commit beb96a3

Please sign in to comment.