Skip to content

Commit

Permalink
[MRG] add --scaled to sourmash compare (#2414)
Browse files Browse the repository at this point in the history
Fixes #2398

Co-authored-by: Tessa Pierce Ward <bluegenes@users.noreply.github.com>
  • Loading branch information
ctb and bluegenes authored Jan 13, 2023
1 parent 19c2be5 commit ca57995
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 6 deletions.
4 changes: 3 additions & 1 deletion src/sourmash/cli/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
"""

from sourmash.cli.utils import (add_ksize_arg, add_moltype_args,
add_picklist_args, add_pattern_args)
add_picklist_args, add_pattern_args,
add_scaled_arg)


def subparser(subparsers):
Expand Down Expand Up @@ -95,6 +96,7 @@ def subparser(subparsers):
add_moltype_args(subparser)
add_picklist_args(subparser)
add_pattern_args(subparser)
add_scaled_arg(subparser)


def main(args):
Expand Down
2 changes: 1 addition & 1 deletion src/sourmash/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def command_list(dirpath):
def add_scaled_arg(parser, default=None):
parser.add_argument(
'--scaled', metavar='FLOAT', type=check_scaled_bounds,
help='scaled value should be between 100 and 1e6'
help='downsample to this scaled; value should be between 100 and 1e6'
)


Expand Down
17 changes: 14 additions & 3 deletions src/sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def compare(args):

# complain if it's not all one or the other
if is_scaled != is_scaled_2:
error('cannot mix scaled signatures with bounded signatures')
error('ERROR: cannot mix scaled signatures with num signatures')
sys.exit(-1)

is_containment = False
Expand Down Expand Up @@ -134,24 +134,35 @@ def compare(args):
if track_abundances:
notify('NOTE: --containment, --max-containment, --avg-containment, and --estimate-ani ignore signature abundances.')

# if using --scaled, downsample appropriately
# if using scaled sketches or --scaled, downsample to common max scaled.
printed_scaled_msg = False
if is_scaled:
max_scaled = max(s.minhash.scaled for s in siglist)
if args.scaled:
args.scaled = int(args.scaled)

max_scaled = max(max_scaled, args.scaled)
if max_scaled > args.scaled:
notify(f"WARNING: --scaled specified {args.scaled}, but max scaled of sketches is {max_scaled}")
notify(f"WARNING: continuing with scaled value of {max_scaled}.")

new_siglist = []
for s in siglist:
if not size_may_be_inaccurate and not s.minhash.size_is_accurate():
size_may_be_inaccurate = True
if s.minhash.scaled != max_scaled:
if not printed_scaled_msg:
notify(f'downsampling to scaled value of {format(max_scaled)}')
notify(f'NOTE: downsampling to scaled value of {format(max_scaled)}')
printed_scaled_msg = True
with s.update() as s:
s.minhash = s.minhash.downsample(scaled=max_scaled)
new_siglist.append(s)
else:
new_siglist.append(s)
siglist = new_siglist
elif args.scaled is not None:
error("ERROR: cannot specify --scaled with non-scaled signatures.")
sys.exit(-1)

if len(siglist) == 0:
error('no signatures!')
Expand Down
79 changes: 78 additions & 1 deletion tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,7 @@ def test_compare_output_csv_gz(runtmp):


def test_compare_downsample(runtmp):
# test 'compare' with --downsample
# test 'compare' with implicit downsampling
c = runtmp
testdata1 = utils.get_test_data('short.fa')
c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=200', testdata1)
Expand All @@ -433,6 +433,83 @@ def test_compare_downsample(runtmp):
assert lines[2].startswith('0.6666')


def test_compare_downsample_scaled(runtmp):
    # 'compare' with an explicit --scaled (300) that is larger than the
    # scaled value of either input sketch (200 and 100).
    sketch_inputs = (
        ('short.fa', 'k=31,scaled=200'),
        ('short2.fa', 'k=31,scaled=100'),
    )
    for fasta_name, param_str in sketch_inputs:
        fasta_path = utils.get_test_data(fasta_name)
        runtmp.run_sourmash('sketch', 'dna', '-p', param_str, fasta_path)

    runtmp.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig',
                        '--csv', 'xxx', '--scaled', '300')

    result = runtmp.last_result
    print(result.status, result.out, result.err)

    # the downsampling notice must mention the requested scaled value
    assert 'downsampling to scaled value of 300' in result.err

    with open(runtmp.output('xxx')) as csv_fp:
        rows = list(csv_fp)

    # three lines of CSV output (presumably a header plus one row per sketch)
    assert len(rows) == 3
    assert rows[1].startswith('1.0,0.0')
    assert rows[2].startswith('0.0')


def test_compare_downsample_scaled_too_low(runtmp):
    # 'compare' with an explicit --scaled (100) that is lower than the
    # largest scaled among the input sketches (200): a warning is expected
    # and the comparison proceeds at scaled=200.
    sketch_inputs = (
        ('short.fa', 'k=31,scaled=200'),
        ('short2.fa', 'k=31,scaled=100'),
    )
    for fasta_name, param_str in sketch_inputs:
        fasta_path = utils.get_test_data(fasta_name)
        runtmp.run_sourmash('sketch', 'dna', '-p', param_str, fasta_path)

    runtmp.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig',
                        '--csv', 'xxx', '--scaled', '100')

    result = runtmp.last_result
    print(result.status, result.out, result.err)

    # downsampling goes to the sketches' max scaled, not the requested one
    assert 'downsampling to scaled value of 200' in result.err
    assert "WARNING: --scaled specified 100, but max scaled of sketches is 200" in result.err

    with open(runtmp.output('xxx')) as csv_fp:
        rows = list(csv_fp)

    # three lines of CSV output (presumably a header plus one row per sketch)
    assert len(rows) == 3
    assert rows[1].startswith('1.0,0.6666')
    assert rows[2].startswith('0.6666')


def test_compare_downsample_scaled_fail_num(runtmp):
    # test 'compare' with explicit --scaled downsampling when one input is a
    # num sketch and the other is a scaled sketch; the command must fail
    # with the "cannot mix" error.
    c = runtmp
    testdata1 = utils.get_test_data('short.fa')
    c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=20', testdata1)

    testdata2 = utils.get_test_data('short2.fa')
    c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=100', testdata2)

    # the exception object itself is never inspected (the assertion below
    # reads the captured stderr instead), so it is not bound to a name.
    with pytest.raises(SourmashCommandFailed):
        c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig',
                       '--csv', 'xxx', '--scaled', '300')

    print(c.last_result.status, c.last_result.out, c.last_result.err)
    assert "cannot mix scaled signatures with num signatures" in c.last_result.err


def test_compare_downsample_scaled_fail_all_num(runtmp):
    # test 'compare' with explicit --scaled downsampling when every input is
    # a num sketch; the command must fail because --scaled cannot be applied
    # to non-scaled signatures.
    c = runtmp
    testdata1 = utils.get_test_data('short.fa')
    c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=20', testdata1)

    testdata2 = utils.get_test_data('short2.fa')
    c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=30', testdata2)

    # the exception object itself is never inspected (the assertion below
    # reads the captured stderr instead), so it is not bound to a name.
    with pytest.raises(SourmashCommandFailed):
        c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig',
                       '--csv', 'xxx', '--scaled', '300')

    print(c.last_result.status, c.last_result.out, c.last_result.err)
    assert "ERROR: cannot specify --scaled with non-scaled signatures." in c.last_result.err


def test_compare_output_multiple_k(runtmp):
# test 'compare' when given multiple k-mer sizes -> should fail
c = runtmp
Expand Down

0 comments on commit ca57995

Please sign in to comment.