Skip to content

Commit

Permalink
[MRG] update database load UX for gather etc. (#2204)
Browse files Browse the repository at this point in the history
* remove unnecessary exit checks

* add fail-on-empty; adjust notify output

* update tests

* more argparse foo

* re-add sys exit

* construct empty db when invalid

* be ok with databases with no picklist matches

* add fail-on-empty to search and multigather

* adjust prefetch reporting, too

* add test for new output from prefetch

* simplify unnecessary code

* remove more unnecessary

* fix argparse foo, add test for search --no-fail-on-empty-database

* add test for sourmash gather --no-fail

* remove debug

* more tests

* even more test
  • Loading branch information
ctb authored Aug 15, 2022
1 parent 0c093c1 commit c7883ae
Show file tree
Hide file tree
Showing 9 changed files with 262 additions and 61 deletions.
11 changes: 11 additions & 0 deletions src/sourmash/cli/gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,17 @@ def subparser(subparsers):
'--estimate-ani-ci', action='store_true',
help='also output confidence intervals for ANI estimates'
)
subparser.add_argument(
'--fail-on-empty-database', action='store_true',
help='stop at databases that contain no compatible signatures'
)
subparser.add_argument(
'--no-fail-on-empty-database', action='store_false',
dest='fail_on_empty_database',
help='continue past databases that contain no compatible signatures'
)
subparser.set_defaults(fail_on_empty_database=True)

add_ksize_arg(subparser, 31)
add_moltype_args(subparser)
add_picklist_args(subparser)
Expand Down
11 changes: 11 additions & 0 deletions src/sourmash/cli/multigather.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,17 @@ def subparser(subparsers):
'--estimate-ani-ci', action='store_true',
help='also output confidence intervals for ANI estimates'
)
subparser.add_argument(
'--fail-on-empty-database', action='store_true',
help='stop at databases that contain no compatible signatures'
)
subparser.add_argument(
'--no-fail-on-empty-database', action='store_false',
dest='fail_on_empty_database',
help='continue past databases that contain no compatible signatures'
)
subparser.set_defaults(fail_on_empty_database=True)

add_ksize_arg(subparser, 31)
add_moltype_args(subparser)
add_scaled_arg(subparser, 0)
Expand Down
11 changes: 11 additions & 0 deletions src/sourmash/cli/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,17 @@ def subparser(subparsers):
'--md5', default=None,
help='select the signature with this md5 as query'
)
subparser.add_argument(
'--fail-on-empty-database', action='store_true',
help='stop at databases that contain no compatible signatures'
)
subparser.add_argument(
'--no-fail-on-empty-database', action='store_false',
dest='fail_on_empty_database',
help='continue past databases that contain no compatible signatures'
)
subparser.set_defaults(fail_on_empty_database=True)

add_ksize_arg(subparser, 31)
add_moltype_args(subparser)
add_picklist_args(subparser)
Expand Down
47 changes: 25 additions & 22 deletions src/sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,11 +496,8 @@ def search(args):
databases = sourmash_args.load_dbs_and_sigs(args.databases, query,
not is_containment,
picklist=picklist,
pattern=pattern_search)

if not len(databases):
error('Nothing found to search!')
sys.exit(-1)
pattern=pattern_search,
fail_on_empty_database=args.fail_on_empty_database)

# handle signatures with abundance
if query.minhash.track_abundance:
Expand Down Expand Up @@ -702,11 +699,9 @@ def gather(args):
databases = sourmash_args.load_dbs_and_sigs(args.databases, query, False,
cache_size=cache_size,
picklist=picklist,
pattern=pattern_search)
pattern=pattern_search,
fail_on_empty_database=args.fail_on_empty_database)

if not len(databases):
error('Nothing found to search!')
sys.exit(-1)

if args.linear: # force linear traversal?
databases = [ LazyLinearIndex(db) for db in databases ]
Expand Down Expand Up @@ -735,11 +730,8 @@ def gather(args):
try:
counter = db.counter_gather(prefetch_query, args.threshold_bp)
except ValueError:
if picklist or pattern_search:
# catch "no signatures to search" ValueError from filtering
continue
else:
raise # re-raise other errors, if no picklist.
# catch "no signatures to search" ValueError if empty db.
continue

save_prefetch.add_many(counter.signatures())

Expand Down Expand Up @@ -911,11 +903,8 @@ def multigather(args):
# need a query to get ksize, moltype for db loading
query = next(iter(sourmash_args.load_file_as_signatures(inp_files[0], ksize=args.ksize, select_moltype=moltype)))

databases = sourmash_args.load_dbs_and_sigs(args.db, query, False)

if not len(databases):
error('Nothing found to search!')
sys.exit(-1)
databases = sourmash_args.load_dbs_and_sigs(args.db, query, False,
fail_on_empty_database=args.fail_on_empty_database)

# run gather on all the queries.
n=0
Expand Down Expand Up @@ -949,7 +938,11 @@ def multigather(args):

counters = []
for db in databases:
counter = db.counter_gather(prefetch_query, args.threshold_bp)
try:
counter = db.counter_gather(prefetch_query, args.threshold_bp)
except ValueError:
# catch "no signatures to search" ValueError if empty db.
continue
counters.append(counter)

# track found/not found hashes
Expand Down Expand Up @@ -1244,10 +1237,13 @@ def prefetch(args):

did_a_search = False # track whether we did _any_ search at all!
size_may_be_inaccurate = False
total_signatures_loaded = 0
sum_signatures_after_select = 0
for dbfilename in args.databases:
notify(f"loading signatures from '{dbfilename}'")
notify(f"loading signatures from '{dbfilename}'", end='\r')

db = sourmash_args.load_file_as_index(dbfilename)
total_signatures_loaded += len(db)

# force linear traversal?
if args.linear:
Expand All @@ -1256,6 +1252,8 @@ def prefetch(args):
db = db.select(ksize=ksize, moltype=moltype,
containment=True, scaled=True)

sum_signatures_after_select += len(db)

db = sourmash_args.apply_picklist_and_pattern(db, picklist,
pattern_search)

Expand Down Expand Up @@ -1308,10 +1306,15 @@ def prefetch(args):
# delete db explicitly ('cause why not)
del db

notify("--")
notify(f"loaded {total_signatures_loaded} total signatures from {len(args.databases)} locations.")
notify(f"after selecting signatures compatible with search, {sum_signatures_after_select} remain.")

if not did_a_search:
notify("ERROR in prefetch: no compatible signatures in any databases?!")
notify("ERROR in prefetch: after picklists and patterns, no signatures to search!?")
sys.exit(-1)

notify("--")
notify(f"total of {matches_out.count} matching signatures.")
matches_out.close()

Expand Down
56 changes: 26 additions & 30 deletions src/sourmash/sourmash_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,31 +284,37 @@ def traverse_find_sigs(filenames, yield_all_files=False):


def load_dbs_and_sigs(filenames, query, is_similarity_query, *,
cache_size=None, picklist=None, pattern=None):
cache_size=None, picklist=None, pattern=None,
fail_on_empty_database=False):
"""
Load one or more SBTs, LCAs, and/or collections of signatures.
Load one or more Index objects to search - databases, etc.
Check for compatibility with query.
This is basically a user-focused wrapping of _load_databases.
'select' on compatibility with query, and apply picklists & patterns.
"""
query_mh = query.minhash

# set selection parameter for containment
containment = True
if is_similarity_query:
containment = False

databases = []
total_signatures_loaded = 0
sum_signatures_after_select = 0
for filename in filenames:
notify(f'loading from {filename}...', end='\r')
notify(f"loading from '{filename}'...", end='\r')

try:
db = _load_database(filename, False, cache_size=cache_size)
except ValueError as e:
# cannot load database!
notify(f"ERROR on loading from '{filename}':")
notify(str(e))
sys.exit(-1)

total_signatures_loaded += len(db)

# get compatible signatures - moltype/ksize/num/scaled
try:
db = db.select(moltype=query_mh.moltype,
ksize=query_mh.ksize,
Expand All @@ -319,39 +325,29 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *,
# incompatible collection specified!
notify(f"ERROR: cannot use '{filename}' for this query.")
notify(str(exc))
sys.exit(-1)
if fail_on_empty_database:
sys.exit(-1)
else:
db = LinearIndex([])

# 'select' returns nothing => all signatures filtered out. fail!
if not db:
notify(f"no compatible signatures found in '{filename}'")
sys.exit(-1)
if fail_on_empty_database:
sys.exit(-1)

sum_signatures_after_select += len(db)

# last but not least, apply picklist!
db = apply_picklist_and_pattern(db, picklist, pattern)

databases.append(db)

# calc num loaded info.
n_signatures = 0
n_databases = 0
for db in databases:
if db.is_database:
n_databases += 1
else:
n_signatures += len(db)

notify(' '*79, end='\r')
if n_signatures and n_databases:
notify(f'loaded {n_signatures} signatures and {n_databases} databases total.')
elif n_signatures and not n_databases:
notify(f'loaded {n_signatures} signatures.')
elif n_databases and not n_signatures:
notify(f'loaded {n_databases} databases.')

if databases:
print('')
else:
notify('** ERROR: no signatures or databases loaded?')
sys.exit(-1)
# display num loaded/num selected
notify("--")
notify(f"loaded {total_signatures_loaded} total signatures from {len(databases)} locations.")
notify(f"after selecting signatures compatible with search, {sum_signatures_after_select} remain.")
print('')

return databases

Expand Down
2 changes: 1 addition & 1 deletion tests/sourmash_tst_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ def run_sourmash(self, *args, **kwargs):
kwargs['in_directory'] = self.location

cmdlist = ['sourmash']
cmdlist.extend(args)
cmdlist.extend(( str(x) for x in args))
self.last_command = " ".join(cmdlist)
self.last_result = runscript('sourmash', args, **kwargs)

Expand Down
33 changes: 31 additions & 2 deletions tests/test_lca.py
Original file line number Diff line number Diff line change
Expand Up @@ -2335,8 +2335,9 @@ def test_compare_csv_real(runtmp):
assert '0 incompatible at rank species' in runtmp.last_result.err


def test_incompat_lca_db_ksize_2(runtmp, lca_db_format):
# test on gather - create a database with ksize of 25
def test_incompat_lca_db_ksize_2_fail(runtmp, lca_db_format):
# test on gather - create a database with ksize of 25 => fail
# because of incompatibility.
c = runtmp
testdata1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.fa.gz')
c.run_sourmash('sketch', 'dna', '-p', 'k=25,scaled=1000', testdata1,
Expand Down Expand Up @@ -2364,6 +2365,34 @@ def test_incompat_lca_db_ksize_2(runtmp, lca_db_format):
assert "ksize on this database is 25; this is different from requested ksize of 31"


def test_incompat_lca_db_ksize_2_nofail(runtmp, lca_db_format):
    """Gather against a ksize-mismatched LCA db with --no-fail-on-empty-database.

    Builds a k=25 sketch, indexes it into an LCA database of the requested
    format, then runs 'sourmash gather' with a different query signature and
    the '--no-fail-on-empty-database' flag.  The test then checks that the
    incompatibility is still reported on stderr.

    NOTE(review): presumably run_sourmash raises on a non-zero exit status,
    which is what makes reaching the assertions below meaningful - confirm
    against the test utilities.
    """
    # test on gather - create a database with ksize of 25, no fail
    # because of --no-fail-on-empty-database
    c = runtmp
    testdata1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.fa.gz')
    c.run_sourmash('sketch', 'dna', '-p', 'k=25,scaled=1000', testdata1,
                   '-o', 'test_db.sig')
    print(c)

    c.run_sourmash('lca', 'index', utils.get_test_data('lca/delmont-1.csv',),
                   f'test.lca.{lca_db_format}', 'test_db.sig',
                   '-k', '25', '--scaled', '10000',
                   '-F', lca_db_format)
    print(c)

    # this should not fail despite mismatched ksize, b/c of --no-fail flag.
    c.run_sourmash('gather',
                   utils.get_test_data('lca/TARA_ASE_MAG_00031.sig'),
                   f'test.lca.{lca_db_format}',
                   '--no-fail-on-empty-database')

    err = c.last_result.err
    print(err)

    # the two database formats report the incompatibility differently.
    if lca_db_format == 'sql':
        assert "no compatible signatures found in 'test.lca.sql'" in err
    else:
        assert "ERROR: cannot use 'test.lca.json' for this query." in err
        # previously a bare string literal (always truthy); actually check
        # that the message shows up in stderr.
        assert "ksize on this database is 25; this is different from requested ksize of 31" in err


def test_lca_index_empty(runtmp, lca_db_format):
c = runtmp
# test lca index with an empty taxonomy CSV, followed by a load & gather.
Expand Down
6 changes: 5 additions & 1 deletion tests/test_prefetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ def test_prefetch_basic(runtmp, linear_gather):
assert "loaded query: NC_009665.1 Shewanella baltica... (k=31, DNA)" in c.last_result.err
assert "query sketch has scaled=1000; will be dynamically downsampled as needed" in c.last_result.err

err = c.last_result.err
assert "loaded 5 total signatures from 3 locations." in err
assert "after selecting signatures compatible with search, 3 remain." in err

assert "total of 2 matching signatures." in c.last_result.err
assert "of 5177 distinct query hashes, 5177 were found in matches above threshold." in c.last_result.err
assert "a total of 0 query hashes remain unmatched." in c.last_result.err
Expand Down Expand Up @@ -453,7 +457,7 @@ def test_prefetch_no_num_subj(runtmp, linear_gather):
print(c.last_result.err)

assert c.last_result.status != 0
assert "ERROR in prefetch: no compatible signatures in any databases?!" in c.last_result.err
assert "ERROR in prefetch: after picklists and patterns, no signatures to search!?" in c.last_result.err


def test_prefetch_db_fromfile(runtmp, linear_gather):
Expand Down
Loading

0 comments on commit c7883ae

Please sign in to comment.