From c7883ae9132fecabf7599f66491468e017679d71 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 15 Aug 2022 09:28:40 -0700 Subject: [PATCH] [MRG] update database load UX for `gather` etc. (#2204) * remove unnecessary exit checks * add fail-on-empty; adjust notify output * update tests * more argparse foo * re-add sys exit * construct empty db when invalid * be ok with databases with no pcklist matches * add fail-on-empty to search and multigather * adjust prefetch reporting, too * add test for new output from prefetch * simplify unnecessary code * remove more unnecessary * fix argparse foo, add test for search --no-fail-on-empty-database * add test for sourmash gather --no-fail * remove debug * more tests * even more test --- src/sourmash/cli/gather.py | 11 +++ src/sourmash/cli/multigather.py | 11 +++ src/sourmash/cli/search.py | 11 +++ src/sourmash/commands.py | 47 +++++----- src/sourmash/sourmash_args.py | 56 ++++++------ tests/sourmash_tst_utils.py | 2 +- tests/test_lca.py | 33 +++++++- tests/test_prefetch.py | 6 +- tests/test_sourmash.py | 146 ++++++++++++++++++++++++++++++-- 9 files changed, 262 insertions(+), 61 deletions(-) diff --git a/src/sourmash/cli/gather.py b/src/sourmash/cli/gather.py index 1260b1d035..6b59efeb2e 100644 --- a/src/sourmash/cli/gather.py +++ b/src/sourmash/cli/gather.py @@ -135,6 +135,17 @@ def subparser(subparsers): '--estimate-ani-ci', action='store_true', help='also output confidence intervals for ANI estimates' ) + subparser.add_argument( + '--fail-on-empty-database', action='store_true', + help='stop at databases that contain no compatible signatures' + ) + subparser.add_argument( + '--no-fail-on-empty-database', action='store_false', + dest='fail_on_empty_database', + help='continue past databases that contain no compatible signatures' + ) + subparser.set_defaults(fail_on_empty_database=True) + add_ksize_arg(subparser, 31) add_moltype_args(subparser) add_picklist_args(subparser) diff --git a/src/sourmash/cli/multigather.py b/src/sourmash/cli/multigather.py index b2393f751f..40f95807e9 100644 --- a/src/sourmash/cli/multigather.py +++ b/src/sourmash/cli/multigather.py @@ -72,6 +72,17 @@ def subparser(subparsers): '--estimate-ani-ci', action='store_true', help='also output confidence intervals for ANI estimates' ) + subparser.add_argument( + '--fail-on-empty-database', action='store_true', + help='stop at databases that contain no compatible signatures' + ) + subparser.add_argument( + '--no-fail-on-empty-database', action='store_false', + dest='fail_on_empty_database', + help='continue past databases that contain no compatible signatures' + ) + subparser.set_defaults(fail_on_empty_database=True) + add_ksize_arg(subparser, 31) add_moltype_args(subparser) add_scaled_arg(subparser, 0) diff --git a/src/sourmash/cli/search.py b/src/sourmash/cli/search.py index 53b5bb9b8b..7fc17f5d2e 100644 --- a/src/sourmash/cli/search.py +++ b/src/sourmash/cli/search.py @@ -101,6 +101,17 @@ def subparser(subparsers): '--md5', default=None, help='select the signature with this md5 as query' ) + subparser.add_argument( + '--fail-on-empty-database', action='store_true', + help='stop at databases that contain no compatible signatures' + ) + subparser.add_argument( + '--no-fail-on-empty-database', action='store_false', + dest='fail_on_empty_database', + help='continue past databases that contain no compatible signatures' + ) + subparser.set_defaults(fail_on_empty_database=True) + add_ksize_arg(subparser, 31) add_moltype_args(subparser) add_picklist_args(subparser) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 3ecac41488..910ee3c602 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -496,11 +496,8 @@ def search(args): databases = sourmash_args.load_dbs_and_sigs(args.databases, query, not is_containment, picklist=picklist, - pattern=pattern_search) - - if not len(databases): - error('Nothing found to search!') - sys.exit(-1) + pattern=pattern_search, + fail_on_empty_database=args.fail_on_empty_database) # handle signatures with abundance if query.minhash.track_abundance: @@ -702,11 +699,9 @@ def gather(args): databases = sourmash_args.load_dbs_and_sigs(args.databases, query, False, cache_size=cache_size, picklist=picklist, - pattern=pattern_search) + pattern=pattern_search, + fail_on_empty_database=args.fail_on_empty_database) - if not len(databases): - error('Nothing found to search!') - sys.exit(-1) if args.linear: # force linear traversal? databases = [ LazyLinearIndex(db) for db in databases ] @@ -735,11 +730,8 @@ def gather(args): try: counter = db.counter_gather(prefetch_query, args.threshold_bp) except ValueError: - if picklist or pattern_search: - # catch "no signatures to search" ValueError from filtering - continue - else: - raise # re-raise other errors, if no picklist. + # catch "no signatures to search" ValueError if empty db. + continue save_prefetch.add_many(counter.signatures()) @@ -911,11 +903,8 @@ def multigather(args): # need a query to get ksize, moltype for db loading query = next(iter(sourmash_args.load_file_as_signatures(inp_files[0], ksize=args.ksize, select_moltype=moltype))) - databases = sourmash_args.load_dbs_and_sigs(args.db, query, False) - - if not len(databases): - error('Nothing found to search!') - sys.exit(-1) + databases = sourmash_args.load_dbs_and_sigs(args.db, query, False, + fail_on_empty_database=args.fail_on_empty_database) # run gather on all the queries. n=0 @@ -949,7 +938,11 @@ def multigather(args): counters = [] for db in databases: - counter = db.counter_gather(prefetch_query, args.threshold_bp) + try: + counter = db.counter_gather(prefetch_query, args.threshold_bp) + except ValueError: + # catch "no signatures to search" ValueError if empty db. + continue counters.append(counter) # track found/not found hashes @@ -1244,10 +1237,13 @@ def prefetch(args): did_a_search = False # track whether we did _any_ search at all! size_may_be_inaccurate = False + total_signatures_loaded = 0 + sum_signatures_after_select = 0 for dbfilename in args.databases: - notify(f"loading signatures from '{dbfilename}'") + notify(f"loading signatures from '{dbfilename}'", end='\r') db = sourmash_args.load_file_as_index(dbfilename) + total_signatures_loaded += len(db) # force linear traversal? if args.linear: @@ -1256,6 +1252,8 @@ def prefetch(args): db = db.select(ksize=ksize, moltype=moltype, containment=True, scaled=True) + sum_signatures_after_select += len(db) + db = sourmash_args.apply_picklist_and_pattern(db, picklist, pattern_search) @@ -1308,10 +1306,15 @@ def prefetch(args): # delete db explicitly ('cause why not) del db + notify("--") + notify(f"loaded {total_signatures_loaded} total signatures from {len(args.databases)} locations.") + notify(f"after selecting signatures compatible with search, {sum_signatures_after_select} remain.") + if not did_a_search: - notify("ERROR in prefetch: no compatible signatures in any databases?!") + notify("ERROR in prefetch: after picklists and patterns, no signatures to search!?") sys.exit(-1) + notify("--") notify(f"total of {matches_out.count} matching signatures.") matches_out.close() diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 840fd717d1..f13e2c9037 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -284,31 +284,37 @@ def traverse_find_sigs(filenames, yield_all_files=False): def load_dbs_and_sigs(filenames, query, is_similarity_query, *, - cache_size=None, picklist=None, pattern=None): + cache_size=None, picklist=None, pattern=None, + fail_on_empty_database=False): """ - Load one or more SBTs, LCAs, and/or collections of signatures. + Load one or more Index objects to search - databases, etc. - Check for compatibility with query. - - This is basically a user-focused wrapping of _load_databases. + 'select' on compatibility with query, and apply picklists & patterns. """ query_mh = query.minhash + # set selection parameter for containment containment = True if is_similarity_query: containment = False databases = [] + total_signatures_loaded = 0 + sum_signatures_after_select = 0 for filename in filenames: - notify(f'loading from {filename}...', end='\r') + notify(f"loading from '{filename}'...", end='\r') try: db = _load_database(filename, False, cache_size=cache_size) except ValueError as e: # cannot load database! + notify(f"ERROR on loading from '{filename}':") notify(str(e)) sys.exit(-1) + total_signatures_loaded += len(db) + + # get compatible signatures - moltype/ksize/num/scaled try: db = db.select(moltype=query_mh.moltype, ksize=query_mh.ksize, @@ -319,39 +325,29 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, # incompatible collection specified! notify(f"ERROR: cannot use '{filename}' for this query.") notify(str(exc)) - sys.exit(-1) + if fail_on_empty_database: + sys.exit(-1) + else: + db = LinearIndex([]) # 'select' returns nothing => all signatures filtered out. fail! if not db: notify(f"no compatible signatures found in '{filename}'") - sys.exit(-1) + if fail_on_empty_database: + sys.exit(-1) + + sum_signatures_after_select += len(db) + # last but not least, apply picklist! db = apply_picklist_and_pattern(db, picklist, pattern) databases.append(db) - # calc num loaded info. - n_signatures = 0 - n_databases = 0 - for db in databases: - if db.is_database: - n_databases += 1 - else: - n_signatures += len(db) - - notify(' '*79, end='\r') - if n_signatures and n_databases: - notify(f'loaded {n_signatures} signatures and {n_databases} databases total.') - elif n_signatures and not n_databases: - notify(f'loaded {n_signatures} signatures.') - elif n_databases and not n_signatures: - notify(f'loaded {n_databases} databases.') - - if databases: - print('') - else: - notify('** ERROR: no signatures or databases loaded?') - sys.exit(-1) + # display num loaded/num selected + notify("--") + notify(f"loaded {total_signatures_loaded} total signatures from {len(databases)} locations.") + notify(f"after selecting signatures compatible with search, {sum_signatures_after_select} remain.") + print('') return databases diff --git a/tests/sourmash_tst_utils.py b/tests/sourmash_tst_utils.py index 19f2f045f5..e10913f9a7 100644 --- a/tests/sourmash_tst_utils.py +++ b/tests/sourmash_tst_utils.py @@ -187,7 +187,7 @@ def run_sourmash(self, *args, **kwargs): kwargs['in_directory'] = self.location cmdlist = ['sourmash'] - cmdlist.extend(args) + cmdlist.extend(( str(x) for x in args)) self.last_command = " ".join(cmdlist) self.last_result = runscript('sourmash', args, **kwargs) diff --git a/tests/test_lca.py b/tests/test_lca.py index 6602d46d86..3ac721287d 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -2335,8 +2335,9 @@ def test_compare_csv_real(runtmp): assert '0 incompatible at rank species' in runtmp.last_result.err -def test_incompat_lca_db_ksize_2(runtmp, lca_db_format): - # test on gather - create a database with ksize of 25 +def test_incompat_lca_db_ksize_2_fail(runtmp, lca_db_format): + # test on gather - create a database with ksize of 25 => fail + # because of incompatibility. c = runtmp testdata1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.fa.gz') c.run_sourmash('sketch', 'dna', '-p', 'k=25,scaled=1000', testdata1, @@ -2364,6 +2365,34 @@ def test_incompat_lca_db_ksize_2(runtmp, lca_db_format): assert "ksize on this database is 25; this is different from requested ksize of 31" +def test_incompat_lca_db_ksize_2_nofail(runtmp, lca_db_format): + # test on gather - create a database with ksize of 25, no fail + # because of --no-fail-on-empty-databases + c = runtmp + testdata1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.fa.gz') + c.run_sourmash('sketch', 'dna', '-p', 'k=25,scaled=1000', testdata1, + '-o', 'test_db.sig') + print(c) + + c.run_sourmash('lca', 'index', utils.get_test_data('lca/delmont-1.csv',), + f'test.lca.{lca_db_format}', 'test_db.sig', + '-k', '25', '--scaled', '10000', + '-F', lca_db_format) + print(c) + + # this should not fail despite mismatched ksize, b/c of --no-fail flag. + c.run_sourmash('gather', utils.get_test_data('lca/TARA_ASE_MAG_00031.sig'), f'test.lca.{lca_db_format}', '--no-fail-on-empty-database') + + err = c.last_result.err + print(err) + + if lca_db_format == 'sql': + assert "no compatible signatures found in 'test.lca.sql'" in err + else: + assert "ERROR: cannot use 'test.lca.json' for this query." in err + assert "ksize on this database is 25; this is different from requested ksize of 31" + + def test_lca_index_empty(runtmp, lca_db_format): c = runtmp # test lca index with an empty taxonomy CSV, followed by a load & gather. diff --git a/tests/test_prefetch.py b/tests/test_prefetch.py index 36d231df62..38dfed0db0 100644 --- a/tests/test_prefetch.py +++ b/tests/test_prefetch.py @@ -42,6 +42,10 @@ def test_prefetch_basic(runtmp, linear_gather): assert "loaded query: NC_009665.1 Shewanella baltica... (k=31, DNA)" in c.last_result.err assert "query sketch has scaled=1000; will be dynamically downsampled as needed" in c.last_result.err + err = c.last_result.err + assert "loaded 5 total signatures from 3 locations." in err + assert "after selecting signatures compatible with search, 3 remain." in err + assert "total of 2 matching signatures." in c.last_result.err assert "of 5177 distinct query hashes, 5177 were found in matches above threshold." in c.last_result.err assert "a total of 0 query hashes remain unmatched." in c.last_result.err @@ -453,7 +457,7 @@ def test_prefetch_no_num_subj(runtmp, linear_gather): print(c.last_result.err) assert c.last_result.status != 0 - assert "ERROR in prefetch: no compatible signatures in any databases?!" in c.last_result.err + assert "ERROR in prefetch: after picklists and patterns, no signatures to search!?" in c.last_result.err def test_prefetch_db_fromfile(runtmp, linear_gather): diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index f8e1f889e6..0689ef4046 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -2055,7 +2055,8 @@ def test_search_check_scaled_bounds_more_than_maximum(runtmp): # explanation: you cannot downsample a scaled SBT to match a scaled # signature, so make sure that when you try such a search, it fails! # (you *can* downsample a signature to match an SBT.) -def test_search_metagenome_downsample(runtmp): +def test_search_metagenome_sbt_downsample_fail(runtmp): + # test downsample on SBT => failure, with --fail-on-empty-databases testdata_glob = utils.get_test_data('gather/GCF*.sig') testdata_sigs = glob.glob(testdata_glob) @@ -2069,18 +2070,41 @@ def test_search_metagenome_downsample(runtmp): assert os.path.exists(runtmp.output('gcf_all.sbt.zip')) - cmd = 'search {} gcf_all -k 21 --scaled 100000'.format(query_sig) - with pytest.raises(SourmashCommandFailed): runtmp.sourmash('search', query_sig, 'gcf_all', '-k', '21', '--scaled', '100000') + print(runtmp.last_result.out) + print(runtmp.last_result.err) + assert runtmp.last_result.status == -1 + assert "ERROR: cannot use 'gcf_all' for this query." in runtmp.last_result.err + assert "search scaled value 100000 is less than database scaled value of 10000" in runtmp.last_result.err + + +def test_search_metagenome_sbt_downsample_nofail(runtmp): + # test downsample on SBT => failure but ok with --no-fail-on-empty-database + testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_sigs = glob.glob(testdata_glob) + + query_sig = utils.get_test_data('gather/combined.sig') + + cmd = ['index', 'gcf_all'] + cmd.extend(testdata_sigs) + cmd.extend(['-k', '21']) + + runtmp.sourmash(*cmd) + + assert os.path.exists(runtmp.output('gcf_all.sbt.zip')) + + runtmp.sourmash('search', query_sig, 'gcf_all', '-k', '21', '--scaled', '100000', '--no-fail-on-empty-database') print(runtmp.last_result.out) print(runtmp.last_result.err) + assert runtmp.last_result.status == 0 assert "ERROR: cannot use 'gcf_all' for this query." in runtmp.last_result.err assert "search scaled value 100000 is less than database scaled value of 10000" in runtmp.last_result.err + assert "0 matches:" in runtmp.last_result.out def test_search_metagenome_downsample_containment(runtmp): @@ -2216,6 +2240,40 @@ def test_search_with_pattern_exclude(runtmp): assert "32.2% NC_006905.1 Salmonella" in out +def test_search_empty_db_fail(runtmp): + # search should fail on empty db with --fail-on-empty-database + query = utils.get_test_data('2.fa.sig') + against = utils.get_test_data('47.fa.sig') + against2 = utils.get_test_data('lca/47+63.lca.json') + + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash('search', query, against, against2, '-k', '51') + + + err = runtmp.last_result.err + assert "no compatible signatures found in " in err + + +def test_search_empty_db_nofail(runtmp): + # search should not fail on empty db with --no-fail-on-empty-database + query = utils.get_test_data('2.fa.sig') + against = utils.get_test_data('47.fa.sig') + against2 = utils.get_test_data('lca/47+63.lca.json') + + runtmp.sourmash('search', query, against, against2, '-k', '51', + '--no-fail-on-empty-data') + + out = runtmp.last_result.out + err = runtmp.last_result.err + print(out) + print(err) + + assert "no compatible signatures found in " in err + assert "ksize on this database is 31; this is different from requested ksize of 51" in err + assert "loaded 50 total signatures from 2 locations" in err + assert "after selecting signatures compatible with search, 0 remain." in err + + def test_mash_csv_to_sig(runtmp): testdata1 = utils.get_test_data('short.fa.msh.dump') testdata2 = utils.get_test_data('short.fa') @@ -4081,7 +4139,11 @@ def test_gather_query_downsample(runtmp, linear_gather, prefetch_gather): print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'loaded 12 signatures' in runtmp.last_result.err + err = runtmp.last_result.err + + assert 'loaded 36 total signatures from 12 locations.' in err + assert 'after selecting signatures compatible with search, 12 remain.' in err + assert all(('4.9 Mbp 100.0% 100.0%' in runtmp.last_result.out, 'NC_003197.2' in runtmp.last_result.out)) @@ -4100,7 +4162,11 @@ def test_gather_query_downsample_explicit(runtmp, linear_gather, prefetch_gather print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'loaded 12 signatures' in runtmp.last_result.err + err = runtmp.last_result.err + + assert 'loaded 36 total signatures from 12 locations.' in err + assert 'after selecting signatures compatible with search, 12 remain.' in err + assert all(('4.9 Mbp 100.0% 100.0%' in runtmp.last_result.out, 'NC_003197.2' in runtmp.last_result.out)) @@ -4574,6 +4640,41 @@ def test_gather_output_unassigned_with_abundance(runtmp, prefetch_gather, linear assert nomatch_mh.hashes[hashval] == abund +def test_gather_empty_db_fail(runtmp, linear_gather, prefetch_gather): + # gather should fail on empty db with --fail-on-empty-database + query = utils.get_test_data('2.fa.sig') + against = utils.get_test_data('47.fa.sig') + against2 = utils.get_test_data('lca/47+63.lca.json') + + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash('gather', query, against, against2, '-k', '51', + linear_gather, prefetch_gather) + + + err = runtmp.last_result.err + assert "no compatible signatures found in " in err + + +def test_gather_empty_db_nofail(runtmp, prefetch_gather, linear_gather): + # gather should not fail on empty db with --no-fail-on-empty-database + query = utils.get_test_data('2.fa.sig') + against = utils.get_test_data('47.fa.sig') + against2 = utils.get_test_data('lca/47+63.lca.json') + + runtmp.sourmash('gather', query, against, against2, '-k', '51', + '--no-fail-on-empty-data', + linear_gather, prefetch_gather) + + out = runtmp.last_result.out + err = runtmp.last_result.err + print(out) + print(err) + + assert "no compatible signatures found in " in err + assert "ksize on this database is 31; this is different from requested ksize of 51" in err + assert "loaded 50 total signatures from 2 locations" in err + assert "after selecting signatures compatible with search, 0 remain." in err + def test_multigather_output_unassigned_with_abundance(runtmp): c = runtmp query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig') @@ -4604,6 +4705,41 @@ def test_multigather_output_unassigned_with_abundance(runtmp): assert nomatch_mh.hashes[hashval] == abund +def test_multigather_empty_db_fail(runtmp): + # multigather should fail on empty db with --fail-on-empty-database + query = utils.get_test_data('2.fa.sig') + against = utils.get_test_data('47.fa.sig') + against2 = utils.get_test_data('lca/47+63.lca.json') + + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash('multigather', '--query', query, + '--db', against, against2, '-k', '51') + + err = runtmp.last_result.err + assert "no compatible signatures found in " in err + + +def test_multigather_empty_db_nofail(runtmp): + # multigather should not fail on empty db with --no-fail-on-empty-database + query = utils.get_test_data('2.fa.sig') + against = utils.get_test_data('47.fa.sig') + against2 = utils.get_test_data('lca/47+63.lca.json') + + runtmp.sourmash('multigather', '--query', query, + '--db', against, against2, '-k', '51', + '--no-fail-on-empty-data') + + out = runtmp.last_result.out + err = runtmp.last_result.err + print(out) + print(err) + + assert "no compatible signatures found in " in err + assert "ksize on this database is 31; this is different from requested ksize of 51" in err + assert "conducted gather searches on 0 signatures" in err + assert "loaded 50 total signatures from 2 locations" in err + assert "after selecting signatures compatible with search, 0 remain." in err + def test_sbt_categorize(runtmp): testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') testdata2 = utils.get_test_data('genome-s11.fa.gz.sig')