diff --git a/doc/command-line.md b/doc/command-line.md index 2908bd18f2..05c2e22557 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -250,31 +250,45 @@ for matches to the query signature. It can search for matches with either high [Jaccard similarity](https://en.wikipedia.org/wiki/Jaccard_index) or containment; the default is to use Jaccard similarity, unless `--containment` is specified. `-o/--output` will create a CSV file -containing the matches. +containing all of the matches with their respective similarity or containment scores. `search` makes use of [indexed databases](#loading-many-signatures) to decrease search time and memory where possible. Usage: ``` -sourmash search query.sig [ list of signatures or SBTs ] +sourmash search query.sig <database1> [ <database2> ... ] ``` Example output: ``` -49 matches; showing first 20: +% sourmash search tests/test-data/47.fa.sig gtdb-rs207.genomic-reps.dna.k31.zip + +... +-- +loaded 65703 total signatures from 1 locations. +after selecting signatures compatible with search, 65703 remain. + +2 matches above threshold 0.080: similarity match ---------- ----- - 75.4% NZ_JMGW01000001.1 Escherichia coli 1-176-05_S4_C2 e117605... - 72.2% NZ_GG774190.1 Escherichia coli MS 196-1 Scfld2538, whole ... - 71.4% NZ_JMGU01000001.1 Escherichia coli 2-011-08_S3_C2 e201108... - 70.1% NZ_JHRU01000001.1 Escherichia coli strain 100854 100854_1... - 69.0% NZ_JH659569.1 Escherichia coli M919 supercont2.1, whole g... -... + 32.3% GCF_900456975.1 Shewanella baltica strain=NCTC10735, 5088... + 14.0% GCF_002838165.1 Shewanella sp. Pdp11 strain=Pdp11, ASM283... 
``` -Note, as of sourmash 4.2.0, `search` supports `--picklist`, to +`search` takes a number of command-line options: +* `--containment` - find matches using the containment index rather than Jaccard similarity; +* `--max-containment` - find matches using the max containment index rather than Jaccard similarity; +* `-t/--threshold` - lower threshold for matching; defaults to 0.08; +* `--best-only` - find and report only the best match; +* `-n/--num-results` - number of matches to report to stdout; defaults to 3; 0 to report all. + +Match information can be saved to a CSV file with `-o/--output`; with +`-o`, all matches above the threshold will be saved, not just those +printed to stdout (which are limited to `-n/--num-results`). + +As of sourmash 4.2.0, `search` supports `--picklist`, to [select a subset of signatures to search, based on a CSV file](#using-picklists-to-subset-large-collections-of-signatures). This can be used to search only a small subset of a large collection, or to exclude a few signatures from a collection, without modifying the diff --git a/src/sourmash/cli/search.py b/src/sourmash/cli/search.py index 7fc17f5d2e..96ac30ab4b 100644 --- a/src/sourmash/cli/search.py +++ b/src/sourmash/cli/search.py @@ -61,7 +61,7 @@ def subparser(subparsers): help='output debug information' ) subparser.add_argument( - '--threshold', metavar='T', default=0.08, type=float, + '-t', '--threshold', metavar='T', default=0.08, type=float, help='minimum threshold for reporting matches; default=0.08' ) subparser.add_argument( @@ -74,7 +74,7 @@ def subparser(subparsers): ) subparser.add_argument( '-n', '--num-results', default=3, type=int, metavar='N', - help='number of results to report' + help='number of results to display to user; 0 to report all' ) subparser.add_argument( '--containment', action='store_true', diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 910ee3c602..8d7bc9d221 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py 
@@ -538,10 +538,10 @@ def search(args): args.num_results = 1 if not args.num_results or n_matches <= args.num_results: - print_results('{} matches:'.format(len(results))) + print_results(f'{len(results)} matches above threshold {args.threshold:0.3f}:') else: - print_results('{} matches; showing first {}:', - len(results), args.num_results) + print_results(f'{len(results)} matches above threshold {args.threshold:0.3f}; showing first {args.num_results}:') + n_matches = args.num_results size_may_be_inaccurate = False @@ -588,6 +588,7 @@ def search(args): if jaccard_ani_untrustworthy: notify("WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons.") + def categorize(args): "Use a database to find the best match to many signatures." from .index import MultiIndex diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index d212a49fff..4c67c7dbc2 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -3596,7 +3596,7 @@ def test_import_mash_csv_to_sig(runtmp): print("RUNTEMP", runtmp) - assert '1 matches:' in runtmp.last_result.out + assert '1 matches' in runtmp.last_result.out assert '100.0% short.fa' in runtmp.last_result.out diff --git a/tests/test_index.py b/tests/test_index.py index ad04598db1..a8a9524bbb 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -688,7 +688,7 @@ def test_zipfile_protein_command_search(runtmp): db_out = utils.get_test_data('prot/protein.zip') c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches:' in c.last_result.out + assert '2 matches' in c.last_result.out c.run_sourmash('gather', sigfile1, db_out) assert 'found 1 matches total' in c.last_result.out @@ -703,7 +703,7 @@ def test_zipfile_hp_command_search(runtmp): db_out = utils.get_test_data('prot/hp.zip') c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches:' in c.last_result.out + assert '2 matches' 
in c.last_result.out c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') assert 'found 1 matches total' in c.last_result.out @@ -718,7 +718,7 @@ def test_zipfile_dayhoff_command_search(runtmp): db_out = utils.get_test_data('prot/dayhoff.zip') c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches:' in c.last_result.out + assert '2 matches' in c.last_result.out c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') assert 'found 1 matches total' in c.last_result.out @@ -733,7 +733,7 @@ def test_zipfile_protein_command_search_combined(runtmp): db_out = utils.get_test_data('prot/all.zip') c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches:' in c.last_result.out + assert '2 matches' in c.last_result.out c.run_sourmash('gather', sigfile1, db_out) assert 'found 1 matches total' in c.last_result.out @@ -748,7 +748,7 @@ def test_zipfile_hp_command_search_combined(runtmp): db_out = utils.get_test_data('prot/all.zip') c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches:' in c.last_result.out + assert '2 matches' in c.last_result.out c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') assert 'found 1 matches total' in c.last_result.out @@ -763,7 +763,7 @@ def test_zipfile_dayhoff_command_search_combined(runtmp): db_out = utils.get_test_data('prot/all.zip') c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches:' in c.last_result.out + assert '2 matches' in c.last_result.out c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') assert 'found 1 matches total' in c.last_result.out diff --git a/tests/test_lca.py b/tests/test_lca.py index 3ac721287d..e585d6f835 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -2659,7 +2659,7 @@ def test_lca_db_protein_command_search(c): db_out = utils.get_test_data('prot/protein.lca.json.gz') c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 
matches:' in c.last_result.out + assert '2 matches' in c.last_result.out c.run_sourmash('gather', sigfile1, db_out) assert 'found 1 matches total' in c.last_result.out @@ -2770,7 +2770,7 @@ def test_lca_db_hp_command_search(c): db_out = utils.get_test_data('prot/hp.lca.json.gz') c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches:' in c.last_result.out + assert '2 matches' in c.last_result.out c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') assert 'found 1 matches total' in c.last_result.out @@ -2881,7 +2881,7 @@ def test_lca_db_dayhoff_command_search(c): db_out = utils.get_test_data('prot/dayhoff.lca.json.gz') c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches:' in c.last_result.out + assert '2 matches' in c.last_result.out c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') assert 'found 1 matches total' in c.last_result.out diff --git a/tests/test_sbt.py b/tests/test_sbt.py index 3c83915e9c..a66d0c634e 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -1049,7 +1049,7 @@ def test_sbt_protein_command_search(c): db_out = utils.get_test_data('prot/protein.sbt.zip') c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches:' in c.last_result.out + assert '2 matches' in c.last_result.out c.run_sourmash('gather', sigfile1, db_out) assert 'found 1 matches total' in c.last_result.out @@ -1096,7 +1096,7 @@ def test_sbt_hp_command_search(c): db_out = utils.get_test_data('prot/hp.sbt.zip') c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches:' in c.last_result.out + assert '2 matches' in c.last_result.out c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') assert 'found 1 matches total' in c.last_result.out @@ -1143,7 +1143,7 @@ def test_sbt_dayhoff_command_search(c): db_out = utils.get_test_data('prot/dayhoff.sbt.zip') c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 
matches:' in c.last_result.out + assert '2 matches' in c.last_result.out c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') assert 'found 1 matches total' in c.last_result.out diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 0689ef4046..5588ffc2c0 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -1371,7 +1371,7 @@ def test_do_sourmash_sbt_search_check_bug(runtmp): runtmp.sourmash('search', testdata1, 'zzz') - assert '1 matches:' in runtmp.last_result.out + assert '1 matches' in runtmp.last_result.out tree = load_sbt_index(runtmp.output('zzz.sbt.zip')) assert tree._nodes[0].metadata['min_n_below'] == 431 @@ -1390,7 +1390,7 @@ def test_do_sourmash_sbt_search_empty_sig(runtmp): runtmp.sourmash('search', testdata1, 'zzz') - assert '1 matches:' in runtmp.last_result.out + assert '1 matches' in runtmp.last_result.out tree = load_sbt_index(runtmp.output('zzz.sbt.zip')) assert tree._nodes[0].metadata['min_n_below'] == 1 @@ -1821,7 +1821,7 @@ def test_search_3(runtmp): runtmp.sourmash('search', '-n', '1', 'short.fa.sig', 'short2.fa.sig', 'short3.fa.sig') print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '2 matches; showing first 1' in runtmp.last_result.out + assert '2 matches above threshold 0.080; showing first 1:' in runtmp.last_result.out def test_search_4(runtmp): @@ -1834,11 +1834,21 @@ def test_search_4(runtmp): runtmp.sourmash('search', '-n', '0', 'short.fa.sig', 'short2.fa.sig', 'short3.fa.sig') print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '2 matches:' in runtmp.last_result.out + assert '2 matches above threshold 0.080:' in runtmp.last_result.out assert 'short2.fa' in runtmp.last_result.out assert 'short3.fa' in runtmp.last_result.out +def test_search_5_num_results(runtmp): + query = utils.get_test_data('gather/combined.sig') + against = glob.glob(utils.get_test_data('gather/GCF*.sig')) + + runtmp.sourmash('search', '-n', '5', 
query, *against) + + print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) + assert '12 matches above threshold 0.080; showing first 5:' in runtmp.last_result.out + + def test_index_check_scaled_bounds_negative(runtmp): with pytest.raises(SourmashCommandFailed): runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig', '-k', '31', '--scaled', '-5', '--dna') @@ -1889,7 +1899,7 @@ def test_index_metagenome_fromfile(c): print(c.last_result.err) assert ' 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T...' in out - assert '12 matches; showing first 3:' in out + assert '12 matches above threshold 0.080; showing first 3:' in out @utils.in_tempdir def test_index_metagenome_fromfile_no_cmdline_sig(c): @@ -1918,7 +1928,7 @@ def test_index_metagenome_fromfile_no_cmdline_sig(c): print(c.last_result.err) assert ' 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T' in out - assert '12 matches; showing first 3:' in out + assert '12 matches above threshold 0.080; showing first 3:' in out def test_search_metagenome(runtmp): @@ -1941,7 +1951,7 @@ def test_search_metagenome(runtmp): print(runtmp.last_result.err) assert ' 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T' in runtmp.last_result.out - assert '12 matches; showing first 3:' in runtmp.last_result.out + assert '12 matches above threshold 0.080; showing first 3:' in runtmp.last_result.out def test_search_metagenome_traverse(runtmp): @@ -1955,7 +1965,7 @@ def test_search_metagenome_traverse(runtmp): print(runtmp.last_result.err) assert ' 33.2% NC_003198.1 Salmonella enterica subsp. 
enterica serovar T' in runtmp.last_result.out - assert '13 matches; showing first 3:' in runtmp.last_result.out + assert '13 matches above threshold 0.080; showing first 3:' in runtmp.last_result.out def test_search_metagenome_traverse_check_csv(runtmp): @@ -1983,7 +1993,7 @@ def test_search_metagenome_traverse_check_csv(runtmp): assert len(filename) > prefix_len assert ' 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T' in runtmp.last_result.out - assert '13 matches; showing first 3:' in runtmp.last_result.out + assert '13 matches above threshold 0.080; showing first 3:' in runtmp.last_result.out @utils.in_thisdir @@ -2104,7 +2114,7 @@ def test_search_metagenome_sbt_downsample_nofail(runtmp): assert runtmp.last_result.status == 0 assert "ERROR: cannot use 'gcf_all' for this query." in runtmp.last_result.err assert "search scaled value 100000 is less than database scaled value of 10000" in runtmp.last_result.err - assert "0 matches:" in runtmp.last_result.out + assert "0 matches" in runtmp.last_result.out def test_search_metagenome_downsample_containment(runtmp): @@ -2127,7 +2137,7 @@ def test_search_metagenome_downsample_containment(runtmp): print(runtmp.last_result.err) assert ' 32.9% NC_003198.1 Salmonella enterica subsp. enterica serovar T' in runtmp.last_result.out - assert '12 matches; showing first 3:' in runtmp.last_result.out + assert '12 matches above threshold 0.080; showing first 3:' in runtmp.last_result.out @utils.in_tempdir @@ -2154,7 +2164,7 @@ def test_search_metagenome_downsample_index(c): c) assert ' 29.7% NC_003197.2 Salmonella enterica subsp. 
enterica serovar T' in str( c) - assert '12 matches; showing first 3:' in str(c) + assert '12 matches above threshold 0.080; showing first 3:' in str(c) def test_search_with_picklist(runtmp): @@ -2174,7 +2184,7 @@ def test_search_with_picklist(runtmp): out = runtmp.last_result.out print(out) - assert "3 matches:" in out + assert "3 matches" in out assert "13.1% NC_000853.1 Thermotoga" in out assert "13.0% NC_009486.1 Thermotoga" in out assert "12.8% NC_011978.1 Thermotoga" in out @@ -2196,7 +2206,7 @@ def test_search_with_picklist_exclude(runtmp): out = runtmp.last_result.out print(out) - assert "9 matches; showing first 3:" in out + assert "9 matches above threshold 0.080; showing first 3:" in out assert "33.2% NC_003198.1 Salmonella" in out assert "33.1% NC_003197.2 Salmonella" in out assert "32.2% NC_006905.1 Salmonella" in out @@ -2215,7 +2225,7 @@ def test_search_with_pattern_include(runtmp): out = runtmp.last_result.out print(out) - assert "3 matches:" in out + assert "3 matches" in out assert "13.1% NC_000853.1 Thermotoga" in out assert "13.0% NC_009486.1 Thermotoga" in out assert "12.8% NC_011978.1 Thermotoga" in out @@ -2234,7 +2244,7 @@ def test_search_with_pattern_exclude(runtmp): out = runtmp.last_result.out print(out) - assert "9 matches; showing first 3:" in out + assert "9 matches above threshold 0.080; showing first 3:" in out assert "33.2% NC_003198.1 Salmonella" in out assert "33.1% NC_003197.2 Salmonella" in out assert "32.2% NC_006905.1 Salmonella" in out @@ -2285,7 +2295,7 @@ def test_mash_csv_to_sig(runtmp): runtmp.sourmash('search', '-k', '31', 'short.fa.sig', 'xxx.sig') print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '1 matches:' in runtmp.last_result.out + assert '1 matches' in runtmp.last_result.out assert '100.0% short.fa' in runtmp.last_result.out @@ -5281,7 +5291,7 @@ def test_index_matches_search_with_picklist(runtmp): out = runtmp.last_result.out print(out) - assert "3 matches:" in out + 
assert "3 matches" in out assert "13.1% NC_000853.1 Thermotoga" in out assert "13.0% NC_009486.1 Thermotoga" in out assert "12.8% NC_011978.1 Thermotoga" in out @@ -5322,7 +5332,7 @@ def test_index_matches_search_with_picklist_exclude(runtmp): out = runtmp.last_result.out print(out) - assert "10 matches; showing first 3:" in out + assert "10 matches above threshold 0.080; showing first 3:" in out assert "100.0% -" in out assert "33.2% NC_003198.1 Salmonella" in out assert "33.1% NC_003197.2 Salmonella" in out @@ -5564,7 +5574,7 @@ def test_gather_with_prefetch_picklist_5_search(runtmp): out = runtmp.last_result.out print(out) - assert "12 matches; showing first 3:" in out + assert "12 matches above threshold 0.080; showing first 3:" in out assert " 33.2% NC_003198.1 Salmonella enterica subsp." in out # now, do a gather with the results