Skip to content

Commit

Permalink
[MRG] update search documentation, help, and output. (#2222)
Browse files Browse the repository at this point in the history
* adjust search output to include threshold

* add docs, fix tests

* Apply suggestions from code review

thanks!

Co-authored-by: ccbaumler <63077899+ccbaumler@users.noreply.github.com>

* Update doc/command-line.md

Co-authored-by: ccbaumler <63077899+ccbaumler@users.noreply.github.com>

Co-authored-by: ccbaumler <63077899+ccbaumler@users.noreply.github.com>
  • Loading branch information
ctb and ccbaumler authored Aug 19, 2022
1 parent c7883ae commit 651ecea
Show file tree
Hide file tree
Showing 8 changed files with 73 additions and 48 deletions.
34 changes: 24 additions & 10 deletions doc/command-line.md
Original file line number Diff line number Diff line change
Expand Up @@ -250,31 +250,45 @@ for matches to the query signature. It can search for matches with either
high [Jaccard similarity](https://en.wikipedia.org/wiki/Jaccard_index)
or containment; the default is to use Jaccard similarity, unless
`--containment` is specified. `-o/--output` will create a CSV file
containing the matches.
containing all of the matches with their respective similarity or containment scores.

`search` makes use of [indexed databases](#loading-many-signatures) to
decrease search time and memory where possible.

Usage:
```
sourmash search query.sig [ list of signatures or SBTs ]
sourmash search query.sig <signatures or databases>
```

Example output:

```
49 matches; showing first 20:
% sourmash search tests/test-data/47.fa.sig gtdb-rs207.genomic-reps.dna.k31.zip
...
--
loaded 65703 total signatures from 1 locations.
after selecting signatures compatible with search, 65703 remain.
2 matches above threshold 0.080:
similarity match
---------- -----
75.4% NZ_JMGW01000001.1 Escherichia coli 1-176-05_S4_C2 e117605...
72.2% NZ_GG774190.1 Escherichia coli MS 196-1 Scfld2538, whole ...
71.4% NZ_JMGU01000001.1 Escherichia coli 2-011-08_S3_C2 e201108...
70.1% NZ_JHRU01000001.1 Escherichia coli strain 100854 100854_1...
69.0% NZ_JH659569.1 Escherichia coli M919 supercont2.1, whole g...
...
32.3% GCF_900456975.1 Shewanella baltica strain=NCTC10735, 5088...
14.0% GCF_002838165.1 Shewanella sp. Pdp11 strain=Pdp11, ASM283...
```

Note, as of sourmash 4.2.0, `search` supports `--picklist`, to
`search` takes a number of command-line options:
* `--containment` - find matches using the containment index rather than Jaccard similarity;
* `--max-containment` - find matches using the max containment index rather than Jaccard similarity;
* `-t/--threshold` - lower threshold for matching; defaults to 0.08;
* `--best-only` - find and report only the best match;
* `-n/--num-results` - number of matches to report to stdout; defaults to 3; 0 to report all;

Match information can be saved to a CSV file with `-o/--output`; with
`-o`, all matches above the threshold will be saved, not just those
printed to stdout (which are limited to `-n/--num-results`).

As of sourmash 4.2.0, `search` supports `--picklist`, to
[select a subset of signatures to search, based on a CSV file](#using-picklists-to-subset-large-collections-of-signatures). This
can be used to search only a small subset of a large collection, or to
exclude a few signatures from a collection, without modifying the
Expand Down
4 changes: 2 additions & 2 deletions src/sourmash/cli/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def subparser(subparsers):
help='output debug information'
)
subparser.add_argument(
'--threshold', metavar='T', default=0.08, type=float,
'-t', '--threshold', metavar='T', default=0.08, type=float,
help='minimum threshold for reporting matches; default=0.08'
)
subparser.add_argument(
Expand All @@ -74,7 +74,7 @@ def subparser(subparsers):
)
subparser.add_argument(
'-n', '--num-results', default=3, type=int, metavar='N',
help='number of results to report'
help='number of results to display to user; 0 to report all'
)
subparser.add_argument(
'--containment', action='store_true',
Expand Down
7 changes: 4 additions & 3 deletions src/sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,10 +538,10 @@ def search(args):
args.num_results = 1

if not args.num_results or n_matches <= args.num_results:
print_results('{} matches:'.format(len(results)))
print_results(f'{len(results)} matches above threshold {args.threshold:0.3f}:')
else:
print_results('{} matches; showing first {}:',
len(results), args.num_results)
print_results(f'{len(results)} matches above threshold {args.threshold:0.3f}; showing first {args.num_results}:')

n_matches = args.num_results

size_may_be_inaccurate = False
Expand Down Expand Up @@ -588,6 +588,7 @@ def search(args):
if jaccard_ani_untrustworthy:
notify("WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons.")


def categorize(args):
"Use a database to find the best match to many signatures."
from .index import MultiIndex
Expand Down
2 changes: 1 addition & 1 deletion tests/test_cmd_signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -3596,7 +3596,7 @@ def test_import_mash_csv_to_sig(runtmp):

print("RUNTEMP", runtmp)

assert '1 matches:' in runtmp.last_result.out
assert '1 matches' in runtmp.last_result.out
assert '100.0% short.fa' in runtmp.last_result.out


Expand Down
12 changes: 6 additions & 6 deletions tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -688,7 +688,7 @@ def test_zipfile_protein_command_search(runtmp):
db_out = utils.get_test_data('prot/protein.zip')

c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0')
assert '2 matches:' in c.last_result.out
assert '2 matches' in c.last_result.out

c.run_sourmash('gather', sigfile1, db_out)
assert 'found 1 matches total' in c.last_result.out
Expand All @@ -703,7 +703,7 @@ def test_zipfile_hp_command_search(runtmp):
db_out = utils.get_test_data('prot/hp.zip')

c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0')
assert '2 matches:' in c.last_result.out
assert '2 matches' in c.last_result.out

c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0')
assert 'found 1 matches total' in c.last_result.out
Expand All @@ -718,7 +718,7 @@ def test_zipfile_dayhoff_command_search(runtmp):
db_out = utils.get_test_data('prot/dayhoff.zip')

c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0')
assert '2 matches:' in c.last_result.out
assert '2 matches' in c.last_result.out

c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0')
assert 'found 1 matches total' in c.last_result.out
Expand All @@ -733,7 +733,7 @@ def test_zipfile_protein_command_search_combined(runtmp):
db_out = utils.get_test_data('prot/all.zip')

c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0')
assert '2 matches:' in c.last_result.out
assert '2 matches' in c.last_result.out

c.run_sourmash('gather', sigfile1, db_out)
assert 'found 1 matches total' in c.last_result.out
Expand All @@ -748,7 +748,7 @@ def test_zipfile_hp_command_search_combined(runtmp):
db_out = utils.get_test_data('prot/all.zip')

c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0')
assert '2 matches:' in c.last_result.out
assert '2 matches' in c.last_result.out

c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0')
assert 'found 1 matches total' in c.last_result.out
Expand All @@ -763,7 +763,7 @@ def test_zipfile_dayhoff_command_search_combined(runtmp):
db_out = utils.get_test_data('prot/all.zip')

c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0')
assert '2 matches:' in c.last_result.out
assert '2 matches' in c.last_result.out

c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0')
assert 'found 1 matches total' in c.last_result.out
Expand Down
6 changes: 3 additions & 3 deletions tests/test_lca.py
Original file line number Diff line number Diff line change
Expand Up @@ -2659,7 +2659,7 @@ def test_lca_db_protein_command_search(c):
db_out = utils.get_test_data('prot/protein.lca.json.gz')

c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0')
assert '2 matches:' in c.last_result.out
assert '2 matches' in c.last_result.out

c.run_sourmash('gather', sigfile1, db_out)
assert 'found 1 matches total' in c.last_result.out
Expand Down Expand Up @@ -2770,7 +2770,7 @@ def test_lca_db_hp_command_search(c):
db_out = utils.get_test_data('prot/hp.lca.json.gz')

c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0')
assert '2 matches:' in c.last_result.out
assert '2 matches' in c.last_result.out

c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0')
assert 'found 1 matches total' in c.last_result.out
Expand Down Expand Up @@ -2881,7 +2881,7 @@ def test_lca_db_dayhoff_command_search(c):
db_out = utils.get_test_data('prot/dayhoff.lca.json.gz')

c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0')
assert '2 matches:' in c.last_result.out
assert '2 matches' in c.last_result.out

c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0')
assert 'found 1 matches total' in c.last_result.out
Expand Down
6 changes: 3 additions & 3 deletions tests/test_sbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -1049,7 +1049,7 @@ def test_sbt_protein_command_search(c):
db_out = utils.get_test_data('prot/protein.sbt.zip')

c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0')
assert '2 matches:' in c.last_result.out
assert '2 matches' in c.last_result.out

c.run_sourmash('gather', sigfile1, db_out)
assert 'found 1 matches total' in c.last_result.out
Expand Down Expand Up @@ -1096,7 +1096,7 @@ def test_sbt_hp_command_search(c):
db_out = utils.get_test_data('prot/hp.sbt.zip')

c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0')
assert '2 matches:' in c.last_result.out
assert '2 matches' in c.last_result.out

c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0')
assert 'found 1 matches total' in c.last_result.out
Expand Down Expand Up @@ -1143,7 +1143,7 @@ def test_sbt_dayhoff_command_search(c):
db_out = utils.get_test_data('prot/dayhoff.sbt.zip')

c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0')
assert '2 matches:' in c.last_result.out
assert '2 matches' in c.last_result.out

c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0')
assert 'found 1 matches total' in c.last_result.out
Expand Down
Loading

0 comments on commit 651ecea

Please sign in to comment.