Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] report both weighted and unweighted % recovered in gather #2301

Merged
merged 8 commits into from
Sep 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion doc/classifying-signatures.md
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,10 @@ The output below is the CSV output for a fictional metagenome.

The first column, `f_unique_to_query`, is the fraction of the database
match that is _unique_ with respect to the original query. It will
always decrease as you get more matches.
always decrease as you get more matches. The sum of
`f_unique_to_query` across all rows is what is reported in by gather
as the fraction of query k-mers hit by the recovered matches
(unweighted) and should never be greater than 1!

The second column, `f_match_orig`, is how much of the match is in the
_original_ query. For this fictional metagenome, each match is
Expand All @@ -236,6 +239,9 @@ f_unique_to_query f_match_orig f_match f_orig_query
0.10709413369713507 1.0 1.0 0.10709413369713507
0.10368349249658936 1.0 0.3134020618556701 0.33083219645293316
```
Where there are overlapping matches (e.g. to multiple
*E. coli* species in a gut metagenome) the overlaps will be represented
multiple times in this column.

A few quick notes for the algorithmic folk out there --

Expand Down
33 changes: 24 additions & 9 deletions src/sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -812,7 +812,11 @@ def gather(args):
estimate_ani_ci=args.estimate_ani_ci)

screen_width = _get_screen_width()
for result, weighted_missed in gather_iter:
sum_f_uniq_found = 0.
result = None
for result in gather_iter:
sum_f_uniq_found += result.f_unique_to_query

if not len(found): # first result? print header.
if is_abundance:
print_results("")
Expand Down Expand Up @@ -854,11 +858,13 @@ def gather(args):
if args.num_results and len(found) == args.num_results:
print_results(f'(truncated gather because --num-results={args.num_results})')

p_covered = (1 - weighted_missed) * 100
if is_abundance:
print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query')
else:
print_results(f'the recovered matches hit {p_covered:.1f}% of the query (unweighted)')
if is_abundance and result:
p_covered = result.sum_weighted_found / result.total_weighted_hashes
p_covered *= 100
print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query.')

print_results(f'the recovered matches hit {sum_f_uniq_found*100:.1f}% of the query k-mers (unweighted).')

print_results('')
if gather_iter.scaled != query.minhash.scaled:
print_results(f'WARNING: final scaled was {gather_iter.scaled}, vs query scaled of {query.minhash.scaled}')
Expand Down Expand Up @@ -932,6 +938,8 @@ def multigather(args):
# need a query to get ksize, moltype for db loading
query = next(iter(sourmash_args.load_file_as_signatures(inp_files[0], ksize=args.ksize, select_moltype=moltype)))

notify(f'loaded first query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})')

databases = sourmash_args.load_dbs_and_sigs(args.db, query, False,
fail_on_empty_database=args.fail_on_empty_database)

Expand Down Expand Up @@ -994,7 +1002,10 @@ def multigather(args):
ident_mh=ident_mh)

screen_width = _get_screen_width()
for result, weighted_missed in gather_iter:
sum_f_uniq_found = 0.
result = None
for result in gather_iter:
sum_f_uniq_found += result.f_unique_to_query
if not len(found): # first result? print header.
if is_abundance:
print_results("")
Expand Down Expand Up @@ -1035,8 +1046,12 @@ def multigather(args):
# basic reporting
print_results('\nfound {} matches total;', len(found))

print_results('the recovered matches hit {:.1f}% of the query',
(1 - weighted_missed) * 100)
if is_abundance and result:
p_covered = result.sum_weighted_found / result.total_weighted_hashes
p_covered *= 100
print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query.')

print_results(f'the recovered matches hit {sum_f_uniq_found*100:.1f}% of the query k-mers (unweighted).')
print_results('')

if not found:
Expand Down
5 changes: 2 additions & 3 deletions src/sourmash/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -783,12 +783,11 @@ def __next__(self):
new_query_mh.remove_many(found_mh)
new_query = SourmashSignature(new_query_mh)

# compute weighted_missed for remaining query hashes
# compute weighted information for remaining query hashes
query_hashes = set(query_mh.hashes) - set(found_mh.hashes)
n_weighted_missed = sum((orig_query_abunds[k] for k in query_hashes))
n_weighted_missed += self.noident_query_sum_abunds
sum_weighted_found = sum_abunds - n_weighted_missed
weighted_missed = n_weighted_missed / sum_abunds

# build a GatherResult
result = GatherResult(self.orig_query, best_match,
Expand All @@ -809,7 +808,7 @@ def __next__(self):
self.query = new_query
self.orig_query_mh = orig_query_mh

return result, weighted_missed
return result


###
Expand Down
57 changes: 54 additions & 3 deletions tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -3451,12 +3451,27 @@ def approx_equal(a, b, n=5):
remaining_mh.remove_many(match.minhash.hashes.keys())


def test_gather_nomatch(runtmp):
def test_gather_nomatch(runtmp, linear_gather, prefetch_gather):
testdata_query = utils.get_test_data(
'gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig')
testdata_match = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')

runtmp.sourmash('gather', testdata_query, testdata_match)
runtmp.sourmash('gather', testdata_query, testdata_match,
linear_gather, prefetch_gather)

print(runtmp.last_result.out)
print(runtmp.last_result.err)

assert 'found 0 matches total' in runtmp.last_result.out
assert 'the recovered matches hit 0.0% of the query' in runtmp.last_result.out


def test_gather_abund_nomatch(runtmp, linear_gather, prefetch_gather):
testdata_query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig')
testdata_match = utils.get_test_data('gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig')

runtmp.sourmash('gather', testdata_query, testdata_match,
linear_gather, prefetch_gather)

print(runtmp.last_result.out)
print(runtmp.last_result.err)
Expand Down Expand Up @@ -4562,6 +4577,7 @@ def test_gather_abund_1_1(runtmp, linear_gather, prefetch_gather):
assert 'genome-s12.fa.gz' not in out

assert "the recovered matches hit 100.0% of the abundance-weighted query" in out
assert "the recovered matches hit 100.0% of the query k-mers (unweighted)" in out


def test_gather_abund_10_1(runtmp, prefetch_gather, linear_gather):
Expand Down Expand Up @@ -4701,7 +4717,8 @@ def test_gather_abund_10_1_ignore_abundance(runtmp, linear_gather, prefetch_gath

print(out)
print(err)
assert "the recovered matches hit 100.0% of the query (unweighted)" in out
assert "the recovered matches hit 100.0% of the abundance-weighted query" not in out
assert "the recovered matches hit 100.0% of the query k-mers (unweighted)" in out

# when we project s10x10-s11 (r2+r3), 10:1 abundance,
# onto s10 and s11 genomes with gather --ignore-abundance, we get:
Expand Down Expand Up @@ -4803,6 +4820,10 @@ def test_multigather_output_unassigned_with_abundance(runtmp):
print(c.last_result.out)
print(c.last_result.err)

out = c.last_result.out
assert "the recovered matches hit 91.0% of the abundance-weighted query." in out
assert "the recovered matches hit 57.2% of the query k-mers (unweighted)." in out

assert os.path.exists(c.output('r3.fa.unassigned.sig'))

nomatch = sourmash.load_one_signature(c.output('r3.fa.unassigned.sig'))
Expand Down Expand Up @@ -4857,6 +4878,36 @@ def test_multigather_empty_db_nofail(runtmp):
assert "loaded 50 total signatures from 2 locations" in err
assert "after selecting signatures compatible with search, 0 remain." in err


def test_multigather_nomatch(runtmp):
testdata_query = utils.get_test_data(
'gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig')
testdata_match = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')

runtmp.sourmash('multigather', '--query', testdata_query,
'--db', testdata_match, '-k', '31')

print(runtmp.last_result.out)
print(runtmp.last_result.err)

assert 'found 0 matches total' in runtmp.last_result.out
assert 'the recovered matches hit 0.0% of the query' in runtmp.last_result.out


def test_multigather_abund_nomatch(runtmp):
testdata_query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig')
testdata_match = utils.get_test_data('gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig')

runtmp.sourmash('multigather', '--query', testdata_query,
'--db', testdata_match)

print(runtmp.last_result.out)
print(runtmp.last_result.err)

assert 'found 0 matches total' in runtmp.last_result.out
assert 'the recovered matches hit 0.0% of the query' in runtmp.last_result.out


def test_sbt_categorize(runtmp):
testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
testdata2 = utils.get_test_data('genome-s11.fa.gz.sig')
Expand Down