Skip to content

Commit

Permalink
[MRG] add optional reporting for duplicated names in sketch fromfile (#…
Browse files Browse the repository at this point in the history
…2580)

When `sketch fromfile` encounters duplicated names, it exits with an
error. This PR introduces an option, `--report-duplicated`, to report
those names for faster/easier debugging.

We could instead do this by default, if we want.
  • Loading branch information
bluegenes authored Apr 20, 2023
1 parent 7a9fed0 commit ee69c50
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 0 deletions.
4 changes: 4 additions & 0 deletions src/sourmash/cli/sketch/fromfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ def subparser(subparsers):
'--output-manifest-matching',
help='output a manifest file of already-existing signatures'
)
file_args.add_argument(
'--report-duplicated', action='store_true',
help='report duplicated names'
)


def main(args):
Expand Down
4 changes: 4 additions & 0 deletions src/sourmash/command_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,7 @@ def fromfile(args):
skipped_sigs = 0
n_missing_name = 0
n_duplicate_name = 0
duplicate_names = set()

for csvfile in args.csvs:
with sourmash_args.FileInputCSV(csvfile) as r:
Expand All @@ -439,11 +440,14 @@ def fromfile(args):

if name in all_names:
n_duplicate_name += 1
duplicate_names.add(name)
else:
all_names[name] = (genome, proteome)

fail_exit = False
if n_duplicate_name:
if args.report_duplicated:
notify("duplicated:\n" + '\n'.join(sorted(duplicate_names)))
error(f"** ERROR: {n_duplicate_name} entries have duplicate 'name' records. Exiting!")
fail_exit = True

Expand Down
22 changes: 22 additions & 0 deletions tests/test_sourmash_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -1796,6 +1796,28 @@ def test_fromfile_dna_and_protein_dup_name(runtmp):

print(out)
print(err)
assert "GCA_903797575 Salmonella enterica" not in err
assert "ERROR: 1 entries have duplicate 'name' records. Exiting!" in err


def test_fromfile_dna_and_protein_dup_name_report(runtmp):
    """Check that `--report-duplicated` lists the duplicated name.

    The same CSV is handed to `sketch fromfile` twice so that its 'name'
    record is seen more than once. The command must still fail, but with
    the flag set the offending name has to show up in stderr alongside
    the usual duplicate-name error.
    """
    # Stage the fixture directory inside the temporary working location.
    fixture_dir = utils.get_test_data('sketch_fromfile')
    shutil.copytree(fixture_dir, runtmp.output('sketch_fromfile'))

    csv_path = 'sketch_fromfile/salmonella.csv'
    cmd = [
        'sketch', 'fromfile',
        csv_path,
        csv_path,  # deliberate repeat -> duplicate names
        '--report-duplicated',
        '-o', 'out.zip', '-p', 'dna', '-p', 'protein',
    ]

    # Duplicate names are a hard error, reported or not.
    with pytest.raises(SourmashCommandFailed):
        runtmp.sourmash(*cmd)

    stdout_text = runtmp.last_result.out
    stderr_text = runtmp.last_result.err

    # Echo captured streams so they appear in pytest's failure report.
    print(stdout_text)
    print(stderr_text)

    expected_fragments = (
        "GCA_903797575 Salmonella enterica",
        "ERROR: 1 entries have duplicate 'name' records. Exiting!",
    )
    for fragment in expected_fragments:
        assert fragment in stderr_text


Expand Down

0 comments on commit ee69c50

Please sign in to comment.