diff --git a/src/sourmash/cli/tax/annotate.py b/src/sourmash/cli/tax/annotate.py index 5086e1c18e..e0c17b0019 100644 --- a/src/sourmash/cli/tax/annotate.py +++ b/src/sourmash/cli/tax/annotate.py @@ -23,7 +23,7 @@ def subparser(subparsers): aliases=['annotate'], usage=usage) subparser.add_argument( - '-g', '--gather-csv', nargs='*', default = [], + '-g', '--gather-csv', nargs='*', default = [], action='extend', help='CSV output files from sourmash gather' ) subparser.add_argument( @@ -36,7 +36,7 @@ def subparser(subparsers): ) subparser.add_argument( '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE', - nargs="+", required=True, + nargs='*', required=True, action="extend", help='database lineages CSV' ) subparser.add_argument( diff --git a/src/sourmash/cli/tax/genome.py b/src/sourmash/cli/tax/genome.py index 9a9c3fbad3..54c40fd681 100644 --- a/src/sourmash/cli/tax/genome.py +++ b/src/sourmash/cli/tax/genome.py @@ -41,7 +41,7 @@ def subparser(subparsers): aliases=['classify'], usage=usage) subparser.add_argument( - '-g', '--gather-csv', nargs='*', default = [], + '-g', '--gather-csv', action='extend', nargs='*', default = [], help='CSVs output by sourmash gather for this sample' ) subparser.add_argument( @@ -54,7 +54,7 @@ def subparser(subparsers): ) subparser.add_argument( '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE', - nargs='+', required=True, + nargs='*', required=True, action='extend', help='database lineages CSV' ) subparser.add_argument( @@ -82,7 +82,8 @@ def subparser(subparsers): help='fail quickly if taxonomy is not available for an identifier', ) subparser.add_argument( - '-F', '--output-format', default=['csv_summary'], nargs='+', choices=["csv_summary", "krona", "human", "lineage_csv"], + '-F', '--output-format', default=[], nargs='*', action='extend', + choices=["csv_summary", "krona", "human", "lineage_csv"], help='choose output format(s)', ) subparser.add_argument( @@ -102,4 +103,8 @@ def main(args): if not args.rank: if any(x in ["krona"] for x in args.output_format): raise ValueError(f"Rank (--rank) is required for krona output format.") + if not args.output_format: + # change to "human" for 5.0 + args.output_format = ["csv_summary"] + return sourmash.tax.__main__.genome(args) diff --git a/src/sourmash/cli/tax/grep.py b/src/sourmash/cli/tax/grep.py index 2c88f5dc99..003c152dc1 100644 --- a/src/sourmash/cli/tax/grep.py +++ b/src/sourmash/cli/tax/grep.py @@ -55,7 +55,7 @@ def subparser(subparsers): ) subparser.add_argument( '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE', - nargs="+", required=True, + nargs="+", required=True, action="extend", help='database lineages' ) subparser.add_argument( diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py index 2f00ec4ddb..0bc7ab7c4e 100644 --- a/src/sourmash/cli/tax/metagenome.py +++ b/src/sourmash/cli/tax/metagenome.py @@ -30,7 +30,7 @@ def subparser(subparsers): aliases=['summarize'], usage=usage) subparser.add_argument( - '-g', '--gather-csv', nargs='*', default = [], + '-g', '--gather-csv', action="extend", nargs='*', default = [], help='CSVs from sourmash gather' ) subparser.add_argument( @@ -51,7 +51,7 @@ def subparser(subparsers): ) subparser.add_argument( '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE', - nargs='+', required=True, + action="extend", nargs='+', required=True, help='database lineages CSV' ) subparser.add_argument( @@ -67,7 +67,8 @@ def subparser(subparsers): help='fail quickly if taxonomy is not available for an identifier', ) subparser.add_argument( - '-F', '--output-format', default=['csv_summary'], nargs='+', choices=["human", "csv_summary", "krona", "lineage_summary", "kreport"], + '-F', '--output-format', default=[], nargs='*', action="extend", + choices=["human", "csv_summary", "krona", "lineage_summary", "kreport"], help='choose output format(s)', ) subparser.add_argument( @@ -89,4 +90,7 @@ def main(args): if not args.rank: if any(x in ["krona", "lineage_summary"] for x in args.output_format): raise ValueError(f"Rank (--rank) is required for krona and lineage_summary output formats.") + if not args.output_format: + # change to "human" for 5.0 + args.output_format = ["csv_summary"] return sourmash.tax.__main__.metagenome(args) diff --git a/src/sourmash/cli/tax/prepare.py b/src/sourmash/cli/tax/prepare.py index 01d978c73e..de2e58521b 100644 --- a/src/sourmash/cli/tax/prepare.py +++ b/src/sourmash/cli/tax/prepare.py @@ -25,7 +25,7 @@ def subparser(subparsers): ) subparser.add_argument( '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE', - nargs="+", required=True, + nargs="+", required=True, action="extend", help='database lineages' ) subparser.add_argument( diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 6a6fd69545..f9a274d5c6 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -364,6 +364,7 @@ def prepare(args): notify("loading taxonomies...") try: tax_assign = MultiLineageDB.load(args.taxonomy_csv, + force=args.force, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions) except ValueError as exc: diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 9c8dc3b956..0fdc488313 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -1076,6 +1076,8 @@ def _save_csv(self, fp): @classmethod def load(cls, locations, **kwargs): "Load one or more taxonomies from the given location(s)" + force = kwargs.get('force', False) + if isinstance(locations, str): raise TypeError("'locations' should be a list, not a string") @@ -1098,12 +1100,14 @@ def load(cls, locations, **kwargs): loaded = True except (ValueError, csv.Error) as exc: # for the last loader, just pass along ValueError... - raise ValueError(f"cannot read taxonomy assignments from '{location}': {str(exc)}") + if not force: + raise ValueError(f"cannot read taxonomy assignments from '{location}': {str(exc)}") # nothing loaded, goodbye! - if not loaded: + if not loaded and not force: raise ValueError(f"cannot read taxonomy assignments from '{location}'") - tax_assign.add(this_tax_assign) + if loaded: + tax_assign.add(this_tax_assign) return tax_assign diff --git a/tests/test_tax.py b/tests/test_tax.py index 0efa9863a2..da922355f4 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -142,6 +142,27 @@ def test_metagenome_summary_csv_out(runtmp): assert 'test1,species,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000' in sum_gather_results[22] +def test_metagenome_summary_csv_out_empty_gather_force(runtmp): + # test multiple -g, empty -g file, and --force + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + csv_base = "out" + sum_csv = csv_base + ".summarized.csv" + csvout = runtmp.output(sum_csv) + outdir = os.path.dirname(csvout) + + gather_empty = runtmp.output('g.csv') + with open(gather_empty, "w") as fp: + fp.write("") + print("g_csv: ", gather_empty) + + runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', g_csv, '-g', gather_empty, '--taxonomy-csv', tax, '-o', csv_base, '--output-dir', outdir, '-f') + sum_gather_results = [x.rstrip() for x in open(csvout)] + assert f"saving 'csv_summary' output to '{csvout}'" in runtmp.last_result.err + assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in sum_gather_results[0] + assert 'test1,superkingdom,0.2042281611487834,d__Bacteria,md5,test1.sig,0.13080306238801107,1024000' in sum_gather_results[1] + + def test_metagenome_kreport_out(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') tax = utils.get_test_data('tax/test.taxonomy.csv') @@ -180,7 +201,6 @@ def test_metagenome_kreport_out(runtmp): assert ['0.02', '138000', '', 'S', '', 's__Phocaeicola vulgatus'] == kreport_results[15] - def test_metagenome_krona_tsv_out(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') tax = utils.get_test_data('tax/test.taxonomy.csv') @@ -476,6 +496,66 @@ def test_metagenome_multiple_taxonomy_files(runtmp): assert 'multtest,class,0.116,Bacteria;Bacteroidetes;Bacteroidia,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000' in c.last_result.out +def test_metagenome_multiple_taxonomy_files_multiple_taxonomy_args(runtmp): + c = runtmp + # pass in mult tax files using mult tax arguments + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + protozoa_genbank = utils.get_test_data('tax/protozoa_genbank_lineage.csv') + bacteria_refseq = utils.get_test_data('tax/bacteria_refseq_lineage.csv') + + # gather against mult databases + g_csv = utils.get_test_data('tax/test1_x_gtdbrs202_genbank_euks.gather.csv') + + c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', taxonomy_csv, '-t', protozoa_genbank, '-t', bacteria_refseq) + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out + assert 'multtest,superkingdom,0.204,Bacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.131,1024000' in c.last_result.out + assert 'multtest,superkingdom,0.051,Eukaryota,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000' in c.last_result.out + assert 'multtest,superkingdom,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000' in c.last_result.out + assert 'multtest,phylum,0.116,Bacteria;Bacteroidetes,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000' in c.last_result.out + assert 'multtest,phylum,0.088,Bacteria;Proteobacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.058,442000' in c.last_result.out + assert 'multtest,phylum,0.051,Eukaryota;Apicomplexa,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000' in c.last_result.out + assert 'multtest,phylum,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000' in c.last_result.out + assert 'multtest,class,0.116,Bacteria;Bacteroidetes;Bacteroidia,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000' in c.last_result.out + + +def test_metagenome_multiple_taxonomy_files_multiple_taxonomy_args_empty_force(runtmp): + # pass in mult tax files using mult tax arguments, with one empty, + # and use --force + c = runtmp + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + protozoa_genbank = utils.get_test_data('tax/protozoa_genbank_lineage.csv') + bacteria_refseq = utils.get_test_data('tax/bacteria_refseq_lineage.csv') + + tax_empty = runtmp.output('t.csv') + g_csv = utils.get_test_data('tax/test1.gather.csv') + + with open(tax_empty, "w") as fp: + fp.write("") + print("t_csv: ", tax_empty) + + # gather against mult databases + g_csv = utils.get_test_data('tax/test1_x_gtdbrs202_genbank_euks.gather.csv') + + c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', taxonomy_csv, '-t', protozoa_genbank, '-t', bacteria_refseq, '-t', tax_empty, '--force') + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out + assert 'multtest,superkingdom,0.204,Bacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.131,1024000' in c.last_result.out + assert 'multtest,superkingdom,0.051,Eukaryota,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000' in c.last_result.out + assert 'multtest,superkingdom,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000' in c.last_result.out + assert 'multtest,phylum,0.116,Bacteria;Bacteroidetes,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000' in c.last_result.out + assert 'multtest,phylum,0.088,Bacteria;Proteobacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.058,442000' in c.last_result.out + assert 'multtest,phylum,0.051,Eukaryota;Apicomplexa,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000' in c.last_result.out + assert 'multtest,phylum,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000' in c.last_result.out + assert 'multtest,class,0.116,Bacteria;Bacteroidetes;Bacteroidia,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000' in c.last_result.out + + def test_metagenome_empty_gather_results(runtmp): tax = utils.get_test_data('tax/test.taxonomy.csv') @@ -513,6 +593,7 @@ def test_metagenome_bad_gather_header(runtmp): def test_metagenome_empty_tax_lineage_input(runtmp): + # test an empty tax CSV tax_empty = runtmp.output('t.csv') g_csv = utils.get_test_data('tax/test1.gather.csv') @@ -532,6 +613,27 @@ def test_metagenome_empty_tax_lineage_input(runtmp): assert "cannot read taxonomy assignments from" in str(exc.value) +def test_metagenome_empty_tax_lineage_input_force(runtmp): + # test an empty tax CSV with --force + tax_empty = runtmp.output('t.csv') + g_csv = utils.get_test_data('tax/test1.gather.csv') + + with open(tax_empty, "w") as fp: + fp.write("") + print("t_csv: ", tax_empty) + + + with pytest.raises(SourmashCommandFailed) as exc: + runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax_empty, '--force') + + print(runtmp.last_result.status) + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert runtmp.last_result.status != 0 + assert "ERROR: No taxonomic assignments loaded" in str(exc.value) + + def test_metagenome_perfect_match_warning(runtmp): tax = utils.get_test_data('tax/test.taxonomy.csv') g_csv = utils.get_test_data('tax/test1.gather.csv') @@ -649,6 +751,8 @@ def test_metagenome_gather_duplicate_query_force(runtmp): def test_metagenome_gather_duplicate_filename(runtmp): + # test that a duplicate filename is properly flagged, when passed in + # twice to a single -g argument. c = runtmp taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') g_res = utils.get_test_data('tax/test1.gather.csv') @@ -665,6 +769,24 @@ def test_metagenome_gather_duplicate_filename(runtmp): assert 'test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000' in c.last_result.out +def test_metagenome_gather_duplicate_filename_2(runtmp): + # test that a duplicate filename is properly flagged, with -g a -g b + c = runtmp + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + g_res = utils.get_test_data('tax/test1.gather.csv') + + c.run_sourmash('tax', 'metagenome', '--gather-csv', g_res, '-g', g_res, '--taxonomy-csv', taxonomy_csv) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert f'ignoring duplicated reference to file: {g_res}' + assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out + assert 'test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000' in c.last_result.out + + def test_metagenome_gather_duplicate_filename_from_file(runtmp): c = runtmp taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') @@ -723,6 +845,7 @@ def test_genome_bad_gather_header(runtmp): def test_genome_empty_tax_lineage_input(runtmp): + # test an empty tax csv tax_empty = runtmp.output('t.csv') g_csv = utils.get_test_data('tax/test1.gather.csv') @@ -730,7 +853,6 @@ def test_genome_empty_tax_lineage_input(runtmp): fp.write("") print("t_csv: ", tax_empty) - with pytest.raises(SourmashCommandFailed) as exc: runtmp.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax_empty) @@ -956,12 +1078,45 @@ def test_genome_gather_two_files(runtmp): assert 'test2,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test2.sig,0.057,444000.0' in c.last_result.out +def test_genome_gather_two_files_empty_force(runtmp): + # make test2 results (identical to test1 except query_name and filename) + # add an empty file too, with --force -> should work + c = runtmp + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + g_res = utils.get_test_data('tax/test1.gather.csv') + + g_empty_csv = runtmp.output('g_empty.csv') + with open(g_empty_csv, "w") as fp: + fp.write("") + print("g_csv: ", g_empty_csv) + + g_res2 = runtmp.output("test2.gather.csv") + test2_results = [x.replace("test1", "test2") for x in open(g_res, 'r')] + with open(g_res2, 'w') as fp: + for line in test2_results: + fp.write(line) + + c.run_sourmash('tax', 'genome', '-g', g_res, g_res2, '-g', g_empty_csv, + '--taxonomy-csv', taxonomy_csv, + '--rank', 'species', '--containment-threshold', '0', + '--force') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out + assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0' in c.last_result.out + assert 'test2,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test2.sig,0.057,444000.0' in c.last_result.out + + def test_genome_gather_duplicate_filename(runtmp): c = runtmp taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') g_res = utils.get_test_data('tax/test1.gather.csv') - c.run_sourmash('tax', 'genome', '--gather-csv', g_res, g_res, '--taxonomy-csv', taxonomy_csv, + c.run_sourmash('tax', 'genome', '--gather-csv', g_res, '-g', g_res, '--taxonomy-csv', taxonomy_csv, '--rank', 'species', '--containment-threshold', '0') print(c.last_result.status) @@ -1225,6 +1380,29 @@ def test_genome_missing_taxonomy_ignore_threshold(runtmp): assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0' in c.last_result.out +def test_genome_missing_taxonomy_recover_with_second_tax_file(runtmp): + c = runtmp + # write temp taxonomy with missing entry + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + subset_csv = runtmp.output("subset_taxonomy.csv") + with open(subset_csv, 'w') as subset: + tax = [x.rstrip() for x in open(taxonomy_csv, 'r')] + tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry) + subset.write("\n".join(tax)) + + g_csv = utils.get_test_data('tax/test1.gather.csv') + + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv, '-t', taxonomy_csv, '--containment-threshold', '0') + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "The following are missing from the taxonomy information: GCF_001881345" not in c.last_result.err + assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out + assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0' in c.last_result.out + + def test_genome_missing_taxonomy_ignore_rank(runtmp): c = runtmp # write temp taxonomy with missing entry @@ -1247,6 +1425,70 @@ def test_genome_missing_taxonomy_ignore_rank(runtmp): assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out assert 'test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0' in c.last_result.out + +def test_genome_multiple_taxonomy_files(runtmp): + c = runtmp + # write temp taxonomy with missing entry + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + subset_csv = runtmp.output("subset_taxonomy.csv") + with open(subset_csv, 'w') as subset: + tax = [x.rstrip() for x in open(taxonomy_csv, 'r')] + tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry) + subset.write("\n".join(tax)) + + g_csv = utils.get_test_data('tax/test1.gather.csv') + + # using mult -t args + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv, '-t', taxonomy_csv) + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "The following are missing from the taxonomy information: GCF_001881345" not in c.last_result.err + assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out + assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000.0,' in c.last_result.out + # using single -t arg + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv, taxonomy_csv) + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "The following are missing from the taxonomy information: GCF_001881345" not in c.last_result.err + assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out + assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000.0,' in c.last_result.out + + +def test_genome_multiple_taxonomy_files_empty_force(runtmp): + c = runtmp + # write temp taxonomy with missing entry, as well as an empty file, + # and use force + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + subset_csv = runtmp.output("subset_taxonomy.csv") + with open(subset_csv, 'w') as subset: + tax = [x.rstrip() for x in open(taxonomy_csv, 'r')] + tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry) + subset.write("\n".join(tax)) + + g_csv = utils.get_test_data('tax/test1.gather.csv') + + empty_tax = runtmp.output('tax_empty.txt') + with open(empty_tax, "w") as fp: + fp.write("") + + # using mult -t args + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv, '-t', taxonomy_csv, '-t', empty_tax, '--force') + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "The following are missing from the taxonomy information: GCF_001881345" not in c.last_result.err + assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out + assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000.0,' in c.last_result.out + + def test_genome_missing_taxonomy_fail_threshold(runtmp): c = runtmp # write temp taxonomy with missing entry @@ -1694,7 +1936,7 @@ def test_annotate_no_gather_csv(runtmp): def test_annotate_0(runtmp): - # test annotate + # test annotate basics c = runtmp g_csv = utils.get_test_data('tax/test1.gather.csv') @@ -1722,6 +1964,40 @@ def test_annotate_0(runtmp): assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[4] +def test_annotate_gather_argparse(runtmp): + # test annotate with two gather CSVs, second one empty, and --force. + # this tests argparse handling w/extend. + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + csvout = runtmp.output("test1.gather.with-lineages.csv") + out_dir = os.path.dirname(csvout) + + g_empty_csv = runtmp.output('g_empty.csv') + with open(g_empty_csv, "w") as fp: + fp.write("") + print("g_csv: ", g_empty_csv) + + c.run_sourmash('tax', 'annotate', '--gather-csv', g_csv, + '-g', g_empty_csv, '--taxonomy-csv', tax, '-o', out_dir, + '--force') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert os.path.exists(csvout) + + lin_gather_results = [x.rstrip() for x in open(csvout)] + print("\n".join(lin_gather_results)) + assert f"saving 'annotate' output to '{csvout}'" in runtmp.last_result.err + + assert "lineage" in lin_gather_results[0] + assert "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in lin_gather_results[1] + + def test_annotate_0_db(runtmp): # test annotate with sqlite db c = runtmp @@ -1806,6 +2082,44 @@ def test_annotate_empty_tax_lineage_input(runtmp): assert "cannot read taxonomy assignments from" in str(exc.value) +def test_annotate_empty_tax_lineage_input_recover_with_second_taxfile(runtmp): + tax_empty = runtmp.output('t.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data('tax/test1.gather.csv') + + with open(tax_empty, "w") as fp: + fp.write("") + print("t_csv: ", tax_empty) + + runtmp.run_sourmash('tax', 'annotate', '-g', g_csv, '-t', tax_empty, '--taxonomy-csv', tax, '--force') + + print(runtmp.last_result.status) + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert runtmp.last_result.status == 0 + + +def test_annotate_empty_tax_lineage_input_recover_with_second_taxfile_2(runtmp): + # test with empty tax second, to check on argparse handling + tax_empty = runtmp.output('t.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data('tax/test1.gather.csv') + + with open(tax_empty, "w") as fp: + fp.write("") + print("t_csv: ", tax_empty) + + runtmp.run_sourmash('tax', 'annotate', '-g', g_csv, + '--taxonomy-csv', tax, '-t', tax_empty, '--force') + + print(runtmp.last_result.status) + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert runtmp.last_result.status == 0 + + def test_tax_prepare_1_csv_to_csv(runtmp, keep_identifiers, keep_versions): # CSV -> CSV; same assignments tax = utils.get_test_data('tax/test.taxonomy.csv') @@ -1888,6 +2202,43 @@ def test_tax_prepare_1_csv_to_csv_empty_ranks(runtmp, keep_identifiers, keep_ver assert set(db1) == set(db2) +def test_tax_prepare_1_csv_to_csv_empty_file(runtmp, keep_identifiers, keep_versions): + # CSV -> CSV with an empty input file and --force + # tests argparse extend + tax = utils.get_test_data('tax/test-empty-ranks.taxonomy.csv') + tax_empty = runtmp.output('t.csv') + taxout = runtmp.output('out.csv') + + with open(tax_empty, "w") as fp: + fp.write("") + print("t_csv: ", tax_empty) + + args = [] + if keep_identifiers: + args.append('--keep-full-identifiers') + if keep_versions: + args.append('--keep-identifier-versions') + + # this is an error - can't strip versions if not splitting identifiers + if keep_identifiers and not keep_versions: + with pytest.raises(SourmashCommandFailed): + runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', + taxout, '-F', 'csv', *args) + return + + runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-t', tax_empty, '-o', + taxout, '-F', 'csv', *args, '--force') + assert os.path.exists(taxout) + + db1 = tax_utils.MultiLineageDB.load([tax], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions) + + db2 = tax_utils.MultiLineageDB.load([taxout]) + + assert set(db1) == set(db2) + + def test_tax_prepare_1_csv_to_csv_empty_ranks_2(runtmp, keep_identifiers, keep_versions): # CSV -> CSV; same assignments for situations with empty internal ranks tax = utils.get_test_data('tax/test-empty-ranks-2.taxonomy.csv') @@ -2216,6 +2567,7 @@ def test_tax_prepare_sqlite_lineage_version(runtmp): with pytest.raises(IndexNotSupported): db = tax_utils.MultiLineageDB.load([taxout]) + def test_tax_prepare_sqlite_no_lineage(): # no lineage table at all sqldb = utils.get_test_data('sqlite/index.sqldb') @@ -2450,6 +2802,37 @@ def test_tax_grep_multiple_csv(runtmp): assert 'GCF_001881345.1' in names +def test_tax_grep_multiple_csv_empty_force(runtmp): + # grep on multiple CSVs, one empty, with --force + tax1 = utils.get_test_data('tax/test.taxonomy.csv') + tax2 = utils.get_test_data('tax/protozoa_genbank_lineage.csv') + tax_empty = runtmp.output('t.csv') + + taxout = runtmp.output('out.csv') + with open(tax_empty, "w") as fp: + fp.write("") + print("t_csv: ", tax_empty) + + runtmp.sourmash('tax', 'grep', "Toxo|Gamma", + '-t', tax1, tax2, '-t', tax_empty, + '-o', taxout, '--force') + + out = runtmp.last_result.out + err = runtmp.last_result.err + + assert not out + assert "found 4 matches" in err + + lines = open(taxout).readlines() + assert len(lines) == 5 + + names = set([ x.split(',')[0] for x in lines ]) + assert 'GCA_000256725' in names + assert 'GCF_000017325.1' in names + assert 'GCF_000021665.1' in names + assert 'GCF_001881345.1' in names + + def test_tax_grep_duplicate_csv(runtmp): # grep on duplicates => should collapse to uniques on identifiers tax1 = utils.get_test_data('tax/test.taxonomy.csv')