diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index fd84cdc3ab..bdc2c28c17 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -136,15 +136,24 @@ jobs:
lints:
name: Lints
runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ build: [beta, stable]
+ include:
+ - build: beta
+ rust: beta
+ - build: stable
+ rust: stable
steps:
- name: Checkout sources
uses: actions/checkout@v1
- - name: Install stable toolchain
+ - name: Install Rust toolchain
uses: actions-rs/toolchain@v1
with:
profile: minimal
- toolchain: stable
+ toolchain: ${{ matrix.rust }}
override: true
components: rustfmt, clippy
@@ -171,10 +180,10 @@ jobs:
toolchain: stable
target: wasm32-unknown-unknown
- - uses: actions-rs/install@v0.1
+ - uses: actions-rs/cargo@v1
with:
- crate: wasm-pack
- version: latest
+ command: install
+ args: --force wasm-pack --version 0.10.0
- name: Prepare node for running tests
uses: actions/setup-node@v1
@@ -188,6 +197,14 @@ jobs:
- name: run wasm-pack build
run: wasm-pack build src/core -d ../../pkg
+ - name: Prepare package for NPM publishing
+ working-directory: pkg
+ run: npm pack
+
+ - uses: actions/upload-artifact@v2
+ with:
+ path: 'pkg/sourmash*.tgz'
+
wasm32-wasi:
name: Run tests under wasm32-wasi
runs-on: ubuntu-latest
diff --git a/.github/workflows/rust_publish.yml b/.github/workflows/rust_publish.yml
index 30ab988bbc..2543fb81f7 100644
--- a/.github/workflows/rust_publish.yml
+++ b/.github/workflows/rust_publish.yml
@@ -18,9 +18,10 @@ jobs:
- uses: actions-rs/cargo@v1
with:
command: install
- args: --force wasm-pack --version 0.8.1
+ args: --force wasm-pack --version 0.10.0
- name: run wasm-pack
- run: wasm-pack build src/core -d ../../pkg
+ run: |
+ wasm-pack build src/core -d ../../pkg
- name: Prepare node for NPM publishing
uses: actions/setup-node@v1
@@ -29,7 +30,8 @@ jobs:
registry-url: https://registry.npmjs.org/
- name: Publish to NPM
- run: '(cd pkg && npm publish)'
+ working-directory: pkg
+ run: npm publish
env:
NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
diff --git a/README.md b/README.md
index 1df1b800ce..08b7004549 100644
--- a/README.md
+++ b/README.md
@@ -3,14 +3,15 @@
Quickly search, compare, and analyze genomic and metagenomic data sets.
[![Documentation](https://readthedocs.org/projects/sourmash/badge/?version=latest)](http://sourmash.readthedocs.io/en/latest/)
-[![Build Status](/~https://github.com/dib-lab/sourmash/workflows/Python%20tests/badge.svg)](/~https://github.com/dib-lab/sourmash/actions/)
+[![Gitter](https://badges.gitter.im/sourmash-bio/community.svg)](https://gitter.im/sourmash-bio/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
+[![Build Status](/~https://github.com/sourmash-bio/sourmash/workflows/Python%20tests/badge.svg)](/~https://github.com/sourmash-bio/sourmash/actions/)
[![Bioconda install](https://img.shields.io/conda/dn/bioconda/sourmash.svg?style=flag&label=Bioconda)](https://anaconda.org/bioconda/sourmash)
-[![codecov](https://codecov.io/gh/dib-lab/sourmash/branch/latest/graph/badge.svg)](https://codecov.io/gh/dib-lab/sourmash)
+[![codecov](https://codecov.io/gh/sourmash-bio/sourmash/branch/latest/graph/badge.svg)](https://codecov.io/gh/sourmash-bio/sourmash)
[![DOI](http://joss.theoj.org/papers/10.21105/joss.00027/status.svg)](http://joss.theoj.org/papers/10.21105/joss.00027)
-
+
-
+
Usage:
@@ -80,17 +81,16 @@ $ sourmash --help
```
which will install
-[the latest released version](/~https://github.com/dib-lab/sourmash/releases).
+[the latest released version](/~https://github.com/sourmash-bio/sourmash/releases).
## Support
-Please ask questions and files issues
-[on Github](/~https://github.com/dib-lab/sourmash/issues).
+For questions, please open an issue [on Github](/~https://github.com/sourmash-bio/sourmash/issues), or ask in our [chat](https://gitter.im/sourmash-bio/community?utm_source=share-link&utm_medium=link&utm_campaign=share-link).
## Development
Development happens on github at
-[dib-lab/sourmash](/~https://github.com/dib-lab/sourmash).
+[sourmash-bio/sourmash](/~https://github.com/sourmash-bio/sourmash).
sourmash is developed in Python and Rust, and you will need a Rust
environment to build it; see [the developer notes](doc/developer.md)
@@ -110,6 +110,18 @@ Tests require py.test and can be run with `make test`.
Please see [the developer notes](doc/developer.md) for more information
on getting set up with a development environment.
+## Research notice
+
+Please note that this repository is participating in a study into sustainability
+ of open source projects. Data will be gathered about this repository for
+ approximately the next 12 months, starting from 2021-06-11.
+
+Data collected will include number of contributors, number of PRs, time taken to
+ close/merge these PRs, and issues closed.
+
+For more information, please visit
+[our informational page](https://sustainable-open-science-and-software.github.io/) or download our [participant information sheet](https://sustainable-open-science-and-software.github.io/assets/PIS_sustainable_software.pdf).
+
----
CTB
diff --git a/doc/api-example.md b/doc/api-example.md
index ba82054437..e3c4c72e8e 100644
--- a/doc/api-example.md
+++ b/doc/api-example.md
@@ -395,7 +395,7 @@ True
(Beware, these are confusing techniques for working with hashes that
are easy to get wrong! We suggest
-[posting questions in the issue tracker](/~https://github.com/dib-lab/sourmash/issues)
+[posting questions in the issue tracker](/~https://github.com/sourmash-bio/sourmash/issues)
as you go, if you are interested in exploring this area!)
The hashing function used is identical between num and scaled signatures,
@@ -596,8 +596,8 @@ Now, let's load in all of the signatures from the test directory:
... hashes_inserted = db.insert(sig)
... print(f"Inserted {hashes_inserted} hashes into db.")
Inserted 493 hashes into db.
-Inserted 525 hashes into db.
Inserted 490 hashes into db.
+Inserted 525 hashes into db.
```
diff --git a/doc/classifying-signatures.md b/doc/classifying-signatures.md
index 787c2e2d32..8a15000867 100644
--- a/doc/classifying-signatures.md
+++ b/doc/classifying-signatures.md
@@ -139,7 +139,7 @@ Please see Appendix B, below, for some actual numbers and output.
**Buyer beware:** There are substantial challenges in doing this kind
of analysis on real metagenomic samples, relating to genome representation
-and strain overlap; see [this issue](/~https://github.com/dib-lab/sourmash/issues/461) for a discussion.
+and strain overlap; see [this issue](/~https://github.com/sourmash-bio/sourmash/issues/461) for a discussion.
### Computing signature similarity with angular similarity.
@@ -173,7 +173,7 @@ We suggest the following approach:
* explore the available databases;
-* then ask questions [via the issue tracker](/~https://github.com/dib-lab/sourmash/issues) and we will do our best to help you out!
+* then ask questions [via the issue tracker](/~https://github.com/sourmash-bio/sourmash/issues) and we will do our best to help you out!
This helps us figure out what people are actually interested in doing, and
any help we provide via the issue tracker will eventually be added into the
diff --git a/doc/command-line.md b/doc/command-line.md
index 6489b3167f..8619fb8b8b 100644
--- a/doc/command-line.md
+++ b/doc/command-line.md
@@ -70,9 +70,16 @@ There are seven main subcommands: `sketch`, `compare`, `plot`,
* `prefetch` selects signatures of interest from a very large collection of signatures, for later processing.
There are also a number of commands that work with taxonomic
-information; these are grouped under the `sourmash lca`
-subcommand. See [the LCA tutorial](tutorials-lca.md) for a
-walkthrough of these commands.
+information; these are grouped under the `sourmash tax` and
+`sourmash lca` subcommands.
+
+`sourmash tax` commands:
+
+* `tax metagenome` - summarize metagenome gather results at each taxonomic rank.
+* `tax genome` - summarize single-genome gather results and report most likely classification.
+* `tax annotate` - annotate gather results with lineage information (no summarization or classification).
+
+`sourmash lca` commands:
* `lca classify` classifies many signatures against an LCA database.
* `lca summarize` summarizes the content of metagenomes using an LCA database.
@@ -80,6 +87,9 @@ walkthrough of these commands.
* `lca rankinfo` summarizes the content of a database.
* `lca compare_csv` compares lineage spreadsheets, e.g. those output by `lca classify`.
+> See [the LCA tutorial](tutorials-lca.md) for a
+walkthrough of some of these commands.
+
Finally, there are a number of utility and information commands:
* `info` shows version and software information.
@@ -177,15 +187,14 @@ sourmash compare file1.sig [ file2.sig ... ]
```
Options:
-```
---output -- save the distance matrix to this file (as a numpy binary matrix)
---ksize -- do the comparisons at this k-mer size.
---containment -- calculate containment instead of similarity.
- C(i, j) = size(i intersection j) / size(i).
---from-file -- append the list of files in this text file to the input
+
+* `--output` -- save the distance matrix to this file (as a numpy binary matrix)
+* `--ksize` -- do the comparisons at this k-mer size.
+* `--containment` -- calculate containment instead of similarity; `C(i, j) = size(i intersection j) / size(i)`
+* `--from-file` -- append the list of files in this text file to the input
signatures.
---ignore-abundance -- ignore abundances in signatures.
-```
+* `--ignore-abundance` -- ignore abundances in signatures.
+* `--picklist` -- select a subset of signatures with [a picklist](#using-picklists-to-subset-large-collections-of-signatures)
**Note:** compare by default produces a symmetric similarity matrix that can be used as an input to clustering. With `--containment`, however, this matrix is no longer symmetric and cannot formally be used for clustering.
@@ -249,6 +258,9 @@ similarity match
...
```
+Note, as of sourmash 4.2.0, `search` supports `--picklist`, to
+[select a subset of signatures based on a CSV file](#using-picklists-to-subset-large-collections-of-signatures).
+
### `sourmash gather` - find metagenome members
The `gather` subcommand selects the best reference genomes to use for
@@ -289,6 +301,9 @@ which matches are no longer reported; by default, this is set to
50kb. see the Appendix in
[Classifying Signatures](classifying-signatures.md) for details.
+As of sourmash 4.2.0, `gather` supports `--picklist`, to
+[select a subset of signatures based on a CSV file](#using-picklists-to-subset-large-collections-of-signatures).
+
Note:
Use `sourmash gather` to classify a metagenome against a collection of
@@ -350,6 +365,9 @@ containing a list of file names to index; you can also provide individual
signature files, directories full of signatures, or other sourmash
databases.
+As of sourmash 4.2.0, `index` supports `--picklist`, to
+[select a subset of signatures based on a CSV file](#using-picklists-to-subset-large-collections-of-signatures).
+
### `sourmash prefetch` - select subsets of very large databases for more processing
The `prefetch` subcommand searches a collection of scaled signatures
@@ -375,6 +393,7 @@ Other options include:
* `--threshold-bp` to require a minimum estimated bp overlap for output;
* `--scaled` for downsampling;
* `--force` to continue past survivable errors;
+* `--picklist` select a subset of signatures with [a picklist](#using-picklists-to-subset-large-collections-of-signatures)
### Alternative search mode for low-memory (but slow) search: `--linear`
@@ -402,7 +421,301 @@ This combination of commands ensures that the more time- and
memory-intensive `gather` step is run only on a small set of relevant
signatures, rather than all the signatures in the database.
-## `sourmash lca` subcommands for taxonomic classification
+## `sourmash tax` subcommands for integrating taxonomic information into gather results
+
+The sourmash `tax` or `taxonomy` commands integrate taxonomic
+ information into the results of `sourmash gather`. All `tax` commands
+ require one or more properly formatted `taxonomy` files where the
+ identifiers correspond to those in the database(s) used for
+ `gather`. Note that if using multiple databases, the `gather` needs
+ to have been conducted against all desired databases within the same
+ `gather` command (we cannot combine separate `gather` runs for the
+ same query). For supported databases (e.g. GTDB, NCBI), we provide
+ taxonomy csv files, but they can also be generated for user-generated
+ databases. For more information, see [databases](databases.md).
+
+`tax` commands rely upon the fact that `gather` provides both the total
+ fraction of the query matched to each database matched, as well as a
+ non-overlapping `f_unique_to_query`, which is the fraction of the query
+ uniquely matched to each reference genome. The `f_unique_to_query` for
+ any reference match will always be between (0% of query matched) and 1
+ (100% of query matched), and for a query matched to multiple references,
+ the `f_unique_to_query` will sum to at most 1 (100% of query matched).
+ We use this property to aggregate gather matches at the desired
+ taxonomic rank. For example, if the gather results for a metagenome
+ include results for 30 different strains of a given species, we can sum
+ the fraction uniquely matched to each strain to obtain the fraction
+ uniquely matched to this species. Note that this summarization can
+ also take into account abundance weighting; see
+ [classifying signatures](classifying-signatures.md) for more
+ information.
+
+As with all reference-based analysis, results can be affected by the
+ completeness of the reference database. However, summarizing taxonomic
+ results from `gather` minimizes issues associated with increasing size
+ and redundancy of reference databases.
+
+For more details on how `gather` works and can be used to classify
+ signatures, see [classifying-signatures](classifying-signatures.md).
+
+
+### `sourmash tax metagenome` - summarize metagenome content from `gather` results
+
+`sourmash tax metagenome` summarizes gather results for each query metagenome by
+ taxonomic lineage.
+
+example command to summarize a single `gather csv`, where the query was gathered
+ against `gtdb-rs202` representative species database:
+
+```
+sourmash tax metagenome
+ --gather-csv HSMA33MX_gather_x_gtdbrs202_k31.csv \
+ --taxonomy gtdb-rs202.taxonomy.v2.csv
+```
+
+There are three possible output formats, `csv_summary`, `lineage_summary`, and
+ `krona`.
+
+#### `csv_summary` output format
+
+`csv_summary` is the default output format. This outputs a `csv` with lineage
+ summarization for each taxonomic rank. This output currently consists of six
+ columns, `query_name,rank,fraction,lineage,query_md5,query_filename`, where
+ `fraction` is the fraction of the query matched to the reported rank and
+ lineage.
+
+example `csv_summary` output from the command above:
+
+```
+query_name,rank,fraction,lineage
+HSMA33MX,superkingdom,0.131,d__Bacteria
+HSMA33MX,phylum,0.073,d__Bacteria;p__Bacteroidota
+HSMA33MX,phylum,0.058,d__Bacteria;p__Proteobacteria
+.
+.
+.
+HSMA33MX,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;
+o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli
+HSMA33MX,species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;
+o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri
+HSMA33MX,species,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;
+o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus
+```
+> The `query_md5` and `query_filename` columns are omitted here for brevity.
+
+#### `krona` output format
+
+`krona` format is a tab-separated list of these results at a specific rank.
+ The first column, `fraction` is the fraction of the query matched to the
+ reported rank and lineage. The remaining columns are `superkingdom`, `phylum`,
+ ... etc down to the rank used for summarization. This output can be used
+ directly for summary visualization.
+
+To generate `krona`, we add `--output-format krona` to the command above, and
+ need to specify a rank to summarize. Here's the command for reporting `krona`
+ summary at `species` level:
+
+```
+sourmash tax metagenome
+ --gather-csv HSMA33MX_gather_x_gtdbrs202_k31.csv \
+ --taxonomy gtdb-rs202.taxonomy.v2.csv \
+ --output-format krona --rank species
+```
+
+example krona output from this command:
+
+```
+fraction superkingdom phylum class order family genus species
+0.05815279361459521 Bacteria Proteobacteria Gammaproteobacteria Enterobacterales Enterobacteriaceae Escherichia Escherichia coli
+0.05701254275940707 Bacteria Bacteroidetes Bacteroidia Bacteroidales Prevotellaceae Prevotella Prevotella copri
+0.015637726014008795 Bacteria Bacteroidetes Bacteroidia Bacteroidales Bacteroidaceae Bacteroides Bacteroides vulgatus
+```
+
+#### `lineage_summary` output format
+
+The lineage summary format is most useful when comparing across metagenome queries.
+ Each row is a lineage at the desired reporting rank. The columns are each query
+ used for gather, with the fraction match reported for each lineage. This format
+ is commonly used as input for many external multi-sample visualization tools.
+
+To generate `lineage_summary`, we add `--output-format lineage_summary` to the summarize
+ command, and need to specify a rank to summarize. Here's the command for reporting
+ `lineage_summary` for two queries (HSMA33MX, PSM6XBW3) summary at `species` level.
+
+```
+sourmash tax metagenome
+ --gather-csv HSMA33MX_gather_x_gtdbrs202_k31.csv \
+ --gather-csv PSM6XBW3_gather_x_gtdbrs202_k31.csv \
+ --taxonomy gtdb-rs202.taxonomy.v2.csv \
+ --output-format krona --rank species
+```
+
+example `lineage_summary`:
+
+```
+lineage HSMA33MX PSM6XBW3
+d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus 0.015637726014008795 0.015642822225843248
+d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri 0.05701254275940707 0.05703112269838684
+d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli 0.05815279361459521 0.05817174515235457
+```
+
+To produce multiple output types from the same command, add the types into the
+ `--output-format` argument, e.g. `--output-format summary krona lineage_summary`
+
+
+### `sourmash tax genome` - classify a genome using `gather` results
+
+`sourmash tax genome` reports likely classification for each query,
+ based on `gather` matches. By default, classification requires at least 10% of
+ the query to be matched. Thus, if 10% of the query was matched to a species, the
+ species-level classification can be reported. However, if 7% of the query was
+ matched to one species, and an additional 5% matched to a different species in
+ the same genus, the genus-level classification will be reported.
+
+Optionally, `genome` can instead report classifications at a desired `rank`,
+ regardless of match threshold (`--rank` argument, e.g. `--rank species`).
+
+Note that these thresholds and strategies are under active testing.
+
+To illustrate the utility of `genome`, let's consider a signature consisting
+ of two different Shewanella strains, `Shewanella baltica OS185 strain=OS185`
+ and `Shewanella baltica OS223 strain=OS223`. For simplicity, we gave this query
+ the name "Sb47+63".
+
+When we gather this signature against the `gtdb-rs202` representatives database,
+we see 66% matches to one strain, and 33% to the other:
+
+abbreviated gather_csv:
+
+```
+f_match,f_unique_to_query,name,query_name
+0.664,0.664,"GCF_000021665.1 Shewanella baltica OS223 strain=OS223, ASM2166v1",Sb47+63
+0.656,0.335,"GCF_000017325.1 Shewanella baltica OS185 strain=OS185, ASM1732v1",Sb47+63
+```
+
+> Here, `f_match` shows that independently, both strains match ~65% percent of
+ this mixed query. The `f_unique_to_query` column has the results of gather-style
+ decomposition. As the OS223 strain had a slightly higher `f_match` (66%), it
+ was the first match. The remaining 33% of the query matched to strain OS185.
+
+We can use `tax genome` on this gather csv to classify our "Sb47+63" mixed-strain query:
+
+```
+sourmash tax genome
+ --gather-csv 47+63_x_gtdb-rs202.gather.csv \
+ --taxonomy gtdb-rs202.taxonomy.v2.csv
+```
+> This command uses the default classification strategy, which uses a
+containment threshold of 0.1 (10%).
+
+There are two possible output formats, `csv_summary` and `krona`.
+
+#### `csv_summary` output format
+
+`csv_summary` is the default output format. This outputs a `csv` with taxonomic
+ classification for each query genome. This output currently consists of six
+ columns, `query_name,rank,fraction,lineage,query_md5,query_filename`, where
+ `fraction` is the fraction of the query matched to the reported rank and lineage.
+ The `status` column provides additional information on the classification:
+
+ - `match` - this query was classified
+ - `nomatch`- this query could not be classified
+ - `below_threshold` - this query was classified at the specified rank,
+ but the query fraction matched was below the containment threshold
+
+Here is the `csv_summary` output from classifying this mixed-strain Shewanella query to
+species level:
+
+```
+query_name,status,rank,fraction,lineage
+"Sb47+63",match,species,1.000,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Shewanellaceae;g__Shewanella;s__Shewanella baltica
+```
+>Here, we see that the match percentages to both strains have been aggregated,
+and we have 100% species-level `Shewanella baltica` annotation. We have omitted
+the `query_md5` and `query_filename` columns for brevity.
+
+#### `krona` output format
+
+`krona` format is a tab-separated list of these results at a specific rank.
+ The first column, `fraction` is the fraction of the query matched to the
+ reported rank and lineage. The remaining columns are `superkingdom`, `phylum`,
+ ... etc down to the rank used for summarization. This output can be used
+ directly for `krona` visualization.
+
+To generate `krona`, we must classify by `--rank` instead of using the
+ classification threshold. For the command, we add `--output-format krona`
+ and `--rank ` to the command above. Here's the command for producing
+ `krona` output for `species`-level classifications:
+
+```
+sourmash tax genome
+ --gather-csv Sb47+63_gather_x_gtdbrs202_k31.csv \
+ --taxonomy gtdb-rs202.taxonomy.v2.csv \
+ --output-format krona --rank species
+```
+> Note that specifying `--rank` forces classification by rank rather than
+by the containment threshold.
+
+Here is the `krona`-formatted output for this command:
+
+```
+fraction superkingdom phylum class order family genus species
+1.0 d__Bacteria p__Proteobacteria c__Gammaproteobacteria o__Enterobacterales f__Shewanellaceae g__Shewanella s__Shewanella baltica
+```
+
+To produce multiple output types from the same command, add the types into the
+ `--output-format` argument, e.g. `--output-format csv_summary krona`.
+ **Note that specifying the classification rank with `--rank`,
+ (e.g. `--rank species`), as needed for `krona` output, forces classification
+ by `rank` rather than by containment threshold.** If the query
+ classification at this rank does not meet the containment threshold
+ (default=0.1), the `status` column will contain `below_threshold`.
+
+
+### `sourmash tax annotate` - annotates gather output with taxonomy
+
+`sourmash tax annotate` adds a column with taxonomic lineage information
+ for each database match to gather output. Do not summarize or classify.
+ Note that this is not required for either `summarize` or `classify`.
+
+By default, `annotate` uses the name of each input gather csv to write an updated
+ version with lineages information. For example, annotating `sample1.gather.csv`
+ would produce `sample1.gather.with-lineages.csv`
+
+```
+sourmash tax annotate
+ --gather-csv Sb47+63_gather_x_gtdbrs202_k31.csv \
+ --taxonomy gtdb-rs202.taxonomy.v2.csv
+```
+> This will produce an annotated gather CSV, `Sb47+63_gather_x_gtdbrs202_k31.with-lineages.csv`
+
+### `sourmash tax prepare` - prepare and/or combine taxonomy files
+
+All `sourmash tax` commands must be given one or more taxonomy files as
+parameters to the `--taxonomy` argument. These files can be either CSV
+files or (as of sourmash 4.2.1) sqlite3 databases. sqlite3 databases
+are much faster for large taxonomies, while CSV files are easier to view
+and modify using spreadsheet software.
+
+`sourmash tax prepare` is a utility function that can ingest and validate
+multiple CSV files or sqlite3 databases, and output a CSV file or a sqlite3
+database. It can be used to combine multiple taxonomies into a single file,
+as well as change formats between CSV and sqlite3.
+
+The following command will take in two taxonomy files and combine them into
+a single taxonomy sqlite database.
+
+```
+sourmash tax prepare --taxonomy file1.csv file2.csv -o tax.db
+```
+
+Input databases formats can be mixed and matched, and the output format
+can be set to CSV like so:
+```
+sourmash tax prepare --taxonomy file1.csv file2.db -o tax.csv -F csv
+```
+
+## `sourmash lca` subcommands for in-memory taxonomy integration
These commands use LCA databases (created with `lca index`, below, or
prepared databases such as
@@ -589,6 +902,9 @@ see
You can use `--from-file` to pass `lca index` a text file containing a
list of file names to index.
+As of sourmash 4.2.0, `lca index` supports `--picklist`, to
+[select a subset of signatures based on a CSV file](#using-picklists-to-subset-large-collections-of-signatures).
+
### `sourmash lca rankinfo` - examine an LCA database
The `sourmash lca rankinfo` command displays k-mer specificity
@@ -818,6 +1134,12 @@ sourmash signature extract tests/test-data/*.fa.sig --name NC_009665
will extract the same signature, which has an accession number of
`NC_009665.1`.
+#### Using picklists with `sourmash sig extract`
+
+As of sourmash 4.2.0, `extract` also supports picklists, a feature by
+which you can select signatures based on values in a CSV file. See
+[Using picklists to subset large collections of signatures](#using-picklists-to-subset-large-collections-of-signatures), below.
+
### `sourmash signature flatten` - remove abundance information from signatures
Flatten the specified signature(s), removing abundances and setting
@@ -920,15 +1242,78 @@ Briefly,
signature files using `--query-from-file` (see below).
* `index` and `lca index` take a few fixed parameters (database name,
- taxonomy spreadsheet) and then an arbitrary number of other files
- that contain signatures, including files, directories, and indexed
- databases. These commands will also take `--from-file` (see below).
+ and for `lca index`, a taxonomy file) and then an arbitrary number of
+ other files that contain signatures, including files, directories,
+ and indexed databases. These commands will also take `--from-file`
+ (see below).
None of these commands currently support searching, comparing, or indexing
signatures with multiple ksizes or moltypes at the same time; you need
to pick the ksize and moltype to use for your search. Where possible,
scaled values will be made compatible.
+### Using picklists to subset large collections of signatures
+
+As of sourmash 4.2.0, many commands support *picklists*, a feature by
+which you can select or "pick out" signatures based on values in a CSV
+file.
+
+For example,
+```
+sourmash sig extract --picklist list.csv:md5:md5sum
+```
+will extract only the signatures that have md5sums matching the
+column `md5sum` in the CSV file `list.csv`. The command
+```
+sourmash sig extract --picklist list.csv::prefetch
+```
+will extract only the signatures found in the output
+of `sourmash prefetch ... -o list.csv`.
+
+The `--picklist` argument string must be of the format
+`pickfile:colname:coltype[:pickstyle]`, where `pickfile` is the path
+to a CSV file, `colname` is the name of the column to select from the
+CSV file (based on the headers in the first line of the CSV file), and
+`coltype` is the type of match. An optional pickstyle argument,
+`:include` or `:exclude`, can be added as a fourth parameter; if
+omitted, the default is `:include`.
+
+The following `coltype`s are currently supported by `sourmash sig extract`:
+
+* `name` - exact match to signature's name
+* `md5` - exact match to signature's md5sum
+* `md5prefix8` - match to 8-character prefix of signature's md5sum
+* `md5short` - same as `md5prefix8`
+* `ident` - exact match to signature's identifier
+* `identprefix` - match to signature's identifier, before '.'
+* `gather` - use the CSV output of `sourmash gather` as a picklist
+* `prefetch` - use the CSV output of `sourmash prefetch` as a picklist
+* `search` - use the CSV output of `sourmash prefetch` as a picklist
+* `manifest` - use the CSV output of `sourmash sig manifest` as a picklist
+
+Identifiers are constructed by using the first space delimited word in
+the signature name.
+
+One way to build a picklist is to use `sourmash sig describe --csv
+out.csv ` to construct an initial CSV file that you can
+then edit further.
+
+The picklist functionality also supports excluding (rather than
+including) signatures matching the picklist arguments. To specify a
+picklist for exclusion, add `:exclude` to the `--picklist` argument
+string, e.g. `pickfile:colname:coltype:exclude`.
+
+For example,
+```
+sourmash sig extract --picklist list.csv:md5:md5sum:exclude
+```
+will extract only the signatures that have md5sums that **do not** match
+entries in the column `md5sum` in the CSV file `list.csv`.
+
+In addition to `sig extract`, the following commands support
+`--picklist` selection: `index`, `search`, `gather`, `prefetch`,
+`compare`, `index`, and `lca index`.
+
### Storing (and searching) signatures
Backing up a little, there are many ways to store and search
@@ -1031,5 +1416,5 @@ signatures that were just created.
(This is a relatively new feature as of 3.4 and our testing may need
some work, so please
-[let us know](/~https://github.com/dib-lab/sourmash/issues) if there's
+[let us know](/~https://github.com/sourmash-bio/sourmash/issues) if there's
something that doesn't work and we will fix it :).
diff --git a/doc/databases.md b/doc/databases.md
index d0830635ed..d2b2308473 100644
--- a/doc/databases.md
+++ b/doc/databases.md
@@ -1,86 +1,37 @@
-# Prepared search databases
+# Prepared databases
-We provide several databases for download. Note that these databases can
-be used with both sourmash v3.5 and sourmash v4.0.
+## GTDB R06-rs202 - DNA databases
-## RefSeq microbial genomes - SBT
+All files below are available under https://osf.io/wxf9z/. The GTDB taxonomy spreadsheet (in a format suitable for `sourmash lca index`) is available [here](https://osf.io/p6z3w/).
-These database are formatted for use with `sourmash search` and
-`sourmash gather`. They are calculated with a scaled value of 2000.
+For each k-mer size, three databases are available.
-Approximately 91,000 microbial genomes (including viral and fungal)
-from NCBI RefSeq.
+* Zipfile collections can be used for a linear search. The signatures were calculated with a scaled of 1000, which robustly supports searches for ~10kb or larger matches.
+* SBT databases are indexed versions of the Zipfile collections that support faster search. They are also indexed with scaled=1000.
+* LCA databases are indexed versions of the Zipfile collections that also contain taxonomy information and can be used with regular search as well as with [the `lca` subcommands for taxonomic analysis](https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-lca-subcommands-for-taxonomic-classification). They are indexed with scaled=10,000, which robustly supports searches for 100kb or larger matches.
-* [RefSeq k=21, 2018.03.29][0] - 3.3 GB - [manifest](https://osf.io/wamfk/download)
-* [RefSeq k=31, 2018.03.29][1] - 3.3 GB - [manifest](https://osf.io/x3aut/download)
-* [RefSeq k=51, 2018.03.29][2] - 3.4 GB - [manifest](https://osf.io/zpkau/download)
+You can read more about the different database and index types [here](https://sourmash.readthedocs.io/en/latest/command-line.html#indexed-databases).
-## Genbank microbial genomes - SBT
+Legacy databases are available [here](legacy-databases.md)
-These database are formatted for use with `sourmash search` and
-`sourmash gather`.
+Note that the SBT and LCA databases can be used with sourmash v3.5 and later, while Zipfile collections can only be used with sourmash v4.1.0 and up.
-Approximately 98,000 microbial genomes (including viral and fungal)
-from NCBI Genbank.
+### GTDB genomic representatives (47.8k genomes)
-* [Genbank k=21, 2018.03.29][3] - 3.9 GB - [manifest](https://osf.io/vm5kb/download)
-* [Genbank k=31, 2018.03.29][4] - 3.9 GB - [manifest](https://osf.io/p87ec/download)
-* [Genbank k=51, 2018.03.29][5] - 3.9 GB - [manifest](https://osf.io/cbxg9/download)
+The GTDB genomic representatives are a low-redundancy subset of Genbank genomes.
-### Details
+| K-mer size | Zipfile collection | SBT | LCA |
+| -------- | -------- | -------- | ---- |
+| 21 | [download (1.3 GB)](https://osf.io/jp5zh/download) | [download (2.6 GB)](https://osf.io/py92w/download) | [download (114 MB)](https://osf.io/gk2za/download) |
+| 31 | [download (1.3 GB)](https://osf.io/nqmau/download) | [download (2.6 GB)](https://osf.io/w4bcm/download) | [download (131 MB)](https://osf.io/ypsjq/download) |
+| 51 | [download (1.3 GB)](https://osf.io/px6qd/download) | [download (2.6 GB)](https://osf.io/rv9zp/download) | [download (137 MB)](https://osf.io/297dp/download) |
-The individual signatures for the above SBTs were calculated as follows:
+### GTDB all genomes (258k genomes)
-```
-sourmash compute -k 4,5 \
- -n 2000 \
- --track-abundance \
- --name-from-first \
- -o {output} \
- {input}
+These databases contain the complete GTDB collection of 258,406 genomes.
-sourmash compute -k 21,31,51 \
- --scaled 2000 \
- --track-abundance \
- --name-from-first \
- -o {output} \
- {input}
-```
-
-See [github.com/dib-lab/sourmash_databases](/~https://github.com/dib-lab/sourmash_databases) for a Snakemake workflow
-to build the databases.
-
-[0]: https://sourmash-databases.s3-us-west-2.amazonaws.com/zip/refseq-k21.sbt.zip
-[1]: https://sourmash-databases.s3-us-west-2.amazonaws.com/zip/refseq-k31.sbt.zip
-[2]: https://sourmash-databases.s3-us-west-2.amazonaws.com/zip/refseq-k51.sbt.zip
-
-[3]: https://sourmash-databases.s3-us-west-2.amazonaws.com/zip/genbank-k21.sbt.zip
-[4]: https://sourmash-databases.s3-us-west-2.amazonaws.com/zip/genbank-k31.sbt.zip
-[5]: https://sourmash-databases.s3-us-west-2.amazonaws.com/zip/genbank-k51.sbt.zip
-
-## Genbank LCA Database
-
-These databases are formatted for use with `sourmash lca`; they are
-v2 LCA databases and will work with sourmash v2.0a11 and later.
-They are calculated with a scaled value of 10000 (1e5).
-
-Approximately 87,000 microbial genomes (including viral and fungal)
-from NCBI Genbank.
-
-* [Genbank k=21, 2017.11.07](https://osf.io/d7rv8/download), 109 MB
-* [Genbank k=31, 2017.11.07](https://osf.io/4f8n3/download), 120 MB
-* [Genbank k=51, 2017.11.07](https://osf.io/nemkw/download), 125 MB
-
-### Details
-
-The above LCA databases were calculated as follows:
-
-```
-sourmash lca index genbank-genomes-taxonomy.2017.05.29.csv \
- genbank-k21.lca.json.gz -k 21 --scaled=10000 \
- -f --traverse-directory .sbt.genbank-k21 --split-identifiers
-```
-
-See
-[github.com/dib-lab/2018-ncbi-lineages](/~https://github.com/dib-lab/2018-ncbi-lineages)
-for information on preparing the genbank-genomes-taxonomy file.
+| K-mer size | Zipfile collection | SBT | LCA |
+| -------- | -------- | -------- | ---- |
+| 21 | [download (7.8 GB)](https://osf.io/vgex4/download) | [download (15 GB)](https://osf.io/ar67j/download) | [download (266 MB)](https://osf.io/hm3c4/download) |
+| 31 | [download (7.8 GB)](https://osf.io/94mzh/download) | [download (15 GB)](https://osf.io/dmsz8/download) | [download (286 MB)](https://osf.io/9xdg2/download) |
+| 51 | [download (7.8 GB)](https://osf.io/x9cdp/download) | [download (15 GB)](https://osf.io/8fc3t/download) | [download (299 MB)](https://osf.io/3cdp6/download) |
diff --git a/doc/developer.md b/doc/developer.md
index 088d9ab2b6..c2929b43d6 100644
--- a/doc/developer.md
+++ b/doc/developer.md
@@ -4,7 +4,7 @@
You can get the latest development branch with:
```
-git clone /~https://github.com/dib-lab/sourmash.git
+git clone /~https://github.com/sourmash-bio/sourmash.git
```
sourmash runs under Python 3.7 and later.
@@ -102,8 +102,8 @@ We use [GitHub Actions][2] for continuous integration.
Code coverage can be viewed interactively at [codecov.io][1].
-[1]: https://codecov.io/gh/dib-lab/sourmash/
-[2]: /~https://github.com/dib-lab/sourmash/actions
+[1]: https://codecov.io/gh/sourmash-bio/sourmash/
+[2]: /~https://github.com/sourmash-bio/sourmash/actions
## Code organization
diff --git a/doc/index.md b/doc/index.md
index b4517a6412..75d83e2f55 100644
--- a/doc/index.md
+++ b/doc/index.md
@@ -24,7 +24,7 @@ Please also see the `mash` [software](http://mash.readthedocs.io/en/latest/) and
[paper (Ondov et al., 2016)](http://dx.doi.org/10.1186/s13059-016-0997-x) for
background information on how and why MinHash works.
-**Questions? Thoughts?** Ask us on the [sourmash issue tracker](/~https://github.com/dib-lab/sourmash/issues/)!
+**Questions? Thoughts?** Ask us on the [sourmash issue tracker](/~https://github.com/sourmash-bio/sourmash/issues/)!
**Want to migrate to sourmash v4?** sourmash v4 is now available, and
has a number of incompatibilites with v2 and v3. Please see
@@ -71,15 +71,15 @@ be stored, searched, explored, and taxonomically annotated.
* `sourmash` relies on an underlying Rust core for performance.
-* `sourmash` is developed [on GitHub](/~https://github.com/dib-lab/sourmash)
+* `sourmash` is developed [on GitHub](/~https://github.com/sourmash-bio/sourmash)
and is **freely and openly available** under the BSD 3-clause license.
- Please see [the README](/~https://github.com/dib-lab/sourmash/blob/latest/README.md)
+ Please see [the README](/~https://github.com/sourmash-bio/sourmash/blob/latest/README.md)
for more information on development, support, and contributing.
You can take a look at sourmash analyses on real data
-[in a saved Jupyter notebook](/~https://github.com/dib-lab/sourmash/blob/latest/doc/sourmash-examples.ipynb),
+[in a saved Jupyter notebook](/~https://github.com/sourmash-bio/sourmash/blob/latest/doc/sourmash-examples.ipynb),
and experiment with it yourself
-[interactively in a Jupyter Notebook](https://mybinder.org/v2/gh/dib-lab/sourmash/latest?filepath=doc%2Fsourmash-examples.ipynb)
+[interactively in a Jupyter Notebook](https://mybinder.org/v2/gh/sourmash-bio/sourmash/latest?filepath=doc%2Fsourmash-examples.ipynb)
at [mybinder.org](http://mybinder.org).
## Installing sourmash
@@ -94,7 +94,7 @@ or conda:
$ conda install -c conda-forge -c bioconda sourmash
```
-Please see [the README file in github.com/dib-lab/sourmash](/~https://github.com/dib-lab/sourmash/blob/latest/README.md)
+Please see [the README file in github.com/sourmash-bio/sourmash](/~https://github.com/sourmash-bio/sourmash/blob/latest/README.md)
for more information.
## Memory and speed
@@ -104,7 +104,7 @@ many other software programs used for genome search and taxonomic
classification.
`sourmash search` and `sourmash gather` can be used to search 100k
-genbank microbial genomes ([using our prepared databases](databases.md)
+genbank microbial genomes ([using our prepared databases](databases.md))
with about 20 GB of disk and in under 1 GB of RAM.
Typically a search for a single genome takes about 30 seconds on a laptop.
diff --git a/doc/legacy-databases.md b/doc/legacy-databases.md
new file mode 100644
index 0000000000..5b8312ae9d
--- /dev/null
+++ b/doc/legacy-databases.md
@@ -0,0 +1,112 @@
+# Legacy Databases
+
+Sourmash databases have evolved over time.
+We have changed how the database is stored (uncompressed `.zip`) and how we name each signature.
+All SBT databases below are in `.sbt.zip` format.
+Note that the SBT and LCA databases can be used with sourmash v3.5 and later, while Zipfile collections can only be used with sourmash v4.1.0 and up.
+We detail these changes below, and include links to legacy databases.
+See [github.com/sourmash-bio/databases](/~https://github.com/sourmash-bio/databases) for a Snakemake workflow that builds current and legacy databases.
+
+## Sourmash signature names
+
+Earlier versions of sourmash databases were built using individual signatures that were calculated as follows:
+
+```
+sourmash compute -k 4,5 \
+ -n 2000 \
+ --track-abundance \
+ --name-from-first \
+ -o {output} \
+ {input}
+
+sourmash compute -k 21,31,51 \
+ --scaled 2000 \
+ --track-abundance \
+ --name-from-first \
+ -o {output} \
+ {input}
+```
+
+We moved away from this strategy because `--name-from-first` named each signature from the name of the first sequence in the FASTA file.
+While the species name of the organism was present in this name, the accession number corresponded to the accession of the first sequence fragment in the file, not the genome assembly.
+As such, we revised our strategy so that signatures are named by genome assembly accession and species name.
+This requires the `assembly_summary.txt` file to be parsed.
+
+## Sourmash database compression
+
+## Legacy databases
+
+### RefSeq microbial genomes - SBT
+
+These database are formatted for use with `sourmash search` and
+`sourmash gather`. They are calculated with a scaled value of 2000.
+
+Approximately 91,000 microbial genomes (including viral and fungal)
+from NCBI RefSeq.
+
+* [RefSeq k=21, 2018.03.29][0] - 3.3 GB - [manifest](https://osf.io/wamfk/download)
+* [RefSeq k=31, 2018.03.29][1] - 3.3 GB - [manifest](https://osf.io/x3aut/download)
+* [RefSeq k=51, 2018.03.29][2] - 3.4 GB - [manifest](https://osf.io/zpkau/download)
+
+### Genbank microbial genomes - SBT
+
+These database are formatted for use with `sourmash search` and
+`sourmash gather`.
+
+Approximately 98,000 microbial genomes (including viral and fungal)
+from NCBI Genbank.
+
+* [Genbank k=21, 2018.03.29][3] - 3.9 GB - [manifest](https://osf.io/vm5kb/download)
+* [Genbank k=31, 2018.03.29][4] - 3.9 GB - [manifest](https://osf.io/p87ec/download)
+* [Genbank k=51, 2018.03.29][5] - 3.9 GB - [manifest](https://osf.io/cbxg9/download)
+
+
+[0]: https://sourmash-databases.s3-us-west-2.amazonaws.com/zip/refseq-k21.sbt.zip
+[1]: https://sourmash-databases.s3-us-west-2.amazonaws.com/zip/refseq-k31.sbt.zip
+[2]: https://sourmash-databases.s3-us-west-2.amazonaws.com/zip/refseq-k51.sbt.zip
+
+[3]: https://sourmash-databases.s3-us-west-2.amazonaws.com/zip/genbank-k21.sbt.zip
+[4]: https://sourmash-databases.s3-us-west-2.amazonaws.com/zip/genbank-k31.sbt.zip
+[5]: https://sourmash-databases.s3-us-west-2.amazonaws.com/zip/genbank-k51.sbt.zip
+
+### Genbank microbial genomes - LCA
+
+These databases are formatted for use with `sourmash lca`; they are
+v2 LCA databases and will work with sourmash v2.0a11 and later.
+They are calculated with a scaled value of 10000 (1e5).
+
+Approximately 87,000 microbial genomes (including viral and fungal)
+from NCBI Genbank.
+
+* [Genbank k=21, 2017.11.07](https://osf.io/d7rv8/download), 109 MB
+* [Genbank k=31, 2017.11.07](https://osf.io/4f8n3/download), 120 MB
+* [Genbank k=51, 2017.11.07](https://osf.io/nemkw/download), 125 MB
+
+
+The above LCA databases were calculated as follows:
+
+```
+sourmash lca index genbank-genomes-taxonomy.2017.05.29.csv \
+ genbank-k21.lca.json.gz -k 21 --scaled=10000 \
+ -f --traverse-directory .sbt.genbank-k21 --split-identifiers
+```
+
+See
+[github.com/dib-lab/2018-ncbi-lineages](/~https://github.com/dib-lab/2018-ncbi-lineages)
+for information on preparing the genbank-genomes-taxonomy when signatures are generated using `--name-from-first`.
+
+### GTDB databases - SBT
+
+All files below are available [here](https://osf.io/wxf9z/).
+
+Release 89
+
+* [GTDB k=31, release 89](https://osf.io/5mb9k/download)
+
+Release 95
+
+* [GTDB k=21, scaled=1000](https://osf.io/4yhe2/download)
+* [GTDB k=31, scaled=1000](https://osf.io/4n3m5/download)
+* [GTDB k=51, scaled=1000](https://osf.io/c8wj7/download)
+
+
diff --git a/doc/more-info.md b/doc/more-info.md
index b98150d694..f370fd2130 100644
--- a/doc/more-info.md
+++ b/doc/more-info.md
@@ -112,7 +112,7 @@ tries to connect to X11 to use the Tkinter backend.
The solution is to force the use of the 'Agg' backend in matplotlib;
see [this stackoverflow answer](https://stackoverflow.com/a/34294056)
-or [this sourmash issue comment](/~https://github.com/dib-lab/sourmash/issues/254#issuecomment-304274590).
+or [this sourmash issue comment](/~https://github.com/sourmash-bio/sourmash/issues/254#issuecomment-304274590).
Newer versions of matplotlib do not seem to have this problem.
@@ -120,7 +120,7 @@ Newer versions of matplotlib do not seem to have this problem.
[1]:/~https://github.com/edawson/rkmh
[2]:/~https://github.com/lskatz/mashtree/blob/master/README.md
[3]:/~https://github.com/onecodex/finch-rs
-[4]:/~https://github.com/dib-lab/sourmash/blob/latest/utils/compute-dna-mh-another-way.py
+[4]:/~https://github.com/sourmash-bio/sourmash/blob/latest/utils/compute-dna-mh-another-way.py
[5]:http://ivory.idyll.org/blog/2016-sourmash.html
[6]:http://ivory.idyll.org/blog/2016-sourmash-signatures.html
[7]:http://ivory.idyll.org/blog/2016-sourmash-sbt.html
diff --git a/doc/release.md b/doc/release.md
index 56741fc1ca..de093f591f 100644
--- a/doc/release.md
+++ b/doc/release.md
@@ -29,7 +29,7 @@ and also check if the [rendered docs] are updated.
1\. The below should be done in a clean checkout:
```
cd $(mktemp -d)
-git clone git@github.com:dib-lab/sourmash.git
+git clone git@github.com:sourmash-bio/sourmash.git
cd sourmash
```
@@ -46,7 +46,7 @@ git tag -a v${new_version}${rc}
git push --tags origin
```
-[the releases page]: /~https://github.com/dib-lab/sourmash/releases
+[the releases page]: /~https://github.com/sourmash-bio/sourmash/releases
3\. Test the release candidate. Bonus: repeat on macOS:
```
@@ -62,7 +62,7 @@ python -m venv testenv4
cd testenv1
source bin/activate
-git clone --depth 1 --branch v${new_version}${rc} /~https://github.com/dib-lab/sourmash.git
+git clone --depth 1 --branch v${new_version}${rc} /~https://github.com/sourmash-bio/sourmash.git
cd sourmash
python -m pip install -U setuptools pip wheel setuptools_scm
python -m pip install -r requirements.txt
@@ -74,7 +74,7 @@ cd ../../testenv2
deactivate
source bin/activate
python -m pip install -U setuptools pip wheel setuptools_scm
-python -m pip install -e git+/~https://github.com/dib-lab/sourmash.git@v${new_version}${rc}#egg=sourmash[test]
+python -m pip install -e git+/~https://github.com/sourmash-bio/sourmash.git@v${new_version}${rc}#egg=sourmash[test]
cd src/sourmash
make test
make dist
@@ -145,7 +145,7 @@ git push --delete origin v${new_version}${rc}
3\. Upload wheels from GitHub Releases to PyPI
-[GitHub Actions will automatically build wheels and upload them to GitHub Releases](/~https://github.com/dib-lab/sourmash/actions?query=workflow%3Acibuildwheel).
+[GitHub Actions will automatically build wheels and upload them to GitHub Releases](/~https://github.com/sourmash-bio/sourmash/actions?query=workflow%3Acibuildwheel).
This will take about 45 minutes, or more. After they're built, they must be
copied over to PyPI manually.
diff --git a/doc/requirements.md b/doc/requirements.md
index 95f2b2dfbd..d6a5353be9 100644
--- a/doc/requirements.md
+++ b/doc/requirements.md
@@ -15,4 +15,4 @@ under Python 3.7 and later. Please see [the development repository README][0]
for
information on source code, tests, and continuous integration.
-[0]:/~https://github.com/dib-lab/sourmash/blob/latest/README.md
+[0]:/~https://github.com/sourmash-bio/sourmash/blob/latest/README.md
diff --git a/doc/sourmash-sketch.md b/doc/sourmash-sketch.md
index f136570d32..2c9852c74e 100644
--- a/doc/sourmash-sketch.md
+++ b/doc/sourmash-sketch.md
@@ -91,7 +91,7 @@ The output signature(s) will be saved in locations that depend on your input par
`sourmash sketch protein` and `sourmash sketch translate` output protein sketches by default, but can also use the `dayhoff` and `hp` encodings. The [Dayhoff encoding](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-367/tables/1) collapses multiple amino acids into a smaller alphabet so that amino acids that share biochemical properties map to the same character. The hp encoding divides amino acids into hydrophobic and polar (hydrophilic) amino acids, collapsing amino acids with hydrophobic side chains together and doing the same for polar amino acids.
-We are still in the process of benchmarking these encodings; ask [on the issue tracker](/~https://github.com/dib-lab/sourmash/issues) if you are interested in updates.
+We are still in the process of benchmarking these encodings; ask [on the issue tracker](/~https://github.com/sourmash-bio/sourmash/issues) if you are interested in updates.
### Parameter strings
@@ -121,7 +121,7 @@ These were chosen by a committee of PhDs as being good defaults for an initial a
More seriously, the DNA parameters were chosen based on the analyses done by Koslicki and Falush in [MetaPalette: a k-mer Painting Approach for Metagenomic Taxonomic Profiling and Quantification of Novel Strain Variation](https://msystems.asm.org/content/1/3/e00020-16).
-The protein, dayhoff, and hp parameters were selected based on unpublished research results and/or magic formulas. We are working on publishing the results! Please ask on the [issue tracker](/~https://github.com/dib-lab/sourmash/issues) if you are curious.
+The protein, dayhoff, and hp parameters were selected based on unpublished research results and/or magic formulas. We are working on publishing the results! Please ask on the [issue tracker](/~https://github.com/sourmash-bio/sourmash/issues) if you are curious.
### More complex parameter string examples
@@ -185,4 +185,4 @@ You can use `sourmash sig describe` to get detailed information about the conten
We try to provide good documentation and error messages, but may not succeed in answer all your questions! So we're happy to help out!
-Please post questions [on the sourmash issue tracker](/~https://github.com/dib-lab/sourmash/issues). If you find something confusing or buggy about the documentation or about sourmash, we'd love to fix it -- for you *and* for everyone else!
+Please post questions [on the sourmash issue tracker](/~https://github.com/sourmash-bio/sourmash/issues). If you find something confusing or buggy about the documentation or about sourmash, we'd love to fix it -- for you *and* for everyone else!
diff --git a/doc/support.md b/doc/support.md
index 406ed81e65..4c585b1181 100644
--- a/doc/support.md
+++ b/doc/support.md
@@ -11,11 +11,11 @@ bugs, and some of our best features have come from user
requests. Please help us improve sourmash for everyone by asking
questions as you have them!
-Please ask questions and file bug descriptions [on the GitHub issue tracker for sourmash, dib-lab/sourmash/issues][0].
+Please ask questions and file bug descriptions [on the GitHub issue tracker for sourmash, sourmash-bio/sourmash/issues][0].
You can also ask questions of Titus on Twitter at [@ctitusbrown][1].
-[0]:/~https://github.com/dib-lab/sourmash/issues
+[0]:/~https://github.com/sourmash-bio/sourmash/issues
[1]:https://twitter.com/ctitusbrown/
## Versioning and stability of features and APIs
@@ -116,7 +116,7 @@ If you depend on sourmash, we recommend using the following process:
* pin sourmash to the major version you developed against, e.g. `sourmash >=3,<4`.
* when ready to upgrade sourmash, upgrade to the latest minor release within that major version (e.g. sourmash 3.5.x).
-* scan for deprecations that affect you, check [the release notes](/~https://github.com/dib-lab/sourmash/releases),
+* scan for deprecations that affect you, check [the release notes](/~https://github.com/sourmash-bio/sourmash/releases),
and fix any major issues noted.
* upgrade to the next major version (e.g. sourmash 4.0) and run your integration tests or workflow.
* fix outstanding issues.
@@ -130,7 +130,7 @@ If you want to upgrade workflows and scripts from prior releases of
sourmash to sourmash v4.0, we suggest doing this in two stages.
First, upgrade to the latest version of sourmash 3.5.x (currently
-[v3.5.1](/~https://github.com/dib-lab/sourmash/releases/tag/v3.5.1)),
+[v3.5.1](/~https://github.com/sourmash-bio/sourmash/releases/tag/v3.5.1)),
which is compatible with all files and command lines used in previous
versions of sourmash (v2.x and v3.x). After upgrading to 3.5.x, scan
the sourmash output for deprecation warnings and fix those.
@@ -178,12 +178,12 @@ Second, the `MinHash` class API has changed significantly!
Third, `SourmashSignature` objects no longer have a `name()` method but instead a `name` property, which can be assigned to. This property is now `None` when no name has been assigned. Note that `str(sig)` should now be used to retrieve a display name, and should replace all previous uses of `sig.name()`.
Fourth, a few top-level functions have been deprecated: `load_signatures(...)`, `load_one_signature(...)`, `create_sbt_index(...)`, and `load_sbt_index(...)`.
-* `load_signatures(...)`, `load_one_signature(...)` should be replaced with `load_file_as_signatures(...)`. Note there is currently no top-level way to load signatures from strings. For now, if you need that functionality, you can use `sourmash.signature.load_signatures(...)` and `sourmash.signature.load_one_signature(...)`, but please be aware that these are not considered part of the public API that is under semantic versioning, so they may change in the next minor point release; this is tracked in /~https://github.com/dib-lab/sourmash/issues/1312.
+* `load_signatures(...)`, `load_one_signature(...)` should be replaced with `load_file_as_signatures(...)`. Note there is currently no top-level way to load signatures from strings. For now, if you need that functionality, you can use `sourmash.signature.load_signatures(...)` and `sourmash.signature.load_one_signature(...)`, but please be aware that these are not considered part of the public API that is under semantic versioning, so they may change in the next minor point release; this is tracked in /~https://github.com/sourmash-bio/sourmash/issues/1312.
* `load_sbt_index(...)` have been deprecated. Please use `load_file_as_index(...)` instead.
* `create_sbt_index(...)` has been deprecated. There is currently no replacement, although you can use it directly from `sourmash.sbtmh` if necessary.
Fifth, directory traversal now happens by default when loading signatures, so remove `traverse=True` arguments to several functions in `sourmash_args` - `load_dbs_and_sigs`, `load_file_as_index`, `and load_file_as_signatures`.
Please post questions and concerns to the
-[sourmash issue tracker](/~https://github.com/dib-lab/sourmash/issues)
+[sourmash issue tracker](/~https://github.com/sourmash-bio/sourmash/issues)
and we'll be happy to help!
diff --git a/doc/tutorial-basic.md b/doc/tutorial-basic.md
index 41311f9f96..d6b6f270a5 100644
--- a/doc/tutorial-basic.md
+++ b/doc/tutorial-basic.md
@@ -121,7 +121,7 @@ Let's grab a sample collection of 50 E. coli genomes and unpack it --
mkdir ecoli_many_sigs
cd ecoli_many_sigs
-curl -O -L /~https://github.com/dib-lab/sourmash/raw/latest/data/eschericia-sigs.tar.gz
+curl -O -L /~https://github.com/sourmash-bio/sourmash/raw/latest/data/eschericia-sigs.tar.gz
tar xzf eschericia-sigs.tar.gz
rm eschericia-sigs.tar.gz
@@ -246,7 +246,7 @@ from the
[Shakya et al. 2013 mock metagenome paper.][2]
```
-wget /~https://github.com/dib-lab/sourmash/raw/latest/doc/_static/shakya-unaligned-contigs.sig
+wget /~https://github.com/sourmash-bio/sourmash/raw/latest/doc/_static/shakya-unaligned-contigs.sig
sourmash gather -k 31 shakya-unaligned-contigs.sig genbank-k31.lca.json.gz
```
diff --git a/doc/using-sourmash-a-guide.md b/doc/using-sourmash-a-guide.md
index 757d3d7654..2b62be021b 100644
--- a/doc/using-sourmash-a-guide.md
+++ b/doc/using-sourmash-a-guide.md
@@ -8,7 +8,7 @@ So! You've installed sourmash, run a few of the tutorials and commands,
and now you actually want to *use* it. This guide is here to answer some
of your questions, and explain why we can't answer others.
-(If you have additional questions, please [file an issue!](/~https://github.com/dib-lab/sourmash/issues))
+(If you have additional questions, please [file an issue!](/~https://github.com/sourmash-bio/sourmash/issues))
## What k-mer size(s) should I use?
diff --git a/include/sourmash.h b/include/sourmash.h
index 7f88fcc203..a7bb4a9b71 100644
--- a/include/sourmash.h
+++ b/include/sourmash.h
@@ -225,6 +225,8 @@ SourmashKmerMinHash *kmerminhash_new(uint64_t scaled,
uint32_t kmerminhash_num(const SourmashKmerMinHash *ptr);
+void kmerminhash_remove_from(SourmashKmerMinHash *ptr, const SourmashKmerMinHash *other);
+
void kmerminhash_remove_hash(SourmashKmerMinHash *ptr, uint64_t h);
void kmerminhash_remove_many(SourmashKmerMinHash *ptr,
diff --git a/nix/sources.json b/nix/sources.json
index 7f9a68a779..fece90acd7 100644
--- a/nix/sources.json
+++ b/nix/sources.json
@@ -5,10 +5,10 @@
"homepage": "/~https://github.com/nmattia/niv",
"owner": "nmattia",
"repo": "niv",
- "rev": "af958e8057f345ee1aca714c1247ef3ba1c15f5e",
- "sha256": "1qjavxabbrsh73yck5dcq8jggvh3r2jkbr6b5nlz5d9yrqm9255n",
+ "rev": "1819632b5823e0527da28ad82fecd6be5136c1e9",
+ "sha256": "08jz17756qchq0zrqmapcm33nr4ms9f630mycc06i6zkfwl5yh5i",
"type": "tarball",
- "url": "/~https://github.com/nmattia/niv/archive/af958e8057f345ee1aca714c1247ef3ba1c15f5e.tar.gz",
+ "url": "/~https://github.com/nmattia/niv/archive/1819632b5823e0527da28ad82fecd6be5136c1e9.tar.gz",
"url_template": "/~https://github.com///archive/.tar.gz"
},
"nixpkgs": {
@@ -17,10 +17,10 @@
"homepage": "",
"owner": "NixOS",
"repo": "nixpkgs",
- "rev": "e62feb3bf4a603e26755238303bda0c24651e155",
- "sha256": "1gkamm044jrksjrisr7h9grg8p2y6rk01x6391asrx988hm2rh9s",
+ "rev": "84aa23742f6c72501f9cc209f29c438766f5352d",
+ "sha256": "0h7xl6q0yjrbl9vm3h6lkxw692nm8bg3wy65gm95a2mivhrdjpxp",
"type": "tarball",
- "url": "/~https://github.com/NixOS/nixpkgs/archive/e62feb3bf4a603e26755238303bda0c24651e155.tar.gz",
+ "url": "/~https://github.com/NixOS/nixpkgs/archive/84aa23742f6c72501f9cc209f29c438766f5352d.tar.gz",
"url_template": "/~https://github.com///archive/.tar.gz"
},
"rust-overlay": {
@@ -29,10 +29,10 @@
"homepage": null,
"owner": "oxalica",
"repo": "rust-overlay",
- "rev": "d8efe70dc561c4bea0b7bf440d36ce98c497e054",
- "sha256": "0hdj5d635dq6zwj8d4ady1kyl9mwmsxvy6vqyd3xq0p2w18ffi4r",
+ "rev": "7368784b67f963508b67064ee758537b7c8e40c8",
+ "sha256": "1sr0lsk5m6c0dqp3429c0fj0picrvsaw1hjn6nd83pzxm54g7fcr",
"type": "tarball",
- "url": "/~https://github.com/oxalica/rust-overlay/archive/d8efe70dc561c4bea0b7bf440d36ce98c497e054.tar.gz",
+ "url": "/~https://github.com/oxalica/rust-overlay/archive/7368784b67f963508b67064ee758537b7c8e40c8.tar.gz",
"url_template": "/~https://github.com///archive/.tar.gz"
}
}
diff --git a/setup.cfg b/setup.cfg
index 6dd80fde71..37e5444e52 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -55,8 +55,8 @@ where = src
# this syntax may change in the future
[options.extras_require]
test =
- pytest>=6
- pytest-cov<2.6
+ pytest~=6.2.4
+ pytest-cov~=2.12
recommonmark
hypothesis
demo =
@@ -94,9 +94,6 @@ norecursedirs =
.tox
.asv
.eggs
-python_files =
- src/sourmash/*.py
- tests/*.py
testpaths =
tests
doc
diff --git a/src/core/CHANGELOG.md b/src/core/CHANGELOG.md
new file mode 100644
index 0000000000..05eff4f83b
--- /dev/null
+++ b/src/core/CHANGELOG.md
@@ -0,0 +1,160 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [0.11.0] - 2021-07-07
+
+Added:
+
+- Add HyperLogLog implementation (#1223)
+
+Changed:
+
+- Update `MinHash.set_abundances` to remove hash if 0 abund; handle negative abundances. (#1575)
+- Improving `MinHash.remove_many(...)` performance (#1571)
+- Improved intersection and union calculations (#1475)
+- Bump MSRV to 1.42 (and other dep fixes) (#1461)
+- Rework the `find` functionality for `Index` classes (#1392)
+- Rationalize `SourmashSignature.name` and `str(sig)` (#1179)
+
+Fixed:
+
+- Fix needless borrows as suggested by clippy (#1636)
+- Fix Rust 1.59 lints (#1600)
+- Clean up clippy lints from 1.52 (#1505)
+- Fix clippy lints introduced in 1.51 (#1407)
+- CI/Rust: update and fix cbindgen config (#1473)
+- pin needletail version to keep MSRV at 1.37 (#1393)
+- Update proptest requirement from 0.9.6 to 1.0.0 (#1344)
+- Fix clippy lints introduced in 1.50 and update nix configs (#1332)
+- Update finch requirement from 0.3.0 to 0.4.1 (#1290)
+- update rand for test, and activate "js" feature for getrandom (#1275)
+- Fix new clippy warnings from Rust 1.49 (#1267)
+- CI: small build fixes (#1252)
+
+Removed:
+
+- Remove 10x support in compute (#1229)
+
+## [0.10.0] - 2020-10-08
+
+Added:
+
+- Add `clear` option to set_abundances(...) method (#1046)
+
+Changed:
+
+- Replace mx by scaled (#1139)
+
+Fixed:
+
+- Fix Rust panic error in signature creation (#1172)
+- Update typed-builder requirement from 0.6.0 to 0.7.0 (#1121)
+- update CI for latest branch name change (#1150)
+- Update typed-builder requirement from 0.6.0 to 0.7.0 (#1121)
+
+## [0.9.0] - 2020-07-13
+
+Added:
+
+- Cache md5sum calculation (#1058)
+- Expose more of the API for wasm (signature and ComputeParameters) (#1058)
+- Getters and setters for ComputeParameters (#1058)
+
+Changed:
+
+- Migrate from failure to thiserror (#1058)
+- Bump MSRV to 1.37 (#1058)
+
+Fixed:
+
+- Use the derive feature in serde instead of serde_derive (#1058)
+- Use nohash-hasher crate instead of previous NoHashHasher from finch.
+- Update typed-builder to 0.6.0 (#1058)
+- stricter niffler versions and add new gz feature to it (#1070)
+
+## [0.8.0] - 2020-06-26
+
+Added:
+
+- compute-optimized MinHash (for small scaled or large cardinalities) (#1045)
+
+## [0.7.0] - 2020-05-12
+
+Changed:
+
+- Hide internal representation in core (#986)
+
+Fixed:
+
+- update FFI and cbindgen (#986)
+
+## [0.6.0] - 2020-04-28
+
+Added:
+
+- Nodegraph implementation based on khmer.Nodegraph (#799)
+
+## [0.5.0] - 2020-02-08
+
+Added:
+
+- add_hash_with_abundance method in core library (#892)
+
+Changed:
+
+- More refactoring of MinHash comparison code (#882)
+- Replace mins_push and abunds_push with set_abundances (#887)
+
+Fixed:
+
+- add_hash with num doesn't set abundances properly (#891)
+
+## [0.4.0] - 2020-01-26
+
+Added:
+
+- Compute improvements: Parameter sets for defining signatures, add_protein implemented (#845)
+- add_many for faster insertion of multiple hashes (#826)
+
+Changed:
+
+- Compare/similarity now have a downsample argument (#856)
+
+Fixed:
+
+- Improve sketching performance with lookup tables for complement and DNA validation (#861) (#865)
+- Use tarpaulin instead of grcov (#862)
+- set up publishing workflow for NPM and crates.io (#824)
+
+## [0.3.0] - 2020-01-05
+
+Added:
+
+- Similarity with abundance method for MinHash (#808)
+- Experimental support for indices in Rust (#773)
+- Experimental SBT with MQF internal nodes in Rust (#772)
+
+Changed:
+
+- Make the sourmash crate library-only (#812)
+
+Fixed:
+
+- Use once_cell instead of lazy_static and lazy-init (#815)
+- Fix mem leak in get_mins (#807)
+- Fixes for WASI and WASM compilation (#771) (#723)
+
+[unreleased]: /~https://github.com/sourmash-bio/sourmash/compare/r0.11.0...HEAD
+[0.11.0]: /~https://github.com/sourmash-bio/sourmash/compare/r0.10.0...r0.11.0
+[0.10.0]: /~https://github.com/sourmash-bio/sourmash/compare/r0.9.0...r0.10.0
+[0.9.0]: /~https://github.com/sourmash-bio/sourmash/compare/r0.9.0...r0.10.0
+[0.8.0]: /~https://github.com/sourmash-bio/sourmash/compare/r0.8.0...r0.9.0
+[0.7.0]: /~https://github.com/sourmash-bio/sourmash/compare/r0.7.0...r0.8.0
+[0.6.0]: /~https://github.com/sourmash-bio/sourmash/compare/r0.6.0...r0.7.0
+[0.5.0]: /~https://github.com/sourmash-bio/sourmash/compare/r0.5.0...r0.6.0
+[0.4.0]: /~https://github.com/sourmash-bio/sourmash/compare/r0.4.0...r0.5.0
+[0.3.0]: /~https://github.com/sourmash-bio/sourmash/compare/r0.3.0...r0.4.0
diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml
index 3de8c37734..daceb338d4 100644
--- a/src/core/Cargo.toml
+++ b/src/core/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "sourmash"
-version = "0.10.0"
+version = "0.11.0"
authors = ["Luiz Irber "]
description = "MinHash sketches for genomic data"
repository = "/~https://github.com/dib-lab/sourmash"
@@ -44,19 +44,6 @@ primal-check = "0.3.1"
thiserror = "1.0"
typed-builder = "0.9.0"
-[target.'cfg(all(target_arch = "wasm32", target_vendor="unknown"))'.dependencies.wasm-bindgen]
-version = "0.2.62"
-features = ["serde-serialize"]
-
-[target.'cfg(all(target_arch = "wasm32", target_vendor="unknown"))'.dev-dependencies]
-wasm-bindgen-test = "0.3.0"
-
-[target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies.assert_cmd]
-version = "1.0.1"
-
-[package.metadata.wasm-pack.profile.release]
-wasm-opt = false # /~https://github.com/rustwasm/wasm-pack/issues/886
-
[dev-dependencies]
assert_matches = "1.3.0"
criterion = "0.3.2"
@@ -82,3 +69,19 @@ harness = false
[[bench]]
name = "minhash"
harness = false
+
+## Wasm section. Crates only used for WASM, as well as specific configurations
+
+[target.'cfg(all(target_arch = "wasm32", target_vendor="unknown"))'.dependencies.wasm-bindgen]
+version = "0.2.62"
+features = ["serde-serialize"]
+
+[target.'cfg(all(target_arch = "wasm32", target_vendor="unknown"))'.dependencies.web-sys]
+version = "0.3.51"
+features = ["console", "File"]
+
+[target.'cfg(all(target_arch = "wasm32", target_vendor="unknown"))'.dev-dependencies]
+wasm-bindgen-test = "0.3.0"
+
+[target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies.assert_cmd]
+version = "1.0.1"
diff --git a/src/core/src/cmd.rs b/src/core/src/cmd.rs
index aef8481405..25c4d8ca1d 100644
--- a/src/core/src/cmd.rs
+++ b/src/core/src/cmd.rs
@@ -1,6 +1,3 @@
-#[cfg(all(target_arch = "wasm32", target_vendor = "unknown"))]
-use wasm_bindgen::prelude::*;
-
use getset::{CopyGetters, Getters, Setters};
use typed_builder::TypedBuilder;
@@ -24,7 +21,7 @@ pub fn prepare(index_path: &str) -> Result<(), Error> {
impl Signature {
pub fn from_params(params: &ComputeParameters) -> Signature {
- let template = build_template(¶ms);
+ let template = build_template(params);
Signature::builder()
.hash_function("0.murmur64")
@@ -36,7 +33,6 @@ impl Signature {
}
#[allow(dead_code)]
-#[cfg_attr(all(target_arch = "wasm32", target_vendor = "unknown"), wasm_bindgen)]
#[derive(TypedBuilder, CopyGetters, Getters, Setters)]
pub struct ComputeParameters {
#[getset(get = "pub", set = "pub")]
diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs
index 0ceddc2cbb..fee92e4bb6 100644
--- a/src/core/src/encodings.rs
+++ b/src/core/src/encodings.rs
@@ -4,12 +4,9 @@ use std::iter::Iterator;
use std::str;
use once_cell::sync::Lazy;
-#[cfg(all(target_arch = "wasm32", target_vendor = "unknown"))]
-use wasm_bindgen::prelude::*;
use crate::Error;
-#[cfg_attr(all(target_arch = "wasm32", target_vendor = "unknown"), wasm_bindgen)]
#[allow(non_camel_case_types)]
#[derive(Debug, Clone, Copy, PartialEq)]
#[repr(u32)]
diff --git a/src/core/src/ffi/minhash.rs b/src/core/src/ffi/minhash.rs
index 1618b9de62..7c3c6358c5 100644
--- a/src/core/src/ffi/minhash.rs
+++ b/src/core/src/ffi/minhash.rs
@@ -366,6 +366,15 @@ unsafe fn kmerminhash_add_from(ptr: *mut SourmashKmerMinHash, other: *const Sour
}
}
+ffi_fn! {
+ unsafe fn kmerminhash_remove_from(ptr: *mut SourmashKmerMinHash, other: *const SourmashKmerMinHash)
+ -> Result<()> {
+ let mh = SourmashKmerMinHash::as_rust_mut(ptr);
+ let other_mh = SourmashKmerMinHash::as_rust(other);
+ mh.remove_from(other_mh)
+ }
+}
+
ffi_fn! {
unsafe fn kmerminhash_count_common(ptr: *const SourmashKmerMinHash, other: *const SourmashKmerMinHash, downsample: bool)
-> Result {
diff --git a/src/core/src/ffi/utils.rs b/src/core/src/ffi/utils.rs
index 69baac7b88..04652cbeef 100644
--- a/src/core/src/ffi/utils.rs
+++ b/src/core/src/ffi/utils.rs
@@ -18,7 +18,7 @@ thread_local! {
pub static LAST_ERROR: RefCell> = RefCell::new(None);
}
-#[allow(clippy::clippy::wrong_self_convention)]
+#[allow(clippy::wrong_self_convention)]
pub trait ForeignObject: Sized {
type RustObject;
diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs
index 507020fe3c..b2ad8c0fc7 100644
--- a/src/core/src/index/mod.rs
+++ b/src/core/src/index/mod.rs
@@ -114,11 +114,11 @@ where
N: Comparable,
{
fn similarity(&self, other: &L) -> f64 {
- (*self).similarity(&other)
+ (*self).similarity(other)
}
fn containment(&self, other: &L) -> f64 {
- (*self).containment(&other)
+ (*self).containment(other)
}
}
@@ -195,7 +195,7 @@ impl SigStore {
// TODO: better matching here, what if it is not a mh?
if let Sketch::MinHash(mh) = &ng.signatures[0] {
if let Sketch::MinHash(omh) = &ong.signatures[0] {
- return mh.count_common(&omh, false).unwrap() as u64;
+ return mh.count_common(omh, false).unwrap() as u64;
}
}
unimplemented!();
@@ -252,7 +252,7 @@ impl Comparable> for SigStore {
// TODO: better matching here, what if it is not a mh?
if let Sketch::MinHash(mh) = &ng.signatures[0] {
if let Sketch::MinHash(omh) = &ong.signatures[0] {
- return mh.similarity(&omh, true, false).unwrap();
+ return mh.similarity(omh, true, false).unwrap();
}
}
@@ -275,7 +275,7 @@ impl Comparable> for SigStore {
// TODO: better matching here, what if it is not a mh?
if let Sketch::MinHash(mh) = &ng.signatures[0] {
if let Sketch::MinHash(omh) = &ong.signatures[0] {
- let common = mh.count_common(&omh, false).unwrap();
+ let common = mh.count_common(omh, false).unwrap();
let size = mh.size();
return common as f64 / size as f64;
}
@@ -290,7 +290,7 @@ impl Comparable for Signature {
// TODO: better matching here, what if it is not a mh?
if let Sketch::MinHash(mh) = &self.signatures[0] {
if let Sketch::MinHash(omh) = &other.signatures[0] {
- return mh.similarity(&omh, true, false).unwrap();
+ return mh.similarity(omh, true, false).unwrap();
}
}
@@ -310,7 +310,7 @@ impl Comparable for Signature {
// TODO: better matching here, what if it is not a mh?
if let Sketch::MinHash(mh) = &self.signatures[0] {
if let Sketch::MinHash(omh) = &other.signatures[0] {
- let common = mh.count_common(&omh, false).unwrap();
+ let common = mh.count_common(omh, false).unwrap();
let size = mh.size();
return common as f64 / size as f64;
}
diff --git a/src/core/src/index/sbt/mhbt.rs b/src/core/src/index/sbt/mhbt.rs
index b3fcbe6987..974318cf29 100644
--- a/src/core/src/index/sbt/mhbt.rs
+++ b/src/core/src/index/sbt/mhbt.rs
@@ -76,13 +76,13 @@ impl Comparable> for Node {
fn similarity(&self, other: &Node) -> f64 {
let ng: &Nodegraph = self.data().unwrap();
let ong: &Nodegraph = other.data().unwrap();
- ng.similarity(&ong)
+ ng.similarity(ong)
}
fn containment(&self, other: &Node) -> f64 {
let ng: &Nodegraph = self.data().unwrap();
let ong: &Nodegraph = other.data().unwrap();
- ng.containment(&ong)
+ ng.containment(ong)
}
}
diff --git a/src/core/src/index/sbt/mod.rs b/src/core/src/index/sbt/mod.rs
index 5d3a25cd05..6638615317 100644
--- a/src/core/src/index/sbt/mod.rs
+++ b/src/core/src/index/sbt/mod.rs
@@ -822,7 +822,7 @@ impl BinaryTree {
let mut similar_node_pos = 0;
let mut current_max = 0;
for (pos, cmpe) in current_round.iter().enumerate() {
- let common = BinaryTree::intersection_size(&next_node, &cmpe);
+ let common = BinaryTree::intersection_size(&next_node, cmpe);
if common > current_max {
current_max = common;
similar_node_pos = pos;
diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs
index f223b32bc0..8da0600b0b 100644
--- a/src/core/src/signature.rs
+++ b/src/core/src/signature.rs
@@ -14,9 +14,6 @@ use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use typed_builder::TypedBuilder;
-#[cfg(all(target_arch = "wasm32", target_vendor = "unknown"))]
-use wasm_bindgen::prelude::*;
-
use crate::encodings::{aa_to_dayhoff, aa_to_hp, revcomp, to_aa, HashFunctions, VALID};
use crate::index::storage::ToWriter;
use crate::sketch::Sketch;
@@ -134,7 +131,7 @@ pub trait SigsTrait {
if hash_function.protein() {
for aa_kmer in seq.windows(ksize) {
- let hash = crate::_hash_murmur(&aa_kmer, self.seed());
+ let hash = crate::_hash_murmur(aa_kmer, self.seed());
self.add_hash(hash);
}
return Ok(());
@@ -151,7 +148,7 @@ pub trait SigsTrait {
};
for aa_kmer in aa_seq.windows(ksize) {
- let hash = crate::_hash_murmur(&aa_kmer, self.seed());
+ let hash = crate::_hash_murmur(aa_kmer, self.seed());
self.add_hash(hash);
}
@@ -242,7 +239,6 @@ impl SigsTrait for Sketch {
}
}
-#[cfg_attr(all(target_arch = "wasm32", target_vendor = "unknown"), wasm_bindgen)]
#[derive(Serialize, Deserialize, Debug, Clone, TypedBuilder)]
pub struct Signature {
#[serde(default = "default_class")]
@@ -478,7 +474,7 @@ impl Signature {
self.signatures
.iter_mut()
.for_each(|sketch| {
- sketch.add_sequence(&seq, force).unwrap(); }
+ sketch.add_sequence(seq, force).unwrap(); }
);
}
}
@@ -498,7 +494,7 @@ impl Signature {
self.signatures
.iter_mut()
.try_for_each(|sketch| {
- sketch.add_protein(&seq) }
+ sketch.add_protein(seq) }
)?;
}
}
diff --git a/src/core/src/sketch/hyperloglog/estimators.rs b/src/core/src/sketch/hyperloglog/estimators.rs
index 4c2fbe02cc..60eadc5201 100644
--- a/src/core/src/sketch/hyperloglog/estimators.rs
+++ b/src/core/src/sketch/hyperloglog/estimators.rs
@@ -120,7 +120,7 @@ pub fn joint_mle(
let mut ceq = vec![0; q + 2];
for (k1_, k2_) in k1.iter().zip(k2.iter()) {
- match k1_.cmp(&k2_) {
+ match k1_.cmp(k2_) {
cmp::Ordering::Less => {
c1[*k1_ as usize] += 1;
cg2[*k2_ as usize] += 1;
diff --git a/src/core/src/sketch/hyperloglog/mod.rs b/src/core/src/sketch/hyperloglog/mod.rs
index 1ee5b89c25..a1bc8eb9d6 100644
--- a/src/core/src/sketch/hyperloglog/mod.rs
+++ b/src/core/src/sketch/hyperloglog/mod.rs
@@ -120,7 +120,7 @@ impl HyperLogLog {
wtr.write_u8(self.p as u8)?; // number of bits used for indexing
wtr.write_u8(self.q as u8)?; // number of bits used for counting leading zeroes
wtr.write_u8(self.ksize as u8)?; // ksize
- wtr.write_all(&self.registers.as_slice())?;
+ wtr.write_all(self.registers.as_slice())?;
Ok(())
}
diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs
index a9d8a9d778..472314d1a6 100644
--- a/src/core/src/sketch/minhash.rs
+++ b/src/core/src/sketch/minhash.rs
@@ -17,9 +17,6 @@ use crate::signature::SigsTrait;
use crate::sketch::hyperloglog::HyperLogLog;
use crate::Error;
-#[cfg(all(target_arch = "wasm32", target_vendor = "unknown"))]
-use wasm_bindgen::prelude::*;
-
pub fn max_hash_for_scaled(scaled: u64) -> u64 {
match scaled {
0 => 0,
@@ -35,7 +32,6 @@ pub fn scaled_for_max_hash(max_hash: u64) -> u64 {
}
}
-#[cfg_attr(all(target_arch = "wasm32", target_vendor = "unknown"), wasm_bindgen)]
#[derive(Debug, TypedBuilder)]
pub struct KmerMinHash {
num: u32,
@@ -327,7 +323,7 @@ impl KmerMinHash {
}
if abundance == 0 {
- // well, don't add it.
+ self.remove_hash(hash);
return;
}
@@ -415,6 +411,13 @@ impl KmerMinHash {
};
}
+ pub fn remove_from(&mut self, other: &KmerMinHash) -> Result<(), Error> {
+ for min in &other.mins {
+ self.remove_hash(*min);
+ }
+ Ok(())
+ }
+
pub fn remove_many(&mut self, hashes: &[u64]) -> Result<(), Error> {
for min in hashes {
self.remove_hash(*min);
@@ -590,8 +593,8 @@ impl KmerMinHash {
self.num,
);
- combined_mh.merge(&self)?;
- combined_mh.merge(&other)?;
+ combined_mh.merge(self)?;
+ combined_mh.merge(other)?;
let it1 = Intersection::new(self.mins.iter(), other.mins.iter());
@@ -623,8 +626,8 @@ impl KmerMinHash {
self.num,
);
- combined_mh.merge(&self)?;
- combined_mh.merge(&other)?;
+ combined_mh.merge(self)?;
+ combined_mh.merge(other)?;
let it1 = Intersection::new(self.mins.iter(), other.mins.iter());
@@ -714,9 +717,9 @@ impl KmerMinHash {
let downsampled_mh = second.downsample_max_hash(first.max_hash)?;
first.similarity(&downsampled_mh, ignore_abundance, false)
} else if ignore_abundance || self.abunds.is_none() || other.abunds.is_none() {
- self.jaccard(&other)
+ self.jaccard(other)
} else {
- self.angular_similarity(&other)
+ self.angular_similarity(other)
}
}
@@ -932,7 +935,6 @@ mod test {
//#############
// A MinHash implementation for low scaled or large cardinalities
-#[cfg_attr(all(target_arch = "wasm32", target_vendor = "unknown"), wasm_bindgen)]
#[derive(Debug, TypedBuilder)]
pub struct KmerMinHashBTree {
num: u32,
@@ -1304,7 +1306,7 @@ impl KmerMinHashBTree {
for hash in &self.mins {
*new_abunds.entry(*hash).or_insert(0) +=
- abunds.get(&hash).unwrap_or(&0) + oabunds.get(&hash).unwrap_or(&0);
+ abunds.get(hash).unwrap_or(&0) + oabunds.get(hash).unwrap_or(&0);
}
self.abunds = Some(new_abunds)
}
@@ -1371,8 +1373,8 @@ impl KmerMinHashBTree {
self.num,
);
- combined_mh.merge(&self)?;
- combined_mh.merge(&other)?;
+ combined_mh.merge(self)?;
+ combined_mh.merge(other)?;
let it1 = Intersection::new(self.mins.iter(), other.mins.iter());
@@ -1403,8 +1405,8 @@ impl KmerMinHashBTree {
self.num,
);
- combined_mh.merge(&self)?;
- combined_mh.merge(&other)?;
+ combined_mh.merge(self)?;
+ combined_mh.merge(other)?;
let it1 = Intersection::new(self.mins.iter(), other.mins.iter());
@@ -1448,7 +1450,7 @@ impl KmerMinHashBTree {
let b_sq: u64 = other_abunds.values().map(|a| (a * a)).sum();
for (hash, value) in abunds.iter() {
- if let Some(oa) = other_abunds.get(&hash) {
+ if let Some(oa) = other_abunds.get(hash) {
prod += value * oa
}
}
@@ -1479,9 +1481,9 @@ impl KmerMinHashBTree {
let downsampled_mh = second.downsample_max_hash(first.max_hash)?;
first.similarity(&downsampled_mh, ignore_abundance, false)
} else if ignore_abundance || self.abunds.is_none() || other.abunds.is_none() {
- self.jaccard(&other)
+ self.jaccard(other)
} else {
- self.angular_similarity(&other)
+ self.angular_similarity(other)
}
}
diff --git a/src/core/src/sketch/nodegraph.rs b/src/core/src/sketch/nodegraph.rs
index 37c89fb4cf..56ee7d5940 100644
--- a/src/core/src/sketch/nodegraph.rs
+++ b/src/core/src/sketch/nodegraph.rs
@@ -202,7 +202,7 @@ impl Nodegraph {
let len = size_of::() * slice.len();
slice::from_raw_parts(slice.as_ptr() as *const u8, len)
};
- wtr.write_all(&buf)?;
+ wtr.write_all(buf)?;
// Replace when byteorder PR is released
if rem != 0 {
@@ -241,22 +241,19 @@ impl Nodegraph {
let byte_size = tablesize / 8 + 1;
let rem = byte_size % 4;
- let blocks: Vec = if rem == 0 {
+ let blocks: Vec = {
let mut blocks = vec![0; byte_size / 4];
rdr.read_u32_into::(&mut blocks)?;
- blocks
- } else {
- let mut blocks = vec![0; byte_size / 4];
- rdr.read_u32_into::(&mut blocks)?;
-
- let mut values = [0u8; 4];
- for item in values.iter_mut().take(rem) {
- let byte = rdr.read_u8().expect("error reading bins");
- *item = byte;
+ if rem != 0 {
+ let mut values = [0u8; 4];
+ for item in values.iter_mut().take(rem) {
+ let byte = rdr.read_u8().expect("error reading bins");
+ *item = byte;
+ }
+ let mut block = vec![0u32; 1];
+ LittleEndian::read_u32_into(&values, &mut block);
+ blocks.push(block[0]);
}
- let mut block = vec![0u32; 1];
- LittleEndian::read_u32_into(&values, &mut block);
- blocks.push(block[0]);
blocks
};
diff --git a/src/core/src/wasm.rs b/src/core/src/wasm.rs
index 3ada5e7f86..ad656d9955 100644
--- a/src/core/src/wasm.rs
+++ b/src/core/src/wasm.rs
@@ -1,11 +1,25 @@
-use wasm_bindgen::prelude::*;
+// When the `wee_alloc` feature is enabled, use `wee_alloc` as the global
+// allocator.
+#[cfg(feature = "wee_alloc")]
+#[global_allocator]
+static ALLOC: wee_alloc::WeeAlloc = wee_alloc::WeeAlloc::INIT;
-use serde_json;
+use wasm_bindgen::prelude::*;
-use crate::cmd::ComputeParameters;
+use crate::cmd::ComputeParameters as _ComputeParameters;
use crate::encodings::HashFunctions;
-use crate::signature::{Signature, SigsTrait};
-use crate::sketch::minhash::KmerMinHash;
+use crate::signature::Signature as _Signature;
+use crate::signature::SigsTrait;
+use crate::sketch::minhash::KmerMinHash as _KmerMinHash;
+
+#[wasm_bindgen]
+pub struct KmerMinHash(_KmerMinHash);
+
+#[wasm_bindgen]
+pub struct Signature(_Signature);
+
+#[wasm_bindgen]
+pub struct ComputeParameters(_ComputeParameters);
#[wasm_bindgen]
impl KmerMinHash {
@@ -32,25 +46,26 @@ impl KmerMinHash {
HashFunctions::murmur64_DNA
};
- KmerMinHash::new(
+ KmerMinHash(_KmerMinHash::new(
scaled as u64,
ksize,
hash_function,
seed as u64,
track_abundance,
num,
- )
+ ))
}
#[wasm_bindgen]
pub fn add_sequence_js(&mut self, buf: &str) {
- self.add_sequence(buf.as_bytes(), true)
+ self.0
+ .add_sequence(buf.as_bytes(), true)
.expect("Error adding sequence");
}
#[wasm_bindgen]
pub fn to_json(&mut self) -> String {
- serde_json::to_string(self).unwrap()
+ serde_json::to_string(&self.0).unwrap()
}
}
@@ -58,8 +73,13 @@ impl KmerMinHash {
impl ComputeParameters {
#[wasm_bindgen(constructor)]
pub fn new_with_params() -> ComputeParameters {
- let params = ComputeParameters::default();
- params
+ let params = _ComputeParameters::default();
+ ComputeParameters(params)
+ }
+
+ #[wasm_bindgen]
+ pub fn set_ksizes(&mut self, ksizes: Vec) {
+ self.0.set_ksizes(ksizes);
}
}
@@ -69,18 +89,28 @@ impl Signature {
pub fn new_from_params(params: &ComputeParameters) -> Signature {
//let params = ComputeParameters::default();
- Signature::from_params(¶ms)
+ Signature(_Signature::from_params(¶ms.0))
}
#[wasm_bindgen]
pub fn add_sequence_js(&mut self, buf: &str) {
- self.add_sequence(buf.as_bytes(), true)
+ self.0
+ .add_sequence(buf.as_bytes(), true)
.expect("Error adding sequence");
}
+ #[wasm_bindgen]
+ pub fn add_from_file(&mut self, fp: web_sys::File) {
+ unimplemented!()
+ }
+
#[wasm_bindgen]
pub fn to_json(&mut self) -> String {
- serde_json::to_string(self).unwrap()
+ serde_json::to_string(&self.0).unwrap()
+ }
+
+ pub fn size(&self) -> usize {
+ self.0.size()
}
}
diff --git a/src/sourmash/__init__.py b/src/sourmash/__init__.py
index 463f718a7a..8735c06cea 100644
--- a/src/sourmash/__init__.py
+++ b/src/sourmash/__init__.py
@@ -111,6 +111,7 @@ def search_sbt_index(*args, **kwargs):
from .sbtmh import create_sbt_index
from . import lca
+from . import tax
from . import sbt
from . import sbtmh
from . import sbt_storage
diff --git a/src/sourmash/cli/__init__.py b/src/sourmash/cli/__init__.py
index c38c3e7afc..1b20c90def 100644
--- a/src/sourmash/cli/__init__.py
+++ b/src/sourmash/cli/__init__.py
@@ -37,6 +37,7 @@
from . import sig as signature
from . import sketch
from . import storage
+from . import tax
class SourmashParser(ArgumentParser):
@@ -92,6 +93,7 @@ def parse_args(self, args=None, namespace=None):
def get_parser():
module_descs = {
+ 'tax': 'Integrate taxonomy information based on "gather" results',
'lca': 'Taxonomic operations',
'sketch': 'Create signatures',
'sig': 'Manipulate signature files',
diff --git a/src/sourmash/cli/compare.py b/src/sourmash/cli/compare.py
index dcec015bd5..f7387f68b2 100644
--- a/src/sourmash/cli/compare.py
+++ b/src/sourmash/cli/compare.py
@@ -1,6 +1,7 @@
"""compare sequence signatures made by compute"""
-from sourmash.cli.utils import add_ksize_arg, add_moltype_args
+from sourmash.cli.utils import (add_ksize_arg, add_moltype_args,
+ add_picklist_args)
def subparser(subparsers):
@@ -47,6 +48,7 @@ def subparser(subparsers):
subparser.add_argument(
'-p', '--processes', metavar='N', type=int, default=None,
help='Number of processes to use to calculate similarity')
+ add_picklist_args(subparser)
def main(args):
diff --git a/src/sourmash/cli/gather.py b/src/sourmash/cli/gather.py
index 3d2e6d1a24..6e0addd427 100644
--- a/src/sourmash/cli/gather.py
+++ b/src/sourmash/cli/gather.py
@@ -1,6 +1,7 @@
"""search a metagenome signature against dbs"""
-from sourmash.cli.utils import add_ksize_arg, add_moltype_args
+from sourmash.cli.utils import (add_ksize_arg, add_moltype_args,
+ add_picklist_args)
def subparser(subparsers):
@@ -60,8 +61,6 @@ def subparser(subparsers):
'--cache-size', default=0, type=int, metavar='N',
help='number of internal SBT nodes to cache in memory (default: 0, cache all nodes)'
)
- add_ksize_arg(subparser, 31)
- add_moltype_args(subparser)
# advanced parameters
subparser.add_argument(
@@ -80,6 +79,10 @@ def subparser(subparsers):
help="use prefetch before gather; see documentation",
)
+ add_ksize_arg(subparser, 31)
+ add_moltype_args(subparser)
+ add_picklist_args(subparser)
+
def main(args):
import sourmash
diff --git a/src/sourmash/cli/index.py b/src/sourmash/cli/index.py
index 1be7f06690..334b394bfe 100644
--- a/src/sourmash/cli/index.py
+++ b/src/sourmash/cli/index.py
@@ -25,7 +25,8 @@
---
"""
-from sourmash.cli.utils import add_moltype_args, add_ksize_arg
+from sourmash.cli.utils import (add_ksize_arg, add_moltype_args,
+ add_picklist_args)
def subparser(subparsers):
@@ -44,7 +45,6 @@ def subparser(subparsers):
'-q', '--quiet', action='store_true',
help='suppress non-error output'
)
- add_ksize_arg(subparser, 31)
subparser.add_argument(
'-d', '--n_children', metavar='D', type=int, default=2,
help='number of children for internal nodes; default=2'
@@ -70,7 +70,9 @@ def subparser(subparsers):
'--scaled', metavar='FLOAT', type=float, default=0,
help='downsample signatures to the specified scaled factor'
)
+ add_ksize_arg(subparser, 31)
add_moltype_args(subparser)
+ add_picklist_args(subparser)
def main(args):
diff --git a/src/sourmash/cli/lca/index.py b/src/sourmash/cli/lca/index.py
index 581ff63dcd..fd205b6f9e 100644
--- a/src/sourmash/cli/lca/index.py
+++ b/src/sourmash/cli/lca/index.py
@@ -1,6 +1,7 @@
"""create LCA database"""
-from sourmash.cli.utils import add_ksize_arg, add_moltype_args
+from sourmash.cli.utils import (add_ksize_arg, add_moltype_args,
+ add_picklist_args)
def subparser(subparsers):
@@ -18,8 +19,6 @@ def subparser(subparsers):
subparser.add_argument(
'--scaled', metavar='S', default=10000, type=float
)
- add_ksize_arg(subparser, 31)
- add_moltype_args(subparser)
subparser.add_argument(
'-q', '--quiet', action='store_true',
help='suppress non-error output'
@@ -42,7 +41,11 @@ def subparser(subparsers):
)
subparser.add_argument(
'--split-identifiers', action='store_true',
- help='split names in signatures on whitspace and period'
+ help='split names in signatures on whitespace'
+ )
+ subparser.add_argument(
+ '--keep-identifier-versions', action='store_true',
+ help='do not remove accession versions'
)
subparser.add_argument('-f', '--force', action='store_true')
subparser.add_argument(
@@ -52,6 +55,14 @@ def subparser(subparsers):
'--require-taxonomy', action='store_true',
help='ignore signatures with no taxonomy entry'
)
+ subparser.add_argument(
+ '--fail-on-missing-taxonomy', action='store_true',
+ help='fail quickly if taxonomy is not available for an identifier',
+ )
+
+ add_ksize_arg(subparser, 31)
+ add_moltype_args(subparser)
+ add_picklist_args(subparser)
def main(args):
diff --git a/src/sourmash/cli/prefetch.py b/src/sourmash/cli/prefetch.py
index 27a254c68e..e04c537193 100644
--- a/src/sourmash/cli/prefetch.py
+++ b/src/sourmash/cli/prefetch.py
@@ -1,6 +1,7 @@
"""search a signature against dbs, find all overlaps"""
-from sourmash.cli.utils import add_ksize_arg, add_moltype_args
+from sourmash.cli.utils import (add_ksize_arg, add_moltype_args,
+ add_picklist_args)
def subparser(subparsers):
@@ -63,6 +64,7 @@ def subparser(subparsers):
)
add_ksize_arg(subparser, 31)
add_moltype_args(subparser)
+ add_picklist_args(subparser)
def main(args):
diff --git a/src/sourmash/cli/search.py b/src/sourmash/cli/search.py
index 9ff4ab9985..c4e1d41323 100644
--- a/src/sourmash/cli/search.py
+++ b/src/sourmash/cli/search.py
@@ -1,6 +1,7 @@
"""search a signature against other signatures"""
-from sourmash.cli.utils import add_ksize_arg, add_moltype_args
+from sourmash.cli.utils import (add_ksize_arg, add_moltype_args,
+ add_picklist_args)
def subparser(subparsers):
@@ -59,6 +60,7 @@ def subparser(subparsers):
)
add_ksize_arg(subparser, 31)
add_moltype_args(subparser)
+ add_picklist_args(subparser)
def main(args):
diff --git a/src/sourmash/cli/sig/__init__.py b/src/sourmash/cli/sig/__init__.py
index 36a224ef86..d9d71463d9 100644
--- a/src/sourmash/cli/sig/__init__.py
+++ b/src/sourmash/cli/sig/__init__.py
@@ -12,6 +12,7 @@
from . import filter
from . import flatten
from . import intersect
+from . import manifest
from . import merge
from . import rename
from . import subtract
diff --git a/src/sourmash/cli/sig/cat.py b/src/sourmash/cli/sig/cat.py
index 99d53090d7..6da03f886e 100644
--- a/src/sourmash/cli/sig/cat.py
+++ b/src/sourmash/cli/sig/cat.py
@@ -6,7 +6,11 @@
def subparser(subparsers):
subparser = subparsers.add_parser('cat')
- subparser.add_argument('signatures', nargs='+')
+ subparser.add_argument('signatures', nargs='*')
+ subparser.add_argument(
+ '--from-file',
+ help='a text file containing a list of files to load signatures from'
+ )
subparser.add_argument(
'-q', '--quiet', action='store_true',
help='suppress non-error output'
@@ -19,6 +23,10 @@ def subparser(subparsers):
'-u', '--unique', action='store_true',
help='keep only distinct signatures, removing duplicates (based on md5sum)'
)
+ subparser.add_argument(
+ '-f', '--force', action='store_true',
+ help='try to load all files as signatures'
+ )
def main(args):
diff --git a/src/sourmash/cli/sig/extract.py b/src/sourmash/cli/sig/extract.py
index d2066e8bcc..9ea71eb229 100644
--- a/src/sourmash/cli/sig/extract.py
+++ b/src/sourmash/cli/sig/extract.py
@@ -2,7 +2,8 @@
import sys
-from sourmash.cli.utils import add_moltype_args, add_ksize_arg
+from sourmash.cli.utils import (add_moltype_args, add_ksize_arg,
+ add_picklist_args)
def subparser(subparsers):
@@ -27,6 +28,7 @@ def subparser(subparsers):
)
add_ksize_arg(subparser, 31)
add_moltype_args(subparser)
+ add_picklist_args(subparser)
def main(args):
diff --git a/src/sourmash/cli/sig/manifest.py b/src/sourmash/cli/sig/manifest.py
new file mode 100644
index 0000000000..805c9d1a56
--- /dev/null
+++ b/src/sourmash/cli/sig/manifest.py
@@ -0,0 +1,27 @@
+"""create a manifest for a collection of signatures"""
+
+import sourmash
+from sourmash.logging import notify, print_results, error
+
+
+def subparser(subparsers):
+ subparser = subparsers.add_parser('manifest')
+ subparser.add_argument('location')
+ subparser.add_argument(
+ '-q', '--quiet', action='store_true',
+ help='suppress non-error output'
+ )
+ subparser.add_argument(
+ '-o', '--output', '--csv', metavar='FILE',
+ help='output information to a CSV file',
+ required=True,
+ )
+ subparser.add_argument(
+ '-f', '--force', action='store_true',
+ help='try to load all files as signatures'
+ )
+
+
+def main(args):
+ import sourmash
+ return sourmash.sig.__main__.manifest(args)
diff --git a/src/sourmash/cli/tax/__init__.py b/src/sourmash/cli/tax/__init__.py
new file mode 100644
index 0000000000..2a9b5b0302
--- /dev/null
+++ b/src/sourmash/cli/tax/__init__.py
@@ -0,0 +1,33 @@
+"""Define the command line interface for sourmash tax
+
+The top level CLI is defined in ../__init__.py. This module defines the CLI for
+`sourmash tax` operations.
+"""
+
+from . import metagenome
+from . import genome
+from . import annotate
+from . import prepare
+from ..utils import command_list
+from argparse import SUPPRESS, RawDescriptionHelpFormatter
+import os
+import sys
+
+
+def subparser(subparsers):
+ subparser = subparsers.add_parser('tax', formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS, aliases=['taxonomy'])
+ desc = 'Operations\n'
+ clidir = os.path.dirname(__file__)
+ ops = command_list(clidir)
+ for subcmd in ops:
+ docstring = getattr(sys.modules[__name__], subcmd).__doc__
+ helpstring = 'sourmash tax {op:s} --help'.format(op=subcmd)
+ desc += ' {hs:33s} {ds:s}\n'.format(hs=helpstring, ds=docstring)
+ s = subparser.add_subparsers(
+ title="Integrate taxonomy information based on 'gather' results", dest='subcmd', metavar='subcmd', help=SUPPRESS,
+ description=desc
+ )
+ for subcmd in ops:
+ getattr(sys.modules[__name__], subcmd).subparser(s)
+ subparser._action_groups.reverse()
+ subparser._optionals.title = 'Options'
diff --git a/src/sourmash/cli/tax/annotate.py b/src/sourmash/cli/tax/annotate.py
new file mode 100644
index 0000000000..f25a532554
--- /dev/null
+++ b/src/sourmash/cli/tax/annotate.py
@@ -0,0 +1,67 @@
+"""annotate gather results with taxonomy information"""
+
+usage="""
+
+ sourmash tax annotate --gather-csv [ ... ] --taxonomy-csv [ ... ]
+
+The 'tax annotate' command reads in gather results CSVs and annotates them
+ with taxonomic information.
+
+By default, `tax annotate` produces a gather CSV with an additional `lineage`
+ column containing the taxonomic information for each database match.
+
+Please see the 'tax annotate' documentation for more details:
+ https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-tax-annotate-annotates-gather-output-with-taxonomy
+"""
+
+import sourmash
+from sourmash.logging import notify, print_results, error
+
+
+def subparser(subparsers):
+ subparser = subparsers.add_parser('annotate',
+ aliases=['annotate'],
+ usage=usage)
+ subparser.add_argument(
+ '-g', '--gather-csv', nargs='*', default = [],
+ help='CSV output files from sourmash gather'
+ )
+ subparser.add_argument(
+ '--from-file', metavar='FILE', default=None,
+ help='input many gather results as a text file, with one gather CSV per line'
+ )
+ subparser.add_argument(
+ '-q', '--quiet', action='store_true',
+ help='suppress non-error output'
+ )
+ subparser.add_argument(
+ '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE',
+ nargs="+", required=True,
+ help='database lineages CSV'
+ )
+ subparser.add_argument(
+ '-o', '--output-dir', default= "",
+ help='directory for output files'
+ )
+ subparser.add_argument(
+ '--keep-full-identifiers', action='store_true',
+ help='do not split identifiers on whitespace'
+ )
+ subparser.add_argument(
+ '--keep-identifier-versions', action='store_true',
+ help='after splitting identifiers, do not remove accession versions'
+ )
+ subparser.add_argument(
+ '--fail-on-missing-taxonomy', action='store_true',
+ help='fail quickly if taxonomy is not available for an identifier',
+ )
+ subparser.add_argument(
+ '-f', '--force', action = 'store_true',
+ help='continue past errors in file and taxonomy loading',
+ )
+
+def main(args):
+ import sourmash
+ if not args.gather_csv and not args.from_file:
+ raise ValueError(f"No gather CSVs found! Please input via `-g` or `--from-file`.")
+ return sourmash.tax.__main__.annotate(args)
diff --git a/src/sourmash/cli/tax/genome.py b/src/sourmash/cli/tax/genome.py
new file mode 100644
index 0000000000..f37fb35c8a
--- /dev/null
+++ b/src/sourmash/cli/tax/genome.py
@@ -0,0 +1,102 @@
+"""classify genomes from gather results"""
+
+usage="""
+
+ sourmash tax genome --gather-csv [ ... ] --taxonomy-csv [ ... ]
+
+The 'tax genome' command reads in genome gather result CSVs and reports likely
+classification for each query genome.
+
+By default, classification uses a containment threshold of 0.1, meaning at least
+10 percent of the query was covered by matches with the reported taxonomic rank and lineage.
+You can specify an alternate classification threshold or force classification by
+taxonomic rank instead, e.g. at species or genus-level.
+
+The default output format consists of five columns,
+ `query_name,status,rank,fraction,lineage`, where `fraction` is the fraction
+ of the query matched to the reported rank and lineage. The `status` column
+ provides additional information on the classification, and can be:
+ - `match` - this query was classified
+ - `nomatch`- this query could not be classified
+ - `below_threshold` - this query was classified at the specified rank,
+ but the query fraction matched was below the containment threshold
+
+Optionally, you can report classifications in `krona` format, but note
+that this forces classification by rank, rather than containment threshold.
+
+Please see the 'tax genome' documentation for more details:
+ https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-tax-genome-classify-a-genome-using-gather-results
+"""
+
+import argparse
+import sourmash
+from sourmash.logging import notify, print_results, error
+from sourmash.cli.utils import add_tax_threshold_arg
+
+def subparser(subparsers):
+ subparser = subparsers.add_parser('genome',
+ aliases=['classify'],
+ usage=usage)
+ subparser.add_argument(
+ '-g', '--gather-csv', nargs='*', default = [],
+ help='CSVs output by sourmash gather for this sample'
+ )
+ subparser.add_argument(
+ '--from-file', metavar='FILE', default=None,
+ help='input many gather results as a text file, with one gather CSV per line'
+ )
+ subparser.add_argument(
+ '-q', '--quiet', action='store_true',
+ help='suppress non-error output'
+ )
+ subparser.add_argument(
+ '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE',
+ nargs='+', required=True,
+ help='database lineages CSV'
+ )
+ subparser.add_argument(
+ '-o', '--output-base', default='-',
+ help='base filepath for output file(s) (default stdout)'
+ )
+ subparser.add_argument(
+ '--output-dir', default= "",
+ help='directory for output files'
+ )
+ subparser.add_argument(
+ '-r', '--rank', choices=['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'],
+ help='Summarize genome taxonomy at this rank and above. Note that the taxonomy CSV must contain lineage information at this rank.'
+ )
+ subparser.add_argument(
+ '--keep-full-identifiers', action='store_true',
+ help='do not split identifiers on whitespace'
+ )
+ subparser.add_argument(
+ '--keep-identifier-versions', action='store_true',
+ help='after splitting identifiers, do not remove accession versions'
+ )
+ subparser.add_argument(
+ '--fail-on-missing-taxonomy', action='store_true',
+ help='fail quickly if taxonomy is not available for an identifier',
+ )
+ subparser.add_argument(
+ '--output-format', default=['csv_summary'], nargs='+', choices=["csv_summary", "krona"],
+ help='choose output format(s)',
+ )
+ subparser.add_argument(
+ '-f', '--force', action = 'store_true',
+ help='continue past survivable errors in loading taxonomy database or gather results',
+ )
+ add_tax_threshold_arg(subparser, 0.1)
+
+
+def main(args):
+ import sourmash
+ if not args.gather_csv and not args.from_file:
+ raise ValueError(f"No gather CSVs found! Please input via `-g` or `--from-file`.")
+ if len(args.output_format) > 1:
+ if args.output_base == "-":
+ raise TypeError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}")
+ if not args.rank:
+ if any(x in ["krona"] for x in args.output_format):
+ raise ValueError(f"Rank (--rank) is required for krona output format.")
+ return sourmash.tax.__main__.genome(args)
diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py
new file mode 100644
index 0000000000..4eb6b352db
--- /dev/null
+++ b/src/sourmash/cli/tax/metagenome.py
@@ -0,0 +1,90 @@
+"""summarize metagenome gather results"""
+
+usage="""
+
+ sourmash tax metagenome --gather-csv [ ... ] --taxonomy-csv [ ... ]
+
+The 'tax metagenome' command reads in metagenome gather result CSVs and
+summarizes by taxonomic lineage.
+
+The default output format consists of four columns,
+ `query_name,rank,fraction,lineage`, where `fraction` is the fraction
+ of the query matched to that reported rank and lineage. The summarization
+ is reported for each taxonomic rank.
+
+Alternatively, you can output results at a specific rank (e.g. species)
+in `krona` or `lineage_summary` formats.
+
+Please see the 'tax metagenome' documentation for more details:
+ https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-tax-metagenome-summarize-metagenome-content-from-gather-results
+"""
+
+import sourmash
+from sourmash.logging import notify, print_results, error
+
+
+def subparser(subparsers):
+ subparser = subparsers.add_parser('metagenome',
+ aliases=['summarize'],
+ usage=usage)
+ subparser.add_argument(
+ '-g', '--gather-csv', nargs='*', default = [],
+ help='CSVs from sourmash gather'
+ )
+ subparser.add_argument(
+ '--from-file', metavar='FILE', default = None,
+ help='input many gather results as a text file, with one gather CSV per line'
+ )
+ subparser.add_argument(
+ '-q', '--quiet', action='store_true',
+ help='suppress non-error output'
+ )
+ subparser.add_argument(
+ '-o', '--output-base', default='-',
+ help='base filepath for output file(s) (default stdout)'
+ )
+ subparser.add_argument(
+ '--output-dir', default= "",
+ help='directory for output files'
+ )
+ subparser.add_argument(
+ '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE',
+ nargs='+', required=True,
+ help='database lineages CSV'
+ )
+ subparser.add_argument(
+ '--keep-full-identifiers', action='store_true',
+ help='do not split identifiers on whitespace'
+ )
+ subparser.add_argument(
+ '--keep-identifier-versions', action='store_true',
+ help='after splitting identifiers, do not remove accession versions'
+ )
+ subparser.add_argument(
+ '--fail-on-missing-taxonomy', action='store_true',
+ help='fail quickly if taxonomy is not available for an identifier',
+ )
+ subparser.add_argument(
+ '--output-format', default=['csv_summary'], nargs='+', choices=["csv_summary", "krona", "lineage_summary"],
+ help='choose output format(s)',
+ )
+ subparser.add_argument(
+ '-r', '--rank', choices=['strain','species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'],
+ help='For non-default output formats: Summarize genome taxonomy at this rank and above. Note that the taxonomy CSV must contain lineage information at this rank.'
+ )
+ subparser.add_argument(
+ '-f', '--force', action = 'store_true',
+ help='continue past errors in taxonomy database loading',
+ )
+
+def main(args):
+ import sourmash
+ if not args.gather_csv and not args.from_file:
+ raise ValueError(f"No gather CSVs found! Please input via `-g` or `--from-file`.")
+ if len(args.output_format) > 1:
+ if args.output_base == "-":
+ raise TypeError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}")
+ if not args.rank:
+ if any(x in ["krona", "lineage_summary"] for x in args.output_format):
+ raise ValueError(f"Rank (--rank) is required for krona and lineage_summary output formats.")
+ return sourmash.tax.__main__.metagenome(args)
diff --git a/src/sourmash/cli/tax/prepare.py b/src/sourmash/cli/tax/prepare.py
new file mode 100644
index 0000000000..01d978c73e
--- /dev/null
+++ b/src/sourmash/cli/tax/prepare.py
@@ -0,0 +1,60 @@
+"""combine multiple taxonomy databases into one."""
+
+usage="""
+
+ sourmash tax prepare --taxonomy-csv [ ... ] -o
+
+The 'tax prepare' command reads in one or more taxonomy databases
+and saves them into a new database. It can be used to combine databases
+in the desired order, as well as output different database formats.
+
+Please see the 'tax prepare' documentation for more details:
+ https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-tax-prepare-prepare-and-or-combine-taxonomy-files
+"""
+
+import sourmash
+from sourmash.logging import notify, print_results, error
+
+
+def subparser(subparsers):
+ subparser = subparsers.add_parser('prepare',
+ usage=usage)
+ subparser.add_argument(
+ '-q', '--quiet', action='store_true',
+ help='suppress non-error output'
+ )
+ subparser.add_argument(
+ '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE',
+ nargs="+", required=True,
+ help='database lineages'
+ )
+ subparser.add_argument(
+ '-o', '--output', required=True,
+ help='output file',
+ )
+ subparser.add_argument(
+ '-F', '--database-format',
+ help="format of output file; default is 'sql')",
+ default='sql',
+ choices=['csv', 'sql'],
+ )
+ subparser.add_argument(
+ '--keep-full-identifiers', action='store_true',
+ help='do not split identifiers on whitespace'
+ )
+ subparser.add_argument(
+ '--keep-identifier-versions', action='store_true',
+ help='after splitting identifiers, do not remove accession versions'
+ )
+ subparser.add_argument(
+ '--fail-on-missing-taxonomy', action='store_true',
+ help='fail quickly if taxonomy is not available for an identifier',
+ )
+ subparser.add_argument(
+ '-f', '--force', action = 'store_true',
+ help='continue past errors in file and taxonomy loading',
+ )
+
+def main(args):
+ import sourmash
+ return sourmash.tax.__main__.prepare(args)
diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py
index 4bb918643a..a074357bd1 100644
--- a/src/sourmash/cli/utils.py
+++ b/src/sourmash/cli/utils.py
@@ -1,5 +1,6 @@
from glob import glob
import os
+import argparse
def add_moltype_args(parser):
@@ -31,10 +32,10 @@ def add_moltype_args(parser):
parser.set_defaults(hp=False)
parser.add_argument(
- '--dna', '--rna', dest='dna', default=None, action='store_true',
+ '--dna', '--rna', '--nucleotide', dest='dna', default=None, action='store_true',
help='choose a nucleotide signature (default: True)')
parser.add_argument(
- '--no-dna', '--no-rna', dest='dna', action='store_false',
+ '--no-dna', '--no-rna', '--no-nucleotide', dest='dna', action='store_false',
help='do not choose a nucleotide signature')
parser.set_defaults(dna=None)
@@ -50,6 +51,37 @@ def add_ksize_arg(parser, default=31):
help='k-mer size; default={d}'.format(d=default)
)
+#https://stackoverflow.com/questions/55324449/how-to-specify-a-minimum-or-maximum-float-value-with-argparse#55410582
+def range_limited_float_type(arg):
+ """ Type function for argparse - a float within some predefined bounds """
+ min_val = 0
+ max_val = 1
+ try:
+ f = float(arg)
+ except ValueError:
+ raise argparse.ArgumentTypeError("\n\tERROR: Must be a floating point number.")
+ if f < min_val or f > max_val:
+ raise argparse.ArgumentTypeError(f"\n\tERROR: Argument must be >{str(min_val)} and <{str(max_val)}.")
+ return f
+
+
+def add_tax_threshold_arg(parser, default=0.1):
+ parser.add_argument(
+ '--containment-threshold', default=default, type=range_limited_float_type,
+ help=f'minimum containment threshold for classification; default={default}'
+ )
+
+
+def add_picklist_args(parser):
+ parser.add_argument(
+ '--picklist', default=None,
+ help="select signatures based on a picklist, i.e. 'file.csv:colname:coltype'"
+ )
+ parser.add_argument(
+ '--picklist-require-all', default=False, action='store_true',
+ help="require that all picklist values be found or else fail"
+ )
+
def opfilter(path):
return not path.startswith('__') and path not in ['utils']
diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
index 02a04969fb..2843345f19 100644
--- a/src/sourmash/commands.py
+++ b/src/sourmash/commands.py
@@ -5,7 +5,6 @@
import os
import os.path
import sys
-import copy
import screed
from .compare import (compare_all_pairs, compare_serial_containment,
@@ -29,6 +28,7 @@ def compare(args):
set_quiet(args.quiet)
moltype = sourmash_args.calculate_moltype(args)
+ picklist = sourmash_args.load_picklist(args)
inp_files = list(args.signatures)
if args.from_file:
@@ -46,11 +46,12 @@ def compare(args):
loaded = sourmash_args.load_file_as_signatures(filename,
ksize=args.ksize,
select_moltype=moltype,
+ picklist=picklist,
yield_all_files=args.force,
progress=progress)
loaded = list(loaded)
if not loaded:
- notify('\nwarning: no signatures loaded at given ksize/molecule type from {}', filename)
+ notify('\nwarning: no signatures loaded at given ksize/molecule type/picklist from {}', filename)
siglist.extend(loaded)
# track ksizes/moltypes
@@ -80,6 +81,9 @@ def compare(args):
notify(' '*79, end='\r')
notify('loaded {} signatures total.'.format(len(siglist)))
+ if picklist:
+ sourmash_args.report_picklist(args, picklist)
+
# check to make sure they're potentially compatible - either using
# scaled, or not.
scaled_sigs = [s.minhash.scaled for s in siglist]
@@ -337,6 +341,7 @@ def index(args):
"""
set_quiet(args.quiet)
moltype = sourmash_args.calculate_moltype(args)
+ picklist = sourmash_args.load_picklist(args)
if args.append:
tree = load_sbt_index(args.sbt_name)
@@ -373,6 +378,7 @@ def index(args):
ksize=args.ksize,
select_moltype=moltype,
yield_all_files=args.force,
+ picklist=picklist,
progress=progress)
# load all matching signatures in this file
@@ -418,6 +424,9 @@ def index(args):
error('no signatures found to load into tree!? failing.')
sys.exit(-1)
+ if picklist:
+ sourmash_args.report_picklist(args, picklist)
+
notify('loaded {} sigs; saving SBT under "{}"', n, args.sbt_name)
tree.save(args.sbt_name, sparseness=args.sparseness)
if tree.storage:
@@ -430,6 +439,7 @@ def search(args):
set_quiet(args.quiet)
moltype = sourmash_args.calculate_moltype(args)
+ picklist = sourmash_args.load_picklist(args)
# set up the query.
query = sourmash_args.load_query_signature(args.query,
@@ -459,7 +469,8 @@ def search(args):
sys.exit(-1)
databases = sourmash_args.load_dbs_and_sigs(args.databases, query,
- not is_containment)
+ not is_containment,
+ picklist=picklist)
if not len(databases):
error('Nothing found to search!')
@@ -532,6 +543,9 @@ def search(args):
for sr in results:
save_sig.add(sr.match)
+ if picklist:
+ sourmash_args.report_picklist(args, picklist)
+
def categorize(args):
"Use a database to find the best match to many signatures."
@@ -578,7 +592,7 @@ def _yield_all_sigs(queries, ksize, moltype):
orig_query.minhash.ksize, orig_query.minhash.moltype)
if args.ignore_abundance:
- query = copy.copy(orig_query)
+ query = orig_query.copy()
query.minhash = query.minhash.flatten()
else:
if orig_query.minhash.track_abundance:
@@ -612,10 +626,11 @@ def _yield_all_sigs(queries, ksize, moltype):
def gather(args):
- from .search import gather_databases, format_bp
+ from .search import GatherDatabases, format_bp
set_quiet(args.quiet, args.debug)
moltype = sourmash_args.calculate_moltype(args)
+ picklist = sourmash_args.load_picklist(args)
# load the query signature & figure out all the things
query = sourmash_args.load_query_signature(args.query,
@@ -647,7 +662,8 @@ def gather(args):
if args.cache_size == 0:
cache_size = None
databases = sourmash_args.load_dbs_and_sigs(args.databases, query, False,
- cache_size=cache_size)
+ cache_size=cache_size,
+ picklist=picklist)
if not len(databases):
error('Nothing found to search!')
@@ -658,21 +674,36 @@ def gather(args):
if args.prefetch: # note: on by default!
notify("Starting prefetch sweep across databases.")
- prefetch_query = copy.copy(query)
+ prefetch_query = query.copy()
prefetch_query.minhash = prefetch_query.minhash.flatten()
+ noident_mh = prefetch_query.minhash.to_mutable()
save_prefetch = SaveSignaturesToLocation(args.save_prefetch)
save_prefetch.open()
counters = []
for db in databases:
- counter = db.counter_gather(prefetch_query, args.threshold_bp)
+ counter = None
+ try:
+ counter = db.counter_gather(prefetch_query, args.threshold_bp)
+ except ValueError:
+ if picklist:
+ # catch "no signatures to search" ValueError...
+ continue
+ else:
+ raise # re-raise other errors, if no picklist.
+
save_prefetch.add_many(counter.siglist)
+ # subtract found hashes as we can.
+ for found_sig in counter.siglist:
+ noident_mh.remove_many(found_sig.minhash)
counters.append(counter)
notify(f"Found {len(save_prefetch)} signatures via prefetch; now doing gather.")
save_prefetch.close()
else:
counters = databases
+ # we can't track unidentified hashes w/o prefetch
+ noident_mh = None
## ok! now do gather -
@@ -680,11 +711,12 @@ def gather(args):
weighted_missed = 1
is_abundance = query.minhash.track_abundance and not args.ignore_abundance
orig_query_mh = query.minhash
- next_query = query
+ gather_iter = GatherDatabases(query, counters,
+ threshold_bp=args.threshold_bp,
+ ignore_abundance=args.ignore_abundance,
+ noident_mh=noident_mh)
- gather_iter = gather_databases(query, counters, args.threshold_bp,
- args.ignore_abundance)
- for result, weighted_missed, next_query in gather_iter:
+ for result, weighted_missed in gather_iter:
if not len(found): # first result? print header.
if is_abundance:
print_results("")
@@ -716,6 +748,11 @@ def gather(args):
break
+ # report on thresholding -
+ if gather_iter.query:
+ # if still a query, then we failed the threshold.
+ notify(f'found less than {format_bp(args.threshold_bp)} in common. => exiting')
+
# basic reporting:
print_results(f'\nfound {len(found)} matches total;')
if args.num_results and len(found) == args.num_results:
@@ -724,6 +761,8 @@ def gather(args):
p_covered = (1 - weighted_missed) * 100
print_results(f'the recovered matches hit {p_covered:.1f}% of the query')
print_results('')
+ if gather_iter.scaled != query.minhash.scaled:
+ print_results(f'WARNING: final scaled was {gather_iter.scaled}, vs query scaled of {query.minhash.scaled}')
# save CSV?
if found and args.output:
@@ -731,7 +770,8 @@ def gather(args):
'f_unique_to_query', 'f_unique_weighted',
'average_abund', 'median_abund', 'std_abund', 'name',
'filename', 'md5', 'f_match_orig', 'unique_intersect_bp',
- 'gather_result_rank', 'remaining_bp']
+ 'gather_result_rank', 'remaining_bp',
+ 'query_filename', 'query_name', 'query_md5', 'query_bp']
with FileOutputCSV(args.output) as fp:
w = csv.DictWriter(fp, fieldnames=fieldnames)
@@ -750,31 +790,41 @@ def gather(args):
# save unassigned hashes?
if args.output_unassigned:
- if not len(next_query.minhash):
+ remaining_query = gather_iter.query
+ if not (remaining_query.minhash or noident_mh):
notify('no unassigned hashes to save with --output-unassigned!')
else:
notify(f"saving unassigned hashes to '{args.output_unassigned}'")
+ if noident_mh:
+ remaining_mh = remaining_query.minhash.to_mutable()
+ remaining_mh += noident_mh
+ remaining_query.minhash = remaining_mh
+
if is_abundance:
- # next_query is flattened; reinflate abundances
- hashes = set(next_query.minhash.hashes)
+ # remaining_query is flattened; reinflate abundances
+ hashes = set(remaining_query.minhash.hashes)
orig_abunds = orig_query_mh.hashes
abunds = { h: orig_abunds[h] for h in hashes }
abund_query_mh = orig_query_mh.copy_and_clear()
# orig_query might have been downsampled...
- abund_query_mh.downsample(scaled=next_query.minhash.scaled)
+ abund_query_mh.downsample(scaled=gather_iter.scaled)
abund_query_mh.set_abundances(abunds)
- next_query.minhash = abund_query_mh
+ remaining_query.minhash = abund_query_mh
with FileOutput(args.output_unassigned, 'wt') as fp:
- sig.save_signatures([ next_query ], fp)
+ sig.save_signatures([ remaining_query ], fp)
+
+ if picklist:
+ sourmash_args.report_picklist(args, picklist)
+
# DONE w/gather function.
def multigather(args):
"Gather many signatures against multiple databases."
- from .search import gather_databases, format_bp
+ from .search import GatherDatabases, format_bp
set_quiet(args.quiet)
moltype = sourmash_args.calculate_moltype(args)
@@ -830,18 +880,25 @@ def multigather(args):
continue
counters = []
- prefetch_query = copy.copy(query)
+ prefetch_query = query.copy()
prefetch_query.minhash = prefetch_query.minhash.flatten()
+ noident_mh = prefetch_query.minhash.to_mutable()
counters = []
for db in databases:
counter = db.counter_gather(prefetch_query, args.threshold_bp)
+ for found_sig in counter.siglist:
+ noident_mh.remove_many(found_sig.minhash)
counters.append(counter)
found = []
weighted_missed = 1
is_abundance = query.minhash.track_abundance and not args.ignore_abundance
- for result, weighted_missed, next_query in gather_databases(query, counters, args.threshold_bp, args.ignore_abundance):
+ gather_iter = GatherDatabases(query, counters,
+ threshold_bp=args.threshold_bp,
+ ignore_abundance=args.ignore_abundance,
+ noident_mh=noident_mh)
+ for result, weighted_missed in gather_iter:
if not len(found): # first result? print header.
if is_abundance:
print_results("")
@@ -869,6 +926,10 @@ def multigather(args):
name)
found.append(result)
+ # report on thresholding -
+ if gather_iter.query.minhash:
+ # if still a query, then we failed the threshold.
+ notify(f'found less than {format_bp(args.threshold_bp)} in common. => exiting')
# basic reporting
print_results('\nfound {} matches total;', len(found))
@@ -894,7 +955,8 @@ def multigather(args):
'average_abund', 'median_abund', 'std_abund', 'name',
'filename', 'md5', 'f_match_orig',
'unique_intersect_bp', 'gather_result_rank',
- 'remaining_bp']
+ 'remaining_bp', 'query_filename', 'query_name',
+ 'query_md5', 'query_bp']
with FileOutputCSV(output_csv) as fp:
w = csv.DictWriter(fp, fieldnames=fieldnames)
w.writeheader()
@@ -911,18 +973,21 @@ def multigather(args):
output_unassigned = output_base + '.unassigned.sig'
with open(output_unassigned, 'wt') as fp:
+ remaining_query = gather_iter.query
+ if noident_mh:
+ remaining_mh = remaining_query.minhash.to_mutable()
+ remaining_mh += noident_mh.downsample(scaled=remaining_mh.scaled)
+ remaining_query.minhash = remaining_mh
+
if not found:
notify('nothing found - entire query signature unassigned.')
- elif not len(query.minhash):
+ elif not remaining_query:
notify('no unassigned hashes! not saving.')
else:
notify('saving unassigned hashes to "{}"', output_unassigned)
- e = MinHash(ksize=query.minhash.ksize, n=0,
- scaled=next_query.minhash.scaled)
- e.add_many(next_query.minhash.hashes)
# CTB: note, multigather does not save abundances
- sig.save_signatures([ sig.SourmashSignature(e) ], fp)
+ sig.save_signatures([ remaining_query ], fp)
n += 1
# fini, next query!
@@ -1051,6 +1116,7 @@ def prefetch(args):
# figure out what k-mer size and molecule type we're looking for here
ksize = args.ksize
moltype = sourmash_args.calculate_moltype(args)
+ picklist = sourmash_args.load_picklist(args)
# load the query signature & figure out all the things
query = sourmash_args.load_query_signature(args.query,
@@ -1106,6 +1172,7 @@ def prefetch(args):
# iterate over signatures in db one at a time, for each db;
# find those with sufficient overlap
+ ident_mh = query_mh.copy_and_clear()
noident_mh = query_mh.to_mutable()
did_a_search = False # track whether we did _any_ search at all!
@@ -1119,7 +1186,8 @@ def prefetch(args):
db = LazyLinearIndex(db)
db = db.select(ksize=ksize, moltype=moltype,
- containment=True, scaled=True)
+ containment=True, scaled=True,
+ picklist=picklist)
if not db:
notify(f"...no compatible signatures in '{dbfilename}'; skipping")
@@ -1128,8 +1196,10 @@ def prefetch(args):
for result in prefetch_database(query, db, args.threshold_bp):
match = result.match
- # track remaining "untouched" hashes.
- noident_mh.remove_many(match.minhash.hashes)
+ # track found & "untouched" hashes.
+ match_mh = match.minhash.downsample(scaled=query.minhash.scaled)
+ ident_mh += query.minhash & match_mh.flatten()
+ noident_mh.remove_many(match.minhash)
# output match info as we go
if csvout_fp:
@@ -1165,24 +1235,35 @@ def prefetch(args):
notify(f"saved {matches_out.count} matches to CSV file '{args.output}'")
csvout_fp.close()
- matched_query_mh = query_mh.to_mutable()
- matched_query_mh.remove_many(noident_mh.hashes)
- notify(f"of {len(query_mh)} distinct query hashes, {len(matched_query_mh)} were found in matches above threshold.")
+ assert len(query_mh) == len(ident_mh) + len(noident_mh)
+ notify(f"of {len(query_mh)} distinct query hashes, {len(ident_mh)} were found in matches above threshold.")
notify(f"a total of {len(noident_mh)} query hashes remain unmatched.")
if args.save_matching_hashes:
filename = args.save_matching_hashes
- notify(f"saving {len(matched_query_mh)} matched hashes to '{filename}'")
- ss = sig.SourmashSignature(matched_query_mh)
+ notify(f"saving {len(ident_mh)} matched hashes to '{filename}'")
+
+ sig_name = ''
+ if query.name:
+ sig_name = f"{query.name}-known"
+
+ ss = sig.SourmashSignature(ident_mh, name=sig_name)
with open(filename, "wt") as fp:
sig.save_signatures([ss], fp)
if args.save_unmatched_hashes:
filename = args.save_unmatched_hashes
+
+ sig_name = ''
+ if query.name:
+ sig_name = f"{query.name}-unknown"
+
notify(f"saving {len(noident_mh)} unmatched hashes to '{filename}'")
- ss = sig.SourmashSignature(noident_mh)
+ ss = sig.SourmashSignature(noident_mh, name=sig_name)
with open(filename, "wt") as fp:
sig.save_signatures([ss], fp)
+ if picklist:
+ sourmash_args.report_picklist(args, picklist)
+
return 0
-
diff --git a/src/sourmash/index.py b/src/sourmash/index.py
index 55c3a2f8c4..314732778c 100644
--- a/src/sourmash/index.py
+++ b/src/sourmash/index.py
@@ -1,19 +1,56 @@
-"An Abstract Base Class for collections of signatures."
+"""An Abstract Base Class for collections of signatures, plus implementations.
+
+APIs and functionality
+----------------------
+
+Index classes support three sets of API functionality -
+
+'select(...)', which selects subsets of signatures based on ksize, moltype,
+and other criteria, including picklists.
+
+'find(...)', and the 'search', 'gather', and 'counter_gather' implementations
+built on top of 'find', which search for signatures that match a query.
+
+'signatures()', which yields all signatures in the Index subject to the
+selection criteria.
+
+Classes defined in this file
+----------------------------
+
+Index - abstract base class for all Index objects.
+
+LinearIndex - simple in-memory storage of signatures.
+
+LazyLinearIndex - lazy selection and linear search of signatures.
+
+ZipFileLinearIndex - simple on-disk storage of signatures.
+
+class MultiIndex - in-memory storage and selection of signatures from multiple
+index objects, using manifests.
+
+LazyLoadedIndex - selection on manifests with loading of index on demand.
+
+CounterGather - an ancillary class returned by the 'counter_gather()' method.
+"""
import os
import sourmash
from abc import abstractmethod, ABC
from collections import namedtuple, Counter
import zipfile
-import copy
+from io import TextIOWrapper
from .search import make_jaccard_search_query, make_gather_query
+from .manifest import CollectionManifest
+from .logging import debug_literal
+from .signature import load_signatures, save_signatures
# generic return tuple for Index.search and Index.gather
IndexSearchResult = namedtuple('Result', 'score, signature, location')
class Index(ABC):
is_database = False
+ manifest = None
@property
def location(self):
@@ -29,6 +66,16 @@ def signatures_with_location(self):
for ss in self.signatures():
yield ss, self.location
+ def _signatures_with_internal(self):
+ """Return an iterator of tuples (ss, location, internal_location).
+
+ This is an internal API for use in generating manifests, and may
+ change without warning.
+
+ This method should be implemented separately for each Index object.
+ """
+ raise NotImplementedError
+
@abstractmethod
def insert(self, signature):
""" """
@@ -195,9 +242,6 @@ def search(self, query, *, threshold=None,
def prefetch(self, query, threshold_bp, **kwargs):
"Return all matches with minimum overlap."
- query_mh = query.minhash
- scaled = query_mh.scaled
-
if not self: # empty database? quit.
raise ValueError("no signatures to search")
@@ -242,7 +286,7 @@ def peek(self, query_mh, threshold_bp=0):
scaled = max(query_mh.scaled, match_mh.scaled)
match_mh = match_mh.downsample(scaled=scaled).flatten()
query_mh = query_mh.downsample(scaled=scaled)
- intersect_mh = match_mh.intersection(query_mh)
+ intersect_mh = match_mh & query_mh
return [sr, intersect_mh]
@@ -260,7 +304,7 @@ def counter_gather(self, query, threshold_bp, **kwargs):
public `CounterGather` interface, of course.
"""
# build a flat query
- prefetch_query = copy.copy(query)
+ prefetch_query = query.copy()
prefetch_query.minhash = prefetch_query.minhash.flatten()
# find all matches and construct a CounterGather object.
@@ -291,8 +335,8 @@ def select(self, ksize=None, moltype=None, scaled=None, num=None,
"""
-def select_signature(ss, ksize=None, moltype=None, scaled=0, num=0,
- containment=False):
+def select_signature(ss, *, ksize=None, moltype=None, scaled=0, num=0,
+ containment=False, abund=None, picklist=None):
"Check that the given signature matches the specificed requirements."
# ksize match?
if ksize and ksize != ss.minhash.ksize:
@@ -319,11 +363,24 @@ def select_signature(ss, ksize=None, moltype=None, scaled=0, num=0,
if ss.minhash.scaled or num != ss.minhash.num:
return False
+ if abund:
+ # note: minhash w/abund can always be flattened
+ if not ss.minhash.track_abundance:
+ return False
+
+ if picklist is not None and ss not in picklist:
+ return False
+
return True
class LinearIndex(Index):
- "An Index for a collection of signatures. Can load from a .sig file."
+ """An Index for a collection of signatures. Can load from a .sig file.
+
+ Note: See MultiIndex for an in-memory class that uses manifests.
+
+ Concrete class; signatures held in memory; does not use manifests.
+ """
def __init__(self, _signatures=None, filename=None):
self._signatures = []
if _signatures:
@@ -347,13 +404,12 @@ def insert(self, node):
self._signatures.append(node)
def save(self, path):
- from .signature import save_signatures
with open(path, 'wt') as fp:
save_signatures(self.signatures(), fp)
@classmethod
def load(cls, location):
- from .signature import load_signatures
+ "Load signatures from a JSON signature file."
si = load_signatures(location, do_raise=True)
lidx = LinearIndex(si, filename=location)
@@ -364,9 +420,6 @@ def select(self, **kwargs):
Does not raise ValueError, but may return an empty Index.
"""
- # eliminate things from kwargs with None or zero value
- kw = { k : v for (k, v) in kwargs.items() if v }
-
siglist = []
for ss in self._signatures:
if select_signature(ss, **kwargs):
@@ -378,9 +431,22 @@ def select(self, **kwargs):
class LazyLinearIndex(Index):
"""An Index for lazy linear search of another database.
- The defining feature of this class is that 'find' is inherited
- from the base Index class, which does a linear search with
- signatures().
+ Wrapper class; does not use manifests.
+
+ One of the main purposes of this class is to _force_ linear 'find'
+ on index objects. So if this class wraps an SBT, for example, the
+ SBT find method will be overriden with the linear 'find' from the
+ base class. There are very few situations where this is an improvement,
+ so use this class wisely!
+
+ A few notes:
+ * selection criteria defined by 'select' are only executed when
+ signatures are actually requested (hence, 'lazy').
+ * this class stores the provided index 'db' in memory. If you need
+ a class that does lazy loading of signatures from disk and does not
+ store signatures in memory, see LazyLoadedIndex.
+ * if you want efficient manifest-based selection, consider
+ MultiIndex (signatures in memory).
"""
def __init__(self, db, selection_dict={}):
@@ -401,7 +467,7 @@ def signatures_with_location(self):
def __bool__(self):
try:
- first_sig = next(iter(self.signatures()))
+ next(iter(self.signatures()))
return True
except StopIteration:
return False
@@ -439,19 +505,52 @@ class ZipFileLinearIndex(Index):
A read-only collection of signatures in a zip file.
Does not support `insert` or `save`.
+
+ Concrete class; signatures dynamically loaded from disk; uses manifests.
"""
is_database = True
- def __init__(self, zf, selection_dict=None,
- traverse_yield_all=False):
+ def __init__(self, zf, *, selection_dict=None,
+ traverse_yield_all=False, manifest=None, use_manifest=True):
self.zf = zf
self.selection_dict = selection_dict
self.traverse_yield_all = traverse_yield_all
+ self.use_manifest = use_manifest
+
+ # do we have a manifest already? if not, try loading.
+ if use_manifest:
+ if manifest is not None:
+ debug_literal('ZipFileLinearIndex using passed-in manifest')
+ self.manifest = manifest
+ else:
+ self._load_manifest()
+ else:
+ self.manifest = None
+
+ if self.manifest is not None:
+ assert not self.selection_dict, self.selection_dict
+ if self.selection_dict:
+ assert self.manifest is None
+
+ def _load_manifest(self):
+ "Load a manifest if one exists"
+ try:
+ zi = self.zf.getinfo('SOURMASH-MANIFEST.csv')
+ except KeyError:
+ self.manifest = None
+ else:
+ debug_literal(f'found manifest when loading {self.zf.filename}')
+
+ with self.zf.open(zi, 'r') as mfp:
+ # wrap as text, since ZipFile.open only supports 'r' mode.
+ mfp = TextIOWrapper(mfp, 'utf-8')
+ # load manifest!
+ self.manifest = CollectionManifest.load_from_csv(mfp)
def __bool__(self):
"Are there any matching signatures in this zipfile? Avoid calling len."
try:
- first_sig = next(iter(self.signatures()))
+ next(iter(self.signatures()))
except StopIteration:
return False
@@ -474,38 +573,97 @@ def save(self, path):
raise NotImplementedError
@classmethod
- def load(cls, location, traverse_yield_all=False):
+ def load(cls, location, traverse_yield_all=False, use_manifest=True):
"Class method to load a zipfile."
zf = zipfile.ZipFile(location, 'r')
- return cls(zf, traverse_yield_all=traverse_yield_all)
+ return cls(zf, traverse_yield_all=traverse_yield_all,
+ use_manifest=use_manifest)
- def signatures(self):
- "Load all signatures in the zip file."
- from .signature import load_signatures
+ def _signatures_with_internal(self):
+ """Return an iterator of tuples (ss, location, internal_location).
+
+ Note: does not limit signatures to subsets.
+ """
for zipinfo in self.zf.infolist():
# should we load this file? if it ends in .sig OR we are forcing:
if zipinfo.filename.endswith('.sig') or \
zipinfo.filename.endswith('.sig.gz') or \
self.traverse_yield_all:
fp = self.zf.open(zipinfo)
+ for ss in load_signatures(fp):
+ yield ss, self.zf.filename, zipinfo.filename
- # now load all the signatures and select on ksize/moltype:
- selection_dict = self.selection_dict
-
- # note: if 'fp' doesn't contain a valid JSON signature,
- # load_signatures will silently fail & yield nothing.
+ def signatures(self):
+ "Load all signatures in the zip file."
+ selection_dict = self.selection_dict
+ manifest = None
+ if self.manifest is not None:
+ manifest = self.manifest
+ assert not selection_dict
+
+ # yield all signatures found in manifest
+ for filename in manifest.locations():
+ zi = self.zf.getinfo(filename)
+ fp = self.zf.open(zi)
for ss in load_signatures(fp):
+ # in case multiple signatures are in the file, check
+ # to make sure we want to return each one.
+ if ss in manifest:
+ yield ss
+
+ # no manifest! iterate.
+ else:
+ for zipinfo in self.zf.infolist():
+ # should we load this file? if it ends in .sig OR force:
+ if zipinfo.filename.endswith('.sig') or \
+ zipinfo.filename.endswith('.sig.gz') or \
+ self.traverse_yield_all:
+ fp = self.zf.open(zipinfo)
+
if selection_dict:
- if select_signature(ss, **self.selection_dict):
- yield ss
+ select = lambda x: select_signature(x,
+ **selection_dict)
else:
- yield ss
+ select = lambda x: True
+
+ for ss in load_signatures(fp):
+ if select(ss):
+ yield ss
def select(self, **kwargs):
"Select signatures in zip file based on ksize/moltype/etc."
- return ZipFileLinearIndex(self.zf,
- selection_dict=kwargs,
- traverse_yield_all=self.traverse_yield_all)
+
+ # if we have a manifest, run 'select' on the manifest.
+ manifest = self.manifest
+ traverse_yield_all = self.traverse_yield_all
+
+ if manifest is not None:
+ manifest = manifest.select_to_manifest(**kwargs)
+ return ZipFileLinearIndex(self.zf,
+ selection_dict=None,
+ traverse_yield_all=traverse_yield_all,
+ manifest=manifest,
+ use_manifest=True)
+ else:
+ # no manifest? just pass along all the selection kwargs to
+ # the new ZipFileLinearIndex.
+
+ assert manifest is None
+ if self.selection_dict:
+ # combine selects...
+ d = dict(self.selection_dict)
+ for k, v in kwargs.items():
+ if k in d:
+ if d[k] is not None and d[k] != v:
+ raise ValueError(f"incompatible select on '{k}'")
+ d[k] = v
+ kwargs = d
+
+ return ZipFileLinearIndex(self.zf,
+ selection_dict=kwargs,
+ traverse_yield_all=traverse_yield_all,
+ manifest=None,
+ use_manifest=False)
class CounterGather:
@@ -520,7 +678,7 @@ def __init__(self, query_mh):
raise ValueError('gather requires scaled signatures')
# track query
- self.orig_query_mh = copy.copy(query_mh).flatten()
+ self.orig_query_mh = query_mh.copy().flatten()
self.scaled = query_mh.scaled
# track matching signatures & their locations
@@ -623,7 +781,7 @@ def peek(self, cur_query_mh, threshold_bp=0):
# calculate intersection of this "best match" with query.
match_mh = match.minhash.downsample(scaled=scaled).flatten()
- intersect_mh = cur_query_mh.intersection(match_mh)
+ intersect_mh = cur_query_mh & match_mh
location = self.locations[dataset_id]
# build result & return intersection
@@ -661,72 +819,154 @@ def consume(self, intersect_mh):
class MultiIndex(Index):
- """An Index class that wraps other Index classes.
-
- The MultiIndex constructor takes two arguments: a list of Index
- objects, and a matching list of sources (filenames, etc.) If the
- source is not None, then it will be used to override the 'filename'
- in the triple that is returned by search and gather.
+ """
+ Load a collection of signatures, and retain their original locations.
One specific use for this is when loading signatures from a directory;
- MultiIndex will properly record which files provided which signatures.
+ MultiIndex will record which specific files provided which
+ signatures.
+
+ Creates a manifest on load.
+
+ Note: this is an in-memory collection, and does not do lazy loading:
+ all signatures are loaded upon instantiation and kept in memory.
+
+ Concrete class; signatures held in memory; builds and uses manifests.
"""
- def __init__(self, index_list, source_list):
- self.index_list = list(index_list)
- self.source_list = list(source_list)
- assert len(index_list) == len(source_list)
+ def __init__(self, manifest, parent=""):
+ """Constructor; takes manifest containing signatures, together with
+ optional top-level location to prepend to internal locations.
+ """
+ self.manifest = manifest
+ self.parent = parent
def signatures(self):
- for idx in self.index_list:
- for ss in idx.signatures():
- yield ss
+ for row in self.manifest.rows:
+ yield row['signature']
def signatures_with_location(self):
- for idx, loc in zip(self.index_list, self.source_list):
- for ss in idx.signatures():
- yield ss, loc
+ for row in self.manifest.rows:
+ loc = row['internal_location']
+ # here, 'parent' may have been removed from internal_location
+ # for directories; if so, add it back in.
+ if self.parent:
+ loc = os.path.join(self.parent, loc)
+ yield row['signature'], loc
+
+ def _signatures_with_internal(self):
+ """Return an iterator of tuples (ss, parent, location)
+
+ CTB note: here, 'internal_location' is the source file for the
+ index. This is a special feature of this (in memory) class.
+ """
+ parent = self.parent
+ for row in self.manifest.rows:
+ yield row['signature'], parent, row['internal_location']
+
def __len__(self):
- return sum([ len(idx) for idx in self.index_list ])
+ return len(self.manifest)
def insert(self, *args):
raise NotImplementedError
@classmethod
- def load(self, *args):
- raise NotImplementedError
+ def load(cls, index_list, source_list, parent=""):
+ """Create a MultiIndex from already-loaded indices.
+
+ Takes two arguments: a list of Index objects, and a matching list
+ of source strings (filenames, etc.) If the source is not None,
+ then it will be used to override the location provided by the
+ matching Index object.
+ """
+ assert len(index_list) == len(source_list)
+
+ # yield all signatures + locations
+ def sigloc_iter():
+ for idx, iloc in zip(index_list, source_list):
+ # override internal location if location is explicitly provided
+ if iloc is None:
+ iloc = idx.location
+ for ss in idx.signatures():
+ yield ss, iloc
+
+ # build manifest; note, signatures are stored in memory.
+ manifest = CollectionManifest.create_manifest(sigloc_iter())
+
+ # create!
+ return cls(manifest, parent=parent)
@classmethod
- def load_from_path(cls, pathname, force=False):
- "Create a MultiIndex from a path (filename or directory)."
+ def load_from_directory(cls, pathname, *, force=False):
+ """Create a MultiIndex from a directory.
+
+ Takes directory path plus optional boolean 'force'. Attempts to
+ load all files ending in .sig or .sig.gz, by default; if 'force' is
+ True, will attempt to load _all_ files, ignoring errors.
+ """
from .sourmash_args import traverse_find_sigs
- if not os.path.exists(pathname): # CTB consider changing to isdir.
- raise ValueError(f"'{pathname}' must be a directory")
+
+ if not os.path.isdir(pathname):
+ raise ValueError(f"'{pathname}' must be a directory.")
index_list = []
source_list = []
- for thisfile in traverse_find_sigs([pathname], yield_all_files=force):
+
+ traversal = traverse_find_sigs([pathname], yield_all_files=force)
+ for thisfile in traversal:
try:
idx = LinearIndex.load(thisfile)
index_list.append(idx)
- source_list.append(thisfile)
+
+ rel = os.path.relpath(thisfile, pathname)
+ source_list.append(rel)
except (IOError, sourmash.exceptions.SourmashError):
if force:
continue # ignore error
else:
- raise # continue past error!
+ raise # stop loading!
- db = None
- if index_list:
- db = cls(index_list, source_list)
- else:
+ # did we load anything? if not, error
+ if not index_list:
raise ValueError(f"no signatures to load under directory '{pathname}'")
- return db
+ return cls.load(index_list, source_list, parent=pathname)
+
+ @classmethod
+ def load_from_path(cls, pathname, force=False):
+ """
+ Create a MultiIndex from a path (filename or directory).
+
+ Note: this only uses LinearIndex.load(...), so will only load
+ signature JSON files.
+ """
+ if not os.path.exists(pathname):
+ raise ValueError(f"'{pathname}' must exist.")
+
+ if os.path.isdir(pathname): # traverse
+ return cls.load_from_directory(pathname, force=force)
+ else: # load as a .sig/JSON file
+ index_list = []
+ source_list = []
+ try:
+ idx = LinearIndex.load(pathname)
+ index_list = [idx]
+ source_list = [pathname]
+ except (IOError, sourmash.exceptions.SourmashError):
+ if not force:
+ raise ValueError(f"no signatures to load from '{pathname}'")
+ return None
+
+ return cls.load(index_list, source_list)
@classmethod
def load_from_pathlist(cls, filename):
- "Create a MultiIndex from all files listed in a text file."
+ """Create a MultiIndex from all files listed in a text file.
+
+ Note: this will load signatures from directories and databases, too,
+ if they are listed in the text file; it uses 'load_file_as_index'
+ underneath.
+ """
from .sourmash_args import (load_pathlist_from_file,
load_file_as_index)
idx_list = []
@@ -740,52 +980,117 @@ def load_from_pathlist(cls, filename):
idx_list.append(idx)
src_list.append(src)
- db = MultiIndex(idx_list, src_list)
- return db
+ return cls.load(idx_list, src_list)
def save(self, *args):
raise NotImplementedError
def select(self, **kwargs):
- "Run 'select' on all indices within this MultiIndex."
- new_idx_list = []
- new_src_list = []
- for idx, src in zip(self.index_list, self.source_list):
- idx = idx.select(**kwargs)
- new_idx_list.append(idx)
- new_src_list.append(src)
+ "Run 'select' on the manifest."
+ new_manifest = self.manifest.select_to_manifest(**kwargs)
+ return MultiIndex(new_manifest, parent=self.parent)
+
+
+class LazyLoadedIndex(Index):
+ """Given an index location and a manifest, do select only on the manifest
+ until signatures are actually requested, and only then load the index.
+
+ This class is useful when you have an index object that consume
+ memory when it is loaded (e.g. JSON signature files, or LCA
+ databases) and you want to avoid keeping them in memory. The
+ downside of using this class is that it will load the signatures
+ from disk every time they are needed (e.g. 'find(...)', 'signatures()').
+
+ Wrapper class; signatures dynamically loaded from disk; uses manifests.
+ """
+ def __init__(self, filename, manifest):
+ "Create an Index with given filename and manifest."
+ self.filename = filename
+ self.manifest = manifest
+
+ @property
+ def location(self):
+ "the 'location' attribute for this index will be the filename."
+ return self.filename
- return MultiIndex(new_idx_list, new_src_list)
+ def signatures(self):
+ "yield all signatures from the manifest."
+ if not len(self):
+ # nothing in manifest? done!
+ return []
- def search(self, query, **kwargs):
- """Return the match with the best Jaccard similarity in the Index.
+ # ok - something in manifest, let's go get those signatures!
+ picklist = self.manifest.to_picklist()
+ idx = sourmash.load_file_as_index(self.location)
- Note: this overrides the location of the match if needed.
+ # convert remaining manifest into picklist
+ # CTB: one optimization down the road is, for storage-backed
+ # Index objects, to just reach in and get the signatures directly,
+ # without going through 'select'. Still, this is nice for abstraction
+ # because we don't need to care what the index is - it'll work on
+ # anything. It just might be a bit slower.
+ idx = idx.select(picklist=picklist)
+
+ # extract signatures.
+ for ss in idx.signatures():
+ yield ss
+
+ def find(self, *args, **kwargs):
+ """Run find after loading and selecting; this provides 'search',
+ "'gather', and 'prefetch' functionality, which are built on 'find'.
"""
- # do the actual search:
- matches = []
- for idx, src in zip(self.index_list, self.source_list):
- for sr in idx.search(query, **kwargs):
- if src: # override 'sr.location' if 'src' specified'
- sr = IndexSearchResult(sr.score, sr.signature, src)
- matches.append(sr)
-
- # sort!
- matches.sort(key=lambda x: -x.score)
- return matches
+ if not len(self):
+ # nothing in manifest? done!
+ return []
- def prefetch(self, query, threshold_bp, **kwargs):
- "Return all matches with specified overlap."
- # actually do search!
- results = []
+ # ok - something in manifest, let's go get those signatures!
+ picklist = self.manifest.to_picklist()
+ idx = sourmash.load_file_as_index(self.location)
+
+ # convert remaining manifest into picklist
+ # CTB: one optimization down the road is, for storage-backed
+ # Index objects, to just reach in and get the signatures directly,
+ # without going through 'select'. Still, this is nice for abstraction
+ # because we don't need to care what the index is - it'll work on
+ # anything. It just might be a bit slower.
+ idx = idx.select(picklist=picklist)
+
+ for x in idx.find(*args, **kwargs):
+ yield x
+
+ def __len__(self):
+ "track index size based on the manifest."
+ return len(self.manifest)
+
+ def __bool__(self):
+ return bool(self.manifest)
+
+ @classmethod
+ def load(cls, location):
+ """Load index from given location, but retain only the manifest.
+
+ Fail if no manifest.
+ """
+ idx = sourmash.load_file_as_index(location)
+ manifest = idx.manifest
+
+ if not idx.manifest:
+ raise ValueError(f"no manifest on index at {location}")
+
+ del idx
+ # NOTE: index is not retained outside this scope, just location.
- for idx, src in zip(self.index_list, self.source_list):
- if not idx:
- continue
+ return cls(location, manifest)
+
+ def insert(self, *args):
+ raise NotImplementedError
+
+ def save(self, *args):
+ raise NotImplementedError
+
+ def select(self, **kwargs):
+ "Run 'select' on manifest, return new object with new manifest."
+ manifest = self.manifest
+ new_manifest = manifest.select_to_manifest(**kwargs)
- for (score, ss, filename) in idx.prefetch(query, threshold_bp,
- **kwargs):
- best_src = src or filename # override if src provided
- yield IndexSearchResult(score, ss, best_src)
-
- return results
+ return LazyLoadedIndex(self.filename, new_manifest)
diff --git a/src/sourmash/lca/command_index.py b/src/sourmash/lca/command_index.py
index 883d861a75..ae6ebaeb27 100644
--- a/src/sourmash/lca/command_index.py
+++ b/src/sourmash/lca/command_index.py
@@ -15,8 +15,10 @@
from sourmash.sourmash_args import DEFAULT_LOAD_K
-def load_taxonomy_assignments(filename, delimiter=',', start_column=2,
- use_headers=True, force=False):
+def load_taxonomy_assignments(filename, *, delimiter=',', start_column=2,
+ use_headers=True, force=False,
+ split_identifiers=False,
+ keep_identifier_versions=False):
"""
Load a taxonomy assignment spreadsheet into a dictionary.
@@ -65,6 +67,13 @@ def load_taxonomy_assignments(filename, delimiter=',', start_column=2,
ident = lineage[0][1]
lineage = lineage[1:]
+ # fold, spindle, and mutilate ident?
+ if split_identifiers:
+ ident = ident.split(' ')[0]
+
+ if not keep_identifier_versions:
+ ident = ident.split('.')[0]
+
# clean lineage of null names, replace with 'unassigned'
lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ]
lineage = [ LineagePair(a, b) for (a, b) in lineage ]
@@ -111,19 +120,19 @@ def generate_report(record_duplicates, record_no_lineage, record_remnants,
Output a report of anomalies from building the index.
"""
with open(filename, 'wt') as fp:
- print('Duplicate signatures:', file=fp)
+ print(f'Duplicate signatures: {len(record_duplicates)}', file=fp)
fp.write("\n".join(record_duplicates))
fp.write("\n")
- print('----\nUnused identifiers:', file=fp)
+ print(f'----\nUnused identifiers: {len(unused_identifiers)}', file=fp)
fp.write("\n".join(unused_identifiers))
fp.write("\n")
- print('----\nNo lineage provided for these identifiers:', file=fp)
+ print(f'----\nNo lineage provided for these identifiers: {len(record_no_lineage)}', file=fp)
fp.write("\n".join(record_no_lineage))
fp.write("\n")
- print('----\nNo signatures found for these identifiers:', file=fp)
+ print(f'----\nNo signatures found for these identifiers: {len(record_remnants)}', file=fp)
fp.write('\n'.join(record_remnants))
fp.write("\n")
- print('----\nUnused lineages:', file=fp)
+ print(f'----\nUnused lineages: {len(unused_lineages)}', file=fp)
for lineage in unused_lineages:
fp.write(";".join(lca_utils.zip_lineage(lineage)))
fp.write("\n")
@@ -145,6 +154,7 @@ def index(args):
args.ksize = DEFAULT_LOAD_K
moltype = sourmash_args.calculate_moltype(args, default='DNA')
+ picklist = sourmash_args.load_picklist(args)
notify('Building LCA database with ksize={} scaled={} moltype={}.',
args.ksize, args.scaled, moltype)
@@ -157,7 +167,10 @@ def index(args):
delimiter=delimiter,
start_column=args.start_column,
use_headers=not args.no_headers,
- force=args.force)
+ force=args.force,
+ split_identifiers=args.split_identifiers,
+ keep_identifier_versions=args.keep_identifier_versions
+ )
notify('{} distinct identities in spreadsheet out of {} rows.',
len(assignments), num_rows)
@@ -190,6 +203,7 @@ def index(args):
n += 1
it = load_file_as_signatures(filename, ksize=args.ksize,
select_moltype=moltype,
+ picklist=picklist,
yield_all_files=args.force)
for sig in it:
notify(u'\r\033[K', end=u'')
@@ -199,7 +213,7 @@ def index(args):
# block off duplicates.
if sig.md5sum() in md5_to_name:
debug('WARNING: in file {}, duplicate md5sum: {}; skipping', filename, sig.md5sum())
- record_duplicates.add(filename)
+ record_duplicates.add(sig.name)
continue
md5_to_name[sig.md5sum()] = str(sig)
@@ -210,16 +224,24 @@ def index(args):
else:
ident = sig.filename
+ orig_ident = ident
if args.split_identifiers: # hack for NCBI-style names, etc.
# split on space...
ident = ident.split(' ')[0]
- # ...and on period.
- ident = ident.split('.')[0]
+
+ if not args.keep_identifier_versions:
+ # ...and on period.
+ ident = ident.split('.')[0]
lineage = assignments.get(ident)
# punt if no lineage and --require-taxonomy
if lineage is None and args.require_taxonomy:
+ if args.fail_on_missing_taxonomy:
+ notify(f"ERROR: no taxonomy found for identifier '{ident}'")
+ if args.split_identifiers:
+ notify(f"(Identifier extracted from name: '{orig_ident})')")
+ sys.exit(-1)
debug('(skipping, because --require-taxonomy was specified)')
n_skipped += 1
continue
@@ -265,6 +287,9 @@ def index(args):
notify('loaded {} hashes at ksize={} scaled={}', len(db.hashval_to_idx),
args.ksize, args.scaled)
+ if picklist:
+ sourmash_args.report_picklist(args, picklist)
+
# summarize:
notify('{} assigned lineages out of {} distinct lineages in spreadsheet.',
len(record_used_lineages), len(set(assignments.values())))
diff --git a/src/sourmash/lca/lca_db.py b/src/sourmash/lca/lca_db.py
index a3d90ffd5d..32b62224c1 100644
--- a/src/sourmash/lca/lca_db.py
+++ b/src/sourmash/lca/lca_db.py
@@ -9,6 +9,7 @@
from sourmash.minhash import _get_max_hash_for_scaled
from sourmash.logging import notify, error, debug
from sourmash.index import Index, IndexSearchResult
+from sourmash.picklist import passes_all_picklists
def cached_property(fun):
@@ -71,6 +72,7 @@ def __init__(self, ksize, scaled, moltype='DNA'):
self.lineage_to_lid = {}
self.lid_to_lineage = {}
self.hashval_to_idx = defaultdict(set)
+ self.picklists = []
@property
def location(self):
@@ -175,8 +177,8 @@ def signatures(self):
for v in self._signatures.values():
yield v
- def select(self, ksize=None, moltype=None, num=0, scaled=0,
- containment=False):
+ def select(self, ksize=None, moltype=None, num=0, scaled=0, abund=None,
+ containment=False, picklist=None):
"""Make sure this database matches the requested requirements.
As with SBTs, queries with higher scaled values than the database
@@ -197,6 +199,14 @@ def select(self, ksize=None, moltype=None, num=0, scaled=0,
if moltype is not None and moltype != self.moltype:
raise ValueError(f"moltype on this database is {self.moltype}; this is different from requested moltype of {moltype}")
+ if abund:
+ raise ValueError("LCA databases do not support sketches with abund=True")
+
+ if picklist is not None:
+ self.picklists.append(picklist)
+ if len(self.picklists) > 1:
+ raise ValueError("we do not (yet) support multiple picklists for LCA databases")
+
return self
@classmethod
@@ -416,7 +426,10 @@ def _signatures(self):
for idx, mh in mhd.items():
ident = self.idx_to_ident[idx]
name = self.ident_to_name[ident]
- sigd[idx] = SourmashSignature(mh, name=name)
+ ss = SourmashSignature(mh, name=name)
+
+ if passes_all_picklists(ss, self.picklists):
+ sigd[idx] = SourmashSignature(mh, name=name)
debug('=> {} signatures!', len(sigd))
return sigd
@@ -460,7 +473,11 @@ def find(self, search_fn, query, **kwargs):
# NOTE: one future low-mem optimization could be to support doing
# this piecemeal by iterating across all the hashes, instead.
- subj = self._signatures[idx]
+ subj = self._signatures.get(idx)
+ if subj is None: # must be because of a picklist exclusion
+ assert self.picklists
+ continue
+
subj_mh = prepare_subject(subj.minhash)
# all numbers calculated after downsampling --
@@ -478,7 +495,8 @@ def find(self, search_fn, query, **kwargs):
# signal that it is done, or something.
if search_fn.passes(score):
if search_fn.collect(score, subj):
- yield IndexSearchResult(score, subj, self.location)
+ if passes_all_picklists(subj, self.picklists):
+ yield IndexSearchResult(score, subj, self.location)
@cached_property
def lid_to_idx(self):
diff --git a/src/sourmash/manifest.py b/src/sourmash/manifest.py
new file mode 100644
index 0000000000..63bc0713ad
--- /dev/null
+++ b/src/sourmash/manifest.py
@@ -0,0 +1,199 @@
+"""
+Manifests for collections of signatures.
+"""
+import csv
+
+from sourmash.picklist import SignaturePicklist
+
+
+class CollectionManifest:
+ """
+ Signature metadata for a collection of signatures.
+
+ Manifests support selection and rapid lookup of signatures.
+
+ * 'select_to_manifest(...)' matches the Index selector protocol
+ * 'rows' is a public iterable that can be used to iterate over the manifest
+ contents.
+ * 'locations()' returns all distinct locations for e.g. lazy loading
+ * supports container protocol for signatures, e.g. 'if ss in manifest: ...'
+ """
+ # each manifest row must have the following, although they may be empty.
+ required_keys = ('internal_location',
+ 'md5', 'md5short', 'ksize', 'moltype', 'num',
+ 'scaled', 'n_hashes', 'with_abundance',
+ 'name', 'filename')
+
+ def __init__(self, rows):
+ "Initialize from an iterable of metadata dictionaries."
+ self.rows = tuple(rows)
+
+ # build a fast lookup table for md5sums in particular
+ md5set = set()
+ for row in self.rows:
+ md5set.add(row['md5'])
+ self._md5_set = md5set
+
+ def __bool__(self):
+ return bool(self.rows)
+
+ def __len__(self):
+ return len(self.rows)
+
+ @classmethod
+ def load_from_csv(cls, fp):
+ "load a manifest from a CSV file."
+ manifest_list = []
+ firstline = fp.readline().rstrip()
+ if not firstline.startswith('# SOURMASH-MANIFEST-VERSION: '):
+ raise ValueError("manifest is missing version header")
+
+ version = firstline[len('# SOURMASH-MANIFEST-VERSION: '):]
+ if float(version) != 1.0:
+ raise ValueError(f"unknown manifest version number {version}")
+
+ r = csv.DictReader(fp)
+ if not r.fieldnames:
+ raise ValueError("missing column headers in manifest")
+
+ for k in cls.required_keys:
+ if k not in r.fieldnames:
+ raise ValueError(f"missing column '{k}' in manifest.")
+
+ row = None
+
+ # do row type conversion
+ introws = ('num', 'scaled', 'with_abundance', 'ksize', 'n_hashes')
+ boolrows = ('with_abundance',)
+
+ for row in r:
+ for k in introws:
+ row[k] = int(row[k])
+ for k in boolrows:
+ row[k] = bool(row[k])
+ row['signature'] = None
+ manifest_list.append(row)
+
+ return cls(manifest_list)
+
+ @classmethod
+ def write_csv_header(cls, fp):
+ "write header for manifest CSV format"
+ fp.write('# SOURMASH-MANIFEST-VERSION: 1.0\n')
+ w = csv.DictWriter(fp, fieldnames=cls.required_keys)
+ w.writeheader()
+
+ def write_to_csv(self, fp, write_header=False):
+ "write manifest CSV to specified file handle"
+ w = csv.DictWriter(fp, fieldnames=self.required_keys)
+
+ if write_header:
+ self.write_csv_header(fp)
+
+ for row in self.rows:
+ # don't write signature!
+ if 'signature' in row:
+ del row['signature']
+ w.writerow(row)
+
+ @classmethod
+ def make_manifest_row(cls, ss, location, *, include_signature=True):
+ "make a manifest row dictionary."
+ row = {}
+ row['md5'] = ss.md5sum()
+ row['md5short'] = row['md5'][:8]
+ row['ksize'] = ss.minhash.ksize
+ row['moltype'] = ss.minhash.moltype
+ row['num'] = ss.minhash.num
+ row['scaled'] = ss.minhash.scaled
+ row['n_hashes'] = len(ss.minhash)
+ row['with_abundance'] = 1 if ss.minhash.track_abundance else 0
+ row['name'] = ss.name
+ row['filename'] = ss.filename
+ row['internal_location'] = location
+
+ assert set(row.keys()) == set(cls.required_keys)
+
+ # if requested, include the signature in the manifest.
+ if include_signature:
+ row['signature'] = ss
+ return row
+
+ @classmethod
+ def create_manifest(cls, locations_iter, *, include_signature=True):
+ """Create a manifest from an iterator that yields (ss, location)
+
+ Stores signatures in manifest rows by default.
+
+ Note: do NOT catch exceptions here, so this passes through load excs.
+ """
+ manifest_list = []
+ for ss, location in locations_iter:
+ row = cls.make_manifest_row(ss, location, include_signature=True)
+ manifest_list.append(row)
+
+ return cls(manifest_list)
+
+ def _select(self, *, ksize=None, moltype=None, scaled=0, num=0,
+ containment=False, abund=None, picklist=None):
+ """Yield manifest rows for sigs that match the specified requirements.
+
+ Internal method; call `select_to_manifest` instead.
+ """
+ matching_rows = self.rows
+ if ksize:
+ matching_rows = ( row for row in matching_rows
+ if row['ksize'] == ksize )
+ if moltype:
+ matching_rows = ( row for row in matching_rows
+ if row['moltype'] == moltype )
+ if scaled or containment:
+ if containment and not scaled:
+ raise ValueError("'containment' requires 'scaled' in Index.select'")
+
+ matching_rows = ( row for row in matching_rows
+ if row['scaled'] and not row['num'] )
+ if num:
+ matching_rows = ( row for row in matching_rows
+ if row['num'] and not row['scaled'] )
+
+ if abund:
+ # only need to concern ourselves if abundance is _required_
+ matching_rows = ( row for row in matching_rows
+ if row['with_abundance'] )
+
+ if picklist:
+ matching_rows = ( row for row in matching_rows
+ if picklist.matches_manifest_row(row) )
+
+ # return only the internal filenames!
+ for row in matching_rows:
+ yield row
+
+ def select_to_manifest(self, **kwargs):
+ "Do a 'select' and return a new CollectionManifest object."
+ new_rows = self._select(**kwargs)
+ return CollectionManifest(new_rows)
+
+ def locations(self):
+ "Return all distinct locations."
+ seen = set()
+ for row in self.rows:
+ loc = row['internal_location']
+
+ # track/remove duplicates
+ if loc not in seen:
+ seen.add(loc)
+ yield loc
+
+ def __contains__(self, ss):
+ "Does this manifest contain this signature?"
+ md5 = ss.md5sum()
+ return md5 in self._md5_set
+
+ def to_picklist(self):
+ "Convert this manifest to a picklist."
+ picklist = SignaturePicklist('md5')
+ picklist.pickset = set(self._md5_set)
+
+ return picklist
diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py
index 05b8b7361e..b0f258d49e 100644
--- a/src/sourmash/minhash.py
+++ b/src/sourmash/minhash.py
@@ -1,13 +1,23 @@
# -*- coding: UTF-8 -*-
+"""
+sourmash submodule that provides MinHash class and utility functions.
+
+class MinHash - core MinHash class.
+class FrozenMinHash - read-only MinHash class.
+"""
from __future__ import unicode_literals, division
-import math
-import copy
+__all__ = ['get_minhash_default_seed',
+ 'get_minhash_max_hash',
+ 'hash_murmur',
+ 'MinHash',
+ 'FrozenMinHash']
+
from collections.abc import Mapping
from . import VERSION
from ._lowlevel import ffi, lib
-from .utils import RustObject, rustcall, decode_str
+from .utils import RustObject, rustcall
from .exceptions import SourmashError
from deprecation import deprecated
@@ -222,6 +232,8 @@ def __copy__(self):
a.merge(self)
return a
+ copy = __copy__
+
def __getstate__(self):
"support pickling via __getstate__/__setstate__"
return (
@@ -305,8 +317,15 @@ def add_many(self, hashes):
self._methodcall(lib.kmerminhash_add_many, list(hashes), len(hashes))
def remove_many(self, hashes):
- "Remove many hashes at once; ``hashes`` must be an iterable."
- self._methodcall(lib.kmerminhash_remove_many, list(hashes), len(hashes))
+ """Remove many hashes from a sketch at once.
+
+ ``hashes`` can be either an iterable (list, set, etc.), or another
+ ``MinHash`` object.
+ """
+ if isinstance(hashes, MinHash):
+ self._methodcall(lib.kmerminhash_remove_from, hashes._objptr)
+ else:
+ self._methodcall(lib.kmerminhash_remove_many, list(hashes), len(hashes))
def __len__(self):
"Number of hashes."
@@ -454,6 +473,8 @@ def intersection_and_union_size(self, other):
"Calculate intersection and union sizes between `self` and `other`."
if not isinstance(other, MinHash):
raise TypeError("Must be a MinHash!")
+ if not self.is_compatible(other):
+ raise TypeError("incompatible MinHash objects")
usize = ffi.new("uint64_t *")
common = self._methodcall(lib.kmerminhash_intersection_union_size,
@@ -591,6 +612,7 @@ def __add__(self, other):
new_obj = self.to_mutable()
new_obj += other
return new_obj
+ __or__ = __add__
def __iadd__(self, other):
if not isinstance(other, MinHash):
@@ -611,17 +633,23 @@ def intersection(self, other):
ptr = self._methodcall(lib.kmerminhash_intersection, other._get_objptr())
return MinHash._from_objptr(ptr)
+ __and__ = intersection
def set_abundances(self, values, clear=True):
"""Set abundances for hashes from ``values``, where
``values[hash] = abund``
+
+ If ``abund`` value is set to zero, the ``hash`` will be removed from the sketch.
+ ``abund`` cannot be set to a negative value.
"""
if self.track_abundance:
hashes = []
abunds = []
for h, v in values.items():
- hashes.append(h)
+ hashes.append(h)
+ if v < 0:
+ raise ValueError("Abundance cannot be set to a negative value.")
abunds.append(v)
self._methodcall(lib.kmerminhash_set_abundances, hashes, abunds, len(hashes), clear)
@@ -679,9 +707,6 @@ def add_hash_with_abundance(self, *args, **kwargs):
def clear(self, *args, **kwargs):
raise TypeError('FrozenMinHash does not support modification')
- def remove_many(self, *args, **kwargs):
- raise TypeError('FrozenMinHash does not support modification')
-
def set_abundances(self, *args, **kwargs):
raise TypeError('FrozenMinHash does not support modification')
@@ -749,3 +774,4 @@ def __setstate__(self, tup):
def __copy__(self):
return self
+ copy = __copy__
diff --git a/src/sourmash/picklist.py b/src/sourmash/picklist.py
new file mode 100644
index 0000000000..14977da0f0
--- /dev/null
+++ b/src/sourmash/picklist.py
@@ -0,0 +1,240 @@
+"Picklist code for extracting subsets of signatures."
+import csv
+from enum import Enum
+
+# set up preprocessing functions for column stuff
+preprocess = {}
+
+# exact matches
+preprocess['name'] = lambda x: x
+preprocess['md5'] = lambda x: x
+
+# identifier matches/prefix foo - space delimited identifiers
+preprocess['identprefix'] = lambda x: x.split(' ')[0].split('.')[0]
+preprocess['ident'] = lambda x: x.split(' ')[0]
+
+# match 8 characters
+preprocess['md5prefix8'] = lambda x: x[:8]
+preprocess['md5short'] = lambda x: x[:8]
+
+
+class PickStyle(Enum):
+ INCLUDE = 1
+ EXCLUDE = 2
+
+
+class SignaturePicklist:
+ """Picklist class for subsetting collections of signatures.
+
+ Initialize using ``SignaturePicklist.from_picklist_args(argstr)``,
+ which takes an argument str like so: 'pickfile:column:coltype'.
+
+ Here, 'pickfile' is the path to a CSV file; 'column' is the name of
+ the column to select from the CSV file; and 'coltype' is the type of
+ matching to do on that column.
+
+ 'coltype's that are currently supported:
+ * 'name' - exact match to signature's name
+ * 'md5' - exact match to signature's md5sum
+ * 'md5prefix8' - match to 8-character prefix of signature's md5sum
+ * 'md5short' - same as md5prefix8
+ * 'ident' - exact match to signature's identifier
+ * 'identprefix' - match to signature's identifier, before '.'
+
+ Identifiers are constructed by using the first space delimited word in
+ the signature name.
+
+ You can also use 'gather', 'prefetch', 'search' and 'manifest' as
+ column types; these take the CSV output of 'gather', 'prefetch',
+ 'search', and 'sig manifest' as picklists. 'column' must be left
+ blank in this case: e.g. use 'pickfile.csv::gather'.
+ """
+ meta_coltypes = ('manifest', 'gather', 'prefetch', 'search')
+ supported_coltypes = ('md5', 'md5prefix8', 'md5short',
+ 'name', 'ident', 'identprefix')
+
+ def __init__(self, coltype, *, pickfile=None, column_name=None,
+ pickstyle=PickStyle.INCLUDE):
+ "create a picklist of column type 'coltype'."
+
+ # first, check coltype...
+ valid_coltypes = set(self.meta_coltypes)
+ valid_coltypes.update(self.supported_coltypes)
+ if coltype not in valid_coltypes:
+ raise ValueError(f"invalid picklist column type '{coltype}'")
+
+ # if we're using gather or prefetch or manifest, set column_name
+ # automatically (after checks).
+ if coltype in self.meta_coltypes:
+ if column_name:
+ raise ValueError(f"no column name allowed for coltype '{coltype}'")
+ if coltype == 'gather':
+ # for now, override => md5short in column md5
+ coltype = 'md5prefix8'
+ column_name = 'md5'
+ elif coltype == 'prefetch':
+ # for now, override => md5short in column match_md5
+ coltype = 'md5prefix8'
+ column_name = 'match_md5'
+ elif coltype == 'manifest' or coltype == 'search':
+ # for now, override => md5
+ coltype = 'md5'
+ column_name = 'md5'
+ else: # should never be reached!
+ assert 0
+
+ self.coltype = coltype
+ self.pickfile = pickfile
+ self.column_name = column_name
+ self.pickstyle = pickstyle
+
+ self.preprocess_fn = preprocess[coltype]
+ self.pickset = None
+ self.found = set()
+ self.n_queries = 0
+
+ @classmethod
+ def from_picklist_args(cls, argstr):
+ "load a picklist from an argument string 'pickfile:col:coltype:style'"
+ picklist = argstr.split(':')
+ pickstyle = PickStyle.INCLUDE
+
+ # pickstyle specified?
+ if len(picklist) == 4:
+ pickstyle_str = picklist.pop()
+ if pickstyle_str == 'include':
+ pickstyle = PickStyle.INCLUDE
+ elif pickstyle_str == 'exclude':
+ pickstyle = PickStyle.EXCLUDE
+ else:
+ raise ValueError(f"invalid picklist 'pickstyle' argument, '{pickstyle_str}': must be 'include' or 'exclude'")
+
+ if len(picklist) != 3:
+ raise ValueError(f"invalid picklist argument '{argstr}'")
+
+ assert len(picklist) == 3
+ pickfile, column, coltype = picklist
+
+ return cls(coltype, pickfile=pickfile, column_name=column,
+ pickstyle=pickstyle)
+
+ def _get_sig_attribute(self, ss):
+ "for a given SourmashSignature, return attribute for this picklist."
+ coltype = self.coltype
+ if coltype in ('md5', 'md5prefix8', 'md5short'):
+ q = ss.md5sum()
+ elif coltype in ('name', 'ident', 'identprefix'):
+ q = ss.name
+ else:
+ assert 0
+
+ return q
+
+ def init(self, values=[]):
+ "initialize a Picklist object with given values."
+ if self.pickset is not None:
+ raise ValueError("already initialized?")
+ self.pickset = set(values)
+ return self.pickset
+
+ def load(self, pickfile, column_name):
+ "load pickset, return num empty vals, and set of duplicate vals."
+ pickset = self.init()
+
+ n_empty_val = 0
+ dup_vals = set()
+ with open(pickfile, newline='') as csvfile:
+ x = csvfile.readline()
+
+ # skip leading comment line in case there's a manifest header
+ if x[0] == '#':
+ pass
+ else:
+ csvfile.seek(0)
+
+ r = csv.DictReader(csvfile)
+
+ if column_name not in r.fieldnames:
+ raise ValueError(f"column '{column_name}' not in pickfile '{pickfile}'")
+
+ for row in r:
+ # pick out values from column
+ col = row[column_name]
+ if not col:
+ n_empty_val += 1
+ continue
+
+ col = self.preprocess_fn(col)
+
+ # look for duplicate values or empty values
+ if col in pickset:
+ dup_vals.add(col)
+ else:
+ self.add(col)
+
+ return n_empty_val, dup_vals
+
+ def add(self, value):
+ "Add a value to this picklist."
+ self.pickset.add(value)
+
+ def __contains__(self, ss):
+ "does this signature match anything in the picklist?"
+ # pull out the relevant signature attribute
+ q = self._get_sig_attribute(ss)
+
+ # mangle into the kinds of values we support here
+ q = self.preprocess_fn(q)
+
+ # add to the number of queries performed,
+ self.n_queries += 1
+
+ # determine if ok or not.
+ if self.pickstyle == PickStyle.INCLUDE:
+ if q in self.pickset:
+ self.found.add(q)
+ return True
+ elif self.pickstyle == PickStyle.EXCLUDE:
+ if q not in self.pickset:
+ self.found.add(q)
+ return True
+ return False
+
+ def matches_manifest_row(self, row):
+ "does the given manifest row match this picklist?"
+ if self.coltype == 'md5':
+ colkey = 'md5'
+ elif self.coltype in ('md5prefix8', 'md5short'):
+ colkey = 'md5short'
+ elif self.coltype in ('name', 'ident', 'identprefix'):
+ colkey = 'name'
+ else:
+ assert 0
+
+ q = row[colkey]
+ q = self.preprocess_fn(q)
+ self.n_queries += 1
+
+ if self.pickstyle == PickStyle.INCLUDE:
+ if q in self.pickset:
+ self.found.add(q)
+ return True
+ elif self.pickstyle == PickStyle.EXCLUDE:
+ if q not in self.pickset:
+ self.found.add(q)
+ return True
+ return False
+
+ def filter(self, it):
+ "yield all signatures in the given iterator that are in the picklist"
+ for ss in it:
+ if self.__contains__(ss):
+ yield ss
+
+
+def passes_all_picklists(ss, picklists):
+ "does the signature 'ss' pass all of the picklists?"
+ for picklist in picklists:
+ if ss not in picklist:
+ return False
+ return True
diff --git a/src/sourmash/sbt.py b/src/sourmash/sbt.py
index ab93016b0e..c49721d498 100644
--- a/src/sourmash/sbt.py
+++ b/src/sourmash/sbt.py
@@ -15,11 +15,13 @@
import sys
from tempfile import NamedTemporaryFile
from cachetools import Cache
+from io import StringIO
from .exceptions import IndexNotSupported
from .sbt_storage import FSStorage, IPFSStorage, RedisStorage, ZipStorage
from .logging import error, notify, debug
-from .index import Index, IndexSearchResult
+from .index import Index, IndexSearchResult, CollectionManifest
+from .picklist import passes_all_picklists
from .nodegraph import Nodegraph, extract_nodegraph_info, calc_expected_collisions
@@ -148,17 +150,49 @@ def __init__(self, factory, *, d=2, storage=None, cache_size=None):
cache_size = sys.maxsize
self._nodescache = _NodesCache(maxsize=cache_size)
self._location = None
+ self.picklists = []
+ self.manifest = None
@property
def location(self):
return self._location
def signatures(self):
+ if self.manifest:
+ # if manifest, use it & load using direct path to storage.
+ # this will be faster when using picklists.
+ from .signature import load_one_signature
+ manifest = self.manifest
+
+ # iteratively select picklists; no other selection criteria
+ # apply to SBTs, since ksize etc are fixed as part of indexing.
+ for picklist in self.picklists:
+ manifest = manifest.select_to_manifest(picklist=picklist)
+
+ for loc in manifest.locations():
+ buf = self.storage.load(loc)
+ # if more than one signature can be in a file, we need
+ # to recheck picklists here.
+ ss = load_one_signature(buf)
+ yield ss
+ else:
+ # no manifest? iterate over all leaves.
+ for k in self.leaves():
+ ss = k.data
+ if passes_all_picklists(ss, self.picklists):
+ yield ss
+
+ def _signatures_with_internal(self):
+ """Return an iterator of tuples (ss, storage_path, internal_location).
+
+ Note: does not limit signatures to subsets.
+ """
for k in self.leaves():
- yield k.data
+ ss = k.data
+ yield ss, self.location, k._path
def select(self, ksize=None, moltype=None, num=0, scaled=0,
- containment=False):
+ containment=False, abund=None, picklist=None):
"""Make sure this database matches the requested requirements.
Will always raise ValueError if a requirement cannot be met.
@@ -210,9 +244,18 @@ def select(self, ksize=None, moltype=None, num=0, scaled=0,
if scaled > db_mh.scaled and not containment:
raise ValueError(f"search scaled value {scaled} is less than database scaled value of {db_mh.scaled}")
+ if abund:
+ raise ValueError("SBT indices do not support sketches with abund=True")
+
+ if picklist is not None:
+ self.picklists.append(picklist)
+ if len(self.picklists) > 1:
+ raise ValueError("we do not (yet) support multiple picklists for SBTs")
+
return self
def new_node_pos(self, node):
+ # note: node is not actually used in this function! CTB
if not self._nodes:
self.next_node = 1
return 0
@@ -286,6 +329,9 @@ def add_node(self, node):
c1 = self.children(p.pos)[0]
self._leaves[c1.pos] = node
node.update(n)
+ else:
+ # this branch should never be reached; put guard in to make sure!
+ assert 0
# update all parents!
p = self.parent(p.pos)
@@ -446,7 +492,11 @@ def node_search(node, *args, **kwargs):
# & execute!
for n in self._find_nodes(node_search, **kwargs):
- yield IndexSearchResult(results[n.data], n.data, self.location)
+ ss = n.data
+
+ # filter on picklists
+ if passes_all_picklists(ss, self.picklists):
+ yield IndexSearchResult(results[ss], ss, self.location)
def _rebuild_node(self, pos=0):
"""Recursively rebuilds an internal node (if it is not present).
@@ -569,13 +619,19 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False):
info["index_type"] = self.__class__.__name__ # TODO: check
# choose between ZipStorage and FS (file system/directory) storage.
+ # default to ZipStorage, unless .sbt.json is specified in filename.
+ kind = None
if not path.endswith(".sbt.json"):
kind = "Zip"
if not path.endswith('.sbt.zip'):
path += '.sbt.zip'
storage = ZipStorage(path)
backend = "FSStorage"
+
+ assert path[-8:] == '.sbt.zip'
name = os.path.basename(path[:-8])
+
+ # align the storage prefix with what we do for FSStorage, below.
subdir = '.sbt.{}'.format(name)
storage_args = FSStorage("", subdir, make_dirs=False).init_args()
storage.save(subdir + "/", b"")
@@ -583,16 +639,20 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False):
index_filename = os.path.abspath(path)
else: # path.endswith('.sbt.json')
assert path.endswith('.sbt.json')
- kind = "FS"
name = os.path.basename(path)
name = name[:-9]
index_filename = os.path.abspath(path)
if storage is None:
+ kind = "FS"
# default storage
location = os.path.dirname(index_filename)
+
+ # align subdir names with what we do above for ZipStorage
subdir = '.sbt.{}'.format(name)
+ # when we go to default of FSStorage, use full location for
+ # storage, e.g. location/.sbt.{name}/
storage = FSStorage(location, subdir)
index_filename = os.path.join(location, index_filename)
@@ -611,6 +671,7 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False):
nodes = {}
leaves = {}
total_nodes = len(self)
+ manifest_rows = []
for n, (i, node) in enumerate(self):
if node is None:
continue
@@ -639,31 +700,77 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False):
node.storage = storage
if kind == "Zip":
- node.save(os.path.join(subdir, data['filename']))
- elif kind == "FS":
+ new_name = node.save(os.path.join(subdir, data['filename']))
+ assert new_name.startswith(subdir + '/')
+
+ # strip off prefix
+ new_name = new_name[len(subdir) + 1:]
+ data['filename'] = new_name
+ else:
data['filename'] = node.save(data['filename'])
+
if isinstance(node, Node):
nodes[i] = data
else:
leaves[i] = data
+ row = node.make_manifest_row(data['filename'])
+ if row:
+ manifest_rows.append(row)
+
if n % 100 == 0:
notify("{} of {} nodes saved".format(n+1, total_nodes), end='\r')
+ # now, save the index file and manifests.
+ #
+ # for zipfiles, they get saved in the zip file.
+ # for FSStorage, we use the storage.save function.
+ #
+ # for everything else (Redis, IPFS), the index gets saved locally.
+ # the nodes/leaves are saved/loaded from the datatabase, and
+ # the index is used to get their names for loading.
+ # (CTB: manifests are not yet supported for Redis and IPFS)
+ #
notify("Finished saving nodes, now saving SBT index file.")
info['nodes'] = nodes
info['signatures'] = leaves
+ # finish constructing manifest object & save
+ manifest = CollectionManifest(manifest_rows)
+ manifest_name = f"{name}.manifest.csv"
+
+ manifest_fp = StringIO()
+ manifest.write_to_csv(manifest_fp, write_header=True)
+ manifest_data = manifest_fp.getvalue().encode("utf-8")
+
+ if kind == "Zip":
+ manifest_name = os.path.join(storage.subdir, manifest_name)
+ manifest_path = storage.save(manifest_name, manifest_data,
+ overwrite=True, compress=True)
+ elif kind == "FS":
+ manifest_name = manifest_name
+ manifest_path = storage.save(manifest_name, manifest_data,
+ overwrite=True)
+ else:
+ manifest_path = None
+
+ if manifest_path:
+ info['manifest_path'] = manifest_path
+
+ # now, save index.
+ tree_data = json.dumps(info).encode("utf-8")
+
if kind == "Zip":
- tree_data = json.dumps(info).encode("utf-8")
save_path = "{}.sbt.json".format(name)
- storage.save(save_path, tree_data)
+ storage.save(save_path, tree_data, overwrite=True)
storage.flush()
-
elif kind == "FS":
- with open(index_filename, 'w') as fp:
- json.dump(info, fp)
+ storage.save(index_filename, tree_data, overwrite=True)
+ else:
+ # save tree locally.
+ with open(index_filename, 'wb') as tree_fp:
+ tree_fp.write(tree_data)
notify("Finished saving SBT index, available at {0}\n".format(index_filename))
@@ -702,10 +809,10 @@ def load(cls, location, *, leaf_loader=None, storage=None, print_version_warning
if ZipStorage.can_open(location2):
storage = ZipStorage(location2)
- if storage:
- sbts = storage.list_sbts()
- if len(sbts) == 1:
- tree_data = storage.load(sbts[0])
+ if storage:
+ sbts = storage.list_sbts()
+ if len(sbts) == 1:
+ tree_data = storage.load(sbts[0])
tempfile = NamedTemporaryFile()
@@ -766,6 +873,16 @@ def load(cls, location, *, leaf_loader=None, storage=None, print_version_warning
obj = loader(jnodes, leaf_loader, dirname, storage, print_version_warning=print_version_warning, cache_size=cache_size)
obj._location = location
+
+ if 'manifest_path' in jnodes:
+ manifest_path = jnodes['manifest_path']
+ manifest_data = storage.load(manifest_path)
+ manifest_data = manifest_data.decode('utf-8')
+ manifest_fp = StringIO(manifest_data)
+ obj.manifest = CollectionManifest.load_from_csv(manifest_fp)
+ else:
+ obj.manifest = None
+
return obj
@staticmethod
@@ -1185,7 +1302,7 @@ def __str__(self):
def save(self, path):
buf = self.data.to_bytes(compression=1)
- return self.storage.save(path, buf)
+ return self.storage.save(path, buf, overwrite=True)
@property
def data(self):
@@ -1245,6 +1362,9 @@ def __str__(self):
nb=self.data.n_occupied(),
fpr=calc_expected_collisions(self.data, True, 1.1))
+ def make_manifest_row(self, location):
+ return None
+
@property
def data(self):
if self._data is None:
diff --git a/src/sourmash/sbt_storage.py b/src/sourmash/sbt_storage.py
index 48fa124c3c..83122d40c6 100644
--- a/src/sourmash/sbt_storage.py
+++ b/src/sourmash/sbt_storage.py
@@ -12,13 +12,16 @@
class Storage(ABC):
@abc.abstractmethod
- def save(self, path, content):
+ def save(self, path, content, *, overwrite=False):
pass
@abc.abstractmethod
def load(self, path):
pass
+ def list_sbts(self):
+ return []
+
def init_args(self):
return {}
@@ -49,7 +52,7 @@ def __init__(self, location, subdir, make_dirs=True):
def init_args(self):
return {'path': self.subdir}
- def save(self, path, content):
+ def save(self, path, content, overwrite=False):
"Save a node/leaf."
newpath = path
fullpath = os.path.join(self.location, self.subdir, path)
@@ -61,16 +64,19 @@ def save(self, path, content):
if old_content == content:
return path
- # different content, need to find new path to save
- newpath = None
- n = 0
- while newpath is None:
- testpath = "{}_{}".format(fullpath, n)
- if os.path.exists(testpath):
- n += 1
- else:
- # testpath is available, use it as newpath
- newpath = "{}_{}".format(path, n)
+ if overwrite:
+ pass # fine to overwrite file!
+ else:
+ # different content, need to find new path to save
+ newpath = None
+ n = 0
+ while newpath is None:
+ testpath = "{}_{}".format(fullpath, n)
+ if os.path.exists(testpath):
+ n += 1
+ else:
+ # testpath is available, use it as newpath
+ newpath = "{}_{}".format(path, n)
fullpath = os.path.join(self.location, self.subdir, newpath)
with open(fullpath, 'wb') as f:
@@ -113,50 +119,75 @@ def __init__(self, path):
if len(subdirs) == 1:
self.subdir = subdirs[0]
- def _save_to_zf(self, zf, path, content):
- # we repeat these steps for self.zipfile and self.bufferzip,
- # so better to have an auxiliary method
+ def _content_matches(self, zf, path, content):
+ info = zf.getinfo(path)
+ entry_content = zf.read(info)
+ if entry_content == content:
+ return True
+ return False
+
+ def _generate_filename(self, zf, path, content):
try:
- info = zf.getinfo(path)
+ matches = self._content_matches(zf, path, content)
+ if matches:
+ return path, False
except KeyError:
- # entry not there yet, write a new one
- newpath = path
- else:
- entry_content = zf.read(info)
-
- if entry_content == content:
- # skip writing
- return path
-
- # Trying to write new content:
- # create newpath based on path
- newpath = None
- n = 0
- while newpath is None:
- testpath = "{}_{}".format(path, n)
- try:
- zf.getinfo(testpath)
- except KeyError:
- # testpath is available, use it as newpath
- newpath = testpath
+ # entry not there yet, use that path
+ return path, True
+
+ # content does not match - generate new path based on path
+ newpath = None
+ n = 0
+ while newpath is None:
+ testpath = "{}_{}".format(path, n)
+ try:
+ matches = self._content_matches(zf, testpath, content)
+ if matches:
+ return testpath, False
else:
n += 1
+ except KeyError:
+ return testpath, True
- zf.writestr(newpath, content)
- return newpath
+ assert 0 # should never get here!
+
+ def _write_to_zf(self, zf, path, content, *, compress=False):
+ compress_type = zipfile.ZIP_STORED
+ if compress:
+ compress_type = zipfile.ZIP_DEFLATED
+
+ # save to zipfile
+ zf.writestr(path, content, compress_type=compress_type)
+
+ # set permissions
+ zi = zf.getinfo(path)
+ perms = 0o444 << 16 # give a+r access
+ if path.endswith('/'):
+ perms = 0o755 << 16 # directories get u+rwx, a+rx
+ zi.external_attr = perms
- def save(self, path, content):
+ def save(self, path, content, *, overwrite=False, compress=False):
# First try to save to self.zipfile, if it is not writable
# or would introduce duplicates then try to save it in the buffer
- try:
- newpath = self._save_to_zf(self.zipfile, path, content)
- except (ValueError, RuntimeError):
- # Can't write in the zipfile, write in buffer instead
- if self.bufferzip:
- newpath = self._save_to_zf(self.bufferzip, path, content)
- else:
- # Throw error, can't write the data
- raise ValueError("can't write data")
+ if overwrite:
+ newpath = path
+ do_write = True
+ else:
+ newpath, do_write = self._generate_filename(self.zipfile, path, content)
+ if do_write:
+ try:
+ self._write_to_zf(self.zipfile, newpath, content,
+ compress=compress)
+ except (ValueError, RuntimeError):
+ # Can't write in the zipfile, write in buffer instead
+ # CTB: do we need to generate a new filename wrt to the
+ # bufferzip, too? Not sure this code is working as intended...
+ if self.bufferzip:
+ self._write_to_zf(self.bufferzip, newpath, content,
+ compress=compress)
+ else:
+ # Throw error, can't write the data
+ raise ValueError("can't write data")
return newpath
@@ -207,7 +238,7 @@ def flush(self, *, keep_closed=False):
zf_names = set(self.zipfile.namelist())
if buffer_names:
new_data = buffer_names - zf_names
- duplicated = buffer_names.intersection(zf_names)
+ duplicated = buffer_names & zf_names
if duplicated:
# bad news, need to create new file...
@@ -221,10 +252,10 @@ def flush(self, *, keep_closed=False):
if item in duplicated or item in buffer_names:
# we prioritize writing data from the buffer to the
# final file
- final_file.writestr(item, self.bufferzip.read(item))
+ self._write_to_zf(final_file, item, self.bufferzip.read(item))
else:
# it is only in the zipfile, so write from it
- final_file.writestr(item, self.zipfile.read(item))
+ self._write_to_zf(final_file, item, self.zipfile.read(item))
# close the files, remove the old one and copy the final
# file to the right place.
@@ -243,7 +274,7 @@ def flush(self, *, keep_closed=False):
zf = zipfile.ZipFile(self.path, mode='a',
compression=zipfile.ZIP_STORED)
for item in new_data:
- zf.writestr(item, self.bufferzip.read(item))
+ self._write_to_zf(zf, item, self.bufferzip.read(item))
self.zipfile = zf
# finally, close the buffer and release memory
self.bufferzip.close()
@@ -268,7 +299,7 @@ def __init__(self, pin_on_add=True, **kwargs):
self.pin_on_add = pin_on_add
self.api = ipfshttpclient.connect(**self.ipfs_args)
- def save(self, path, content):
+ def save(self, path, content, *, overwrite=False):
new_obj = self.api.add_bytes(content)
if self.pin_on_add:
self.api.pin.add(new_obj)
@@ -306,7 +337,7 @@ def __init__(self, **kwargs):
self.redis_args = kwargs
self.conn = redis.Redis(**self.redis_args)
- def save(self, path, content):
+ def save(self, path, content, *, overwrite=False):
if not isinstance(content, bytes):
content = bytes(content)
self.conn.set(path, content)
diff --git a/src/sourmash/sbtmh.py b/src/sourmash/sbtmh.py
index d175d2ae89..6cb9cc0135 100644
--- a/src/sourmash/sbtmh.py
+++ b/src/sourmash/sbtmh.py
@@ -39,6 +39,13 @@ def __str__(self):
return '**Leaf:{name} -> {metadata}'.format(
name=self.name, metadata=self.metadata)
+ def make_manifest_row(self, loc):
+ from .index import CollectionManifest
+ row = CollectionManifest.make_manifest_row(self.data,
+ loc,
+ include_signature=0)
+ return row
+
def save(self, path):
# this is here only for triggering the property load
# before we reopen the file (and overwrite the previous
diff --git a/src/sourmash/search.py b/src/sourmash/search.py
index 93d77920ce..161f0da01e 100644
--- a/src/sourmash/search.py
+++ b/src/sourmash/search.py
@@ -2,13 +2,10 @@
Code for searching collections of signatures.
"""
from collections import namedtuple
-import sys
-import os
from enum import Enum
+import numpy as np
-from .logging import notify, error
from .signature import SourmashSignature
-from .minhash import _get_max_hash_for_scaled
class SearchType(Enum):
@@ -133,6 +130,8 @@ def collect(self, score, match_sig):
def score_jaccard(self, query_size, shared_size, subject_size, total_size):
"Calculate Jaccard similarity."
+ if total_size == 0:
+ return 0
return shared_size / total_size
def score_containment(self, query_size, shared_size, subject_size,
@@ -242,15 +241,13 @@ def search_databases_with_abund_query(query, databases, **kwargs):
###
GatherResult = namedtuple('GatherResult',
- 'intersect_bp, f_orig_query, f_match, f_unique_to_query, f_unique_weighted, average_abund, median_abund, std_abund, filename, name, md5, match, f_match_orig, unique_intersect_bp, gather_result_rank, remaining_bp')
+ 'intersect_bp, f_orig_query, f_match, f_unique_to_query, f_unique_weighted, average_abund, median_abund, std_abund, filename, name, md5, match, f_match_orig, unique_intersect_bp, gather_result_rank, remaining_bp, query_filename, query_name, query_md5, query_bp')
def _find_best(counters, query, threshold_bp):
"""
Search for the best containment, return precisely one match.
"""
- results = []
-
best_result = None
best_intersect_mh = None
@@ -274,33 +271,97 @@ def _find_best(counters, query, threshold_bp):
return None, None
-def gather_databases(query, counters, threshold_bp, ignore_abundance):
- """
- Iteratively find the best containment of `query` in all the `counters`,
- until we find fewer than `threshold_bp` (estimated) bp in common.
- """
- # track original query information for later usage.
- track_abundance = query.minhash.track_abundance and not ignore_abundance
- orig_query_mh = query.minhash
-
- # do we pay attention to abundances?
- orig_query_abunds = { k: 1 for k in orig_query_mh.hashes }
- if track_abundance:
- import numpy as np
- orig_query_abunds = orig_query_mh.hashes
-
- orig_query_mh = orig_query_mh.flatten()
- query.minhash = query.minhash.flatten()
-
- cmp_scaled = query.minhash.scaled # initialize with resolution of query
- result_n = 0
- while query.minhash:
+class GatherDatabases:
+ "Iterator object for doing gather/min-set-cov."
+
+ def __init__(self, query, counters, *,
+ threshold_bp=0, ignore_abundance=False, noident_mh=None):
+ # track original query information for later usage?
+ track_abundance = query.minhash.track_abundance and not ignore_abundance
+ self.orig_query_bp = len(query.minhash) * query.minhash.scaled
+ self.orig_query_filename = query.filename
+ self.orig_query_name = query.name
+ self.orig_query_md5 = query.md5sum()[:8]
+
+ # do we pay attention to abundances?
+ query_mh = query.minhash
+ query_hashes = query_mh.hashes
+ orig_query_abunds = { k: 1 for k in query_hashes }
+ if track_abundance:
+ orig_query_abunds = query_hashes
+
+ # adjust for not found...
+ if noident_mh is None: # create empty
+ noident_mh = query_mh.copy_and_clear()
+ self.noident_mh = noident_mh.to_frozen()
+
+ query_mh = query_mh.to_mutable()
+ query_mh.remove_many(noident_mh)
+
+ orig_query_mh = query_mh.flatten()
+ query.minhash = orig_query_mh.to_mutable()
+
+ cmp_scaled = query.minhash.scaled # initialize with resolution of query
+
+ self.result_n = 0
+ self.query = query
+ self.counters = counters
+ self.threshold_bp = threshold_bp
+
+ self.track_abundance = track_abundance
+ self.orig_query_mh = orig_query_mh
+ self.orig_query_abunds = orig_query_abunds
+
+ self.cmp_scaled = 0 # initialize with something very low!
+ self._update_scaled(cmp_scaled)
+
+ def _update_scaled(self, scaled):
+ max_scaled = max(self.cmp_scaled, scaled)
+ if self.cmp_scaled != max_scaled:
+ self.cmp_scaled = max_scaled
+
+ # CTB note: this can be expensive
+ self.orig_query_mh = self.orig_query_mh.downsample(scaled=scaled)
+ self.noident_mh = self.noident_mh.downsample(scaled=scaled)
+
+ # NOTE: orig_query_abunds can be used w/o downsampling
+ orig_query_abunds = self.orig_query_abunds
+ self.noident_query_sum_abunds = sum(( orig_query_abunds[k] \
+ for k in self.noident_mh.hashes ))
+ self.sum_abunds = sum(( orig_query_abunds[k] \
+ for k in self.orig_query_mh.hashes ))
+ self.sum_abunds += self.noident_query_sum_abunds
+
+ if max_scaled != scaled:
+ return max_scaled
+ return max_scaled
+
+ @property
+ def scaled(self):
+ return self.cmp_scaled
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ query = self.query
+ if not self.query.minhash:
+ raise StopIteration
+
+ # may be changed:
+ counters = self.counters
+ cmp_scaled = self.cmp_scaled
+
+ # will not be changed::
+ track_abundance = self.track_abundance
+ threshold_bp = self.threshold_bp
+ orig_query_abunds = self.orig_query_abunds
+
# find the best match!
best_result, intersect_mh = _find_best(counters, query, threshold_bp)
if not best_result: # no matches at all for this cutoff!
- notify(f'found less than {format_bp(threshold_bp)} in common. => exiting')
- break
+ raise StopIteration
best_match = best_result.signature
filename = best_result.location
@@ -309,34 +370,37 @@ def gather_databases(query, counters, threshold_bp, ignore_abundance):
match_scaled = best_match.minhash.scaled
assert match_scaled
- # pick the highest scaled / lowest resolution
- cmp_scaled = max(cmp_scaled, match_scaled)
+ # pick the highest scaled / lowest resolution.
+ scaled = self._update_scaled(match_scaled)
+ # CTB note: this means that if a high scaled/low res signature is
+ # found early on, resolution will be low from then on.
+
+ # retrieve various saved things, after potential downsampling
+ orig_query_mh = self.orig_query_mh
+ sum_abunds = self.sum_abunds
+ noident_mh = self.noident_mh
+ orig_query_len = len(orig_query_mh) + len(noident_mh)
# eliminate hashes under this new resolution.
- # (CTB note: this means that if a high scaled/low res signature is
- # found early on, resolution will be low from then on.)
- query_mh = query.minhash.downsample(scaled=cmp_scaled)
- found_mh = best_match.minhash.downsample(scaled=cmp_scaled).flatten()
- orig_query_mh = orig_query_mh.downsample(scaled=cmp_scaled)
- sum_abunds = sum(( orig_query_abunds[k] for k in orig_query_mh.hashes ))
+ query_mh = query.minhash.downsample(scaled=scaled)
+ found_mh = best_match.minhash.downsample(scaled=scaled).flatten()
# calculate intersection with query hashes:
- unique_intersect_bp = cmp_scaled * len(intersect_mh)
- intersect_orig_mh = orig_query_mh.intersection(found_mh)
- intersect_bp = cmp_scaled * len(intersect_orig_mh)
+ unique_intersect_bp = scaled * len(intersect_mh)
+ intersect_orig_mh = orig_query_mh & found_mh
+ intersect_bp = scaled * len(intersect_orig_mh)
# calculate fractions wrt first denominator - genome size
assert intersect_mh.contained_by(found_mh) == 1.0
f_match = len(intersect_mh) / len(found_mh)
- f_orig_query = len(intersect_orig_mh) / len(orig_query_mh)
+ f_orig_query = len(intersect_orig_mh) / orig_query_len
# calculate fractions wrt second denominator - metagenome size
assert intersect_mh.contained_by(orig_query_mh) == 1.0
- f_unique_to_query = len(intersect_mh) / len(orig_query_mh)
+ f_unique_to_query = len(intersect_mh) / orig_query_len
# calculate fraction of subject match with orig query
- f_match_orig = best_match.minhash.contained_by(orig_query_mh,
- downsample=True)
+ f_match_orig = found_mh.contained_by(orig_query_mh)
# calculate scores weighted by abundances
f_unique_weighted = sum((orig_query_abunds[k] for k in intersect_mh.hashes ))
@@ -353,17 +417,17 @@ def gather_databases(query, counters, threshold_bp, ignore_abundance):
std_abund = np.std(intersect_abunds)
# construct a new query, subtracting hashes found in previous one.
- new_query_mh = query.minhash.downsample(scaled=cmp_scaled)
- new_query_mh = new_query_mh.to_mutable()
- new_query_mh.remove_many(set(found_mh.hashes))
+ new_query_mh = query_mh.to_mutable()
+ new_query_mh.remove_many(found_mh)
new_query = SourmashSignature(new_query_mh)
- remaining_bp = cmp_scaled * len(new_query_mh)
+ remaining_bp = scaled * len(new_query_mh)
- # compute weighted_missed:
+ # compute weighted_missed for remaining query hashes
query_hashes = set(query_mh.hashes) - set(found_mh.hashes)
- weighted_missed = sum((orig_query_abunds[k] for k in query_hashes)) \
- / sum_abunds
+ weighted_missed = sum((orig_query_abunds[k] for k in query_hashes))
+ weighted_missed += self.noident_query_sum_abunds
+ weighted_missed /= sum_abunds
# build a result namedtuple
result = GatherResult(intersect_bp=intersect_bp,
@@ -380,13 +444,18 @@ def gather_databases(query, counters, threshold_bp, ignore_abundance):
md5=best_match.md5sum(),
name=str(best_match),
match=best_match,
- gather_result_rank=result_n,
- remaining_bp=remaining_bp)
- result_n += 1
-
- yield result, weighted_missed, new_query
+ gather_result_rank=self.result_n,
+ remaining_bp=remaining_bp,
+ query_bp = self.orig_query_bp,
+ query_filename=self.orig_query_filename,
+ query_name=self.orig_query_name,
+ query_md5=self.orig_query_md5,
+ )
+ self.result_n += 1
+ self.query = new_query
+ self.orig_query_mh = orig_query_mh
- query = new_query
+ return result, weighted_missed
###
@@ -416,7 +485,7 @@ def prefetch_database(query, database, threshold_bp):
db_mh = match.minhash.flatten().downsample(scaled=scaled)
# calculate db match intersection with query hashes:
- intersect_mh = query_mh.intersection(db_mh)
+ intersect_mh = query_mh & db_mh
assert len(intersect_mh) >= threshold
f_query_match = db_mh.contained_by(query_mh)
diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py
index 667e74cfcb..0f7c412cca 100644
--- a/src/sourmash/sig/__main__.py
+++ b/src/sourmash/sig/__main__.py
@@ -8,12 +8,12 @@
from collections import defaultdict
import sourmash
-import copy
from sourmash.sourmash_args import FileOutput
-from sourmash.logging import set_quiet, error, notify, set_quiet, print_results, debug
+from sourmash.logging import set_quiet, error, notify, print_results, debug
from sourmash import sourmash_args
from sourmash.minhash import _get_max_hash_for_scaled
+from sourmash.picklist import SignaturePicklist
usage='''
sourmash signature [] - manipulate/work with signature files.
@@ -27,6 +27,7 @@
filter [ ... ] - filter k-mers on abundance
flatten [ ... ] - remove abundances
intersect [ ...] - intersect one or more signatures
+manifest - build a manifest
merge [ ...] - merge one or more signatures
rename - rename signature
split [ ...] - split signatures into single files
@@ -73,10 +74,16 @@ def cat(args):
save_sigs = sourmash_args.SaveSignaturesToLocation(args.output)
save_sigs.open()
+ if args.from_file:
+ more_files = sourmash_args.load_pathlist_from_file(args.from_file)
+ args.signatures = list(args.signatures)
+ args.signatures.extend(more_files)
+
for sigfile in args.signatures:
try:
loader = sourmash_args.load_file_as_signatures(sigfile,
- progress=progress)
+ progress=progress,
+ yield_all_files=args.force)
n_loaded = 0
for sig in loader:
n_loaded += 1
@@ -123,7 +130,6 @@ def split(args):
progress = sourmash_args.SignatureLoadingProgress()
- total = 0
for sigfile in args.signatures:
# load signatures from input file:
this_siglist = sourmash_args.load_file_as_signatures(sigfile,
@@ -176,9 +182,8 @@ def split(args):
notify('loaded {} signatures from {}...', n_signatures, sigfile,
end='\r')
- total += n_signatures
- notify('loaded and split {} signatures total.', total)
+ notify(f'loaded and split {len(progress)} signatures total.')
def describe(args):
@@ -191,6 +196,7 @@ def describe(args):
w = None
csv_fp = None
if args.csv:
+ # CTB: might want to switch to sourmash_args.FileOutputCSV here?
csv_fp = open(args.csv, 'w', newline='')
w = csv.DictWriter(csv_fp,
['signature_file', 'md5', 'ksize', 'moltype', 'num',
@@ -202,14 +208,12 @@ def describe(args):
# load signatures and display info.
progress = sourmash_args.SignatureLoadingProgress()
- n_loaded = 0
for signature_file in args.signatures:
try:
- loader = sourmash_args.load_file_as_signatures(signature_file,
- progress=progress)
- for sig in loader:
- n_loaded += 1
+ idx = sourmash_args.load_file_as_index(signature_file)
+ loader = idx.signatures_with_location()
+ for sig, location in progress.start_file(signature_file, loader):
# extract info, write as appropriate.
mh = sig.minhash
ksize = mh.ksize
@@ -222,8 +226,10 @@ def describe(args):
if mh.track_abundance:
with_abundance = 1
md5 = sig.md5sum()
- name = sig.name or "** no name **"
- filename = sig.filename or "** no name **"
+ name = sig.name
+ p_name = name or "** no name **"
+ filename = sig.filename
+ p_filename = filename or "** no name **"
license = sig.license
if w:
@@ -231,9 +237,9 @@ def describe(args):
print_results('''\
---
-signature filename: {signature_file}
-signature: {name}
-source file: {filename}
+signature filename: {location}
+signature: {p_name}
+source file: {p_filename}
md5: {md5}
k={ksize} molecule={moltype} num={num} scaled={scaled} seed={seed} track_abundance={with_abundance}
size: {n_hashes}
@@ -246,7 +252,50 @@ def describe(args):
error('(continuing)')
raise
- notify('loaded {} signatures total.', n_loaded)
+ notify(f'loaded {len(progress)} signatures total.')
+
+ if csv_fp:
+ csv_fp.close()
+
+
+def manifest(args):
+ """
+ build a signature manifest
+ """
+ from sourmash.index import CollectionManifest
+
+ set_quiet(args.quiet)
+
+ # CTB: might want to switch to sourmash_args.FileOutputCSV here?
+ csv_fp = open(args.output, 'w', newline='')
+
+ CollectionManifest.write_csv_header(csv_fp)
+ w = csv.DictWriter(csv_fp, fieldnames=CollectionManifest.required_keys)
+
+ try:
+ loader = sourmash_args.load_file_as_index(args.location,
+ yield_all_files=args.force)
+ except Exception as exc:
+ error('\nError while reading signatures from {}:'.format(args.location))
+ error(str(exc))
+ error('(continuing)')
+ raise
+
+ n = 0
+ # Need to ignore existing manifests here! otherwise circularity...
+ try:
+ manifest_iter = loader._signatures_with_internal()
+ except NotImplementedError:
+ error("ERROR: manifests cannot be generated for this file.")
+ sys.exit(-1)
+
+ for n, (sig, parent, loc) in enumerate(manifest_iter):
+ # extract info, write as appropriate.
+ row = CollectionManifest.make_manifest_row(sig, loc,
+ include_signature=False)
+ w.writerow(row)
+
+ notify(f'built manifest for {n} signatures total.')
if csv_fp:
csv_fp.close()
@@ -297,7 +346,7 @@ def overlap(args):
hashes_1 = set(sig1.minhash.hashes)
hashes_2 = set(sig2.minhash.hashes)
- num_common = len(hashes_1.intersection(hashes_2))
+ num_common = len(hashes_1 & hashes_2)
disjoint_1 = len(hashes_1 - hashes_2)
disjoint_2 = len(hashes_2 - hashes_1)
num_union = len(hashes_1.union(hashes_2))
@@ -378,7 +427,7 @@ def merge(args):
if this_n:
notify('loaded and merged {} signatures from {}...', this_n, sigfile, end='\r')
- if not total_loaded:
+ if not len(progress):
error("no signatures to merge!?")
sys.exit(-1)
@@ -387,7 +436,7 @@ def merge(args):
with FileOutput(args.output, 'wt') as fp:
sourmash.save_signatures([merged_sigobj], fp=fp)
- notify('loaded and merged {} signatures', total_loaded)
+ notify(f'loaded and merged {len(progress)} signatures')
def intersect(args):
@@ -401,7 +450,6 @@ def intersect(args):
first_sig = None
mins = None
- total_loaded = 0
progress = sourmash_args.SignatureLoadingProgress()
@@ -420,10 +468,9 @@ def intersect(args):
sys.exit(-1)
mins.intersection_update(sigobj.minhash.hashes)
- total_loaded += 1
notify('loaded and intersected signatures from {}...', sigfile, end='\r')
- if total_loaded == 0:
+ if len(progress) == 0:
error("no signatures to merge!?")
sys.exit(-1)
@@ -455,7 +502,7 @@ def intersect(args):
with FileOutput(args.output, 'wt') as fp:
sourmash.save_signatures([intersect_sigobj], fp=fp)
- notify('loaded and intersected {} signatures', total_loaded)
+ notify(f'loaded and intersected {len(progress)} signatures')
def subtract(args):
@@ -479,7 +526,6 @@ def subtract(args):
progress = sourmash_args.SignatureLoadingProgress()
- total_loaded = 0
for sigfile in args.subtraction_sigs:
for sigobj in sourmash_args.load_file_as_signatures(sigfile,
ksize=args.ksize,
@@ -496,9 +542,8 @@ def subtract(args):
subtract_mins -= set(sigobj.minhash.hashes)
notify('loaded and subtracted signatures from {}...', sigfile, end='\r')
- total_loaded += 1
- if not total_loaded:
+ if not len(progress):
error("no signatures to subtract!?")
sys.exit(-1)
@@ -511,7 +556,7 @@ def subtract(args):
with FileOutput(args.output, 'wt') as fp:
sourmash.save_signatures([subtract_sigobj], fp=fp)
- notify('loaded and subtracted {} signatures', total_loaded)
+ notify(f'loaded and subtracted {len(progress)} signatures')
def rename(args):
@@ -539,7 +584,7 @@ def rename(args):
save_sigs.close()
- notify("set name to '{}' on {} signatures", args.name, len(save_sigs))
+ notify(f"set name to '{args.name}' on {len(save_sigs)} signatures")
def extract(args):
@@ -548,35 +593,45 @@ def extract(args):
"""
set_quiet(args.quiet)
moltype = sourmash_args.calculate_moltype(args)
+ picklist = sourmash_args.load_picklist(args)
+
+ # further filtering on md5 or name?
+ if args.md5 is not None or args.name is not None:
+ def filter_fn(it):
+ for ss in it:
+ # match?
+ keep = False
+ if args.name and args.name in str(ss):
+ keep = True
+ if args.md5 and args.md5 in ss.md5sum():
+ keep = True
+
+ if keep:
+ yield ss
+ else:
+ # whatever comes out of the database is fine
+ def filter_fn(it):
+ for ss in it:
+ yield ss
+ # ok! filtering defined, let's go forward
progress = sourmash_args.SignatureLoadingProgress()
save_sigs = sourmash_args.SaveSignaturesToLocation(args.output)
save_sigs.open()
- total_loaded = 0
for filename in args.signatures:
siglist = sourmash_args.load_file_as_signatures(filename,
ksize=args.ksize,
select_moltype=moltype,
+ picklist=picklist,
progress=progress)
- siglist = list(siglist)
-
- total_loaded += len(siglist)
-
- # select!
- if args.md5 is not None:
- siglist = [ ss for ss in siglist if args.md5 in ss.md5sum() ]
- if args.name is not None:
- siglist = [ ss for ss in siglist if args.name in str(ss) ]
-
- for ss in siglist:
+ for ss in filter_fn(siglist):
save_sigs.add(ss)
- notify("loaded {} total that matched ksize & molecule type",
- total_loaded)
+ notify(f"loaded {len(progress)} total that matched ksize & molecule type")
if not save_sigs:
- error("no matching signatures!")
+ error("no matching signatures to save!")
sys.exit(-1)
save_sigs.close()
@@ -584,6 +639,9 @@ def extract(args):
notify("extracted {} signatures from {} file(s)", len(save_sigs),
len(args.signatures))
+ if picklist:
+ sourmash_args.report_picklist(args, picklist)
+
def filter(args):
"""
@@ -597,7 +655,6 @@ def filter(args):
save_sigs = sourmash_args.SaveSignaturesToLocation(args.output)
save_sigs.open()
- total_loaded = 0
for filename in args.signatures:
siglist = sourmash_args.load_file_as_signatures(filename,
ksize=args.ksize,
@@ -605,8 +662,6 @@ def filter(args):
progress=progress)
siglist = list(siglist)
- total_loaded += len(siglist)
-
# select!
if args.md5 is not None:
siglist = [ ss for ss in siglist if args.md5 in ss.md5sum() ]
@@ -637,8 +692,7 @@ def filter(args):
save_sigs.close()
- notify("loaded {} total that matched ksize & molecule type",
- total_loaded)
+ notify(f"loaded {len(progress)} total that matched ksize & molecule type")
notify("extracted {} signatures from {} file(s)", len(save_sigs),
len(args.signatures))
@@ -655,7 +709,6 @@ def flatten(args):
save_sigs = sourmash_args.SaveSignaturesToLocation(args.output)
save_sigs.open()
- total_loaded = 0
for filename in args.signatures:
siglist = sourmash_args.load_file_as_signatures(filename,
ksize=args.ksize,
@@ -663,8 +716,6 @@ def flatten(args):
progress=progress)
siglist = list(siglist)
- total_loaded += len(siglist)
-
# select!
if args.md5 is not None:
siglist = [ ss for ss in siglist if args.md5 in ss.md5sum() ]
@@ -677,8 +728,7 @@ def flatten(args):
save_sigs.close()
- notify("loaded {} total that matched ksize & molecule type",
- total_loaded)
+ notify(f"loaded {len(progress)} total that matched ksize & molecule type")
notify("extracted {} signatures from {} file(s)", len(save_sigs),
len(args.signatures))
@@ -703,7 +753,6 @@ def downsample(args):
progress = sourmash_args.SignatureLoadingProgress()
- total_loaded = 0
for sigfile in args.signatures:
siglist = sourmash_args.load_file_as_signatures(sigfile,
ksize=args.ksize,
@@ -714,7 +763,6 @@ def downsample(args):
mh = sigobj.minhash
notify('loading and downsampling signature from {}...', sigfile, end='\r')
- total_loaded += 1
if args.scaled:
if mh.scaled:
mh_new = mh.downsample(scaled=args.scaled)
@@ -725,7 +773,7 @@ def downsample(args):
if max(mins) < max_hash:
raise ValueError("this num MinHash does not have enough hashes to convert it into a scaled MinHash.")
- mh_new = copy.copy(mh)
+ mh_new = mh.copy()
_set_num_scaled(mh_new, 0, args.scaled)
elif args.num:
if mh.num:
@@ -735,7 +783,7 @@ def downsample(args):
if len(mh) < args.num:
raise ValueError("this scaled MinHash has only {} hashes")
- mh_new = copy.copy(mh)
+ mh_new = mh.copy()
_set_num_scaled(mh_new, args.num, 0)
sigobj.minhash = mh_new
@@ -744,7 +792,7 @@ def downsample(args):
save_sigs.close()
- notify("loaded and downsampled {} signatures", total_loaded)
+ notify(f"loaded and downsampled {len(progress)} signatures")
def sig_import(args):
diff --git a/src/sourmash/signature.py b/src/sourmash/signature.py
index b1915d38cf..386fdb1733 100644
--- a/src/sourmash/signature.py
+++ b/src/sourmash/signature.py
@@ -192,6 +192,17 @@ def __reduce__(self):
),
)
+ def __copy__(self):
+ mh = self.minhash
+ mh = mh.to_frozen()
+ a = SourmashSignature(
+ mh,
+ name=self.name,
+ filename=self.filename,
+ )
+ return a
+
+ copy = __copy__
def _detect_input_type(data):
"""\
diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py
index 54661685c5..3261a7bceb 100644
--- a/src/sourmash/sourmash_args.py
+++ b/src/sourmash/sourmash_args.py
@@ -3,28 +3,26 @@
"""
import sys
import os
-import argparse
-import itertools
from enum import Enum
import traceback
import gzip
import zipfile
+from io import StringIO
import screed
+import sourmash
from sourmash.sbtmh import load_sbt_index
from sourmash.lca.lca_db import load_single_database
import sourmash.exceptions
-from . import signature
from .logging import notify, error, debug_literal
from .index import (LinearIndex, ZipFileLinearIndex, MultiIndex)
-from . import signature as sig
-from .sbt import SBT
-from .sbtmh import SigLeaf
-from .lca import LCA_Database
-import sourmash
+from . import signature as sigmod
+from .picklist import SignaturePicklist, PickStyle
+from .manifest import CollectionManifest
+
DEFAULT_LOAD_K = 31
@@ -57,12 +55,51 @@ def calculate_moltype(args, default=None):
n += 1
if n > 1:
- error("cannot specify more than one of --dna/--rna/--protein/--hp/--dayhoff")
+ error("cannot specify more than one of --dna/--rna/--nucleotide/--protein/--hp/--dayhoff")
sys.exit(-1)
return moltype
+def load_picklist(args):
+ "Load a SignaturePicklist from --picklist arguments."
+ picklist = None
+ if args.picklist:
+ try:
+ picklist = SignaturePicklist.from_picklist_args(args.picklist)
+ except ValueError as exc:
+ error("ERROR: could not load picklist.")
+ error(str(exc))
+ sys.exit(-1)
+
+ notify(f"picking column '{picklist.column_name}' of type '{picklist.coltype}' from '{picklist.pickfile}'")
+
+ n_empty_val, dup_vals = picklist.load(picklist.pickfile, picklist.column_name)
+
+ notify(f"loaded {len(picklist.pickset)} distinct values into picklist.")
+ if n_empty_val:
+ notify(f"WARNING: {n_empty_val} empty values in column '{picklist.column_name}' in picklist file")
+ if dup_vals:
+ notify(f"WARNING: {len(dup_vals)} values in picklist column '{picklist.column_name}' were not distinct")
+
+ return picklist
+
+
+def report_picklist(args, picklist):
+ if picklist.pickstyle == PickStyle.INCLUDE:
+ notify(f"for given picklist, found {len(picklist.found)} matches to {len(picklist.pickset)} distinct values")
+ n_missing = len(picklist.pickset - picklist.found)
+ elif picklist.pickstyle == PickStyle.EXCLUDE:
+ notify(f"for given picklist, found {len(picklist.found)} matches by excluding {len(picklist.pickset)} distinct values")
+ n_missing = 0
+ if n_missing:
+ notify(f"WARNING: {n_missing} missing picklist values.")
+ # Note - picklist_require_all is currently only relevant for PickStyle.INCLUDE
+ if args.picklist_require_all:
+ error("ERROR: failing because --picklist-require-all was set")
+ sys.exit(-1)
+
+
def load_query_signature(filename, ksize, select_moltype, select_md5=None):
"""Load a single signature to use as a query.
@@ -137,13 +174,14 @@ def traverse_find_sigs(filenames, yield_all_files=False):
# filename is a directory -- traverse beneath!
elif os.path.isdir(filename):
for root, dirs, files in os.walk(filename):
- for name in files:
+ for name in sorted(files):
fullname = os.path.join(root, name)
if yield_all_files or _check_suffix(fullname, endings):
yield fullname
-def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None):
+def load_dbs_and_sigs(filenames, query, is_similarity_query, *,
+ cache_size=None, picklist=None):
"""
Load one or more SBTs, LCAs, and/or collections of signatures.
@@ -185,6 +223,9 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None)
notify(f"no compatible signatures found in '{filename}'")
sys.exit(-1)
+ if picklist:
+ db = db.select(picklist=picklist)
+
databases.append(db)
# calc num loaded info.
@@ -237,16 +278,6 @@ def _multiindex_load_from_path(filename, **kwargs):
return db
-def _load_sigfile(filename, **kwargs):
- "Load collection from a signature JSON file"
- try:
- db = LinearIndex.load(filename)
- except sourmash.exceptions.SourmashError as exc:
- raise ValueError(exc)
-
- return db
-
-
def _load_sbt(filename, **kwargs):
"Load collection from an SBT."
cache_size = kwargs.get('cache_size')
@@ -278,8 +309,7 @@ def _load_zipfile(filename, **kwargs):
# all loader functions, in order.
_loader_functions = [
("load from stdin", _load_stdin),
- ("load from directory", _multiindex_load_from_path),
- ("load from sig file", _load_sigfile),
+ ("load from path (file or directory)", _multiindex_load_from_path),
("load from file list", _multiindex_load_from_pathlist),
("load SBT", _load_sbt),
("load revindex", _load_revindex),
@@ -298,14 +328,14 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None):
# iterate through loader functions, trying them all. Catch ValueError
# but nothing else.
- for (desc, load_fn) in _loader_functions:
+ for n, (desc, load_fn) in enumerate(_loader_functions):
try:
- debug_literal(f"_load_databases: trying loader fn {desc}")
+ debug_literal(f"_load_databases: trying loader fn {n} {desc}")
db = load_fn(filename,
traverse_yield_all=traverse_yield_all,
cache_size=cache_size)
- except ValueError as exc:
- debug_literal(f"_load_databases: FAIL on fn {desc}.")
+ except ValueError:
+ debug_literal(f"_load_databases: FAIL on fn {n} {desc}.")
debug_literal(traceback.format_exc())
if db is not None:
@@ -321,7 +351,7 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None):
# CTB: could be kind of time consuming for a big record, but at the
# moment screed doesn't expose format detection cleanly.
with screed.open(filename) as it:
- record = next(iter(it))
+ _ = next(iter(it))
successful_screed_load = True
except:
pass
@@ -338,7 +368,7 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None):
return db
-def load_file_as_index(filename, yield_all_files=False):
+def load_file_as_index(filename, *, yield_all_files=False):
"""Load 'filename' as a database; generic database loader.
If 'filename' contains an SBT or LCA indexed database, or a regular
@@ -356,9 +386,11 @@ def load_file_as_index(filename, yield_all_files=False):
return _load_database(filename, yield_all_files)
-def load_file_as_signatures(filename, select_moltype=None, ksize=None,
+def load_file_as_signatures(filename, *, select_moltype=None, ksize=None,
+ picklist=None,
yield_all_files=False,
- progress=None):
+ progress=None,
+ _use_manifest=True):
"""Load 'filename' as a collection of signatures. Return an iterable.
If 'filename' contains an SBT or LCA indexed database, or a regular
@@ -373,16 +405,21 @@ def load_file_as_signatures(filename, select_moltype=None, ksize=None,
underneath this directory into a list of signatures. If
yield_all_files=True, will attempt to load all files.
- Applies selector function if select_moltype and/or ksize are given.
+ Applies selector function if select_moltype, ksize or picklist are given.
"""
if progress:
progress.notify(filename)
db = _load_database(filename, yield_all_files)
- db = db.select(moltype=select_moltype, ksize=ksize)
+
+ # test fixture ;)
+ if not _use_manifest and db.manifest:
+ db.manifest = None
+
+ db = db.select(moltype=select_moltype, ksize=ksize, picklist=picklist)
loader = db.signatures()
- if progress:
+ if progress is not None:
return progress.start_file(filename, loader)
else:
return loader
@@ -400,7 +437,7 @@ def load_pathlist_from_file(filename):
if not os.path.exists(checkfile):
raise ValueError(f"file '{checkfile}' inside the pathlist does not exist")
except IOError:
- raise ValueError(f"pathlist file '{filename}' does not exist")
+ raise ValueError(f"pathlist file '{filename}' does not exist")
except OSError:
raise ValueError(f"cannot open file '{filename}'")
except UnicodeDecodeError:
@@ -491,16 +528,19 @@ class SignatureLoadingProgress(object):
Instantiate this class once, and then pass it to load_file_as_signatures
with progress=.
- Alternatively, call obj.start_file(filename, iter) each time you
+ Alternatively, call obj.start_file(location, iter) each time you
start loading signatures from a new file via iter.
- You can optionally notify of reading a file with `.notify(filename)`.
+ You can optionally notify of reading a file with `.notify(location)`.
"""
def __init__(self, reporting_interval=10):
self.n_sig = 0
self.interval = reporting_interval
self.screen_width = 79
+ def __len__(self):
+ return self.n_sig
+
def short_notify(self, msg_template, *args, **kwargs):
"""Shorten the notification message so that it fits on one line.
@@ -517,11 +557,10 @@ def short_notify(self, msg_template, *args, **kwargs):
notify(msg, end=end)
- def notify(self, filename):
- self.short_notify("...reading from file '{}'",
- filename, end='\r')
+ def notify(self, location):
+ self.short_notify(f"...{self.n_sig} sigs so far. Now reading from file '{location}'", end='\r')
- def start_file(self, filename, loader):
+ def start_file(self, location, loader):
n_this = 0
n_before = self.n_sig
@@ -532,7 +571,7 @@ def start_file(self, filename, loader):
n_total = n_before + n_this
if n_this and n_total % self.interval == 0:
self.short_notify("...loading from '{}' / {} sigs total",
- filename, n_total, end='\r')
+ location, n_total, end='\r')
yield result
except KeyboardInterrupt:
@@ -542,7 +581,8 @@ def start_file(self, filename, loader):
finally:
self.n_sig += n_this
- self.short_notify("loaded {} sigs from '{}'", n_this, filename)
+ self.short_notify(f"Loaded {n_this} sigs from '{location}'",
+ end='\r')
#
@@ -594,7 +634,7 @@ class SaveSignatures_Directory(_BaseSaveSignaturesToLocation):
"Save signatures within a directory, using md5sum names."
def __init__(self, location):
super().__init__(location)
-
+
def __repr__(self):
return f"SaveSignatures_Directory('{self.location}')"
@@ -626,7 +666,7 @@ def add(self, ss):
i += 1
with gzip.open(outname, "wb") as fp:
- sig.save_signatures([ss], fp, compression=1)
+ sigmod.save_signatures([ss], fp, compression=1)
class SaveSignatures_SigFile(_BaseSaveSignaturesToLocation):
@@ -671,15 +711,32 @@ class SaveSignatures_ZipFile(_BaseSaveSignaturesToLocation):
def __init__(self, location):
super().__init__(location)
self.zf = None
-
+
def __repr__(self):
return f"SaveSignatures_ZipFile('{self.location}')"
def close(self):
+ # finish constructing manifest object & save
+ manifest = CollectionManifest(self.manifest_rows)
+ manifest_name = f"SOURMASH-MANIFEST.csv"
+
+ manifest_fp = StringIO()
+ manifest.write_to_csv(manifest_fp, write_header=True)
+ manifest_data = manifest_fp.getvalue().encode("utf-8")
+
+ # compress the manifest --
+ self.zf.writestr(manifest_name, manifest_data,
+ compress_type=zipfile.ZIP_DEFLATED)
+
+ # set permissions:
+ zi = self.zf.getinfo(manifest_name)
+ zi.external_attr = 0o444 << 16 # give a+r access
+
self.zf.close()
def open(self):
self.zf = zipfile.ZipFile(self.location, 'w', zipfile.ZIP_STORED)
+ self.manifest_rows = []
def _exists(self, name):
try:
@@ -689,7 +746,8 @@ def _exists(self, name):
return False
def add(self, ss):
- assert self.zf
+ if not self.zf:
+ raise ValueError("this output is not open")
super().add(ss)
md5 = ss.md5sum()
@@ -707,6 +765,15 @@ def add(self, ss):
json_str = sourmash.save_signatures([ss], compression=1)
self.zf.writestr(outname, json_str)
+ # set permissions:
+ zi = self.zf.getinfo(outname)
+ zi.external_attr = 0o444 << 16 # give a+r access
+
+ # update manifest
+ row = CollectionManifest.make_manifest_row(ss, outname,
+ include_signature=False)
+ self.manifest_rows.append(row)
+
class SigFileSaveType(Enum):
SIGFILE = 1
diff --git a/src/sourmash/tax/__init__.py b/src/sourmash/tax/__init__.py
new file mode 100644
index 0000000000..afdccbee52
--- /dev/null
+++ b/src/sourmash/tax/__init__.py
@@ -0,0 +1 @@
+from .__main__ import main
diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
new file mode 100644
index 0000000000..a6ddeb8abf
--- /dev/null
+++ b/src/sourmash/tax/__main__.py
@@ -0,0 +1,326 @@
+"""
+Command-line entry point for 'python -m sourmash.tax'
+"""
+import sys
+import csv
+import os
+from collections import defaultdict
+
+import sourmash
+from ..sourmash_args import FileOutputCSV
+from sourmash.logging import set_quiet, error, notify
+from sourmash.lca.lca_utils import display_lineage
+
+from . import tax_utils
+from .tax_utils import ClassificationResult, MultiLineageDB
+
+usage='''
+sourmash taxonomy [] - manipulate/work with taxonomy information.
+or
+sourmash tax []
+
+
+** Commands can be:
+
+annotate -g [ ... ] -t [ ...] - annotate gather CSVs with taxonomic lineages
+genome -g [ ... ] -t [ ...] - taxonomic classification of genomes from gather results
+metagenome -g [ ... ] -t [ ...] - summarize taxonomic information for metagenome gather results
+
+** Use '-h' to get subcommand-specific help, e.g.
+
+sourmash taxonomy metagenome -h
+'''
+
+# some utils
+def make_outfile(base, output_type, *, output_dir = ""):
+ if base == "-":
+ return base
+ ext=""
+ if output_type == 'csv_summary':
+ ext = '.summarized.csv'
+ elif output_type == 'classification':
+ ext = '.classifications.csv'
+ elif output_type == 'krona':
+ ext = '.krona.tsv'
+ elif output_type == 'lineage_summary':
+ ext = '.lineage_summary.tsv'
+ elif output_type == 'annotate':
+ ext = '.with-lineages.csv'
+ fname = base+ext
+ if output_dir:
+ fname = os.path.join(output_dir, fname)
+ notify(f"saving `{output_type}` output to {fname}.")
+ return fname
+
+
+##### taxonomy command line functions
+def metagenome(args):
+ """
+ summarize taxonomic information for metagenome gather results
+ """
+ set_quiet(args.quiet)
+
+ # first, load taxonomic_assignments
+ try:
+ tax_assign = MultiLineageDB.load(args.taxonomy_csv,
+ keep_full_identifiers=args.keep_full_identifiers,
+ keep_identifier_versions=args.keep_identifier_versions,
+ force=args.force)
+ available_ranks = tax_assign.available_ranks
+ except ValueError as exc:
+ error(f"ERROR: {str(exc)}")
+ sys.exit(-1)
+
+ if not tax_assign:
+ error(f'ERROR: No taxonomic assignments loaded from {",".join(args.taxonomy_csv)}. Exiting.')
+ sys.exit(-1)
+
+ if args.rank and args.rank not in available_ranks:
+ error(f"ERROR: No taxonomic information provided for rank {args.rank}: cannot summarize at this rank")
+ sys.exit(-1)
+
+ # next, collect and load gather results
+ gather_csvs = tax_utils.collect_gather_csvs(args.gather_csv, from_file= args.from_file)
+ try:
+ gather_results, idents_missed, total_missed, _ = tax_utils.check_and_load_gather_csvs(gather_csvs, tax_assign, force=args.force,
+ fail_on_missing_taxonomy=args.fail_on_missing_taxonomy)
+ except ValueError as exc:
+ error(f"ERROR: {str(exc)}")
+ sys.exit(-1)
+
+ if not gather_results:
+ notify('No gather results loaded. Exiting.')
+ sys.exit(-1)
+
+ # actually summarize at rank
+ summarized_gather = {}
+ seen_perfect = set()
+ for rank in sourmash.lca.taxlist(include_strain=False):
+ summarized_gather[rank], seen_perfect = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed,
+ keep_full_identifiers=args.keep_full_identifiers,
+ keep_identifier_versions = args.keep_identifier_versions,
+ seen_perfect = seen_perfect)
+
+ # write summarized output csv
+ if "csv_summary" in args.output_format:
+ summary_outfile = make_outfile(args.output_base, "csv_summary", output_dir=args.output_dir)
+ with FileOutputCSV(summary_outfile) as out_fp:
+ tax_utils.write_summary(summarized_gather, out_fp)
+
+ # if lineage summary table
+ if "lineage_summary" in args.output_format:
+ lineage_outfile = make_outfile(args.output_base, "lineage_summary", output_dir=args.output_dir)
+
+ ## aggregate by lineage, by query
+ lineageD, query_names, num_queries = tax_utils.aggregate_by_lineage_at_rank(summarized_gather[args.rank], by_query=True)
+
+ with FileOutputCSV(lineage_outfile) as out_fp:
+ tax_utils.write_lineage_sample_frac(query_names, lineageD, out_fp, format_lineage=True, sep='\t')
+
+ # write summarized --> krona output tsv
+ if "krona" in args.output_format:
+ krona_resultslist = tax_utils.format_for_krona(args.rank, summarized_gather)
+
+ krona_outfile = make_outfile(args.output_base, "krona", output_dir=args.output_dir)
+ with FileOutputCSV(krona_outfile) as out_fp:
+ tax_utils.write_krona(args.rank, krona_resultslist, out_fp)
+
+
+def genome(args):
+ """
+ taxonomic classification of genomes from gather results
+ """
+ set_quiet(args.quiet)
+
+ # first, load taxonomic_assignments
+ try:
+ tax_assign = MultiLineageDB.load(args.taxonomy_csv,
+ keep_full_identifiers=args.keep_full_identifiers,
+ keep_identifier_versions=args.keep_identifier_versions,
+ force=args.force)
+ available_ranks = tax_assign.available_ranks
+ except ValueError as exc:
+ error(f"ERROR: {str(exc)}")
+ sys.exit(-1)
+
+ if not tax_assign:
+ error(f'ERROR: No taxonomic assignments loaded from {",".join(args.taxonomy_csv)}. Exiting.')
+ sys.exit(-1)
+
+ if args.rank and args.rank not in available_ranks:
+ error(f"ERROR: No taxonomic information provided for rank {args.rank}: cannot classify at this rank")
+ sys.exit(-1)
+
+ # get gather_csvs from args
+ gather_csvs = tax_utils.collect_gather_csvs(args.gather_csv, from_file=args.from_file)
+
+ classifications = defaultdict(list)
+ matched_queries=set()
+ krona_results = []
+ status = "nomatch"
+ seen_perfect = set()
+
+ # read in all gather CSVs (queries in more than one gather file will raise error; with --force they will only be loaded once)
+ # note: doing one CSV at a time would work and probably be more memory efficient, but we would need to change how we check
+ # for duplicated queries
+ try:
+ gather_results, idents_missed, total_missed, _ = tax_utils.check_and_load_gather_csvs(gather_csvs, tax_assign, force=args.force,
+ fail_on_missing_taxonomy=args.fail_on_missing_taxonomy)
+
+ except ValueError as exc:
+ error(f"ERROR: {str(exc)}")
+ sys.exit(-1)
+
+ # if --rank is specified, classify to that rank
+ if args.rank:
+ best_at_rank, seen_perfect = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, skip_idents=idents_missed,
+ keep_full_identifiers=args.keep_full_identifiers,
+ keep_identifier_versions = args.keep_identifier_versions,
+ best_only=True, seen_perfect=seen_perfect)
+
+ # best at rank is a list of SummarizedGather tuples
+ for sg in best_at_rank:
+ status = 'nomatch'
+ if sg.query_name in matched_queries:
+ continue
+ if sg.fraction <= args.containment_threshold:
+ status="below_threshold"
+ notify(f"WARNING: classifying query {sg.query_name} at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}")
+ else:
+ status="match"
+ classif = ClassificationResult(sg.query_name, status, sg.rank, sg.fraction, sg.lineage, sg.query_md5, sg.query_filename)
+ classifications[args.rank].append(classif)
+ matched_queries.add(sg.query_name)
+ if "krona" in args.output_format:
+ lin_list = display_lineage(sg.lineage).split(';')
+ krona_results.append((sg.fraction, *lin_list))
+ else:
+ # classify to the match that passes the containment threshold.
+ # To do - do we want to store anything for this match if nothing >= containment threshold?
+ for rank in tax_utils.ascending_taxlist(include_strain=False):
+ # gets best_at_rank for all queries in this gather_csv
+ best_at_rank, seen_perfect = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed,
+ keep_full_identifiers=args.keep_full_identifiers,
+ keep_identifier_versions = args.keep_identifier_versions,
+ best_only=True, seen_perfect=seen_perfect)
+
+ for sg in best_at_rank:
+ status = 'nomatch'
+ if sg.query_name in matched_queries:
+ continue
+ if sg.fraction >= args.containment_threshold:
+ status = "match"
+ classif = ClassificationResult(sg.query_name, status, sg.rank, sg.fraction, sg.lineage, sg.query_md5, sg.query_filename)
+ classifications[sg.rank].append(classif)
+ matched_queries.add(sg.query_name)
+ continue
+ if rank == "superkingdom" and status == "nomatch":
+ status="below_threshold"
+ classif = ClassificationResult(query_name=sg.query_name, status=status,
+ rank="", fraction=0, lineage="",
+ query_md5=sg.query_md5, query_filename=sg.query_filename)
+ classifications[sg.rank].append(classif)
+
+ if not any([classifications, krona_results]):
+ notify('No results for classification. Exiting.')
+ sys.exit(-1)
+
+ # write outputs
+ if "csv_summary" in args.output_format:
+ summary_outfile = make_outfile(args.output_base, "classification", output_dir=args.output_dir)
+ with FileOutputCSV(summary_outfile) as out_fp:
+ tax_utils.write_classifications(classifications, out_fp)
+
+ if "krona" in args.output_format:
+ krona_outfile = make_outfile(args.output_base, "krona", output_dir=args.output_dir)
+ with FileOutputCSV(krona_outfile) as out_fp:
+ tax_utils.write_krona(args.rank, krona_results, out_fp)
+
+
+def annotate(args):
+ """
+ Annotate gather results with taxonomic lineage for each match.
+
+ Produces gather csv with lineage information as the final column.
+ """
+
+ set_quiet(args.quiet)
+
+ # first, load taxonomic_assignments
+ try:
+ tax_assign = MultiLineageDB.load(args.taxonomy_csv,
+ keep_full_identifiers=args.keep_full_identifiers,
+ keep_identifier_versions=args.keep_identifier_versions,
+ force=args.force)
+ available_ranks = tax_assign.available_ranks
+ except ValueError as exc:
+ error(f"ERROR: {str(exc)}")
+ sys.exit(-1)
+
+ if not tax_assign:
+ error(f'ERROR: No taxonomic assignments loaded from {",".join(args.taxonomy_csv)}. Exiting.')
+ sys.exit(-1)
+
+ # get gather_csvs from args
+ gather_csvs = tax_utils.collect_gather_csvs(args.gather_csv, from_file=args.from_file)
+
+ # handle each gather csv separately
+ for n, g_csv in enumerate(gather_csvs):
+ gather_results, idents_missed, total_missed, header = tax_utils.check_and_load_gather_csvs(g_csv, tax_assign, force=args.force,
+ fail_on_missing_taxonomy=args.fail_on_missing_taxonomy)
+
+ if not gather_results:
+ continue
+
+ out_base = os.path.basename(g_csv.rsplit('.csv')[0])
+ this_outfile = make_outfile(out_base, "annotate", output_dir=args.output_dir)
+
+ with FileOutputCSV(this_outfile) as out_fp:
+ header.append("lineage")
+ w = csv.DictWriter(out_fp, header, delimiter=',')
+ w.writeheader()
+
+ # add taxonomy info and then print directly
+ for row in gather_results:
+ match_ident = row['name']
+ lineage = tax_utils.find_match_lineage(match_ident, tax_assign, skip_idents=idents_missed,
+ keep_full_identifiers=args.keep_full_identifiers,
+ keep_identifier_versions=args.keep_identifier_versions)
+ row['lineage'] = display_lineage(lineage)
+ w.writerow(row)
+
+
+def prepare(args):
+ "Combine multiple taxonomy databases into one and/or translate formats."
+ notify("loading taxonomies...")
+ try:
+ tax_assign = MultiLineageDB.load(args.taxonomy_csv,
+ keep_full_identifiers=args.keep_full_identifiers,
+ keep_identifier_versions=args.keep_identifier_versions)
+ except ValueError as exc:
+ error("ERROR while loading taxonomies!")
+ error(str(exc))
+ sys.exit(-1)
+
+ notify(f"...loaded {len(tax_assign)} entries.")
+
+ notify(f"saving to '{args.output}', format {args.database_format}...")
+ try:
+ tax_assign.save(args.output, args.database_format)
+ except ValueError as exc:
+ error("ERROR while saving!")
+ error(str(exc))
+ sys.exit(-1)
+
+ notify("done!")
+
+
+def main(arglist=None):
+ args = sourmash.cli.get_parser().parse_args(arglist)
+ submod = getattr(sourmash.cli.sig, args.subcmd)
+ mainmethod = getattr(submod, 'main')
+ return mainmethod(args)
+
+
+if __name__ == '__main__':
+ main(sys.argv)
diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
new file mode 100644
index 0000000000..810bd9c9af
--- /dev/null
+++ b/src/sourmash/tax/tax_utils.py
@@ -0,0 +1,819 @@
+"""
+Utility functions for taxonomy analysis tools.
+"""
+import os
+import csv
+from collections import namedtuple, defaultdict
+from collections import abc
+
+__all__ = ['get_ident', 'ascending_taxlist', 'collect_gather_csvs',
+ 'load_gather_results', 'check_and_load_gather_csvs',
+ 'find_match_lineage', 'summarize_gather_at',
+ 'find_missing_identities', 'make_krona_header',
+ 'aggregate_by_lineage_at_rank', 'format_for_krona',
+ 'write_krona', 'write_summary', 'write_classifications',
+ 'combine_sumgather_csvs_by_lineage', 'write_lineage_sample_frac',
+ 'MultiLineageDB']
+
+from sourmash.logging import notify
+from sourmash.sourmash_args import load_pathlist_from_file
+
+SummarizedGatherResult = namedtuple("SummarizedGatherResult", "query_name, rank, fraction, lineage, query_md5, query_filename")
+ClassificationResult = namedtuple("ClassificationResult", "query_name, status, rank, fraction, lineage, query_md5, query_filename")
+
+# import lca utils as needed for now
+from sourmash.lca import lca_utils
+from sourmash.lca.lca_utils import (LineagePair, taxlist, display_lineage, pop_to_rank)
+
+
+def get_ident(ident, *,
+ keep_full_identifiers=False, keep_identifier_versions=False):
+ # split identifiers = split on whitespace
+ # keep identifiers = don't split .[12] from assembly accessions
+ "Hack and slash identifiers."
+ if not keep_full_identifiers:
+ ident = ident.split(' ')[0]
+ if not keep_identifier_versions:
+ ident = ident.split('.')[0]
+ return ident
+
+
+def ascending_taxlist(include_strain=True):
+ """
+ Provide an ordered list of taxonomic ranks: strain --> superkingdom
+ """
+ ascending_taxlist = ['species', 'genus', 'family', 'order',
+ 'class', 'phylum', 'superkingdom']
+ if include_strain:
+ ascending_taxlist = ['strain'] + ascending_taxlist
+ for k in ascending_taxlist:
+ yield k
+
+
+def collect_gather_csvs(cmdline_gather_input, *, from_file=None):
+ """
+ collect gather files from cmdline; --from-file input
+ """
+ gather_csvs = []
+ # ignore command line duplicates
+ for gf in cmdline_gather_input:
+ if gf not in gather_csvs:
+ gather_csvs.append(gf)
+ else:
+ notify(f'ignoring duplicated reference to file: {gf}')
+ # ignore pathlist duplicates
+ if from_file:
+ more_files = load_pathlist_from_file(from_file)
+ for gf in more_files:
+ if gf not in gather_csvs:
+ gather_csvs.append(gf)
+ else:
+ notify(f'ignoring duplicated reference to file: {gf}')
+ return gather_csvs
+
+
+def load_gather_results(gather_csv, *, delimiter=',', essential_colnames=['query_name', 'name', 'f_unique_weighted', 'query_md5', 'query_filename'], seen_queries=set(), force=False):
+ "Load a single gather csv"
+ header = []
+ gather_results = []
+ gather_queries = set()
+ with open(gather_csv, 'rt') as fp:
+ r = csv.DictReader(fp, delimiter=delimiter)
+ header = r.fieldnames
+ # check for empty file
+ if not header:
+ raise ValueError(f'Cannot read gather results from {gather_csv}. Is file empty?')
+
+ #check for critical column names used by summarize_gather_at
+ if not set(essential_colnames).issubset(header):
+ raise ValueError(f'Not all required gather columns are present in {gather_csv}.')
+
+ for n, row in enumerate(r):
+ query_name = row['query_name']
+ # check if we've seen this query already in a different gather CSV
+ if query_name in seen_queries:
+ if query_name not in gather_queries: #seen already in this CSV? (only want to warn once per query per CSV)
+ notify(f"WARNING: Gather query {query_name} was already loaded from a separate gather CSV. Cannot load duplicate query from CSV {gather_csv}...")
+ if force:
+ if query_name not in gather_queries:
+ notify("--force is set, ignoring duplicate query.")
+ gather_queries.add(query_name)
+ continue
+ else:
+ raise ValueError(f"Gather query {query_name} was found in more than one CSV. Cannot load from {gather_csv}.")
+ else:
+ gather_results.append(row)
+ # add query name to the gather_queries from this CSV
+ if query_name not in gather_queries:
+ gather_queries.add(query_name)
+
+ if not gather_results:
+ raise ValueError(f'No gather results loaded from {gather_csv}.')
+ else:
+ notify(f'loaded {len(gather_results)} gather results.')
+ return gather_results, header, gather_queries
+
+
+def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxonomy=False, force=False):
+ '''
+ Load gather csvs, checking for empties and ids missing from taxonomic assignments.
+ '''
+ if not isinstance(gather_csvs, list):
+ gather_csvs = [gather_csvs]
+ gather_results = []
+ total_missed = 0
+ all_ident_missed = set()
+ seen_queries = set()
+ header = []
+ n_ignored = 0
+ for n, gather_csv in enumerate(gather_csvs):
+ these_results = []
+ try:
+ these_results, header, seen_queries = load_gather_results(gather_csv, seen_queries=seen_queries, force=force)
+ except ValueError as exc:
+ if force:
+ notify(str(exc))
+ notify('--force is set. Attempting to continue to next set of gather results.')
+ n_ignored+=1
+ continue
+ else:
+ notify('Exiting.')
+ raise
+
+ # check for match identites in these gather_results not found in lineage spreadsheets
+ n_missed, ident_missed = find_missing_identities(these_results, tax_assign)
+ if n_missed:
+ notify(f'The following are missing from the taxonomy information: {",".join(ident_missed)}')
+ if fail_on_missing_taxonomy:
+ raise ValueError('Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy.')
+
+ total_missed += n_missed
+ all_ident_missed.update(ident_missed)
+ # add these results to gather_results
+ gather_results += these_results
+
+ num_gather_csvs_loaded = n+1 - n_ignored
+ notify(f'loaded results from {str(num_gather_csvs_loaded)} gather CSVs')
+
+ return gather_results, all_ident_missed, total_missed, header
+
+
+def find_match_lineage(match_ident, tax_assign, *, skip_idents = [],
+ keep_full_identifiers=False,
+ keep_identifier_versions=False):
+ lineage=""
+ match_ident = get_ident(match_ident, keep_full_identifiers=keep_full_identifiers, keep_identifier_versions=keep_identifier_versions)
+ # if identity not in lineage database, and not --fail-on-missing-taxonomy, skip summarizing this match
+ if match_ident in skip_idents:
+ return lineage
+ try:
+ lineage = tax_assign[match_ident]
+ except KeyError:
+ raise ValueError(f"ident {match_ident} is not in the taxonomy database.")
+ return lineage
+
+
+def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
+ keep_full_identifiers=False,
+ keep_identifier_versions=False, best_only=False,
+ seen_perfect=set()):
+ """
+ Summarize gather results at specified taxonomic rank
+ """
+ sum_uniq_weighted = defaultdict(lambda: defaultdict(float))
+ for row in gather_results:
+ # get essential gather info
+ query_name = row['query_name']
+ query_md5 = row['query_md5']
+ query_filename = row['query_filename']
+ match_ident = row['name']
+ f_uniq_weighted = row['f_unique_weighted']
+ f_uniq_weighted = float(f_uniq_weighted)
+
+ # 100% match? are we looking at something in the database?
+ if f_uniq_weighted >= 1.0 and query_name not in seen_perfect: # only want to notify once, not for each rank
+ ident = get_ident(match_ident,
+ keep_full_identifiers=keep_full_identifiers,
+ keep_identifier_versions=keep_identifier_versions)
+ seen_perfect.add(query_name)
+ notify(f'WARNING: 100% match! Is query "{query_name}" identical to its database match, {ident}?')
+
+ # get lineage for match
+ lineage = find_match_lineage(match_ident, tax_assign,
+ skip_idents=skip_idents,
+ keep_full_identifiers=keep_full_identifiers,
+ keep_identifier_versions=keep_identifier_versions)
+ # ident was in skip_idents
+ if not lineage:
+ continue
+
+ # summarize at rank!
+ lineage = pop_to_rank(lineage, rank)
+ assert lineage[-1].rank == rank, lineage[-1]
+ # record info
+ sum_uniq_weighted[query_name][lineage] += f_uniq_weighted
+
+ # sort and store each as SummarizedGatherResult
+ sum_uniq_weighted_sorted = []
+ for query_name, lineage_weights in sum_uniq_weighted.items():
+ sumgather_items = list(lineage_weights.items())
+ sumgather_items.sort(key = lambda x: -x[1])
+ if best_only:
+ lineage, fraction = sumgather_items[0]
+ sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5, query_filename)
+ sum_uniq_weighted_sorted.append(sres)
+ else:
+ for lineage, fraction in sumgather_items:
+ sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5, query_filename)
+ sum_uniq_weighted_sorted.append(sres)
+
+ return sum_uniq_weighted_sorted, seen_perfect
+
+
+def find_missing_identities(gather_results, tax_assign):
+ """
+ Identify match ids/accessions from gather results
+ that are not present in taxonomic assignments.
+ """
+ n_missed = 0
+ ident_missed= set()
+ for row in gather_results:
+ match_ident = row['name']
+ match_ident = get_ident(match_ident)
+ if match_ident not in tax_assign:
+ n_missed += 1
+ ident_missed.add(match_ident)
+
+ notify(f'of {len(gather_results)}, missed {n_missed} lineage assignments.')
+ return n_missed, ident_missed
+
+
+# pass ranks; have ranks=[default_ranks]
+def make_krona_header(min_rank, *, include_strain=False):
+ "make header for krona output"
+ header = ["fraction"]
+ tl = list(taxlist(include_strain=include_strain))
+ try:
+ rank_index = tl.index(min_rank)
+ except ValueError:
+ raise ValueError(f"Rank {min_rank} not present in available ranks!")
+ return tuple(header + tl[:rank_index+1])
+
+
+def aggregate_by_lineage_at_rank(rank_results, *, by_query=False):
+ '''
+ Aggregate list of rank SummarizedGatherResults,
+ keeping query info or aggregating across queries.
+ '''
+ lineage_summary = defaultdict(float)
+ if by_query:
+ lineage_summary = defaultdict(dict)
+ all_queries = []
+ for res in rank_results:
+ if res.query_name not in all_queries:
+ all_queries.append(res.query_name)
+ if by_query:
+ lineage_summary[res.lineage][res.query_name] = res.fraction
+ else:
+ lineage_summary[res.lineage] += res.fraction
+ return lineage_summary, all_queries, len(all_queries)
+
+
+def format_for_krona(rank, summarized_gather):
+ '''
+ Aggregate list of SummarizedGatherResults and format for krona output
+ '''
+ num_queries=0
+ for res_rank, rank_results in summarized_gather.items():
+ if res_rank == rank:
+ lineage_summary, all_queries, num_queries = aggregate_by_lineage_at_rank(rank_results, by_query=False)
+ # if multiple_samples, divide fraction by the total number of query files
+ for lin, fraction in lineage_summary.items():
+ # divide total fraction by total number of queries
+ lineage_summary[lin] = fraction/num_queries
+
+ # sort by fraction
+ lin_items = list(lineage_summary.items())
+ lin_items.sort(key = lambda x: -x[1])
+
+ # reformat lineage for krona_results printing
+ krona_results = []
+ for lin, fraction in lin_items:
+ lin_list = display_lineage(lin).split(';')
+ krona_results.append((fraction, *lin_list))
+
+ return krona_results
+
+
+def write_krona(rank, krona_results, out_fp, *, sep='\t'):
+ 'write krona output'
+ header = make_krona_header(rank)
+ tsv_output = csv.writer(out_fp, delimiter='\t')
+ tsv_output.writerow(header)
+ for res in krona_results:
+ tsv_output.writerow(res)
+
+
+def write_summary(summarized_gather, csv_fp, *, sep=','):
+ '''
+ Write taxonomy-summarized gather results for each rank.
+ '''
+ header = SummarizedGatherResult._fields
+ w = csv.DictWriter(csv_fp, header, delimiter=sep)
+ w.writeheader()
+ for rank, rank_results in summarized_gather.items():
+ for res in rank_results:
+ rD = res._asdict()
+ rD['fraction'] = f'{res.fraction:.3f}'
+ rD['lineage'] = display_lineage(res.lineage)
+ w.writerow(rD)
+
+
+def write_classifications(classifications, csv_fp, *, sep=','):
+ '''
+ Write taxonomy-classifed gather results.
+ '''
+ header = ClassificationResult._fields
+ w = csv.DictWriter(csv_fp, header, delimiter=sep)
+ w.writeheader()
+ for rank, rank_results in classifications.items():
+ for res in rank_results:
+ rD = res._asdict()
+ rD['fraction'] = f'{res.fraction:.3f}'
+ rD['lineage'] = display_lineage(res.lineage)
+ w.writerow(rD)
+
+
+def combine_sumgather_csvs_by_lineage(gather_csvs, *, rank="species", accept_ranks = list(lca_utils.taxlist(include_strain=False)), force=False):
+ '''
+ Takes in one or more output csvs from `sourmash taxonomy summarize`
+ and combines the results into a nested dictionary with lineages
+ as the keys {lineage: {sample1: frac1, sample2: frac2}}.
+ Uses the file basename (minus .csv extension) as sample identifier.
+
+ usage:
+
+ linD, all_samples = combine_sumgather_by_lineage(["sample1.csv", "sample2.csv"], rank="genus")
+
+ output:
+
+ linD = {lin_a: {'sample1': 0.4, 'sample2': 0.17, 'sample3': 0.6}
+ lin_b: {'sample1': 0.0, 'sample2': 0.0, 'sample3': 0.1}
+ lin_c: {'sample1': 0.3, 'sample2': 0.4, 'sample3': 0.2} }
+
+ all_samples = ['sample1','sample2','sample3']
+
+ '''
+ if rank not in accept_ranks:
+ raise ValueError(f"Rank {rank} not available.")
+
+ sgD = defaultdict(dict)
+ all_samples = []
+ for g_csv in gather_csvs:
+ # collect lineage info for this sample
+ with open(g_csv, 'r') as fp:
+ r = csv.DictReader(fp)
+ for row in r:
+ if row["rank"] == rank:
+ query_name = row["query_name"]
+ lin = row["lineage"]
+ frac = row["fraction"]
+ if query_name not in all_samples:
+ all_samples.append(query_name)
+ sgD[lin][query_name] = frac
+ fp.close()
+ return sgD, all_samples
+
+
+def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, format_lineage=False, sep='\t'):
+ '''
+ takes in a lineage dictionary with sample counts (output of combine_sumgather_by_lineage)
+ and produces a tab-separated file with fractions for each sample.
+
+ input: {lin_a: {sample1: 0.4, sample2: 0.17, sample3: 0.6}
+ lin_b: {sample1: 0.0, sample2: 0.0, sample3: 0.1}
+ lin_c: {sample1: 0.3, sample2: 0.4, sample3: 0.2}}
+
+ output:
+
+ lineage sample1 sample2 sample3
+ lin_a 0.4 0.17 0.6
+ lin_b 0.0 0.0 0.1
+ lin_c 0.3 0.4 0.2
+ '''
+
+ header = ["lineage"] + sample_names
+ w = csv.DictWriter(out_fp, header, delimiter=sep)
+ w.writeheader()
+ blank_row = {query_name: 0 for query_name in sample_names}
+ for lin, sampleinfo in sorted(lineage_dict.items()):
+ if format_lineage:
+ lin = display_lineage(lin)
+ #add lineage and 0 placeholders
+ row = {'lineage': lin}
+ row.update(blank_row)
+ # add info for query_names that exist for this lineage
+ row.update(sampleinfo)
+ # write row
+ w.writerow(row)
+
+
+class LineageDB(abc.Mapping):
+ "Base LineageDB class built around an assignments dictionary."
+ def __init__(self, assign_d, avail_ranks):
+ self.assignments = assign_d
+ self.available_ranks = set(avail_ranks)
+
+ def __getitem__(self, ident):
+ "Retrieve the lineage tuple for identifer (or raise KeyError)"
+ return self.assignments[ident]
+
+ def __iter__(self):
+ "Return all identifiers for this db."
+ return iter(self.assignments)
+
+ def __len__(self):
+ "Return number of lineages"
+ return len(self.assignments)
+
+ def __bool__(self):
+ "Are there any lineages at all in this database?"
+ return bool(self.assignments)
+
+ @classmethod
+ def load(cls, filename, *, delimiter=',', force=False,
+ keep_full_identifiers=False, keep_identifier_versions=True):
+ """
+ Load a taxonomy assignment CSV file into a LineageDB.
+
+ 'keep_full_identifiers=False' will split identifiers from strings
+ using whitespace, e.g. 'IDENT other name stuff' => 'IDENT'
+
+ 'keep_identifier_versions=False' will remove trailing versions,
+ e.g. 'IDENT.1' => 'IDENT'.
+ """
+ include_strain=False
+ if not keep_identifier_versions and keep_full_identifiers:
+ raise ValueError("keep_identifer_versions=False doesn't make sense with keep_full_identifiers=True")
+
+ if not os.path.exists(filename):
+ raise ValueError(f"'{filename}' does not exist")
+
+ if os.path.isdir(filename):
+ raise ValueError(f"'{filename}' is a directory")
+
+ with open(filename, newline='') as fp:
+ r = csv.DictReader(fp, delimiter=delimiter)
+ header = r.fieldnames
+ if not header:
+ raise ValueError(f'cannot read taxonomy assignments from {filename}')
+
+ identifier = "ident"
+ # check for ident/identifier, handle some common alternatives
+ if "ident" not in header:
+ # check for ident/identifier, handle some common alternatives
+ if 'identifiers' in header:
+ identifier = 'identifiers'
+ header = ["ident" if "identifiers" == x else x for x in header]
+ elif 'accession' in header:
+ identifier = 'accession'
+ header = ["ident" if "accession" == x else x for x in header]
+ else:
+ raise ValueError('No taxonomic identifiers found.')
+ # is "strain" an available rank?
+ if "strain" in header:
+ include_strain=True
+
+ # check that all ranks are in header
+ ranks = list(lca_utils.taxlist(include_strain=include_strain))
+ if not set(ranks).issubset(header):
+ # for now, just raise err if not all ranks are present.
+ # in future, we can define `ranks` differently if desired
+ # return them from this function so we can check the `available` ranks
+ raise ValueError('Not all taxonomy ranks present')
+
+ assignments = {}
+ num_rows = 0
+ n_species = 0
+ n_strains = 0
+
+ # now parse and load lineages
+ for n, row in enumerate(r):
+ if row:
+ num_rows += 1
+ lineage = []
+ # read row into a lineage pair
+ for rank in lca_utils.taxlist(include_strain=include_strain):
+ lin = row[rank]
+ lineage.append(LineagePair(rank, lin))
+ ident = row[identifier]
+
+ # fold, spindle, and mutilate ident?
+ if not keep_full_identifiers:
+ ident = ident.split(' ')[0]
+
+ if not keep_identifier_versions:
+ ident = ident.split('.')[0]
+
+ # clean lineage of null names, replace with 'unassigned'
+ lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ]
+ lineage = [ LineagePair(a, b) for (a, b) in lineage ]
+
+ # remove end nulls
+ while lineage and lineage[-1].name == 'unassigned':
+ lineage = lineage[:-1]
+
+ # store lineage tuple
+ if lineage:
+ # check duplicates
+ if ident in assignments:
+ if assignments[ident] != tuple(lineage):
+ if not force:
+ raise ValueError(f"multiple lineages for identifier {ident}")
+ else:
+ assignments[ident] = tuple(lineage)
+
+ if lineage[-1].rank == 'species':
+ n_species += 1
+ elif lineage[-1].rank == 'strain':
+ n_species += 1
+ n_strains += 1
+
+ return LineageDB(assignments, ranks)
+
+
+class LineageDB_Sqlite(abc.Mapping):
+ """
+ A LineageDB based on a sqlite3 database with a 'taxonomy' table.
+ """
+ # NOTE: 'order' is a reserved name in sql, so we have to use 'order_'.
+ columns = ('superkingdom', 'phylum', 'order_', 'class', 'family',
+ 'genus', 'species', 'strain')
+
+ def __init__(self, conn):
+ self.conn = conn
+
+ # check: can we do a 'select' on the right table?
+ self.__len__()
+ c = conn.cursor()
+
+ # get available ranks...
+ ranks = set()
+ for column, rank in zip(self.columns, taxlist(include_strain=True)):
+ query = f'SELECT COUNT({column}) FROM taxonomy WHERE {column} IS NOT NULL AND {column} != ""'
+ c.execute(query)
+ cnt, = c.fetchone()
+ if cnt:
+ ranks.add(rank)
+
+ self.available_ranks = ranks
+ self.cursor = c
+
+ @classmethod
+ def load(cls, location):
+ "load taxonomy information from a sqlite3 database"
+ import sqlite3
+ try:
+ conn = sqlite3.connect(location)
+ db = cls(conn)
+ except sqlite3.DatabaseError:
+ raise ValueError("not a sqlite database")
+ return db
+
+ def _make_tup(self, row):
+ "build a tuple of LineagePairs for this sqlite row"
+ tup = [ LineagePair(n, r) for (n, r) in zip(taxlist(True), row) ]
+ return tuple(tup)
+
+ def __getitem__(self, ident):
+ "Retrieve lineage for identifer"
+ c = self.cursor
+ c.execute('SELECT superkingdom, phylum, class, order_, family, genus, species, strain FROM taxonomy WHERE ident=?', (ident,))
+
+ # retrieve names list...
+ names = c.fetchone()
+ if names:
+ # ...and construct lineage tuple
+ tup = self._make_tup(names)
+ while tup and not tup[-1].name:
+ tup = tup[:-1]
+
+ return tup
+
+ raise KeyError(ident)
+
+ def __bool__(self):
+ "Do we have any info?"
+ return bool(len(self))
+
+ def __len__(self):
+ "Return number of rows"
+ c = self.conn.cursor()
+ c.execute('SELECT COUNT(DISTINCT ident) FROM taxonomy')
+ nrows, = c.fetchone()
+ return nrows
+
+ def __iter__(self):
+ "Return all identifiers"
+ # create new cursor so as to allow other operations
+ c = self.conn.cursor()
+ c.execute('SELECT DISTINCT ident FROM taxonomy')
+
+ for ident, in c:
+ yield ident
+
+ def items(self):
+ "return all items in the sqlite database"
+ c = self.conn.cursor()
+
+ c.execute('SELECT DISTINCT ident, superkingdom, phylum, class, order_, family, genus, species, strain FROM taxonomy')
+
+ for ident, *names in c:
+ yield ident, self._make_tup(names)
+
+class MultiLineageDB(abc.Mapping):
+ "A wrapper for (dynamically) combining multiple lineage databases."
+
+ # NTP: currently, later lineage databases will override earlier ones.
+ # Do we want to report/summarize shadowed identifiers?
+
+ def __init__(self):
+ self.lineage_dbs = []
+
+ @property
+ def available_ranks(self):
+ "build the union of available ranks across all databases"
+ # CTB: do we need to worry about lineages of shadowed identifiers?
+ x = set()
+ for db in self.lineage_dbs:
+ x.update(db.available_ranks)
+ return x
+
+ def add(self, db):
+ "Add a new lineage database"
+ self.lineage_dbs.insert(0, db)
+
+ def __iter__(self):
+ "Return all identifiers (once)"
+ seen = set()
+ for db in self.lineage_dbs:
+ for k in db:
+ if k not in seen:
+ seen.add(k)
+ yield k
+
+ def items(self):
+ "Return all (identifiers, lineage_tup), masking duplicate idents"
+ seen = set()
+ for db in self.lineage_dbs:
+ for k, v in db.items():
+ if k not in seen:
+ seen.add(k)
+ yield k, v
+
+ def shadowed_identifiers(self):
+ seen = set()
+ dups = set()
+ for db in self.lineage_dbs:
+ for k, v in db.items():
+ if k in seen:
+ dups.add(k)
+ else:
+ seen.add(k)
+ return seen
+
+ def __getitem__(self, ident):
+ "Return lineage tuple for first match to identifier."
+ for db in self.lineage_dbs:
+ if ident in db:
+ return db[ident]
+
+ # not found? KeyError!
+ raise KeyError(ident)
+
+ def __len__(self):
+ "Return number of distinct identifiers. Currently iterates over all."
+ # CTB: maybe we can make this unnecessary?
+ x = set(self)
+ return len(x)
+
+ def __bool__(self):
+ "True if any contained database has content."
+ return any( bool(db) for db in self.lineage_dbs )
+
+ def save(self, filename_or_fp, file_format):
+ assert file_format in ('sql', 'csv')
+
+ is_filename = False
+ try:
+ filename_or_fp.write
+ except AttributeError:
+ is_filename = True
+
+ if file_format == 'sql':
+ if not is_filename:
+ raise ValueError("file format '{file_format}' requires a filename, not a file handle")
+ self._save_sqlite(filename_or_fp)
+ elif file_format == 'csv':
+ # we need a file handle; open file.
+ fp = filename_or_fp
+ if is_filename:
+ fp = open(filename_or_fp, 'w', newline="")
+
+ try:
+ self._save_csv(fp)
+ finally:
+ # close the file we opened!
+ if is_filename:
+ fp.close()
+
+ def _save_sqlite(self, filename):
+ import sqlite3
+ db = sqlite3.connect(filename)
+
+ cursor = db.cursor()
+ try:
+ cursor.execute("""
+
+ CREATE TABLE taxonomy (
+ ident TEXT NOT NULL,
+ superkingdom TEXT,
+ phylum TEXT,
+ class TEXT,
+ order_ TEXT,
+ family TEXT,
+ genus TEXT,
+ species TEXT,
+ strain TEXT
+ )
+ """)
+ did_create = True
+ except sqlite3.OperationalError:
+ # already exists?
+ raise ValueError(f"taxonomy table already exists in '{filename}'")
+
+ # follow up and create index
+ cursor.execute("CREATE UNIQUE INDEX taxonomy_ident ON taxonomy(ident);")
+ for ident, tax in self.items():
+ x = [ident, *[ t.name for t in tax ]]
+
+ if tax[-1].rank != 'strain':
+ assert len(x) == 8, len(x)
+ x.append('') # append empty strain value
+ cursor.execute('INSERT INTO taxonomy (ident, superkingdom, phylum, class, order_, family, genus, species, strain) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', x)
+
+ db.commit()
+
+ def _save_csv(self, fp):
+ headers = ['identifiers'] + list(taxlist(include_strain=True))
+ w = csv.DictWriter(fp, fieldnames=headers)
+ w.writeheader()
+
+ for n, (ident, tax) in enumerate(self.items()):
+ row = {}
+ row['identifiers'] = ident
+
+ # convert tax LineagePairs into dictionary
+ for t in tax:
+ row[t.rank] = t.name
+
+ # add strain if needed
+ if 'strain' not in row:
+ row['strain'] = ''
+
+ w.writerow(row)
+
+ @classmethod
+ def load(cls, locations, **kwargs):
+ "Load one or more taxonomies from the given location(s)"
+ if isinstance(locations, str):
+ raise TypeError("'locations' should be a list, not a string")
+
+ tax_assign = cls()
+ for location in locations:
+ # try faster formats first
+ loaded = False
+
+ # sqlite db?
+ try:
+ this_tax_assign = LineageDB_Sqlite.load(location)
+ loaded = True
+ except ValueError:
+ pass
+
+ # CSV file?
+ if not loaded:
+ try:
+ this_tax_assign = LineageDB.load(location, **kwargs)
+ loaded = True
+ except ValueError as exc:
+ # for the last loader, just pass along ValueError...
+ raise ValueError(f"cannot read taxonomy assignments from '{location}': {str(exc)}")
+
+ # nothing loaded, goodbye!
+ if not loaded:
+ raise ValueError(f"cannot read taxonomy assignments from '{location}'")
+
+ tax_assign.add(this_tax_assign)
+
+ return tax_assign
diff --git a/tests/conftest.py b/tests/conftest.py
index 31ecc336a1..a592a1c114 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -35,6 +35,16 @@ def hp(request):
return request.param
+@pytest.fixture(params=[True, False])
+def keep_identifiers(request):
+ return request.param
+
+
+@pytest.fixture(params=[True, False])
+def keep_versions(request):
+ return request.param
+
+
@pytest.fixture(params=[2, 5, 10])
def n_children(request):
return request.param
@@ -49,6 +59,10 @@ def linear_gather(request):
def prefetch_gather(request):
return request.param
+@pytest.fixture(params=[True, False])
+def use_manifest(request):
+ return request.param
+
# --- BEGIN - Only run tests using a particular fixture --- #
# Cribbed from: http://pythontesting.net/framework/pytest/pytest-run-tests-using-particular-fixture/
diff --git a/tests/test-data/duplicate-sigs/README.md b/tests/test-data/duplicate-sigs/README.md
new file mode 100644
index 0000000000..69453567a4
--- /dev/null
+++ b/tests/test-data/duplicate-sigs/README.md
@@ -0,0 +1,2 @@
+This directory contains multiple signatures with different metadata but the same
+contents (and md5sum).
diff --git a/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=0.63.sig b/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=0.63.sig
new file mode 100644
index 0000000000..3fa1580214
--- /dev/null
+++ b/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=0.63.sig
@@ -0,0 +1 @@
+[{"class":"sourmash_signature","email":"","hash_function":"0.murmur64","filename":"/dev/fd/63","name":"GCA_009816935.1 Francisella tularensis strain=06-2412, ASM981693v1","license":"CC0","signatures":[{"num":0,"ksize":31,"seed":42,"max_hash":18446744073709552,"mins":[27281317737506,30774174543091,34134260966885,40422536005915,46092979748140,52667380017781,77640546109120,89191245175479,97341175558837,103303605162730,108020839024807,135414989701161,137460624135325,143826654727361,153684135314332,159757157353946,180095348607580,182873709030656,208309245438225,214041805294629,223646403684661,231557613626707,232715412458897,233924901580683,242631428039918,242916443054461,244997637525660,250821105337195,263769223490888,289612885205952,290507901010552,310144178150855,324002037533321,327741982499086,370660193353938,374827227987284,385925999728554,391908630864897,396271485002374,425780426886080,446363367677659,458676097367934,465761497060251,474500797064463,487890967126457,501285366842536,509029972291483,518744095767761,553284673753979,557420110856932,561588147238471,564955493511165,575940509276718,605337498101285,606548977697285,618213673953712,643115681109447,643216012278133,644963356377577,653829434142317,670360373084070,672527530119591,675150655252443,684524270185147,719626188880764,720045643265551,725967839675339,734392235451397,757995085421554,758873739679041,762435098460972,776635435277572,779263005191648,796121174431627,801537843188472,806791101979164,808323945009162,831597602839947,837342876049719,841644000186381,852358311452961,856044596532065,863211350823511,866354466734001,877528246849899,882816662973420,886831123416057,887324029071980,896684190269245,910013730955328,911021453771825,918053824067528,959404715354517,965749800410433,987156720845891,988004107440612,1003560293105019,1019132469630602,1031742940239750,1045004389970757,1047834223177133,1050009621695541,1067005090470023,1075835347418477,1106009829832368,1110321639602504,1129029490927944,1152325475808554,1155792411577727,1173501625134773,1179142432651244,1182043494444045,1185298213665043,1189817092658474,1198647977438533,1202832335428232,1213527880288483,1224149087020995,1231538243085066,1237866648067421,1247204248956355,1273002845219134,1311344518744158,1325762747773666,1327583535977585,1337025108346963,1337939134216356,1343194778880486,1347178976724408,1347965204255319,1360726528548292,1375648837386937,1378537007249094,1380037737023614,1396008873056192,1423251048049841,1425487297427108,1430011450080971,1436389861421409,1447828603923579,1448154600038663,1461606398479896,1477748591186096,1481500503955992,1489343676336319,1491409298456108,1495478198602408,1505324675783522,1506871645688047,1510642595999290,1523787516591314,1538544686450629,1551332608914346,1562178949777326,1582701020555290,1582714715562808,1594428143243217,1600950609397319,1601869664618303,1604847998387314,1605137573067942,1609948614207926,1612555898829971,1617568740441930,1630450719749736,1640503113008175,1658047227795127,1662918431912958,1674686450315888,1683642878379715,1685434818592932,1696496306605751,1699915950805608,1705874533889722,1719615875783650,1720419457177651,1728660623265899,1730106574233740,1732265511056345,1755484768216714,1757979695302374,1761318932844395,1767169583106630,1767600206417692,1768553578088471,1799333654355911,1809290380380199,1812674951838073,1816281246010249,1828421791729927,1832915856042159,1845631702610353,1850188493318868,1868069593147085,1884958304606475,1888837445051056,1891025754082888,1899332233791441,1911726518903029,1918452760708060,1952919386872879,1955072741257444,1970574995631202,1971148594271014,1980609364625158,1984116510933050,1993245451986067,1994334947674791,1996473350761932,2011995194361170,2016402104104536,2038605101930749,2041599619079412,2055486908646634,2068184964131101,2073480224427132,2081468075605553,2083859149031988,2088845573181737,2091868834590483,2095410096745181,2105648145116436,2113691764892897,2129201860410658,2130929519981988,2131490169201290,2170261983009798,2171035356430939,2177414857180262,2178114239308922,2179241686148154,2181354598823504,2184056585017162,2197366871875060,2204375393501991,2214523537881050,2221431264383483,2237381498315179,2281430614061135,2285676462393652,2289064303021873,2293074580236368,2295815116249395,2301838542881084,2310754905687291,2316428340566049,2317685491567407,2333959693552804,2342074675676015,2349601653567167,2359342178128271,2406274493306297,2409062953251471,2423082222606459,2444813151989679,2449631428878268,2467788917998802,2477025902927540,2483114115832913,2495644065213346,2499969884139421,2516439493941522,2517619989855248,2531737524981284,2532968001402240,2535608410342101,2541995133136461,2548656366696057,2566020346579398,2580305967686391,2594650471395280,2602545163684199,2613012954935120,2613606010573059,2637712179671244,2642357132272560,2642478099324048,2644197809403928,2659363560806872,2662304715625150,2698172378241685,2728578112915602,2730219461233997,2735122180150483,2735758529835485,2773281588385404,2789219916032586,2789850822350080,2810625093279126,2813140178921049,2849540048794850,2860369179928472,2871754307039262,2871907171556769,2880691563686929,2888613215455329,2903093947499687,2921332438238831,2927325011079857,2970698796413300,2985341598759171,2994088683976615,2995876644715330,2997465365202042,3003397844817766,3014591226675854,3015280923174733,3032228468334705,3034882558366266,3038102613400782,3040293237673794,3041384201679032,3065335255532440,3084787857619832,3086181234740975,3086344760356109,3088760991663376,3113879541153339,3117502144995062,3118334386799456,3118353038001378,3124967229669132,3141111622118606,3152319552764911,3157397983327458,3165214969768489,3173160905431776,3174196946497848,3176252213532449,3189600474341459,3200527298983599,3228756288805719,3233648223147430,3234704310551101,3244303587768447,3245279954664994,3253798917461757,3256199541149175,3270652593815318,3289782382963436,3291924461643881,3321936120954224,3350538525764676,3359013043704164,3367516662066673,3375956437105495,3381047593455589,3395194612319712,3395515328122470,3401980298093332,3417672631802041,3443496823444718,3473853502109201,3476156978273954,3492929176926081,3495505110142109,3503331458443572,3519095536458611,3535510821231923,3543906742502112,3547039933171809,3554769633644327,3560048373591648,3560832183669588,3561168542060264,3571381254876092,3582372240211864,3588526584026912,3590322008363596,3595234111026435,3598612108866317,3610671769485981,3621249240349460,3625292056724077,3627537868670400,3634725715407039,3654725299461030,3661758740289748,3677898597207892,3699121516322788,3702325285097834,3706591318445986,3711739163437948,3719471889280203,3725602040928106,3730648684404687,3738937658728956,3739565211999619,3748208043454616,3749224223393980,3764528432190170,3768018390267232,3792801693012879,3817603306259244,3822152193894646,3824688781764074,3828070741647380,3832361520491395,3834409435587390,3841574835390887,3845376089274772,3867101884678333,3907465181171189,3911757008173649,3936203721973325,3943792168144251,3944618352654874,3954974756216366,3957553012755889,3962797547908719,3967295346819374,3973009233407317,3977667913129268,3989206138624726,3989835479325906,4000191104196681,4016502960230774,4023968417116128,4033160238001050,4039430145337632,4054926036842145,4081890179374943,4083332846839379,4089179632516350,4095359176823975,4096638251379460,4097570444659615,4104945618865731,4106492050918606,4121210480013637,4128300247378694,4139521044238430,4156273280061428,4159321375157642,4161219557528029,4185314023925957,4188885093299245,4213378476941623,4215094248447875,4225599528550081,4246087184970239,4250723839958798,4253813544227807,4257725579070903,4274410444769581,4297865110420934,4314262721326477,4318921209326641,4326796571089165,4358157813774093,4363051662578414,4379054359357346,4387229825472605,4407121475128099,4418586927764827,4444497211702225,4448386354535670,4464672439725319,4479051595073164,4483602152851587,4487416329648218,4491213325063669,4493047653554173,4497108801816010,4503560524390685,4505039949877502,4514805314363434,4536842772674504,4556510901039833,4575403317134912,4592200188563858,4601869033857310,4610031751653292,4618259959292216,4627536713075473,4628641892367445,4633519059669963,4657815314704334,4662180672884852,4665919527337388,4687322226819303,4691844768752312,4707135680049989,4712525914876856,4720547794135633,4740606227775619,4746680609181901,4779520324623562,4791100834322356,4806377157351085,4808372750546970,4826808701416396,4832925031047563,4843277060090928,4862243495142622,4868706396089897,4870247565916556,4876212301431017,4896518867008986,4896692681487429,4909543183678547,4937682630931618,4938956956887018,4966196725954876,4969062413514851,4973600900081374,4999513822868305,5002893845020141,5008622662773184,5014970956568907,5035054008602857,5035384841398862,5039086438104691,5040652225496901,5046016737108994,5059014222849688,5066823009561671,5072430374944592,5126548277288607,5130609108534278,5141019017613116,5145220929445971,5157914165172243,5164042673067139,5164323192060956,5227087582431333,5239058059519899,5241552288110762,5242566539224285,5260929013588652,5264673038204573,5264846847738316,5291104629254226,5305867165279633,5309458727013297,5322844014918400,5323108914962638,5329144744675176,5363349599556055,5366817078493635,5369180741303960,5372222568751999,5372725457533622,5376760723403211,5389389943488643,5398148075045622,5407440001247943,5407552681509066,5425215886017472,5433029476329892,5435967788585089,5453070200302066,5505363397956209,5506656183795141,5520041430372158,5528030877128164,5529328537858607,5538301071879598,5573247074136840,5601063311481394,5612211670828121,5626811993541640,5671110634252314,5671512692298857,5676368648166471,5679704419692665,5697117527796873,5710885073252421,5722042629401126,5725434481579237,5727852290091844,5746609611777297,5759784984111085,5766468332812783,5767658702736892,5769340991360522,5817916655292234,5823448049010532,5831664213962049,5839479091276720,5847620778187862,5848083119581748,5875458157591724,5880371325296374,5887738774301190,5889494649222929,5895198579157112,5901426220955008,5923404444685793,5930424349546694,5954090060404027,5965424605208959,5976106562523974,5976468751599547,5985405131720490,6011246049659946,6019388992671725,6022142115842949,6022687599667645,6026176200521087,6026449947119821,6026852008290352,6028359150627379,6037198658977053,6049559969468114,6071357946907426,6074536323309542,6075861603095684,6086651545690621,6088147986793911,6091451791350760,6097972992225898,6102520222366377,6106185662845781,6150209857537398,6151086995205809,6179061741208419,6186814642225785,6207677944023553,6214079985740421,6222242045894974,6259769482143501,6292912013848109,6335759087357123,6338313411351946,6364174128810209,6376235120055415,6379797414315714,6392936323112603,6393281611893662,6425613172818892,6427037182426097,6435673284042797,6440155758298635,6470065458738460,6473238460278105,6498233125545854,6512447296999122,6519926591968090,6537194036770963,6539403501517441,6546843392577889,6549806578906724,6552086386539938,6562753360182634,6580454437794215,6583294028078651,6599805048819771,6611303480414165,6611480286587626,6613303542954184,6619465618743996,6620952231633165,6628364051881565,6637782452830313,6639366508325377,6643384411215171,6651291777536757,6659026904794676,6660386432638050,6706708532025137,6712233176289295,6715034077809162,6718106064823151,6718466194045753,6737012223038140,6737448552020375,6749465141048682,6755892952873165,6759712360779738,6766665767003108,6769349598161957,6779991167037331,6805981002654460,6822678972563408,6824212178067303,6825656208355223,6828108881031930,6838666110003247,6843872818542831,6858200159969968,6862403913830441,6896605336469152,6907696209350866,6940430139137108,6947378732749248,6953405360148223,6965934499254406,6978237167340371,6979116642430112,7000204500640703,7013379106513098,7015548089157306,7017340939890130,7022751771973604,7032077373720710,7057472610075201,7069751745272274,7071427481472917,7076942582186008,7083956725625194,7088117728748372,7110694400947463,7119947527921472,7162448687990059,7164239598107818,7167755656765912,7174842826868336,7175438170580579,7194659618833292,7199131481548540,7201097914134797,7249458427670294,7258600960921299,7260262458870231,7272892562787288,7291277476900411,7298751506522380,7320645384455102,7347510371391158,7349195911222994,7354593750963108,7379412287438610,7396028447991993,7407333626649072,7417992548609519,7436769564720525,7455509939494098,7466612719167435,7473444312733619,7473921057215867,7481010725288922,7494555361323136,7511820031110502,7539834933699319,7541028416083622,7548889429298262,7550654989959541,7567961569136947,7568482345047421,7581552356535593,7583581654211041,7587845721090877,7628037567441185,7633586298526539,7639776715226470,7676516649995125,7678642663878476,7694954607256167,7696205219169608,7701103797021301,7717485469267928,7734382322255612,7742828147953131,7754433671355044,7755280476043760,7769918479127653,7792923160082860,7808336679035965,7851406039611761,7854329110050349,7864747279450982,7874975130558924,7878651583399616,7905885790085843,7907390132227916,7910193473416632,7914616292135963,7936567617966657,7942757952243526,7948051059967055,7955099191740661,7958261337857520,7959981873817532,7961073053238888,7969625643027082,7971181815184894,7974554831275928,7974726923655088,7979097331324691,8004922364117543,8007706307006295,8012303905212338,8013295641540623,8066164953236699,8084246722538173,8100240629932439,8100951490474943,8106199043893306,8107552539527840,8114588959043938,8132109950950425,8139068209349164,8139545132154805,8142237913270447,8158329919102245,8167114814444647,8173611854335395,8189714298100356,8195428097158099,8201564519665139,8220965830202883,8235606574847074,8246150433354016,8270274863750807,8284482132698750,8285985183630643,8292533320746682,8295908119012735,8296380863267928,8323255543543643,8348711822077161,8358705549655169,8365548267150232,8374906480797651,8375721515813351,8379632648873725,8385947080398988,8391743136854975,8418406057930748,8428572383618079,8432685689145543,8433517681349491,8438148178407681,8444919949804523,8446420140769624,8455107906508370,8475628073984291,8508022924218892,8510397511704993,8514237274685749,8514396962844012,8531363280764652,8537391416444784,8549235306484624,8580449592169756,8598393465824567,8604034180239741,8611570380523396,8612678693937747,8620109638775764,8642635839487892,8646344667334136,8646717894657892,8654788340452875,8658449886140254,8663165400137631,8722442451761544,8761729420462614,8799647367219413,8801831997666499,8808453998469722,8818307640518629,8818834451063303,8823766790135813,8833290176775915,8841993926981016,8854459467830464,8861087128751225,8862728500798303,8880985466378305,8881613336282482,8925251195661314,8953704880557779,8985742174669720,8991621885018154,8992026721663720,8997168720804990,9016856686855149,9047424902876796,9050560415698716,9050753309887512,9061429496148816,9069273612029890,9080265743048056,9086071373806583,9095588303764494,9096970962257714,9100770128872926,9101701347271054,9112256340668504,9116617466944120,9136292702218238,9137011986680851,9143781826339633,9153600809394255,9199889121013565,9201721144829026,9209023421216735,9210503570621360,9214460083393763,9221754104943871,9224413178285270,9234547745090520,9243132989358081,9248116050684550,9264703712467273,9280659174889120,9287946744159071,9292140972727810,9327466025742305,9332051044758209,9363145119903540,9364400993642126,9391130196045242,9395408164721980,9399740447176971,9424276250634510,9452221926640102,9456478187020366,9464543357189079,9465600149658791,9491049221619505,9497191629902225,9522170868080528,9533727201306437,9547661663619976,9549245598585404,9569979472692505,9573473519866808,9591009354744385,9617917621932747,9648236513062897,9658329412127906,9659598422876240,9682812252677365,9686371108983926,9728579985303380,9729699868914242,9740105842067989,9745445361656865,9745681179963804,9754643298197035,9760567102397958,9771710857253491,9789917483611116,9793359197214519,9793503159465365,9794390650710491,9798125839578328,9804745463385914,9808752020775346,9833005702827487,9839044066017243,9845432123717545,9885127473462061,9894715619458167,9905070536524380,9908520702066001,9910780473079700,9917733811149031,9923651765289083,9927388605300676,9938702047866083,9944740018953312,9960830080447414,9961286044348072,9976639243429026,9991256507745359,10022572634244940,10025580401289134,10026906747582208,10027635205445506,10038347159492151,10039604072460842,10043026849402656,10047276489450169,10053877592451757,10063475808610984,10076240276956154,10101156164378146,10106363481576762,10127734783681212,10136370121235304,10146252926027242,10157507993657925,10170937864363577,10176151554703241,10183534012238724,10192998900924592,10197340560318368,10237743941972439,10254073288131603,10256735847958754,10280479610352360,10285508802345218,10291262449582857,10295721152014702,10310390505199421,10313473991212838,10342547269042481,10412937296046572,10455518715262653,10462990290852442,10464315303440262,10471197443105163,10471235757467544,10490073561279689,10492924328467233,10507681094428554,10516717765537166,10540330407067241,10541927502626015,10556872609087686,10565575287036146,10567098846120860,10569991378070155,10592858161029490,10605555894932784,10617796130250535,10635845120674097,10650787780040606,10652474248971788,10675091890076247,10677639062140656,10686876585139260,10687769709926863,10695637927115727,10696366138304664,10729506666892986,10739953219495080,10741375692368836,10772979860590118,10787932183480326,10797834288911940,10797954026094675,10799912491676373,10801123407332591,10809957352958534,10811859724613778,10814855039053275,10823710184242031,10835831635651692,10838780402343038,10839952733006938,10844349943321077,10845732416899091,10850918643526601,10860256158679413,10863856633990962,10869462262089206,10888194791081552,10892912626153721,10898443770405678,10910130929886125,10939089240676095,10957695103575326,10960164352659702,10961938624164593,10962367887299917,10963209129287673,10972178204371166,10980943184449618,11004523743982583,11021436183624407,11027930223905649,11051706685260634,11068765668137027,11117926166287355,11130512060712491,11137064269321064,11137491086953899,11142860693680581,11161848365790340,11164864927470696,11166391153310384,11170503851782616,11174109853098886,11179331841591506,11183033087165489,11194665067070355,11213473653620578,11238383978151703,11241048792018996,11254802265046802,11264184666096697,11268594471164860,11273858751227255,11285014483956224,11306227409946359,11316029880189806,11317018836436108,11326888850753433,11332371567827730,11344860342601282,11363041022575148,11364562992186694,11366255071713968,11378751439993468,11388719560536041,11408349988397772,11415332673174238,11418046330072121,11435535718923282,11449275714355046,11460074843266170,11483399532377237,11495688976808914,11507029069035406,11509653702854252,11515420136228054,11516873482469677,11520503668749817,11525515242618518,11557753508725419,11561604977704834,11569292413360517,11589295715309246,11631902140279157,11654556201194743,11656899418054614,11657294505407742,11661070133557696,11668795792548531,11688426128601404,11692173069223099,11692638821382112,11704142988138877,11730275878132669,11747860305548587,11768902473065614,11809606231785533,11814942688163748,11830275507824233,11832881846320770,11839076547825639,11842516303617037,11848809254066440,11849604090148961,11861036171256724,11862479669320152,11902273608768276,11909927135034753,11915914136058870,11917366131988635,11918146339628393,11952186029640314,11965762970341077,11968852102140818,11988163409658002,11991640765060726,12003946399173925,12015371223569459,12015699073484173,12026238097639303,12027543541848783,12029927405469169,12030852032077865,12032177915239463,12033394505469741,12041449133719384,12065689743810267,12075464021106187,12097367992437353,12102042122098365,12106676312986046,12119543078536750,12137234780206015,12152142412839865,12157879639736773,12158276228389397,12162749550023120,12165771397045986,12167381433344274,12181596294329824,12203742166109026,12209847266646966,12224653876328040,12229733059110899,12236479214073070,12240425549065773,12252089558841057,12252834455254861,12252921401858245,12263083529856368,12280117660905010,12282073505316488,12291733945277146,12295652397734313,12308950434416884,12315245431394520,12325148768094462,12353815014171672,12409472162226791,12413108559037636,12428072482683163,12435514170613803,12457386755420991,12515317411195819,12523706347945485,12523915038722689,12559341317878562,12582135565969205,12587936565284399,12592521878562533,12606988144137942,12615583150436888,12648682904569149,12656884893522301,12659016821769380,12672378100177709,12672827694409958,12678220421944320,12715313343566187,12719294959376032,12719335619027235,12720282275955265,12735108166808283,12736098435499462,12761734704097496,12765606334499184,12768487626988503,12787159319028422,12795719813049885,12796004399647948,12799644350597798,12825977729217187,12830098688357983,12830625689728250,12834785022889568,12844934606745507,12864892528356214,12879900683153406,12886172805466435,12888336247314715,12900495987025021,12904609340082772,12910186793479215,12913485144602650,12940792596482062,12951190294698176,12959920644213803,12987666390036602,12989467441559491,13017251126115385,13019641576048600,13025171186373006,13037172756924801,13041090869246053,13086846890781615,13102950879868503,13106344354361040,13110764699645099,13124781656132352,13146834909840568,13150475019000506,13165947997663636,13166432994532590,13173193375632369,13189323547733397,13191421827504087,13193440049022055,13195947726278877,13198070394653089,13223167457129300,13224252442883332,13228531581453927,13247204945379143,13258807801452479,13262258424271961,13267705241542163,13278616079036501,13283748455296671,13285230775877616,13285868034600602,13300334671920689,13310292875184935,13329136310276887,13352283337060157,13369169997036466,13377705827703609,13379962140831188,13380311342746500,13382187966941630,13390758145584707,13416266652895919,13419557557650088,13422200559860853,13427443083225106,13439116518710702,13456005147462978,13460243070950619,13461053659060122,13477523913094903,13487344032963920,13507861890928667,13508621577819321,13515438283308158,13521387743539755,13521554576987408,13539882561854500,13542269562596619,13578903302625692,13579583599193286,13581215157158944,13587044091969040,13602252577581628,13603519953862396,13615325004229940,13618894832957770,13650874726182653,13651727413207625,13662960495380203,13673600314291218,13674963480009909,13720620632179448,13735303013062726,13758363653494113,13768307727241862,13773441988162878,13784942424403905,13790029177942415,13790094291403739,13801859760285334,13823682578780211,13837797612780120,13840946933400146,13849721184598166,13858714430263200,13867040888606934,13869330719207858,13869373581337859,13875919313930175,13885363497519254,13899685526884978,13903849933818815,13907306748318011,13908235025197238,13910720876944015,13915193569823349,13923426716034528,13927569063761614,13935851918750000,13944130770252219,13950881517688055,13952439552733635,13955532186501673,13956941976117171,13966505352198009,13971610984999907,13979667301665560,13980401348213351,14024776843639840,14045295365141886,14061987901102577,14073881884275302,14075769365417929,14101845060400112,14117845585934631,14125332980519745,14128142329462641,14141075514030681,14159769218027675,14160023342235796,14165810602836303,14181751660405752,14195237522560027,14196209485287318,14201058898149871,14209101675948535,14231071926066360,14233941946968962,14241316678116937,14284230229638553,14284814018322640,14305677576147150,14305696255635441,14307348878842670,14321113665510751,14340366181037166,14382118569064968,14388001837716830,14391131597662486,14405901660716046,14427191795741247,14431884293379000,14435096137720815,14453306552449138,14460183468613142,14483703679632836,14484162426506072,14501476390289602,14501597818233059,14501833648745811,14511683112043509,14519535484637316,14521142156451097,14531002795413835,14540525969203315,14553035897229090,14563971448853004,14600735852927628,14611838908596828,14619382573883017,14641964844026130,14646545710428465,14652831294709181,14658550015911821,14658660845978751,14663491457747719,14681209102114692,14685647720239038,14723277112189400,14724140557861103,14735542904510591,14735551553794705,14737623329240627,14745232043596101,14752117678987462,14752581133768923,14759374398919542,14763316537025536,14781094775952097,14796832801115371,14796936112218498,14804719699727899,14814157253057913,14815872841991523,14819066206613811,14823404554947143,14829528747684225,14866285111324854,14874993683632673,14883151088403008,14887455070918576,14892361811823494,14910335371645937,14912913132110225,14924238937048081,14929241650755283,14950642543150044,14951063678347538,14975260126884161,15012809225833170,15012809793999981,15018571069035926,15020915825825774,15021404021337883,15022670595110400,15067928338100796,15085336559480488,15086974469237813,15095558052845905,15096856583295205,15104704601738115,15137463620871579,15139231593172071,15139470141320967,15141311654469419,15157297991462867,15165877497218537,15187454908961867,15205333655740165,15206524678220769,15213795744311666,15223458910424702,15231054902970513,15247444976777419,15253979392755088,15261179716215589,15264607147906949,15279764888497900,15289070477931369,15289098424523085,15302475269115685,15329448444911128,15332466801914067,15333746453390031,15334014646205651,15359121081622482,15372450993617149,15386636932060455,15388624896861794,15393703762293723,15396285667061859,15403250655983005,15418940717783981,15433964561817078,15438010039979776,15445262976227614,15455930562511989,15458735994674226,15459676313235252,15474414406371359,15486140333535895,15523643011224095,15530427628910070,15535488185343583,15547815300743142,15558207370718302,15560047986976016,15562655257896949,15572024028401096,15576633429000938,15640243559582304,15652664613796795,15662909040297435,15667815187503376,15674056088265955,15715286415685035,15734854842917459,15735629130746769,15745022742489686,15746397054374347,15753706416522065,15756364032314896,15759715213832065,15770571874905038,15771450504624662,15771565345317449,15774389582251672,15784982662006986,15826693353056782,15832992777484533,15834554452643249,15836567480040877,15846387640967869,15860004612907614,15862052574313643,15872412308796247,15876898540191851,15885096538206697,15886752701317732,15922383621886343,15925989819151637,15934315148588228,15935855575601832,15939241514854343,15947297812421556,15947649106036590,15947943670655095,15950707805014365,15953773242451901,15954192604657090,15975906896379703,15977967341911619,15988446934219592,16007534381370348,16008071982696337,16010080349394241,16019379996180302,16041693970346251,16050169327466182,16051144134269849,16070351779538067,16082115874549525,16102988424614242,16105825060325355,16108247668377083,16110185924188167,16120433790384445,16123644917797893,16124055380239661,16139682163688113,16148870984659849,16148983638429813,16179671651212829,16203460357263003,16204526262165688,16204949145766686,16241217195123001,16255807762483898,16263861205376525,16266403851500931,16274843164243675,16283022478827615,16288765059208216,16298269409548231,16310527157714076,16347427740561385,16348527080745850,16363236815965139,16378418422888053,16385922065480711,16386734109401830,16401309614419580,16420118853248593,16439064672105252,16444981631010484,16452680020690587,16455001541854420,16455790056067069,16458731130673862,16466030540364453,16483603638815995,16520653489349458,16521053145712040,16541559809954869,16549645496126569,16555041588219694,16556178595846804,16569131959084059,16584068045240185,16587611662622512,16609771886989064,16642535292639438,16649914812980760,16663626685346575,16671959185780948,16675862044640924,16725186034571857,16725455429885896,16731497228216796,16744527283341947,16746698059193344,16751770083916499,16752540407374498,16766466364238840,16781512709609102,16813158866111289,16848902773147957,16893611322805776,16894702249688277,16907053630551983,16918949101677550,16919206140826699,16927217964664091,16940864515999735,16970582910707933,16981453778803121,16983386100118313,16992080913752201,17008651624357706,17012650641208674,17015626300316320,17058734385396248,17094714926844016,17102048353802941,17119147153394225,17165663075547765,17168990396387808,17177995785851436,17192004769472906,17226057752082506,17230160752007703,17251290606896319,17255628627059519,17259460293245834,17261291602429545,17264444797119395,17267303429682730,17276137294424252,17297590713909193,17297622966744962,17304973793899500,17308573609293684,17331691594984106,17368721199906565,17370692363109638,17375347669952355,17384846996643858,17385438621225034,17393122952695385,17401580047518152,17403708711522033,17407737475614609,17412590309079896,17463298767015505,17477653216986609,17478595798135648,17482690806609130,17487690591900499,17492239615344727,17499428207086686,17513023676325158,17517878080826035,17518382048724585,17531301554404446,17532123431784764,17537235202003650,17543036676175723,17547545276513648,17551850598397102,17552727063175508,17571693025099870,17576884999794912,17580278606631657,17599142813424731,17605574666239988,17607361867830427,17618374994448394,17625656721813080,17641208765354924,17666295601206345,17676134188980509,17686356143845167,17694457529115978,17701836378206948,17713251693643593,17727864470254841,17745611798586794,17751640855559608,17755611883871708,17755994503641789,17757420725169406,17773170776249049,17780217609224267,17786870875243625,17792258928649416,17794266624700006,17816975800032715,17823693682349900,17838668413395798,17872295918517872,17876042049961247,17878900310275595,17884658064199580,17887430722263974,17893775077652078,17906199187845137,17912620382448881,17914094645083489,17926317445804624,17927386865920973,17936806132878791,17948722706950552,17950394714083630,17957430043936373,17957766991025570,17973954406588157,17981109220320748,17985536773394222,17991723913595561,17994050143919349,17999886330183165,18006067580170253,18015394845354757,18018354938006966,18023736156722913,18030285508849769,18032380366607200,18053110620914874,18054106463885348,18055637741519432,18062025642286974,18064910909128101,18068076157304068,18068456068882015,18081200901569883,18097697732126918,18142823591161549,18157090443766400,18165366617626746,18171825130045475,18182572711257795,18199348561681442,18199940176448223,18208382627853317,18210472497434014,18229683164086399,18236982822958815,18243995057427405,18268933871999774,18277497206833650,18298789242826634,18299236052051415,18308783355641550,18323932752353633,18334037508831312,18345286825512630,18346225455350760,18361317522385086,18368801383412228,18374609286574885,18404707397839425,18420663232850685,18425471188893674,18435764882631694,18436653476822659],"md5sum":"fb2c4c8861753dbc497d72d0e465465a","molecule":"dna"}],"version":0.4}]
\ No newline at end of file
diff --git a/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=100.63.sig b/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=100.63.sig
new file mode 100644
index 0000000000..b85bb0ef21
--- /dev/null
+++ b/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=100.63.sig
@@ -0,0 +1 @@
+[{"class":"sourmash_signature","email":"","hash_function":"0.murmur64","filename":"/dev/fd/63","name":"GCA_009817195.1 Francisella tularensis strain=99-907, ASM981719v1","license":"CC0","signatures":[{"num":0,"ksize":31,"seed":42,"max_hash":18446744073709552,"mins":[27281317737506,30774174543091,34134260966885,40422536005915,46092979748140,52667380017781,77640546109120,89191245175479,97341175558837,103303605162730,108020839024807,135414989701161,137460624135325,143826654727361,153684135314332,159757157353946,180095348607580,182873709030656,208309245438225,214041805294629,223646403684661,231557613626707,232715412458897,233924901580683,242631428039918,242916443054461,244997637525660,250821105337195,263769223490888,289612885205952,290507901010552,310144178150855,324002037533321,327741982499086,370660193353938,374827227987284,385925999728554,391908630864897,396271485002374,425780426886080,446363367677659,458676097367934,465761497060251,474500797064463,487890967126457,501285366842536,509029972291483,518744095767761,553284673753979,557420110856932,561588147238471,564955493511165,575940509276718,605337498101285,606548977697285,618213673953712,643115681109447,643216012278133,644963356377577,653829434142317,670360373084070,672527530119591,675150655252443,684524270185147,719626188880764,720045643265551,725967839675339,734392235451397,757995085421554,758873739679041,762435098460972,776635435277572,779263005191648,796121174431627,801537843188472,806791101979164,808323945009162,831597602839947,837342876049719,841644000186381,852358311452961,856044596532065,863211350823511,866354466734001,877528246849899,882816662973420,886831123416057,887324029071980,896684190269245,910013730955328,911021453771825,918053824067528,959404715354517,965749800410433,987156720845891,988004107440612,1003560293105019,1019132469630602,1031742940239750,1045004389970757,1047834223177133,1050009621695541,1067005090470023,1075835347418477,1106009829832368,1110321639602504,1129029490927944,1152325475808554,1155792411577727,1173501625134773,1179142432651244,1182043494444045,1185298213665043,1189817092658474,1198647977438533,1202832335428232,1213527880288483,1224149087020995,1231538243085066,1237866648067421,1247204248956355,1273002845219134,1311344518744158,1325762747773666,1327583535977585,1337025108346963,1337939134216356,1343194778880486,1347178976724408,1347965204255319,1360726528548292,1375648837386937,1378537007249094,1380037737023614,1396008873056192,1423251048049841,1425487297427108,1430011450080971,1436389861421409,1447828603923579,1448154600038663,1461606398479896,1477748591186096,1481500503955992,1489343676336319,1491409298456108,1495478198602408,1505324675783522,1506871645688047,1510642595999290,1523787516591314,1538544686450629,1551332608914346,1562178949777326,1582701020555290,1582714715562808,1594428143243217,1600950609397319,1601869664618303,1604847998387314,1605137573067942,1609948614207926,1612555898829971,1617568740441930,1630450719749736,1640503113008175,1658047227795127,1662918431912958,1674686450315888,1683642878379715,1685434818592932,1696496306605751,1699915950805608,1705874533889722,1719615875783650,1720419457177651,1728660623265899,1730106574233740,1732265511056345,1755484768216714,1757979695302374,1761318932844395,1767169583106630,1767600206417692,1768553578088471,1799333654355911,1809290380380199,1812674951838073,1816281246010249,1828421791729927,1832915856042159,1845631702610353,1850188493318868,1868069593147085,1884958304606475,1888837445051056,1891025754082888,1899332233791441,1911726518903029,1918452760708060,1952919386872879,1955072741257444,1970574995631202,1971148594271014,1980609364625158,1984116510933050,1993245451986067,1994334947674791,1996473350761932,2011995194361170,2016402104104536,2038605101930749,2041599619079412,2055486908646634,2068184964131101,2073480224427132,2081468075605553,2083859149031988,2088845573181737,2091868834590483,2095410096745181,2105648145116436,2113691764892897,2129201860410658,2130929519981988,2131490169201290,2170261983009798,2171035356430939,2177414857180262,2178114239308922,2179241686148154,2181354598823504,2184056585017162,2197366871875060,2204375393501991,2214523537881050,2221431264383483,2237381498315179,2281430614061135,2285676462393652,2289064303021873,2293074580236368,2295815116249395,2301838542881084,2310754905687291,2316428340566049,2317685491567407,2333959693552804,2342074675676015,2349601653567167,2359342178128271,2406274493306297,2409062953251471,2423082222606459,2444813151989679,2449631428878268,2467788917998802,2477025902927540,2483114115832913,2495644065213346,2499969884139421,2516439493941522,2517619989855248,2531737524981284,2532968001402240,2535608410342101,2541995133136461,2548656366696057,2566020346579398,2580305967686391,2594650471395280,2602545163684199,2613012954935120,2613606010573059,2637712179671244,2642357132272560,2642478099324048,2644197809403928,2659363560806872,2662304715625150,2698172378241685,2728578112915602,2730219461233997,2735122180150483,2735758529835485,2773281588385404,2789219916032586,2789850822350080,2810625093279126,2813140178921049,2849540048794850,2860369179928472,2871754307039262,2871907171556769,2880691563686929,2888613215455329,2903093947499687,2921332438238831,2927325011079857,2970698796413300,2985341598759171,2994088683976615,2995876644715330,2997465365202042,3003397844817766,3014591226675854,3015280923174733,3032228468334705,3034882558366266,3038102613400782,3040293237673794,3041384201679032,3065335255532440,3084787857619832,3086181234740975,3086344760356109,3088760991663376,3113879541153339,3117502144995062,3118334386799456,3118353038001378,3124967229669132,3141111622118606,3152319552764911,3157397983327458,3165214969768489,3173160905431776,3174196946497848,3176252213532449,3189600474341459,3200527298983599,3228756288805719,3233648223147430,3234704310551101,3244303587768447,3245279954664994,3253798917461757,3256199541149175,3270652593815318,3289782382963436,3291924461643881,3321936120954224,3350538525764676,3359013043704164,3367516662066673,3375956437105495,3381047593455589,3395194612319712,3395515328122470,3401980298093332,3417672631802041,3443496823444718,3473853502109201,3476156978273954,3492929176926081,3495505110142109,3503331458443572,3519095536458611,3535510821231923,3543906742502112,3547039933171809,3554769633644327,3560048373591648,3560832183669588,3561168542060264,3571381254876092,3582372240211864,3588526584026912,3590322008363596,3595234111026435,3598612108866317,3610671769485981,3621249240349460,3625292056724077,3627537868670400,3634725715407039,3654725299461030,3661758740289748,3677898597207892,3699121516322788,3702325285097834,3706591318445986,3711739163437948,3719471889280203,3725602040928106,3730648684404687,3738937658728956,3739565211999619,3748208043454616,3749224223393980,3764528432190170,3768018390267232,3792801693012879,3817603306259244,3822152193894646,3824688781764074,3828070741647380,3832361520491395,3834409435587390,3841574835390887,3845376089274772,3867101884678333,3907465181171189,3911757008173649,3936203721973325,3943792168144251,3944618352654874,3954974756216366,3957553012755889,3962797547908719,3967295346819374,3973009233407317,3977667913129268,3989206138624726,3989835479325906,4000191104196681,4016502960230774,4023968417116128,4033160238001050,4039430145337632,4054926036842145,4081890179374943,4083332846839379,4089179632516350,4095359176823975,4096638251379460,4097570444659615,4104945618865731,4106492050918606,4121210480013637,4128300247378694,4139521044238430,4156273280061428,4159321375157642,4161219557528029,4185314023925957,4188885093299245,4213378476941623,4215094248447875,4225599528550081,4246087184970239,4250723839958798,4253813544227807,4257725579070903,4274410444769581,4297865110420934,4314262721326477,4318921209326641,4326796571089165,4358157813774093,4363051662578414,4379054359357346,4387229825472605,4407121475128099,4418586927764827,4444497211702225,4448386354535670,4464672439725319,4479051595073164,4483602152851587,4487416329648218,4491213325063669,4493047653554173,4497108801816010,4503560524390685,4505039949877502,4514805314363434,4536842772674504,4556510901039833,4575403317134912,4592200188563858,4601869033857310,4610031751653292,4618259959292216,4627536713075473,4628641892367445,4633519059669963,4657815314704334,4662180672884852,4665919527337388,4687322226819303,4691844768752312,4707135680049989,4712525914876856,4720547794135633,4740606227775619,4746680609181901,4779520324623562,4791100834322356,4806377157351085,4808372750546970,4826808701416396,4832925031047563,4843277060090928,4862243495142622,4868706396089897,4870247565916556,4876212301431017,4896518867008986,4896692681487429,4909543183678547,4937682630931618,4938956956887018,4966196725954876,4969062413514851,4973600900081374,4999513822868305,5002893845020141,5008622662773184,5014970956568907,5035054008602857,5035384841398862,5039086438104691,5040652225496901,5046016737108994,5059014222849688,5066823009561671,5072430374944592,5126548277288607,5130609108534278,5141019017613116,5145220929445971,5157914165172243,5164042673067139,5164323192060956,5227087582431333,5239058059519899,5241552288110762,5242566539224285,5260929013588652,5264673038204573,5264846847738316,5291104629254226,5305867165279633,5309458727013297,5322844014918400,5323108914962638,5329144744675176,5363349599556055,5366817078493635,5369180741303960,5372222568751999,5372725457533622,5376760723403211,5389389943488643,5398148075045622,5407440001247943,5407552681509066,5425215886017472,5433029476329892,5435967788585089,5453070200302066,5505363397956209,5506656183795141,5520041430372158,5528030877128164,5529328537858607,5538301071879598,5573247074136840,5601063311481394,5612211670828121,5626811993541640,5671110634252314,5671512692298857,5676368648166471,5679704419692665,5697117527796873,5710885073252421,5722042629401126,5725434481579237,5727852290091844,5746609611777297,5759784984111085,5766468332812783,5767658702736892,5769340991360522,5817916655292234,5823448049010532,5831664213962049,5839479091276720,5847620778187862,5848083119581748,5875458157591724,5880371325296374,5887738774301190,5889494649222929,5895198579157112,5901426220955008,5923404444685793,5930424349546694,5954090060404027,5965424605208959,5976106562523974,5976468751599547,5985405131720490,6011246049659946,6019388992671725,6022142115842949,6022687599667645,6026176200521087,6026449947119821,6026852008290352,6028359150627379,6037198658977053,6049559969468114,6071357946907426,6074536323309542,6075861603095684,6086651545690621,6088147986793911,6091451791350760,6097972992225898,6102520222366377,6106185662845781,6150209857537398,6151086995205809,6179061741208419,6186814642225785,6207677944023553,6214079985740421,6222242045894974,6259769482143501,6292912013848109,6335759087357123,6338313411351946,6364174128810209,6376235120055415,6379797414315714,6392936323112603,6393281611893662,6425613172818892,6427037182426097,6435673284042797,6440155758298635,6470065458738460,6473238460278105,6498233125545854,6512447296999122,6519926591968090,6537194036770963,6539403501517441,6546843392577889,6549806578906724,6552086386539938,6562753360182634,6580454437794215,6583294028078651,6599805048819771,6611303480414165,6611480286587626,6613303542954184,6619465618743996,6620952231633165,6628364051881565,6637782452830313,6639366508325377,6643384411215171,6651291777536757,6659026904794676,6660386432638050,6706708532025137,6712233176289295,6715034077809162,6718106064823151,6718466194045753,6737012223038140,6737448552020375,6749465141048682,6755892952873165,6759712360779738,6766665767003108,6769349598161957,6779991167037331,6805981002654460,6822678972563408,6824212178067303,6825656208355223,6828108881031930,6838666110003247,6843872818542831,6858200159969968,6862403913830441,6896605336469152,6907696209350866,6940430139137108,6947378732749248,6953405360148223,6965934499254406,6978237167340371,6979116642430112,7000204500640703,7013379106513098,7015548089157306,7017340939890130,7022751771973604,7032077373720710,7057472610075201,7069751745272274,7071427481472917,7076942582186008,7083956725625194,7088117728748372,7110694400947463,7119947527921472,7162448687990059,7164239598107818,7167755656765912,7174842826868336,7175438170580579,7194659618833292,7199131481548540,7201097914134797,7249458427670294,7258600960921299,7260262458870231,7272892562787288,7291277476900411,7298751506522380,7320645384455102,7347510371391158,7349195911222994,7354593750963108,7379412287438610,7396028447991993,7407333626649072,7417992548609519,7436769564720525,7455509939494098,7466612719167435,7473444312733619,7473921057215867,7481010725288922,7494555361323136,7511820031110502,7539834933699319,7541028416083622,7548889429298262,7550654989959541,7567961569136947,7568482345047421,7581552356535593,7583581654211041,7587845721090877,7628037567441185,7633586298526539,7639776715226470,7676516649995125,7678642663878476,7694954607256167,7696205219169608,7701103797021301,7717485469267928,7734382322255612,7742828147953131,7754433671355044,7755280476043760,7769918479127653,7792923160082860,7808336679035965,7851406039611761,7854329110050349,7864747279450982,7874975130558924,7878651583399616,7905885790085843,7907390132227916,7910193473416632,7914616292135963,7936567617966657,7942757952243526,7948051059967055,7955099191740661,7958261337857520,7959981873817532,7961073053238888,7969625643027082,7971181815184894,7974554831275928,7974726923655088,7979097331324691,8004922364117543,8007706307006295,8012303905212338,8013295641540623,8066164953236699,8084246722538173,8100240629932439,8100951490474943,8106199043893306,8107552539527840,8114588959043938,8132109950950425,8139068209349164,8139545132154805,8142237913270447,8158329919102245,8167114814444647,8173611854335395,8189714298100356,8195428097158099,8201564519665139,8220965830202883,8235606574847074,8246150433354016,8270274863750807,8284482132698750,8285985183630643,8292533320746682,8295908119012735,8296380863267928,8323255543543643,8348711822077161,8358705549655169,8365548267150232,8374906480797651,8375721515813351,8379632648873725,8385947080398988,8391743136854975,8418406057930748,8428572383618079,8432685689145543,8433517681349491,8438148178407681,8444919949804523,8446420140769624,8455107906508370,8475628073984291,8508022924218892,8510397511704993,8514237274685749,8514396962844012,8531363280764652,8537391416444784,8549235306484624,8580449592169756,8598393465824567,8604034180239741,8611570380523396,8612678693937747,8620109638775764,8642635839487892,8646344667334136,8646717894657892,8654788340452875,8658449886140254,8663165400137631,8722442451761544,8761729420462614,8799647367219413,8801831997666499,8808453998469722,8818307640518629,8818834451063303,8823766790135813,8833290176775915,8841993926981016,8854459467830464,8861087128751225,8862728500798303,8880985466378305,8881613336282482,8925251195661314,8953704880557779,8985742174669720,8991621885018154,8992026721663720,8997168720804990,9016856686855149,9047424902876796,9050560415698716,9050753309887512,9061429496148816,9069273612029890,9080265743048056,9086071373806583,9095588303764494,9096970962257714,9100770128872926,9101701347271054,9112256340668504,9116617466944120,9136292702218238,9137011986680851,9143781826339633,9153600809394255,9199889121013565,9201721144829026,9209023421216735,9210503570621360,9214460083393763,9221754104943871,9224413178285270,9234547745090520,9243132989358081,9248116050684550,9264703712467273,9280659174889120,9287946744159071,9292140972727810,9327466025742305,9332051044758209,9363145119903540,9364400993642126,9391130196045242,9395408164721980,9399740447176971,9424276250634510,9452221926640102,9456478187020366,9464543357189079,9465600149658791,9491049221619505,9497191629902225,9522170868080528,9533727201306437,9547661663619976,9549245598585404,9569979472692505,9573473519866808,9591009354744385,9617917621932747,9648236513062897,9658329412127906,9659598422876240,9682812252677365,9686371108983926,9728579985303380,9729699868914242,9740105842067989,9745445361656865,9745681179963804,9754643298197035,9760567102397958,9771710857253491,9789917483611116,9793359197214519,9793503159465365,9794390650710491,9798125839578328,9804745463385914,9808752020775346,9833005702827487,9839044066017243,9845432123717545,9885127473462061,9894715619458167,9905070536524380,9908520702066001,9910780473079700,9917733811149031,9923651765289083,9927388605300676,9938702047866083,9944740018953312,9960830080447414,9961286044348072,9976639243429026,9991256507745359,10022572634244940,10025580401289134,10026906747582208,10027635205445506,10038347159492151,10039604072460842,10043026849402656,10047276489450169,10053877592451757,10063475808610984,10076240276956154,10101156164378146,10106363481576762,10127734783681212,10136370121235304,10146252926027242,10157507993657925,10170937864363577,10176151554703241,10183534012238724,10192998900924592,10197340560318368,10237743941972439,10254073288131603,10256735847958754,10280479610352360,10285508802345218,10291262449582857,10295721152014702,10310390505199421,10313473991212838,10342547269042481,10412937296046572,10455518715262653,10462990290852442,10464315303440262,10471197443105163,10471235757467544,10490073561279689,10492924328467233,10507681094428554,10516717765537166,10540330407067241,10541927502626015,10556872609087686,10565575287036146,10567098846120860,10569991378070155,10592858161029490,10605555894932784,10617796130250535,10635845120674097,10650787780040606,10652474248971788,10675091890076247,10677639062140656,10686876585139260,10687769709926863,10695637927115727,10696366138304664,10729506666892986,10739953219495080,10741375692368836,10772979860590118,10787932183480326,10797834288911940,10797954026094675,10799912491676373,10801123407332591,10809957352958534,10811859724613778,10814855039053275,10823710184242031,10835831635651692,10838780402343038,10839952733006938,10844349943321077,10845732416899091,10850918643526601,10860256158679413,10863856633990962,10869462262089206,10888194791081552,10892912626153721,10898443770405678,10910130929886125,10939089240676095,10957695103575326,10960164352659702,10961938624164593,10962367887299917,10963209129287673,10972178204371166,10980943184449618,11004523743982583,11021436183624407,11027930223905649,11051706685260634,11068765668137027,11117926166287355,11130512060712491,11137064269321064,11137491086953899,11142860693680581,11161848365790340,11164864927470696,11166391153310384,11170503851782616,11174109853098886,11179331841591506,11183033087165489,11194665067070355,11213473653620578,11238383978151703,11241048792018996,11254802265046802,11264184666096697,11268594471164860,11273858751227255,11285014483956224,11306227409946359,11316029880189806,11317018836436108,11326888850753433,11332371567827730,11344860342601282,11363041022575148,11364562992186694,11366255071713968,11378751439993468,11388719560536041,11408349988397772,11415332673174238,11418046330072121,11435535718923282,11449275714355046,11460074843266170,11483399532377237,11495688976808914,11507029069035406,11509653702854252,11515420136228054,11516873482469677,11520503668749817,11525515242618518,11557753508725419,11561604977704834,11569292413360517,11589295715309246,11631902140279157,11654556201194743,11656899418054614,11657294505407742,11661070133557696,11668795792548531,11688426128601404,11692173069223099,11692638821382112,11704142988138877,11730275878132669,11747860305548587,11768902473065614,11809606231785533,11814942688163748,11830275507824233,11832881846320770,11839076547825639,11842516303617037,11848809254066440,11849604090148961,11861036171256724,11862479669320152,11902273608768276,11909927135034753,11915914136058870,11917366131988635,11918146339628393,11952186029640314,11965762970341077,11968852102140818,11988163409658002,11991640765060726,12003946399173925,12015371223569459,12015699073484173,12026238097639303,12027543541848783,12029927405469169,12030852032077865,12032177915239463,12033394505469741,12041449133719384,12065689743810267,12075464021106187,12097367992437353,12102042122098365,12106676312986046,12119543078536750,12137234780206015,12152142412839865,12157879639736773,12158276228389397,12162749550023120,12165771397045986,12167381433344274,12181596294329824,12203742166109026,12209847266646966,12224653876328040,12229733059110899,12236479214073070,12240425549065773,12252089558841057,12252834455254861,12252921401858245,12263083529856368,12280117660905010,12282073505316488,12291733945277146,12295652397734313,12308950434416884,12315245431394520,12325148768094462,12353815014171672,12409472162226791,12413108559037636,12428072482683163,12435514170613803,12457386755420991,12515317411195819,12523706347945485,12523915038722689,12559341317878562,12582135565969205,12587936565284399,12592521878562533,12606988144137942,12615583150436888,12648682904569149,12656884893522301,12659016821769380,12672378100177709,12672827694409958,12678220421944320,12715313343566187,12719294959376032,12719335619027235,12720282275955265,12735108166808283,12736098435499462,12761734704097496,12765606334499184,12768487626988503,12787159319028422,12795719813049885,12796004399647948,12799644350597798,12825977729217187,12830098688357983,12830625689728250,12834785022889568,12844934606745507,12864892528356214,12879900683153406,12886172805466435,12888336247314715,12900495987025021,12904609340082772,12910186793479215,12913485144602650,12940792596482062,12951190294698176,12959920644213803,12987666390036602,12989467441559491,13017251126115385,13019641576048600,13025171186373006,13037172756924801,13041090869246053,13086846890781615,13102950879868503,13106344354361040,13110764699645099,13124781656132352,13146834909840568,13150475019000506,13165947997663636,13166432994532590,13173193375632369,13189323547733397,13191421827504087,13193440049022055,13195947726278877,13198070394653089,13223167457129300,13224252442883332,13228531581453927,13247204945379143,13258807801452479,13262258424271961,13267705241542163,13278616079036501,13283748455296671,13285230775877616,13285868034600602,13300334671920689,13310292875184935,13329136310276887,13352283337060157,13369169997036466,13377705827703609,13379962140831188,13380311342746500,13382187966941630,13390758145584707,13416266652895919,13419557557650088,13422200559860853,13427443083225106,13439116518710702,13456005147462978,13460243070950619,13461053659060122,13477523913094903,13487344032963920,13507861890928667,13508621577819321,13515438283308158,13521387743539755,13521554576987408,13539882561854500,13542269562596619,13578903302625692,13579583599193286,13581215157158944,13587044091969040,13602252577581628,13603519953862396,13615325004229940,13618894832957770,13650874726182653,13651727413207625,13662960495380203,13673600314291218,13674963480009909,13720620632179448,13735303013062726,13758363653494113,13768307727241862,13773441988162878,13784942424403905,13790029177942415,13790094291403739,13801859760285334,13823682578780211,13837797612780120,13840946933400146,13849721184598166,13858714430263200,13867040888606934,13869330719207858,13869373581337859,13875919313930175,13885363497519254,13899685526884978,13903849933818815,13907306748318011,13908235025197238,13910720876944015,13915193569823349,13923426716034528,13927569063761614,13935851918750000,13944130770252219,13950881517688055,13952439552733635,13955532186501673,13956941976117171,13966505352198009,13971610984999907,13979667301665560,13980401348213351,14024776843639840,14045295365141886,14061987901102577,14073881884275302,14075769365417929,14101845060400112,14117845585934631,14125332980519745,14128142329462641,14141075514030681,14159769218027675,14160023342235796,14165810602836303,14181751660405752,14195237522560027,14196209485287318,14201058898149871,14209101675948535,14231071926066360,14233941946968962,14241316678116937,14284230229638553,14284814018322640,14305677576147150,14305696255635441,14307348878842670,14321113665510751,14340366181037166,14382118569064968,14388001837716830,14391131597662486,14405901660716046,14427191795741247,14431884293379000,14435096137720815,14453306552449138,14460183468613142,14483703679632836,14484162426506072,14501476390289602,14501597818233059,14501833648745811,14511683112043509,14519535484637316,14521142156451097,14531002795413835,14540525969203315,14553035897229090,14563971448853004,14600735852927628,14611838908596828,14619382573883017,14641964844026130,14646545710428465,14652831294709181,14658550015911821,14658660845978751,14663491457747719,14681209102114692,14685647720239038,14723277112189400,14724140557861103,14735542904510591,14735551553794705,14737623329240627,14745232043596101,14752117678987462,14752581133768923,14759374398919542,14763316537025536,14781094775952097,14796832801115371,14796936112218498,14804719699727899,14814157253057913,14815872841991523,14819066206613811,14823404554947143,14829528747684225,14866285111324854,14874993683632673,14883151088403008,14887455070918576,14892361811823494,14910335371645937,14912913132110225,14924238937048081,14929241650755283,14950642543150044,14951063678347538,14975260126884161,15012809225833170,15012809793999981,15018571069035926,15020915825825774,15021404021337883,15022670595110400,15067928338100796,15085336559480488,15086974469237813,15095558052845905,15096856583295205,15104704601738115,15137463620871579,15139231593172071,15139470141320967,15141311654469419,15157297991462867,15165877497218537,15187454908961867,15205333655740165,15206524678220769,15213795744311666,15223458910424702,15231054902970513,15247444976777419,15253979392755088,15261179716215589,15264607147906949,15279764888497900,15289070477931369,15289098424523085,15302475269115685,15329448444911128,15332466801914067,15333746453390031,15334014646205651,15359121081622482,15372450993617149,15386636932060455,15388624896861794,15393703762293723,15396285667061859,15403250655983005,15418940717783981,15433964561817078,15438010039979776,15445262976227614,15455930562511989,15458735994674226,15459676313235252,15474414406371359,15486140333535895,15523643011224095,15530427628910070,15535488185343583,15547815300743142,15558207370718302,15560047986976016,15562655257896949,15572024028401096,15576633429000938,15640243559582304,15652664613796795,15662909040297435,15667815187503376,15674056088265955,15715286415685035,15734854842917459,15735629130746769,15745022742489686,15746397054374347,15753706416522065,15756364032314896,15759715213832065,15770571874905038,15771450504624662,15771565345317449,15774389582251672,15784982662006986,15826693353056782,15832992777484533,15834554452643249,15836567480040877,15846387640967869,15860004612907614,15862052574313643,15872412308796247,15876898540191851,15885096538206697,15886752701317732,15922383621886343,15925989819151637,15934315148588228,15935855575601832,15939241514854343,15947297812421556,15947649106036590,15947943670655095,15950707805014365,15953773242451901,15954192604657090,15975906896379703,15977967341911619,15988446934219592,16007534381370348,16008071982696337,16010080349394241,16019379996180302,16041693970346251,16050169327466182,16051144134269849,16070351779538067,16082115874549525,16102988424614242,16105825060325355,16108247668377083,16110185924188167,16120433790384445,16123644917797893,16124055380239661,16139682163688113,16148870984659849,16148983638429813,16179671651212829,16203460357263003,16204526262165688,16204949145766686,16241217195123001,16255807762483898,16263861205376525,16266403851500931,16274843164243675,16283022478827615,16288765059208216,16298269409548231,16310527157714076,16347427740561385,16348527080745850,16363236815965139,16378418422888053,16385922065480711,16386734109401830,16401309614419580,16420118853248593,16439064672105252,16444981631010484,16452680020690587,16455001541854420,16455790056067069,16458731130673862,16466030540364453,16483603638815995,16520653489349458,16521053145712040,16541559809954869,16549645496126569,16555041588219694,16556178595846804,16569131959084059,16584068045240185,16587611662622512,16609771886989064,16642535292639438,16649914812980760,16663626685346575,16671959185780948,16675862044640924,16725186034571857,16725455429885896,16731497228216796,16744527283341947,16746698059193344,16751770083916499,16752540407374498,16766466364238840,16781512709609102,16813158866111289,16848902773147957,16893611322805776,16894702249688277,16907053630551983,16918949101677550,16919206140826699,16927217964664091,16940864515999735,16970582910707933,16981453778803121,16983386100118313,16992080913752201,17008651624357706,17012650641208674,17015626300316320,17058734385396248,17094714926844016,17102048353802941,17119147153394225,17165663075547765,17168990396387808,17177995785851436,17192004769472906,17226057752082506,17230160752007703,17251290606896319,17255628627059519,17259460293245834,17261291602429545,17264444797119395,17267303429682730,17276137294424252,17297590713909193,17297622966744962,17304973793899500,17308573609293684,17331691594984106,17368721199906565,17370692363109638,17375347669952355,17384846996643858,17385438621225034,17393122952695385,17401580047518152,17403708711522033,17407737475614609,17412590309079896,17463298767015505,17477653216986609,17478595798135648,17482690806609130,17487690591900499,17492239615344727,17499428207086686,17513023676325158,17517878080826035,17518382048724585,17531301554404446,17532123431784764,17537235202003650,17543036676175723,17547545276513648,17551850598397102,17552727063175508,17571693025099870,17576884999794912,17580278606631657,17599142813424731,17605574666239988,17607361867830427,17618374994448394,17625656721813080,17641208765354924,17666295601206345,17676134188980509,17686356143845167,17694457529115978,17701836378206948,17713251693643593,17727864470254841,17745611798586794,17751640855559608,17755611883871708,17755994503641789,17757420725169406,17773170776249049,17780217609224267,17786870875243625,17792258928649416,17794266624700006,17816975800032715,17823693682349900,17838668413395798,17872295918517872,17876042049961247,17878900310275595,17884658064199580,17887430722263974,17893775077652078,17906199187845137,17912620382448881,17914094645083489,17926317445804624,17927386865920973,17936806132878791,17948722706950552,17950394714083630,17957430043936373,17957766991025570,17973954406588157,17981109220320748,17985536773394222,17991723913595561,17994050143919349,17999886330183165,18006067580170253,18015394845354757,18018354938006966,18023736156722913,18030285508849769,18032380366607200,18053110620914874,18054106463885348,18055637741519432,18062025642286974,18064910909128101,18068076157304068,18068456068882015,18081200901569883,18097697732126918,18142823591161549,18157090443766400,18165366617626746,18171825130045475,18182572711257795,18199348561681442,18199940176448223,18208382627853317,18210472497434014,18229683164086399,18236982822958815,18243995057427405,18268933871999774,18277497206833650,18298789242826634,18299236052051415,18308783355641550,18323932752353633,18334037508831312,18345286825512630,18346225455350760,18361317522385086,18368801383412228,18374609286574885,18404707397839425,18420663232850685,18425471188893674,18435764882631694,18436653476822659],"md5sum":"fb2c4c8861753dbc497d72d0e465465a","molecule":"dna"}],"version":0.4}]
\ No newline at end of file
diff --git a/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=2.63.sig b/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=2.63.sig
new file mode 100644
index 0000000000..06855e92c9
--- /dev/null
+++ b/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=2.63.sig
@@ -0,0 +1 @@
+[{"class":"sourmash_signature","email":"","hash_function":"0.murmur64","filename":"/dev/fd/63","name":"GCA_009817935.1 Francisella tularensis strain=99-5719, ASM981793v1","license":"CC0","signatures":[{"num":0,"ksize":31,"seed":42,"max_hash":18446744073709552,"mins":[27281317737506,30774174543091,34134260966885,40422536005915,46092979748140,52667380017781,77640546109120,89191245175479,97341175558837,103303605162730,108020839024807,135414989701161,137460624135325,143826654727361,153684135314332,159757157353946,180095348607580,182873709030656,208309245438225,214041805294629,223646403684661,231557613626707,232715412458897,233924901580683,242631428039918,242916443054461,244997637525660,250821105337195,263769223490888,289612885205952,290507901010552,310144178150855,324002037533321,327741982499086,370660193353938,374827227987284,385925999728554,391908630864897,396271485002374,425780426886080,446363367677659,458676097367934,465761497060251,474500797064463,487890967126457,501285366842536,509029972291483,518744095767761,553284673753979,557420110856932,561588147238471,564955493511165,575940509276718,605337498101285,606548977697285,618213673953712,643115681109447,643216012278133,644963356377577,653829434142317,670360373084070,672527530119591,675150655252443,684524270185147,719626188880764,720045643265551,725967839675339,734392235451397,757995085421554,758873739679041,762435098460972,776635435277572,779263005191648,796121174431627,801537843188472,806791101979164,808323945009162,831597602839947,837342876049719,841644000186381,852358311452961,856044596532065,863211350823511,866354466734001,877528246849899,882816662973420,886831123416057,887324029071980,896684190269245,910013730955328,911021453771825,918053824067528,959404715354517,965749800410433,987156720845891,988004107440612,1003560293105019,1019132469630602,1031742940239750,1045004389970757,1047834223177133,1050009621695541,1067005090470023,1075835347418477,1106009829832368,1110321639602504,1129029490927944,1152325475808554,1155792411577727,1173501625134773,1179142432651244,1182043494444045,1185298213665043,1189817092658474,1198647977438533,1202832335428232,1213527880288483,1224149087020995,1231538243085066,1237866648067421,1247204248956355,1273002845219134,1311344518744158,1325762747773666,1327583535977585,1337025108346963,1337939134216356,1343194778880486,1347178976724408,1347965204255319,1360726528548292,1375648837386937,1378537007249094,1380037737023614,1396008873056192,1423251048049841,1425487297427108,1430011450080971,1436389861421409,1447828603923579,1448154600038663,1461606398479896,1477748591186096,1481500503955992,1489343676336319,1491409298456108,1495478198602408,1505324675783522,1506871645688047,1510642595999290,1523787516591314,1538544686450629,1551332608914346,1562178949777326,1582701020555290,1582714715562808,1594428143243217,1600950609397319,1601869664618303,1604847998387314,1605137573067942,1609948614207926,1612555898829971,1617568740441930,1630450719749736,1640503113008175,1658047227795127,1662918431912958,1674686450315888,1683642878379715,1685434818592932,1696496306605751,1699915950805608,1705874533889722,1719615875783650,1720419457177651,1728660623265899,1730106574233740,1732265511056345,1755484768216714,1757979695302374,1761318932844395,1767169583106630,1767600206417692,1768553578088471,1799333654355911,1809290380380199,1812674951838073,1816281246010249,1828421791729927,1832915856042159,1845631702610353,1850188493318868,1868069593147085,1884958304606475,1888837445051056,1891025754082888,1899332233791441,1911726518903029,1918452760708060,1952919386872879,1955072741257444,1970574995631202,1971148594271014,1980609364625158,1984116510933050,1993245451986067,1994334947674791,1996473350761932,2011995194361170,2016402104104536,2038605101930749,2041599619079412,2055486908646634,2068184964131101,2073480224427132,2081468075605553,2083859149031988,2088845573181737,2091868834590483,2095410096745181,2105648145116436,2113691764892897,2129201860410658,2130929519981988,2131490169201290,2170261983009798,2171035356430939,2177414857180262,2178114239308922,2179241686148154,2181354598823504,2184056585017162,2197366871875060,2204375393501991,2214523537881050,2221431264383483,2237381498315179,2281430614061135,2285676462393652,2289064303021873,2293074580236368,2295815116249395,2301838542881084,2310754905687291,2316428340566049,2317685491567407,2333959693552804,2342074675676015,2349601653567167,2359342178128271,2406274493306297,2409062953251471,2423082222606459,2444813151989679,2449631428878268,2467788917998802,2477025902927540,2483114115832913,2495644065213346,2499969884139421,2516439493941522,2517619989855248,2531737524981284,2532968001402240,2535608410342101,2541995133136461,2548656366696057,2566020346579398,2580305967686391,2594650471395280,2602545163684199,2613012954935120,2613606010573059,2637712179671244,2642357132272560,2642478099324048,2644197809403928,2659363560806872,2662304715625150,2698172378241685,2728578112915602,2730219461233997,2735122180150483,2735758529835485,2773281588385404,2789219916032586,2789850822350080,2810625093279126,2813140178921049,2849540048794850,2860369179928472,2871754307039262,2871907171556769,2880691563686929,2888613215455329,2903093947499687,2921332438238831,2927325011079857,2970698796413300,2985341598759171,2994088683976615,2995876644715330,2997465365202042,3003397844817766,3014591226675854,3015280923174733,3032228468334705,3034882558366266,3038102613400782,3040293237673794,3041384201679032,3065335255532440,3084787857619832,3086181234740975,3086344760356109,3088760991663376,3113879541153339,3117502144995062,3118334386799456,3118353038001378,3124967229669132,3141111622118606,3152319552764911,3157397983327458,3165214969768489,3173160905431776,3174196946497848,3176252213532449,3189600474341459,3200527298983599,3228756288805719,3233648223147430,3234704310551101,3244303587768447,3245279954664994,3253798917461757,3256199541149175,3270652593815318,3289782382963436,3291924461643881,3321936120954224,3350538525764676,3359013043704164,3367516662066673,3375956437105495,3381047593455589,3395194612319712,3395515328122470,3401980298093332,3417672631802041,3443496823444718,3473853502109201,3476156978273954,3492929176926081,3495505110142109,3503331458443572,3519095536458611,3535510821231923,3543906742502112,3547039933171809,3554769633644327,3560048373591648,3560832183669588,3561168542060264,3571381254876092,3582372240211864,3588526584026912,3590322008363596,3595234111026435,3598612108866317,3610671769485981,3621249240349460,3625292056724077,3627537868670400,3634725715407039,3654725299461030,3661758740289748,3677898597207892,3699121516322788,3702325285097834,3706591318445986,3711739163437948,3719471889280203,3725602040928106,3730648684404687,3738937658728956,3739565211999619,3748208043454616,3749224223393980,3764528432190170,3768018390267232,3792801693012879,3817603306259244,3822152193894646,3824688781764074,3828070741647380,3832361520491395,3834409435587390,3841574835390887,3845376089274772,3867101884678333,3907465181171189,3911757008173649,3936203721973325,3943792168144251,3944618352654874,3954974756216366,3957553012755889,3962797547908719,3967295346819374,3973009233407317,3977667913129268,3989206138624726,3989835479325906,4000191104196681,4016502960230774,4023968417116128,4033160238001050,4039430145337632,4054926036842145,4081890179374943,4083332846839379,4089179632516350,4095359176823975,4096638251379460,4097570444659615,4104945618865731,4106492050918606,4121210480013637,4128300247378694,4139521044238430,4156273280061428,4159321375157642,4161219557528029,4185314023925957,4188885093299245,4213378476941623,4215094248447875,4225599528550081,4246087184970239,4250723839958798,4253813544227807,4257725579070903,4274410444769581,4297865110420934,4314262721326477,4318921209326641,4326796571089165,4358157813774093,4363051662578414,4379054359357346,4387229825472605,4407121475128099,4418586927764827,4444497211702225,4448386354535670,4464672439725319,4479051595073164,4483602152851587,4487416329648218,4491213325063669,4493047653554173,4497108801816010,4503560524390685,4505039949877502,4514805314363434,4536842772674504,4556510901039833,4575403317134912,4592200188563858,4601869033857310,4610031751653292,4618259959292216,4627536713075473,4628641892367445,4633519059669963,4657815314704334,4662180672884852,4665919527337388,4687322226819303,4691844768752312,4707135680049989,4712525914876856,4720547794135633,4740606227775619,4746680609181901,4779520324623562,4791100834322356,4806377157351085,4808372750546970,4826808701416396,4832925031047563,4843277060090928,4862243495142622,4868706396089897,4870247565916556,4876212301431017,4896518867008986,4896692681487429,4909543183678547,4937682630931618,4938956956887018,4966196725954876,4969062413514851,4973600900081374,4999513822868305,5002893845020141,5008622662773184,5014970956568907,5035054008602857,5035384841398862,5039086438104691,5040652225496901,5046016737108994,5059014222849688,5066823009561671,5072430374944592,5126548277288607,5130609108534278,5141019017613116,5145220929445971,5157914165172243,5164042673067139,5164323192060956,5227087582431333,5239058059519899,5241552288110762,5242566539224285,5260929013588652,5264673038204573,5264846847738316,5291104629254226,5305867165279633,5309458727013297,5322844014918400,5323108914962638,5329144744675176,5363349599556055,5366817078493635,5369180741303960,5372222568751999,5372725457533622,5376760723403211,5389389943488643,5398148075045622,5407440001247943,5407552681509066,5425215886017472,5433029476329892,5435967788585089,5453070200302066,5505363397956209,5506656183795141,5520041430372158,5528030877128164,5529328537858607,5538301071879598,5573247074136840,5601063311481394,5612211670828121,5626811993541640,5671110634252314,5671512692298857,5676368648166471,5679704419692665,5697117527796873,5710885073252421,5722042629401126,5725434481579237,5727852290091844,5746609611777297,5759784984111085,5766468332812783,5767658702736892,5769340991360522,5817916655292234,5823448049010532,5831664213962049,5839479091276720,5847620778187862,5848083119581748,5875458157591724,5880371325296374,5887738774301190,5889494649222929,5895198579157112,5901426220955008,5923404444685793,5930424349546694,5954090060404027,5965424605208959,5976106562523974,5976468751599547,5985405131720490,6011246049659946,6019388992671725,6022142115842949,6022687599667645,6026176200521087,6026449947119821,6026852008290352,6028359150627379,6037198658977053,6049559969468114,6071357946907426,6074536323309542,6075861603095684,6086651545690621,6088147986793911,6091451791350760,6097972992225898,6102520222366377,6106185662845781,6150209857537398,6151086995205809,6179061741208419,6186814642225785,6207677944023553,6214079985740421,6222242045894974,6259769482143501,6292912013848109,6335759087357123,6338313411351946,6364174128810209,6376235120055415,6379797414315714,6392936323112603,6393281611893662,6425613172818892,6427037182426097,6435673284042797,6440155758298635,6470065458738460,6473238460278105,6498233125545854,6512447296999122,6519926591968090,6537194036770963,6539403501517441,6546843392577889,6549806578906724,6552086386539938,6562753360182634,6580454437794215,6583294028078651,6599805048819771,6611303480414165,6611480286587626,6613303542954184,6619465618743996,6620952231633165,6628364051881565,6637782452830313,6639366508325377,6643384411215171,6651291777536757,6659026904794676,6660386432638050,6706708532025137,6712233176289295,6715034077809162,6718106064823151,6718466194045753,6737012223038140,6737448552020375,6749465141048682,6755892952873165,6759712360779738,6766665767003108,6769349598161957,6779991167037331,6805981002654460,6822678972563408,6824212178067303,6825656208355223,6828108881031930,6838666110003247,6843872818542831,6858200159969968,6862403913830441,6896605336469152,6907696209350866,6940430139137108,6947378732749248,6953405360148223,6965934499254406,6978237167340371,6979116642430112,7000204500640703,7013379106513098,7015548089157306,7017340939890130,7022751771973604,7032077373720710,7057472610075201,7069751745272274,7071427481472917,7076942582186008,7083956725625194,7088117728748372,7110694400947463,7119947527921472,7162448687990059,7164239598107818,7167755656765912,7174842826868336,7175438170580579,7194659618833292,7199131481548540,7201097914134797,7249458427670294,7258600960921299,7260262458870231,7272892562787288,7291277476900411,7298751506522380,7320645384455102,7347510371391158,7349195911222994,7354593750963108,7379412287438610,7396028447991993,7407333626649072,7417992548609519,7436769564720525,7455509939494098,7466612719167435,7473444312733619,7473921057215867,7481010725288922,7494555361323136,7511820031110502,7539834933699319,7541028416083622,7548889429298262,7550654989959541,7567961569136947,7568482345047421,7581552356535593,7583581654211041,7587845721090877,7628037567441185,7633586298526539,7639776715226470,7676516649995125,7678642663878476,7694954607256167,7696205219169608,7701103797021301,7717485469267928,7734382322255612,7742828147953131,7754433671355044,7755280476043760,7769918479127653,7792923160082860,7808336679035965,7851406039611761,7854329110050349,7864747279450982,7874975130558924,7878651583399616,7905885790085843,7907390132227916,7910193473416632,7914616292135963,7936567617966657,7942757952243526,7948051059967055,7955099191740661,7958261337857520,7959981873817532,7961073053238888,7969625643027082,7971181815184894,7974554831275928,7974726923655088,7979097331324691,8004922364117543,8007706307006295,8012303905212338,8013295641540623,8066164953236699,8084246722538173,8100240629932439,8100951490474943,8106199043893306,8107552539527840,8114588959043938,8132109950950425,8139068209349164,8139545132154805,8142237913270447,8158329919102245,8167114814444647,8173611854335395,8189714298100356,8195428097158099,8201564519665139,8220965830202883,8235606574847074,8246150433354016,8270274863750807,8284482132698750,8285985183630643,8292533320746682,8295908119012735,8296380863267928,8323255543543643,8348711822077161,8358705549655169,8365548267150232,8374906480797651,8375721515813351,8379632648873725,8385947080398988,8391743136854975,8418406057930748,8428572383618079,8432685689145543,8433517681349491,8438148178407681,8444919949804523,8446420140769624,8455107906508370,8475628073984291,8508022924218892,8510397511704993,8514237274685749,8514396962844012,8531363280764652,8537391416444784,8549235306484624,8580449592169756,8598393465824567,8604034180239741,8611570380523396,8612678693937747,8620109638775764,8642635839487892,8646344667334136,8646717894657892,8654788340452875,8658449886140254,8663165400137631,8722442451761544,8761729420462614,8799647367219413,8801831997666499,8808453998469722,8818307640518629,8818834451063303,8823766790135813,8833290176775915,8841993926981016,8854459467830464,8861087128751225,8862728500798303,8880985466378305,8881613336282482,8925251195661314,8953704880557779,8985742174669720,8991621885018154,8992026721663720,8997168720804990,9016856686855149,9047424902876796,9050560415698716,9050753309887512,9061429496148816,9069273612029890,9080265743048056,9086071373806583,9095588303764494,9096970962257714,9100770128872926,9101701347271054,9112256340668504,9116617466944120,9136292702218238,9137011986680851,9143781826339633,9153600809394255,9199889121013565,9201721144829026,9209023421216735,9210503570621360,9214460083393763,9221754104943871,9224413178285270,9234547745090520,9243132989358081,9248116050684550,9264703712467273,9280659174889120,9287946744159071,9292140972727810,9327466025742305,9332051044758209,9363145119903540,9364400993642126,9391130196045242,9395408164721980,9399740447176971,9424276250634510,9452221926640102,9456478187020366,9464543357189079,9465600149658791,9491049221619505,9497191629902225,9522170868080528,9533727201306437,9547661663619976,9549245598585404,9569979472692505,9573473519866808,9591009354744385,9617917621932747,9648236513062897,9658329412127906,9659598422876240,9682812252677365,9686371108983926,9728579985303380,9729699868914242,9740105842067989,9745445361656865,9745681179963804,9754643298197035,9760567102397958,9771710857253491,9789917483611116,9793359197214519,9793503159465365,9794390650710491,9798125839578328,9804745463385914,9808752020775346,9833005702827487,9839044066017243,9845432123717545,9885127473462061,9894715619458167,9905070536524380,9908520702066001,9910780473079700,9917733811149031,9923651765289083,9927388605300676,9938702047866083,9944740018953312,9960830080447414,9961286044348072,9976639243429026,9991256507745359,10022572634244940,10025580401289134,10026906747582208,10027635205445506,10038347159492151,10039604072460842,10043026849402656,10047276489450169,10053877592451757,10063475808610984,10076240276956154,10101156164378146,10106363481576762,10127734783681212,10136370121235304,10146252926027242,10157507993657925,10170937864363577,10176151554703241,10183534012238724,10192998900924592,10197340560318368,10237743941972439,10254073288131603,10256735847958754,10280479610352360,10285508802345218,10291262449582857,10295721152014702,10310390505199421,10313473991212838,10342547269042481,10412937296046572,10455518715262653,10462990290852442,10464315303440262,10471197443105163,10471235757467544,10490073561279689,10492924328467233,10507681094428554,10516717765537166,10540330407067241,10541927502626015,10556872609087686,10565575287036146,10567098846120860,10569991378070155,10592858161029490,10605555894932784,10617796130250535,10635845120674097,10650787780040606,10652474248971788,10675091890076247,10677639062140656,10686876585139260,10687769709926863,10695637927115727,10696366138304664,10729506666892986,10739953219495080,10741375692368836,10772979860590118,10787932183480326,10797834288911940,10797954026094675,10799912491676373,10801123407332591,10809957352958534,10811859724613778,10814855039053275,10823710184242031,10835831635651692,10838780402343038,10839952733006938,10844349943321077,10845732416899091,10850918643526601,10860256158679413,10863856633990962,10869462262089206,10888194791081552,10892912626153721,10898443770405678,10910130929886125,10939089240676095,10957695103575326,10960164352659702,10961938624164593,10962367887299917,10963209129287673,10972178204371166,10980943184449618,11004523743982583,11021436183624407,11027930223905649,11051706685260634,11068765668137027,11117926166287355,11130512060712491,11137064269321064,11137491086953899,11142860693680581,11161848365790340,11164864927470696,11166391153310384,11170503851782616,11174109853098886,11179331841591506,11183033087165489,11194665067070355,11213473653620578,11238383978151703,11241048792018996,11254802265046802,11264184666096697,11268594471164860,11273858751227255,11285014483956224,11306227409946359,11316029880189806,11317018836436108,11326888850753433,11332371567827730,11344860342601282,11363041022575148,11364562992186694,11366255071713968,11378751439993468,11388719560536041,11408349988397772,11415332673174238,11418046330072121,11435535718923282,11449275714355046,11460074843266170,11483399532377237,11495688976808914,11507029069035406,11509653702854252,11515420136228054,11516873482469677,11520503668749817,11525515242618518,11557753508725419,11561604977704834,11569292413360517,11589295715309246,11631902140279157,11654556201194743,11656899418054614,11657294505407742,11661070133557696,11668795792548531,11688426128601404,11692173069223099,11692638821382112,11704142988138877,11730275878132669,11747860305548587,11768902473065614,11809606231785533,11814942688163748,11830275507824233,11832881846320770,11839076547825639,11842516303617037,11848809254066440,11849604090148961,11861036171256724,11862479669320152,11902273608768276,11909927135034753,11915914136058870,11917366131988635,11918146339628393,11952186029640314,11965762970341077,11968852102140818,11988163409658002,11991640765060726,12003946399173925,12015371223569459,12015699073484173,12026238097639303,12027543541848783,12029927405469169,12030852032077865,12032177915239463,12033394505469741,12041449133719384,12065689743810267,12075464021106187,12097367992437353,12102042122098365,12106676312986046,12119543078536750,12137234780206015,12152142412839865,12157879639736773,12158276228389397,12162749550023120,12165771397045986,12167381433344274,12181596294329824,12203742166109026,12209847266646966,12224653876328040,12229733059110899,12236479214073070,12240425549065773,12252089558841057,12252834455254861,12252921401858245,12263083529856368,12280117660905010,12282073505316488,12291733945277146,12295652397734313,12308950434416884,12315245431394520,12325148768094462,12353815014171672,12409472162226791,12413108559037636,12428072482683163,12435514170613803,12457386755420991,12515317411195819,12523706347945485,12523915038722689,12559341317878562,12582135565969205,12587936565284399,12592521878562533,12606988144137942,12615583150436888,12648682904569149,12656884893522301,12659016821769380,12672378100177709,12672827694409958,12678220421944320,12715313343566187,12719294959376032,12719335619027235,12720282275955265,12735108166808283,12736098435499462,12761734704097496,12765606334499184,12768487626988503,12787159319028422,12795719813049885,12796004399647948,12799644350597798,12825977729217187,12830098688357983,12830625689728250,12834785022889568,12844934606745507,12864892528356214,12879900683153406,12886172805466435,12888336247314715,12900495987025021,12904609340082772,12910186793479215,12913485144602650,12940792596482062,12951190294698176,12959920644213803,12987666390036602,12989467441559491,13017251126115385,13019641576048600,13025171186373006,13037172756924801,13041090869246053,13086846890781615,13102950879868503,13106344354361040,13110764699645099,13124781656132352,13146834909840568,13150475019000506,13165947997663636,13166432994532590,13173193375632369,13189323547733397,13191421827504087,13193440049022055,13195947726278877,13198070394653089,13223167457129300,13224252442883332,13228531581453927,13247204945379143,13258807801452479,13262258424271961,13267705241542163,13278616079036501,13283748455296671,13285230775877616,13285868034600602,13300334671920689,13310292875184935,13329136310276887,13352283337060157,13369169997036466,13377705827703609,13379962140831188,13380311342746500,13382187966941630,13390758145584707,13416266652895919,13419557557650088,13422200559860853,13427443083225106,13439116518710702,13456005147462978,13460243070950619,13461053659060122,13477523913094903,13487344032963920,13507861890928667,13508621577819321,13515438283308158,13521387743539755,13521554576987408,13539882561854500,13542269562596619,13578903302625692,13579583599193286,13581215157158944,13587044091969040,13602252577581628,13603519953862396,13615325004229940,13618894832957770,13650874726182653,13651727413207625,13662960495380203,13673600314291218,13674963480009909,13720620632179448,13735303013062726,13758363653494113,13768307727241862,13773441988162878,13784942424403905,13790029177942415,13790094291403739,13801859760285334,13823682578780211,13837797612780120,13840946933400146,13849721184598166,13858714430263200,13867040888606934,13869330719207858,13869373581337859,13875919313930175,13885363497519254,13899685526884978,13903849933818815,13907306748318011,13908235025197238,13910720876944015,13915193569823349,13923426716034528,13927569063761614,13935851918750000,13944130770252219,13950881517688055,13952439552733635,13955532186501673,13956941976117171,13966505352198009,13971610984999907,13979667301665560,13980401348213351,14024776843639840,14045295365141886,14061987901102577,14073881884275302,14075769365417929,14101845060400112,14117845585934631,14125332980519745,14128142329462641,14141075514030681,14159769218027675,14160023342235796,14165810602836303,14181751660405752,14195237522560027,14196209485287318,14201058898149871,14209101675948535,14231071926066360,14233941946968962,14241316678116937,14284230229638553,14284814018322640,14305677576147150,14305696255635441,14307348878842670,14321113665510751,14340366181037166,14382118569064968,14388001837716830,14391131597662486,14405901660716046,14427191795741247,14431884293379000,14435096137720815,14453306552449138,14460183468613142,14483703679632836,14484162426506072,14501476390289602,14501597818233059,14501833648745811,14511683112043509,14519535484637316,14521142156451097,14531002795413835,14540525969203315,14553035897229090,14563971448853004,14600735852927628,14611838908596828,14619382573883017,14641964844026130,14646545710428465,14652831294709181,14658550015911821,14658660845978751,14663491457747719,14681209102114692,14685647720239038,14723277112189400,14724140557861103,14735542904510591,14735551553794705,14737623329240627,14745232043596101,14752117678987462,14752581133768923,14759374398919542,14763316537025536,14781094775952097,14796832801115371,14796936112218498,14804719699727899,14814157253057913,14815872841991523,14819066206613811,14823404554947143,14829528747684225,14866285111324854,14874993683632673,14883151088403008,14887455070918576,14892361811823494,14910335371645937,14912913132110225,14924238937048081,14929241650755283,14950642543150044,14951063678347538,14975260126884161,15012809225833170,15012809793999981,15018571069035926,15020915825825774,15021404021337883,15022670595110400,15067928338100796,15085336559480488,15086974469237813,15095558052845905,15096856583295205,15104704601738115,15137463620871579,15139231593172071,15139470141320967,15141311654469419,15157297991462867,15165877497218537,15187454908961867,15205333655740165,15206524678220769,15213795744311666,15223458910424702,15231054902970513,15247444976777419,15253979392755088,15261179716215589,15264607147906949,15279764888497900,15289070477931369,15289098424523085,15302475269115685,15329448444911128,15332466801914067,15333746453390031,15334014646205651,15359121081622482,15372450993617149,15386636932060455,15388624896861794,15393703762293723,15396285667061859,15403250655983005,15418940717783981,15433964561817078,15438010039979776,15445262976227614,15455930562511989,15458735994674226,15459676313235252,15474414406371359,15486140333535895,15523643011224095,15530427628910070,15535488185343583,15547815300743142,15558207370718302,15560047986976016,15562655257896949,15572024028401096,15576633429000938,15640243559582304,15652664613796795,15662909040297435,15667815187503376,15674056088265955,15715286415685035,15734854842917459,15735629130746769,15745022742489686,15746397054374347,15753706416522065,15756364032314896,15759715213832065,15770571874905038,15771450504624662,15771565345317449,15774389582251672,15784982662006986,15826693353056782,15832992777484533,15834554452643249,15836567480040877,15846387640967869,15860004612907614,15862052574313643,15872412308796247,15876898540191851,15885096538206697,15886752701317732,15922383621886343,15925989819151637,15934315148588228,15935855575601832,15939241514854343,15947297812421556,15947649106036590,15947943670655095,15950707805014365,15953773242451901,15954192604657090,15975906896379703,15977967341911619,15988446934219592,16007534381370348,16008071982696337,16010080349394241,16019379996180302,16041693970346251,16050169327466182,16051144134269849,16070351779538067,16082115874549525,16102988424614242,16105825060325355,16108247668377083,16110185924188167,16120433790384445,16123644917797893,16124055380239661,16139682163688113,16148870984659849,16148983638429813,16179671651212829,16203460357263003,16204526262165688,16204949145766686,16241217195123001,16255807762483898,16263861205376525,16266403851500931,16274843164243675,16283022478827615,16288765059208216,16298269409548231,16310527157714076,16347427740561385,16348527080745850,16363236815965139,16378418422888053,16385922065480711,16386734109401830,16401309614419580,16420118853248593,16439064672105252,16444981631010484,16452680020690587,16455001541854420,16455790056067069,16458731130673862,16466030540364453,16483603638815995,16520653489349458,16521053145712040,16541559809954869,16549645496126569,16555041588219694,16556178595846804,16569131959084059,16584068045240185,16587611662622512,16609771886989064,16642535292639438,16649914812980760,16663626685346575,16671959185780948,16675862044640924,16725186034571857,16725455429885896,16731497228216796,16744527283341947,16746698059193344,16751770083916499,16752540407374498,16766466364238840,16781512709609102,16813158866111289,16848902773147957,16893611322805776,16894702249688277,16907053630551983,16918949101677550,16919206140826699,16927217964664091,16940864515999735,16970582910707933,16981453778803121,16983386100118313,16992080913752201,17008651624357706,17012650641208674,17015626300316320,17058734385396248,17094714926844016,17102048353802941,17119147153394225,17165663075547765,17168990396387808,17177995785851436,17192004769472906,17226057752082506,17230160752007703,17251290606896319,17255628627059519,17259460293245834,17261291602429545,17264444797119395,17267303429682730,17276137294424252,17297590713909193,17297622966744962,17304973793899500,17308573609293684,17331691594984106,17368721199906565,17370692363109638,17375347669952355,17384846996643858,17385438621225034,17393122952695385,17401580047518152,17403708711522033,17407737475614609,17412590309079896,17463298767015505,17477653216986609,17478595798135648,17482690806609130,17487690591900499,17492239615344727,17499428207086686,17513023676325158,17517878080826035,17518382048724585,17531301554404446,17532123431784764,17537235202003650,17543036676175723,17547545276513648,17551850598397102,17552727063175508,17571693025099870,17576884999794912,17580278606631657,17599142813424731,17605574666239988,17607361867830427,17618374994448394,17625656721813080,17641208765354924,17666295601206345,17676134188980509,17686356143845167,17694457529115978,17701836378206948,17713251693643593,17727864470254841,17745611798586794,17751640855559608,17755611883871708,17755994503641789,17757420725169406,17773170776249049,17780217609224267,17786870875243625,17792258928649416,17794266624700006,17816975800032715,17823693682349900,17838668413395798,17872295918517872,17876042049961247,17878900310275595,17884658064199580,17887430722263974,17893775077652078,17906199187845137,17912620382448881,17914094645083489,17926317445804624,17927386865920973,17936806132878791,17948722706950552,17950394714083630,17957430043936373,17957766991025570,17973954406588157,17981109220320748,17985536773394222,17991723913595561,17994050143919349,17999886330183165,18006067580170253,18015394845354757,18018354938006966,18023736156722913,18030285508849769,18032380366607200,18053110620914874,18054106463885348,18055637741519432,18062025642286974,18064910909128101,18068076157304068,18068456068882015,18081200901569883,18097697732126918,18142823591161549,18157090443766400,18165366617626746,18171825130045475,18182572711257795,18199348561681442,18199940176448223,18208382627853317,18210472497434014,18229683164086399,18236982822958815,18243995057427405,18268933871999774,18277497206833650,18298789242826634,18299236052051415,18308783355641550,18323932752353633,18334037508831312,18345286825512630,18346225455350760,18361317522385086,18368801383412228,18374609286574885,18404707397839425,18420663232850685,18425471188893674,18435764882631694,18436653476822659],"md5sum":"fb2c4c8861753dbc497d72d0e465465a","molecule":"dna"}],"version":0.4}]
\ No newline at end of file
diff --git a/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=3.63.sig b/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=3.63.sig
new file mode 100644
index 0000000000..dc1a1d0a2d
--- /dev/null
+++ b/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=3.63.sig
@@ -0,0 +1 @@
+[{"class":"sourmash_signature","email":"","hash_function":"0.murmur64","filename":"/dev/fd/63","name":"GCA_009818955.1 Francisella tularensis strain=87-14795, ASM981895v1","license":"CC0","signatures":[{"num":0,"ksize":31,"seed":42,"max_hash":18446744073709552,"mins":[27281317737506,30774174543091,34134260966885,40422536005915,46092979748140,52667380017781,77640546109120,89191245175479,97341175558837,103303605162730,108020839024807,135414989701161,137460624135325,143826654727361,153684135314332,159757157353946,180095348607580,182873709030656,208309245438225,214041805294629,223646403684661,231557613626707,232715412458897,233924901580683,242631428039918,242916443054461,244997637525660,250821105337195,263769223490888,289612885205952,290507901010552,310144178150855,324002037533321,327741982499086,370660193353938,374827227987284,385925999728554,391908630864897,396271485002374,425780426886080,446363367677659,458676097367934,465761497060251,474500797064463,487890967126457,501285366842536,509029972291483,518744095767761,553284673753979,557420110856932,561588147238471,564955493511165,575940509276718,605337498101285,606548977697285,618213673953712,643115681109447,643216012278133,644963356377577,653829434142317,670360373084070,672527530119591,675150655252443,684524270185147,719626188880764,720045643265551,725967839675339,734392235451397,757995085421554,758873739679041,762435098460972,776635435277572,779263005191648,796121174431627,801537843188472,806791101979164,808323945009162,831597602839947,837342876049719,841644000186381,852358311452961,856044596532065,863211350823511,866354466734001,877528246849899,882816662973420,886831123416057,887324029071980,896684190269245,910013730955328,911021453771825,918053824067528,959404715354517,965749800410433,987156720845891,988004107440612,1003560293105019,1019132469630602,1031742940239750,1045004389970757,1047834223177133,1050009621695541,1067005090470023,1075835347418477,1106009829832368,1110321639602504,1129029490927944,1152325475808554,1155792411577727,1173501625134773,1179142432651244,1182043494444045,1185298213665043,1189817092658474,1198647977438533,1202832335428232,1213527880288483,1224149087020995,1231538243085066,1237866648067421,1247204248956355,1273002845219134,1311344518744158,1325762747773666,1327583535977585,1337025108346963,1337939134216356,1343194778880486,1347178976724408,1347965204255319,1360726528548292,1375648837386937,1378537007249094,1380037737023614,1396008873056192,1423251048049841,1425487297427108,1430011450080971,1436389861421409,1447828603923579,1448154600038663,1461606398479896,1477748591186096,1481500503955992,1489343676336319,1491409298456108,1495478198602408,1505324675783522,1506871645688047,1510642595999290,1523787516591314,1538544686450629,1551332608914346,1562178949777326,1582701020555290,1582714715562808,1594428143243217,1600950609397319,1601869664618303,1604847998387314,1605137573067942,1609948614207926,1612555898829971,1617568740441930,1630450719749736,1640503113008175,1658047227795127,1662918431912958,1674686450315888,1683642878379715,1685434818592932,1696496306605751,1699915950805608,1705874533889722,1719615875783650,1720419457177651,1728660623265899,1730106574233740,1732265511056345,1755484768216714,1757979695302374,1761318932844395,1767169583106630,1767600206417692,1768553578088471,1799333654355911,1809290380380199,1812674951838073,1816281246010249,1828421791729927,1832915856042159,1845631702610353,1850188493318868,1868069593147085,1884958304606475,1888837445051056,1891025754082888,1899332233791441,1911726518903029,1918452760708060,1952919386872879,1955072741257444,1970574995631202,1971148594271014,1980609364625158,1984116510933050,1993245451986067,1994334947674791,1996473350761932,2011995194361170,2016402104104536,2038605101930749,2041599619079412,2055486908646634,2068184964131101,2073480224427132,2081468075605553,2083859149031988,2088845573181737,2091868834590483,2095410096745181,2105648145116436,2113691764892897,2129201860410658,2130929519981988,2131490169201290,2170261983009798,2171035356430939,2177414857180262,2178114239308922,2179241686148154,2181354598823504,2184056585017162,2197366871875060,2204375393501991,2214523537881050,2221431264383483,2237381498315179,2281430614061135,2285676462393652,2289064303021873,2293074580236368,2295815116249395,2301838542881084,2310754905687291,2316428340566049,2317685491567407,2333959693552804,2342074675676015,2349601653567167,2359342178128271,2406274493306297,2409062953251471,2423082222606459,2444813151989679,2449631428878268,2467788917998802,2477025902927540,2483114115832913,2495644065213346,2499969884139421,2516439493941522,2517619989855248,2531737524981284,2532968001402240,2535608410342101,2541995133136461,2548656366696057,2566020346579398,2580305967686391,2594650471395280,2602545163684199,2613012954935120,2613606010573059,2637712179671244,2642357132272560,2642478099324048,2644197809403928,2659363560806872,2662304715625150,2698172378241685,2728578112915602,2730219461233997,2735122180150483,2735758529835485,2773281588385404,2789219916032586,2789850822350080,2810625093279126,2813140178921049,2849540048794850,2860369179928472,2871754307039262,2871907171556769,2880691563686929,2888613215455329,2903093947499687,2921332438238831,2927325011079857,2970698796413300,2985341598759171,2994088683976615,2995876644715330,2997465365202042,3003397844817766,3014591226675854,3015280923174733,3032228468334705,3034882558366266,3038102613400782,3040293237673794,3041384201679032,3065335255532440,3084787857619832,3086181234740975,3086344760356109,3088760991663376,3113879541153339,3117502144995062,3118334386799456,3118353038001378,3124967229669132,3141111622118606,3152319552764911,3157397983327458,3165214969768489,3173160905431776,3174196946497848,3176252213532449,3189600474341459,3200527298983599,3228756288805719,3233648223147430,3234704310551101,3244303587768447,3245279954664994,3253798917461757,3256199541149175,3270652593815318,3289782382963436,3291924461643881,3321936120954224,3350538525764676,3359013043704164,3367516662066673,3375956437105495,3381047593455589,3395194612319712,3395515328122470,3401980298093332,3417672631802041,3443496823444718,3473853502109201,3476156978273954,3492929176926081,3495505110142109,3503331458443572,3519095536458611,3535510821231923,3543906742502112,3547039933171809,3554769633644327,3560048373591648,3560832183669588,3561168542060264,3571381254876092,3582372240211864,3588526584026912,3590322008363596,3595234111026435,3598612108866317,3610671769485981,3621249240349460,3625292056724077,3627537868670400,3634725715407039,3654725299461030,3661758740289748,3677898597207892,3699121516322788,3702325285097834,3706591318445986,3711739163437948,3719471889280203,3725602040928106,3730648684404687,3738937658728956,3739565211999619,3748208043454616,3749224223393980,3764528432190170,3768018390267232,3792801693012879,3817603306259244,3822152193894646,3824688781764074,3828070741647380,3832361520491395,3834409435587390,3841574835390887,3845376089274772,3867101884678333,3907465181171189,3911757008173649,3936203721973325,3943792168144251,3944618352654874,3954974756216366,3957553012755889,3962797547908719,3967295346819374,3973009233407317,3977667913129268,3989206138624726,3989835479325906,4000191104196681,4016502960230774,4023968417116128,4033160238001050,4039430145337632,4054926036842145,4081890179374943,4083332846839379,4089179632516350,4095359176823975,4096638251379460,4097570444659615,4104945618865731,4106492050918606,4121210480013637,4128300247378694,4139521044238430,4156273280061428,4159321375157642,4161219557528029,4185314023925957,4188885093299245,4213378476941623,4215094248447875,4225599528550081,4246087184970239,4250723839958798,4253813544227807,4257725579070903,4274410444769581,4297865110420934,4314262721326477,4318921209326641,4326796571089165,4358157813774093,4363051662578414,4379054359357346,4387229825472605,4407121475128099,4418586927764827,4444497211702225,4448386354535670,4464672439725319,4479051595073164,4483602152851587,4487416329648218,4491213325063669,4493047653554173,4497108801816010,4503560524390685,4505039949877502,4514805314363434,4536842772674504,4556510901039833,4575403317134912,4592200188563858,4601869033857310,4610031751653292,4618259959292216,4627536713075473,4628641892367445,4633519059669963,4657815314704334,4662180672884852,4665919527337388,4687322226819303,4691844768752312,4707135680049989,4712525914876856,4720547794135633,4740606227775619,4746680609181901,4779520324623562,4791100834322356,4806377157351085,4808372750546970,4826808701416396,4832925031047563,4843277060090928,4862243495142622,4868706396089897,4870247565916556,4876212301431017,4896518867008986,4896692681487429,4909543183678547,4937682630931618,4938956956887018,4966196725954876,4969062413514851,4973600900081374,4999513822868305,5002893845020141,5008622662773184,5014970956568907,5035054008602857,5035384841398862,5039086438104691,5040652225496901,5046016737108994,5059014222849688,5066823009561671,5072430374944592,5126548277288607,5130609108534278,5141019017613116,5145220929445971,5157914165172243,5164042673067139,5164323192060956,5227087582431333,5239058059519899,5241552288110762,5242566539224285,5260929013588652,5264673038204573,5264846847738316,5291104629254226,5305867165279633,5309458727013297,5322844014918400,5323108914962638,5329144744675176,5363349599556055,5366817078493635,5369180741303960,5372222568751999,5372725457533622,5376760723403211,5389389943488643,5398148075045622,5407440001247943,5407552681509066,5425215886017472,5433029476329892,5435967788585089,5453070200302066,5505363397956209,5506656183795141,5520041430372158,5528030877128164,5529328537858607,5538301071879598,5573247074136840,5601063311481394,5612211670828121,5626811993541640,5671110634252314,5671512692298857,5676368648166471,5679704419692665,5697117527796873,5710885073252421,5722042629401126,5725434481579237,5727852290091844,5746609611777297,5759784984111085,5766468332812783,5767658702736892,5769340991360522,5817916655292234,5823448049010532,5831664213962049,5839479091276720,5847620778187862,5848083119581748,5875458157591724,5880371325296374,5887738774301190,5889494649222929,5895198579157112,5901426220955008,5923404444685793,5930424349546694,5954090060404027,5965424605208959,5976106562523974,5976468751599547,5985405131720490,6011246049659946,6019388992671725,6022142115842949,6022687599667645,6026176200521087,6026449947119821,6026852008290352,6028359150627379,6037198658977053,6049559969468114,6071357946907426,6074536323309542,6075861603095684,6086651545690621,6088147986793911,6091451791350760,6097972992225898,6102520222366377,6106185662845781,6150209857537398,6151086995205809,6179061741208419,6186814642225785,6207677944023553,6214079985740421,6222242045894974,6259769482143501,6292912013848109,6335759087357123,6338313411351946,6364174128810209,6376235120055415,6379797414315714,6392936323112603,6393281611893662,6425613172818892,6427037182426097,6435673284042797,6440155758298635,6470065458738460,6473238460278105,6498233125545854,6512447296999122,6519926591968090,6537194036770963,6539403501517441,6546843392577889,6549806578906724,6552086386539938,6562753360182634,6580454437794215,6583294028078651,6599805048819771,6611303480414165,6611480286587626,6613303542954184,6619465618743996,6620952231633165,6628364051881565,6637782452830313,6639366508325377,6643384411215171,6651291777536757,6659026904794676,6660386432638050,6706708532025137,6712233176289295,6715034077809162,6718106064823151,6718466194045753,6737012223038140,6737448552020375,6749465141048682,6755892952873165,6759712360779738,6766665767003108,6769349598161957,6779991167037331,6805981002654460,6822678972563408,6824212178067303,6825656208355223,6828108881031930,6838666110003247,6843872818542831,6858200159969968,6862403913830441,6896605336469152,6907696209350866,6940430139137108,6947378732749248,6953405360148223,6965934499254406,6978237167340371,6979116642430112,7000204500640703,7013379106513098,7015548089157306,7017340939890130,7022751771973604,7032077373720710,7057472610075201,7069751745272274,7071427481472917,7076942582186008,7083956725625194,7088117728748372,7110694400947463,7119947527921472,7162448687990059,7164239598107818,7167755656765912,7174842826868336,7175438170580579,7194659618833292,7199131481548540,7201097914134797,7249458427670294,7258600960921299,7260262458870231,7272892562787288,7291277476900411,7298751506522380,7320645384455102,7347510371391158,7349195911222994,7354593750963108,7379412287438610,7396028447991993,7407333626649072,7417992548609519,7436769564720525,7455509939494098,7466612719167435,7473444312733619,7473921057215867,7481010725288922,7494555361323136,7511820031110502,7539834933699319,7541028416083622,7548889429298262,7550654989959541,7567961569136947,7568482345047421,7581552356535593,7583581654211041,7587845721090877,7628037567441185,7633586298526539,7639776715226470,7676516649995125,7678642663878476,7694954607256167,7696205219169608,7701103797021301,7717485469267928,7734382322255612,7742828147953131,7754433671355044,7755280476043760,7769918479127653,7792923160082860,7808336679035965,7851406039611761,7854329110050349,7864747279450982,7874975130558924,7878651583399616,7905885790085843,7907390132227916,7910193473416632,7914616292135963,7936567617966657,7942757952243526,7948051059967055,7955099191740661,7958261337857520,7959981873817532,7961073053238888,7969625643027082,7971181815184894,7974554831275928,7974726923655088,7979097331324691,8004922364117543,8007706307006295,8012303905212338,8013295641540623,8066164953236699,8084246722538173,8100240629932439,8100951490474943,8106199043893306,8107552539527840,8114588959043938,8132109950950425,8139068209349164,8139545132154805,8142237913270447,8158329919102245,8167114814444647,8173611854335395,8189714298100356,8195428097158099,8201564519665139,8220965830202883,8235606574847074,8246150433354016,8270274863750807,8284482132698750,8285985183630643,8292533320746682,8295908119012735,8296380863267928,8323255543543643,8348711822077161,8358705549655169,8365548267150232,8374906480797651,8375721515813351,8379632648873725,8385947080398988,8391743136854975,8418406057930748,8428572383618079,8432685689145543,8433517681349491,8438148178407681,8444919949804523,8446420140769624,8455107906508370,8475628073984291,8508022924218892,8510397511704993,8514237274685749,8514396962844012,8531363280764652,8537391416444784,8549235306484624,8580449592169756,8598393465824567,8604034180239741,8611570380523396,8612678693937747,8620109638775764,8642635839487892,8646344667334136,8646717894657892,8654788340452875,8658449886140254,8663165400137631,8722442451761544,8761729420462614,8799647367219413,8801831997666499,8808453998469722,8818307640518629,8818834451063303,8823766790135813,8833290176775915,8841993926981016,8854459467830464,8861087128751225,8862728500798303,8880985466378305,8881613336282482,8925251195661314,8953704880557779,8985742174669720,8991621885018154,8992026721663720,8997168720804990,9016856686855149,9047424902876796,9050560415698716,9050753309887512,9061429496148816,9069273612029890,9080265743048056,9086071373806583,9095588303764494,9096970962257714,9100770128872926,9101701347271054,9112256340668504,9116617466944120,9136292702218238,9137011986680851,9143781826339633,9153600809394255,9199889121013565,9201721144829026,9209023421216735,9210503570621360,9214460083393763,9221754104943871,9224413178285270,9234547745090520,9243132989358081,9248116050684550,9264703712467273,9280659174889120,9287946744159071,9292140972727810,9327466025742305,9332051044758209,9363145119903540,9364400993642126,9391130196045242,9395408164721980,9399740447176971,9424276250634510,9452221926640102,9456478187020366,9464543357189079,9465600149658791,9491049221619505,9497191629902225,9522170868080528,9533727201306437,9547661663619976,9549245598585404,9569979472692505,9573473519866808,9591009354744385,9617917621932747,9648236513062897,9658329412127906,9659598422876240,9682812252677365,9686371108983926,9728579985303380,9729699868914242,9740105842067989,9745445361656865,9745681179963804,9754643298197035,9760567102397958,9771710857253491,9789917483611116,9793359197214519,9793503159465365,9794390650710491,9798125839578328,9804745463385914,9808752020775346,9833005702827487,9839044066017243,9845432123717545,9885127473462061,9894715619458167,9905070536524380,9908520702066001,9910780473079700,9917733811149031,9923651765289083,9927388605300676,9938702047866083,9944740018953312,9960830080447414,9961286044348072,9976639243429026,9991256507745359,10022572634244940,10025580401289134,10026906747582208,10027635205445506,10038347159492151,10039604072460842,10043026849402656,10047276489450169,10053877592451757,10063475808610984,10076240276956154,10101156164378146,10106363481576762,10127734783681212,10136370121235304,10146252926027242,10157507993657925,10170937864363577,10176151554703241,10183534012238724,10192998900924592,10197340560318368,10237743941972439,10254073288131603,10256735847958754,10280479610352360,10285508802345218,10291262449582857,10295721152014702,10310390505199421,10313473991212838,10342547269042481,10412937296046572,10455518715262653,10462990290852442,10464315303440262,10471197443105163,10471235757467544,10490073561279689,10492924328467233,10507681094428554,10516717765537166,10540330407067241,10541927502626015,10556872609087686,10565575287036146,10567098846120860,10569991378070155,10592858161029490,10605555894932784,10617796130250535,10635845120674097,10650787780040606,10652474248971788,10675091890076247,10677639062140656,10686876585139260,10687769709926863,10695637927115727,10696366138304664,10729506666892986,10739953219495080,10741375692368836,10772979860590118,10787932183480326,10797834288911940,10797954026094675,10799912491676373,10801123407332591,10809957352958534,10811859724613778,10814855039053275,10823710184242031,10835831635651692,10838780402343038,10839952733006938,10844349943321077,10845732416899091,10850918643526601,10860256158679413,10863856633990962,10869462262089206,10888194791081552,10892912626153721,10898443770405678,10910130929886125,10939089240676095,10957695103575326,10960164352659702,10961938624164593,10962367887299917,10963209129287673,10972178204371166,10980943184449618,11004523743982583,11021436183624407,11027930223905649,11051706685260634,11068765668137027,11117926166287355,11130512060712491,11137064269321064,11137491086953899,11142860693680581,11161848365790340,11164864927470696,11166391153310384,11170503851782616,11174109853098886,11179331841591506,11183033087165489,11194665067070355,11213473653620578,11238383978151703,11241048792018996,11254802265046802,11264184666096697,11268594471164860,11273858751227255,11285014483956224,11306227409946359,11316029880189806,11317018836436108,11326888850753433,11332371567827730,11344860342601282,11363041022575148,11364562992186694,11366255071713968,11378751439993468,11388719560536041,11408349988397772,11415332673174238,11418046330072121,11435535718923282,11449275714355046,11460074843266170,11483399532377237,11495688976808914,11507029069035406,11509653702854252,11515420136228054,11516873482469677,11520503668749817,11525515242618518,11557753508725419,11561604977704834,11569292413360517,11589295715309246,11631902140279157,11654556201194743,11656899418054614,11657294505407742,11661070133557696,11668795792548531,11688426128601404,11692173069223099,11692638821382112,11704142988138877,11730275878132669,11747860305548587,11768902473065614,11809606231785533,11814942688163748,11830275507824233,11832881846320770,11839076547825639,11842516303617037,11848809254066440,11849604090148961,11861036171256724,11862479669320152,11902273608768276,11909927135034753,11915914136058870,11917366131988635,11918146339628393,11952186029640314,11965762970341077,11968852102140818,11988163409658002,11991640765060726,12003946399173925,12015371223569459,12015699073484173,12026238097639303,12027543541848783,12029927405469169,12030852032077865,12032177915239463,12033394505469741,12041449133719384,12065689743810267,12075464021106187,12097367992437353,12102042122098365,12106676312986046,12119543078536750,12137234780206015,12152142412839865,12157879639736773,12158276228389397,12162749550023120,12165771397045986,12167381433344274,12181596294329824,12203742166109026,12209847266646966,12224653876328040,12229733059110899,12236479214073070,12240425549065773,12252089558841057,12252834455254861,12252921401858245,12263083529856368,12280117660905010,12282073505316488,12291733945277146,12295652397734313,12308950434416884,12315245431394520,12325148768094462,12353815014171672,12409472162226791,12413108559037636,12428072482683163,12435514170613803,12457386755420991,12515317411195819,12523706347945485,12523915038722689,12559341317878562,12582135565969205,12587936565284399,12592521878562533,12606988144137942,12615583150436888,12648682904569149,12656884893522301,12659016821769380,12672378100177709,12672827694409958,12678220421944320,12715313343566187,12719294959376032,12719335619027235,12720282275955265,12735108166808283,12736098435499462,12761734704097496,12765606334499184,12768487626988503,12787159319028422,12795719813049885,12796004399647948,12799644350597798,12825977729217187,12830098688357983,12830625689728250,12834785022889568,12844934606745507,12864892528356214,12879900683153406,12886172805466435,12888336247314715,12900495987025021,12904609340082772,12910186793479215,12913485144602650,12940792596482062,12951190294698176,12959920644213803,12987666390036602,12989467441559491,13017251126115385,13019641576048600,13025171186373006,13037172756924801,13041090869246053,13086846890781615,13102950879868503,13106344354361040,13110764699645099,13124781656132352,13146834909840568,13150475019000506,13165947997663636,13166432994532590,13173193375632369,13189323547733397,13191421827504087,13193440049022055,13195947726278877,13198070394653089,13223167457129300,13224252442883332,13228531581453927,13247204945379143,13258807801452479,13262258424271961,13267705241542163,13278616079036501,13283748455296671,13285230775877616,13285868034600602,13300334671920689,13310292875184935,13329136310276887,13352283337060157,13369169997036466,13377705827703609,13379962140831188,13380311342746500,13382187966941630,13390758145584707,13416266652895919,13419557557650088,13422200559860853,13427443083225106,13439116518710702,13456005147462978,13460243070950619,13461053659060122,13477523913094903,13487344032963920,13507861890928667,13508621577819321,13515438283308158,13521387743539755,13521554576987408,13539882561854500,13542269562596619,13578903302625692,13579583599193286,13581215157158944,13587044091969040,13602252577581628,13603519953862396,13615325004229940,13618894832957770,13650874726182653,13651727413207625,13662960495380203,13673600314291218,13674963480009909,13720620632179448,13735303013062726,13758363653494113,13768307727241862,13773441988162878,13784942424403905,13790029177942415,13790094291403739,13801859760285334,13823682578780211,13837797612780120,13840946933400146,13849721184598166,13858714430263200,13867040888606934,13869330719207858,13869373581337859,13875919313930175,13885363497519254,13899685526884978,13903849933818815,13907306748318011,13908235025197238,13910720876944015,13915193569823349,13923426716034528,13927569063761614,13935851918750000,13944130770252219,13950881517688055,13952439552733635,13955532186501673,13956941976117171,13966505352198009,13971610984999907,13979667301665560,13980401348213351,14024776843639840,14045295365141886,14061987901102577,14073881884275302,14075769365417929,14101845060400112,14117845585934631,14125332980519745,14128142329462641,14141075514030681,14159769218027675,14160023342235796,14165810602836303,14181751660405752,14195237522560027,14196209485287318,14201058898149871,14209101675948535,14231071926066360,14233941946968962,14241316678116937,14284230229638553,14284814018322640,14305677576147150,14305696255635441,14307348878842670,14321113665510751,14340366181037166,14382118569064968,14388001837716830,14391131597662486,14405901660716046,14427191795741247,14431884293379000,14435096137720815,14453306552449138,14460183468613142,14483703679632836,14484162426506072,14501476390289602,14501597818233059,14501833648745811,14511683112043509,14519535484637316,14521142156451097,14531002795413835,14540525969203315,14553035897229090,14563971448853004,14600735852927628,14611838908596828,14619382573883017,14641964844026130,14646545710428465,14652831294709181,14658550015911821,14658660845978751,14663491457747719,14681209102114692,14685647720239038,14723277112189400,14724140557861103,14735542904510591,14735551553794705,14737623329240627,14745232043596101,14752117678987462,14752581133768923,14759374398919542,14763316537025536,14781094775952097,14796832801115371,14796936112218498,14804719699727899,14814157253057913,14815872841991523,14819066206613811,14823404554947143,14829528747684225,14866285111324854,14874993683632673,14883151088403008,14887455070918576,14892361811823494,14910335371645937,14912913132110225,14924238937048081,14929241650755283,14950642543150044,14951063678347538,14975260126884161,15012809225833170,15012809793999981,15018571069035926,15020915825825774,15021404021337883,15022670595110400,15067928338100796,15085336559480488,15086974469237813,15095558052845905,15096856583295205,15104704601738115,15137463620871579,15139231593172071,15139470141320967,15141311654469419,15157297991462867,15165877497218537,15187454908961867,15205333655740165,15206524678220769,15213795744311666,15223458910424702,15231054902970513,15247444976777419,15253979392755088,15261179716215589,15264607147906949,15279764888497900,15289070477931369,15289098424523085,15302475269115685,15329448444911128,15332466801914067,15333746453390031,15334014646205651,15359121081622482,15372450993617149,15386636932060455,15388624896861794,15393703762293723,15396285667061859,15403250655983005,15418940717783981,15433964561817078,15438010039979776,15445262976227614,15455930562511989,15458735994674226,15459676313235252,15474414406371359,15486140333535895,15523643011224095,15530427628910070,15535488185343583,15547815300743142,15558207370718302,15560047986976016,15562655257896949,15572024028401096,15576633429000938,15640243559582304,15652664613796795,15662909040297435,15667815187503376,15674056088265955,15715286415685035,15734854842917459,15735629130746769,15745022742489686,15746397054374347,15753706416522065,15756364032314896,15759715213832065,15770571874905038,15771450504624662,15771565345317449,15774389582251672,15784982662006986,15826693353056782,15832992777484533,15834554452643249,15836567480040877,15846387640967869,15860004612907614,15862052574313643,15872412308796247,15876898540191851,15885096538206697,15886752701317732,15922383621886343,15925989819151637,15934315148588228,15935855575601832,15939241514854343,15947297812421556,15947649106036590,15947943670655095,15950707805014365,15953773242451901,15954192604657090,15975906896379703,15977967341911619,15988446934219592,16007534381370348,16008071982696337,16010080349394241,16019379996180302,16041693970346251,16050169327466182,16051144134269849,16070351779538067,16082115874549525,16102988424614242,16105825060325355,16108247668377083,16110185924188167,16120433790384445,16123644917797893,16124055380239661,16139682163688113,16148870984659849,16148983638429813,16179671651212829,16203460357263003,16204526262165688,16204949145766686,16241217195123001,16255807762483898,16263861205376525,16266403851500931,16274843164243675,16283022478827615,16288765059208216,16298269409548231,16310527157714076,16347427740561385,16348527080745850,16363236815965139,16378418422888053,16385922065480711,16386734109401830,16401309614419580,16420118853248593,16439064672105252,16444981631010484,16452680020690587,16455001541854420,16455790056067069,16458731130673862,16466030540364453,16483603638815995,16520653489349458,16521053145712040,16541559809954869,16549645496126569,16555041588219694,16556178595846804,16569131959084059,16584068045240185,16587611662622512,16609771886989064,16642535292639438,16649914812980760,16663626685346575,16671959185780948,16675862044640924,16725186034571857,16725455429885896,16731497228216796,16744527283341947,16746698059193344,16751770083916499,16752540407374498,16766466364238840,16781512709609102,16813158866111289,16848902773147957,16893611322805776,16894702249688277,16907053630551983,16918949101677550,16919206140826699,16927217964664091,16940864515999735,16970582910707933,16981453778803121,16983386100118313,16992080913752201,17008651624357706,17012650641208674,17015626300316320,17058734385396248,17094714926844016,17102048353802941,17119147153394225,17165663075547765,17168990396387808,17177995785851436,17192004769472906,17226057752082506,17230160752007703,17251290606896319,17255628627059519,17259460293245834,17261291602429545,17264444797119395,17267303429682730,17276137294424252,17297590713909193,17297622966744962,17304973793899500,17308573609293684,17331691594984106,17368721199906565,17370692363109638,17375347669952355,17384846996643858,17385438621225034,17393122952695385,17401580047518152,17403708711522033,17407737475614609,17412590309079896,17463298767015505,17477653216986609,17478595798135648,17482690806609130,17487690591900499,17492239615344727,17499428207086686,17513023676325158,17517878080826035,17518382048724585,17531301554404446,17532123431784764,17537235202003650,17543036676175723,17547545276513648,17551850598397102,17552727063175508,17571693025099870,17576884999794912,17580278606631657,17599142813424731,17605574666239988,17607361867830427,17618374994448394,17625656721813080,17641208765354924,17666295601206345,17676134188980509,17686356143845167,17694457529115978,17701836378206948,17713251693643593,17727864470254841,17745611798586794,17751640855559608,17755611883871708,17755994503641789,17757420725169406,17773170776249049,17780217609224267,17786870875243625,17792258928649416,17794266624700006,17816975800032715,17823693682349900,17838668413395798,17872295918517872,17876042049961247,17878900310275595,17884658064199580,17887430722263974,17893775077652078,17906199187845137,17912620382448881,17914094645083489,17926317445804624,17927386865920973,17936806132878791,17948722706950552,17950394714083630,17957430043936373,17957766991025570,17973954406588157,17981109220320748,17985536773394222,17991723913595561,17994050143919349,17999886330183165,18006067580170253,18015394845354757,18018354938006966,18023736156722913,18030285508849769,18032380366607200,18053110620914874,18054106463885348,18055637741519432,18062025642286974,18064910909128101,18068076157304068,18068456068882015,18081200901569883,18097697732126918,18142823591161549,18157090443766400,18165366617626746,18171825130045475,18182572711257795,18199348561681442,18199940176448223,18208382627853317,18210472497434014,18229683164086399,18236982822958815,18243995057427405,18268933871999774,18277497206833650,18298789242826634,18299236052051415,18308783355641550,18323932752353633,18334037508831312,18345286825512630,18346225455350760,18361317522385086,18368801383412228,18374609286574885,18404707397839425,18420663232850685,18425471188893674,18435764882631694,18436653476822659],"md5sum":"fb2c4c8861753dbc497d72d0e465465a","molecule":"dna"}],"version":0.4}]
\ No newline at end of file
diff --git a/tests/test-data/gather/all-picklist.csv b/tests/test-data/gather/all-picklist.csv
new file mode 100644
index 0000000000..b9ebbaa522
--- /dev/null
+++ b/tests/test-data/gather/all-picklist.csv
@@ -0,0 +1,37 @@
+signature_file,md5,ksize,moltype,num,scaled,n_hashes,seed,with_abundance,name,filename,license
+GCF_000006945.2_ASM694v2_genomic.fna.gz.sig,323c1a1712b0949268dd6fb93be63ae2,11,DNA,0,10000,150,42,0,"NC_003197.2 Salmonella enterica subsp. enterica serovar Typhimurium str. LT2, complete genome",../fasta/GCF_000006945.2_ASM694v2_genomic.fna.gz,CC0
+GCF_000006945.2_ASM694v2_genomic.fna.gz.sig,263c2de20b597d6e33b81ec91d8672b5,21,DNA,0,10000,485,42,0,"NC_003197.2 Salmonella enterica subsp. enterica serovar Typhimurium str. LT2, complete genome",../fasta/GCF_000006945.2_ASM694v2_genomic.fna.gz,CC0
+GCF_000006945.2_ASM694v2_genomic.fna.gz.sig,dc12a6d8fd63122aa68f78facf9bed94,31,DNA,0,10000,490,42,0,"NC_003197.2 Salmonella enterica subsp. enterica serovar Typhimurium str. LT2, complete genome",../fasta/GCF_000006945.2_ASM694v2_genomic.fna.gz,CC0
+GCF_000007545.1_ASM754v1_genomic.fna.gz.sig,df24140b1c6cad16b30abeaf03019eb5,11,DNA,0,10000,158,42,0,"NC_004631.1 Salmonella enterica subsp. enterica serovar Typhi Ty2, complete genome",../fasta/GCF_000007545.1_ASM754v1_genomic.fna.gz,CC0
+GCF_000007545.1_ASM754v1_genomic.fna.gz.sig,fd958e3b5649bc03890517ff239970ea,21,DNA,0,10000,445,42,0,"NC_004631.1 Salmonella enterica subsp. enterica serovar Typhi Ty2, complete genome",../fasta/GCF_000007545.1_ASM754v1_genomic.fna.gz,CC0
+GCF_000007545.1_ASM754v1_genomic.fna.gz.sig,8c22dff88a2239607762da00f7fd1725,31,DNA,0,10000,472,42,0,"NC_004631.1 Salmonella enterica subsp. enterica serovar Typhi Ty2, complete genome",../fasta/GCF_000007545.1_ASM754v1_genomic.fna.gz,CC0
+GCF_000008105.1_ASM810v1_genomic.fna.gz.sig,9db6efc92a041e11713ccfa8597edae5,11,DNA,0,10000,150,42,0,"NC_006905.1 Salmonella enterica subsp. enterica serovar Choleraesuis str. SC-B67, complete genome",../fasta/GCF_000008105.1_ASM810v1_genomic.fna.gz,CC0
+GCF_000008105.1_ASM810v1_genomic.fna.gz.sig,8996699a05d3e5a05fa3fe94bfa41431,21,DNA,0,10000,472,42,0,"NC_006905.1 Salmonella enterica subsp. enterica serovar Choleraesuis str. SC-B67, complete genome",../fasta/GCF_000008105.1_ASM810v1_genomic.fna.gz,CC0
+GCF_000008105.1_ASM810v1_genomic.fna.gz.sig,85c3aeec6457c0b1d210472ddeb67714,31,DNA,0,10000,468,42,0,"NC_006905.1 Salmonella enterica subsp. enterica serovar Choleraesuis str. SC-B67, complete genome",../fasta/GCF_000008105.1_ASM810v1_genomic.fna.gz,CC0
+GCF_000008545.1_ASM854v1_genomic.fna.gz.sig,74b928d3db1f7f033c0dcca6c6e52aea,11,DNA,0,10000,84,42,0,"NC_000853.1 Thermotoga maritima MSB8 chromosome, complete genome",../fasta/GCF_000008545.1_ASM854v1_genomic.fna.gz,CC0
+GCF_000008545.1_ASM854v1_genomic.fna.gz.sig,ba9947e078cab29e20bc7d31bc1b9f0d,21,DNA,0,10000,192,42,0,"NC_000853.1 Thermotoga maritima MSB8 chromosome, complete genome",../fasta/GCF_000008545.1_ASM854v1_genomic.fna.gz,CC0
+GCF_000008545.1_ASM854v1_genomic.fna.gz.sig,1bfe96d76ec9cdb60779a1a9223c424e,31,DNA,0,10000,187,42,0,"NC_000853.1 Thermotoga maritima MSB8 chromosome, complete genome",../fasta/GCF_000008545.1_ASM854v1_genomic.fna.gz,CC0
+GCF_000009085.1_ASM908v1_genomic.fna.gz.sig,752280e9969ce750e2c80477c1b7b0e7,11,DNA,0,10000,61,42,0,"NC_002163.1 Campylobacter jejuni subsp. jejuni NCTC 11168 = ATCC 700819 chromosome, complete genome",../fasta/GCF_000009085.1_ASM908v1_genomic.fna.gz,CC0
+GCF_000009085.1_ASM908v1_genomic.fna.gz.sig,eba0eb3ce984cc53c36f134a752c52c5,21,DNA,0,10000,157,42,0,"NC_002163.1 Campylobacter jejuni subsp. jejuni NCTC 11168 = ATCC 700819 chromosome, complete genome",../fasta/GCF_000009085.1_ASM908v1_genomic.fna.gz,CC0
+GCF_000009085.1_ASM908v1_genomic.fna.gz.sig,953156e9f4da8cf22e7e0b4b88261fae,31,DNA,0,10000,167,42,0,"NC_002163.1 Campylobacter jejuni subsp. jejuni NCTC 11168 = ATCC 700819 chromosome, complete genome",../fasta/GCF_000009085.1_ASM908v1_genomic.fna.gz,CC0
+GCF_000009505.1_ASM950v1_genomic.fna.gz.sig,0f35aeadda1532ed450bd6de1e73545d,11,DNA,0,10000,148,42,0,NC_011294.1 Salmonella enterica subsp. enterica serovar Enteritidis str. P125109 complete genome,../fasta/GCF_000009505.1_ASM950v1_genomic.fna.gz,CC0
+GCF_000009505.1_ASM950v1_genomic.fna.gz.sig,405ae3300f28ca5fe5c223cbf7e28734,21,DNA,0,10000,471,42,0,NC_011294.1 Salmonella enterica subsp. enterica serovar Enteritidis str. P125109 complete genome,../fasta/GCF_000009505.1_ASM950v1_genomic.fna.gz,CC0
+GCF_000009505.1_ASM950v1_genomic.fna.gz.sig,0842f7edb426fc4fa2701c107e678279,31,DNA,0,10000,461,42,0,NC_011294.1 Salmonella enterica subsp. enterica serovar Enteritidis str. P125109 complete genome,../fasta/GCF_000009505.1_ASM950v1_genomic.fna.gz,CC0
+GCF_000009525.1_ASM952v1_genomic.fna.gz.sig,d883538a0c983a863fa4b6e5fcd19612,11,DNA,0,10000,148,42,0,NC_011274.1 Salmonella enterica subsp. enterica serovar Gallinarum str. 287/91 complete genome,../fasta/GCF_000009525.1_ASM952v1_genomic.fna.gz,CC0
+GCF_000009525.1_ASM952v1_genomic.fna.gz.sig,9133bd71b86628b38c665ab7e5eb8712,21,DNA,0,10000,457,42,0,NC_011274.1 Salmonella enterica subsp. enterica serovar Gallinarum str. 287/91 complete genome,../fasta/GCF_000009525.1_ASM952v1_genomic.fna.gz,CC0
+GCF_000009525.1_ASM952v1_genomic.fna.gz.sig,afadabf39aec247929e84a29fd797117,31,DNA,0,10000,461,42,0,NC_011274.1 Salmonella enterica subsp. enterica serovar Gallinarum str. 287/91 complete genome,../fasta/GCF_000009525.1_ASM952v1_genomic.fna.gz,CC0
+GCF_000011885.1_ASM1188v1_genomic.fna.gz.sig,feef9e4d39fecd3d9292b76c0cc72b81,11,DNA,0,10000,155,42,0,"NC_006511.1 Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150, complete genome",../fasta/GCF_000011885.1_ASM1188v1_genomic.fna.gz,CC0
+GCF_000011885.1_ASM1188v1_genomic.fna.gz.sig,cc80cb247b195ca3dfa0756257d882b6,21,DNA,0,10000,427,42,0,"NC_006511.1 Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150, complete genome",../fasta/GCF_000011885.1_ASM1188v1_genomic.fna.gz,CC0
+GCF_000011885.1_ASM1188v1_genomic.fna.gz.sig,bb365606acbf08d183399f139af80c32,31,DNA,0,10000,459,42,0,"NC_006511.1 Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150, complete genome",../fasta/GCF_000011885.1_ASM1188v1_genomic.fna.gz,CC0
+GCF_000016045.1_ASM1604v1_genomic.fna.gz.sig,4cec832176c4831239faed42c0616ef4,11,DNA,0,10000,155,42,0,"NC_011080.1 Salmonella enterica subsp. enterica serovar Newport str. SL254, complete genome",../fasta/GCF_000016045.1_ASM1604v1_genomic.fna.gz,CC0
+GCF_000016045.1_ASM1604v1_genomic.fna.gz.sig,43a9d80a4cd995779c7538a32088dd0e,21,DNA,0,10000,469,42,0,"NC_011080.1 Salmonella enterica subsp. enterica serovar Newport str. SL254, complete genome",../fasta/GCF_000016045.1_ASM1604v1_genomic.fna.gz,CC0
+GCF_000016045.1_ASM1604v1_genomic.fna.gz.sig,d0cfbe22579f98fd5de2d41203589964,31,DNA,0,10000,480,42,0,"NC_011080.1 Salmonella enterica subsp. enterica serovar Newport str. SL254, complete genome",../fasta/GCF_000016045.1_ASM1604v1_genomic.fna.gz,CC0
+GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig,328f7b0643bdb6c76135292b5afc8fa7,11,DNA,0,10000,82,42,0,"NC_009486.1 Thermotoga petrophila RKU-1, complete genome",../fasta/GCF_000016785.1_ASM1678v1_genomic.fna.gz,CC0
+GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig,a77789e831fcd2436c3b9e4e22fb173e,21,DNA,0,10000,190,42,0,"NC_009486.1 Thermotoga petrophila RKU-1, complete genome",../fasta/GCF_000016785.1_ASM1678v1_genomic.fna.gz,CC0
+GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig,50d8efd580ff2000cb38d1f8cc9cf9b4,31,DNA,0,10000,185,42,0,"NC_009486.1 Thermotoga petrophila RKU-1, complete genome",../fasta/GCF_000016785.1_ASM1678v1_genomic.fna.gz,CC0
+GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig,989f88420b193ef39c4dbe3b268e0049,11,DNA,0,10000,90,42,0,"NC_011978.1 Thermotoga neapolitana DSM 4359, complete genome",../fasta/GCF_000018945.1_ASM1894v1_genomic.fna.gz,CC0
+GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig,bebcd0dcc0ed3b120ad16c4e15805370,21,DNA,0,10000,188,42,0,"NC_011978.1 Thermotoga neapolitana DSM 4359, complete genome",../fasta/GCF_000018945.1_ASM1894v1_genomic.fna.gz,CC0
+GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig,4289d4241be8573145282352215ca3c4,31,DNA,0,10000,198,42,0,"NC_011978.1 Thermotoga neapolitana DSM 4359, complete genome",../fasta/GCF_000018945.1_ASM1894v1_genomic.fna.gz,CC0
+GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig,40df36a7eb411022be4b1d6a7af05496,11,DNA,0,10000,161,42,0,"NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. CT18, complete genome",../fasta/GCF_000195995.1_ASM19599v1_genomic.fna.gz,CC0
+GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig,ffa92983f7e67454c407499cbfbabf88,21,DNA,0,10000,487,42,0,"NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. CT18, complete genome",../fasta/GCF_000195995.1_ASM19599v1_genomic.fna.gz,CC0
+GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig,cb26db5716a213c9a6614021e7176c1d,31,DNA,0,10000,512,42,0,"NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. CT18, complete genome",../fasta/GCF_000195995.1_ASM19599v1_genomic.fna.gz,CC0
diff --git a/tests/test-data/gather/campy-picklist.csv b/tests/test-data/gather/campy-picklist.csv
new file mode 100644
index 0000000000..5490c2de61
--- /dev/null
+++ b/tests/test-data/gather/campy-picklist.csv
@@ -0,0 +1,4 @@
+signature_file,md5,ksize,moltype,num,scaled,n_hashes,seed,with_abundance,name,filename,license
+GCF_000009085.1_ASM908v1_genomic.fna.gz.sig,752280e9969ce750e2c80477c1b7b0e7,11,DNA,0,10000,61,42,0,"NC_002163.1 Campylobacter jejuni subsp. jejuni NCTC 11168 = ATCC 700819 chromosome, complete genome",../fasta/GCF_000009085.1_ASM908v1_genomic.fna.gz,CC0
+GCF_000009085.1_ASM908v1_genomic.fna.gz.sig,eba0eb3ce984cc53c36f134a752c52c5,21,DNA,0,10000,157,42,0,"NC_002163.1 Campylobacter jejuni subsp. jejuni NCTC 11168 = ATCC 700819 chromosome, complete genome",../fasta/GCF_000009085.1_ASM908v1_genomic.fna.gz,CC0
+GCF_000009085.1_ASM908v1_genomic.fna.gz.sig,953156e9f4da8cf22e7e0b4b88261fae,31,DNA,0,10000,167,42,0,"NC_002163.1 Campylobacter jejuni subsp. jejuni NCTC 11168 = ATCC 700819 chromosome, complete genome",../fasta/GCF_000009085.1_ASM908v1_genomic.fna.gz,CC0
diff --git a/tests/test-data/gather/salmonella-picklist.csv b/tests/test-data/gather/salmonella-picklist.csv
new file mode 100644
index 0000000000..ae048b99d4
--- /dev/null
+++ b/tests/test-data/gather/salmonella-picklist.csv
@@ -0,0 +1,25 @@
+signature_file,md5,ksize,moltype,num,scaled,n_hashes,seed,with_abundance,name,filename,license
+GCF_000006945.2_ASM694v2_genomic.fna.gz.sig,323c1a1712b0949268dd6fb93be63ae2,11,DNA,0,10000,150,42,0,"NC_003197.2 Salmonella enterica subsp. enterica serovar Typhimurium str. LT2, complete genome",../fasta/GCF_000006945.2_ASM694v2_genomic.fna.gz,CC0
+GCF_000006945.2_ASM694v2_genomic.fna.gz.sig,263c2de20b597d6e33b81ec91d8672b5,21,DNA,0,10000,485,42,0,"NC_003197.2 Salmonella enterica subsp. enterica serovar Typhimurium str. LT2, complete genome",../fasta/GCF_000006945.2_ASM694v2_genomic.fna.gz,CC0
+GCF_000006945.2_ASM694v2_genomic.fna.gz.sig,dc12a6d8fd63122aa68f78facf9bed94,31,DNA,0,10000,490,42,0,"NC_003197.2 Salmonella enterica subsp. enterica serovar Typhimurium str. LT2, complete genome",../fasta/GCF_000006945.2_ASM694v2_genomic.fna.gz,CC0
+GCF_000007545.1_ASM754v1_genomic.fna.gz.sig,df24140b1c6cad16b30abeaf03019eb5,11,DNA,0,10000,158,42,0,"NC_004631.1 Salmonella enterica subsp. enterica serovar Typhi Ty2, complete genome",../fasta/GCF_000007545.1_ASM754v1_genomic.fna.gz,CC0
+GCF_000007545.1_ASM754v1_genomic.fna.gz.sig,fd958e3b5649bc03890517ff239970ea,21,DNA,0,10000,445,42,0,"NC_004631.1 Salmonella enterica subsp. enterica serovar Typhi Ty2, complete genome",../fasta/GCF_000007545.1_ASM754v1_genomic.fna.gz,CC0
+GCF_000007545.1_ASM754v1_genomic.fna.gz.sig,8c22dff88a2239607762da00f7fd1725,31,DNA,0,10000,472,42,0,"NC_004631.1 Salmonella enterica subsp. enterica serovar Typhi Ty2, complete genome",../fasta/GCF_000007545.1_ASM754v1_genomic.fna.gz,CC0
+GCF_000008105.1_ASM810v1_genomic.fna.gz.sig,9db6efc92a041e11713ccfa8597edae5,11,DNA,0,10000,150,42,0,"NC_006905.1 Salmonella enterica subsp. enterica serovar Choleraesuis str. SC-B67, complete genome",../fasta/GCF_000008105.1_ASM810v1_genomic.fna.gz,CC0
+GCF_000008105.1_ASM810v1_genomic.fna.gz.sig,8996699a05d3e5a05fa3fe94bfa41431,21,DNA,0,10000,472,42,0,"NC_006905.1 Salmonella enterica subsp. enterica serovar Choleraesuis str. SC-B67, complete genome",../fasta/GCF_000008105.1_ASM810v1_genomic.fna.gz,CC0
+GCF_000008105.1_ASM810v1_genomic.fna.gz.sig,85c3aeec6457c0b1d210472ddeb67714,31,DNA,0,10000,468,42,0,"NC_006905.1 Salmonella enterica subsp. enterica serovar Choleraesuis str. SC-B67, complete genome",../fasta/GCF_000008105.1_ASM810v1_genomic.fna.gz,CC0
+GCF_000009505.1_ASM950v1_genomic.fna.gz.sig,0f35aeadda1532ed450bd6de1e73545d,11,DNA,0,10000,148,42,0,NC_011294.1 Salmonella enterica subsp. enterica serovar Enteritidis str. P125109 complete genome,../fasta/GCF_000009505.1_ASM950v1_genomic.fna.gz,CC0
+GCF_000009505.1_ASM950v1_genomic.fna.gz.sig,405ae3300f28ca5fe5c223cbf7e28734,21,DNA,0,10000,471,42,0,NC_011294.1 Salmonella enterica subsp. enterica serovar Enteritidis str. P125109 complete genome,../fasta/GCF_000009505.1_ASM950v1_genomic.fna.gz,CC0
+GCF_000009505.1_ASM950v1_genomic.fna.gz.sig,0842f7edb426fc4fa2701c107e678279,31,DNA,0,10000,461,42,0,NC_011294.1 Salmonella enterica subsp. enterica serovar Enteritidis str. P125109 complete genome,../fasta/GCF_000009505.1_ASM950v1_genomic.fna.gz,CC0
+GCF_000009525.1_ASM952v1_genomic.fna.gz.sig,d883538a0c983a863fa4b6e5fcd19612,11,DNA,0,10000,148,42,0,NC_011274.1 Salmonella enterica subsp. enterica serovar Gallinarum str. 287/91 complete genome,../fasta/GCF_000009525.1_ASM952v1_genomic.fna.gz,CC0
+GCF_000009525.1_ASM952v1_genomic.fna.gz.sig,9133bd71b86628b38c665ab7e5eb8712,21,DNA,0,10000,457,42,0,NC_011274.1 Salmonella enterica subsp. enterica serovar Gallinarum str. 287/91 complete genome,../fasta/GCF_000009525.1_ASM952v1_genomic.fna.gz,CC0
+GCF_000009525.1_ASM952v1_genomic.fna.gz.sig,afadabf39aec247929e84a29fd797117,31,DNA,0,10000,461,42,0,NC_011274.1 Salmonella enterica subsp. enterica serovar Gallinarum str. 287/91 complete genome,../fasta/GCF_000009525.1_ASM952v1_genomic.fna.gz,CC0
+GCF_000011885.1_ASM1188v1_genomic.fna.gz.sig,feef9e4d39fecd3d9292b76c0cc72b81,11,DNA,0,10000,155,42,0,"NC_006511.1 Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150, complete genome",../fasta/GCF_000011885.1_ASM1188v1_genomic.fna.gz,CC0
+GCF_000011885.1_ASM1188v1_genomic.fna.gz.sig,cc80cb247b195ca3dfa0756257d882b6,21,DNA,0,10000,427,42,0,"NC_006511.1 Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150, complete genome",../fasta/GCF_000011885.1_ASM1188v1_genomic.fna.gz,CC0
+GCF_000011885.1_ASM1188v1_genomic.fna.gz.sig,bb365606acbf08d183399f139af80c32,31,DNA,0,10000,459,42,0,"NC_006511.1 Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150, complete genome",../fasta/GCF_000011885.1_ASM1188v1_genomic.fna.gz,CC0
+GCF_000016045.1_ASM1604v1_genomic.fna.gz.sig,4cec832176c4831239faed42c0616ef4,11,DNA,0,10000,155,42,0,"NC_011080.1 Salmonella enterica subsp. enterica serovar Newport str. SL254, complete genome",../fasta/GCF_000016045.1_ASM1604v1_genomic.fna.gz,CC0
+GCF_000016045.1_ASM1604v1_genomic.fna.gz.sig,43a9d80a4cd995779c7538a32088dd0e,21,DNA,0,10000,469,42,0,"NC_011080.1 Salmonella enterica subsp. enterica serovar Newport str. SL254, complete genome",../fasta/GCF_000016045.1_ASM1604v1_genomic.fna.gz,CC0
+GCF_000016045.1_ASM1604v1_genomic.fna.gz.sig,d0cfbe22579f98fd5de2d41203589964,31,DNA,0,10000,480,42,0,"NC_011080.1 Salmonella enterica subsp. enterica serovar Newport str. SL254, complete genome",../fasta/GCF_000016045.1_ASM1604v1_genomic.fna.gz,CC0
+GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig,40df36a7eb411022be4b1d6a7af05496,11,DNA,0,10000,161,42,0,"NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. CT18, complete genome",../fasta/GCF_000195995.1_ASM19599v1_genomic.fna.gz,CC0
+GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig,ffa92983f7e67454c407499cbfbabf88,21,DNA,0,10000,487,42,0,"NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. CT18, complete genome",../fasta/GCF_000195995.1_ASM19599v1_genomic.fna.gz,CC0
+GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig,cb26db5716a213c9a6614021e7176c1d,31,DNA,0,10000,512,42,0,"NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. CT18, complete genome",../fasta/GCF_000195995.1_ASM19599v1_genomic.fna.gz,CC0
diff --git a/tests/test-data/gather/thermotoga-picklist.csv b/tests/test-data/gather/thermotoga-picklist.csv
new file mode 100644
index 0000000000..4606e0ca47
--- /dev/null
+++ b/tests/test-data/gather/thermotoga-picklist.csv
@@ -0,0 +1,10 @@
+signature_file,md5,ksize,moltype,num,scaled,n_hashes,seed,with_abundance,name,filename,license
+GCF_000008545.1_ASM854v1_genomic.fna.gz.sig,74b928d3db1f7f033c0dcca6c6e52aea,11,DNA,0,10000,84,42,0,"NC_000853.1 Thermotoga maritima MSB8 chromosome, complete genome",../fasta/GCF_000008545.1_ASM854v1_genomic.fna.gz,CC0
+GCF_000008545.1_ASM854v1_genomic.fna.gz.sig,ba9947e078cab29e20bc7d31bc1b9f0d,21,DNA,0,10000,192,42,0,"NC_000853.1 Thermotoga maritima MSB8 chromosome, complete genome",../fasta/GCF_000008545.1_ASM854v1_genomic.fna.gz,CC0
+GCF_000008545.1_ASM854v1_genomic.fna.gz.sig,1bfe96d76ec9cdb60779a1a9223c424e,31,DNA,0,10000,187,42,0,"NC_000853.1 Thermotoga maritima MSB8 chromosome, complete genome",../fasta/GCF_000008545.1_ASM854v1_genomic.fna.gz,CC0
+GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig,328f7b0643bdb6c76135292b5afc8fa7,11,DNA,0,10000,82,42,0,"NC_009486.1 Thermotoga petrophila RKU-1, complete genome",../fasta/GCF_000016785.1_ASM1678v1_genomic.fna.gz,CC0
+GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig,a77789e831fcd2436c3b9e4e22fb173e,21,DNA,0,10000,190,42,0,"NC_009486.1 Thermotoga petrophila RKU-1, complete genome",../fasta/GCF_000016785.1_ASM1678v1_genomic.fna.gz,CC0
+GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig,50d8efd580ff2000cb38d1f8cc9cf9b4,31,DNA,0,10000,185,42,0,"NC_009486.1 Thermotoga petrophila RKU-1, complete genome",../fasta/GCF_000016785.1_ASM1678v1_genomic.fna.gz,CC0
+GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig,989f88420b193ef39c4dbe3b268e0049,11,DNA,0,10000,90,42,0,"NC_011978.1 Thermotoga neapolitana DSM 4359, complete genome",../fasta/GCF_000018945.1_ASM1894v1_genomic.fna.gz,CC0
+GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig,bebcd0dcc0ed3b120ad16c4e15805370,21,DNA,0,10000,188,42,0,"NC_011978.1 Thermotoga neapolitana DSM 4359, complete genome",../fasta/GCF_000018945.1_ASM1894v1_genomic.fna.gz,CC0
+GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig,4289d4241be8573145282352215ca3c4,31,DNA,0,10000,198,42,0,"NC_011978.1 Thermotoga neapolitana DSM 4359, complete genome",../fasta/GCF_000018945.1_ASM1894v1_genomic.fna.gz,CC0
diff --git a/tests/test-data/prot/all.zip b/tests/test-data/prot/all.zip
index e7a34d85f8..59603e00e4 100644
Binary files a/tests/test-data/prot/all.zip and b/tests/test-data/prot/all.zip differ
diff --git a/tests/test-data/prot/dayhoff.zip b/tests/test-data/prot/dayhoff.zip
index e9aedbcaf7..0bf619a136 100644
Binary files a/tests/test-data/prot/dayhoff.zip and b/tests/test-data/prot/dayhoff.zip differ
diff --git a/tests/test-data/prot/hp.zip b/tests/test-data/prot/hp.zip
index 5779824145..c1963c2424 100644
Binary files a/tests/test-data/prot/hp.zip and b/tests/test-data/prot/hp.zip differ
diff --git a/tests/test-data/prot/protein.zip b/tests/test-data/prot/protein.zip
index 3103c0217c..9195fdc763 100644
Binary files a/tests/test-data/prot/protein.zip and b/tests/test-data/prot/protein.zip differ
diff --git a/tests/test-data/tax/47+63_x_gtdb-rs202.gather.csv b/tests/test-data/tax/47+63_x_gtdb-rs202.gather.csv
new file mode 100644
index 0000000000..df9c2a14f6
--- /dev/null
+++ b/tests/test-data/tax/47+63_x_gtdb-rs202.gather.csv
@@ -0,0 +1,3 @@
+intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_filename,query_name,query_md5,query_bp
+5238000,0.6642150646715699,1.0,0.6642150646715699,0.6642150646715699,,,,"GCF_000021665.1 Shewanella baltica OS223 strain=OS223, ASM2166v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,38729c6374925585db28916b82a6f513,1.0,5238000,0,2648000,,47+63,491c0a81,7886000
+5177000,0.6564798376870403,0.5114931427467645,0.3357849353284301,0.3357849353284301,,,,"GCF_000017325.1 Shewanella baltica OS185 strain=OS185, ASM1732v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,09a08691ce52952152f0e866a59f6261,1.0,2648000,1,0,,47+63,491c0a81,7886000
diff --git a/tests/test-data/tax/bacteria_refseq_lineage.csv b/tests/test-data/tax/bacteria_refseq_lineage.csv
new file mode 100644
index 0000000000..a4dccc4318
--- /dev/null
+++ b/tests/test-data/tax/bacteria_refseq_lineage.csv
@@ -0,0 +1,5 @@
+accession,taxid,superkingdom,phylum,class,order,family,genus,species
+GCF_001881345,562,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli,
+GCF_009494285,165179,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Prevotellaceae,Prevotella,Prevotella copri,
+GCF_013368705,821,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,Bacteroides vulgatus,
+GCF_003471795,165179,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Prevotellaceae,Prevotella,Prevotella copri,
diff --git a/tests/test-data/tax/protozoa_genbank_lineage.csv b/tests/test-data/tax/protozoa_genbank_lineage.csv
new file mode 100644
index 0000000000..d36340e7f8
--- /dev/null
+++ b/tests/test-data/tax/protozoa_genbank_lineage.csv
@@ -0,0 +1,3 @@
+ident,taxid,superkingdom,phylum,class,order,family,genus,species
+GCA_002754635,5855,Eukaryota,Apicomplexa,Aconoidasida,Haemosporida,Plasmodiidae,Plasmodium,Plasmodium vivax,
+GCA_000256725,1130821,Eukaryota,Apicomplexa,Conoidasida,Eucoccidiorida,Sarcocystidae,Toxoplasma,Toxoplasma gondii,Toxoplasma gondii TgCatPRC2
diff --git a/tests/test-data/tax/test-missing-ranks.taxonomy.csv b/tests/test-data/tax/test-missing-ranks.taxonomy.csv
new file mode 100644
index 0000000000..3b27816704
--- /dev/null
+++ b/tests/test-data/tax/test-missing-ranks.taxonomy.csv
@@ -0,0 +1,7 @@
+ident,superkingdom,phylum,class,order,a,b,c
+GCF_001881345.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Enterobacteriaceae,g__Escherichia,s__Escherichia coli
+GCF_009494285.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri
+GCF_013368705.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Phocaeicola,s__Phocaeicola vulgatus
+GCF_003471795.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri
+GCF_000017325.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Shewanellaceae,g__Shewanella,s__Shewanella baltica
+GCF_000021665.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Shewanellaceae,g__Shewanella,s__Shewanella baltica
diff --git a/tests/test-data/tax/test-strain.taxonomy.csv b/tests/test-data/tax/test-strain.taxonomy.csv
new file mode 100644
index 0000000000..b1ae095b02
--- /dev/null
+++ b/tests/test-data/tax/test-strain.taxonomy.csv
@@ -0,0 +1,7 @@
+ident,superkingdom,phylum,class,order,family,genus,species,strain
+GCF_001881345.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Enterobacteriaceae,g__Escherichia,s__Escherichia coli,1
+GCF_009494285.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri,2
+GCF_013368705.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Phocaeicola,s__Phocaeicola vulgatus,3
+GCF_003471795.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri,4
+GCF_000017325.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Shewanellaceae,g__Shewanella,s__Shewanella baltica,5
+GCF_000021665.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Shewanellaceae,g__Shewanella,s__Shewanella baltica,6
diff --git a/tests/test-data/tax/test.taxonomy.csv b/tests/test-data/tax/test.taxonomy.csv
new file mode 100644
index 0000000000..63c52eea92
--- /dev/null
+++ b/tests/test-data/tax/test.taxonomy.csv
@@ -0,0 +1,7 @@
+ident,superkingdom,phylum,class,order,family,genus,species
+GCF_001881345.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Enterobacteriaceae,g__Escherichia,s__Escherichia coli
+GCF_009494285.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri
+GCF_013368705.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Phocaeicola,s__Phocaeicola vulgatus
+GCF_003471795.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri
+GCF_000017325.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Shewanellaceae,g__Shewanella,s__Shewanella baltica
+GCF_000021665.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Shewanellaceae,g__Shewanella,s__Shewanella baltica
diff --git a/tests/test-data/tax/test.taxonomy.db b/tests/test-data/tax/test.taxonomy.db
new file mode 100644
index 0000000000..9539cc1c2e
Binary files /dev/null and b/tests/test-data/tax/test.taxonomy.db differ
diff --git a/tests/test-data/tax/test1.gather.csv b/tests/test-data/tax/test1.gather.csv
new file mode 100644
index 0000000000..80388e8d9d
--- /dev/null
+++ b/tests/test-data/tax/test1.gather.csv
@@ -0,0 +1,5 @@
+intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_name,query_md5,query_filename
+442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig
+390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig
+138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig
+338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig
diff --git a/tests/test-data/tax/test1_x_gtdbrs202_genbank_euks.gather.csv b/tests/test-data/tax/test1_x_gtdbrs202_genbank_euks.gather.csv
new file mode 100644
index 0000000000..62af0c7491
--- /dev/null
+++ b/tests/test-data/tax/test1_x_gtdbrs202_genbank_euks.gather.csv
@@ -0,0 +1,7 @@
+intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_filename,query_name,query_md5,query_bp
+442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,multtest,9687eeed,5014000
+390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,multtest,9687eeed,5014000
+206000,0.041084962106102914,0.007403148134837921,0.041084962106102914,0.2215344518651246,13.20388349514563,3.0,69.69466823965065,"GCA_002754635.1 Plasmodium vivax strain=CMB-1, CMB-1_v2",/home/irber/sourmash_databases/outputs/sbt/genbank-protozoa-x1e6-k31.sbt.zip,8125e7913e0d0b88deb63c9ad28f827c,0.0037419167332703625,206000,2,3976000,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,multtest,9687eeed,5014000
+138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,3,3838000,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,multtest,9687eeed,5014000
+338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,4,3784000,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,multtest,9687eeed,5014000
+110000,0.021938571998404467,0.000842978957948319,0.010370961308336658,0.023293696041700604,5.5,2.5,7.417494911978758,"GCA_000256725.2 Toxoplasma gondii TgCatPRC2 strain=TgCatPRC2, TGCATPRC2 v2",/home/irber/sourmash_databases/outputs/sbt/genbank-protozoa-x1e6-k31.sbt.zip,2a3b1804cf5ea5fe75dde3e153294548,0.0008909768346023004,52000,5,3732000,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,multtest,9687eeed,5014000
diff --git a/tests/test-data/track_abund/track_abund.zip b/tests/test-data/track_abund/track_abund.zip
new file mode 100644
index 0000000000..8f5e8d8d26
Binary files /dev/null and b/tests/test-data/track_abund/track_abund.zip differ
diff --git a/tests/test_api.py b/tests/test_api.py
index 02dd07eaef..73f9ffc7a4 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -54,7 +54,7 @@ def test_load_index_4():
idx = sourmash.load_file_as_index(testfile)
sigs = list(idx.signatures())
- assert len(sigs) == 7
+ assert len(sigs) == 8
def test_load_index_4_b():
diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py
index 060f7f83ab..cc6f9bfb99 100644
--- a/tests/test_cmd_signature.py
+++ b/tests/test_cmd_signature.py
@@ -4,12 +4,14 @@
import csv
import shutil
import os
+import glob
import pytest
import sourmash_tst_utils as utils
import sourmash
from sourmash.signature import load_signatures
+from sourmash.manifest import CollectionManifest
## command line tests
@@ -100,7 +102,7 @@ def test_sig_merge_1_name(c):
outsig = c.output('merged2and63.sig')
c.run_sourmash('sig', 'merge', sig2, sig63, '--dna', '-k', '31', '-o', "merged2and63.sig", '--name', assignedSigName )
-
+
test_merge_sig = sourmash.load_one_signature(outsig)
print("outsig", outsig)
@@ -761,31 +763,13 @@ def test_sig_cat_2_out_inplace(c):
@utils.in_tempdir
-def test_sig_cat_filelist(c):
+def test_sig_cat_3_filelist(c):
# cat using a file list as input
sig47 = utils.get_test_data('47.fa.sig')
- # sig47list = list(load_signatures(sig47))
- # print("sig47: ",sig47)
- # print(type(sig47))
- # print("length sig47: ",len(sig47list))
- # print("\n")
-
sig47abund = utils.get_test_data('track_abund/47.fa.sig')
- # sig47abundlist = list(load_signatures(sig47abund))
- # print("sig47abund: ",sig47abund)
- # print(type(sig47abund))
- # print("length sig47abund: ",len(sig47abundlist))
- # print("\n")
-
multisig = utils.get_test_data('47+63-multisig.sig')
- # multisiglist = list(load_signatures(multisig))
- # print("multisig: ",multisig)
- # print(type(multisig))
- # print("length multisig: ",len(multisiglist))
- # print("\n")
-
- filelist = c.output("filelist")
+ filelist = c.output("filelist")
with open(filelist, 'w') as f:
f.write("\n".join((sig47, sig47abund, multisig)))
@@ -811,17 +795,12 @@ def test_sig_cat_filelist(c):
# sort the signatures by something deterministic and unique
siglist.sort(key = lambda x: x.md5sum())
-
- # print(len(siglist))
- # print("siglist: ",siglist)
- # print("\n")
- # print("\n")
-
+
assert repr(siglist) == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_011665.1 Shewanella baltica OS223 plasmid pS22303, complete sequence', 485c3377), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 57e2b22f), SourmashSignature('NC_011668.1 Shewanella baltica OS223 plasmid pS22302, complete sequence', 837bf2a7), SourmashSignature('NC_011664.1 Shewanella baltica OS223 plasmid pS22301, complete sequence', 87a9aec4), SourmashSignature('NC_009661.1 Shewanella baltica OS185 plasmid pS18501, complete sequence', bde81a41), SourmashSignature('NC_011663.1 Shewanella baltica OS223, complete genome', f033bbd8)]"""
@utils.in_tempdir
-def test_sig_cat_filelist_with_dbs(c):
+def test_sig_cat_4_filelist_with_dbs(c):
# cat using a file list as input
sig47 = utils.get_test_data('47.fa.sig')
sig47abund = utils.get_test_data('track_abund/47.fa.sig')
@@ -857,6 +836,43 @@ def test_sig_cat_filelist_with_dbs(c):
assert repr(siglist) == """[SourmashSignature('', 0107d767), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('', 4e94e602), SourmashSignature('', 60f7e23c), SourmashSignature('', 6d6e87e1), SourmashSignature('', b59473c9), SourmashSignature('', f0c834bc), SourmashSignature('', f71e7817)]"""
+@utils.in_tempdir
+def test_sig_cat_5_from_file(c):
+ # cat using a file list as input
+ sig47 = utils.get_test_data('47.fa.sig')
+ sig47abund = utils.get_test_data('track_abund/47.fa.sig')
+ sbt = utils.get_test_data('v6.sbt.zip')
+
+ filelist = c.output("filelist")
+ with open(filelist, 'w') as f:
+ f.write("\n".join((sig47, sig47abund, sbt)))
+
+ c.run_sourmash('sig', 'cat', '--from-file', filelist,
+ '-o', 'out.sig')
+
+ # stdout should be same signatures
+ out = c.output('out.sig')
+
+ siglist = list(load_signatures(out))
+ print(len(siglist))
+ # print("siglist: ",siglist)
+ # print("\n")
+
+ # verify the number of signatures matches what we expect to see based
+ # on the input files
+ all_sigs = []
+ all_sigs += list(load_signatures(sig47))
+ all_sigs += list(load_signatures(sig47abund))
+ all_sigs += list(sourmash.load_file_as_signatures(sbt))
+
+ assert len(all_sigs) == len(siglist)
+
+ # sort the signatures by something deterministic and unique
+ siglist.sort(key = lambda x: x.md5sum())
+
+ assert repr(siglist) == """[SourmashSignature('', 0107d767), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('', 4e94e602), SourmashSignature('', 60f7e23c), SourmashSignature('', 6d6e87e1), SourmashSignature('', b59473c9), SourmashSignature('', f0c834bc), SourmashSignature('', f71e7817)]"""
+
+
@utils.in_tempdir
def test_sig_split_1(c):
# split 47 into 1 sig :)
@@ -1110,6 +1126,913 @@ def test_sig_extract_7_no_ksize(c):
assert len(siglist) == 3
+def test_sig_extract_8_picklist_md5(runtmp):
+ # extract 47 from 47, using a picklist w/full md5
+ sig47 = utils.get_test_data('47.fa.sig')
+ sig63 = utils.get_test_data('63.fa.sig')
+
+ # select on any of these attributes
+ row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
+ md5full='09a08691ce52952152f0e866a59f6261',
+ md5short='09a08691ce5295215',
+ fullIdent='NC_009665.1',
+ nodotIdent='NC_009665')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=row.keys())
+ w.writeheader()
+ w.writerow(row)
+
+ picklist_arg = f"{picklist_csv}:md5full:md5"
+ runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+
+ # stdout should be new signature
+ out = runtmp.last_result.out
+
+ test_extract_sig = sourmash.load_one_signature(sig47)
+ actual_extract_sig = sourmash.load_one_signature(out)
+
+ assert actual_extract_sig == test_extract_sig
+
+ err = runtmp.last_result.err
+
+ print(err)
+ assert "loaded 1 distinct values into picklist." in err
+ assert "loaded 1 total that matched ksize & molecule type" in err
+ assert "extracted 1 signatures from 2 file(s)" in err
+ assert "for given picklist, found 1 matches to 1 distinct values" in err
+
+def test_sig_extract_8_picklist_md5_include(runtmp):
+ # extract 47 from 47, using a picklist w/full md5:: explicit include
+ sig47 = utils.get_test_data('47.fa.sig')
+ sig63 = utils.get_test_data('63.fa.sig')
+
+ # select on any of these attributes
+ row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
+ md5full='09a08691ce52952152f0e866a59f6261',
+ md5short='09a08691ce5295215',
+ fullIdent='NC_009665.1',
+ nodotIdent='NC_009665')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=row.keys())
+ w.writeheader()
+ w.writerow(row)
+
+ picklist_arg = f"{picklist_csv}:md5full:md5:include"
+ runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+
+ # stdout should be new signature
+ out = runtmp.last_result.out
+
+ test_extract_sig = sourmash.load_one_signature(sig47)
+ actual_extract_sig = sourmash.load_one_signature(out)
+
+ assert actual_extract_sig == test_extract_sig
+
+ err = runtmp.last_result.err
+
+ print(err)
+ assert "loaded 1 distinct values into picklist." in err
+ assert "loaded 1 total that matched ksize & molecule type" in err
+ assert "extracted 1 signatures from 2 file(s)" in err
+ assert "for given picklist, found 1 matches to 1 distinct values" in err
+
+
+def test_sig_extract_8_picklist_md5_exclude(runtmp):
+ # extract 63 from 47,63 by excluding 47, using a picklist w/full md5
+ sig47 = utils.get_test_data('47.fa.sig')
+ sig63 = utils.get_test_data('63.fa.sig')
+
+ # select on any of these attributes
+ row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
+ md5full='09a08691ce52952152f0e866a59f6261',
+ md5short='09a08691ce5295215',
+ fullIdent='NC_009665.1',
+ nodotIdent='NC_009665')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=row.keys())
+ w.writeheader()
+ w.writerow(row)
+
+ picklist_arg = f"{picklist_csv}:md5full:md5:exclude"
+ runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+
+ # stdout should be new signature
+ out = runtmp.last_result.out
+
+ test_extract_sig = sourmash.load_one_signature(sig63)
+ actual_extract_sig = sourmash.load_one_signature(out)
+
+ assert actual_extract_sig == test_extract_sig
+
+ err = runtmp.last_result.err
+
+ print(err)
+ assert "loaded 1 distinct values into picklist." in err
+ assert "loaded 1 total that matched ksize & molecule type" in err
+ assert "extracted 1 signatures from 2 file(s)" in err
+ assert "for given picklist, found 1 matches by excluding 1 distinct values" in err
+
+
+def test_sig_extract_8_picklist_md5_require_all(runtmp):
+ # extract 47 from 47, using a picklist w/full md5;
+ # confirm that check missing picklist val errors out on --picklist-require
+ sig47 = utils.get_test_data('47.fa.sig')
+ sig63 = utils.get_test_data('63.fa.sig')
+
+ # select on any of these attributes
+ row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
+ md5full='09a08691ce52952152f0e866a59f6261',
+ md5short='09a08691ce5295215',
+ fullIdent='NC_009665.1',
+ nodotIdent='NC_009665')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=row.keys())
+ w.writeheader()
+ w.writerow(row)
+ w.writerow(dict(exactName='', md5full='BAD MD5',
+ md5short='', fullIdent='', nodotIdent=''))
+
+ picklist_arg = f"{picklist_csv}:md5full:md5"
+ with pytest.raises(ValueError):
+ runtmp.sourmash('sig', 'extract', sig47, sig63,
+ '--picklist', picklist_arg,
+ '--picklist-require-all')
+
+ # stdout should be new signature
+ out = runtmp.last_result.out
+
+ test_extract_sig = sourmash.load_one_signature(sig47)
+ actual_extract_sig = sourmash.load_one_signature(out)
+
+ assert actual_extract_sig == test_extract_sig
+
+ err = runtmp.last_result.err
+
+ print(err)
+ assert "loaded 2 distinct values into picklist." in err
+ assert "loaded 1 total that matched ksize & molecule type" in err
+ assert "extracted 1 signatures from 2 file(s)" in err
+ assert "for given picklist, found 1 matches to 2 distinct values" in err
+ assert 'WARNING: 1 missing picklist values.' in err
+ assert 'ERROR: failing because --picklist-require-all was set' in err
+
+
+def test_sig_extract_8_picklist_name(runtmp):
+ # extract 47 from 47, using a picklist w/full md5
+ sig47 = utils.get_test_data('47.fa.sig')
+ sig63 = utils.get_test_data('63.fa.sig')
+
+ # select on any of these attributes
+ row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
+ md5full='09a08691ce52952152f0e866a59f6261',
+ md5short='09a08691ce5295215',
+ fullIdent='NC_009665.1',
+ nodotIdent='NC_009665')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=row.keys())
+ w.writeheader()
+ w.writerow(row)
+
+ picklist_arg = f"{picklist_csv}:exactName:name"
+ runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+
+ # stdout should be new signature
+ out = runtmp.last_result.out
+
+ test_extract_sig = sourmash.load_one_signature(sig47)
+ actual_extract_sig = sourmash.load_one_signature(out)
+
+ assert actual_extract_sig == test_extract_sig
+
+
+def test_sig_extract_8_picklist_name_exclude(runtmp):
+ # exclude 47 based on name
+ sig47 = utils.get_test_data('47.fa.sig')
+ sig63 = utils.get_test_data('63.fa.sig')
+
+ # select on any of these attributes
+ row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
+ md5full='09a08691ce52952152f0e866a59f6261',
+ md5short='09a08691ce5295215',
+ fullIdent='NC_009665.1',
+ nodotIdent='NC_009665')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=row.keys())
+ w.writeheader()
+ w.writerow(row)
+
+ picklist_arg = f"{picklist_csv}:exactName:name:exclude"
+ runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+
+ # stdout should be new signature
+ out = runtmp.last_result.out
+
+ test_extract_sig = sourmash.load_one_signature(sig63)
+ actual_extract_sig = sourmash.load_one_signature(out)
+
+ assert actual_extract_sig == test_extract_sig
+
+
+def test_sig_extract_8_picklist_ident(runtmp):
+ # extract 47 from 47, using a picklist w/full md5
+ sig47 = utils.get_test_data('47.fa.sig')
+ sig63 = utils.get_test_data('63.fa.sig')
+
+ # select on any of these attributes
+ row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
+ md5full='09a08691ce52952152f0e866a59f6261',
+ md5short='09a08691ce5295215',
+ fullIdent='NC_009665.1',
+ nodotIdent='NC_009665')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=row.keys())
+ w.writeheader()
+ w.writerow(row)
+
+ picklist_arg = f"{picklist_csv}:fullIdent:ident"
+ runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+
+ # stdout should be new signature
+ out = runtmp.last_result.out
+
+ test_extract_sig = sourmash.load_one_signature(sig47)
+ actual_extract_sig = sourmash.load_one_signature(out)
+
+ assert actual_extract_sig == test_extract_sig
+
+
+def test_sig_extract_8_picklist_ident_exclude(runtmp):
+ # exclude 47 based on ident
+ sig47 = utils.get_test_data('47.fa.sig')
+ sig63 = utils.get_test_data('63.fa.sig')
+
+ # select on any of these attributes
+ row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
+ md5full='09a08691ce52952152f0e866a59f6261',
+ md5short='09a08691ce5295215',
+ fullIdent='NC_009665.1',
+ nodotIdent='NC_009665')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=row.keys())
+ w.writeheader()
+ w.writerow(row)
+
+ picklist_arg = f"{picklist_csv}:fullIdent:ident:exclude"
+ runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+
+ # stdout should be new signature
+ out = runtmp.last_result.out
+
+ test_extract_sig = sourmash.load_one_signature(sig63)
+ actual_extract_sig = sourmash.load_one_signature(out)
+
+ assert actual_extract_sig == test_extract_sig
+
+
+def test_sig_extract_8_picklist_ident_dot(runtmp):
+ # extract 47 from 47, using a picklist w/full md5
+ sig47 = utils.get_test_data('47.fa.sig')
+ sig63 = utils.get_test_data('63.fa.sig')
+
+ # select on any of these attributes
+ row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
+ md5full='09a08691ce52952152f0e866a59f6261',
+ md5short='09a08691ce5295215',
+ fullIdent='NC_009665.1',
+ nodotIdent='NC_009665')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=row.keys())
+ w.writeheader()
+ w.writerow(row)
+
+ picklist_arg = f"{picklist_csv}:nodotIdent:identprefix"
+ runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+
+ # stdout should be new signature
+ out = runtmp.last_result.out
+
+ test_extract_sig = sourmash.load_one_signature(sig47)
+ actual_extract_sig = sourmash.load_one_signature(out)
+
+ assert actual_extract_sig == test_extract_sig
+
+
+def test_sig_extract_8_picklist_ident_dot_exclude(runtmp):
+ # exlude 47 based on identprefix
+ sig47 = utils.get_test_data('47.fa.sig')
+ sig63 = utils.get_test_data('63.fa.sig')
+
+ # select on any of these attributes
+ row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
+ md5full='09a08691ce52952152f0e866a59f6261',
+ md5short='09a08691ce5295215',
+ fullIdent='NC_009665.1',
+ nodotIdent='NC_009665')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=row.keys())
+ w.writeheader()
+ w.writerow(row)
+
+ picklist_arg = f"{picklist_csv}:nodotIdent:identprefix:exclude"
+ runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+
+ # stdout should be new signature
+ out = runtmp.last_result.out
+
+ test_extract_sig = sourmash.load_one_signature(sig63)
+ actual_extract_sig = sourmash.load_one_signature(out)
+
+ assert actual_extract_sig == test_extract_sig
+
+
+def test_sig_extract_8_picklist_md5_short(runtmp):
+ # extract 47 from 47, using a picklist w/full md5
+ sig47 = utils.get_test_data('47.fa.sig')
+ sig63 = utils.get_test_data('63.fa.sig')
+
+ # select on any of these attributes
+ row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
+ md5full='09a08691ce52952152f0e866a59f6261',
+ md5short='09a08691ce5295215',
+ fullIdent='NC_009665.1',
+ nodotIdent='NC_009665')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=row.keys())
+ w.writeheader()
+ w.writerow(row)
+
+ picklist_arg = f"{picklist_csv}:md5short:md5prefix8"
+ runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+
+ # stdout should be new signature
+ out = runtmp.last_result.out
+
+ test_extract_sig = sourmash.load_one_signature(sig47)
+ actual_extract_sig = sourmash.load_one_signature(out)
+
+ assert actual_extract_sig == test_extract_sig
+
+
+def test_sig_extract_8_picklist_md5_short_exclude(runtmp):
+ # exclude 47 based on md5prefix8
+ sig47 = utils.get_test_data('47.fa.sig')
+ sig63 = utils.get_test_data('63.fa.sig')
+
+ # select on any of these attributes
+ row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
+ md5full='09a08691ce52952152f0e866a59f6261',
+ md5short='09a08691ce5295215',
+ fullIdent='NC_009665.1',
+ nodotIdent='NC_009665')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=row.keys())
+ w.writeheader()
+ w.writerow(row)
+
+ picklist_arg = f"{picklist_csv}:md5short:md5prefix8:exclude"
+ runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+
+ # stdout should be new signature
+ out = runtmp.last_result.out
+
+ test_extract_sig = sourmash.load_one_signature(sig63)
+ actual_extract_sig = sourmash.load_one_signature(out)
+
+ assert actual_extract_sig == test_extract_sig
+
+
+def test_sig_extract_8_picklist_md5_short_alias(runtmp):
+ # extract 47 from 47, using a picklist w/full md5
+ sig47 = utils.get_test_data('47.fa.sig')
+ sig63 = utils.get_test_data('63.fa.sig')
+
+ # select on any of these attributes
+ row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
+ md5full='09a08691ce52952152f0e866a59f6261',
+ md5short='09a08691ce5295215',
+ fullIdent='NC_009665.1',
+ nodotIdent='NC_009665')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=row.keys())
+ w.writeheader()
+ w.writerow(row)
+
+ picklist_arg = f"{picklist_csv}:md5short:md5short"
+ runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+
+ # stdout should be new signature
+ out = runtmp.last_result.out
+
+ test_extract_sig = sourmash.load_one_signature(sig47)
+ actual_extract_sig = sourmash.load_one_signature(out)
+
+ assert actual_extract_sig == test_extract_sig
+
+
+def test_sig_extract_8_picklist_md5_short_alias_exclude(runtmp):
+ # exlude 47 based on md5prefix8 alias, md5short
+ sig47 = utils.get_test_data('47.fa.sig')
+ sig63 = utils.get_test_data('63.fa.sig')
+
+ # select on any of these attributes
+ row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
+ md5full='09a08691ce52952152f0e866a59f6261',
+ md5short='09a08691ce5295215',
+ fullIdent='NC_009665.1',
+ nodotIdent='NC_009665')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=row.keys())
+ w.writeheader()
+ w.writerow(row)
+
+ picklist_arg = f"{picklist_csv}:md5short:md5short:exclude"
+ runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+
+ # stdout should be new signature
+ out = runtmp.last_result.out
+
+ test_extract_sig = sourmash.load_one_signature(sig63)
+ actual_extract_sig = sourmash.load_one_signature(out)
+
+ assert actual_extract_sig == test_extract_sig
+
+
+def test_sig_extract_8_picklist_md5_short_alias_with_md5_selector_nomatch(runtmp):
+ # extract 47 from 47, using a picklist w/full md5 and also md5 selector
+ sig47 = utils.get_test_data('47.fa.sig')
+ sig63 = utils.get_test_data('63.fa.sig')
+
+ # select on any of these attributes
+ row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
+ md5full='09a08691ce52952152f0e866a59f6261',
+ md5short='09a08691ce5295215',
+ fullIdent='NC_009665.1',
+ nodotIdent='NC_009665')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=row.keys())
+ w.writeheader()
+ w.writerow(row)
+
+ picklist_arg = f"{picklist_csv}:md5short:md5short"
+ with pytest.raises(ValueError):
+ runtmp.sourmash('sig', 'extract', sig47, sig63,
+ '--picklist', picklist_arg,
+ '--md5', 'XXX') # no match to md5 selector here
+
+ err = runtmp.last_result.err
+ assert "no matching signatures to save!" in err
+
+
+def test_sig_extract_8_picklist_md5_short_alias_with_md5_selector_nomatch_exclude(runtmp):
+ # exclude 47 using a picklist w/full md5 and also md5 selector
+ sig47 = utils.get_test_data('47.fa.sig')
+ sig63 = utils.get_test_data('63.fa.sig')
+
+ # select on any of these attributes
+ row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
+ md5full='09a08691ce52952152f0e866a59f6261',
+ md5short='09a08691ce5295215',
+ fullIdent='NC_009665.1',
+ nodotIdent='NC_009665')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=row.keys())
+ w.writeheader()
+ w.writerow(row)
+
+ picklist_arg = f"{picklist_csv}:md5short:md5short:exclude"
+ with pytest.raises(ValueError):
+ runtmp.sourmash('sig', 'extract', sig47, sig63,
+ '--picklist', picklist_arg,
+ '--md5', 'XXX') # no match to md5 selector here
+
+ err = runtmp.last_result.err
+ assert "no matching signatures to save!" in err
+
+
+def test_sig_extract_8_picklist_md5_short_alias_with_md5_selector(runtmp):
+ # extract 47 from 47, using a picklist w/full md5 and also md5 selector
+ sig47 = utils.get_test_data('47.fa.sig')
+ sig63 = utils.get_test_data('63.fa.sig')
+
+ # select on any of these attributes
+ row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
+ md5full='09a08691ce52952152f0e866a59f6261',
+ md5short='09a08691ce5295215',
+ fullIdent='NC_009665.1',
+ nodotIdent='NC_009665')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=row.keys())
+ w.writeheader()
+ w.writerow(row)
+
+ picklist_arg = f"{picklist_csv}:md5short:md5short"
+ runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg,
+ '--md5', '09a08691ce5295215')
+
+ # stdout should be new signature
+ out = runtmp.last_result.out
+
+ test_extract_sig = sourmash.load_one_signature(sig47)
+ actual_extract_sig = sourmash.load_one_signature(out)
+
+ assert actual_extract_sig == test_extract_sig
+
+def test_sig_extract_8_picklist_md5_short_alias_with_md5_selector_exclude(runtmp):
+ # exclude 47, using a picklist w/full md5; but try to select with md5 selector
+ sig47 = utils.get_test_data('47.fa.sig')
+ sig63 = utils.get_test_data('63.fa.sig')
+
+ # select on any of these attributes
+ row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
+ md5full='09a08691ce52952152f0e866a59f6261',
+ md5short='09a08691ce5295215',
+ fullIdent='NC_009665.1',
+ nodotIdent='NC_009665')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=row.keys())
+ w.writeheader()
+ w.writerow(row)
+
+ picklist_arg = f"{picklist_csv}:md5short:md5short:exclude"
+ with pytest.raises(ValueError):
+ runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg,
+ '--md5', '09a08691ce5295215')
+
+ # NTP: do we want to emit a more informative "conflicting selectors" type of msg?
+ err = runtmp.last_result.err
+ print(err)
+ assert "loaded 1 distinct values into picklist." in err
+ assert "loaded 1 total that matched ksize & molecule type" in err
+ assert 'no matching signatures to save!' in err
+
+
+def test_sig_extract_8_picklist_md5_nomatch(runtmp):
+ # use an empty picklist => no match
+ sig47 = utils.get_test_data('47.fa.sig')
+ sig63 = utils.get_test_data('63.fa.sig')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=['md5short'])
+ w.writeheader()
+
+ picklist_arg = f"{picklist_csv}:md5short:md5prefix8"
+
+ with pytest.raises(ValueError):
+ runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist',
+ picklist_arg)
+
+ # stdout should be new signature
+ out = runtmp.last_result.out
+ print(out)
+ err = runtmp.last_result.err
+ print(err)
+ assert "no matching signatures to save!" in err
+ assert runtmp.last_result.status != 0
+
+
+def test_sig_extract_8_picklist_md5_nomatch_exclude(runtmp):
+ # use an empty picklist to exclude => no match => include everything
+ sig47 = utils.get_test_data('47.fa.sig')
+ sig63 = utils.get_test_data('63.fa.sig')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=['md5short'])
+ w.writeheader()
+
+ picklist_arg = f"{picklist_csv}:md5short:md5prefix8:exclude"
+
+ runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist',
+ picklist_arg)
+
+ # stdout should be both signatures
+ out = runtmp.last_result.out
+ extract_siglist = list(load_signatures(out))
+ print(len(extract_siglist))
+ s47 = sourmash.load_file_as_signatures(sig47)
+ s63 = sourmash.load_file_as_signatures(sig63)
+ actual_extract_siglist = list(s47) + list(s63)
+
+ assert set(extract_siglist) == set(actual_extract_siglist)
+
+ err = runtmp.last_result.err
+ print(err)
+ assert runtmp.last_result.status == 0
+ assert 'loaded 0 distinct values into picklist.' in err
+ assert 'loaded 2 total that matched ksize & molecule type' in err
+ assert 'extracted 2 signatures from 2 file(s)' in err
+ assert 'for given picklist, found 2 matches by excluding 0 distinct values' in err
+
+
+def test_sig_extract_9_picklist_md5_ksize_hp_select(runtmp):
+ # test with -k and moltype selector
+ sigdir = utils.get_test_data('prot/')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=['md5'])
+ w.writeheader()
+ w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672'))
+
+ picklist_arg = f"{picklist_csv}:md5:md5"
+
+ runtmp.sourmash('sig', 'extract', sigdir, '--picklist',
+ picklist_arg, '-k', '19', '--hp')
+
+ # stdout should be new signature
+ out = runtmp.last_result.out
+ actual_extract_sig = sourmash.load_one_signature(out)
+
+ print(actual_extract_sig.md5sum)
+ assert str(actual_extract_sig) == 'GCA_001593925'
+ assert actual_extract_sig.md5sum() == 'ea2a1ad233c2908529d124a330bcb672'
+ assert actual_extract_sig.minhash.ksize == 19
+ assert actual_extract_sig.minhash.moltype == 'hp'
+
+
+def test_sig_extract_9_picklist_md5_ksize_hp_select_exclude(runtmp):
+ # test picklist exclude with -k and moltype selector
+ sigdir = utils.get_test_data('prot/')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=['md5'])
+ w.writeheader()
+ w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672'))
+
+ picklist_arg = f"{picklist_csv}:md5:md5:exclude"
+
+ runtmp.sourmash('sig', 'extract', sigdir, '--picklist',
+ picklist_arg, '-k', '19', '--hp')
+
+ # stdout should be new signature
+ out = runtmp.last_result.out
+ actual_extract_sig = sourmash.load_one_signature(out)
+ print(actual_extract_sig.md5sum)
+
+ assert str(actual_extract_sig) == 'GCA_001593935'
+ assert actual_extract_sig.md5sum() == 'bb0e6d90df01b7bd5d0956a5f9e3ed12'
+ assert actual_extract_sig.minhash.ksize == 19
+ assert actual_extract_sig.minhash.moltype == 'hp'
+
+
+def test_sig_extract_10_picklist_md5_dups_and_empty(runtmp):
+ # test empty picklist values, and duplicate picklist values
+ sigdir = utils.get_test_data('prot/')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=['md5'])
+ w.writeheader()
+ w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672'))
+ w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672'))
+ w.writerow(dict(md5=''))
+
+ picklist_arg = f"{picklist_csv}:md5:md5"
+
+ runtmp.sourmash('sig', 'extract', sigdir, '--picklist',
+ picklist_arg, '-k', '19', '--hp')
+
+ # stdout should be new signature
+ out = runtmp.last_result.out
+ actual_extract_sig = sourmash.load_one_signature(out)
+
+ assert actual_extract_sig.minhash.ksize == 19
+ assert actual_extract_sig.minhash.moltype == 'hp'
+ assert actual_extract_sig.md5sum() == 'ea2a1ad233c2908529d124a330bcb672'
+
+ err = runtmp.last_result.err
+ print(err)
+
+ assert "WARNING: 1 empty values in column 'md5' in picklist file" in err
+ assert "WARNING: 1 values in picklist column 'md5' were not distinct" in err
+
+
+def test_sig_extract_10_picklist_md5_dups_and_empty_exclude(runtmp):
+ # test empty picklist values, and duplicate picklist values for exclude
+ sigdir = utils.get_test_data('prot/')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=['md5'])
+ w.writeheader()
+ w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672'))
+ w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672'))
+ w.writerow(dict(md5=''))
+
+ picklist_arg = f"{picklist_csv}:md5:md5:exclude"
+
+ runtmp.sourmash('sig', 'extract', sigdir, '--picklist',
+ picklist_arg, '-k', '19', '--hp')
+
+ # stdout should be new signature
+ out = runtmp.last_result.out
+ actual_extract_sig = sourmash.load_one_signature(out)
+
+ assert actual_extract_sig.minhash.ksize == 19
+ assert actual_extract_sig.minhash.moltype == 'hp'
+ assert actual_extract_sig.md5sum() == 'bb0e6d90df01b7bd5d0956a5f9e3ed12'
+
+ err = runtmp.last_result.err
+ print(err)
+
+ assert "WARNING: 1 empty values in column 'md5' in picklist file" in err
+ assert "WARNING: 1 values in picklist column 'md5' were not distinct" in err
+
+
+def test_sig_extract_11_picklist_bad_coltype(runtmp):
+ # test with invalid picklist coltype
+ sigdir = utils.get_test_data('prot/')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=['md5'])
+ w.writeheader()
+ w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672'))
+
+ picklist_arg = f"{picklist_csv}:md5:BADCOLTYPE"
+
+ with pytest.raises(ValueError):
+ runtmp.sourmash('sig', 'extract', sigdir, '--picklist',
+ picklist_arg, '-k', '19', '--hp')
+
+ err = runtmp.last_result.err
+ print(err)
+ assert "invalid picklist column type 'BADCOLTYPE'" in err
+
+
+def test_sig_extract_11_picklist_bad_coltype_exclude(runtmp):
+ # test with invalid picklist coltype
+ sigdir = utils.get_test_data('prot/')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=['md5'])
+ w.writeheader()
+ w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672'))
+
+ picklist_arg = f"{picklist_csv}:md5:BADCOLTYPE:exclude"
+
+ with pytest.raises(ValueError):
+ runtmp.sourmash('sig', 'extract', sigdir, '--picklist',
+ picklist_arg, '-k', '19', '--hp')
+
+ err = runtmp.last_result.err
+ print(err)
+ assert "invalid picklist column type 'BADCOLTYPE'" in err
+
+
+def test_sig_extract_12_picklist_bad_argstr(runtmp):
+ # test with invalid argument format to --picklist
+ sigdir = utils.get_test_data('prot/')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=['md5'])
+ w.writeheader()
+ w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672'))
+
+ picklist_arg = f"{picklist_csv}"
+
+ with pytest.raises(ValueError):
+ runtmp.sourmash('sig', 'extract', sigdir, '--picklist',
+ picklist_arg, '-k', '19', '--hp')
+
+ err = runtmp.last_result.err
+ print(err)
+ assert "invalid picklist argument" in err
+
+
+def test_sig_extract_12_picklist_bad_pickstyle(runtmp):
+ # test with invalid argument format to --picklist
+ sigdir = utils.get_test_data('prot/')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=['md5'])
+ w.writeheader()
+ w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672'))
+
+ picklist_arg = f"{picklist_csv}:md5:md5:XXX"
+
+ with pytest.raises(ValueError):
+ runtmp.sourmash('sig', 'extract', sigdir, '--picklist',
+ picklist_arg, '-k', '19', '--hp')
+
+ err = runtmp.last_result.err
+ print(err)
+ assert "invalid picklist 'pickstyle' argument, 'XXX': must be 'include' or 'exclude'" in err
+
+
+def test_sig_extract_12_picklist_bad_colname(runtmp):
+ # test with invalid picklist colname
+ sigdir = utils.get_test_data('prot/')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=['md5'])
+ w.writeheader()
+ w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672'))
+
+ picklist_arg = f"{picklist_csv}:BADCOLNAME:md5"
+
+ with pytest.raises(ValueError):
+ runtmp.sourmash('sig', 'extract', sigdir, '--picklist',
+ picklist_arg, '-k', '19', '--hp')
+
+ err = runtmp.last_result.err
+ print(err)
+ assert "column 'BADCOLNAME' not in pickfile" in err
+
+
+def test_sig_extract_12_picklist_bad_colname_exclude(runtmp):
+ # test with invalid picklist colname
+ sigdir = utils.get_test_data('prot/')
+
+ # make picklist
+ picklist_csv = runtmp.output('pick.csv')
+ with open(picklist_csv, 'w', newline='') as csvfp:
+ w = csv.DictWriter(csvfp, fieldnames=['md5'])
+ w.writeheader()
+ w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672'))
+
+ picklist_arg = f"{picklist_csv}:BADCOLNAME:md5:exclude"
+
+ with pytest.raises(ValueError):
+ runtmp.sourmash('sig', 'extract', sigdir, '--picklist',
+ picklist_arg, '-k', '19', '--hp')
+
+ err = runtmp.last_result.err
+ print(err)
+ assert "column 'BADCOLNAME' not in pickfile" in err
+
+
@utils.in_tempdir
def test_sig_flatten_1(c):
# extract matches to several names from among several signatures & flatten
@@ -1483,9 +2406,13 @@ def test_sig_describe_1_dir(c):
out = c.last_result.out
print(c.last_result)
+ # make sure signature names, as well as full path to .sig file under
+ # directory, show up in output.
expected_output = """\
signature: GCA_001593925
signature: GCA_001593935
+prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig
+prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig
""".splitlines()
for line in expected_output:
assert line.strip() in out
@@ -1654,3 +2581,111 @@ def test_import_mash_csv_to_sig(runtmp):
assert '1 matches:' in runtmp.last_result.out
assert '100.0% short.fa' in runtmp.last_result.out
+
+
+def test_sig_manifest_1_zipfile(runtmp):
+ # make a manifest from a .zip file
+ protzip = utils.get_test_data('prot/protein.zip')
+ runtmp.sourmash('sig', 'manifest', protzip, '-o', 'SOURMASH-MANIFEST.csv')
+
+ manifest_fn = runtmp.output('SOURMASH-MANIFEST.csv')
+ with open(manifest_fn, newline='') as csvfp:
+ manifest = CollectionManifest.load_from_csv(csvfp)
+
+ assert len(manifest) == 2
+ md5_list = [ row['md5'] for row in manifest.rows ]
+ assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list
+ assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list
+
+
+def test_sig_manifest_2_sigfile(runtmp):
+ # make a manifest from a .sig file
+ sigfile = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
+
+ runtmp.sourmash('sig', 'manifest', sigfile, '-o', 'SOURMASH-MANIFEST.csv')
+
+ status = runtmp.last_result.status
+ out = runtmp.last_result.out
+ err = runtmp.last_result.err
+
+ manifest_fn = runtmp.output('SOURMASH-MANIFEST.csv')
+ with open(manifest_fn, newline='') as csvfp:
+ manifest = CollectionManifest.load_from_csv(csvfp)
+
+ assert len(manifest) == 1
+ md5_list = [ row['md5'] for row in manifest.rows ]
+ assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list
+
+
+def test_sig_manifest_3_sbt(runtmp):
+ # make a manifest from an SBT
+ protzip = utils.get_test_data('prot/protein.sbt.zip')
+ runtmp.sourmash('sig', 'manifest', protzip, '-o', 'SOURMASH-MANIFEST.csv')
+
+ manifest_fn = runtmp.output('SOURMASH-MANIFEST.csv')
+ with open(manifest_fn, newline='') as csvfp:
+ manifest = CollectionManifest.load_from_csv(csvfp)
+
+ assert len(manifest) == 2
+ md5_list = [ row['md5'] for row in manifest.rows ]
+ assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list
+ assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list
+
+
+def test_sig_manifest_4_lca(runtmp):
+ # make a manifest from a .lca.json file
+ sigfile = utils.get_test_data('prot/protein.lca.json.gz')
+ with pytest.raises(ValueError):
+ runtmp.sourmash('sig', 'manifest', sigfile, '-o',
+ 'SOURMASH-MANIFEST.csv')
+
+ status = runtmp.last_result.status
+ out = runtmp.last_result.out
+ err = runtmp.last_result.err
+
+ assert status != 0
+ assert "ERROR: manifests cannot be generated for this file." in err
+
+
+def test_sig_manifest_5_dir(runtmp):
+ # make a manifest from a directory
+ sigfile = utils.get_test_data('prot/protein/')
+ runtmp.sourmash('sig', 'manifest', sigfile, '-o', 'SOURMASH-MANIFEST.csv')
+
+ status = runtmp.last_result.status
+ out = runtmp.last_result.out
+ err = runtmp.last_result.err
+
+ manifest_fn = runtmp.output('SOURMASH-MANIFEST.csv')
+ with open(manifest_fn, newline='') as csvfp:
+ manifest = CollectionManifest.load_from_csv(csvfp)
+
+ assert len(manifest) == 2
+ md5_list = [ row['md5'] for row in manifest.rows ]
+ assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list
+ assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list
+
+
+def test_sig_manifest_6_pathlist(runtmp):
+ # make a manifest from a pathlist file
+ sigfiles = utils.get_test_data('prot/protein/*.sig')
+ sigfiles = glob.glob(sigfiles)
+
+ pathlist = runtmp.output('pathlist.txt')
+ with open(pathlist, 'wt') as fp:
+ fp.write("\n".join(sigfiles))
+
+ runtmp.sourmash('sig', 'manifest', pathlist, '-o', 'SOURMASH-MANIFEST.csv')
+
+ status = runtmp.last_result.status
+ out = runtmp.last_result.out
+ err = runtmp.last_result.err
+
+ manifest_fn = runtmp.output('SOURMASH-MANIFEST.csv')
+ with open(manifest_fn, newline='') as csvfp:
+ manifest = CollectionManifest.load_from_csv(csvfp)
+
+ assert len(manifest) == 2
+ md5_list = [ row['md5'] for row in manifest.rows ]
+ assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list
+ assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list
diff --git a/tests/test_index.py b/tests/test_index.py
index 5e30c7c585..610a5089ae 100644
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -9,14 +9,16 @@
import copy
import sourmash
+from sourmash import index
from sourmash import load_one_signature, SourmashSignature
-from sourmash.index import (LinearIndex, MultiIndex, ZipFileLinearIndex,
+from sourmash.index import (LinearIndex, ZipFileLinearIndex,
make_jaccard_search_query, CounterGather,
- LazyLinearIndex)
+ LazyLinearIndex, MultiIndex)
from sourmash.sbt import SBT, GraphFactory, Leaf
from sourmash.sbtmh import SigLeaf
from sourmash import sourmash_args
from sourmash.search import JaccardSearch, SearchType
+from sourmash.picklist import SignaturePicklist, PickStyle
import sourmash_tst_utils as utils
@@ -38,28 +40,28 @@ def test_simple_index(n_children):
leaf2_mh.add_sequence("AAAAG")
leaf2_sig = SourmashSignature(leaf2_mh)
root.insert(leaf2_sig)
-
+
leaf3_mh = sourmash.MinHash(0, 5, scaled=1)
leaf3_mh.add_sequence("AAAAA")
leaf3_mh.add_sequence("AAAAT")
leaf3_mh.add_sequence("CAAAA")
leaf3_sig = SourmashSignature(leaf3_mh)
root.insert(leaf3_sig)
-
+
leaf4_mh = sourmash.MinHash(0, 5, scaled=1)
leaf4_mh.add_sequence("AAAAA")
leaf4_mh.add_sequence("CAAAA")
leaf4_mh.add_sequence("GAAAA")
leaf4_sig = SourmashSignature(leaf4_mh)
root.insert(leaf4_sig)
-
+
leaf5_mh = sourmash.MinHash(0, 5, scaled=1)
leaf5_mh.add_sequence("AAAAA")
leaf5_mh.add_sequence("AAAAT")
leaf5_mh.add_sequence("GAAAA")
leaf5_sig = SourmashSignature(leaf5_mh)
root.insert(leaf5_sig)
-
+
linear = LinearIndex()
linear.insert(leaf1_sig)
linear.insert(leaf2_sig)
@@ -79,7 +81,7 @@ def test_simple_index(n_children):
linear_found = set(linear_found)
tree_found = set(root.find(search_fn, search_sig))
-
+
assert tree_found
assert tree_found == set(linear_found)
@@ -423,7 +425,7 @@ def test_linear_index_save(runtmp):
linear.insert(ss2)
linear.insert(ss47)
linear.insert(ss63)
-
+
filename = runtmp.output('foo')
linear.save(filename)
@@ -450,7 +452,7 @@ def test_linear_index_load(runtmp):
ss63 = sourmash.load_one_signature(sig63)
from sourmash import save_signatures
-
+
filename = runtmp.output('foo')
with open(filename, 'wt') as fp:
sourmash.save_signatures([ss2, ss47, ss63], fp)
@@ -479,7 +481,7 @@ def test_linear_index_save_load(runtmp):
filename = runtmp.output('foo')
linear.save(filename)
linear2 = LinearIndex.load(filename)
-
+
# now, search for sig2
sr = linear2.search(ss2, threshold=1.0)
print([s[1].name for s in sr])
@@ -634,6 +636,56 @@ def test_linear_index_moltype_select():
assert len(linear2) == 0
+def test_linear_index_picklist_select():
+ # test select with a picklist
+
+ # this loads three ksizes, 21/31/51
+ sig2 = utils.get_test_data('2.fa.sig')
+ siglist = sourmash.load_file_as_signatures(sig2)
+
+ linear = LinearIndex()
+ for ss in siglist:
+ linear.insert(ss)
+
+ # construct a picklist...
+ picklist = SignaturePicklist('md5prefix8')
+ picklist.init(['f3a90d4e'])
+
+ # select on picklist
+ linear2 = linear.select(picklist=picklist)
+ assert len(linear2) == 1
+ ss = list(linear2.signatures())[0]
+ assert ss.minhash.ksize == 31
+ assert ss.md5sum().startswith('f3a90d4e55')
+
+
+def test_linear_index_picklist_select_exclude():
+ # test select with a picklist, but exclude
+
+ # this loads three ksizes, 21/31/51
+ sig2 = utils.get_test_data('2.fa.sig')
+ siglist = sourmash.load_file_as_signatures(sig2)
+
+ linear = LinearIndex()
+ for ss in siglist:
+ linear.insert(ss)
+
+ # construct a picklist...
+ picklist = SignaturePicklist('md5prefix8', pickstyle=PickStyle.EXCLUDE)
+ picklist.init(['f3a90d4e'])
+
+ # select on picklist
+ linear2 = linear.select(picklist=picklist)
+ assert len(linear2) == 2
+ md5s = set()
+ ksizes = set()
+ for ss in list(linear2.signatures()):
+ md5s.add(ss.md5sum())
+ ksizes.add(ss.minhash.ksize)
+ assert md5s == set(['f372e47893edd349e5956f8b0d8dcbf7','43f3b48e59443092850964d355a20ac0'])
+ assert ksizes == set([21,51])
+
+
@utils.in_tempdir
def test_index_same_md5sum_fsstorage(c):
testdata1 = utils.get_test_data('img/2706795855.sig')
@@ -645,7 +697,7 @@ def test_index_same_md5sum_fsstorage(c):
outfile = c.output('zzz.sbt.json')
assert os.path.exists(outfile)
storage = c.output('.sbt.zzz')
- assert len(glob.glob(storage + "/*")) == 3
+ assert len(glob.glob(storage + "/*")) == 4
@utils.in_tempdir
@@ -661,7 +713,7 @@ def test_index_same_md5sum_sbt_zipstorage(c):
zout = zipfile.ZipFile(outfile, mode='r')
# should have 3 files, 1 internal and two sigs. We check for 4 because the
# directory also shows in namelist()
- assert len([f for f in zout.namelist() if f.startswith(".sbt.zzz/")]) == 4
+ assert len([f for f in zout.namelist() if f.startswith(".sbt.zzz/")]) == 5
@utils.in_thisdir
@@ -764,14 +816,19 @@ def test_zipfile_dayhoff_command_search_protein(c):
assert 'no compatible signatures found in ' in c.last_result.err
-def test_zipfile_API_signatures():
+def test_zipfile_API_signatures(use_manifest):
# return all of the .sig and .sig.gz files in all.zip
zipfile_db = utils.get_test_data('prot/all.zip')
- zipidx = ZipFileLinearIndex.load(zipfile_db)
+ zipidx = ZipFileLinearIndex.load(zipfile_db, use_manifest=use_manifest)
siglist = list(zipidx.signatures())
- assert len(siglist) == 7
- assert len(zipidx) == 7
+
+ if use_manifest:
+ assert len(siglist) == 8
+ assert len(zipidx) == 8
+ else:
+ assert len(siglist) == 7
+ assert len(zipidx) == 7
def test_zipfile_bool():
@@ -800,11 +857,12 @@ def __len__(self):
assert "don't call len!" in str(exc.value)
-def test_zipfile_API_signatures_traverse_yield_all():
+def test_zipfile_API_signatures_traverse_yield_all(use_manifest):
# include dna-sig.noext, but not build.sh (cannot be loaded as signature)
zipfile_db = utils.get_test_data('prot/all.zip')
- zipidx = ZipFileLinearIndex.load(zipfile_db, traverse_yield_all=True)
+ zipidx = ZipFileLinearIndex.load(zipfile_db, traverse_yield_all=True,
+ use_manifest=use_manifest)
siglist = list(zipidx.signatures())
assert len(siglist) == 8
assert len(zipidx) == 8
@@ -813,29 +871,111 @@ def test_zipfile_API_signatures_traverse_yield_all():
zf = zipidx.zf
allfiles = [ zi.filename for zi in zf.infolist() ]
print(allfiles)
- assert len(allfiles) == 12
+ assert len(allfiles) == 13
-def test_zipfile_API_signatures_traverse_yield_all_select():
+def test_zipfile_API_signatures_traverse_yield_all_select(use_manifest):
# include dna-sig.noext
zipfile_db = utils.get_test_data('prot/all.zip')
- zipidx = ZipFileLinearIndex.load(zipfile_db, traverse_yield_all=True)
+ zipidx = ZipFileLinearIndex.load(zipfile_db, traverse_yield_all=True,
+ use_manifest=use_manifest)
zipidx = zipidx.select(moltype='DNA')
siglist = list(zipidx.signatures())
assert len(siglist) == 2
assert len(zipidx) == 2
-def test_zipfile_API_signatures_select():
+def test_zipfile_API_signatures_select(use_manifest):
# include dna-sig.noext
zipfile_db = utils.get_test_data('prot/all.zip')
- zipidx = ZipFileLinearIndex.load(zipfile_db)
+ zipidx = ZipFileLinearIndex.load(zipfile_db, use_manifest=use_manifest)
+ ziplist_pre = LinearIndex(zipidx.signatures())
+ ziplist_pre = ziplist_pre.select(moltype='DNA')
+
+ zipidx = zipidx.select(moltype='DNA')
+ siglist = list(zipidx.signatures())
+
+ if use_manifest:
+ assert len(siglist) == 2
+ assert len(zipidx) == 2
+ assert len(ziplist_pre) == 2
+ else:
+ assert len(siglist) == 1
+ assert len(zipidx) == 1
+ assert len(ziplist_pre) == 1
+
+
+def test_zipfile_API_signatures_select_abund_false(use_manifest):
+ # check for abund=False (all signatures match b/c can convert)
+ zipfile_db = utils.get_test_data('track_abund/track_abund.zip')
+
+ zipidx = ZipFileLinearIndex.load(zipfile_db, use_manifest=use_manifest)
+ ziplist_pre = LinearIndex(zipidx.signatures())
+ ziplist_pre = ziplist_pre.select(abund=False)
+
+ zipidx = zipidx.select(abund=False)
+ siglist = list(zipidx.signatures())
+
+ assert len(siglist) == 2
+ assert len(zipidx) == 2
+ assert len(ziplist_pre) == 2
+
+
+def test_zipfile_API_signatures_select_abund_true(use_manifest):
+ # find all abund=True (all signatures match, b/c abund)
+ zipfile_db = utils.get_test_data('track_abund/track_abund.zip')
+
+ zipidx = ZipFileLinearIndex.load(zipfile_db, use_manifest=use_manifest)
+ ziplist_pre = LinearIndex(zipidx.signatures())
+ ziplist_pre = ziplist_pre.select(abund=True)
+
+ zipidx = zipidx.select(abund=True)
+ siglist = list(zipidx.signatures())
+
+ assert len(siglist) == 2
+ assert len(zipidx) == 2
+ assert len(ziplist_pre) == 2
+
+
+def test_zipfile_API_signatures_select_abund_none(use_manifest):
+ # find all abund=None (all signatures match, b/c no selection criteria)
+ zipfile_db = utils.get_test_data('track_abund/track_abund.zip')
+
+ zipidx = ZipFileLinearIndex.load(zipfile_db, use_manifest=use_manifest)
+ ziplist_pre = LinearIndex(zipidx.signatures())
+ ziplist_pre = ziplist_pre.select(abund=None)
+
+ zipidx = zipidx.select(abund=None)
+ siglist = list(zipidx.signatures())
+
+ assert len(siglist) == 2
+ assert len(zipidx) == 2
+ assert len(ziplist_pre) == 2
+
+
+def test_zipfile_API_signatures_select_twice(use_manifest):
+ # include dna-sig.noext
+ zipfile_db = utils.get_test_data('prot/all.zip')
+
+ zipidx = ZipFileLinearIndex.load(zipfile_db, use_manifest=use_manifest)
+ ziplist_pre = LinearIndex(zipidx.signatures())
+ ziplist_pre = ziplist_pre.select(moltype='DNA')
+ ziplist_pre = ziplist_pre.select(ksize=31)
+
zipidx = zipidx.select(moltype='DNA')
+ zipidx = zipidx.select(ksize=31)
siglist = list(zipidx.signatures())
- assert len(siglist) == 1
- assert len(zipidx) == 1
+
+ if use_manifest:
+ assert len(siglist) == 2
+ assert len(zipidx) == 2
+ assert len(ziplist_pre) == 2
+ else:
+ assert len(siglist) == 1
+ assert len(zipidx) == 1
+ assert len(ziplist_pre) == 1
def test_zipfile_API_save():
@@ -857,37 +997,42 @@ def test_zipfile_API_insert():
zipidx.insert(None)
-def test_zipfile_API_location():
+def test_zipfile_API_location(use_manifest):
zipfile_db = utils.get_test_data('prot/all.zip')
- zipidx = ZipFileLinearIndex.load(zipfile_db)
+ zipidx = ZipFileLinearIndex.load(zipfile_db, use_manifest=use_manifest)
assert zipidx.location == zipfile_db
-def test_zipfile_load_file_as_signatures():
+def test_zipfile_load_file_as_signatures(use_manifest):
from types import GeneratorType
zipfile_db = utils.get_test_data('prot/all.zip')
- sigs = sourmash_args.load_file_as_signatures(zipfile_db)
+ sigs = sourmash_args.load_file_as_signatures(zipfile_db,
+ _use_manifest=use_manifest)
# it's fine if this needs to change, but for now I want to make
- # sure that this is generator.
+ # sure that this is a generator.
assert isinstance(sigs, GeneratorType)
sigs = list(sigs)
- assert len(sigs) == 7
+ if use_manifest:
+ assert len(sigs) == 8
+ else:
+ assert len(sigs) == 7
-def test_zipfile_load_file_as_signatures_traverse_yield_all():
+def test_zipfile_load_file_as_signatures_traverse_yield_all(use_manifest):
from types import GeneratorType
zipfile_db = utils.get_test_data('prot/all.zip')
sigs = sourmash_args.load_file_as_signatures(zipfile_db,
- yield_all_files=True)
+ yield_all_files=True,
+ _use_manifest=use_manifest)
# it's fine if this needs to change, but for now I want to make
- # sure that this is generator.
+ # sure that this is a generator.
assert isinstance(sigs, GeneratorType)
sigs = list(sigs)
@@ -920,8 +1065,8 @@ def test_multi_index_search():
lidx2 = LinearIndex.load(sig47)
lidx3 = LinearIndex.load(sig63)
- # create MultiIindex with source location override
- lidx = MultiIndex([lidx1, lidx2, lidx3], ['A', None, 'C'])
+ # create MultiIndex with source location override
+ lidx = MultiIndex.load([lidx1, lidx2, lidx3], ['A', None, 'C'])
lidx = lidx.select(ksize=31)
# now, search for sig2
@@ -973,8 +1118,8 @@ def test_multi_index_gather():
lidx2 = LinearIndex.load(sig47)
lidx3 = LinearIndex.load(sig63)
- # create MultiIindex with source location override
- lidx = MultiIndex([lidx1, lidx2, lidx3], ['A', None, 'C'])
+ # create MultiIndex with source location override
+ lidx = MultiIndex.load([lidx1, lidx2, lidx3], ['A', None, 'C'])
lidx = lidx.select(ksize=31)
matches = lidx.gather(ss2)
@@ -1002,8 +1147,8 @@ def test_multi_index_signatures():
lidx2 = LinearIndex.load(sig47)
lidx3 = LinearIndex.load(sig63)
- # create MultiIindex with source location override
- lidx = MultiIndex([lidx1, lidx2, lidx3], ['A', None, 'C'])
+ # create MultiIndex with source location override
+ lidx = MultiIndex.load([lidx1, lidx2, lidx3], ['A', None, 'C'])
lidx = lidx.select(ksize=31)
siglist = list(lidx.signatures())
@@ -1014,21 +1159,39 @@ def test_multi_index_signatures():
def test_multi_index_load_from_path():
+ # test MultiIndex loading from a directory. The full paths to the
+ # signature files should be available via 'signatures_with_location()'
dirname = utils.get_test_data('prot/protein')
mi = MultiIndex.load_from_path(dirname, force=False)
sigs = list(mi.signatures())
assert len(sigs) == 2
+ # check to make sure that full paths to expected sig files are returned
+ locs = [ x[1] for x in mi.signatures_with_location() ]
+
+ endings = ('GCA_001593925.1_ASM159392v1_protein.faa.gz.sig',
+ 'GCA_001593935.1_ASM159393v1_protein.faa.gz.sig')
+ for loc in locs:
+ found = False
+ for end in endings:
+ if loc.endswith(end):
+ found = True
+ assert found, f"could not find full filename in locations for {end}"
+
+ # also check internal locations and parent value --
+ assert mi.parent.endswith('prot/protein')
+
+ ilocs = [ x[2] for x in mi._signatures_with_internal() ]
+ assert endings[0] in ilocs, ilocs
+ assert endings[1] in ilocs, ilocs
+
def test_multi_index_load_from_path_2():
# only load .sig files, currently; not the databases under that directory.
dirname = utils.get_test_data('prot')
mi = MultiIndex.load_from_path(dirname, force=False)
- print(mi.index_list)
- print(mi.source_list)
-
sigs = list(mi.signatures())
assert len(sigs) == 7
@@ -1067,9 +1230,6 @@ def test_multi_index_load_from_path_3_yield_all_true(c):
mi = MultiIndex.load_from_path(c.location, force=True)
- print(mi.index_list)
- print(mi.source_list)
-
sigs = list(mi.signatures())
assert len(sigs) == 8
@@ -1093,9 +1253,6 @@ def test_multi_index_load_from_path_3_yield_all_true_subdir(c):
mi = MultiIndex.load_from_path(c.location, force=True)
- print(mi.index_list)
- print(mi.source_list)
-
sigs = list(mi.signatures())
assert len(sigs) == 8
@@ -1118,9 +1275,6 @@ def test_multi_index_load_from_path_3_sig_gz(c):
mi = MultiIndex.load_from_path(c.location, force=False)
- print(mi.index_list)
- print(mi.source_list)
-
sigs = list(mi.signatures())
assert len(sigs) == 6
@@ -1135,7 +1289,7 @@ def test_multi_index_load_from_path_3_check_traverse_fn(c):
assert len(files) == 7, files
files = list(sourmash_args.traverse_find_sigs([dirname], True))
- assert len(files) == 20, files
+ assert len(files) == 20, files # if this fails, check for extra files!
def test_multi_index_load_from_path_no_exist():
@@ -1168,9 +1322,11 @@ def test_multi_index_load_from_pathlist_1(c):
@utils.in_tempdir
def test_multi_index_load_from_pathlist_2(c):
+ # CTB note: if you create extra files under this directory,
+ # it will fail :)
dirname = utils.get_test_data('prot')
files = list(sourmash_args.traverse_find_sigs([dirname], True))
- assert len(files) == 20, files
+ assert len(files) == 20, files # check there aren't extra files in here!
file_list = c.output('filelist.txt')
@@ -1192,7 +1348,7 @@ def test_multi_index_load_from_pathlist_3_zipfile(c):
print(zipfile, file=fp)
mi = MultiIndex.load_from_pathlist(file_list)
- assert len(mi) == 7
+ assert len(mi) == 8
##
## test a slightly outre version of JaccardSearch - this is a test of the
@@ -2028,7 +2184,7 @@ def test_lazy_index_5_len():
len(lazy)
-def test_lazy_index_wraps_multiindex_location():
+def test_lazy_index_wraps_multi_index_location():
sigdir = utils.get_test_data('prot/protein/')
sigzip = utils.get_test_data('prot/protein.zip')
siglca = utils.get_test_data('prot/protein.lca.json.gz')
@@ -2037,7 +2193,7 @@ def test_lazy_index_wraps_multiindex_location():
db_paths = (sigdir, sigzip, siglca, sigsbt)
dbs = [ sourmash.load_file_as_index(db_path) for db_path in db_paths ]
- mi = MultiIndex(dbs, db_paths)
+ mi = MultiIndex.load(dbs, db_paths)
lazy = LazyLinearIndex(mi)
mi2 = mi.select(moltype='protein')
@@ -2046,3 +2202,82 @@ def test_lazy_index_wraps_multiindex_location():
for (ss_tup, ss_lazy_tup) in zip(mi2.signatures_with_location(),
lazy2.signatures_with_location()):
assert ss_tup == ss_lazy_tup
+
+
+def test_lazy_loaded_index_1(runtmp):
+ # some basic tests for LazyLoadedIndex
+ lcafile = utils.get_test_data('prot/protein.lca.json.gz')
+ sigzip = utils.get_test_data('prot/protein.zip')
+
+ with pytest.raises(ValueError) as exc:
+ db = index.LazyLoadedIndex.load(lcafile)
+ # no manifest on LCA database
+ assert "no manifest on index at" in str(exc)
+
+ # load something, check that it's only accessed upon .signatures(...)
+ test_zip = runtmp.output('test.zip')
+ shutil.copyfile(sigzip, test_zip)
+ db = index.LazyLoadedIndex.load(test_zip)
+ assert len(db) == 2
+ assert db.location == test_zip
+
+ # now remove!
+ os.unlink(test_zip)
+
+ # can still access manifest...
+ assert len(db) == 2
+
+ # ...but we should get an error when we call signatures.
+ with pytest.raises(FileNotFoundError):
+ list(db.signatures())
+
+ # but put it back, and all is forgiven. yay!
+ shutil.copyfile(sigzip, test_zip)
+ x = list(db.signatures())
+ assert len(x) == 2
+
+
+def test_lazy_loaded_index_2_empty(runtmp):
+ # some basic tests for LazyLoadedIndex that is empty
+ sigzip = utils.get_test_data('prot/protein.zip')
+
+ # load something:
+ test_zip = runtmp.output('test.zip')
+ shutil.copyfile(sigzip, test_zip)
+ db = index.LazyLoadedIndex.load(test_zip)
+ assert len(db) == 2
+ assert db.location == test_zip
+ assert bool(db)
+
+ # select to empty:
+ db = db.select(ksize=50)
+
+ assert len(db) == 0
+ assert not bool(db)
+
+ x = list(db.signatures())
+ assert len(x) == 0
+
+
+def test_lazy_loaded_index_3_find(runtmp):
+ # test 'find'
+ query_file = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
+ sigzip = utils.get_test_data('prot/protein.zip')
+
+ # load something:
+ test_zip = runtmp.output('test.zip')
+ shutil.copyfile(sigzip, test_zip)
+ db = index.LazyLoadedIndex.load(test_zip)
+
+ # can we find matches? should find two.
+ query = sourmash.load_one_signature(query_file)
+ assert query.minhash.ksize == 19
+ x = db.search(query, threshold=0.0)
+ x = list(x)
+ assert len(x) == 2
+
+ # no matches!
+ db = db.select(ksize=20)
+ x = db.search(query, threshold=0.0)
+ x = list(x)
+ assert len(x) == 0
diff --git a/tests/test_lca.py b/tests/test_lca.py
index a6443dbc6c..faa9149d72 100644
--- a/tests/test_lca.py
+++ b/tests/test_lca.py
@@ -11,8 +11,10 @@
import sourmash
from sourmash import load_one_signature, SourmashSignature
+from sourmash.search import make_jaccard_search_query
from sourmash.lca import lca_utils
from sourmash.lca.lca_utils import LineagePair
+from sourmash.picklist import SignaturePicklist, PickStyle
def test_api_create_search():
@@ -31,6 +33,74 @@ def test_api_create_search():
assert match.minhash == ss.minhash
+def test_api_find_picklist_select():
+ # does 'find' respect picklists?
+
+ sig47 = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'),
+ ksize=31)
+ sig63 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'),
+ ksize=31)
+
+ lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000)
+ lca_db.insert(sig47)
+ lca_db.insert(sig63)
+
+ # construct a picklist...
+ picklist = SignaturePicklist('md5prefix8')
+ picklist.init(['09a08691'])
+
+ # run a 'find' with sig63, should find 47 and 63 both.
+ search_obj = make_jaccard_search_query(do_containment=True, threshold=0.0)
+ results = list(lca_db.find(search_obj, sig63))
+ print(results)
+ assert len(results) == 2
+
+ # now, select on picklist and do another find...
+ lca_db = lca_db.select(picklist=picklist)
+ results = list(lca_db.find(search_obj, sig63))
+ print(results)
+ assert len(results) == 1
+
+ # and check that it is the expected one!
+ ss = results[0].signature
+ assert ss.minhash.ksize == 31
+ assert ss.md5sum().startswith('09a08691c')
+
+
+def test_api_find_picklist_select_exclude():
+ # does 'find' respect picklists?
+
+ sig47 = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'),
+ ksize=31)
+ sig63 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'),
+ ksize=31)
+
+ lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000)
+ lca_db.insert(sig47)
+ lca_db.insert(sig63)
+
+ # construct a picklist...
+ picklist = SignaturePicklist('md5prefix8', pickstyle= PickStyle.EXCLUDE)
+ picklist.init(['09a08691'])
+
+ # run a 'find' with sig63, should find 47 and 63 both.
+ search_obj = make_jaccard_search_query(do_containment=True, threshold=0.0)
+ results = list(lca_db.find(search_obj, sig63))
+ print(results)
+ assert len(results) == 2
+
+ # now, select on picklist and do another find...
+ lca_db = lca_db.select(picklist=picklist)
+ results = list(lca_db.find(search_obj, sig63))
+ print(results)
+ assert len(results) == 1
+
+ # and check that it is the expected one!
+ ss = results[0].signature
+ assert ss.minhash.ksize == 31
+ assert ss.md5sum().startswith('38729c637')
+
+
def test_api_create_insert():
# test some internal implementation stuff: create & then insert a sig.
ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'),
@@ -453,12 +523,98 @@ def test_lca_index_select():
xx = db.select(moltype='DNA')
assert xx == db
+ xx = db.select(abund=False)
+ assert xx == db
+
with pytest.raises(ValueError):
db.select(ksize=21)
with pytest.raises(ValueError):
db.select(moltype='protein')
+ with pytest.raises(ValueError):
+ db.select(abund=True)
+
+
+def test_lca_index_select_picklist():
+ # test 'select' method from Index base class with a picklist.
+
+ filename = utils.get_test_data('lca/47+63.lca.json')
+ db, ksize, scaled = lca_utils.load_single_database(filename)
+
+ # construct a picklist...
+ picklist = SignaturePicklist('md5prefix8')
+ picklist.init(['50a92740'])
+
+ xx = db.select(picklist=picklist)
+ assert xx == db
+
+ siglist = list(db.signatures())
+ assert len(siglist) == 1
+ ss = siglist[0]
+ assert ss.md5sum().startswith('50a92740')
+ assert ss.minhash.ksize == 31
+
+
+def test_lca_index_find_picklist_check_overlap():
+ # make sure 'find' works for picklists that exclude relevant signatures
+ # (bug #1638)
+
+ query_fn = utils.get_test_data('47.fa.sig')
+ query_sig = sourmash.load_one_signature(query_fn, ksize=31)
+ db_fn = utils.get_test_data('lca/47+63.lca.json')
+ db, ksize, scaled = lca_utils.load_single_database(db_fn)
+
+ # construct a picklist...
+ picklist = SignaturePicklist('ident')
+ picklist.init(['NC_009665.1'])
+
+ xx = db.select(picklist=picklist)
+ assert xx == db
+
+ results = list(db.search(query_sig, threshold=0.1))
+ assert len(results) == 1
+
+
+def test_lca_index_select_picklist_exclude():
+ # test 'select' method from Index base class with a picklist.
+
+ filename = utils.get_test_data('lca/47+63.lca.json')
+ db, ksize, scaled = lca_utils.load_single_database(filename)
+
+ # construct a picklist...
+ picklist = SignaturePicklist('md5prefix8', pickstyle=PickStyle.EXCLUDE)
+ picklist.init(['50a92740'])
+
+ xx = db.select(picklist=picklist)
+ assert xx == db
+
+ siglist = list(db.signatures())
+ assert len(siglist) == 1
+ ss = siglist[0]
+ assert ss.md5sum().startswith('e88dc390')
+ assert ss.minhash.ksize == 31
+
+
+def test_lca_index_select_picklist_twice():
+ # test 'select' method from Index base class with a picklist.
+
+ filename = utils.get_test_data('lca/47+63.lca.json')
+ db, ksize, scaled = lca_utils.load_single_database(filename)
+
+ # construct a picklist...
+ picklist = SignaturePicklist('md5prefix8')
+ picklist.init(['50a92740'])
+
+ xx = db.select(picklist=picklist)
+ assert xx == db
+
+ with pytest.raises(ValueError) as exc:
+ xx = db.select(picklist=picklist)
+
+ assert "we do not (yet) support multiple picklists for LCA databases" in str(exc)
+
+
def test_search_db_scaled_gt_sig_scaled():
dbfile = utils.get_test_data('lca/47+63.lca.json')
@@ -2438,3 +2594,59 @@ def test_lca_db_dayhoff_command_search(c):
c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0')
assert 'found 1 matches total' in c.last_result.out
assert 'the recovered matches hit 100.0% of the query' in c.last_result.out
+
+
+def test_lca_index_with_picklist(runtmp):
+ gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig'))
+ outdb = runtmp.output('gcf.lca.json')
+ picklist = utils.get_test_data('gather/thermotoga-picklist.csv')
+
+ # create an empty spreadsheet
+ with open(runtmp.output('empty.csv'), 'wt') as fp:
+ fp.write('accession,superkingdom,phylum,class,order,family,genus,species,strain')
+
+ runtmp.sourmash('lca', 'index', 'empty.csv', outdb, *gcf_sigs,
+ '-k', '21', '--picklist', f"{picklist}:md5:md5")
+
+ out = runtmp.last_result.out
+ err = runtmp.last_result.err
+
+ print(out)
+ print(err)
+
+ assert "for given picklist, found 3 matches to 9 distinct values" in err
+ assert "WARNING: 6 missing picklist values."
+ assert "WARNING: no lineage provided for 3 signatures" in err
+
+ siglist = list(sourmash.load_file_as_signatures(outdb))
+ assert len(siglist) == 3
+ for ss in siglist:
+ assert 'Thermotoga' in ss.name
+
+
+def test_lca_index_with_picklist_exclude(runtmp):
+ gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig'))
+ outdb = runtmp.output('gcf.lca.json')
+ picklist = utils.get_test_data('gather/thermotoga-picklist.csv')
+
+ # create an empty spreadsheet
+ with open(runtmp.output('empty.csv'), 'wt') as fp:
+ fp.write('accession,superkingdom,phylum,class,order,family,genus,species,strain')
+
+ runtmp.sourmash('lca', 'index', 'empty.csv', outdb, *gcf_sigs,
+ '-k', '21', '--picklist', f"{picklist}:md5:md5:exclude")
+
+ out = runtmp.last_result.out
+ err = runtmp.last_result.err
+
+ print(out)
+ print(err)
+
+ assert "for given picklist, found 9 matches by excluding 9 distinct values" in err
+ assert "WARNING: 3 missing picklist values."
+ assert "WARNING: no lineage provided for 9 signatures" in err
+
+ siglist = list(sourmash.load_file_as_signatures(outdb))
+ assert len(siglist) == 9
+ for ss in siglist:
+ assert 'Thermotoga' not in ss.name
diff --git a/tests/test_manifest.py b/tests/test_manifest.py
new file mode 100644
index 0000000000..35f3fec14e
--- /dev/null
+++ b/tests/test_manifest.py
@@ -0,0 +1,93 @@
+"""
+Tests for manifest code in databases, etc.
+"""
+from io import StringIO
+
+import sourmash
+from sourmash import index
+
+import sourmash_tst_utils as utils
+
+
+def test_generate_manifest():
+ # test basic manifest-generating functionality.
+ protzip = utils.get_test_data('prot/protein.zip')
+
+ loader = sourmash.load_file_as_index(protzip)
+
+ rows = []
+ siglist = []
+ for (sig, _, loc) in loader._signatures_with_internal():
+ row = index.CollectionManifest.make_manifest_row(sig, loc)
+ rows.append(row)
+ siglist.append(sig)
+
+ manifest = index.CollectionManifest(rows)
+
+ assert len(manifest) == len(rows)
+ assert len(manifest) == 2
+
+ md5_list = [ row['md5'] for row in manifest.rows ]
+ assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list
+ assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list
+
+ for sig in siglist:
+ assert sig in manifest
+
+
+def test_manifest_to_picklist():
+ # test manifest/picklist interaction basics
+ protzip = utils.get_test_data('prot/protein.zip')
+
+ loader = sourmash.load_file_as_index(protzip)
+
+ rows = []
+ siglist = []
+ for (sig, _, loc) in loader._signatures_with_internal():
+ row = index.CollectionManifest.make_manifest_row(sig, loc)
+ rows.append(row)
+ siglist.append(sig)
+
+ manifest = index.CollectionManifest(rows)
+ picklist = manifest.to_picklist()
+ assert len(picklist.pickset) == len(manifest)
+
+ new_manifest = manifest.select_to_manifest(picklist=picklist)
+ assert len(new_manifest) == len(manifest)
+
+
+def test_save_load_manifest():
+ # test saving and loading manifests
+ protzip = utils.get_test_data('prot/protein.zip')
+
+ loader = sourmash.load_file_as_index(protzip)
+
+ rows = []
+ siglist = []
+ for (sig, _, loc) in loader._signatures_with_internal():
+ row = index.CollectionManifest.make_manifest_row(sig, loc)
+ rows.append(row)
+ siglist.append(sig)
+
+ manifest = index.CollectionManifest(rows)
+
+ # now, on to CSV
+ fp = StringIO()
+ manifest.write_csv_header(fp)
+ manifest.write_to_csv(fp)
+
+ rfp = StringIO(fp.getvalue())
+ manifest2 = index.CollectionManifest.load_from_csv(rfp)
+
+ assert len(manifest) == len(manifest2)
+
+ pick1 = manifest.to_picklist()
+ pick2 = manifest2.to_picklist()
+
+ # manifest 1 in manifest2?
+ for row in manifest.rows:
+ assert pick2.matches_manifest_row(row)
+
+ # manifest 2 in manifest?
+ for row in manifest2.rows:
+ assert pick1.matches_manifest_row(row)
diff --git a/tests/test_minhash.py b/tests/test_minhash.py
index 509731718f..ed9b7a1f78 100644
--- a/tests/test_minhash.py
+++ b/tests/test_minhash.py
@@ -559,6 +559,24 @@ def test_similarity_1(track_abundance):
assert round(b.similarity(b), 3) == 1.0
+def test_copy(track_abundance):
+ a = MinHash(20, 21, track_abundance=track_abundance)
+ a.add_hash(5)
+ b = a.copy()
+ assert a == b
+ a.add_hash(6)
+ assert a != b
+
+
+def test_frozen_copy(track_abundance):
+ a = MinHash(20, 21, track_abundance=track_abundance)
+ a.add_hash(5)
+ b = a.copy()
+ assert 5 in b.hashes
+ a.add_hash(6)
+ assert 6 not in b.hashes
+
+
def test_mh_copy(track_abundance):
a = MinHash(20, 10, track_abundance=track_abundance)
@@ -1178,6 +1196,22 @@ def test_set_abundance_clear_4():
a.set_abundances({20: 1, 10: 2}, clear=False)
assert a.hashes == {10: 3, 20: 3}
+def test_clear_abundance_on_zero():
+ mh = sourmash.minhash.MinHash(n=0, ksize=31, scaled=1, track_abundance=True)
+ mh.set_abundances({ 1: 5, 2: 3, 3 : 5 })
+ mh.set_abundances({ 1: 0 }, clear=False)
+ assert 1 not in dict(mh.hashes)
+ assert dict(mh.hashes)[2] == 3
+ assert dict(mh.hashes)[3] == 5
+ assert len(mh) == 2
+
+ with pytest.raises(ValueError):
+ mh.set_abundances({ 2: -1 }) # Test on clear = True
+
+ with pytest.raises(ValueError):
+ mh.set_abundances({ 2: -1 }, clear=False)
+
+ assert len(mh) == 2 # Assert that nothing was affected
def test_reset_abundance_initialized():
a = MinHash(1, 4, track_abundance=True)
@@ -1396,6 +1430,28 @@ def test_remove_many(track_abundance):
assert len(a) == 33
assert all(c % 6 != 0 for c in a.hashes)
+def test_remove_minhash(track_abundance):
+ original_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000)
+ added_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000)
+ tested_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000)
+
+ original_mh.add_many(list(range(101)))
+ added_mh.add_many(list(range(101,201))) # contains original in it
+ tested_mh.add_many(list(range(201))) # original + added
+
+ # Now we should expect tested_minhash == original_minhash
+ # Note we are passing a MinHash object instead of an iterable object
+ tested_mh.remove_many(added_mh)
+
+ # Assertion
+ original_sig = signature.SourmashSignature(original_mh)
+ tested_sig = signature.SourmashSignature(tested_mh)
+
+ # Should pass if the hashes list in the same order
+ assert original_mh.hashes == tested_mh.hashes
+ assert len(original_mh) == len(tested_mh)
+ assert original_sig.md5sum() == tested_sig.md5sum()
+
def test_add_many(track_abundance):
a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000)
@@ -1675,10 +1731,27 @@ def test_intersection_1_num():
mh2.add_hash(2)
mh3 = mh1.intersection(mh2)
- print(set(mh3.hashes))
+ print("mh.intersection INTERSECTION HASHES:",set(mh3.hashes))
assert len(mh3) == 1
assert 0 in mh3.hashes
+def test_and_operator():
+ mh1 = MinHash(20, 21)
+ mh1.add_hash(5)
+ mh1.add_hash(6)
+ mh2 = MinHash(20, 21)
+ mh2.add_hash(6)
+ mh2.add_hash(7)
+
+ print("\n \n mh1 EQUALS ", mh1.hashes, "\n mh2 EQUALS", mh2.hashes)
+
+ mh3 = mh1.intersection(mh2)
+ mh4 = mh1 & mh2
+
+ print("\n Intersection hashes (mh3): ", mh3.hashes, "\n '&' hashes: (mh4)", mh4.hashes)
+
+ assert mh3
+ assert mh3 == mh4
def test_intersection_2_scaled():
mh1 = MinHash(0, 21, scaled=1)
@@ -1782,6 +1855,16 @@ def test_intersection_7_full_scaled():
assert mh1.intersection_and_union_size(mh2) == (50, 150)
+def test_intersection_and_union_8_incompatible_ksize():
+ # cannot intersect different ksizes
+ mh1 = MinHash(0, 21, scaled=1)
+ mh2 = MinHash(0, 31, scaled=1)
+
+ with pytest.raises(TypeError) as exc:
+ mh1.intersection_and_union_size(mh2)
+ assert "incompatible MinHash objects" in str(exc)
+
+
def test_merge_abund():
mh1 = MinHash(10, 21, track_abundance=True)
mh2 = MinHash(10, 21, track_abundance=True)
@@ -1792,6 +1875,8 @@ def test_merge_abund():
ret = mh1.merge(mh2)
assert ret is None
+ print("MH1 EQUALS ", mh1.hashes)
+
hashcounts = mh1.hashes
assert len(hashcounts) == 1
assert hashcounts[0] == 4
@@ -1858,12 +1943,8 @@ def test_merge_scaled():
assert len(mh1) == 100
assert len(mh2) == 100
- # add is symmetric:
- mh3 = mh1 + mh2
- mh4 = mh2 + mh1
- assert mh3 == mh4
-
# merge contains all the things
+ mh3 = mh1 + mh2
assert len(mh3) == 150
# everything in either one is in union
@@ -1872,6 +1953,30 @@ def test_merge_scaled():
for k in mh2.hashes:
assert k in mh3.hashes
+def test_add_is_symmetric():
+ mh1 = MinHash(20, 21)
+ mh1.add_hash(5)
+ mh2 = MinHash(20, 21)
+ mh2.add_hash(6)
+ print("\n mh1 EQUALS ", mh1.hashes, "\n mh2 EQUALS", mh2.hashes)
+ mh3 = mh1 + mh2
+ mh4 = mh2 + mh1
+ print("\n mh3 EQUALS ", mh3.hashes, "\n mh4 EQUALS", mh4.hashes)
+ #if mh3 != 0, then it is "true", so it passes
+ assert mh3
+ assert mh3 == mh4
+
+def test_or_equals_add():
+ mh1 = MinHash(20, 21)
+ mh1.add_hash(5)
+ mh2 = MinHash(20, 21)
+ mh2.add_hash(6)
+ print("\n mh1 EQUALS ", mh1.hashes, "\n mh2 EQUALS", mh2.hashes)
+ mh3 = mh1 + mh2
+ mh4 = mh1 | mh2
+ print("\n mh3 EQUALS ", mh3.hashes, "\n mh4 EQUALS", mh4.hashes)
+ assert mh3
+ assert mh3 == mh4
def test_max_containment():
mh1 = MinHash(0, 21, scaled=1, track_abundance=False)
diff --git a/tests/test_prefetch.py b/tests/test_prefetch.py
index da37559d2b..82057b35ab 100644
--- a/tests/test_prefetch.py
+++ b/tests/test_prefetch.py
@@ -4,6 +4,7 @@
import os
import csv
import pytest
+import glob
import sourmash_tst_utils as utils
import sourmash
@@ -270,6 +271,7 @@ def test_prefetch_matching_hashes(runtmp, linear_gather):
intersect.add_many(matches)
ss = sourmash.load_one_signature(matches_out)
+ assert ss.name.endswith('-known')
assert ss.minhash == intersect
@@ -299,6 +301,7 @@ def test_prefetch_nomatch_hashes(runtmp, linear_gather):
remain.remove_many(ss63.minhash.hashes)
ss = sourmash.load_one_signature(nomatch_out)
+ assert ss.name.endswith('-unknown')
assert ss.minhash == remain
@@ -444,3 +447,48 @@ def test_prefetch_basic_many_sigs(runtmp, linear_gather):
assert "total of 10 matching signatures." in c.last_result.err
assert "of 5177 distinct query hashes, 5177 were found in matches above threshold." in c.last_result.err
assert "a total of 0 query hashes remain unmatched." in c.last_result.err
+
+
+def test_prefetch_with_picklist(runtmp):
+ # test 'sourmash prefetch' with picklists
+ gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig'))
+ metag_sig = utils.get_test_data('gather/combined.sig')
+ picklist = utils.get_test_data('gather/thermotoga-picklist.csv')
+
+ runtmp.sourmash('prefetch', metag_sig, *gcf_sigs,
+ '-k', '21', '--picklist', f"{picklist}:md5:md5")
+
+ err = runtmp.last_result.err
+ print(err)
+ assert "for given picklist, found 3 matches to 9 distinct values" in err
+ # these are the different ksizes
+ assert "WARNING: 6 missing picklist values." in err
+
+ out = runtmp.last_result.out
+ print(out)
+
+ assert "total of 3 matching signatures." in err
+ assert "of 1466 distinct query hashes, 453 were found in matches above threshold." in err
+ assert "a total of 1013 query hashes remain unmatched." in err
+
+
+def test_prefetch_with_picklist_exclude(runtmp):
+ # test 'sourmash prefetch' with picklists, exclude
+ gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig'))
+ metag_sig = utils.get_test_data('gather/combined.sig')
+ picklist = utils.get_test_data('gather/thermotoga-picklist.csv')
+
+ runtmp.sourmash('prefetch', metag_sig, *gcf_sigs,
+ '-k', '21', '--picklist', f"{picklist}:md5:md5:exclude")
+
+ err = runtmp.last_result.err
+ print(err)
+ assert "for given picklist, found 9 matches by excluding 9 distinct values" in err
+ # these are the different ksizes
+
+ out = runtmp.last_result.out
+ print(out)
+
+ assert "total of 9 matching signatures." in err
+ assert "of 1466 distinct query hashes, 1013 were found in matches above threshold." in err
+ assert "a total of 453 query hashes remain unmatched." in err
diff --git a/tests/test_sbt.py b/tests/test_sbt.py
index 04efcf276d..566365c3aa 100644
--- a/tests/test_sbt.py
+++ b/tests/test_sbt.py
@@ -14,6 +14,7 @@
from sourmash.sbt_storage import (FSStorage, RedisStorage,
IPFSStorage, ZipStorage)
from sourmash.search import make_jaccard_search_query
+from sourmash.picklist import SignaturePicklist, PickStyle
import sourmash_tst_utils as utils
@@ -231,10 +232,12 @@ def test_search_minhashes():
# this fails if 'search_obj' is calc containment and not similarity.
search_obj = make_jaccard_search_query(threshold=0.08)
results = tree.find(search_obj, to_search.data)
- for sr in results:
+
+ n = 0
+ for n, sr in enumerate(results):
assert to_search.data.jaccard(sr.signature) >= 0.08
- print(results)
+ assert n == 1
def test_binary_nary_tree():
@@ -629,12 +632,168 @@ def test_sbt_as_index_select():
xx = tree.select(moltype='DNA')
assert xx == tree
+ xx = tree.select(abund=False)
+ assert xx == tree
+
with pytest.raises(ValueError):
tree.select(ksize=21)
with pytest.raises(ValueError):
tree.select(moltype='protein')
+ with pytest.raises(ValueError):
+ tree.select(abund=True)
+
+
+def test_sbt_as_index_select_picklist():
+ # test 'select' method from Index base class with a picklist
+
+ factory = GraphFactory(31, 1e5, 4)
+ tree = SBT(factory, d=2)
+
+ sig47 = load_one_signature(utils.get_test_data('47.fa.sig'))
+ sig63 = load_one_signature(utils.get_test_data('63.fa.sig'))
+
+ tree.insert(sig47)
+ tree.insert(sig63)
+
+ # construct a picklist...
+ picklist = SignaturePicklist('md5prefix8')
+ picklist.init(['09a08691'])
+
+ # select on picklist
+ tree = tree.select(picklist=picklist)
+ siglist = list(tree.signatures())
+ assert len(siglist) == 1
+
+ ss = siglist[0]
+ assert ss.minhash.ksize == 31
+ assert ss.md5sum().startswith('09a08691c')
+
+
+def test_sbt_as_index_select_picklist_exclude():
+ # test 'select' method from Index base class with a picklist, exclude
+
+ factory = GraphFactory(31, 1e5, 4)
+ tree = SBT(factory, d=2)
+
+ sig47 = load_one_signature(utils.get_test_data('47.fa.sig'))
+ sig63 = load_one_signature(utils.get_test_data('63.fa.sig'))
+
+ tree.insert(sig47)
+ tree.insert(sig63)
+
+ # construct a picklist...
+ picklist = SignaturePicklist('md5prefix8', pickstyle=PickStyle.EXCLUDE)
+ picklist.init(['09a08691'])
+
+ # select on picklist
+ tree = tree.select(picklist=picklist)
+ siglist = list(tree.signatures())
+ assert len(siglist) == 1
+
+ ss = siglist[0]
+ assert ss.minhash.ksize == 31
+ assert ss.md5sum().startswith('38729c637')
+
+
+def test_sbt_as_index_find_picklist():
+ # test 'select' method from Index base class with a picklist
+
+ factory = GraphFactory(31, 1e5, 4)
+ tree = SBT(factory, d=2)
+
+ sig47 = load_one_signature(utils.get_test_data('47.fa.sig'))
+ sig63 = load_one_signature(utils.get_test_data('63.fa.sig'))
+
+ tree.insert(sig47)
+ tree.insert(sig63)
+
+ # construct a picklist...
+ picklist = SignaturePicklist('md5prefix8')
+ picklist.init(['09a08691'])
+
+ # run a 'find' with sig63, should find 47 and 63 both.
+ search_obj = make_jaccard_search_query(do_containment=True, threshold=0.0)
+ results = list(tree.find(search_obj, sig63))
+ print(results)
+ assert len(results) == 2
+
+ # now, select on picklist and do another find...
+ tree = tree.select(picklist=picklist)
+ results = list(tree.find(search_obj, sig63))
+ print(results)
+ assert len(results) == 1
+
+ # and check that it is the expected one!
+ ss = results[0].signature
+ assert ss.minhash.ksize == 31
+ assert ss.md5sum().startswith('09a08691c')
+
+
+def test_sbt_as_index_find_picklist_exclude():
+ # test 'select' method from Index base class with a picklist
+
+ factory = GraphFactory(31, 1e5, 4)
+ tree = SBT(factory, d=2)
+
+ sig47 = load_one_signature(utils.get_test_data('47.fa.sig'))
+ sig63 = load_one_signature(utils.get_test_data('63.fa.sig'))
+
+ tree.insert(sig47)
+ tree.insert(sig63)
+
+ # construct a picklist...
+ picklist = SignaturePicklist('md5prefix8', pickstyle=PickStyle.EXCLUDE)
+ picklist.init(['09a08691'])
+
+ # run a 'find' with sig63, should find 47 and 63 both.
+ search_obj = make_jaccard_search_query(do_containment=True, threshold=0.0)
+ results = list(tree.find(search_obj, sig63))
+ print(results)
+ assert len(results) == 2
+
+ # now, select on picklist and do another find...
+ tree = tree.select(picklist=picklist)
+ results = list(tree.find(search_obj, sig63))
+ print(results)
+ assert len(results) == 1
+
+ # and check that it is the expected one!
+ ss = results[0].signature
+ assert ss.minhash.ksize == 31
+ assert ss.md5sum().startswith('38729c637')
+
+
+def test_sbt_as_index_find_picklist_twice():
+ # test 'select' method from Index base class with a picklist
+
+ factory = GraphFactory(31, 1e5, 4)
+ tree = SBT(factory, d=2)
+
+ sig47 = load_one_signature(utils.get_test_data('47.fa.sig'))
+ sig63 = load_one_signature(utils.get_test_data('63.fa.sig'))
+
+ tree.insert(sig47)
+ tree.insert(sig63)
+
+ # construct a picklist...
+ picklist = SignaturePicklist('md5prefix8')
+ picklist.init(['09a08691'])
+
+ # run a 'find' with sig63, should find 47 and 63 both.
+ search_obj = make_jaccard_search_query(do_containment=True, threshold=0.0)
+ results = list(tree.find(search_obj, sig63))
+ print(results)
+ assert len(results) == 2
+
+ # now, select twice on picklists...
+ tree = tree.select(picklist=picklist)
+
+ with pytest.raises(ValueError):
+ tree = tree.select(picklist=picklist)
+ assert "we do not (yet) support multiple picklists for SBT databases" in str(exc)
+
def test_sbt_as_index_signatures():
# test 'signatures' method from Index base class.
@@ -983,3 +1142,71 @@ def test_sbt_no_containment_on_num():
results = list(tree.find(search_obj, to_search))
assert "this search requires a scaled signature" in str(exc)
+
+
+def test_build_sbt_zip_with_dups(runtmp):
+ dups_data = utils.get_test_data('duplicate-sigs')
+
+ all_sigs = set(sourmash.load_file_as_signatures(dups_data))
+ assert len(all_sigs) == 4
+
+ runtmp.run_sourmash('index', 'dups.sbt.zip', dups_data)
+ outfile = runtmp.output('dups.sbt.zip')
+
+ sbt_sigs = set(sourmash.load_file_as_signatures(outfile))
+ assert len(sbt_sigs) == 4
+
+ assert all_sigs == sbt_sigs
+
+
+def test_build_sbt_zip_with_dups_exists(runtmp):
+ dups_data = utils.get_test_data('duplicate-sigs')
+
+ all_sigs = set(sourmash.load_file_as_signatures(dups_data))
+ assert len(all_sigs) == 4
+
+ runtmp.run_sourmash('index', 'dups.sbt.zip', dups_data)
+ outfile = runtmp.output('dups.sbt.zip')
+
+ # run again, to see what happens :)
+ runtmp.run_sourmash('index', 'dups.sbt.zip', dups_data)
+ outfile = runtmp.output('dups.sbt.zip')
+
+ sbt_sigs = set(sourmash.load_file_as_signatures(outfile))
+ assert len(sbt_sigs) == 4
+
+ assert all_sigs == sbt_sigs
+
+
+def test_build_sbt_json_with_dups(runtmp):
+ dups_data = utils.get_test_data('duplicate-sigs')
+
+ all_sigs = set(sourmash.load_file_as_signatures(dups_data))
+ assert len(all_sigs) == 4
+
+ runtmp.run_sourmash('index', 'dups.sbt.json', dups_data)
+ outfile = runtmp.output('dups.sbt.json')
+
+ sbt_sigs = set(sourmash.load_file_as_signatures(outfile))
+ assert len(sbt_sigs) == 4
+
+ assert all_sigs == sbt_sigs
+
+
+def test_build_sbt_json_with_dups_exists(runtmp):
+ dups_data = utils.get_test_data('duplicate-sigs')
+
+ all_sigs = set(sourmash.load_file_as_signatures(dups_data))
+ assert len(all_sigs) == 4
+
+ runtmp.run_sourmash('index', 'dups.sbt.json', dups_data)
+ outfile = runtmp.output('dups.sbt.json')
+
+ # run again, see what happens!
+ runtmp.run_sourmash('index', 'dups.sbt.json', dups_data)
+ outfile = runtmp.output('dups.sbt.json')
+
+ sbt_sigs = set(sourmash.load_file_as_signatures(outfile))
+ assert len(sbt_sigs) == 4
+
+ assert all_sigs == sbt_sigs
diff --git a/tests/test_signature.py b/tests/test_signature.py
index 8fdacc6be9..21fd7de717 100644
--- a/tests/test_signature.py
+++ b/tests/test_signature.py
@@ -9,6 +9,45 @@
from sourmash import MinHash
+def test_minhash_copy(track_abundance):
+ e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
+ e.add_kmer("AT" * 10)
+ sig = SourmashSignature(e, name='foo')
+ f = e.copy()
+ assert e == f
+
+
+def test_sig_copy(track_abundance):
+ e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
+ e.add_kmer("AT" * 10)
+ sig1 = SourmashSignature(e, name='foo')
+ sig2 = sig1.copy()
+ assert sig1 == sig2
+
+
+def test_sig_copy_frozen(track_abundance):
+ e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
+ e.add_kmer("AT" * 10)
+ sig1 = SourmashSignature(e, name='foo')
+ sig2 = sig1.copy()
+ assert sig1 == sig2
+ with pytest.raises(TypeError) as e:
+ sig2.minhash.add_hash(5)
+ assert 'FrozenMinHash does not support modification' in str(e.value)
+
+
+def test_sig_copy_frozen_mutable(track_abundance):
+ e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
+ e.add_kmer("AT" * 10)
+ sig1 = SourmashSignature(e, name='foo')
+ sig1.minhash = sig1.minhash.to_mutable()
+ sig2 = sig1.copy()
+ assert sig1 == sig2
+ with pytest.raises(TypeError) as e:
+ sig2.minhash.add_hash(5)
+ assert 'FrozenMinHash does not support modification' in str(e.value)
+
+
def test_compare(track_abundance):
# same content, same name -> equal
e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index 7f0e2aeac8..93da279c3b 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -98,7 +98,7 @@ def test_load_pathlist_from_file_badly_formatted(c):
with pytest.raises(ValueError) as e:
load_pathlist_from_file(file_list)
assert "file '{'a':1}' inside the pathlist does not exist" in str(e.value)
-
+
@utils.in_tempdir
def test_load_pathlist_from_file_badly_formatted_2(c):
@@ -247,6 +247,32 @@ def test_do_basic_compare_using_rna_arg(c):
assert (cmp_out == cmp_calc).all()
+def test_do_basic_compare_using_nucleotide_arg(runtmp):
+ # try doing a basic compare using --nucleotide instead of --dna/--rna
+ c=runtmp
+ import numpy
+ testsigs = utils.get_test_data('genome-s1*.sig')
+ testsigs = glob.glob(testsigs)
+
+ c.run_sourmash('compare', '-o', 'cmp', '-k', '21', '--nucleotide', *testsigs)
+
+ cmp_outfile = c.output('cmp')
+ assert os.path.exists(cmp_outfile)
+ cmp_out = numpy.load(cmp_outfile)
+
+ sigs = []
+ for fn in testsigs:
+ sigs.append(sourmash.load_one_signature(fn, ksize=21,
+ select_moltype='dna'))
+
+ cmp_calc = numpy.zeros([len(sigs), len(sigs)])
+ for i, si in enumerate(sigs):
+ for j, sj in enumerate(sigs):
+ cmp_calc[i][j] = si.similarity(sj)
+
+ assert (cmp_out == cmp_calc).all()
+
+
@utils.in_tempdir
def test_do_compare_quiet(c):
testdata1 = utils.get_test_data('short.fa')
@@ -920,7 +946,10 @@ def test_gather_lca_db(runtmp, linear_gather, prefetch_gather):
runtmp.sourmash('gather', query, lca_db, linear_gather, prefetch_gather)
print(runtmp)
- assert 'NC_009665.1 Shewanella baltica OS185' in str(runtmp.last_result.out)
+ out = runtmp.last_result.out
+
+ assert 'NC_009665.1 Shewanella baltica OS185' in out
+ assert 'WARNING: final scaled was 10000, vs query scaled of 1000' in out
def test_gather_csv_output_filename_bug(runtmp, linear_gather, prefetch_gather):
@@ -1899,7 +1928,7 @@ def test_search_metagenome_traverse_check_csv():
r = csv.DictReader(fp)
for row in r:
filename = row['filename']
- assert filename.startswith(testdata_dir)
+ assert filename.startswith(testdata_dir), filename
# should have full path to file sig was loaded from
assert len(filename) > prefix_len
@@ -2023,6 +2052,51 @@ def test_search_metagenome_downsample_index(c):
assert '12 matches; showing first 3:' in str(c)
+def test_search_with_picklist(runtmp):
+ # test 'sourmash search' with picklists
+ gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig'))
+ metag_sig = utils.get_test_data('gather/combined.sig')
+ picklist = utils.get_test_data('gather/thermotoga-picklist.csv')
+
+ runtmp.sourmash('search', metag_sig, *gcf_sigs, '--containment',
+ '-k', '21', '--picklist', f"{picklist}:md5:md5")
+
+ err = runtmp.last_result.err
+ print(err)
+ assert "for given picklist, found 3 matches to 9 distinct values" in err
+ # these are the different ksizes
+ assert "WARNING: 6 missing picklist values." in err
+
+ out = runtmp.last_result.out
+ print(out)
+ assert "3 matches:" in out
+ assert "13.1% NC_000853.1 Thermotoga" in out
+ assert "13.0% NC_009486.1 Thermotoga" in out
+ assert "12.8% NC_011978.1 Thermotoga" in out
+
+
+def test_search_with_picklist_exclude(runtmp):
+ # test 'sourmash search' with picklists
+ gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig'))
+ metag_sig = utils.get_test_data('gather/combined.sig')
+ picklist = utils.get_test_data('gather/thermotoga-picklist.csv')
+
+ runtmp.sourmash('search', metag_sig, *gcf_sigs, '--containment',
+ '-k', '21', '--picklist', f"{picklist}:md5:md5:exclude")
+
+ err = runtmp.last_result.err
+ print(err)
+ assert "for given picklist, found 9 matches by excluding 9 distinct values" in err
+ # these are the different ksizes
+
+ out = runtmp.last_result.out
+ print(out)
+ assert "9 matches; showing first 3:" in out
+ assert "33.2% NC_003198.1 Salmonella" in out
+ assert "33.1% NC_003197.2 Salmonella" in out
+ assert "32.2% NC_006905.1 Salmonella" in out
+
+
def test_mash_csv_to_sig():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa.msh.dump')
@@ -2064,7 +2138,7 @@ def test_do_sourmash_index_bad_args():
in_directory=location, fail_ok=True)
print(out, err)
- assert 'cannot specify more than one of --dna/--rna/--protein/--hp/--dayhoff' in err
+ assert 'cannot specify more than one of --dna/--rna/--nucleotide/--protein/--hp/--dayhoff' in err
assert status != 0
@@ -2409,7 +2483,7 @@ def test_do_sourmash_index_sparseness():
in_directory=location)
print(out)
- assert len(glob.glob(os.path.join(location, '.sbt.zzz', '*'))) == 2
+ assert len(glob.glob(os.path.join(location, '.sbt.zzz', '*'))) == 3
assert not glob.glob(os.path.join(location, '.sbt.zzz', '*internal*'))
assert 'short.fa' in out
@@ -2685,10 +2759,10 @@ def test_do_sourmash_check_sbt_filenames():
sig_md5s.add(sig.md5sum())
sbt_files = glob.glob(os.path.join(location, '.sbt.zzz', '*'))
- assert len(sbt_files) == 13
+ assert len(sbt_files) == 14
for f in sbt_files:
- if 'internal' in f:
+ if 'internal' in f or f.endswith('zzz.manifest.csv'):
continue
f = os.path.basename(f)
assert f not in sig_names
@@ -2886,6 +2960,49 @@ def test_compare_with_abundance_3():
assert '70.5%' in out
+def test_compare_with_picklist(runtmp):
+ # test 'sourmash compare' with picklists
+ gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig'))
+ picklist = utils.get_test_data('gather/thermotoga-picklist.csv')
+
+ runtmp.sourmash('compare', *gcf_sigs,
+ '-k', '21', '--picklist', f"{picklist}:md5:md5")
+
+ err = runtmp.last_result.err
+ out = runtmp.last_result.out
+ print(runtmp.last_result.out)
+ print(runtmp.last_result.err)
+
+ assert "for given picklist, found 3 matches to 9 distinct values" in err
+ assert "WARNING: 6 missing picklist values." in err
+
+ assert "NC_009486.1 The..." in out
+ assert "NC_000853.1 The..." in out
+ assert "NC_011978.1 The..." in out
+
+
+def test_compare_with_picklist_exclude(runtmp):
+ # test 'sourmash compare' with picklists - exclude
+ gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig'))
+ picklist = utils.get_test_data('gather/thermotoga-picklist.csv')
+
+ runtmp.sourmash('compare', *gcf_sigs,
+ '-k', '21', '--picklist', f"{picklist}:md5:md5:exclude")
+
+ err = runtmp.last_result.err
+ out = runtmp.last_result.out
+ print(runtmp.last_result.out)
+ print(runtmp.last_result.err)
+
+ assert "for given picklist, found 9 matches by excluding 9 distinct values" in err
+
+ assert "NC_004631.1 Sal..." in out
+ assert "NC_006905.1 Sal..." in out
+ assert "NC_003198.1 Sal..." in out
+ assert "NC_002163.1 Cam..." in out
+ assert "NC_011294.1 Sal..." in out
+
+
def test_gather(linear_gather, prefetch_gather):
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
@@ -2928,12 +3045,14 @@ def test_gather_csv(linear_gather, prefetch_gather):
testdata2 = utils.get_test_data('short2.fa')
status, out, err = utils.runscript('sourmash',
['compute', testdata1, testdata2,
- '--scaled', '10'],
+ '--scaled', '10',
+ '--name-from-first'],
in_directory=location)
status, out, err = utils.runscript('sourmash',
['compute', testdata2,
'--scaled', '10',
+ '--name-from-first',
'-o', 'query.fa.sig'],
in_directory=location)
@@ -2968,9 +3087,13 @@ def test_gather_csv(linear_gather, prefetch_gather):
assert float(row['f_unique_to_query']) == 1.0
assert float(row['f_match']) == 1.0
assert row['filename'] == 'zzz'
- assert row['name'].endswith('short2.fa')
+ assert row['name'] == 'tr1 4'
assert row['md5'] == 'c9d5a795eeaaf58e286fb299133e1938'
assert row['gather_result_rank'] == '0'
+ assert row['query_filename'].endswith('short2.fa')
+ assert row['query_name'] == 'tr1 4'
+ assert row['query_md5'] == 'c9d5a795'
+ assert row['query_bp'] == '910'
def test_gather_abund_x_abund(runtmp, prefetch_gather, linear_gather):
@@ -3935,6 +4058,8 @@ def test_gather_query_downsample(linear_gather, prefetch_gather):
assert all(('4.9 Mbp 100.0% 100.0%' in out,
'NC_003197.2' in out))
+ assert 'WARNING: final scaled was 10000, vs query scaled of 500' in out
+
def test_gather_query_downsample_explicit(linear_gather, prefetch_gather):
# do an explicit downsampling to fix `test_gather_query_downsample`
@@ -3959,6 +4084,59 @@ def test_gather_query_downsample_explicit(linear_gather, prefetch_gather):
'NC_003197.2' in out))
+def test_gather_with_picklist(runtmp, linear_gather, prefetch_gather):
+ # test 'sourmash gather' with picklists
+ gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig'))
+ metag_sig = utils.get_test_data('gather/combined.sig')
+ picklist = utils.get_test_data('gather/thermotoga-picklist.csv')
+
+ runtmp.sourmash('gather', metag_sig, *gcf_sigs, '--threshold-bp=0',
+ '-k', '21', '--picklist', f"{picklist}:md5:md5",
+ linear_gather, prefetch_gather)
+
+ err = runtmp.last_result.err
+ print(err)
+ assert "for given picklist, found 3 matches to 9 distinct values" in err
+ # these are the different ksizes
+ assert "WARNING: 6 missing picklist values." in err
+
+ out = runtmp.last_result.out
+ print(out)
+ assert "found 3 matches total;" in out
+ assert "1.9 Mbp 13.1% 100.0% NC_000853.1 Thermotoga" in out
+ assert "1.9 Mbp 11.5% 89.9% NC_011978.1 Thermotoga" in out
+ assert "1.9 Mbp 6.3% 48.4% NC_009486.1 Thermotoga" in out
+
+
+def test_gather_with_picklist_exclude(runtmp, linear_gather, prefetch_gather):
+ # test 'sourmash gather' with picklists - exclude
+ gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig'))
+ metag_sig = utils.get_test_data('gather/combined.sig')
+ picklist = utils.get_test_data('gather/thermotoga-picklist.csv')
+
+ runtmp.sourmash('gather', metag_sig, *gcf_sigs, '--threshold-bp=0',
+ '-k', '21', '--picklist', f"{picklist}:md5:md5:exclude",
+ linear_gather, prefetch_gather)
+
+ err = runtmp.last_result.err
+ print(err)
+ assert "for given picklist, found 9 matches by excluding 9 distinct values" in err
+ # these are the different ksizes
+
+ out = runtmp.last_result.out
+ print(out)
+ assert "found 9 matches total;" in out
+ assert "4.9 Mbp 33.2% 100.0% NC_003198.1 Salmonella enterica subsp..." in out
+ assert "1.6 Mbp 10.7% 100.0% NC_002163.1 Campylobacter jejuni subs..." in out
+ assert "4.8 Mbp 10.4% 31.3% NC_003197.2 Salmonella enterica subsp..." in out
+ assert "4.7 Mbp 5.2% 16.1% NC_006905.1 Salmonella enterica subsp..." in out
+ assert "4.7 Mbp 4.0% 12.6% NC_011080.1 Salmonella enterica subsp..." in out
+ assert "4.6 Mbp 2.9% 9.2% NC_011274.1 Salmonella enterica subsp..." in out
+ assert "4.3 Mbp 2.1% 7.3% NC_006511.1 Salmonella enterica subsp..." in out
+ assert "4.7 Mbp 0.5% 1.5% NC_011294.1 Salmonella enterica subsp..." in out
+ assert "4.5 Mbp 0.1% 0.4% NC_004631.1 Salmonella enterica subsp..." in out
+
+
def test_gather_save_matches(linear_gather, prefetch_gather):
with utils.TempDirectory() as location:
testdata_glob = utils.get_test_data('gather/GCF*.sig')
@@ -4743,7 +4921,7 @@ def test_do_sourmash_index_zipfile(c):
# look internally at the zip file
with zipfile.ZipFile(outfile) as zf:
content = zf.namelist()
- assert len(content) == 25
+ assert len(content) == 26
assert len([c for c in content if 'internal' in c]) == 11
assert ".sbt.zzz/" in content
sbts = [c for c in content if c.endswith(".sbt.json")]
@@ -4783,7 +4961,7 @@ def test_do_sourmash_index_zipfile_append(c):
*second_half)
# UserWarning is raised when there are duplicated entries in the zipfile
print(record)
- assert not record, record
+ #assert not record, record
print(c)
assert c.last_result.status == 0
@@ -4792,9 +4970,413 @@ def test_do_sourmash_index_zipfile_append(c):
# look internally at the zip file
with zipfile.ZipFile(outfile) as zf:
content = zf.namelist()
- assert len(content) == 25
+ print(content)
+ assert len(content) == 26
assert len([c for c in content if 'internal' in c]) == 11
assert ".sbt.zzz/" in content
sbts = [c for c in content if c.endswith(".sbt.json")]
assert len(sbts) == 1
assert sbts[0] == "zzz.sbt.json"
+
+
+def test_index_with_picklist(runtmp):
+ # test 'sourmash index' with picklists
+ gcf_sig_dir = utils.get_test_data('gather/')
+ picklist = utils.get_test_data('gather/thermotoga-picklist.csv')
+
+ output_db = runtmp.output('thermo.sbt.zip')
+
+ runtmp.sourmash('index', output_db, gcf_sig_dir,
+ '-k', '31', '--picklist', f"{picklist}:md5:md5")
+
+ err = runtmp.last_result.err
+ print(err)
+ assert "for given picklist, found 3 matches to 9 distinct values" in err
+
+ # these are the different ksizes
+ assert "WARNING: 6 missing picklist values." in err
+
+ # verify:
+ siglist = list(sourmash.load_file_as_signatures(output_db))
+ assert len(siglist) == 3
+ for ss in siglist:
+ assert 'Thermotoga' in ss.name
+
+
+def test_index_with_picklist_exclude(runtmp):
+ # test 'sourmash index' with picklists - exclude
+ gcf_sig_dir = utils.get_test_data('gather/')
+ picklist = utils.get_test_data('gather/thermotoga-picklist.csv')
+
+ output_db = runtmp.output('thermo-exclude.sbt.zip')
+
+ runtmp.sourmash('index', output_db, gcf_sig_dir,
+ '-k', '31', '--picklist', f"{picklist}:md5:md5:exclude")
+
+ err = runtmp.last_result.err
+ print(err)
+ assert "for given picklist, found 9 matches by excluding 9 distinct values" in err
+
+ # verify:
+ siglist = list(sourmash.load_file_as_signatures(output_db))
+ assert len(siglist) == 9
+ for ss in siglist:
+ assert 'Thermotoga' not in ss.name
+
+
+def test_index_matches_search_with_picklist(runtmp):
+ # test 'sourmash index' with picklists
+ gcf_sig_dir = utils.get_test_data('gather/')
+ gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig'))
+ picklist = utils.get_test_data('gather/thermotoga-picklist.csv')
+ metag_sig = utils.get_test_data('gather/combined.sig')
+
+ output_db = runtmp.output('thermo.sbt.zip')
+
+ runtmp.sourmash('index', output_db, gcf_sig_dir, '-k', '21')
+ print(runtmp.last_result.out)
+ print(runtmp.last_result.err)
+
+ # verify:
+ siglist = list(sourmash.load_file_as_signatures(output_db))
+ assert len(siglist) > 3 # all signatures included...
+
+ n_thermo = 0
+ for ss in siglist:
+ if 'Thermotoga' in ss.name:
+ n_thermo += 1
+
+ assert n_thermo == 3
+
+ runtmp.sourmash('search', metag_sig, output_db, '--containment',
+ '-k', '21', '--picklist', f"{picklist}:md5:md5")
+
+ err = runtmp.last_result.err
+ print(err)
+ assert "for given picklist, found 3 matches to 9 distinct values" in err
+ # these are the different ksizes
+ assert "WARNING: 6 missing picklist values." in err
+
+ out = runtmp.last_result.out
+ print(out)
+ assert "3 matches:" in out
+ assert "13.1% NC_000853.1 Thermotoga" in out
+ assert "13.0% NC_009486.1 Thermotoga" in out
+ assert "12.8% NC_011978.1 Thermotoga" in out
+
+
+def test_index_matches_search_with_picklist_exclude(runtmp):
+ # test 'sourmash index' with picklists - exclude
+ gcf_sig_dir = utils.get_test_data('gather/')
+ gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig'))
+ picklist = utils.get_test_data('gather/thermotoga-picklist.csv')
+ metag_sig = utils.get_test_data('gather/combined.sig')
+
+ output_db = runtmp.output('thermo-exclude.sbt.zip')
+
+ runtmp.sourmash('index', output_db, gcf_sig_dir, '-k', '21')
+ print(runtmp.last_result.out)
+ print(runtmp.last_result.err)
+
+ # verify:
+ siglist = list(sourmash.load_file_as_signatures(output_db))
+ assert len(siglist) > 3 # all signatures included...
+
+ n_thermo = 0
+ for ss in siglist:
+ if 'Thermotoga' in ss.name:
+ n_thermo += 1
+
+ assert n_thermo == 3
+
+ runtmp.sourmash('search', metag_sig, output_db, '--containment',
+ '-k', '21', '--picklist', f"{picklist}:md5:md5:exclude")
+
+ err = runtmp.last_result.err
+ print(err)
+ assert "for given picklist, found 10 matches by excluding 9 distinct values" in err
+ ### NTP: FIX REPORTING
+ assert "WARNING: -1 missing picklist values"
+
+ out = runtmp.last_result.out
+ print(out)
+ assert "10 matches; showing first 3:" in out
+ assert "100.0% -" in out
+ assert "33.2% NC_003198.1 Salmonella" in out
+ assert "33.1% NC_003197.2 Salmonella" in out
+
+
+def test_gather_with_prefetch_picklist(runtmp, linear_gather):
+ # test 'gather' using a picklist taken from 'sourmash prefetch' output
+ gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig'))
+ metag_sig = utils.get_test_data('gather/combined.sig')
+ prefetch_csv = runtmp.output('prefetch-out.csv')
+
+ runtmp.sourmash('prefetch', metag_sig, *gcf_sigs,
+ '-k', '21', '-o', prefetch_csv)
+
+ err = runtmp.last_result.err
+ print(err)
+
+ out = runtmp.last_result.out
+ print(out)
+
+ assert "total of 12 matching signatures." in err
+ assert "of 1466 distinct query hashes, 1466 were found in matches above threshold." in err
+
+ # now, do a gather with the results
+ runtmp.sourmash('gather', metag_sig, *gcf_sigs, linear_gather,
+ '-k', '21', '--picklist',
+ f'{prefetch_csv}:match_md5:md5short')
+
+ err = runtmp.last_result.err
+ print(err)
+
+ out = runtmp.last_result.out
+ print(out)
+
+ assert "found 11 matches total;" in out
+ assert "the recovered matches hit 99.9% of the query" in out
+
+ assert "4.9 Mbp 33.2% 100.0% NC_003198.1 " in out
+ assert "1.9 Mbp 13.1% 100.0% NC_000853.1 " in out
+
+
+def test_gather_with_prefetch_picklist_2_prefetch(runtmp, linear_gather):
+ # test 'gather' using a picklist taken from 'sourmash prefetch' output
+ # using ::prefetch
+ gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig'))
+ metag_sig = utils.get_test_data('gather/combined.sig')
+ prefetch_csv = runtmp.output('prefetch-out.csv')
+
+ runtmp.sourmash('prefetch', metag_sig, *gcf_sigs,
+ '-k', '21', '-o', prefetch_csv)
+
+ err = runtmp.last_result.err
+ print(err)
+
+ out = runtmp.last_result.out
+ print(out)
+
+ assert "total of 12 matching signatures." in err
+ assert "of 1466 distinct query hashes, 1466 were found in matches above threshold." in err
+
+ # now, do a gather with the results
+ runtmp.sourmash('gather', metag_sig, *gcf_sigs, linear_gather,
+ '-k', '21', '--picklist',
+ f'{prefetch_csv}::prefetch')
+
+ err = runtmp.last_result.err
+ print(err)
+
+ out = runtmp.last_result.out
+ print(out)
+
+ assert "found 11 matches total;" in out
+ assert "the recovered matches hit 99.9% of the query" in out
+
+ assert "4.9 Mbp 33.2% 100.0% NC_003198.1 " in out
+ assert "1.9 Mbp 13.1% 100.0% NC_000853.1 " in out
+
+
+def test_gather_with_prefetch_picklist_3_gather(runtmp, linear_gather):
+ # test 'gather' using a picklist taken from 'sourmash gather' output,
+ # using ::gather.
+ # (this doesn't really do anything useful, but it's an ok test :)
+ gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig'))
+ metag_sig = utils.get_test_data('gather/combined.sig')
+ gather_csv = runtmp.output('gather-out.csv')
+
+ runtmp.sourmash('gather', metag_sig, *gcf_sigs,
+ '-k', '21', '-o', gather_csv)
+
+ err = runtmp.last_result.err
+ print(err)
+
+ out = runtmp.last_result.out
+ print(out)
+
+ assert "found 11 matches total;" in out
+ assert "the recovered matches hit 99.9% of the query" in out
+
+ assert "4.9 Mbp 33.2% 100.0% NC_003198.1 " in out
+ assert "1.9 Mbp 13.1% 100.0% NC_000853.1 " in out
+
+ # now, do another gather with the results
+ runtmp.sourmash('gather', metag_sig, *gcf_sigs, linear_gather,
+ '-k', '21', '--picklist',
+ f'{gather_csv}::gather')
+
+ err = runtmp.last_result.err
+ print(err)
+
+ out = runtmp.last_result.out
+ print(out)
+
+ assert "found 11 matches total;" in out
+ assert "the recovered matches hit 99.9% of the query" in out
+
+ assert "4.9 Mbp 33.2% 100.0% NC_003198.1 " in out
+ assert "1.9 Mbp 13.1% 100.0% NC_000853.1 " in out
+
+
+def test_gather_with_prefetch_picklist_3_gather_badcol(runtmp):
+ # test 'gather' using a picklist taken from 'sourmash gather' output,
+ # using ::gather.
+ # (this doesn't really do anything useful, but it's an ok test :)
+ gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig'))
+ metag_sig = utils.get_test_data('gather/combined.sig')
+ gather_csv = runtmp.output('gather-out.csv')
+
+ runtmp.sourmash('gather', metag_sig, *gcf_sigs,
+ '-k', '21', '-o', gather_csv)
+
+ err = runtmp.last_result.err
+ print(err)
+
+ out = runtmp.last_result.out
+ print(out)
+
+ assert "found 11 matches total;" in out
+ assert "the recovered matches hit 99.9% of the query" in out
+
+ assert "4.9 Mbp 33.2% 100.0% NC_003198.1 " in out
+ assert "1.9 Mbp 13.1% 100.0% NC_000853.1 " in out
+
+ # now, do another gather with the results, but with a bad picklist
+ # parameter
+ with pytest.raises(ValueError):
+ runtmp.sourmash('gather', metag_sig, *gcf_sigs,
+ '-k', '21', '--picklist',
+ f'{gather_csv}:FOO:gather')
+
+ err = runtmp.last_result.err
+ print(err)
+
+ out = runtmp.last_result.out
+ print(out)
+
+ assert "ERROR: could not load picklist." in err
+ assert "no column name allowed for coltype 'gather'" in err
+
+
+def test_gather_with_prefetch_picklist_4_manifest(runtmp, linear_gather):
+ # test 'gather' using a picklist taken from 'sourmash sig manifest'
+ # output, using ::manifest.
+ # (this doesn't really do anything useful, but it's an ok test :)
+ gather_dir = utils.get_test_data('gather/')
+ metag_sig = utils.get_test_data('gather/combined.sig')
+ manifest_csv = runtmp.output('manifest.csv')
+
+ runtmp.sourmash('sig', 'manifest', gather_dir, '-o', manifest_csv)
+
+ err = runtmp.last_result.err
+ print(err)
+
+ out = runtmp.last_result.out
+ print(out)
+
+ # now, do a gather on the manifest
+ runtmp.sourmash('gather', metag_sig, gather_dir, linear_gather,
+ '-k', '21', '--picklist',
+ f'{manifest_csv}::manifest')
+
+ err = runtmp.last_result.err
+ print(err)
+
+ out = runtmp.last_result.out
+ print(out)
+
+ assert "found 1 matches total;" in out
+ assert "the recovered matches hit 100.0% of the query" in out
+
+ # the query sig itself is in there, so :shrug: that matches at 100%
+ assert "14.7 Mbp 100.0% 100.0% -" in out
+
+
+def test_gather_with_prefetch_picklist_4_manifest_excl(runtmp, linear_gather):
+ # test 'gather' using a picklist taken from 'sourmash sig manifest'
+ # output, using ::manifest.
+ # (this doesn't really do anything useful, but it's an ok test :)
+ gather_dir = utils.get_test_data('gather/')
+ metag_sig = utils.get_test_data('gather/combined.sig')
+ manifest_csv = runtmp.output('manifest.csv')
+
+ runtmp.sourmash('sig', 'manifest', gather_dir, '-o', manifest_csv)
+
+ err = runtmp.last_result.err
+ print(err)
+
+ out = runtmp.last_result.out
+ print(out)
+
+ # now, do a gather on the manifest
+ runtmp.sourmash('gather', metag_sig, gather_dir, linear_gather,
+ '-k', '21', '--picklist',
+ f'{manifest_csv}::manifest:exclude')
+
+ err = runtmp.last_result.err
+ print(err)
+
+ out = runtmp.last_result.out
+ print(out)
+
+ # excluded everything, so nothing to match!
+ assert "found 0 matches total;" in out
+ assert "the recovered matches hit 0.0% of the query" in out
+
+
+def test_gather_with_prefetch_picklist_5_search(runtmp):
+ # test 'gather' using a picklist taken from 'sourmash prefetch' output
+ # using ::prefetch
+ gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig'))
+ metag_sig = utils.get_test_data('gather/combined.sig')
+ search_csv = runtmp.output('search-out.csv')
+
+ runtmp.sourmash('search', '--containment', metag_sig, *gcf_sigs,
+ '-k', '21', '-o', search_csv)
+
+ err = runtmp.last_result.err
+ print(err)
+
+ out = runtmp.last_result.out
+ print(out)
+
+ assert "12 matches; showing first 3:" in out
+ assert " 33.2% NC_003198.1 Salmonella enterica subsp." in out
+
+ # now, do a gather with the results
+ runtmp.sourmash('gather', metag_sig, *gcf_sigs,
+ '-k', '21', '--picklist',
+ f'{search_csv}::search')
+
+ err = runtmp.last_result.err
+ print(err)
+
+ out = runtmp.last_result.out
+ print(out)
+
+ assert "found 11 matches total;" in out
+ assert "the recovered matches hit 99.9% of the query" in out
+
+ assert "4.9 Mbp 33.2% 100.0% NC_003198.1 " in out
+ assert "1.9 Mbp 13.1% 100.0% NC_000853.1 " in out
+
+
+def test_gather_scaled_1(runtmp, linear_gather, prefetch_gather):
+ # test gather on a sig indexed with scaled=1
+ inp = utils.get_test_data('short.fa')
+ outp = runtmp.output('out.sig')
+
+ # prepare a signature with a scaled of 1
+ runtmp.sourmash('sketch', 'dna', '-p', 'scaled=1,k=31', inp, '-o', outp)
+
+ # run with a low threshold
+ runtmp.sourmash('gather', outp, outp, '--threshold-bp', '0')
+
+ print(runtmp.last_result.out)
+ print('---')
+ print(runtmp.last_result.err)
+
+ assert "1.0 kbp 100.0% 100.0%" in runtmp.last_result.out
+ assert "found 1 matches total;" in runtmp.last_result.out
diff --git a/tests/test_tax.py b/tests/test_tax.py
new file mode 100644
index 0000000000..1940a67d83
--- /dev/null
+++ b/tests/test_tax.py
@@ -0,0 +1,1467 @@
+"""
+Tests for the 'sourmash tax' command line and high level API.
+"""
+import os
+import csv
+import pytest
+
+import sourmash_tst_utils as utils
+from sourmash.tax import tax_utils
+
+## command line tests
+def test_run_sourmash_tax():
+ status, out, err = utils.runscript('sourmash', ['tax'], fail_ok=True)
+ assert status != 0 # no args provided, ok ;)
+
+
+def test_metagenome_stdout_0(runtmp):
+ # test basic metagenome
+ c = runtmp
+
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+
+ c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax)
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+ assert "query_name,rank,fraction,lineage" in c.last_result.out
+ assert 'test1,superkingdom,0.131,d__Bacteria' in c.last_result.out
+ assert "test1,phylum,0.073,d__Bacteria;p__Bacteroidota" in c.last_result.out
+ assert "test1,phylum,0.058,d__Bacteria;p__Proteobacteria" in c.last_result.out
+ assert "test1,class,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia" in c.last_result.out
+ assert "test1,class,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria" in c.last_result.out
+ assert "test1,order,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales" in c.last_result.out
+ assert "test1,order,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales" in c.last_result.out
+ assert "test1,family,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae" in c.last_result.out
+ assert "test1,family,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae" in c.last_result.out
+ assert "test1,genus,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia" in c.last_result.out
+ assert "test1,genus,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella" in c.last_result.out
+ assert "test1,genus,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola" in c.last_result.out
+ assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out
+ assert "test1,species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in c.last_result.out
+ assert "test1,species,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in c.last_result.out
+
+
+def test_metagenome_stdout_0_db(runtmp):
+ # test basic metagenome with sqlite database
+ c = runtmp
+
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+ tax = utils.get_test_data('tax/test.taxonomy.db')
+
+ c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax)
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+ assert "query_name,rank,fraction,lineage" in c.last_result.out
+ assert 'test1,superkingdom,0.131,d__Bacteria' in c.last_result.out
+ assert "test1,phylum,0.073,d__Bacteria;p__Bacteroidota" in c.last_result.out
+ assert "test1,phylum,0.058,d__Bacteria;p__Proteobacteria" in c.last_result.out
+ assert "test1,class,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia" in c.last_result.out
+ assert "test1,class,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria" in c.last_result.out
+ assert "test1,order,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales" in c.last_result.out
+ assert "test1,order,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales" in c.last_result.out
+ assert "test1,family,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae" in c.last_result.out
+ assert "test1,family,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae" in c.last_result.out
+ assert "test1,genus,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia" in c.last_result.out
+ assert "test1,genus,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella" in c.last_result.out
+ assert "test1,genus,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola" in c.last_result.out
+ assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out
+ assert "test1,species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in c.last_result.out
+ assert "test1,species,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in c.last_result.out
+
+
+def test_metagenome_summary_csv_out(runtmp):
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+ csv_base = "out"
+ sum_csv = csv_base + ".summarized.csv"
+ csvout = runtmp.output(sum_csv)
+ outdir = os.path.dirname(csvout)
+
+ runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-dir', outdir)
+
+ print(runtmp.last_result.status)
+ print(runtmp.last_result.out)
+ print(runtmp.last_result.err)
+
+ assert runtmp.last_result.status == 0
+ assert os.path.exists(csvout)
+
+ sum_gather_results = [x.rstrip() for x in open(csvout)]
+ assert f"saving `csv_summary` output to {csvout}" in runtmp.last_result.err
+ assert "query_name,rank,fraction,lineage" in sum_gather_results[0]
+ assert 'test1,superkingdom,0.131,d__Bacteria' in sum_gather_results[1]
+ assert "test1,phylum,0.073,d__Bacteria;p__Bacteroidota" in sum_gather_results[2]
+ assert "test1,phylum,0.058,d__Bacteria;p__Proteobacteria" in sum_gather_results[3]
+ assert "test1,class,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia" in sum_gather_results[4]
+ assert "test1,class,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria" in sum_gather_results[5]
+ assert "test1,order,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales" in sum_gather_results[6]
+ assert "test1,order,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales" in sum_gather_results[7]
+ assert "test1,family,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae" in sum_gather_results[8]
+ assert "test1,family,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae" in sum_gather_results[9]
+ assert "test1,genus,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia" in sum_gather_results[10]
+ assert "test1,genus,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella" in sum_gather_results[11]
+ assert "test1,genus,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola" in sum_gather_results[12]
+ assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in sum_gather_results[13]
+ assert "test1,species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in sum_gather_results[14]
+ assert "test1,species,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in sum_gather_results[15]
+
+
+def test_metagenome_krona_tsv_out(runtmp):
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+ csv_base = "out"
+ kr_csv = csv_base + ".krona.tsv"
+ csvout = runtmp.output(kr_csv)
+ outdir = os.path.dirname(csvout)
+ print("csvout: ", csvout)
+
+ runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base,
+ '--output-format', 'krona', '--rank', 'genus', '--output-dir', outdir)
+
+ print(runtmp.last_result.status)
+ print(runtmp.last_result.out)
+ print(runtmp.last_result.err)
+
+ assert runtmp.last_result.status == 0
+ assert os.path.exists(csvout)
+ assert f"saving `krona` output to {csvout}" in runtmp.last_result.err
+
+ gn_krona_results = [x.rstrip().split('\t') for x in open(csvout)]
+ print("species krona results: \n", gn_krona_results)
+ assert ['fraction', 'superkingdom', 'phylum', 'class', 'order', 'family', 'genus'] == gn_krona_results[0]
+ assert ['0.05815279361459521', 'd__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacterales', 'f__Enterobacteriaceae', 'g__Escherichia'] == gn_krona_results[1]
+ assert ['0.05701254275940707', 'd__Bacteria', 'p__Bacteroidota', 'c__Bacteroidia', 'o__Bacteroidales', 'f__Bacteroidaceae', 'g__Prevotella'] == gn_krona_results[2]
+ assert ['0.015637726014008795', 'd__Bacteria', 'p__Bacteroidota', 'c__Bacteroidia', 'o__Bacteroidales', 'f__Bacteroidaceae', 'g__Phocaeicola'] == gn_krona_results[3]
+
+
+def test_metagenome_lineage_summary_out(runtmp):
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+ csv_base = "out"
+ lin_csv = csv_base + ".lineage_summary.tsv"
+ csvout = runtmp.output(lin_csv)
+ outdir = os.path.dirname(csvout)
+ print("csvout: ", csvout)
+
+ runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax,
+ '-o', csv_base, '--output-format', 'lineage_summary', '--rank',
+ 'genus', '--output-dir', outdir)
+
+ print(runtmp.last_result.status)
+ print(runtmp.last_result.out)
+ print(runtmp.last_result.err)
+
+ assert runtmp.last_result.status == 0
+ assert os.path.exists(csvout)
+ assert f"saving `lineage_summary` output to {csvout}" in runtmp.last_result.err
+
+ gn_lineage_summary = [x.rstrip().split('\t') for x in open(csvout)]
+ print("species lineage summary results: \n", gn_lineage_summary)
+ assert ['lineage', 'test1'] == gn_lineage_summary[0]
+ assert ['d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola', '0.015637726014008795'] == gn_lineage_summary[1]
+ assert ['d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella', '0.05701254275940707'] == gn_lineage_summary[2]
+ assert ['d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia', '0.05815279361459521'] == gn_lineage_summary[3]
+
+
+def test_metagenome_no_taxonomy_fail(runtmp):
+ c = runtmp
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+
+ with pytest.raises(ValueError) as exc:
+ c.run_sourmash('tax', 'metagenome', '-g', g_csv)
+ assert "error: the following arguments are required: -t/--taxonomy-csv" in str(exc.value)
+
+
+def test_metagenome_no_rank_lineage_summary(runtmp):
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+ csv_base = "out"
+
+ with pytest.raises(ValueError) as exc:
+ runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'lineage_summary')
+ assert "Rank (--rank) is required for krona and lineage_summary output formats." in str(exc.value)
+
+
+def test_metagenome_no_rank_krona(runtmp):
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+ csv_base = "out"
+
+ with pytest.raises(ValueError) as exc:
+ runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'krona')
+ assert "Rank (--rank) is required for krona and lineage_summary output formats." in str(exc.value)
+
+
+def test_genome_no_rank_krona(runtmp):
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+ csv_base = "out"
+
+ with pytest.raises(ValueError) as exc:
+ runtmp.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'krona')
+ assert "Rank (--rank) is required for krona output format." in str(exc.value)
+
+
+def test_metagenome_rank_not_available(runtmp):
+ c = runtmp
+
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+
+ with pytest.raises(ValueError) as exc:
+ c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax,
+ '--rank', 'strain')
+
+ print(str(exc.value))
+
+ assert c.last_result.status == -1
+ assert "No taxonomic information provided for rank strain: cannot summarize at this rank" in str(exc.value)
+
+
+def test_metagenome_duplicated_taxonomy_fail(runtmp):
+ c = runtmp
+ # write temp taxonomy with duplicates
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ duplicated_csv = runtmp.output("duplicated_taxonomy.csv")
+ with open(duplicated_csv, 'w') as dup:
+ tax = [x.rstrip() for x in open(taxonomy_csv, 'r')]
+ tax.append(tax[1] + 'FOO') # add first tax_assign again
+ dup.write("\n".join(tax))
+
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+
+ with pytest.raises(ValueError) as exc:
+ c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', duplicated_csv)
+
+ assert "cannot read taxonomy" in str(exc.value)
+ assert "multiple lineages for identifier GCF_001881345" in str(exc.value)
+
+
+def test_metagenome_duplicated_taxonomy_force(runtmp):
+ c = runtmp
+ # write temp taxonomy with duplicates
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ duplicated_csv = runtmp.output("duplicated_taxonomy.csv")
+ with open(duplicated_csv, 'w') as dup:
+ tax = [x.rstrip() for x in open(taxonomy_csv, 'r')]
+ tax.append(tax[1]) # add first tax_assign again
+ dup.write("\n".join(tax))
+
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+
+ c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', duplicated_csv, '--force')
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ # same as stdout test - just check the first few lines
+ assert c.last_result.status == 0
+ assert "rank,fraction,lineage" in c.last_result.out
+ assert 'superkingdom,0.131,d__Bacteria' in c.last_result.out
+ assert "phylum,0.073,d__Bacteria;p__Bacteroidota" in c.last_result.out
+ assert "phylum,0.058,d__Bacteria;p__Proteobacteria" in c.last_result.out
+
+
+def test_metagenome_missing_taxonomy(runtmp):
+ c = runtmp
+ # write temp taxonomy with missing entry
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ subset_csv = runtmp.output("subset_taxonomy.csv")
+ with open(subset_csv, 'w') as subset:
+ tax = [x.rstrip() for x in open(taxonomy_csv, 'r')]
+ subset.write("\n".join(tax[:4]))
+
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+
+ c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', subset_csv)
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+ assert "The following are missing from the taxonomy information: GCF_003471795" in c.last_result.err
+ assert "rank,fraction,lineage" in c.last_result.out
+
+ assert "superkingdom,0.124,d__Bacteria" in c.last_result.out
+ assert "phylum,0.066,d__Bacteria;p__Bacteroidota" in c.last_result.out
+ assert "phylum,0.058,d__Bacteria;p__Proteobacteria" in c.last_result.out
+ assert "class,0.066,d__Bacteria;p__Bacteroidota;c__Bacteroidia" in c.last_result.out
+ assert "class,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria" in c.last_result.out
+ assert "order,0.066,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales" in c.last_result.out
+
+
+def test_metagenome_missing_taxonomy_fail(runtmp):
+ c = runtmp
+ # write temp taxonomy with missing entry
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ subset_csv = runtmp.output("subset_taxonomy.csv")
+ with open(subset_csv, 'w') as subset:
+ tax = [x.rstrip() for x in open(taxonomy_csv, 'r')]
+ subset.write("\n".join(tax[:4]))
+
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+
+ with pytest.raises(ValueError) as exc:
+ c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', subset_csv, '--fail-on-missing-taxonomy')
+
+ print(str(exc.value))
+
+ assert "The following are missing from the taxonomy information: GCF_003471795" in str(exc.value)
+ assert "Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy." in str(exc.value)
+ assert c.last_result.status == -1
+
+
+def test_metagenome_multiple_taxonomy_files_missing(runtmp):
+ c = runtmp
+ # write temp taxonomy with duplicates
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+
+ # gather against mult databases
+ g_csv = utils.get_test_data('tax/test1_x_gtdbrs202_genbank_euks.gather.csv')
+
+ c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', taxonomy_csv, '--force')
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert "of 6, missed 2 lineage assignments." in c.last_result.err
+ assert "query_name,rank,fraction,lineage" in c.last_result.out
+ assert "multtest,superkingdom,0.131,d__Bacteria" in c.last_result.out
+ assert "multtest,phylum,0.073,d__Bacteria;p__Bacteroidota" in c.last_result.out
+ assert "multtest,phylum,0.058,d__Bacteria;p__Proteobacteria" in c.last_result.out
+ assert "multtest,class,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia" in c.last_result.out
+
+
+def test_metagenome_multiple_taxonomy_files(runtmp):
+ c = runtmp
+ # write temp taxonomy with duplicates
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ protozoa_genbank = utils.get_test_data('tax/protozoa_genbank_lineage.csv')
+ bacteria_refseq = utils.get_test_data('tax/bacteria_refseq_lineage.csv')
+
+ # gather against mult databases
+ g_csv = utils.get_test_data('tax/test1_x_gtdbrs202_genbank_euks.gather.csv')
+
+ c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', taxonomy_csv, protozoa_genbank, bacteria_refseq)
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert "of 6, missed 0 lineage assignments." in c.last_result.err
+ assert "query_name,rank,fraction,lineage" in c.last_result.out
+ assert "multtest,superkingdom,0.245,Eukaryota" in c.last_result.out
+ assert "multtest,superkingdom,0.131,Bacteria" in c.last_result.out
+ assert "multtest,phylum,0.245,Eukaryota;Apicomplexa" in c.last_result.out
+ assert "multtest,phylum,0.073,Bacteria;Bacteroidetes" in c.last_result.out
+ #assert "multtest,phylum,0.073,d__Bacteria;p__Bacteroidota" in c.last_result.out # this is gtdb tax, line above is genbank...
+
+
+def test_metagenome_empty_gather_results(runtmp):
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+
+ #creates empty gather result
+ g_csv = runtmp.output('g.csv')
+ with open(g_csv, "w") as fp:
+ fp.write("")
+ print("g_csv: ", g_csv)
+
+ with pytest.raises(ValueError) as exc:
+ runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax)
+
+ assert f'Cannot read gather results from {g_csv}. Is file empty?' in str(exc.value)
+ assert runtmp.last_result.status == -1
+
+
+def test_metagenome_bad_gather_header(runtmp):
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+
+ bad_g_csv = runtmp.output('g.csv')
+
+ #creates bad gather result
+ bad_g = [x.replace("name", "nope") for x in open(g_csv, 'r')]
+ with open(bad_g_csv, 'w') as fp:
+ for line in bad_g:
+ fp.write(line)
+ print("bad_gather_results: \n", bad_g)
+
+ with pytest.raises(ValueError) as exc:
+ runtmp.run_sourmash('tax', 'metagenome', '-g', bad_g_csv, '--taxonomy-csv', tax)
+
+ assert f'Not all required gather columns are present in {bad_g_csv}.' in str(exc.value)
+ assert runtmp.last_result.status == -1
+
+
+def test_metagenome_empty_tax_lineage_input(runtmp):
+ tax_empty = runtmp.output('t.csv')
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+
+ with open(tax_empty, "w") as fp:
+ fp.write("")
+ print("t_csv: ", tax_empty)
+
+
+ with pytest.raises(ValueError) as exc:
+ runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax_empty)
+
+ print(runtmp.last_result.status)
+ print(runtmp.last_result.out)
+ print(runtmp.last_result.err)
+
+ assert runtmp.last_result.status != 0
+ assert f"cannot read taxonomy assignments from" in str(exc.value)
+
+
+def test_metagenome_perfect_match_warning(runtmp):
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+
+ perfect_g_csv = runtmp.output('g.csv')
+
+ #create a perfect gather result
+ with open(g_csv, 'r') as fp:
+ r = csv.DictReader(fp, delimiter=',')
+ header = r.fieldnames
+ print(header)
+ with open(perfect_g_csv, 'w') as out_fp:
+ w = csv.DictWriter(out_fp, header)
+ w.writeheader()
+ for n, row in enumerate(r):
+ if n == 0:
+ row["f_unique_weighted"] = 1.0
+ w.writerow(row)
+ print(row)
+
+ runtmp.run_sourmash('tax', 'metagenome', '-g', perfect_g_csv, '--taxonomy-csv', tax)
+
+ print(runtmp.last_result.status)
+ print(runtmp.last_result.out)
+ print(runtmp.last_result.err)
+
+ assert runtmp.last_result.status == 0
+ assert 'WARNING: 100% match! Is query "test1" identical to its database match, GCF_001881345' in runtmp.last_result.err
+
+
+def test_metagenome_gather_duplicate_query(runtmp):
+ c = runtmp
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ g_res = utils.get_test_data('tax/test1.gather.csv')
+
+ # different filename, contents identical to test1
+ g_res2 = runtmp.output("test2.gather.csv")
+ with open(g_res2, 'w') as fp:
+ for line in open(g_res, 'r'):
+ fp.write(line)
+
+ with pytest.raises(ValueError) as exc:
+ c.run_sourmash('tax', 'metagenome', '--gather-csv', g_res, g_res2,
+ '--taxonomy-csv', taxonomy_csv)
+
+ assert c.last_result.status == -1
+ print(str(exc.value))
+ assert "Gather query test1 was found in more than one CSV. Cannot load from " in str(exc.value)
+
+
+def test_metagenome_gather_duplicate_query_force(runtmp):
+ c = runtmp
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ g_res = utils.get_test_data('tax/test1.gather.csv')
+
+ # different filename, contents identical to test1
+ g_res2 = runtmp.output("test2.gather.csv")
+ with open(g_res2, 'w') as fp:
+ for line in open(g_res, 'r'):
+ fp.write(line)
+
+ c.run_sourmash('tax', 'metagenome', '--gather-csv', g_res, g_res2,
+ '--taxonomy-csv', taxonomy_csv, '--force')
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+ assert '--force is set, ignoring duplicate query.' in c.last_result.err
+ assert 'No gather results loaded from ' in c.last_result.err
+ assert 'loaded results from 1 gather CSVs' in c.last_result.err
+ assert "query_name,rank,fraction,lineage" in c.last_result.out
+ assert 'test1,superkingdom,0.131,d__Bacteria' in c.last_result.out
+
+
+def test_metagenome_gather_duplicate_filename(runtmp):
+ c = runtmp
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ g_res = utils.get_test_data('tax/test1.gather.csv')
+
+ c.run_sourmash('tax', 'metagenome', '--gather-csv', g_res, g_res, '--taxonomy-csv', taxonomy_csv)
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+ assert f'ignoring duplicated reference to file: {g_res}'
+ assert "query_name,rank,fraction,lineage" in c.last_result.out
+ assert 'test1,superkingdom,0.131,d__Bacteria' in c.last_result.out
+
+
+def test_metagenome_gather_duplicate_filename_from_file(runtmp):
+ c = runtmp
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ g_res = utils.get_test_data('tax/test1.gather.csv')
+ g_from_file = runtmp.output("tmp-from-file.txt")
+ with open(g_from_file, 'w') as f_csv:
+ f_csv.write(f"{g_res}\n")
+ f_csv.write(f"{g_res}\n")
+
+ c.run_sourmash('tax', 'metagenome', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv)
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+ assert f'ignoring duplicated reference to file: {g_res}'
+ assert "query_name,rank,fraction,lineage" in c.last_result.out
+ assert 'test1,superkingdom,0.131,d__Bacteria' in c.last_result.out
+
+
+def test_genome_empty_gather_results(runtmp):
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+
+ #creates empty gather result
+ g_csv = runtmp.output('g.csv')
+ with open(g_csv, "w") as fp:
+ fp.write("")
+ print("g_csv: ", g_csv)
+
+ with pytest.raises(ValueError) as exc:
+ runtmp.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax)
+
+ assert f'Cannot read gather results from {g_csv}. Is file empty?' in str(exc.value)
+ assert runtmp.last_result.status == -1
+
+
+def test_genome_bad_gather_header(runtmp):
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+
+ bad_g_csv = runtmp.output('g.csv')
+
+ #creates bad gather result
+ bad_g = [x.replace("f_unique_weighted", "nope") for x in open(g_csv, 'r')]
+ with open(bad_g_csv, 'w') as fp:
+ for line in bad_g:
+ fp.write(line)
+ print("bad_gather_results: \n", bad_g)
+
+ with pytest.raises(ValueError) as exc:
+ runtmp.run_sourmash('tax', 'genome', '-g', bad_g_csv, '--taxonomy-csv', tax)
+
+ assert f'Not all required gather columns are present in {bad_g_csv}.' in str(exc.value)
+ assert runtmp.last_result.status == -1
+
+
+def test_genome_empty_tax_lineage_input(runtmp):
+ tax_empty = runtmp.output('t.csv')
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+
+ with open(tax_empty, "w") as fp:
+ fp.write("")
+ print("t_csv: ", tax_empty)
+
+
+ with pytest.raises(ValueError) as exc:
+ runtmp.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax_empty)
+
+ print(runtmp.last_result.status)
+ print(runtmp.last_result.out)
+ print(runtmp.last_result.err)
+
+ assert runtmp.last_result.status != 0
+ assert f"cannot read taxonomy assignments from" in str(exc.value)
+
+
+def test_genome_rank_stdout_0(runtmp):
+ # test basic genome
+ c = runtmp
+
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+
+ c.run_sourmash('tax', 'genome', '--gather-csv', g_csv, '--taxonomy-csv', tax,
+ '--rank', 'species', '--containment-threshold', '0')
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+ assert "query_name,status,rank,fraction,lineage" in c.last_result.out
+ assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out
+
+
+def test_genome_rank_stdout_0_db(runtmp):
+ # test basic genome with sqlite database
+ c = runtmp
+
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+ tax = utils.get_test_data('tax/test.taxonomy.db')
+
+ c.run_sourmash('tax', 'genome', '--gather-csv', g_csv, '--taxonomy-csv',
+ tax, '--rank', 'species', '--containment-threshold', '0')
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+ assert "query_name,status,rank,fraction,lineage" in c.last_result.out
+ assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out
+
+
+def test_genome_rank_csv_0(runtmp):
+ # test basic genome - output csv
+ c = runtmp
+
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+ csv_base = "out"
+ cl_csv = csv_base + ".classifications.csv"
+ csvout = runtmp.output(cl_csv)
+ outdir = os.path.dirname(csvout)
+ print("csvout: ", csvout)
+
+ c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax,
+ '--rank', 'species', '-o', csv_base, '--containment-threshold', '0',
+ '--output-dir', outdir)
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert f"saving `classification` output to {csvout}" in runtmp.last_result.err
+ assert c.last_result.status == 0
+ cl_results = [x.rstrip() for x in open(csvout)]
+ assert "query_name,status,rank,fraction,lineage" in cl_results[0]
+ assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in cl_results[1]
+
+
+def test_genome_rank_krona(runtmp):
+ # test basic genome - output csv
+ c = runtmp
+
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+ csv_base = "out"
+ cl_csv = csv_base + ".krona.tsv"
+ csvout = runtmp.output(cl_csv)
+ outdir = os.path.dirname(csvout)
+ print("csvout: ", csvout)
+
+ c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax,
+ '--rank', 'species', '-o', csv_base, '--containment-threshold', '0',
+ '--output-format', 'krona', '--output-dir', outdir)
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert f"saving `krona` output to {csvout}" in runtmp.last_result.err
+ assert c.last_result.status == 0
+ kr_results = [x.rstrip().split('\t') for x in open(csvout)]
+ print(kr_results)
+ assert ['fraction', 'superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'] == kr_results[0]
+ assert ['0.05815279361459521', 'd__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacterales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__Escherichia coli'] == kr_results[1]
+
+
+def test_genome_gather_from_file_rank(runtmp):
+ c = runtmp
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ g_res = utils.get_test_data('tax/test1.gather.csv')
+ g_from_file = runtmp.output("tmp-from-file.txt")
+ with open(g_from_file, 'w') as f_csv:
+ f_csv.write(f"{g_res}\n")
+
+ c.run_sourmash('tax', 'genome', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv,
+ '--rank', 'species', '--containment-threshold', '0')
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+ assert "query_name,status,rank,fraction,lineage" in c.last_result.out
+ assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out
+
+
+def test_genome_gather_from_file_two_files(runtmp):
+ c = runtmp
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ g_res = utils.get_test_data('tax/test1.gather.csv')
+
+ # make test2 results (identical to test1 except query_name)
+ g_res2 = runtmp.output("test2.gather.csv")
+ test2_results = [x.replace("test1", "test2") for x in open(g_res, 'r')]
+ with open(g_res2, 'w') as fp:
+ for line in test2_results:
+ fp.write(line)
+
+ # write test1 and test2 files to a text file for input
+ g_from_file = runtmp.output("tmp-from-file.txt")
+ with open(g_from_file, 'w') as f_csv:
+ f_csv.write(f"{g_res}\n")
+ f_csv.write(f"{g_res2}\n")
+
+ c.run_sourmash('tax', 'genome', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv,
+ '--rank', 'species', '--containment-threshold', '0')
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+ assert "query_name,status,rank,fraction,lineage" in c.last_result.out
+ assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out
+ assert "test2,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out
+
+
+def test_genome_gather_duplicate_filename(runtmp):
+ c = runtmp
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ g_res = utils.get_test_data('tax/test1.gather.csv')
+
+ c.run_sourmash('tax', 'genome', '--gather-csv', g_res, g_res, '--taxonomy-csv', taxonomy_csv,
+ '--rank', 'species', '--containment-threshold', '0')
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+ assert f'ignoring duplicated reference to file: {g_res}'
+ assert "query_name,status,rank,fraction,lineage" in c.last_result.out
+ assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out
+
+def test_genome_gather_from_file_duplicate_filename(runtmp):
+ c = runtmp
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ g_res = utils.get_test_data('tax/test1.gather.csv')
+ g_from_file = runtmp.output("tmp-from-file.txt")
+ with open(g_from_file, 'w') as f_csv:
+ f_csv.write(f"{g_res}\n")
+ f_csv.write(f"{g_res}\n")
+
+ c.run_sourmash('tax', 'genome', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv,
+ '--rank', 'species', '--containment-threshold', '0')
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+ assert f'ignoring duplicated reference to file: {g_res}'
+ assert "query_name,status,rank,fraction,lineage" in c.last_result.out
+ assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out
+
+
+def test_genome_gather_from_file_duplicate_query(runtmp):
+ c = runtmp
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ g_res = utils.get_test_data('tax/test1.gather.csv')
+
+ # different filename, contents identical to test1
+ g_res2 = runtmp.output("test2.gather.csv")
+ with open(g_res2, 'w') as fp:
+ for line in open(g_res, 'r'):
+ fp.write(line)
+
+ g_from_file = runtmp.output("tmp-from-file.txt")
+ with open(g_from_file, 'w') as f_csv:
+ f_csv.write(f"{g_res}\n")
+ f_csv.write(f"{g_res2}\n")
+
+ with pytest.raises(ValueError) as exc:
+ c.run_sourmash('tax', 'genome', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv,
+ '--rank', 'species', '--containment-threshold', '0')
+ assert c.last_result.status == -1
+ print(str(exc.value))
+ assert "Gather query test1 was found in more than one CSV. Cannot load from " in str(exc.value)
+
+
+def test_genome_gather_from_file_duplicate_query_force(runtmp):
+ c = runtmp
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ g_res = utils.get_test_data('tax/test1.gather.csv')
+
+ # different filename, contents identical to test1
+ g_res2 = runtmp.output("test2.gather.csv")
+ with open(g_res2, 'w') as fp:
+ for line in open(g_res, 'r'):
+ fp.write(line)
+
+ g_from_file = runtmp.output("tmp-from-file.txt")
+ with open(g_from_file, 'w') as f_csv:
+ f_csv.write(f"{g_res}\n")
+ f_csv.write(f"{g_res2}\n")
+
+ c.run_sourmash('tax', 'genome', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv,
+ '--rank', 'species', '--containment-threshold', '0', '--force')
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+ assert "query_name,status,rank,fraction,lineage" in c.last_result.out
+ assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out
+ assert '--force is set, ignoring duplicate query.' in c.last_result.err
+ assert 'No gather results loaded from ' in c.last_result.err
+ assert 'loaded results from 1 gather CSVs' in c.last_result.err
+
+
+def test_genome_gather_cli_and_from_file(runtmp):
+ c = runtmp
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ g_res = utils.get_test_data('tax/test1.gather.csv')
+ g_from_file = runtmp.output("tmp-from-file.txt")
+
+ # make test2 results (identical to test1 except query_name)
+ g_res2 = runtmp.output("test2.gather.csv")
+ test2_results = [x.replace("test1", "test2") for x in open(g_res, 'r')]
+ with open(g_res2, 'w') as fp:
+ for line in test2_results:
+ fp.write(line)
+
+ # write test2 csv to a text file for input
+ g_from_file = runtmp.output("tmp-from-file.txt")
+ with open(g_from_file, 'w') as f_csv:
+ f_csv.write(f"{g_res2}\n")
+
+ c.run_sourmash('tax', 'genome', '-g', g_res, '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv,
+ '--rank', 'species', '--containment-threshold', '0')
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+ assert "query_name,status,rank,fraction,lineage" in c.last_result.out
+ assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out
+ assert "test2,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out
+
+
+def test_genome_gather_cli_and_from_file_duplicate_filename(runtmp):
+ c = runtmp
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ g_res = utils.get_test_data('tax/test1.gather.csv')
+ g_from_file = runtmp.output("tmp-from-file.txt")
+
+ # also write test1 csv to a text file for input
+ g_from_file = runtmp.output("tmp-from-file.txt")
+ with open(g_from_file, 'w') as f_csv:
+ f_csv.write(f"{g_res}\n")
+
+ c.run_sourmash('tax', 'genome', '-g', g_res, '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv,
+ '--rank', 'species', '--containment-threshold', '0')
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+ assert f'ignoring duplicated reference to file: {g_res}'
+ assert "query_name,status,rank,fraction,lineage" in c.last_result.out
+ assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out
+
+
+def test_genome_gather_from_file_below_threshold(runtmp):
+ c = runtmp
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ g_res = utils.get_test_data('tax/test1.gather.csv')
+ g_from_file = runtmp.output("tmp-from-file.txt")
+ with open(g_from_file, 'w') as f_csv:
+ f_csv.write(f"{g_res}\n")
+
+ c.run_sourmash('tax', 'genome', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv,
+ '--containment-threshold', '1')
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+ assert "query_name,status,rank,fraction,lineage" in c.last_result.out
+ assert "test1,below_threshold,,0.000," in c.last_result.out
+
+
+def test_genome_gather_two_queries(runtmp):
+ '''
+ This checks for initial bug where classification
+ would only happen for one genome per rank when
+ doing --containment-threshold classification
+ '''
+ c = runtmp
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ g_res = utils.get_test_data('tax/47+63_x_gtdb-rs202.gather.csv')
+
+ # split 47+63 into two fake queries: q47, q63
+ g_res2 = runtmp.output("two-queries.gather.csv")
+ q2_results = [x for x in open(g_res, 'r')]
+ # rename queries
+ q2_results[1] = q2_results[1].replace('47+63', 'q47')
+ q2_results[2] = q2_results[2].replace('47+63', 'q63')
+ with open(g_res2, 'w') as fp:
+ for line in q2_results:
+ print(line)
+ fp.write(line)
+
+ c.run_sourmash('tax', 'genome', '-g', g_res2, '--taxonomy-csv', taxonomy_csv,
+ '--containment-threshold', '0')
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+ assert "query_name,status,rank,fraction,lineage" in c.last_result.out
+ assert "q63,match,species,0.336,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Shewanellaceae;g__Shewanella;s__Shewanella baltica,491c0a81," in c.last_result.out
+ assert "q47,match,species,0.664,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Shewanellaceae;g__Shewanella;s__Shewanella baltica," in c.last_result.out
+
+
+def test_genome_rank_duplicated_taxonomy_fail(runtmp):
+ c = runtmp
+ # write temp taxonomy with duplicates
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ duplicated_csv = runtmp.output("duplicated_taxonomy.csv")
+ with open(duplicated_csv, 'w') as dup:
+ tax = [x.rstrip() for x in open(taxonomy_csv, 'r')]
+ tax.append(tax[1] + 'FOO') # add first tax_assign again
+ dup.write("\n".join(tax))
+
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+
+ with pytest.raises(ValueError) as exc:
+ c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', duplicated_csv,
+ '--rank', 'species')
+ assert "cannot read taxonomy assignments" in str(exc.value)
+ assert "multiple lineages for identifier GCF_001881345" in str(exc.value)
+
+
+def test_genome_rank_duplicated_taxonomy_force(runtmp):
+ c = runtmp
+ # write temp taxonomy with duplicates
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ duplicated_csv = runtmp.output("duplicated_taxonomy.csv")
+ with open(duplicated_csv, 'w') as dup:
+ tax = [x.rstrip() for x in open(taxonomy_csv, 'r')]
+ tax.append(tax[1]) # add first tax_assign again
+ dup.write("\n".join(tax))
+
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+
+ c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', duplicated_csv,
+ '--rank', 'species', '--force', '--containment-threshold', '0')
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+ assert "query_name,status,rank,fraction,lineage" in c.last_result.out
+ assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out
+
+
+def test_genome_missing_taxonomy_ignore_threshold(runtmp):
+ c = runtmp
+ # write temp taxonomy with missing entry
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ subset_csv = runtmp.output("subset_taxonomy.csv")
+ with open(subset_csv, 'w') as subset:
+ tax = [x.rstrip() for x in open(taxonomy_csv, 'r')]
+ tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry)
+ subset.write("\n".join(tax))
+
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+
+ c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv, '--containment-threshold', '0')
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+ assert "The following are missing from the taxonomy information: GCF_001881345" in c.last_result.err
+ assert "query_name,status,rank,fraction,lineage" in c.last_result.out
+ assert "test1,match,species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in c.last_result.out
+
+
+def test_genome_missing_taxonomy_ignore_rank(runtmp):
+ c = runtmp
+ # write temp taxonomy with missing entry
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ subset_csv = runtmp.output("subset_taxonomy.csv")
+ with open(subset_csv, 'w') as subset:
+ tax = [x.rstrip() for x in open(taxonomy_csv, 'r')]
+ tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry)
+ subset.write("\n".join(tax))
+
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+
+ c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv, '--rank', 'species')
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+ assert "The following are missing from the taxonomy information: GCF_001881345" in c.last_result.err
+ assert "query_name,status,rank,fraction,lineage" in c.last_result.out
+ assert "test1,below_threshold,species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in c.last_result.out
+
+
+def test_genome_missing_taxonomy_fail_threshold(runtmp):
+ c = runtmp
+ # write temp taxonomy with missing entry
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ subset_csv = runtmp.output("subset_taxonomy.csv")
+ with open(subset_csv, 'w') as subset:
+ tax = [x.rstrip() for x in open(taxonomy_csv, 'r')]
+ tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry)
+ subset.write("\n".join(tax))
+
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+
+ with pytest.raises(ValueError) as exc:
+ c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv,
+ '--fail-on-missing-taxonomy', '--containment-threshold', '0')
+
+ print(str(exc.value))
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert "The following are missing from the taxonomy information: GCF_001881345" in str(exc.value)
+ assert "Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy." in str(exc.value)
+ assert c.last_result.status == -1
+
+
+def test_genome_missing_taxonomy_fail_rank(runtmp):
+ c = runtmp
+ # write temp taxonomy with missing entry
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ subset_csv = runtmp.output("subset_taxonomy.csv")
+ with open(subset_csv, 'w') as subset:
+ tax = [x.rstrip() for x in open(taxonomy_csv, 'r')]
+ tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry)
+ subset.write("\n".join(tax))
+
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+
+ with pytest.raises(ValueError) as exc:
+ c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv,
+ '--fail-on-missing-taxonomy', '--rank', 'species')
+
+ print(str(exc.value))
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert "The following are missing from the taxonomy information: GCF_001881345" in str(exc.value)
+ assert "Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy." in str(exc.value)
+ assert c.last_result.status == -1
+
+
+def test_genome_rank_not_available(runtmp):
+ c = runtmp
+
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+
+ with pytest.raises(ValueError) as exc:
+ c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax,
+ '--rank', 'strain', '--containment-threshold', '0')
+
+ print(str(exc.value))
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == -1
+ assert "No taxonomic information provided for rank strain: cannot classify at this rank" in str(exc.value)
+
+
+def test_genome_empty_gather_results_with_header_single(runtmp):
+ c = runtmp
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+ gather_results = [x for x in open(g_csv, 'r')]
+ empty_gather_with_header = runtmp.output('g_header.csv')
+ # write temp empty gather results (header only)
+ with open(empty_gather_with_header, "w") as fp:
+ fp.write(gather_results[0])
+
+ with pytest.raises(ValueError) as exc:
+ c.run_sourmash('tax', 'genome', '-g', empty_gather_with_header, '--taxonomy-csv', taxonomy_csv)
+
+ print(str(exc.value))
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == -1
+ assert f'No gather results loaded from {empty_gather_with_header}.' in str(exc.value)
+ assert 'Exiting.' in str(exc.value)
+
+
+def test_genome_empty_gather_results_single(runtmp):
+ c = runtmp
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+
+ # write temp empty gather results
+ empty_tax = runtmp.output('tax_header.csv')
+ with open(empty_tax, "w") as fp:
+ fp.write("")
+
+ with pytest.raises(ValueError) as exc:
+ c.run_sourmash('tax', 'genome', '-g', empty_tax, '--taxonomy-csv', taxonomy_csv)
+
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == -1
+ assert f'Cannot read gather results from {empty_tax}. Is file empty?' in str(exc.value)
+ assert 'Exiting.' in c.last_result.err
+
+
+def test_genome_empty_gather_results_single_force(runtmp):
+ c = runtmp
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+
+ # write temp empty gather results (header only)
+ empty_tax = runtmp.output('tax_header.csv')
+ with open(empty_tax, "w") as fp:
+ fp.write("")
+
+ with pytest.raises(ValueError) as exc:
+ c.run_sourmash('tax', 'genome', '-g', empty_tax, '--taxonomy-csv', taxonomy_csv,
+ '--force')
+
+ print(str(exc.value))
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == -1
+ assert '--force is set. Attempting to continue to next set of gather results.' in str(exc.value)
+ assert 'No results for classification. Exiting.' in str(exc.value)
+
+
+def test_genome_empty_gather_results_with_empty_csv_force(runtmp):
+ c = runtmp
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+
+ # write temp empty gather results
+ empty_tax = runtmp.output('tax_empty.txt')
+ with open(empty_tax, "w") as fp:
+ fp.write("")
+
+ g_from_file = runtmp.output("tmp-from-csv.csv")
+ with open(g_from_file, 'w') as f_csv:
+ f_csv.write(f"{empty_tax}\n")
+
+ with pytest.raises(ValueError) as exc:
+ c.run_sourmash('tax', 'genome', '-g', empty_tax, '--from-file', g_from_file,
+ '--taxonomy-csv', taxonomy_csv, '--rank', 'species', '--force')
+
+ print(str(exc.value))
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == -1
+ assert '--force is set. Attempting to continue to next set of gather results.' in str(exc.value)
+ assert 'No results for classification. Exiting.' in str(exc.value)
+
+
+def test_genome_empty_gather_results_with_csv_force(runtmp):
+ c = runtmp
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+
+ g_res = utils.get_test_data('tax/test1.gather.csv')
+ g_from_file = runtmp.output("tmp-from-file.txt")
+ with open(g_from_file, 'w') as f_csv:
+ f_csv.write(f"{g_res}\n")
+
+ # write temp empty gather results
+ empty_tax = runtmp.output('tax_empty.csv')
+ with open(empty_tax, "w") as fp:
+ fp.write("")
+
+ c.run_sourmash('tax', 'genome', '-g', empty_tax, '--from-file', g_from_file,
+ '--taxonomy-csv', taxonomy_csv, '--rank', 'species',
+ '--containment-threshold', '0', '--force')
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+ assert '--force is set. Attempting to continue to next set of gather results.' in c.last_result.err
+ assert 'loaded results from 1 gather CSVs' in c.last_result.err
+ assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out
+
+
+def test_genome_containment_threshold_bounds(runtmp):
+ c = runtmp
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+ below_threshold = "-1"
+
+ with pytest.raises(ValueError) as exc:
+ c.run_sourmash('tax', 'genome', '-g', tax, '--taxonomy-csv', tax,
+ '--containment-threshold', below_threshold)
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+ assert "ERROR: Argument must be >0 and <1" in str(exc.value)
+
+ above_threshold = "1.1"
+ with pytest.raises(ValueError) as exc:
+ c.run_sourmash('tax', 'genome', '-g', tax, '--taxonomy-csv', tax,
+ '--containment-threshold', above_threshold)
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+ assert "ERROR: Argument must be >0 and <1" in str(exc.value)
+
+
+def test_genome_containment_threshold_type(runtmp):
+ c = runtmp
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+ not_a_float = "str"
+
+ with pytest.raises(ValueError) as exc:
+ c.run_sourmash('tax', 'genome', '-g', tax, '--taxonomy-csv', tax,
+ '--containment-threshold', not_a_float)
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+ assert "ERROR: Must be a floating point number" in str(exc.value)
+
+
+def test_annotate_0(runtmp):
+ # test annotate
+ c = runtmp
+
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+ csvout = runtmp.output("test1.gather.with-lineages.csv")
+ out_dir = os.path.dirname(csvout)
+
+ c.run_sourmash('tax', 'annotate', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', out_dir)
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+
+ lin_gather_results = [x.rstrip() for x in open(csvout)]
+ print("\n".join(lin_gather_results))
+ assert f"saving `annotate` output to {csvout}" in runtmp.last_result.err
+
+ assert "lineage" in lin_gather_results[0]
+ assert "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in lin_gather_results[1]
+ assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[2]
+ assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in lin_gather_results[3]
+ assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[4]
+
+
+def test_annotate_0_db(runtmp):
+ # test annotate with sqlite db
+ c = runtmp
+
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+ tax = utils.get_test_data('tax/test.taxonomy.db')
+ csvout = runtmp.output("test1.gather.with-lineages.csv")
+ out_dir = os.path.dirname(csvout)
+
+ c.run_sourmash('tax', 'annotate', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', out_dir)
+
+ print(c.last_result.status)
+ print(c.last_result.out)
+ print(c.last_result.err)
+
+ assert c.last_result.status == 0
+
+ lin_gather_results = [x.rstrip() for x in open(csvout)]
+ print("\n".join(lin_gather_results))
+ assert f"saving `annotate` output to {csvout}" in runtmp.last_result.err
+
+ assert "lineage" in lin_gather_results[0]
+ assert "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in lin_gather_results[1]
+ assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[2]
+ assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in lin_gather_results[3]
+ assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[4]
+
+
+def test_annotate_empty_gather_results(runtmp):
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+
+ #creates empty gather result
+ g_csv = runtmp.output('g.csv')
+ with open(g_csv, "w") as fp:
+ fp.write("")
+ print("g_csv: ", g_csv)
+
+ with pytest.raises(ValueError) as exc:
+ runtmp.run_sourmash('tax', 'annotate', '-g', g_csv, '--taxonomy-csv', tax)
+
+ assert f'Cannot read gather results from {g_csv}. Is file empty?' in str(exc.value)
+ assert runtmp.last_result.status == -1
+
+
+def test_annotate_bad_gather_header(runtmp):
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+
+ bad_g_csv = runtmp.output('g.csv')
+
+ #creates bad gather result
+ bad_g = [x.replace("query_name", "nope") for x in open(g_csv, 'r')]
+ with open(bad_g_csv, 'w') as fp:
+ for line in bad_g:
+ fp.write(line)
+ print("bad_gather_results: \n", bad_g)
+
+ with pytest.raises(ValueError) as exc:
+ runtmp.run_sourmash('tax', 'annotate', '-g', bad_g_csv, '--taxonomy-csv', tax)
+
+ assert f'Not all required gather columns are present in {bad_g_csv}.' in str(exc.value)
+ assert runtmp.last_result.status == -1
+
+
+def test_annotate_empty_tax_lineage_input(runtmp):
+ tax_empty = runtmp.output('t.csv')
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+
+ with open(tax_empty, "w") as fp:
+ fp.write("")
+ print("t_csv: ", tax_empty)
+
+
+ with pytest.raises(ValueError) as exc:
+ runtmp.run_sourmash('tax', 'annotate', '-g', g_csv, '--taxonomy-csv', tax_empty)
+
+ print(runtmp.last_result.status)
+ print(runtmp.last_result.out)
+ print(runtmp.last_result.err)
+
+ assert runtmp.last_result.status != 0
+ assert f"cannot read taxonomy assignments from" in str(exc.value)
+
+
+def test_tax_prepare_1_csv_to_csv(runtmp, keep_identifiers, keep_versions):
+ # CSV -> CSV; same assignments
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+ taxout = runtmp.output('out.csv')
+
+ args = []
+ if keep_identifiers:
+ args.append('--keep-full-identifiers')
+ if keep_versions:
+ args.append('--keep-identifier-versions')
+
+ # this is an error - can't strip versions if not splitting identifiers
+ if keep_identifiers and not keep_versions:
+ with pytest.raises(ValueError):
+ runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o',
+ taxout, '-F', 'csv', *args)
+ return
+
+ runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o',
+ taxout, '-F', 'csv', *args)
+ assert os.path.exists(taxout)
+
+ db1 = tax_utils.MultiLineageDB.load([tax],
+ keep_full_identifiers=keep_identifiers,
+ keep_identifier_versions=keep_versions)
+
+ db2 = tax_utils.MultiLineageDB.load([taxout])
+
+ assert set(db1) == set(db2)
+
+
+def test_tax_prepare_2_csv_to_sql(runtmp, keep_identifiers, keep_versions):
+ # CSV -> SQL; same assignments?
+ tax = utils.get_test_data('tax/test.taxonomy.csv')
+ taxout = runtmp.output('out.db')
+
+ args = []
+ if keep_identifiers:
+ args.append('--keep-full-identifiers')
+ if keep_versions:
+ args.append('--keep-identifier-versions')
+
+ # this is an error - can't strip versions if not splitting identifiers
+ if keep_identifiers and not keep_versions:
+ with pytest.raises(ValueError):
+ runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout,
+ '-F', 'sql', *args)
+ return
+
+ runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout,
+ '-F', 'sql', *args)
+ assert os.path.exists(taxout)
+
+ db1 = tax_utils.MultiLineageDB.load([tax],
+ keep_full_identifiers=keep_identifiers,
+ keep_identifier_versions=keep_versions)
+ db2 = tax_utils.MultiLineageDB.load([taxout])
+
+ assert set(db1) == set(db2)
+
+ # cannot overwrite -
+ with pytest.raises(ValueError) as exc:
+ runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout,
+ '-F', 'sql', *args)
+ assert 'taxonomy table already exists' in str(exc.value)
+
+
+def test_tax_prepare_3_db_to_csv(runtmp):
+ # CSV -> CSV; same assignments
+ taxcsv = utils.get_test_data('tax/test.taxonomy.csv')
+ taxdb = utils.get_test_data('tax/test.taxonomy.db')
+ taxout = runtmp.output('out.csv')
+
+ runtmp.run_sourmash('tax', 'prepare', '-t', taxdb,
+ '-o', taxout, '-F', 'csv')
+ assert os.path.exists(taxout)
+ with open(taxout) as fp:
+ print(fp.read())
+
+ db1 = tax_utils.MultiLineageDB.load([taxcsv],
+ keep_full_identifiers=False,
+ keep_identifier_versions=False)
+
+ db2 = tax_utils.MultiLineageDB.load([taxout])
+ db3 = tax_utils.MultiLineageDB.load([taxdb],
+ keep_full_identifiers=False,
+ keep_identifier_versions=False)
+ assert set(db1) == set(db2)
+ assert set(db1) == set(db3)
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
new file mode 100644
index 0000000000..0370b9364c
--- /dev/null
+++ b/tests/test_tax_utils.py
@@ -0,0 +1,1049 @@
+"""
+Tests for functions in taxonomy submodule.
+"""
+import pytest
+from os.path import basename
+
+import sourmash_tst_utils as utils
+
+from sourmash.tax.tax_utils import (ascending_taxlist, get_ident, load_gather_results,
+ summarize_gather_at, find_missing_identities,
+ write_summary, MultiLineageDB,
+ collect_gather_csvs, check_and_load_gather_csvs,
+ SummarizedGatherResult, ClassificationResult,
+ write_classifications,
+ aggregate_by_lineage_at_rank,
+ make_krona_header, format_for_krona, write_krona,
+ combine_sumgather_csvs_by_lineage, write_lineage_sample_frac,
+ LineageDB, LineageDB_Sqlite)
+
+# import lca utils as needed for now
+from sourmash.lca import lca_utils
+from sourmash.lca.lca_utils import LineagePair
+
+# utility functions for testing
+def make_mini_gather_results(g_infolist):
+ # make mini gather_results
+ min_header = ["query_name", "name", "match_ident", "f_unique_weighted", "query_md5", "query_filename"]
+ gather_results = []
+ for g_info in g_infolist:
+ inf = dict(zip(min_header, g_info))
+ gather_results.append(inf)
+ return gather_results
+
+
+def make_mini_taxonomy(tax_info):
+ #pass in list of tuples: (name, lineage)
+ taxD = {}
+ for (name,lin) in tax_info:
+ taxD[name] = lca_utils.make_lineage(lin)
+ return taxD
+
+
+## tests
+def test_ascending_taxlist_1():
+ assert list(ascending_taxlist()) == ['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']
+
+
+def test_ascending_taxlist_2():
+ assert list(ascending_taxlist(include_strain=False)) == ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']
+
+
+def test_get_ident_default():
+ ident = "GCF_001881345.1"
+ n_id = get_ident(ident)
+ assert n_id == "GCF_001881345"
+
+
+def test_get_ident_split_but_keep_version():
+ ident = "GCF_001881345.1"
+ n_id = get_ident(ident, keep_identifier_versions=True)
+ assert n_id == "GCF_001881345.1"
+
+
+def test_get_ident_no_split():
+ ident = "GCF_001881345.1 secondname"
+ n_id = get_ident(ident, keep_full_identifiers=True)
+ assert n_id == "GCF_001881345.1 secondname"
+
+
+def test_collect_gather_csvs(runtmp):
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+ from_file = runtmp.output("tmp-from-file.txt")
+ with open(from_file, 'w') as fp:
+ fp.write(f"{g_csv}\n")
+
+ gather_files = collect_gather_csvs([g_csv], from_file=from_file)
+ print("gather_files: ", gather_files)
+ assert len(gather_files) == 1
+ assert basename(gather_files[0]) == 'test1.gather.csv'
+
+
+def test_check_and_load_gather_csvs_empty(runtmp):
+ g_res = runtmp.output('empty.gather.csv')
+ with open(g_res, 'w') as fp:
+ fp.write("")
+ csvs = [g_res]
+ # load taxonomy csv
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ tax_assign = MultiLineageDB.load([taxonomy_csv], keep_full_identifiers=1)
+
+ print(tax_assign)
+ # check gather results and missing ids
+ with pytest.raises(Exception) as exc:
+ gather_results, ids_missing, n_missing, header = check_and_load_gather_csvs(csvs, tax_assign)
+ assert "Cannot read gather results from" in str(exc.value)
+
+
+def test_check_and_load_gather_csvs_with_empty_force(runtmp):
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+ # make gather results with taxonomy name not in tax_assign
+ g_res2 = runtmp.output('gA.gather.csv')
+ g_results = [x.replace("GCF_001881345.1", "gA") for x in open(g_csv, 'r')]
+ with open(g_res2, 'w') as fp:
+ for line in g_results:
+ fp.write(line)
+ # make empty gather results
+ g_res3 = runtmp.output('empty.gather.csv')
+ with open(g_res3, 'w') as fp:
+ fp.write("")
+
+ csvs = [g_res2, g_res3]
+
+ # load taxonomy csv
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ tax_assign = MultiLineageDB.load([taxonomy_csv],
+ keep_full_identifiers=False,
+ keep_identifier_versions=False)
+ print(tax_assign)
+ # check gather results and missing ids
+ gather_results, ids_missing, n_missing, header = check_and_load_gather_csvs(csvs, tax_assign, force=True)
+ assert len(gather_results) == 4
+ print("n_missing: ", n_missing)
+ print("ids_missing: ", ids_missing)
+ assert n_missing == 1
+ assert ids_missing == {"gA"}
+
+
+def test_check_and_load_gather_csvs_fail_on_missing(runtmp):
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+ # make gather results with taxonomy name not in tax_assign
+ g_res2 = runtmp.output('gA.gather.csv')
+ g_results = [x.replace("GCF_001881345.1", "gA") for x in open(g_csv, 'r')]
+ with open(g_res2, 'w') as fp:
+ for line in g_results:
+ fp.write(line)
+
+ csvs = [g_res2]
+
+ # load taxonomy csv
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ tax_assign = MultiLineageDB.load([taxonomy_csv], keep_full_identifiers=1)
+ print(tax_assign)
+ # check gather results and missing ids
+ with pytest.raises(ValueError) as exc:
+ gather_results, ids_missing, n_missing, header = check_and_load_gather_csvs(csvs, tax_assign, fail_on_missing_taxonomy=True, force=True)
+ assert "Failing on missing taxonomy" in str(exc)
+
+
+def test_load_gather_results():
+ gather_csv = utils.get_test_data('tax/test1.gather.csv')
+ gather_results, header, seen_queries = load_gather_results(gather_csv)
+ assert len(gather_results) == 4
+
+
+def test_load_gather_results_bad_header(runtmp):
+ g_csv = utils.get_test_data('tax/test1.gather.csv')
+
+ bad_g_csv = runtmp.output('g.csv')
+
+ #creates bad gather result
+ bad_g = [x.replace("f_unique_weighted", "nope") for x in open(g_csv, 'r')]
+ with open(bad_g_csv, 'w') as fp:
+ for line in bad_g:
+ fp.write(line)
+ print("bad_gather_results: \n", bad_g)
+
+ with pytest.raises(ValueError) as exc:
+ gather_results, header = load_gather_results(bad_g_csv)
+ assert f'Not all required gather columns are present in {bad_g_csv}.' in str(exc.value)
+
+
+def test_load_gather_results_empty(runtmp):
+ empty_csv = runtmp.output('g.csv')
+
+ #creates empty gather result
+ with open(empty_csv, 'w') as fp:
+ fp.write('')
+
+ with pytest.raises(ValueError) as exc:
+ gather_results, header = load_gather_results(empty_csv)
+ assert f'Cannot read gather results from {empty_csv}. Is file empty?' in str(exc.value)
+
+
+def test_load_taxonomy_csv():
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ tax_assign = MultiLineageDB.load([taxonomy_csv])
+ print("taxonomy assignments: \n", tax_assign)
+ assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1']
+ assert len(tax_assign) == 6 # should have read 6 rows
+
+
+def test_load_taxonomy_csv_split_id():
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ tax_assign = MultiLineageDB.load([taxonomy_csv], keep_full_identifiers=0,
+ keep_identifier_versions=False)
+ print("taxonomy assignments: \n", tax_assign)
+ assert list(tax_assign.keys()) == ['GCF_001881345', 'GCF_009494285', 'GCF_013368705', 'GCF_003471795', 'GCF_000017325', 'GCF_000021665']
+ assert len(tax_assign) == 6 # should have read 6 rows
+
+
+def test_load_taxonomy_csv_with_ncbi_id(runtmp):
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ upd_csv = runtmp.output("updated_taxonomy.csv")
+ with open(upd_csv, 'w') as new_tax:
+ tax = [x.rstrip() for x in open(taxonomy_csv, 'r')]
+ ncbi_id = "ncbi_id after_space"
+ fake_lin = [ncbi_id] + ["sk", "phy", "cls", "ord", "fam", "gen", "sp"]
+ ncbi_tax = ",".join(fake_lin)
+ tax.append(ncbi_tax)
+ new_tax.write("\n".join(tax))
+
+ tax_assign = MultiLineageDB.load([upd_csv], keep_full_identifiers=True)
+ print("taxonomy assignments: \n", tax_assign)
+ assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1', "ncbi_id after_space"]
+ assert len(tax_assign) == 7 # should have read 7 rows
+
+
+def test_load_taxonomy_csv_split_id_ncbi(runtmp):
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ upd_csv = runtmp.output("updated_taxonomy.csv")
+ with open(upd_csv, 'w') as new_tax:
+ tax = [x.rstrip() for x in open(taxonomy_csv, 'r')]
+ ncbi_id = "ncbi_id after_space"
+ fake_lin = [ncbi_id] + ["sk", "phy", "cls", "ord", "fam", "gen", "sp"]
+ ncbi_tax = ",".join(fake_lin)
+ tax.append(ncbi_tax)
+ new_tax.write("\n".join(tax))
+
+ tax_assign = MultiLineageDB.load([upd_csv], keep_full_identifiers=False,
+ keep_identifier_versions=False)
+ print("taxonomy assignments: \n", tax_assign)
+ assert list(tax_assign.keys()) == ['GCF_001881345', 'GCF_009494285', 'GCF_013368705', 'GCF_003471795', 'GCF_000017325', 'GCF_000021665', "ncbi_id"]
+ assert len(tax_assign) == 7 # should have read 7 rows
+
+ # check for non-sensical args.
+ with pytest.raises(ValueError):
+ tax_assign = MultiLineageDB.load([upd_csv], keep_full_identifiers=1,
+ keep_identifier_versions=False)
+
+
+def test_load_taxonomy_csv_duplicate(runtmp):
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ duplicated_csv = runtmp.output("duplicated_taxonomy.csv")
+ with open(duplicated_csv, 'w') as dup:
+ tax = [x.rstrip() for x in open(taxonomy_csv, 'r')]
+ tax.append(tax[1] + 'FOO') # add first tax_assign again
+ print(tax[-1])
+ dup.write("\n".join(tax))
+
+ with pytest.raises(Exception) as exc:
+ MultiLineageDB.load([duplicated_csv])
+
+ assert "cannot read taxonomy assignments" in str(exc.value)
+ assert "multiple lineages for identifier GCF_001881345.1" in str(exc.value)
+
+
+def test_load_taxonomy_csv_duplicate_force(runtmp):
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ duplicated_csv = runtmp.output("duplicated_taxonomy.csv")
+ with open(duplicated_csv, 'w') as dup:
+ tax = [x.rstrip() for x in open(taxonomy_csv, 'r')]
+ tax.append(tax[1]) # add first tax_assign again
+ dup.write("\n".join(tax))
+
+ # now force
+ tax_assign = MultiLineageDB.load([duplicated_csv], force=True)
+ num_rows = len(tax_assign)
+
+ print("taxonomy assignments: \n", tax_assign)
+ assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1']
+
+
+def test_find_missing_identities():
+ # make gather results
+ gA = ["queryA", "gA","0.5","0.5", "queryA_md5", "queryA.sig"]
+ gB = ["queryA", "gB","0.3","0.5", "queryA_md5", "queryA.sig"]
+ g_res = make_mini_gather_results([gA,gB])
+
+ # make mini taxonomy
+ gA_tax = ("gA", "a;b;c")
+ taxD = make_mini_taxonomy([gA_tax])
+
+ n, ids = find_missing_identities(g_res, taxD)
+ print("n_missing: ", n)
+ print("ids_missing: ", ids)
+ assert n == 1
+ assert ids == {"gB"}
+
+
+def test_summarize_gather_at_0():
+ """test two matches, equal f_unique_weighted"""
+ # make gather results
+ gA = ["queryA", "gA","0.5","0.5", "queryA_md5", "queryA.sig"]
+ gB = ["queryA", "gB","0.3","0.5", "queryA_md5", "queryA.sig"]
+ g_res = make_mini_gather_results([gA,gB])
+
+ # make mini taxonomy
+ gA_tax = ("gA", "a;b;c")
+ gB_tax = ("gB", "a;b;d")
+ taxD = make_mini_taxonomy([gA_tax,gB_tax])
+
+ # run summarize_gather_at and check results!
+ sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res)
+
+ # superkingdom
+ assert len(sk_sum) == 1
+ print("superkingdom summarized gather: ", sk_sum[0])
+ assert sk_sum[0].query_name == "queryA"
+ assert sk_sum[0].query_md5 == "queryA_md5"
+ assert sk_sum[0].query_filename == "queryA.sig"
+ assert sk_sum[0].rank == 'superkingdom'
+ assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),)
+ assert sk_sum[0].fraction == 1.0
+
+ # phylum
+ phy_sum, _ = summarize_gather_at("phylum", taxD, g_res)
+ print("phylum summarized gather: ", phy_sum[0])
+ assert len(phy_sum) == 1
+ assert phy_sum[0].query_name == "queryA"
+ assert phy_sum[0].query_md5 == "queryA_md5"
+ assert phy_sum[0].query_filename == "queryA.sig"
+ assert phy_sum[0].rank == 'phylum'
+ assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b'))
+ assert phy_sum[0].fraction == 1.0
+ # class
+ cl_sum, _ = summarize_gather_at("class", taxD, g_res)
+ assert len(cl_sum) == 2
+ print("class summarized gather: ", cl_sum)
+ assert cl_sum[0].query_name == "queryA"
+ assert cl_sum[0].query_md5 == "queryA_md5"
+ assert cl_sum[0].query_filename == "queryA.sig"
+ assert cl_sum[0].rank == 'class'
+ assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),
+ LineagePair(rank='phylum', name='b'),
+ LineagePair(rank='class', name='c'))
+ assert cl_sum[0].fraction == 0.5
+ assert cl_sum[1].rank == 'class'
+ assert cl_sum[1].lineage == (LineagePair(rank='superkingdom', name='a'),
+ LineagePair(rank='phylum', name='b'),
+ LineagePair(rank='class', name='d'))
+ assert cl_sum[1].fraction == 0.5
+
+
+def test_summarize_gather_at_1():
+ """test two matches, diff f_unique_weighted"""
+ # make mini gather_results
+ gA = ["queryA", "gA","0.5","0.6", "queryA_md5", "queryA.sig"]
+ gB = ["queryA", "gB","0.3","0.1", "queryA_md5", "queryA.sig"]
+ g_res = make_mini_gather_results([gA,gB])
+
+ # make mini taxonomy
+ gA_tax = ("gA", "a;b;c")
+ gB_tax = ("gB", "a;b;d")
+ taxD = make_mini_taxonomy([gA_tax,gB_tax])
+ # run summarize_gather_at and check results!
+ sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res)
+
+ # superkingdom
+ assert len(sk_sum) == 1
+ print("superkingdom summarized gather: ", sk_sum[0])
+ assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),)
+ assert sk_sum[0].fraction == 0.7
+
+ # phylum
+ phy_sum, _ = summarize_gather_at("phylum", taxD, g_res)
+ print("phylum summarized gather: ", phy_sum[0])
+ assert len(phy_sum) == 1
+ assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b'))
+ assert phy_sum[0].fraction == 0.7
+ # class
+ cl_sum, _ = summarize_gather_at("class", taxD, g_res)
+ assert len(cl_sum) == 2
+ print("class summarized gather: ", cl_sum)
+ assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),
+ LineagePair(rank='phylum', name='b'),
+ LineagePair(rank='class', name='c'))
+ assert cl_sum[0].fraction == 0.6
+ assert cl_sum[1].rank == 'class'
+ assert cl_sum[1].lineage == (LineagePair(rank='superkingdom', name='a'),
+ LineagePair(rank='phylum', name='b'),
+ LineagePair(rank='class', name='d'))
+ assert cl_sum[1].fraction == 0.1
+
+
+def test_summarize_gather_at_100percent_match():
+ """test 100% gather match (f_unique_weighted == 1)"""
+ # make mini gather_results
+ gA = ["queryA", "gA","0.5","1.0", "queryA_md5", "queryA.sig"]
+ gB = ["queryA", "gB","0.3","0.0", "queryA_md5", "queryA.sig"]
+ g_res = make_mini_gather_results([gA,gB])
+
+ # make mini taxonomy
+ gA_tax = ("gA", "a;b;c")
+ gB_tax = ("gB", "a;b;d")
+ taxD = make_mini_taxonomy([gA_tax,gB_tax])
+
+ # run summarize_gather_at and check results!
+ sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res)
+ # superkingdom
+ assert len(sk_sum) == 1
+ print("superkingdom summarized gather: ", sk_sum[0])
+ assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),)
+ assert sk_sum[0].fraction == 1.0
+
+
+def test_summarize_gather_at_over100percent_f_unique_weighted():
+ """gather matches that add up to >100% f_unique_weighted"""
+ ## should we make this fail?
+ # make mini gather_results
+ gA = ["queryA", "gA","0.5","0.5", "queryA_md5", "queryA.sig"]
+ gB = ["queryA", "gB","0.3","0.6", "queryA_md5", "queryA.sig"]
+ g_res = make_mini_gather_results([gA,gB])
+
+ # make mini taxonomy
+ gA_tax = ("gA", "a;b;c")
+ gB_tax = ("gB", "a;b;d")
+ taxD = make_mini_taxonomy([gA_tax,gB_tax])
+
+ # run summarize_gather_at and check results!
+ sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res)
+ # superkingdom
+ assert len(sk_sum) == 1
+ print("superkingdom summarized gather: ", sk_sum[0])
+ assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),)
+ assert sk_sum[0].fraction == 1.1
+ # phylum
+ phy_sum, _ = summarize_gather_at("phylum", taxD, g_res)
+ print("phylum summarized gather: ", phy_sum[0])
+ assert len(phy_sum) == 1
+ assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b'))
+ assert phy_sum[0].fraction == 1.1
+ # class
+ cl_sum, _ = summarize_gather_at("class", taxD, g_res)
+ assert len(cl_sum) == 2
+ print("class summarized gather: ", cl_sum)
+ assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),
+ LineagePair(rank='phylum', name='b'),
+ LineagePair(rank='class', name='d'))
+ assert cl_sum[0].fraction == 0.6
+ assert cl_sum[1].rank == 'class'
+ assert cl_sum[1].lineage == (LineagePair(rank='superkingdom', name='a'),
+ LineagePair(rank='phylum', name='b'),
+ LineagePair(rank='class', name='c'))
+ assert cl_sum[1].fraction == 0.5
+
+
+def test_summarize_gather_at_missing_ignore():
+ """test two matches, equal f_unique_weighted"""
+ # make gather results
+ gA = ["queryA", "gA","0.5","0.5", "queryA_md5", "queryA.sig"]
+ gB = ["queryA", "gB","0.3","0.5", "queryA_md5", "queryA.sig"]
+ g_res = make_mini_gather_results([gA,gB])
+
+ # make mini taxonomy
+ gA_tax = ("gA", "a;b;c")
+ taxD = make_mini_taxonomy([gA_tax])
+
+ # run summarize_gather_at and check results!
+ sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res, skip_idents=['gB'])
+ # superkingdom
+ assert len(sk_sum) == 1
+ print("superkingdom summarized gather: ", sk_sum[0])
+ assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),)
+ assert sk_sum[0].fraction == 0.5
+
+ # phylum
+ phy_sum, _ = summarize_gather_at("phylum", taxD, g_res, skip_idents=['gB'])
+ print("phylum summarized gather: ", phy_sum[0])
+ assert len(phy_sum) == 1
+ assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b'))
+ assert phy_sum[0].fraction == 0.5
+ # class
+ cl_sum, _ = summarize_gather_at("class", taxD, g_res, skip_idents=['gB'])
+ assert len(cl_sum) == 1
+ print("class summarized gather: ", cl_sum)
+ assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),
+ LineagePair(rank='phylum', name='b'),
+ LineagePair(rank='class', name='c'))
+ assert cl_sum[0].fraction == 0.5
+
+
+def test_summarize_gather_at_missing_fail():
+ """test two matches, equal f_unique_weighted"""
+ # make gather results
+ gA = ["queryA", "gA","0.5","0.5", "queryA_md5", "queryA.sig"]
+ gB = ["queryA", "gB","0.3","0.5", "queryA_md5", "queryA.sig"]
+ g_res = make_mini_gather_results([gA,gB])
+
+ # make mini taxonomy
+ gA_tax = ("gA", "a;b;c")
+ taxD = make_mini_taxonomy([gA_tax])
+
+ # run summarize_gather_at and check results!
+ with pytest.raises(ValueError) as exc:
+ sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res)
+ assert "ident gB is not in the taxonomy database." in str(exc.value)
+
+
+def test_summarize_gather_at_best_only_0():
+ """test two matches, diff f_unique_weighted"""
+ # make mini gather_results
+ gA = ["queryA", "gA","0.5","0.6", "queryA_md5", "queryA.sig"]
+ gB = ["queryA", "gB","0.3","0.1", "queryA_md5", "queryA.sig"]
+ g_res = make_mini_gather_results([gA,gB])
+
+ # make mini taxonomy
+ gA_tax = ("gA", "a;b;c")
+ gB_tax = ("gB", "a;b;d")
+ taxD = make_mini_taxonomy([gA_tax,gB_tax])
+ # run summarize_gather_at and check results!
+ sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res, best_only=True)
+ # superkingdom
+ assert len(sk_sum) == 1
+ print("superkingdom summarized gather: ", sk_sum[0])
+ assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),)
+ assert sk_sum[0].fraction == 0.7
+
+ # phylum
+ phy_sum, _ = summarize_gather_at("phylum", taxD, g_res, best_only=True)
+ print("phylum summarized gather: ", phy_sum[0])
+ assert len(phy_sum) == 1
+ assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b'))
+ assert phy_sum[0].fraction == 0.7
+ # class
+ cl_sum, _ = summarize_gather_at("class", taxD, g_res, best_only=True)
+ assert len(cl_sum) == 1
+ print("class summarized gather: ", cl_sum)
+ assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),
+ LineagePair(rank='phylum', name='b'),
+ LineagePair(rank='class', name='c'))
+ assert cl_sum[0].fraction == 0.6
+
+
+def test_summarize_gather_at_best_only_equal_choose_first():
+ """test two matches, equal f_unique_weighted. best_only chooses first"""
+ # make mini gather_results
+ gA = ["queryA", "gA","0.5","0.5", "queryA_md5", "queryA.sig"]
+ gB = ["queryA", "gB","0.3","0.5", "queryA_md5", "queryA.sig"]
+ g_res = make_mini_gather_results([gA,gB])
+
+ # make mini taxonomy
+ gA_tax = ("gA", "a;b;c")
+ gB_tax = ("gB", "a;b;d")
+ taxD = make_mini_taxonomy([gA_tax,gB_tax])
+
+ # run summarize_gather_at and check results!
+ # class
+ cl_sum, _ = summarize_gather_at("class", taxD, g_res, best_only=True)
+ assert len(cl_sum) == 1
+ print("class summarized gather: ", cl_sum)
+ assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),
+ LineagePair(rank='phylum', name='b'),
+ LineagePair(rank='class', name='c'))
+ assert cl_sum[0].fraction == 0.5
+
+
+def test_write_summary_csv(runtmp):
+ """test summary csv write function"""
+
+ sum_gather = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=1.0,
+ query_md5='queryA_md5', query_filename='queryA.sig',
+ lineage=(LineagePair(rank='superkingdom', name='a'),))],
+ 'phylum': [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=1.0,
+ query_md5='queryA_md5', query_filename='queryA.sig',
+ lineage=(LineagePair(rank='superkingdom', name='a'),
+ LineagePair(rank='phylum', name='b')))]}
+
+ outs= runtmp.output("outsum.csv")
+ with open(outs, 'w') as out_fp:
+ write_summary(sum_gather, out_fp)
+
+ sr = [x.rstrip().split(',') for x in open(outs, 'r')]
+ print("gather_summary_results_from_file: \n", sr)
+ assert ['query_name', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename'] == sr[0]
+ assert ['queryA', 'superkingdom', '1.000', 'a', 'queryA_md5', 'queryA.sig'] == sr[1]
+ assert ['queryA', 'phylum', '1.000', 'a;b', 'queryA_md5', 'queryA.sig'] == sr[2]
+
+
+def test_write_classification(runtmp):
+ """test classification csv write function"""
+ classif = ClassificationResult('queryA', 'match', 'phylum', 1.0,
+ (LineagePair(rank='superkingdom', name='a'),
+ LineagePair(rank='phylum', name='b')),
+ 'queryA_md5', 'queryA.sig')
+
+ classification = {'phylum': [classif]}
+
+ outs= runtmp.output("outsum.csv")
+ with open(outs, 'w') as out_fp:
+ write_classifications(classification, out_fp)
+
+ sr = [x.rstrip().split(',') for x in open(outs, 'r')]
+ print("gather_classification_results_from_file: \n", sr)
+ assert ['query_name', 'status', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename'] == sr[0]
+ assert ['queryA', 'match', 'phylum', '1.000', 'a;b', 'queryA_md5', 'queryA.sig'] == sr[1]
+
+
+def test_make_krona_header_0():
+ hd = make_krona_header("species")
+ print("header: ", hd)
+ assert hd == ("fraction", "superkingdom", "phylum", "class", "order", "family", "genus", "species")
+
+
+def test_make_krona_header_1():
+ hd = make_krona_header("order")
+ print("header: ", hd)
+ assert hd == ("fraction", "superkingdom", "phylum", "class", "order")
+
+
+def test_make_krona_header_strain():
+ hd = make_krona_header("strain", include_strain=True)
+ print("header: ", hd)
+ assert hd == ("fraction", "superkingdom", "phylum", "class", "order", "family", "genus", "species", "strain")
+
+
+def test_make_krona_header_fail():
+ with pytest.raises(ValueError) as exc:
+ make_krona_header("strain")
+ assert "Rank strain not present in available ranks" in str(exc.value)
+
+
+def test_aggregate_by_lineage_at_rank_by_query():
+ """test two queries, aggregate lineage at rank for each"""
+ # make gather results
+ gA = ["queryA","gA","0.5","0.5", "queryA_md5", "queryA.sig"]
+ gB = ["queryA","gB","0.3","0.4", "queryA_md5", "queryA.sig"]
+ gC = ["queryB","gB","0.3","0.3", "queryB_md5", "queryB.sig"]
+ g_res = make_mini_gather_results([gA,gB,gC])
+
+ # make mini taxonomy
+ gA_tax = ("gA", "a;b")
+ gB_tax = ("gB", "a;c")
+ taxD = make_mini_taxonomy([gA_tax,gB_tax])
+
+ # aggregate by lineage at rank
+ sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res)
+ print("superkingdom summarized gather results:", sk_sum)
+ assert len(sk_sum) ==2
+ assert sk_sum[0].query_name == "queryA"
+ assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),)
+ assert sk_sum[0].fraction == 0.9
+ assert sk_sum[1].query_name == "queryB"
+ assert sk_sum[1].lineage == (LineagePair(rank='superkingdom', name='a'),)
+ assert sk_sum[1].fraction == 0.3
+ sk_lin_sum, query_names, num_queries = aggregate_by_lineage_at_rank(sk_sum, by_query=True)
+ print("superkingdom lineage summary:", sk_lin_sum, '\n')
+ assert sk_lin_sum == {(LineagePair(rank='superkingdom', name='a'),): {'queryA': 0.9, 'queryB': 0.3}}
+ assert num_queries == 2
+ assert query_names == ['queryA', 'queryB']
+
+ phy_sum, _ = summarize_gather_at("phylum", taxD, g_res)
+ print("phylum summary:", phy_sum, ']\n')
+ phy_lin_sum, query_names, num_queries = aggregate_by_lineage_at_rank(phy_sum, by_query=True)
+ print("phylum lineage summary:", phy_lin_sum, '\n')
+ assert phy_lin_sum == {(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')): {'queryA': 0.5},
+ (LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='c')): {'queryA': 0.4, 'queryB': 0.3}}
+ assert num_queries == 2
+ assert query_names == ['queryA', 'queryB']
+
+
+def test_format_for_krona_0():
+ """test two matches, equal f_unique_weighted"""
+ # make gather results
+ gA = ["queryA", "gA","0.5","0.5", "queryA_md5", "queryA.sig"]
+ gB = ["queryA", "gB","0.3","0.5", "queryA_md5", "queryA.sig"]
+ g_res = make_mini_gather_results([gA,gB])
+
+ # make mini taxonomy
+ gA_tax = ("gA", "a;b;c")
+ gB_tax = ("gB", "a;b;d")
+ taxD = make_mini_taxonomy([gA_tax,gB_tax])
+
+ # check krona format and check results!
+ sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res)
+ print("superkingdom summarized gather results:", sk_sum)
+ krona_res = format_for_krona("superkingdom", {"superkingdom": sk_sum})
+ print("krona_res: ", krona_res)
+ assert krona_res == [(1.0, 'a')]
+
+ phy_sum, _ = summarize_gather_at("phylum", taxD, g_res)
+ krona_res = format_for_krona("phylum", {"phylum": phy_sum})
+ print("krona_res: ", krona_res)
+ assert krona_res == [(1.0, 'a', 'b')]
+
+
+def test_format_for_krona_1():
+ """test two matches, equal f_unique_weighted"""
+ # make gather results
+ gA = ["queryA", "gA","0.5","0.5", "queryA_md5", "queryA.sig"]
+ gB = ["queryA", "gB","0.3","0.5", "queryA_md5", "queryA.sig"]
+ g_res = make_mini_gather_results([gA,gB])
+
+ # make mini taxonomy
+ gA_tax = ("gA", "a;b;c")
+ gB_tax = ("gB", "a;b;d")
+ taxD = make_mini_taxonomy([gA_tax,gB_tax])
+
+ # summarize with all ranks
+ sum_res = {}
+ #for rank in lca_utils.taxlist(include_strain=False):
+ for rank in ['superkingdom', 'phylum', 'class']:
+ sum_res[rank], _ = summarize_gather_at(rank, taxD, g_res)
+ print('summarized gather: ', sum_res)
+ # check krona format
+ sk_krona = format_for_krona("superkingdom", sum_res)
+ print("sk_krona: ", sk_krona)
+ assert sk_krona == [(1.0, 'a')]
+ phy_krona = format_for_krona("phylum", sum_res)
+ print("phy_krona: ", phy_krona)
+ assert phy_krona == [(1.0, 'a', 'b')]
+ cl_krona = format_for_krona("class", sum_res)
+ print("cl_krona: ", cl_krona)
+ assert cl_krona == [(0.5, 'a', 'b', 'c'), (0.5, 'a', 'b', 'd')]
+
+
+def test_format_for_krona_best_only():
+ """test two matches, equal f_unique_weighted"""
+ # make gather results
+ gA = ["queryA", "gA","0.5","0.5", "queryA_md5", "queryA.sig"]
+ gB = ["queryA", "gB","0.3","0.5", "queryA_md5", "queryA.sig"]
+ g_res = make_mini_gather_results([gA,gB])
+
+ # make mini taxonomy
+ gA_tax = ("gA", "a;b;c")
+ gB_tax = ("gB", "a;b;d")
+ taxD = make_mini_taxonomy([gA_tax,gB_tax])
+
+ # summarize with all ranks
+ sum_res = {}
+ #for rank in lca_utils.taxlist(include_strain=False):
+ for rank in ['superkingdom', 'phylum', 'class']:
+ sum_res[rank], _ = summarize_gather_at(rank, taxD, g_res, best_only=True)
+ print('summarized gather: ', sum_res)
+ # check krona format
+ sk_krona = format_for_krona("superkingdom", sum_res)
+ print("sk_krona: ", sk_krona)
+ assert sk_krona == [(1.0, 'a')]
+ phy_krona = format_for_krona("phylum", sum_res)
+ print("phy_krona: ", phy_krona)
+ assert phy_krona == [(1.0, 'a', 'b')]
+ cl_krona = format_for_krona("class", sum_res)
+ print("cl_krona: ", cl_krona)
+ assert cl_krona == [(0.5, 'a', 'b', 'c')]
+
+
+def test_write_krona(runtmp):
+ """test two matches, equal f_unique_weighted"""
+ class_krona_results = [(0.5, 'a', 'b', 'c'), (0.5, 'a', 'b', 'd')]
+ outk= runtmp.output("outkrona.tsv")
+ with open(outk, 'w') as out_fp:
+ write_krona("class", class_krona_results, out_fp)
+
+ kr = [x.strip().split('\t') for x in open(outk, 'r')]
+ print("krona_results_from_file: \n", kr)
+ assert kr[0] == ["fraction", "superkingdom", "phylum", "class"]
+ assert kr[1] == ["0.5", "a", "b", "c"]
+ assert kr[2] == ["0.5", "a", "b", "d"]
+
+
+def test_combine_sumgather_csvs_by_lineage(runtmp):
+ # some summarized gather dicts
+ sum_gather1 = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5,
+ query_md5='queryA_md5', query_filename='queryA.sig',
+ lineage=(LineagePair(rank='superkingdom', name='a'),))],
+ 'phylum': [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.5,
+ query_md5='queryA_md5', query_filename='queryA.sig',
+ lineage=(LineagePair(rank='superkingdom', name='a'),
+ LineagePair(rank='phylum', name='b')))]}
+ sum_gather2 = {'superkingdom': [SummarizedGatherResult(query_name='queryB', rank='superkingdom', fraction=0.7,
+ query_md5='queryB_md5', query_filename='queryB.sig',
+ lineage=(LineagePair(rank='superkingdom', name='a'),))],
+ 'phylum': [SummarizedGatherResult(query_name='queryB', rank='phylum', fraction=0.7,
+ query_md5='queryB_md5', query_filename='queryB.sig',
+ lineage=(LineagePair(rank='superkingdom', name='a'),
+ LineagePair(rank='phylum', name='c')))]}
+
+ # write summarized gather results csvs
+ sg1= runtmp.output("sample1.csv")
+ with open(sg1, 'w') as out_fp:
+ write_summary(sum_gather1, out_fp)
+
+ sg2= runtmp.output("sample2.csv")
+ with open(sg2, 'w') as out_fp:
+ write_summary(sum_gather2, out_fp)
+
+ # test combine_summarized_gather_csvs_by_lineage_at_rank
+ linD, query_names = combine_sumgather_csvs_by_lineage([sg1,sg2], rank="phylum")
+ print("lineage_dict", linD)
+ assert linD == {'a;b': {'queryA': '0.500'}, 'a;c': {'queryB': '0.700'}}
+ assert query_names == ['queryA', 'queryB']
+ linD, query_names = combine_sumgather_csvs_by_lineage([sg1,sg2], rank="superkingdom")
+ print("lineage dict: \n", linD)
+ assert linD, query_names == {'a': {'queryA': '0.500', 'queryB': '0.700'}}
+ assert query_names == ['queryA', 'queryB']
+
+
+def test_write_lineage_sample_frac(runtmp):
+ outfrac = runtmp.output('outfrac.csv')
+ sample_names = ['sample1', 'sample2']
+ sk_linD = {'a': {'sample1': '0.500' ,'sample2': '0.700'}}
+ with open(outfrac, 'w') as out_fp:
+ write_lineage_sample_frac(sample_names, sk_linD, out_fp)
+
+ frac_lines = [x.strip().split('\t') for x in open(outfrac, 'r')]
+ print("csv_lines: ", frac_lines)
+ assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a', '0.500', '0.700']]
+
+ phy_linD = {'a;b': {'sample1': '0.500'}, 'a;c': {'sample2': '0.700'}}
+ with open(outfrac, 'w') as out_fp:
+ write_lineage_sample_frac(sample_names, phy_linD, out_fp)
+
+ frac_lines = [x.strip().split('\t') for x in open(outfrac, 'r')]
+ print("csv_lines: ", frac_lines)
+ assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a;b', '0.500', '0'], ['a;c', '0', '0.700']]
+
+
+def test_write_lineage_sample_frac_format_lineage(runtmp):
+ outfrac = runtmp.output('outfrac.csv')
+ sample_names = ['sample1', 'sample2']
+ sk_lineage = lca_utils.make_lineage('a')
+ print(sk_lineage)
+ sk_linD = {sk_lineage: {'sample1': '0.500' ,'sample2': '0.700'}}
+ with open(outfrac, 'w') as out_fp:
+ write_lineage_sample_frac(sample_names, sk_linD, out_fp, format_lineage=True)
+
+ frac_lines = [x.strip().split('\t') for x in open(outfrac, 'r')]
+ print("csv_lines: ", frac_lines)
+ assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a', '0.500', '0.700']]
+
+ phy_lineage = lca_utils.make_lineage('a;b')
+ print(phy_lineage)
+ phy2_lineage = lca_utils.make_lineage('a;c')
+ print(phy2_lineage)
+ phy_linD = {phy_lineage: {'sample1': '0.500'}, phy2_lineage: {'sample2': '0.700'}}
+ with open(outfrac, 'w') as out_fp:
+ write_lineage_sample_frac(sample_names, phy_linD, out_fp, format_lineage=True)
+
+ frac_lines = [x.strip().split('\t') for x in open(outfrac, 'r')]
+ print("csv_lines: ", frac_lines)
+ assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a;b', '0.500', '0'], ['a;c', '0', '0.700']]
+
+
+def test_combine_sumgather_csvs_by_lineage_improper_rank(runtmp):
+ # some summarized gather dicts
+ sum_gather1 = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5,
+ query_md5='queryA_md5', query_filename='queryA.sig',
+ lineage=(LineagePair(rank='superkingdom', name='a'),))],
+ 'phylum': [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.5,
+ query_md5='queryA_md5', query_filename='queryA.sig',
+ lineage=(LineagePair(rank='superkingdom', name='a'),
+ LineagePair(rank='phylum', name='b')))]}
+ sum_gather2 = {'superkingdom': [SummarizedGatherResult(query_name='queryB', rank='superkingdom', fraction=0.7,
+ query_md5='queryB_md5', query_filename='queryB.sig',
+ lineage=(LineagePair(rank='superkingdom', name='a'),))],
+ 'phylum': [SummarizedGatherResult(query_name='queryB', rank='phylum', fraction=0.7,
+ query_md5='queryB_md5', query_filename='queryB.sig',
+ lineage=(LineagePair(rank='superkingdom', name='a'),
+ LineagePair(rank='phylum', name='c')))]}
+
+ # write summarized gather results csvs
+ sg1= runtmp.output("sample1.csv")
+ with open(sg1, 'w') as out_fp:
+ write_summary(sum_gather1, out_fp)
+
+ sg2= runtmp.output("sample2.csv")
+ with open(sg2, 'w') as out_fp:
+ write_summary(sum_gather2, out_fp)
+
+ # test combine_summarized_gather_csvs_by_lineage_at_rank
+ with pytest.raises(ValueError) as exc:
+ linD, sample_names = combine_sumgather_csvs_by_lineage([sg1,sg2], rank="strain")
+ print("ValueError: ", exc.value)
+ assert "Rank strain not available." in str(exc.value)
+
+
+def test_tax_multi_load_files(runtmp):
+ # test loading various good and bad files
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ taxonomy_csv2 = utils.get_test_data('tax/test-strain.taxonomy.csv')
+ badcsv = utils.get_test_data('tax/47+63_x_gtdb-rs202.gather.csv')
+
+ db = MultiLineageDB.load([taxonomy_csv])
+ assert len(db) == 6
+ assert 'strain' not in db.available_ranks
+
+ db = MultiLineageDB.load([taxonomy_csv2])
+ assert len(db) == 6
+ assert 'strain' in db.available_ranks
+ assert db['GCF_001881345.1'][0].rank == 'superkingdom'
+
+ # load a string rather than a list
+ with pytest.raises(TypeError):
+ MultiLineageDB.load(badcsv)
+
+ # load a bad CSV
+ with pytest.raises(ValueError):
+ MultiLineageDB.load([badcsv])
+
+ # load a directory
+ with pytest.raises(ValueError):
+ MultiLineageDB.load([runtmp.output('')])
+
+ # file does not exist
+ with pytest.raises(ValueError):
+ MultiLineageDB.load([runtmp.output('no-such-file')])
+
+
+def test_tax_multi_load_files_shadowed(runtmp):
+ # test loading various good and bad files
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ taxonomy_csv2 = utils.get_test_data('tax/test-strain.taxonomy.csv')
+ taxonomy_db = utils.get_test_data('tax/test.taxonomy.db')
+
+ db = MultiLineageDB.load([taxonomy_csv, taxonomy_csv2, taxonomy_db],
+ keep_full_identifiers=False,
+ keep_identifier_versions=False)
+ assert len(db.shadowed_identifiers()) == 6
+
+ # we should have everything including strain
+ assert set(lca_utils.taxlist()) == set(db.available_ranks)
+
+ db = MultiLineageDB.load([taxonomy_csv, taxonomy_db],
+ keep_full_identifiers=False,
+ keep_identifier_versions=False)
+ assert len(db.shadowed_identifiers()) == 6
+ assert set(lca_utils.taxlist(include_strain=False)) == set(db.available_ranks)
+
+
+def test_tax_multi_save_files(runtmp, keep_identifiers, keep_versions):
+ # test save
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+
+ if keep_identifiers and not keep_versions:
+ with pytest.raises(ValueError):
+ db = MultiLineageDB.load([taxonomy_csv],
+ keep_full_identifiers=keep_identifiers,
+ keep_identifier_versions=keep_versions)
+ return
+
+ db = MultiLineageDB.load([taxonomy_csv],
+ keep_full_identifiers=keep_identifiers,
+ keep_identifier_versions=keep_versions)
+
+ out_db = runtmp.output('out.db')
+ out_csv = runtmp.output('out.csv')
+ out2_csv = runtmp.output('out2.csv')
+
+ # can't save to fp with sql
+ with open(out_csv, 'wt') as fp:
+ with pytest.raises(ValueError):
+ db.save(fp, 'sql')
+
+ # these should all work...
+ with open(out_csv, 'wt') as fp:
+ db.save(fp, 'csv')
+
+ db.save(out2_csv, 'csv')
+ db.save(out_db, 'sql')
+
+ # ...and be equal
+ db1 = db.load([out_db])
+ db2 = db.load([out_csv])
+ db3 = db.load([out2_csv])
+
+ def strip_strain(it):
+ for k, v in it:
+ if v[-1].rank == 'strain':
+ v = v[:-1]
+ yield k, v
+
+ import pprint
+ db_items = list(strip_strain(db.items()))
+ db1_items = list(strip_strain(db1.items()))
+ db2_items = list(strip_strain(db2.items()))
+ db3_items = list(strip_strain(db3.items()))
+ pprint.pprint(db_items)
+ print('XXX')
+ pprint.pprint(list(db1_items))
+ print('XXX')
+ pprint.pprint(list(db2_items))
+
+ assert set(db_items) == set(db1_items)
+ assert set(db_items) == set(db2_items)
+ assert set(db_items) == set(db3_items)
+
+
+def test_lineage_db_csv_load(runtmp):
+ # test LineageDB.load
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+ taxonomy_csv2 = utils.get_test_data('tax/test-strain.taxonomy.csv')
+ badcsv = utils.get_test_data('tax/47+63_x_gtdb-rs202.gather.csv')
+ badcsv2 = utils.get_test_data('tax/test-missing-ranks.taxonomy.csv')
+
+ db = LineageDB.load(taxonomy_csv)
+ assert len(db) == 6
+ assert 'strain' not in db.available_ranks
+
+ db = LineageDB.load(taxonomy_csv2)
+ assert len(db) == 6
+ assert 'strain' in db.available_ranks
+
+ # load the wrong kind of csv
+ with pytest.raises(ValueError):
+ LineageDB.load(badcsv)
+
+ # load a bad CSV
+ with pytest.raises(ValueError):
+ LineageDB.load(badcsv2)
+
+ # load a directory
+ with pytest.raises(ValueError):
+ LineageDB.load(runtmp.output(''))
+
+ # file does not exist
+ with pytest.raises(ValueError):
+ LineageDB.load(runtmp.output('no-such-file'))
+
+ # construct a CSV with bad headers
+ with open(runtmp.output('xxx.csv'), 'w', newline="") as fp:
+ fp.write('x,y,z\n')
+ with pytest.raises(ValueError):
+ LineageDB.load(runtmp.output('xxx.csv'))
+
+
+def test_lineage_db_sql_load(runtmp):
+ # test LineageDB_sqlite.load
+ taxonomy_db = utils.get_test_data('tax/test.taxonomy.db')
+ taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+
+ db = LineageDB_Sqlite.load(taxonomy_db)
+ assert bool(db)
+ assert len(db) == 6
+ db.available_ranks
+ assert 'strain' not in db.available_ranks
+ assert db['GCF_001881345'][0].rank == 'superkingdom'
+ with pytest.raises(KeyError):
+ db['foo']
+
+ # load any kind of CSV
+ with pytest.raises(ValueError):
+ LineageDB_Sqlite.load(taxonomy_csv)
+
+ # load a directory
+ with pytest.raises(ValueError):
+ LineageDB_Sqlite.load(runtmp.output(''))
+
+ # file does not exist
+ with pytest.raises(ValueError):
+ LineageDB_Sqlite.load(runtmp.output('no-such-file'))
diff --git a/tox.ini b/tox.ini
index 228f689f07..10c2bbde31 100644
--- a/tox.ini
+++ b/tox.ini
@@ -19,7 +19,7 @@ skip_missing_interpreters = true
description = run the tests with pytest under {basepython}
setenv =
PIP_DISABLE_VERSION_CHECK = 1
- COVERAGE_FILE = {toxworkdir}/.coverage.{envname}
+ COVERAGE_FILE = {env:COVERAGE_FILE:{toxworkdir}/.coverage.{envname}}
VIRTUALENV_NO_DOWNLOAD = 1
PIP_EXTRA_INDEX_URL = https://antocuni.github.io/pypy-wheels/manylinux2010
passenv =
@@ -33,18 +33,14 @@ passenv =
PYTEST_*
PIP_CACHE_DIR
CI
- TRAVIS
- TRAVIS_*
PYTHONDEVMODE
deps =
- pip>=19.3.1
+ pip >= 19.3.1
extras =
test
storage
-changedir = tests
commands = pytest \
--cov "{envsitepackagesdir}/sourmash" \
- --cov . \
--cov-config "{toxinidir}/tox.ini" \
--cov-report= \
--junitxml {toxworkdir}/junit.{envname}.xml \
@@ -58,7 +54,6 @@ deps =
[testenv:hypothesis]
commands = pytest \
--cov "{envsitepackagesdir}/sourmash" \
- --cov . \
--cov-config "{toxinidir}/tox.ini" \
--cov-report= \
--junitxml {toxworkdir}/junit.{envname}.xml \
@@ -72,7 +67,6 @@ deps =
khmer
commands = pytest \
--cov "{envsitepackagesdir}/sourmash" \
- --cov . \
--cov-config "{toxinidir}/tox.ini" \
--cov-report= \
--junitxml {toxworkdir}/junit.{envname}.xml \
@@ -84,7 +78,6 @@ deps =
-e git+/~https://github.com/dib-lab/khmer.git#egg=khmer
commands = pytest \
--cov "{envsitepackagesdir}/sourmash" \
- --cov . \
--cov-config "{toxinidir}/tox.ini" \
--cov-report= \
--junitxml {toxworkdir}/junit.{envname}.xml \