sourmash-bio · ctb · Aug 26, 2022 · Aug 16, 2022 · Aug 17, 2022 · Aug 19, 2022
diff --git a/doc/command-line.md b/doc/command-line.md
@@ -49,7 +49,7 @@ Finally, plot a dendrogram: ``` sourmash plot cmp.dist --labels ```
 This will output three files, `cmp.dist.dendro.png`,
 `cmp.dist.matrix.png`, and `cmp.dist.hist.png`, containing a
 clustering & dendrogram of the sequences, a similarity matrix and
-heatmap, and a histogram of the pairwise distances between the three
+heatmap, and a histogram of the pairwise similarities between the three
 genomes.
 
 Matrix:
@@ -75,8 +75,8 @@ There are seven main subcommands: `sketch`, `compare`, `plot`,
 [the tutorial](tutorials.md) for a walkthrough of these commands.
 
 * `sketch` creates signatures.
-* `compare` compares signatures and builds a distance matrix.
-* `plot` plots distance matrices created by `compare`.
+* `compare` compares signatures and builds a similarity matrix.
+* `plot` plots similarity matrices created by `compare`.
 * `search` finds matches to a query signature in a collection of signatures.
 * `gather` finds the best reference genomes for a metagenome, using the provided collection of signatures.
 * `index` builds a fast index for many (thousands) of signatures.
@@ -199,27 +199,47 @@ with `--output` and used with the `sourmash plot` subcommand (or loaded
 with `numpy.load(...)`.  Using `--csv` will output a CSV file that can
 be loaded into other languages than Python, such as R.
 
+As of sourmash 4.4.0, `compare` also supports Average Nucleotide
+Identity (ANI) estimates instead of Jaccard or containment index; use
+`--ani` to enable this.
+
 Usage:
 ```
 sourmash compare file1.sig [ file2.sig ... ]
 ```
 
 Options:
 
-* `--output` -- save the distance matrix to this file (as a numpy binary matrix)
+* `--output` -- save the output matrix to this file (as a numpy binary matrix).
+* `--distance-matrix` -- create and output a distance matrix, instead of a similarity matrix.
 * `--ksize` -- do the comparisons at this k-mer size.
 * `--containment` -- calculate containment instead of similarity; `C(i, j) = size(i intersection j) / size(i)`
+* `--ani` -- output estimates of Average Nucleotide Identity (ANI) instead of Jaccard similarity or containment.
 * `--from-file` -- append the list of files in this text file to the input
         signatures.
 * `--ignore-abundance` -- ignore abundances in signatures.
 * `--picklist` -- select a subset of signatures with [a picklist](#using-picklists-to-subset-large-collections-of-signatures)
 
-**Note:** compare by default produces a symmetric similarity matrix that can be used as an input to clustering. With `--containment`, however, this matrix is no longer symmetric and cannot formally be used for clustering.
+**Note:** compare by default produces a symmetric similarity matrix
+that can be used for clustering in downstream tasks. With `--containment`,
+however, this matrix is no longer symmetric and cannot formally be
+used for clustering.
+
+The containment matrix is organized such that the value in row A for column B is the containment of the B'th sketch in the A'th sketch, i.e.
+
+```
+C(A, B) = B.contained_by(A)
+```
+
+**Note:** The ANI estimate will be calculated based on Jaccard similarity
+by default; however, if `--containment` or `--max-containment` is
+specified, those values will be used instead. With `--containment --ani`, the
+ANI output matrix will be asymmetric as discussed above.
 
 ### `sourmash plot` - cluster and visualize comparisons of many signatures
 
 The `plot` subcommand produces two plots -- a dendrogram and a
-dendrogram+matrix -- from a distance matrix created by `sourmash compare
+dendrogram+matrix -- from a matrix created by `sourmash compare
 --output <matrix>`.  The default output is two PNG files.
 
 Usage:

diff --git a/src/sourmash/cli/compare.py b/src/sourmash/cli/compare.py
@@ -1,4 +1,4 @@
-"""compare sequence signatures made by compute"""
+"""create a similarity matrix comparing many samples"""
 
 usage="""
 
@@ -39,8 +39,6 @@ def subparser(subparsers):
     subparser.add_argument(
         '-q', '--quiet', action='store_true', help='suppress non-error output'
     )
-    add_ksize_arg(subparser)
-    add_moltype_args(subparser)
     subparser.add_argument(
         '-o', '--output', metavar='F',
         help='file to which output will be written; default is terminal '
@@ -82,6 +80,18 @@ def subparser(subparsers):
     subparser.add_argument(
         '-p', '--processes', metavar='N', type=int, default=None,
         help='Number of processes to use to calculate similarity')
+    subparser.add_argument(
+        '--distance-matrix', action='store_true',
+        help='output a distance matrix, instead of a similarity matrix'
+    )
+    subparser.add_argument(
+        '--similarity-matrix', action='store_false',
+        dest='distance_matrix',
+        help='output a similiarty matrix; this is the default',
+    )
+
+    add_ksize_arg(subparser)
+    add_moltype_args(subparser)
     add_picklist_args(subparser)
     add_pattern_args(subparser)
 

diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
@@ -169,16 +169,24 @@ def compare(args):
         similarity = compare_all_pairs(siglist, args.ignore_abundance,
                                        n_jobs=args.processes, return_ani=return_ani)
 
+    # if distance matrix desired, switch to 1-similarity
+    if args.distance_matrix:
+        similarity = 1 - similarity
+
     if len(siglist) < 30:
-        for i, E in enumerate(siglist):
+        for i, ss in enumerate(siglist):
             # for small matrices, pretty-print some output
-            name_num = '{}-{}'.format(i, str(E))
+            name_num = '{}-{}'.format(i, str(ss))
             if len(name_num) > 20:
                 name_num = name_num[:17] + '...'
             print_results('{:20s}\t{}'.format(name_num, similarity[i, :, ],))
 
-    print_results('min similarity in matrix: {:.3f}', numpy.min(similarity))
-    # shall we output a matrix?
+    if args.distance_matrix:
+        print_results('max distance in matrix: {:.3f}', numpy.max(similarity))
+    else:
+        print_results('min similarity in matrix: {:.3f}', numpy.min(similarity))
+
+    # shall we output a matrix to stdout?
     if args.output:
         labeloutname = args.output + '.labels.txt'
         notify(f'saving labels to: {labeloutname}')
@@ -202,7 +210,10 @@ def compare(args):
                 w.writerow(y)
 
     if size_may_be_inaccurate:
-        notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 0 for these comparisons.")
+        if args.distance_matrix:
+            notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI distances will be set to 1 for these comparisons.")
+        else:
+            notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 1 for these comparisons.")
 
 
 def plot(args):

diff --git a/src/sourmash/search.py b/src/sourmash/search.py
@@ -321,11 +321,11 @@ def estimate_search_ani(self):
         if self.cmp_scaled is None:
             raise TypeError("ANI can only be estimated from scaled signatures.")
         if self.searchtype == SearchType.CONTAINMENT:
-            self.cmp.estimate_mh1_containment_ani(containment = self.similarity)
-            self.ani = self.cmp.mh1_containment_ani
+            self.cmp.estimate_ani_from_mh2_containment_in_mh1(containment = self.similarity)
+            self.ani = self.cmp.ani_from_mh2_containment_in_mh1
             if self.estimate_ani_ci:
-                self.ani_low = self.cmp.mh1_containment_ani_low
-                self.ani_high = self.cmp.mh1_containment_ani_high
+                self.ani_low = self.cmp.ani_from_mh2_containment_in_mh1_low
+                self.ani_high = self.cmp.ani_from_mh2_containment_in_mh1_high
         elif self.searchtype == SearchType.MAX_CONTAINMENT:
             self.cmp.estimate_max_containment_ani(max_containment = self.similarity)
             self.ani = self.cmp.max_containment_ani
@@ -377,25 +377,25 @@ def init_sigcomparison(self):
 
     def estimate_containment_ani(self):
         self.cmp.estimate_all_containment_ani()
-        self.query_containment_ani = self.cmp.mh1_containment_ani
-        self.match_containment_ani = self.cmp.mh2_containment_ani
+        self.query_containment_ani = self.cmp.ani_from_mh1_containment_in_mh2
+        self.match_containment_ani = self.cmp.ani_from_mh2_containment_in_mh1
         self.average_containment_ani = self.cmp.avg_containment_ani
         self.max_containment_ani = self.cmp.max_containment_ani
         self.potential_false_negative = self.cmp.potential_false_negative
         if self.estimate_ani_ci:
             self.handle_ani_ci()
 
     def handle_ani_ci(self):
-        self.query_containment_ani_low = self.cmp.mh1_containment_ani_low
-        self.query_containment_ani_high = self.cmp.mh1_containment_ani_high
-        self.match_containment_ani_low = self.cmp.mh2_containment_ani_low
-        self.match_containment_ani_high = self.cmp.mh2_containment_ani_high
+        self.query_containment_ani_low = self.cmp.ani_from_mh1_containment_in_mh2_low
+        self.query_containment_ani_high = self.cmp.ani_from_mh1_containment_in_mh2_high
+        self.match_containment_ani_low = self.cmp.ani_from_mh2_containment_in_mh1_low
+        self.match_containment_ani_high = self.cmp.ani_from_mh2_containment_in_mh1_high
 
     def build_prefetch_result(self):
         # unique prefetch values
         self.jaccard = self.cmp.jaccard
-        self.f_query_match = self.cmp.mh2_containment #db_mh.contained_by(query_mh)
-        self.f_match_query = self.cmp.mh1_containment #query_mh.contained_by(db_mh)
+        self.f_query_match = self.cmp.mh2_containment_in_mh1 #db_mh.contained_by(query_mh)
+        self.f_match_query = self.cmp.mh1_containment_in_mh2 #query_mh.contained_by(db_mh)
         # set write columns for prefetch result
         self.write_cols = self.prefetch_write_cols
         if self.estimate_ani_ci:
@@ -476,10 +476,10 @@ def build_gather_result(self):
         self.unique_intersect_bp = self.gather_comparison.total_unique_intersect_hashes
 
         # calculate fraction of subject match with orig query
-        self.f_match_orig = self.cmp.mh2_containment
+        self.f_match_orig = self.cmp.mh2_containment_in_mh1
 
         # calculate fractions wrt first denominator - genome size
-        self.f_match = self.gather_comparison.mh2_containment # unique match containment
+        self.f_match = self.gather_comparison.mh2_containment_in_mh1 # unique match containment
         self.f_orig_query = len(self.cmp.intersect_mh) / self.orig_query_len
         assert self.gather_comparison.intersect_mh.contained_by(self.gather_comparison.mh1_cmp) == 1.0
 
@@ -536,8 +536,8 @@ def prefetchresultdict(self):
         if self.estimate_ani_ci:
             prefetch_cols = self.prefetch_write_cols_ci
         self.jaccard = self.cmp.jaccard
-        self.f_query_match = self.cmp.mh2_containment #db_mh.contained_by(query_mh)
-        self.f_match_query = self.cmp.mh1_containment #query_mh.contained_by(db_mh)
+        self.f_query_match = self.cmp.mh2_containment_in_mh1 #db_mh.contained_by(query_mh)
+        self.f_match_query = self.cmp.mh1_containment_in_mh2 #query_mh.contained_by(db_mh)
         self.prep_prefetch_result()
         return self.to_write(columns=prefetch_cols)
 

diff --git a/src/sourmash/sketchcomparison.py b/src/sourmash/sketchcomparison.py
@@ -124,41 +124,41 @@ def total_unique_intersect_hashes(self):
         return len(self.intersect_mh) * self.cmp_scaled # + (ksize-1) #for bp estimation
 
     @property
-    def mh1_containment(self):
+    def mh1_containment_in_mh2(self):
         return self.mh1_cmp.contained_by(self.mh2_cmp)
 
-    def estimate_mh1_containment_ani(self, containment = None):
+    def estimate_ani_from_mh1_containment_in_mh2(self, containment = None):
         # build result once
         m1_cani = self.mh1_cmp.containment_ani(self.mh2_cmp,
                                             containment=containment,
                                             confidence=self.ani_confidence,
                                             estimate_ci=self.estimate_ani_ci)
 #                                            prob_threshold=self.pfn_threshold)
         # propagate params
-        self.mh1_containment_ani = m1_cani.ani
+        self.ani_from_mh1_containment_in_mh2 = m1_cani.ani
         if m1_cani.p_exceeds_threshold:
             # only update if True
             self.potential_false_negative = True
         if self.estimate_ani_ci:
-            self.mh1_containment_ani_low = m1_cani.ani_low
-            self.mh1_containment_ani_high = m1_cani.ani_high
+            self.ani_from_mh1_containment_in_mh2_low = m1_cani.ani_low
+            self.ani_from_mh1_containment_in_mh2_high = m1_cani.ani_high
 
     @property
-    def mh2_containment(self):
+    def mh2_containment_in_mh1(self):
         return self.mh2_cmp.contained_by(self.mh1_cmp)
 
-    def estimate_mh2_containment_ani(self, containment=None):
+    def estimate_ani_from_mh2_containment_in_mh1(self, containment=None):
         m2_cani =  self.mh2_cmp.containment_ani(self.mh1_cmp,
                                             containment=containment,
                                             confidence=self.ani_confidence,
                                             estimate_ci=self.estimate_ani_ci)
 #                                            prob_threshold=self.pfn_threshold)
-        self.mh2_containment_ani = m2_cani.ani
+        self.ani_from_mh2_containment_in_mh1 = m2_cani.ani
         if m2_cani.p_exceeds_threshold:
             self.potential_false_negative = True
         if self.estimate_ani_ci:
-            self.mh2_containment_ani_low = m2_cani.ani_low
-            self.mh2_containment_ani_high = m2_cani.ani_high
+            self.ani_from_mh2_containment_in_mh1_low = m2_cani.ani_low
+            self.ani_from_mh2_containment_in_mh1_high = m2_cani.ani_high
 
     @property
     def max_containment(self):
@@ -185,22 +185,22 @@ def avg_containment(self):
     @property
     def avg_containment_ani(self):
         "Returns single average_containment_ani value. Sets self.potential_false_negative internally."
-        self.estimate_mh1_containment_ani()
-        self.estimate_mh2_containment_ani()
-        if any([self.mh1_containment_ani is None, self.mh2_containment_ani is None]):
+        self.estimate_ani_from_mh1_containment_in_mh2()
+        self.estimate_ani_from_mh2_containment_in_mh1()
+        if any([self.ani_from_mh1_containment_in_mh2 is None, self.ani_from_mh2_containment_in_mh1 is None]):
             return None
         else:
-            return (self.mh1_containment_ani + self.mh2_containment_ani)/2
+            return (self.ani_from_mh1_containment_in_mh2 + self.ani_from_mh2_containment_in_mh1)/2
 
     def estimate_all_containment_ani(self):
         "Estimate all containment ANI values."
-        self.estimate_mh1_containment_ani()
-        self.estimate_mh2_containment_ani()
-        if any([self.mh1_containment_ani is None, self.mh2_containment_ani is None]):
+        self.estimate_ani_from_mh1_containment_in_mh2()
+        self.estimate_ani_from_mh2_containment_in_mh1()
+        if any([self.ani_from_mh1_containment_in_mh2 is None, self.ani_from_mh2_containment_in_mh1 is None]):
 #            self.estimate_max_containment_ani()
             self.max_containment_ani = None
         else:
-            self.max_containment_ani = max([self.mh1_containment_ani, self.mh2_containment_ani])
+            self.max_containment_ani = max([self.ani_from_mh1_containment_in_mh2, self.ani_from_mh2_containment_in_mh1])
 
     def weighted_intersection(self, from_mh=None, from_abundD={}):
          # map abundances to all intersection hashes.

diff --git a/tests/test-data/47-63-merge.sig b/tests/test-data/47-63-merge.sig
diff --git a/tests/test_prefetch.py b/tests/test_prefetch.py
@@ -12,7 +12,7 @@
 import sourmash_tst_utils as utils
 import sourmash
 from sourmash_tst_utils import SourmashCommandFailed
-from sourmash import signature
+from sourmash import signature, sourmash_args
 
 
 def approx_eq(val1, val2):
@@ -839,3 +839,28 @@ def test_prefetch_ani_csv_out_estimate_ci(runtmp, linear_gather):
             assert approx_eq(row['max_containment_ani'], expected['mc_ani'])
             assert approx_eq(row['average_containment_ani'], expected['ac_ani'])
             assert row['potential_false_negative'] == expected['pfn']
+
+
+def test_prefetch_ani_containment_asymmetry(runtmp):
+    # test contained_by asymmetries, viz #2215
+    query_sig = utils.get_test_data('47.fa.sig')
+    merged_sig = utils.get_test_data('47-63-merge.sig')
+
+    runtmp.sourmash('prefetch', query_sig, merged_sig, '-o',
+                    'query-in-merged.csv')
+    runtmp.sourmash('prefetch', merged_sig, query_sig, '-o',
+                    'merged-in-query.csv')
+
+    with sourmash_args.FileInputCSV(runtmp.output('query-in-merged.csv')) as r:
+        query_in_merged = list(r)[0]
+
+    with sourmash_args.FileInputCSV(runtmp.output('merged-in-query.csv')) as r:
+        merged_in_query = list(r)[0]
+
+    assert query_in_merged['query_containment_ani'] == '1.0'
+    assert query_in_merged['match_containment_ani'] == '0.9865155060423993'
+    assert query_in_merged['average_containment_ani'] == '0.9932577530211997'
+
+    assert merged_in_query['match_containment_ani'] == '1.0'
+    assert merged_in_query['query_containment_ani'] == '0.9865155060423993'
+    assert merged_in_query['average_containment_ani'] == '0.9932577530211997'