From 0eb1211e18cf5b3c4420eb9033854be5addbb723 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Gudy=C5=9B?= <adam.gudys@polsl.pl>
Date: Mon, 25 Nov 2024 14:58:53 +0100
Subject: [PATCH] Remove --bin* options, update submodules and CI build

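* Remove the --bin* options (--bin, --bin-fasta) from vclust.py; binary
  paths are no longer configurable from the command line.
* Update submodules to latest revisions and copy their binaries from
  each submodule's bin/ directory.
* Build in CI with gmake and STATIC_LINK=true; replace macOS-12 runners
  with macOS-13.
* Rename the package from vclust-test to vclust and bump the version
  to 1.2.9.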

Co-authored-by: aziele <a.zielezinski@gmail.com>
---
 .github/workflows/deploy.yml      |  2 +-
 .github/workflows/main.yml        | 12 ++++--
 .github/workflows/self-hosted.yml |  2 +-
 3rd_party/clusty                  |  2 +-
 3rd_party/kmer-db                 |  2 +-
 3rd_party/lz-ani                  |  2 +-
 3rd_party/ref-utils               |  2 +-
 README.md                         |  2 +
 makefile                          |  8 ++--
 pyproject.toml                    |  2 +-
 vclust.py                         | 67 +++++++------------------------
 11 files changed, 36 insertions(+), 67 deletions(-)

diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index d865f7e..3f0d88a 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -52,7 +52,7 @@ jobs:
         
     steps:
     - name: make
-      run: make -j32 CXX=g++-${{matrix.compiler}} CC=gcc-${{matrix.compiler}} PLATFORM=${{ matrix.platform }} LEIDEN=true
+      run: gmake -j32 CXX=g++-${{matrix.compiler}} CC=gcc-${{matrix.compiler}} PLATFORM=${{ matrix.platform }} LEIDEN=true STATIC_LINK=true
     - name: tar artifacts
       run: | 
         mkdir ${DIR}
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 3d56435..e7b073b 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -15,18 +15,24 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        machine: [ubuntu-latest, macOS-12]
+        machine: [ubuntu-latest]
+        gmake_install_command: ['gmake --version']
         compiler: [12]
+        include:
+        - {machine: macOS-13, gmake_install_command: 'brew install make && gmake --version', compiler: 12}
     runs-on: ['${{ matrix.machine }}']   
 
     steps:
     - uses: actions/checkout@v4
       with:
         submodules: recursive
+
+    - name: install gmake
+      run: ${{ matrix.gmake_install_command }}        
     
     - name: make
       run: | 
-        make -j32 CXX=g++-${{matrix.compiler}} CC=gcc-${{matrix.compiler}}
+        gmake -j CXX=g++-${{matrix.compiler}} CC=gcc-${{matrix.compiler}} STATIC_LINK=true
     - name: tar artifacts
       run: tar -cvf vclust.tar ./vclust.py ./test.py ./example ./bin/kmer-db ./bin/lz-ani ./bin/clusty ./bin/multi-fasta-split
     
@@ -42,7 +48,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        machine: [ubuntu-latest, macOS-12]
+        machine: [ubuntu-latest, macOS-13]
      
     runs-on: ['${{ matrix.machine }}']    
     
diff --git a/.github/workflows/self-hosted.yml b/.github/workflows/self-hosted.yml
index 58abdae..9daafb6 100644
--- a/.github/workflows/self-hosted.yml
+++ b/.github/workflows/self-hosted.yml
@@ -50,7 +50,7 @@ jobs:
 
     steps:
     - name: make
-      run: make -j32 CXX=g++-${{matrix.compiler}} CC=gcc-${{matrix.compiler}} PLATFORM=${{ matrix.platform }} LEIDEN=true
+      run: gmake -j32 CXX=g++-${{matrix.compiler}} CC=gcc-${{matrix.compiler}} PLATFORM=${{ matrix.platform }} LEIDEN=true STATIC_LINK=true
     - name: print info
       run: python3 vclust.py info 
 
diff --git a/3rd_party/clusty b/3rd_party/clusty
index 7b109d4..6a5430d 160000
--- a/3rd_party/clusty
+++ b/3rd_party/clusty
@@ -1 +1 @@
-Subproject commit 7b109d42a4c603e26dead5b566d43c0506a858d7
+Subproject commit 6a5430ddf7adc5de7af20438daec61d179e3200b
diff --git a/3rd_party/kmer-db b/3rd_party/kmer-db
index 742b494..e98e257 160000
--- a/3rd_party/kmer-db
+++ b/3rd_party/kmer-db
@@ -1 +1 @@
-Subproject commit 742b4942b71271e8b0a1be63405e86b0d1f795ec
+Subproject commit e98e257c955ae8795cbc3a512ebbae96e21d6bc9
diff --git a/3rd_party/lz-ani b/3rd_party/lz-ani
index e3cc571..c898e1e 160000
--- a/3rd_party/lz-ani
+++ b/3rd_party/lz-ani
@@ -1 +1 @@
-Subproject commit e3cc571d973aedf634afd349c641dbb1328ea493
+Subproject commit c898e1e6a91dd90c3926fef583feb9ee5a04bb03
diff --git a/3rd_party/ref-utils b/3rd_party/ref-utils
index 21d36c7..6f52a54 160000
--- a/3rd_party/ref-utils
+++ b/3rd_party/ref-utils
@@ -1 +1 @@
-Subproject commit 21d36c7c5a629e23446400d51cfd317c57ac5dc7
+Subproject commit 6f52a541fcf3c4f880c37eaa30cee9f58837729e
diff --git a/README.md b/README.md
index f61af0a..bbe3191 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,8 @@
 [![Build and tests](../../workflows/Build%20and%20tests/badge.svg)](../../actions/workflows/main.yml)
 [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
 
+
+[![PyPI - Total Downloads](https://static.pepy.tech/personalized-badge/vclust?period=total&units=abbreviation&left_color=grey&right_color=green&left_text=PyPI%20total%20downloads)](https://www.pepy.tech/projects/vclust)
 [![PyPI - Downloads](https://img.shields.io/pypi/dm/vclust?label=PyPI%20downloads)](https://pypi.org/project/vclust/)
 [![GitHub downloads](https://img.shields.io/github/downloads/refresh-bio/vclust/total.svg?style=flag&label=GitHub%20downloads)](/~https://github.com/refresh-bio/vclust/releases)
 [![Bioconda downloads](https://img.shields.io/conda/dn/bioconda/vclust.svg?style=flag&label=Bioconda%20downloads)](https://anaconda.org/bioconda/vclust)
diff --git a/makefile b/makefile
index 8a205fd..47f4daa 100644
--- a/makefile
+++ b/makefile
@@ -14,10 +14,10 @@ prep:
 	cd 3rd_party/clusty && $(MAKE) -j 
 	cd 3rd_party/ref-utils && $(MAKE) -j 
 	mkdir -p bin
-	cp 3rd_party/kmer-db/kmer-db ./bin/
-	cp 3rd_party/lz-ani/lz-ani ./bin/
-	cp 3rd_party/clusty/clusty ./bin/
-	cp 3rd_party/ref-utils/multi-fasta-split/multi-fasta-split ./bin/
+	cp 3rd_party/kmer-db/bin/kmer-db ./bin/
+	cp 3rd_party/lz-ani/bin/lz-ani ./bin/
+	cp 3rd_party/clusty/bin/clusty ./bin/
+	cp 3rd_party/ref-utils/bin/multi-fasta-split ./bin/
 	
 clean:
 	cd 3rd_party/kmer-db && $(MAKE) clean
diff --git a/pyproject.toml b/pyproject.toml
index 3fa8bdb..494fad5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,7 @@ py-modules = ["vclust"]
 where = ["./"]
 
 [project]
-name = "vclust-test"
+name = "vclust"
 description = """Fast and accurate tool for calculating \
 Average Nucleotide Identity (ANI) and clustering virus \
 genomes and metagenomic contigs"""
diff --git a/vclust.py b/vclust.py
index 00c7269..b2dacc3 100755
--- a/vclust.py
+++ b/vclust.py
@@ -16,7 +16,7 @@
 import typing
 import uuid
 
-__version__ = '1.2.8'
+__version__ = '1.2.9'
 
 DEFAULT_THREAD_COUNT = min(multiprocessing.cpu_count(), 64)
 
@@ -95,7 +95,7 @@ def ranged_float_type(value):
         metavar='<file>',
         type=input_path_type,
         dest='input_path',
-        help='Input FASTA file or directory with FASTA files',
+        help='Input FASTA file or directory of files (gzipped or uncompressed)',
         required=True
     )
     prefilter_required.add_argument(
@@ -119,16 +119,15 @@ def ranged_float_type(value):
         metavar="<int>",
         type=int,
         default=20,
-        help='Filter genome pairs based on minimum number of shared k-mers '
-             '[%(default)s]'
+        help='Minimum number of shared k-mers between two genomes [%(default)s]'
     )
     prefilter_parser.add_argument(
         '--min-ident',
         metavar="<float>",
         type=ranged_float_type,
         default=0.7,
-        help='Filter genome pairs based on minimum sequence identity of '
-        'the shorter sequence (0-1) [%(default)s]'
+        help='Minimum sequence identity (0-1) between two genomes. Calculated '
+        'based on the shorter sequence [%(default)s]'
     )
     prefilter_parser.add_argument(
         '--batch-size',
@@ -144,9 +143,9 @@ def ranged_float_type(value):
         metavar="<float>",
         type=ranged_float_type,
         default=1.0,
-        help='Fraction of k-mers to analyze for each genome (0-1). A lower '
-        'value reduces RAM usage and speeds up processing (affects sensitivity) '
-        '[%(default)s]'
+        help='Fraction of k-mers to analyze in each genome (0-1). A lower '
+        'value reduces RAM usage and speeds up processing. By default, all '
+        'k-mers [%(default)s]'
     )
     prefilter_parser.add_argument(
         '--max-seqs',
@@ -164,22 +163,6 @@ def ranged_float_type(value):
         action="store_true",
         help='Keep temporary Kmer-db files [%(default)s]'
     )
-    prefilter_parser.add_argument(
-        '--bin',
-        metavar='<file>',
-        type=pathlib.Path,
-        dest="bin_kmerdb",
-        default=f'{BIN_KMERDB}',
-        help='Path to the Kmer-db binary [%(default)s]'
-    )
-    prefilter_parser.add_argument(
-        '--bin-fasta',
-        metavar='<file>',
-        type=pathlib.Path,
-        dest="bin_fastasplit",
-        default=f'{BIN_FASTASPLIT}',
-        help='Path to the multi-fasta-split binary [%(default)s]'
-    )
     prefilter_parser.add_argument(
         '-t', '--threads',
         metavar="<int>",
@@ -214,7 +197,7 @@ def ranged_float_type(value):
         metavar='<file>',
         type=input_path_type,
         dest='input_path',
-        help='Input FASTA file or directory with FASTA files',
+        help='Input FASTA file or directory of files (gzipped or uncompressed)',
         required=True
     )
     align_required.add_argument(
@@ -298,14 +281,6 @@ def ranged_float_type(value):
         help='Min. reference coverage (aligned fraction) to output (0-1) '
         '[%(default)s]'
     )
-    align_parser.add_argument(
-        '--bin',
-        metavar='<file>',
-        type=pathlib.Path,
-        dest='bin_lzani',
-        default=f'{BIN_LZANI}',
-        help='Path to the LZ-ANI binary [%(default)s]'
-    )
     align_parser.add_argument(
         '--mal',
         metavar='<int>',
@@ -528,14 +503,6 @@ def ranged_float_type(value):
         default=2,
         help='Number of iterations for the Leiden algorithm [%(default)s]'
     )
-    cluster_parser.add_argument(
-        '--bin',
-        metavar='<file>',
-        type=pathlib.Path,
-        dest="bin_clusty",
-        default=f'{BIN_CLUSTY}',
-        help='Path to the Clusty binary [%(default)s]'
-    )
     cluster_parser.add_argument(
         '-v', '--verbose',
         action="store_true",
@@ -1215,7 +1182,7 @@ def vclust_info() -> None:
         output_lines.append(f'{RED}Status: error{RESET}')
         output_lines.extend(f"   - {name}: {error}" for name, error in errors)
     else:
-        output_lines.append(f'{GREEN}Status: ok{RESET}')
+        output_lines.append(f'{GREEN}Status: ready{RESET}')
 
     # Output the complete information.
     print('\n'.join(output_lines))
@@ -1263,7 +1230,7 @@ def main():
         vclust_info()
     # Prefilter
     elif args.command == 'prefilter':
-        args.bin_kmerdb = validate_binary(args.bin_kmerdb)
+        validate_binary(BIN_KMERDB)
         args = validate_args_prefilter(args, parser)
         args = validate_args_fasta_input(args, parser)
 
@@ -1278,13 +1245,12 @@ def main():
         else:
             # Split multi-fasta file.
             if args.batch_size:
-                args.bin_fastasplit = validate_binary(args.bin_fastasplit)
+                validate_binary(BIN_FASTASPLIT)
                 cmd = cmd_fastasplit(
                     input_fasta=args.input_path, 
                     out_dir=out_dir,
                     n=args.batch_size,
                     verbose=args.verbose,
-                    bin_path=args.bin_fastasplit,
                 )
                 p = run(cmd, args.verbose, logger)
                 for f in out_dir.glob('part_*'):
@@ -1311,7 +1277,6 @@ def main():
                 kmer_size=args.k,
                 kmers_fraction=args.kmers_fraction,
                 num_threads=args.num_threads,
-                bin_path=args.bin_kmerdb,
             )
             p = run(cmd, args.verbose, logger)
             db_paths.append(db_path)
@@ -1333,7 +1298,6 @@ def main():
             min_ident=args.min_ident,
             max_seqs=args.max_seqs,
             num_threads=args.num_threads,
-            bin_path=args.bin_kmerdb,
         )
         p = run(cmd, args.verbose, logger)
 
@@ -1342,7 +1306,6 @@ def main():
             outfile_distance=args.output_path,
             min_ident=args.min_ident,
             num_threads=args.num_threads,
-            bin_path=args.bin_kmerdb,
         )
         p = run(cmd, args.verbose, logger)
 
@@ -1353,7 +1316,7 @@ def main():
 
     # Align
     elif args.command == 'align':
-        args.bin_lzani = validate_binary(args.bin_lzani)
+        validate_binary(BIN_LZANI)
         args = validate_args_fasta_input(args, parser)
 
         out_dir = args.output_path.parent / get_uuid()
@@ -1386,7 +1349,6 @@ def main():
             ar=args.ar,
             num_threads=args.num_threads,
             verbose=args.verbose,
-            bin_path=args.bin_lzani,
         )
         p = run(cmd, args.verbose, logger)
 
@@ -1396,7 +1358,7 @@ def main():
 
     # Cluster
     elif args.command == 'cluster':
-        args.bin_clusty = validate_binary(args.bin_clusty)
+        validate_binary(BIN_CLUSTY)
         args = validate_args_cluster(args, parser)
 
         cmd = cmd_clusty(
@@ -1416,7 +1378,6 @@ def main():
             leiden_resolution=args.leiden_resolution,
             leiden_beta=args.leiden_beta,
             leiden_iterations=args.leiden_iterations,
-            bin_path=args.bin_clusty,
         )
         p = run(cmd, args.verbose, logger)