Several updates

* Remove --bin* options in vclust.py
* Submodules updated to latest revisions.

Co-authored-by: aziele <a.zielezinski@gmail.com>
refresh-bio · Nov 25, 2024 · 0eb1211 · 0eb1211
1 parent c5058fc
commit 0eb1211
Show file tree

Hide file tree

Showing 11 changed files with 36 additions and 67 deletions.
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -52,7 +52,7 @@ jobs:
 
     steps:
     - name: make
-      run: make -j32 CXX=g++-${{matrix.compiler}} CC=gcc-${{matrix.compiler}} PLATFORM=${{ matrix.platform }} LEIDEN=true
+      run: gmake -j32 CXX=g++-${{matrix.compiler}} CC=gcc-${{matrix.compiler}} PLATFORM=${{ matrix.platform }} LEIDEN=true STATIC_LINK=true
     - name: tar artifacts
       run: | 
         mkdir ${DIR}

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -15,18 +15,24 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        machine: [ubuntu-latest, macOS-12]
+        machine: [ubuntu-latest]
+        gmake_install_command: ['gmake --version']
         compiler: [12]
+        include:
+        - {machine: macOS-13, gmake_install_command: 'brew install make && gmake --version', compiler: 12}
     runs-on: ['${{ matrix.machine }}']   
 
     steps:
     - uses: actions/checkout@v4
       with:
         submodules: recursive
+
+    - name: install gmake
+      run: ${{ matrix.gmake_install_command }}        
 
     - name: make
       run: | 
-        make -j32 CXX=g++-${{matrix.compiler}} CC=gcc-${{matrix.compiler}}
+        gmake -j CXX=g++-${{matrix.compiler}} CC=gcc-${{matrix.compiler}} STATIC_LINK=true
     - name: tar artifacts
       run: tar -cvf vclust.tar ./vclust.py ./test.py ./example ./bin/kmer-db ./bin/lz-ani ./bin/clusty ./bin/multi-fasta-split
 
@@ -42,7 +48,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        machine: [ubuntu-latest, macOS-12]
+        machine: [ubuntu-latest, macOS-13]
 
     runs-on: ['${{ matrix.machine }}']    
 

diff --git a/.github/workflows/self-hosted.yml b/.github/workflows/self-hosted.yml
@@ -50,7 +50,7 @@ jobs:
 
     steps:
     - name: make
-      run: make -j32 CXX=g++-${{matrix.compiler}} CC=gcc-${{matrix.compiler}} PLATFORM=${{ matrix.platform }} LEIDEN=true
+      run: gmake -j32 CXX=g++-${{matrix.compiler}} CC=gcc-${{matrix.compiler}} PLATFORM=${{ matrix.platform }} LEIDEN=true STATIC_LINK=true
     - name: print info
       run: python3 vclust.py info 
 

diff --git a/3rd_party/clusty b/3rd_party/clusty
diff --git a/3rd_party/kmer-db b/3rd_party/kmer-db
diff --git a/3rd_party/lz-ani b/3rd_party/lz-ani
diff --git a/3rd_party/ref-utils b/3rd_party/ref-utils
diff --git a/README.md b/README.md
@@ -5,6 +5,8 @@
 [![Build and tests](../../workflows/Build%20and%20tests/badge.svg)](../../actions/workflows/main.yml)
 [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
 
+
+[![PyPI - Total Downloads](https://static.pepy.tech/personalized-badge/vclust?period=total&units=abbreviation&left_color=grey&right_color=green&left_text=PyPI%20total%20downloads)](https://www.pepy.tech/projects/vclust)
 [![PyPI - Downloads](https://img.shields.io/pypi/dm/vclust?label=PyPI%20downloads)](https://pypi.org/project/vclust/)
 [![GitHub downloads](https://img.shields.io/github/downloads/refresh-bio/vclust/total.svg?style=flag&label=GitHub%20downloads)](https://github.com/refresh-bio/vclust/releases)
 [![Bioconda downloads](https://img.shields.io/conda/dn/bioconda/vclust.svg?style=flag&label=Bioconda%20downloads)](https://anaconda.org/bioconda/vclust)

diff --git a/makefile b/makefile
@@ -14,10 +14,10 @@ prep:
 	cd 3rd_party/clusty && $(MAKE) -j 
 	cd 3rd_party/ref-utils && $(MAKE) -j 
 	mkdir -p bin
-	cp 3rd_party/kmer-db/kmer-db ./bin/
-	cp 3rd_party/lz-ani/lz-ani ./bin/
-	cp 3rd_party/clusty/clusty ./bin/
-	cp 3rd_party/ref-utils/multi-fasta-split/multi-fasta-split ./bin/
+	cp 3rd_party/kmer-db/bin/kmer-db ./bin/
+	cp 3rd_party/lz-ani/bin/lz-ani ./bin/
+	cp 3rd_party/clusty/bin/clusty ./bin/
+	cp 3rd_party/ref-utils/bin/multi-fasta-split ./bin/
 
 clean:
 	cd 3rd_party/kmer-db && $(MAKE) clean

diff --git a/pyproject.toml b/pyproject.toml
@@ -9,7 +9,7 @@ py-modules = ["vclust"]
 where = ["./"]
 
 [project]
-name = "vclust-test"
+name = "vclust"
 description = """Fast and accurate tool for calculating \
 Average Nucleotide Identity (ANI) and clustering virus \
 genomes and metagenomic contigs"""

diff --git a/vclust.py b/vclust.py
@@ -16,7 +16,7 @@
 import typing
 import uuid
 
-__version__ = '1.2.8'
+__version__ = '1.2.9'
 
 DEFAULT_THREAD_COUNT = min(multiprocessing.cpu_count(), 64)
 
@@ -95,7 +95,7 @@ def ranged_float_type(value):
         metavar='<file>',
         type=input_path_type,
         dest='input_path',
-        help='Input FASTA file or directory with FASTA files',
+        help='Input FASTA file or directory of files (gzipped or uncompressed)',
         required=True
     )
     prefilter_required.add_argument(
@@ -119,16 +119,15 @@ def ranged_float_type(value):
         metavar="<int>",
         type=int,
         default=20,
-        help='Filter genome pairs based on minimum number of shared k-mers '
-             '[%(default)s]'
+        help='Minimum number of shared k-mers between two genomes [%(default)s]'
     )
     prefilter_parser.add_argument(
         '--min-ident',
         metavar="<float>",
         type=ranged_float_type,
         default=0.7,
-        help='Filter genome pairs based on minimum sequence identity of '
-        'the shorter sequence (0-1) [%(default)s]'
+        help='Minimum sequence identity (0-1) between two genomes. Calculated '
+        'based on the shorter sequence [%(default)s]'
     )
     prefilter_parser.add_argument(
         '--batch-size',
@@ -144,9 +143,9 @@ def ranged_float_type(value):
         metavar="<float>",
         type=ranged_float_type,
         default=1.0,
-        help='Fraction of k-mers to analyze for each genome (0-1). A lower '
-        'value reduces RAM usage and speeds up processing (affects sensitivity) '
-        '[%(default)s]'
+        help='Fraction of k-mers to analyze in each genome (0-1). A lower '
+        'value reduces RAM usage and speeds up processing. By default, all '
+        'k-mers [%(default)s]'
     )
     prefilter_parser.add_argument(
         '--max-seqs',
@@ -164,22 +163,6 @@ def ranged_float_type(value):
         action="store_true",
         help='Keep temporary Kmer-db files [%(default)s]'
     )
-    prefilter_parser.add_argument(
-        '--bin',
-        metavar='<file>',
-        type=pathlib.Path,
-        dest="bin_kmerdb",
-        default=f'{BIN_KMERDB}',
-        help='Path to the Kmer-db binary [%(default)s]'
-    )
-    prefilter_parser.add_argument(
-        '--bin-fasta',
-        metavar='<file>',
-        type=pathlib.Path,
-        dest="bin_fastasplit",
-        default=f'{BIN_FASTASPLIT}',
-        help='Path to the multi-fasta-split binary [%(default)s]'
-    )
     prefilter_parser.add_argument(
         '-t', '--threads',
         metavar="<int>",
@@ -214,7 +197,7 @@ def ranged_float_type(value):
         metavar='<file>',
         type=input_path_type,
         dest='input_path',
-        help='Input FASTA file or directory with FASTA files',
+        help='Input FASTA file or directory of files (gzipped or uncompressed)',
         required=True
     )
     align_required.add_argument(
@@ -298,14 +281,6 @@ def ranged_float_type(value):
         help='Min. reference coverage (aligned fraction) to output (0-1) '
         '[%(default)s]'
     )
-    align_parser.add_argument(
-        '--bin',
-        metavar='<file>',
-        type=pathlib.Path,
-        dest='bin_lzani',
-        default=f'{BIN_LZANI}',
-        help='Path to the LZ-ANI binary [%(default)s]'
-    )
     align_parser.add_argument(
         '--mal',
         metavar='<int>',
@@ -528,14 +503,6 @@ def ranged_float_type(value):
         default=2,
         help='Number of iterations for the Leiden algorithm [%(default)s]'
     )
-    cluster_parser.add_argument(
-        '--bin',
-        metavar='<file>',
-        type=pathlib.Path,
-        dest="bin_clusty",
-        default=f'{BIN_CLUSTY}',
-        help='Path to the Clusty binary [%(default)s]'
-    )
     cluster_parser.add_argument(
         '-v', '--verbose',
         action="store_true",
@@ -1215,7 +1182,7 @@ def vclust_info() -> None:
         output_lines.append(f'{RED}Status: error{RESET}')
         output_lines.extend(f"   - {name}: {error}" for name, error in errors)
     else:
-        output_lines.append(f'{GREEN}Status: ok{RESET}')
+        output_lines.append(f'{GREEN}Status: ready{RESET}')
 
     # Output the complete information.
     print('\n'.join(output_lines))
@@ -1263,7 +1230,7 @@ def main():
         vclust_info()
     # Prefilter
     elif args.command == 'prefilter':
-        args.bin_kmerdb = validate_binary(args.bin_kmerdb)
+        validate_binary(BIN_KMERDB)
         args = validate_args_prefilter(args, parser)
         args = validate_args_fasta_input(args, parser)
 
@@ -1278,13 +1245,12 @@ def main():
         else:
             # Split multi-fasta file.
             if args.batch_size:
-                args.bin_fastasplit = validate_binary(args.bin_fastasplit)
+                validate_binary(BIN_FASTASPLIT)
                 cmd = cmd_fastasplit(
                     input_fasta=args.input_path, 
                     out_dir=out_dir,
                     n=args.batch_size,
                     verbose=args.verbose,
-                    bin_path=args.bin_fastasplit,
                 )
                 p = run(cmd, args.verbose, logger)
                 for f in out_dir.glob('part_*'):
@@ -1311,7 +1277,6 @@ def main():
                 kmer_size=args.k,
                 kmers_fraction=args.kmers_fraction,
                 num_threads=args.num_threads,
-                bin_path=args.bin_kmerdb,
             )
             p = run(cmd, args.verbose, logger)
             db_paths.append(db_path)
@@ -1333,7 +1298,6 @@ def main():
             min_ident=args.min_ident,
             max_seqs=args.max_seqs,
             num_threads=args.num_threads,
-            bin_path=args.bin_kmerdb,
         )
         p = run(cmd, args.verbose, logger)
 
@@ -1342,7 +1306,6 @@ def main():
             outfile_distance=args.output_path,
             min_ident=args.min_ident,
             num_threads=args.num_threads,
-            bin_path=args.bin_kmerdb,
         )
         p = run(cmd, args.verbose, logger)
 
@@ -1353,7 +1316,7 @@ def main():
 
     # Align
     elif args.command == 'align':
-        args.bin_lzani = validate_binary(args.bin_lzani)
+        validate_binary(BIN_LZANI)
         args = validate_args_fasta_input(args, parser)
 
         out_dir = args.output_path.parent / get_uuid()
@@ -1386,7 +1349,6 @@ def main():
             ar=args.ar,
             num_threads=args.num_threads,
             verbose=args.verbose,
-            bin_path=args.bin_lzani,
         )
         p = run(cmd, args.verbose, logger)
 
@@ -1396,7 +1358,7 @@ def main():
 
     # Cluster
     elif args.command == 'cluster':
-        args.bin_clusty = validate_binary(args.bin_clusty)
+        validate_binary(BIN_CLUSTY)
         args = validate_args_cluster(args, parser)
 
         cmd = cmd_clusty(
@@ -1416,7 +1378,6 @@ def main():
             leiden_resolution=args.leiden_resolution,
             leiden_beta=args.leiden_beta,
             leiden_iterations=args.leiden_iterations,
-            bin_path=args.bin_clusty,
         )
         p = run(cmd, args.verbose, logger)
+4 −4		.github/workflows/deploy.yml
+12 −5		.github/workflows/main.yml
+3 −3		.github/workflows/self-hosted.yml
+2 −0		.gitignore
+12 −18		README.md
+45 −106		makefile
+0 −67		makefile_no_mimalloc
+739 −0		refresh.mk
+8 −2		src/version.h
+4 −4		.github/workflows/deploy.yml
+11 −5		.github/workflows/main.yml
+20 −25		.github/workflows/self-hosted.yml
+2 −0		.gitignore
+13 −13		README.md
+125 −0		libs/refresh/logs/lib/progress.h
+71 −164		makefile
+10 −13		quick-start.sh
+739 −0		refresh.mk
+31 −2		src/array.h
+153 −0		src/bubble_helper.h
+8 −8		src/console_all2all.cpp
+38 −27		src/console_all2all_parts.cpp
+23 −11		src/console_all2all_sparse.cpp
+18 −18		src/console_build.cpp
+12 −12		src/console_db2db.cpp
+7 −7		src/console_distance.cpp
+4 −5		src/console_minhash.cpp
+13 −13		src/console_new2all.cpp
+10 −10		src/console_one2all.cpp
+10 −0		src/hashmap_lp.h
+2 −2		src/kmer_db.h
+5 −6		src/kmer_db.vcxproj
+5 −4		src/kmer_db.vcxproj.filters
+19 −16		src/loader_ex.cpp
+2 −2		src/loader_ex.h
+8 −3		src/log.h
+4 −4		src/main.cpp
+32 −30		src/params.cpp
+3 −4		src/params.h
+22 −21		src/prefix_kmer_db.cpp
+1 −0		src/prefix_kmer_db.h
+5 −0		src/sampler.h
+0 −0		src/simd/row_add.h
+0 −0		src/simd/row_add_avx.cpp
+0 −0		src/simd/row_add_avx2.cpp
+0 −0		src/simd/row_add_neon.cpp
+61 −68		src/similarity_calculator.cpp
+2 −2		src/similarity_calculator.h
+8 −2		src/version.h
+5 −4		.github/workflows/deploy.yml
+11 −6		.github/workflows/main.yml
+23 −23		.github/workflows/self-hosted.yml
+1 −1		README.md
+28 −27		libs/refresh/compression/lib/file_wrapper.h
+154 −0		libs/refresh/logs/lib/progress.h
+39 −4		libs/refresh/parallel_queues/lib/parallel-queues.h
+49 −117		makefile
+739 −0		refresh.mk
+1 −0		src/ani-entropy.vcxproj
+3 −0		src/ani-entropy.vcxproj.filters
+3 −3		src/defs.h
+76 −13		src/filter.cpp
+6 −0		src/filter.h
+43 −9		src/lz_matcher.cpp
+14 −7		src/seq_reservoir.cpp