diff --git a/CHANGES.md b/CHANGES.md index fbd0524..5724687 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,4 +1,5 @@ - v0.4.4 + - `unikmer`: add global option `-L/--compression-level`. - `unikmer diff`: reduce memory occupation, speed not affected. - v0.4.3 - `unikmer diff`: fix bug of hanging when the first file having no Kmers. diff --git a/unikmer/cmd/concat.go b/unikmer/cmd/concat.go index cf86bbc..6f3edf1 100644 --- a/unikmer/cmd/concat.go +++ b/unikmer/cmd/concat.go @@ -63,7 +63,7 @@ Attentions: if !isStdout(outFile) { outFile += extDataFile } - outfh, gw, w, err := outStream(outFile, opt.Compress) + outfh, gw, w, err := outStream(outFile, opt.Compress, opt.CompressionLevel) checkError(err) defer func() { outfh.Flush() diff --git a/unikmer/cmd/count.go b/unikmer/cmd/count.go index 69252c3..864cd70 100644 --- a/unikmer/cmd/count.go +++ b/unikmer/cmd/count.go @@ -68,7 +68,7 @@ var countCmd = &cobra.Command{ if !isStdout(outFile) { outFile += extDataFile } - outfh, gw, w, err := outStream(outFile, opt.Compress) + outfh, gw, w, err := outStream(outFile, opt.Compress, opt.CompressionLevel) checkError(err) defer func() { outfh.Flush() diff --git a/unikmer/cmd/diff.go b/unikmer/cmd/diff.go index e0e82bf..6d1252d 100644 --- a/unikmer/cmd/diff.go +++ b/unikmer/cmd/diff.go @@ -103,7 +103,7 @@ Tips: if !isStdout(outFile) { outFile += extDataFile } - outfh, gw, w, err := outStream(outFile, opt.Compress) + outfh, gw, w, err := outStream(outFile, opt.Compress, opt.CompressionLevel) checkError(err) defer func() { outfh.Flush() @@ -230,7 +230,7 @@ Tips: if !isStdout(outFile) { outFile += extDataFile } - outfh, gw, w, err := outStream(outFile, opt.Compress) + outfh, gw, w, err := outStream(outFile, opt.Compress, opt.CompressionLevel) checkError(err) defer func() { outfh.Flush() @@ -477,7 +477,7 @@ Tips: if !isStdout(outFile) { outFile += extDataFile } - outfh, gw, w, err := outStream(outFile, opt.Compress) + outfh, gw, w, err := outStream(outFile, opt.Compress, opt.CompressionLevel) checkError(err) defer func() { outfh.Flush() diff --git a/unikmer/cmd/dump.go b/unikmer/cmd/dump.go index ab70790..c635e02 100644 --- a/unikmer/cmd/dump.go +++ b/unikmer/cmd/dump.go @@ -57,7 +57,7 @@ var dumpCmd = &cobra.Command{ if !isStdout(outFile) { outFile += extDataFile } - outfh, gw, w, err := outStream(outFile, opt.Compress) + outfh, gw, w, err := outStream(outFile, opt.Compress, opt.CompressionLevel) checkError(err) defer func() { outfh.Flush() diff --git a/unikmer/cmd/grep.go b/unikmer/cmd/grep.go index 224a6cd..159c333 100644 --- a/unikmer/cmd/grep.go +++ b/unikmer/cmd/grep.go @@ -133,7 +133,7 @@ var grepCmd = &cobra.Command{ log.Infof("finish reading Kmers from %s", file) } - outfh, gw, w, err := outStream(outFile, strings.HasSuffix(strings.ToLower(outFile), ".gz")) + outfh, gw, w, err := outStream(outFile, strings.HasSuffix(strings.ToLower(outFile), ".gz"), opt.CompressionLevel) checkError(err) defer func() { outfh.Flush() diff --git a/unikmer/cmd/inter.go b/unikmer/cmd/inter.go index 907f1b6..3b99f3b 100644 --- a/unikmer/cmd/inter.go +++ b/unikmer/cmd/inter.go @@ -96,7 +96,7 @@ var interCmd = &cobra.Command{ if !isStdout(outFile) { outFile += extDataFile } - outfh, gw, w, err := outStream(outFile, opt.Compress) + outfh, gw, w, err := outStream(outFile, opt.Compress, opt.CompressionLevel) checkError(err) defer func() { outfh.Flush() @@ -260,7 +260,7 @@ var interCmd = &cobra.Command{ if !isStdout(outFile) { outFile += extDataFile } - outfh, gw, w, err := outStream(outFile, opt.Compress) + outfh, gw, w, err := outStream(outFile, opt.Compress, opt.CompressionLevel) checkError(err) defer func() { outfh.Flush() diff --git a/unikmer/cmd/num.go b/unikmer/cmd/num.go index a8a75ed..830a6db 100644 --- a/unikmer/cmd/num.go +++ b/unikmer/cmd/num.go @@ -58,7 +58,7 @@ var numCmd = &cobra.Command{ outFile := getFlagString(cmd, "out-file") showFile := getFlagBool(cmd, "file-name") - outfh, gw, w, err := outStream(outFile, strings.HasSuffix(strings.ToLower(outFile), ".gz")) + outfh, gw, w, err := outStream(outFile, strings.HasSuffix(strings.ToLower(outFile), ".gz"), opt.CompressionLevel) checkError(err) defer func() { outfh.Flush() diff --git a/unikmer/cmd/root.go b/unikmer/cmd/root.go index fac98af..a665d5e 100644 --- a/unikmer/cmd/root.go +++ b/unikmer/cmd/root.go @@ -25,6 +25,7 @@ import ( "os" "runtime" + "github.com/klauspost/compress/flate" "github.com/spf13/cobra" ) @@ -70,6 +71,7 @@ func init() { RootCmd.PersistentFlags().IntP("threads", "j", defaultThreads, "number of CPUs to use. (default value: 1 for single-CPU PC, 2 for others)") RootCmd.PersistentFlags().BoolP("verbose", "", false, "print verbose information") RootCmd.PersistentFlags().BoolP("no-compress", "C", false, "do not compress binary file (not recommended)") + RootCmd.PersistentFlags().IntP("compression-level", "L", flate.DefaultCompression, "compression level") RootCmd.PersistentFlags().BoolP("compact", "c", false, "write more compact binary file with little loss of speed") RootCmd.PersistentFlags().StringP("infile-list", "i", "", "file of input files list (one file per line), if given, files from cli arguments are ignored") } diff --git a/unikmer/cmd/sample.go b/unikmer/cmd/sample.go index 8342029..4573e3d 100644 --- a/unikmer/cmd/sample.go +++ b/unikmer/cmd/sample.go @@ -72,7 +72,7 @@ Attentions: if !isStdout(outFile) { outFile += extDataFile } - outfh, gw, w, err := outStream(outFile, opt.Compress) + outfh, gw, w, err := outStream(outFile, opt.Compress, opt.CompressionLevel) checkError(err) defer func() { outfh.Flush() diff --git a/unikmer/cmd/sort.go b/unikmer/cmd/sort.go index c640c0e..1b30ebd 100644 --- a/unikmer/cmd/sort.go +++ b/unikmer/cmd/sort.go @@ -63,7 +63,7 @@ var sortCmd = &cobra.Command{ if !isStdout(outFile) { outFile += extDataFile } - outfh, gw, w, err := outStream(outFile, opt.Compress) + outfh, gw, w, err := outStream(outFile, opt.Compress, opt.CompressionLevel) checkError(err) defer func() { outfh.Flush() diff --git a/unikmer/cmd/stats.go b/unikmer/cmd/stats.go index e7b1924..c5351e6 100644 --- a/unikmer/cmd/stats.go +++ b/unikmer/cmd/stats.go @@ -75,7 +75,7 @@ Tips: checkError(fmt.Errorf("values of -/--symbol-true and -F/--symbol--false should no be the same")) } - outfh, gw, w, err := outStream(outFile, strings.HasSuffix(strings.ToLower(outFile), ".gz")) + outfh, gw, w, err := outStream(outFile, strings.HasSuffix(strings.ToLower(outFile), ".gz"), opt.CompressionLevel) checkError(err) defer func() { outfh.Flush() diff --git a/unikmer/cmd/subset.go b/unikmer/cmd/subset.go index 6490e84..c6b998a 100644 --- a/unikmer/cmd/subset.go +++ b/unikmer/cmd/subset.go @@ -93,7 +93,7 @@ Attention: if !isStdout(outFile) { outFile += extDataFile } - outfh, gw, w, err := outStream(outFile, opt.Compress) + outfh, gw, w, err := outStream(outFile, opt.Compress, opt.CompressionLevel) checkError(err) defer func() { outfh.Flush() diff --git a/unikmer/cmd/union.go b/unikmer/cmd/union.go index 7051d5b..bdd6829 100644 --- a/unikmer/cmd/union.go +++ b/unikmer/cmd/union.go @@ -68,7 +68,7 @@ Attentions: if !isStdout(outFile) { outFile += extDataFile } - outfh, gw, w, err := outStream(outFile, opt.Compress) + outfh, gw, w, err := outStream(outFile, opt.Compress, opt.CompressionLevel) checkError(err) defer func() { outfh.Flush() diff --git a/unikmer/cmd/util-io.go b/unikmer/cmd/util-io.go index 1b92784..85fd71c 100644 --- a/unikmer/cmd/util-io.go +++ b/unikmer/cmd/util-io.go @@ -31,7 +31,7 @@ import ( gzip "github.com/klauspost/pgzip" ) -func outStream(file string, gzipped bool) (*bufio.Writer, io.WriteCloser, *os.File, error) { +func outStream(file string, gzipped bool, level int) (*bufio.Writer, io.WriteCloser, *os.File, error) { var w *os.File if file == "-" { w = os.Stdout @@ -52,7 +52,11 @@ func outStream(file string, gzipped bool) (*bufio.Writer, io.WriteCloser, *os.Fi } if gzipped { - gw := gzip.NewWriter(w) + // gw := gzip.NewWriter(w) + gw, err := gzip.NewWriterLevel(w, level) + if err != nil { + return nil, nil, nil, fmt.Errorf("fail to write %s: %s", file, err) + } return bufio.NewWriterSize(gw, os.Getpagesize()), gw, w, nil } return bufio.NewWriterSize(w, os.Getpagesize()), nil, w, nil diff --git a/unikmer/cmd/util.go b/unikmer/cmd/util.go index 8973c2e..6a37411 100644 --- a/unikmer/cmd/util.go +++ b/unikmer/cmd/util.go @@ -21,6 +21,7 @@ package cmd import ( + "compress/flate" "fmt" "io" "sort" @@ -41,19 +42,25 @@ const ( // Options contains the global flags type Options struct { - NumCPUs int - Verbose bool - Compress bool - Compact bool + NumCPUs int + Verbose bool + Compress bool + Compact bool + CompressionLevel int } func getOptions(cmd *cobra.Command) *Options { + level := getFlagInt(cmd, "compression-level") + if level < flate.HuffmanOnly || level > flate.BestCompression { + checkError(fmt.Errorf("gzip: invalid compression level: %d", level)) + } return &Options{ NumCPUs: getFlagPositiveInt(cmd, "threads"), // NumCPUs: 1, - Verbose: getFlagBool(cmd, "verbose"), - Compress: !getFlagBool(cmd, "no-compress"), - Compact: getFlagBool(cmd, "compact"), + Verbose: getFlagBool(cmd, "verbose"), + Compress: !getFlagBool(cmd, "no-compress"), + Compact: getFlagBool(cmd, "compact"), + CompressionLevel: level, } } @@ -168,7 +175,7 @@ func sortUnikFile(opt Options, unique bool, file string, outFile string) (*unikm if !isStdout(outFile) { outFile += extDataFile } - outfh, gw, w, err := outStream(outFile, opt.Compress) + outfh, gw, w, err := outStream(outFile, opt.Compress, opt.CompressionLevel) if err != nil { return nil, 0, err } diff --git a/unikmer/cmd/view.go b/unikmer/cmd/view.go index 2069203..1406f63 100644 --- a/unikmer/cmd/view.go +++ b/unikmer/cmd/view.go @@ -61,7 +61,7 @@ var viewCmd = &cobra.Command{ outFasta := getFlagBool(cmd, "fasta") outFastq := getFlagBool(cmd, "fastq") - outfh, gw, w, err := outStream(outFile, strings.HasSuffix(strings.ToLower(outFile), ".gz")) + outfh, gw, w, err := outStream(outFile, strings.HasSuffix(strings.ToLower(outFile), ".gz"), opt.CompressionLevel) checkError(err) defer func() { outfh.Flush()