From 26a30c6d070d821893daa31f87d280233330cd02 Mon Sep 17 00:00:00 2001 From: Wei Shen Date: Sun, 23 Sep 2018 00:19:25 +0800 Subject: [PATCH] unikmer count: new option --canonical --- CHANGES.md | 11 +++++- README.md | 25 ++++++++---- kmer.go | 86 +++++++++++++++++++++++++++++++----------- kmer_test.go | 56 ++++++++++++++++++++++----- unikmer/cmd/count.go | 23 ++++++++--- unikmer/cmd/version.go | 2 +- 6 files changed, 157 insertions(+), 46 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 8d1c8b6..61f87f0 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1 +1,10 @@ - +- v0.2.1 + - `unikmer count`: performance improvement and new option `--canonical` for only keeping canonical Kmers. +- v0.2 + - new command `unikmer sample`: sample Kmers from binary files. + - new global options: + - `-c, --compact`: write more compact binary file with little loss of speed. + - `-C, --no-compress`: do not compress binary file (not recommended). + - some improvements. +- v0.1.0 + - first release diff --git a/README.md b/README.md index fadc50f..7141f08 100644 --- a/README.md +++ b/README.md @@ -45,13 +45,13 @@ Kmer frequencies) and provides serialization methods. goos: linux goarch: amd64 pkg: github.com/shenwei356/unikmer - BenchmarkEncodeK32-16 20000000 51.5 ns/op 0 B/op 0 allocs/op - BenchmarkEncodeFromPreviousKmerK32-16 200000000 9.28 ns/op 0 B/op 0 allocs/op - BenchmarkMustEncodeFromPreviousKmerK32-16 1000000000 1.81 ns/op 0 B/op 0 allocs/op - BenchmarkDecodeK32-16 20000000 82.5 ns/op 32 B/op 1 allocs/op - BenchmarkRevK32-16 50000000 26.1 ns/op 0 B/op 0 allocs/op - BenchmarkCompK32-16 50000000 28.4 ns/op 0 B/op 0 allocs/op - BenchmarkRevCompK32-16 50000000 22.4 ns/op 0 B/op 0 allocs/op + BenchmarkEncodeK32-16 20000000 50.1 ns/op 0 B/op 0 allocs/op + BenchmarkEncodeFromFormerKmerK32-16 200000000 9.30 ns/op 0 B/op 0 allocs/op + BenchmarkMustEncodeFromFormerKmerK32-16 2000000000 1.95 ns/op 0 B/op 0 allocs/op + BenchmarkDecodeK32-16 20000000 79.5 ns/op 32 B/op 1 allocs/op + BenchmarkRevK32-16 50000000 20.7 ns/op 0 B/op 0 allocs/op + BenchmarkCompK32-16 50000000 32.7 ns/op 0 B/op 0 allocs/op + BenchmarkRevCompK32-16 100000000 22.4 ns/op 0 B/op 0 allocs/op ## The toolkit @@ -139,6 +139,17 @@ label |encoded-kmera|gzip-compressedb|compact-fo -rw-rw-r--. 1 shenwei shenwei 1.4M Aug 9 23:19 Ecoli-MG1655.fasta.gz -rw-rw-r--. 1 shenwei shenwei 23M Aug 9 23:29 Ecoli-MG1655.fasta.gz.k31.unik + # counting (only keep the canonical kmers) + $ memusg -t unikmer count -k 31 Ecoli-MG1655.fasta.gz -o Ecoli-MG1655.fasta.gz.k31 --canonical + + elapsed time: 3.428s + peak rss: 236.14 MB + + $ ls -lh Ecoli-MG1655.fasta.gz* + -rw-rw-r--. 1 shenwei shenwei 1.4M Aug 9 23:19 Ecoli-MG1655.fasta.gz + -rw-rw-r--. 1 shenwei shenwei 19M Aug 9 23:29 Ecoli-MG1655.fasta.gz.k31.unik + + # view $ unikmer view Ecoli-MG1655.fasta.gz.k31.unik | head -n 3 diff --git a/kmer.go b/kmer.go index f207c58..4f7aabf 100644 --- a/kmer.go +++ b/kmer.go @@ -80,37 +80,70 @@ func Encode(kmer []byte) (code uint64, err error) { // ErrNotConsecutiveKmers means the two Kmers are not consecutive var ErrNotConsecutiveKmers = errors.New("unikmer: not consecutive Kmers") -// MustEncodeFromPrevious encodes from previous kmer, -// assuming the kmer and preKmer are both OK. -func MustEncodeFromPreviousKmer(kmer []byte, preKmer []byte, preCode uint64) (uint64, error) { - preCode = preCode & ((1 << (uint(len(kmer)-1) << 1)) - 1) << 2 +// MustEncodeFromFormerKmer encodes from former kmer, +// assuming the kmer and leftKmer are both OK. +func MustEncodeFromFormerKmer(kmer []byte, leftKmer []byte, leftCode uint64) (uint64, error) { + leftCode = leftCode & ((1 << (uint(len(kmer)-1) << 1)) - 1) << 2 switch kmer[len(kmer)-1] { case 'G', 'g', 'K', 'k': - preCode += 2 + leftCode |= 2 case 'T', 't', 'U', 'u': - preCode += 3 + leftCode |= 3 case 'C', 'c', 'S', 's', 'B', 'b', 'Y', 'y': - preCode += 1 + leftCode |= 1 case 'A', 'a', 'N', 'n', 'M', 'm', 'V', 'v', 'H', 'h', 'R', 'r', 'D', 'd', 'W', 'w': - preCode += 0 + // leftCode |= 0 default: - return preCode, ErrIllegalBase + return leftCode, ErrIllegalBase } - return preCode, nil + return leftCode, nil } -// EncodeFromPrevious encodes from previous kmer, inspired by ntHash -func EncodeFromPreviousKmer(kmer []byte, preKmer []byte, preCode uint64) (uint64, error) { +// EncodeFromFormerKmer encodes from former kmer, inspired by ntHash +func EncodeFromFormerKmer(kmer []byte, leftKmer []byte, leftCode uint64) (uint64, error) { if len(kmer) == 0 { return 0, ErrKOverflow } - if len(kmer) != len(preKmer) { + if len(kmer) != len(leftKmer) { return 0, ErrKMismatch } - if !bytes.Equal(kmer[0:len(kmer)-1], preKmer[1:len(preKmer)]) { + if !bytes.Equal(kmer[0:len(kmer)-1], leftKmer[1:len(leftKmer)]) { return 0, ErrNotConsecutiveKmers } - return MustEncodeFromPreviousKmer(kmer, preKmer, preCode) + return MustEncodeFromFormerKmer(kmer, leftKmer, leftCode) +} + +// MustEncodeFromLatterKmer encodes from latter kmer, +// assuming the kmer and rightKmer are both OK. +func MustEncodeFromLatterKmer(kmer []byte, rightKmer []byte, rightCode uint64) (uint64, error) { + rightCode >>= 2 + switch kmer[0] { + case 'G', 'g', 'K', 'k': + rightCode |= 2 << (uint(len(kmer)-1) << 1) + case 'T', 't', 'U', 'u': + rightCode |= 3 << (uint(len(kmer)-1) << 1) + case 'C', 'c', 'S', 's', 'B', 'b', 'Y', 'y': + rightCode |= 1 << (uint(len(kmer)-1) << 1) + case 'A', 'a', 'N', 'n', 'M', 'm', 'V', 'v', 'H', 'h', 'R', 'r', 'D', 'd', 'W', 'w': + // rightCode |= 0 << (uint(len(kmer)-1) << 1) + default: + return rightCode, ErrIllegalBase + } + return rightCode, nil +} + +// EncodeFromLatterKmer encodes from former kmer. +func EncodeFromLatterKmer(kmer []byte, rightKmer []byte, rightCode uint64) (uint64, error) { + if len(kmer) == 0 { + return 0, ErrKOverflow + } + if len(kmer) != len(rightKmer) { + return 0, ErrKMismatch + } + if !bytes.Equal(rightKmer[0:len(kmer)-1], kmer[1:len(rightKmer)]) { + return 0, ErrNotConsecutiveKmers + } + return MustEncodeFromLatterKmer(kmer, rightKmer, rightCode) } // Reverse returns code of the reversed sequence. @@ -182,19 +215,19 @@ func NewKmerCode(kmer []byte) (KmerCode, error) { return KmerCode{code, len(kmer)}, err } -// NewKmerCodeFromPreviousOne computes KmerCode from the previous consecutive kmer. -func NewKmerCodeFromPreviousOne(kmer []byte, preKmer []byte, preKcode KmerCode) (KmerCode, error) { - code, err := EncodeFromPreviousKmer(kmer, preKmer, preKcode.Code) +// NewKmerCodeFromFormerOne computes KmerCode from the Former consecutive kmer. +func NewKmerCodeFromFormerOne(kmer []byte, leftKmer []byte, preKcode KmerCode) (KmerCode, error) { + code, err := EncodeFromFormerKmer(kmer, leftKmer, preKcode.Code) if err != nil { return KmerCode{}, err } return KmerCode{code, len(kmer)}, err } -// NewKmerCodeMustFromPreviousOne computes KmerCode from the previous consecutive kmer, -// assuming the kmer and preKmer are both OK. -func NewKmerCodeMustFromPreviousOne(kmer []byte, preKmer []byte, preKcode KmerCode) (KmerCode, error) { - code, err := MustEncodeFromPreviousKmer(kmer, preKmer, preKcode.Code) +// NewKmerCodeMustFromFormerOne computes KmerCode from the Former consecutive kmer, +// assuming the kmer and leftKmer are both OK. +func NewKmerCodeMustFromFormerOne(kmer []byte, leftKmer []byte, preKcode KmerCode) (KmerCode, error) { + code, err := MustEncodeFromFormerKmer(kmer, leftKmer, preKcode.Code) if err != nil { return KmerCode{}, err } @@ -221,6 +254,15 @@ func (kcode KmerCode) RevComp() KmerCode { return KmerCode{RevComp(kcode.Code, kcode.K), kcode.K} } +// Canonical returns its canonical kmer +func (kcode KmerCode) Canonical() KmerCode { + rcKcode := kcode.RevComp() + if rcKcode.Code < kcode.Code { + return rcKcode + } + return kcode +} + // Bytes returns kmer in []byte. func (kcode KmerCode) Bytes() []byte { return Decode(kcode.Code, kcode.K) diff --git a/kmer_test.go b/kmer_test.go index a6b3fdb..0d071c2 100644 --- a/kmer_test.go +++ b/kmer_test.go @@ -75,8 +75,8 @@ func TestEncodeDecode(t *testing.T) { } } -// TestEncodeFromPreviousKmer tests TestEncodeFromPreviousKmer -func TestEncodeFromPreviousKmer(t *testing.T) { +// TestEncodeFromFormerKmer tests TestEncodeFromFormerKmer +func TestEncodeFromFormerKmer(t *testing.T) { var err error k := 5 first := true @@ -95,7 +95,7 @@ func TestEncodeFromPreviousKmer(t *testing.T) { continue } pKmer = benchMer[i-1 : i+k-1] - code, err = EncodeFromPreviousKmer(kmer, pKmer, pCode) + code, err = EncodeFromFormerKmer(kmer, pKmer, pCode) if err != nil { t.Errorf("Encode error: %s", kmer) } @@ -105,7 +105,43 @@ func TestEncodeFromPreviousKmer(t *testing.T) { t.Errorf("Encode error: %s", kmer) } if code0 != code { - t.Errorf("EncodeFromPreviousKmer error for %s: wrong %d != right %d", kmer, code, code0) + t.Errorf("EncodeFromFormerKmer error for %s: wrong %d != right %d", kmer, code, code0) + } + + pCode = code + } +} + +func TestEncodeFromLatterKmer(t *testing.T) { + var err error + k := 5 + first := true + var code, code0, pCode uint64 + var kmer, pKmer []byte + for i := len(benchMer) - k - 1; i >= 0; i-- { + kmer = benchMer[i : i+k] + if first { + code, err = Encode(kmer) + if err != nil { + t.Errorf("Encode error: %s", kmer) + } + + pCode = code + first = false + continue + } + pKmer = benchMer[i+1 : i+k+1] + code, err = EncodeFromLatterKmer(kmer, pKmer, pCode) + if err != nil { + t.Errorf("Encode error: %s", kmer) + } + + code0, err = Encode(kmer) + if err != nil { + t.Errorf("Encode error: %s", kmer) + } + if code0 != code { + t.Errorf("EncodeFromLatterKmer error for %s: wrong %d != right %d", kmer, code, code0) } pCode = code @@ -154,12 +190,12 @@ func BenchmarkEncodeK32(b *testing.B) { } } -// BenchmarkEncode tests speed of EncodeFromPreviousKmer -func BenchmarkEncodeFromPreviousKmerK32(b *testing.B) { +// BenchmarkEncode tests speed of EncodeFromFormerKmer +func BenchmarkEncodeFromFormerKmerK32(b *testing.B) { var code uint64 var err error for i := 0; i < b.N; i++ { - code, err = EncodeFromPreviousKmer(benchMer2, benchMer, benchCode) + code, err = EncodeFromFormerKmer(benchMer2, benchMer, benchCode) if err != nil { b.Errorf("Encode error: %s", benchMer) } @@ -169,12 +205,12 @@ func BenchmarkEncodeFromPreviousKmerK32(b *testing.B) { } } -// BenchmarkEncode tests speed of MustEncodeFromPreviousKmer -func BenchmarkMustEncodeFromPreviousKmerK32(b *testing.B) { +// BenchmarkEncode tests speed of MustEncodeFromFormerKmer +func BenchmarkMustEncodeFromFormerKmerK32(b *testing.B) { var code uint64 var err error for i := 0; i < b.N; i++ { - code, err = MustEncodeFromPreviousKmer(benchMer2, benchMer, benchCode) + code, err = MustEncodeFromFormerKmer(benchMer2, benchMer, benchCode) if err != nil { b.Errorf("Encode error: %s", benchMer) } diff --git a/unikmer/cmd/count.go b/unikmer/cmd/count.go index 135030d..c5e6701 100644 --- a/unikmer/cmd/count.go +++ b/unikmer/cmd/count.go @@ -51,6 +51,8 @@ var countCmd = &cobra.Command{ checkError(fmt.Errorf("k > 32 not supported")) } + canonical := getFlagBool(cmd, "canonical") + if !isStdout(outFile) { outFile += extDataFile } @@ -75,7 +77,7 @@ var countCmd = &cobra.Command{ var fastxReader *fastx.Reader var kcode, preKcode unikmer.KmerCode var first bool - var i, j int + var i, j, iters int var ok bool var n int64 for _, file := range files { @@ -94,7 +96,13 @@ var countCmd = &cobra.Command{ break } - for j = 0; j < 2; j++ { + if canonical { + iters = 1 + } else { + iters = 2 + } + + for j = 0; j < iters; j++ { if j == 0 { // sequence sequence = record.Seq.Seq @@ -135,12 +143,16 @@ var countCmd = &cobra.Command{ kcode, err = unikmer.NewKmerCode(kmer) first = false } else { - kcode, err = unikmer.NewKmerCodeMustFromPreviousOne(kmer, preKmer, preKcode) + kcode, err = unikmer.NewKmerCodeMustFromFormerOne(kmer, preKmer, preKcode) } - preKmer, preKcode = kmer, kcode if err != nil { checkError(fmt.Errorf("encoding '%s': %s", kmer, err)) } + preKmer, preKcode = kmer, kcode + + if canonical { + kcode = kcode.Canonical() + } if _, ok = m[kcode.Code]; !ok { m[kcode.Code] = struct{}{} @@ -162,6 +174,7 @@ func init() { RootCmd.AddCommand(countCmd) countCmd.Flags().StringP("out-prefix", "o", "-", `out file prefix ("-" for stdout)`) - countCmd.Flags().IntP("kmer-len", "k", 0, "kmer length") + countCmd.Flags().IntP("kmer-len", "k", 0, "Kmer length") countCmd.Flags().BoolP("circular", "", false, "circular genome") + countCmd.Flags().BoolP("canonical", "", false, "only keep the canonical Kmers") } diff --git a/unikmer/cmd/version.go b/unikmer/cmd/version.go index cdf2ee9..a2d8f7e 100644 --- a/unikmer/cmd/version.go +++ b/unikmer/cmd/version.go @@ -29,7 +29,7 @@ import ( ) // VERSION is the version -var VERSION = "v0.2.1" +var VERSION = "v0.2.2" // versionCmd represents the version command var versionCmd = &cobra.Command{