diff --git a/CHANGES.md b/CHANGES.md
index 8d1c8b6..61f87f0 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1 +1,10 @@
-
+- v0.2.1
+ - `unikmer count`: performance improvement and new option `--canonical` for only keeping canonical Kmers.
+- v0.2
+ - new command `unikmer sample`: sample Kmers from binary files.
+ - new global options:
+ - `-c, --compact`: write more compact binary file with little loss of speed.
+ - `-C, --no-compress`: do not compress binary file (not recommended).
+ - some improvements.
+- v0.1.0
+ - first release
diff --git a/README.md b/README.md
index fadc50f..7141f08 100644
--- a/README.md
+++ b/README.md
@@ -45,13 +45,13 @@ Kmer frequencies) and provides serialization methods.
goos: linux
goarch: amd64
pkg: github.com/shenwei356/unikmer
- BenchmarkEncodeK32-16 20000000 51.5 ns/op 0 B/op 0 allocs/op
- BenchmarkEncodeFromPreviousKmerK32-16 200000000 9.28 ns/op 0 B/op 0 allocs/op
- BenchmarkMustEncodeFromPreviousKmerK32-16 1000000000 1.81 ns/op 0 B/op 0 allocs/op
- BenchmarkDecodeK32-16 20000000 82.5 ns/op 32 B/op 1 allocs/op
- BenchmarkRevK32-16 50000000 26.1 ns/op 0 B/op 0 allocs/op
- BenchmarkCompK32-16 50000000 28.4 ns/op 0 B/op 0 allocs/op
- BenchmarkRevCompK32-16 50000000 22.4 ns/op 0 B/op 0 allocs/op
+ BenchmarkEncodeK32-16 20000000 50.1 ns/op 0 B/op 0 allocs/op
+ BenchmarkEncodeFromFormerKmerK32-16 200000000 9.30 ns/op 0 B/op 0 allocs/op
+ BenchmarkMustEncodeFromFormerKmerK32-16 2000000000 1.95 ns/op 0 B/op 0 allocs/op
+ BenchmarkDecodeK32-16 20000000 79.5 ns/op 32 B/op 1 allocs/op
+ BenchmarkRevK32-16 50000000 20.7 ns/op 0 B/op 0 allocs/op
+ BenchmarkCompK32-16 50000000 32.7 ns/op 0 B/op 0 allocs/op
+ BenchmarkRevCompK32-16 100000000 22.4 ns/op 0 B/op 0 allocs/op
## The toolkit
@@ -139,6 +139,17 @@ label |encoded-kmera|gzip-compressedb|compact-fo
-rw-rw-r--. 1 shenwei shenwei 1.4M Aug 9 23:19 Ecoli-MG1655.fasta.gz
-rw-rw-r--. 1 shenwei shenwei 23M Aug 9 23:29 Ecoli-MG1655.fasta.gz.k31.unik
+ # counting (only keep the canonical kmers)
+ $ memusg -t unikmer count -k 31 Ecoli-MG1655.fasta.gz -o Ecoli-MG1655.fasta.gz.k31 --canonical
+
+ elapsed time: 3.428s
+ peak rss: 236.14 MB
+
+ $ ls -lh Ecoli-MG1655.fasta.gz*
+ -rw-rw-r--. 1 shenwei shenwei 1.4M Aug 9 23:19 Ecoli-MG1655.fasta.gz
+ -rw-rw-r--. 1 shenwei shenwei 19M Aug 9 23:29 Ecoli-MG1655.fasta.gz.k31.unik
+
+
# view
$ unikmer view Ecoli-MG1655.fasta.gz.k31.unik | head -n 3
diff --git a/kmer.go b/kmer.go
index f207c58..4f7aabf 100644
--- a/kmer.go
+++ b/kmer.go
@@ -80,37 +80,70 @@ func Encode(kmer []byte) (code uint64, err error) {
// ErrNotConsecutiveKmers means the two Kmers are not consecutive
var ErrNotConsecutiveKmers = errors.New("unikmer: not consecutive Kmers")
-// MustEncodeFromPrevious encodes from previous kmer,
-// assuming the kmer and preKmer are both OK.
-func MustEncodeFromPreviousKmer(kmer []byte, preKmer []byte, preCode uint64) (uint64, error) {
- preCode = preCode & ((1 << (uint(len(kmer)-1) << 1)) - 1) << 2
+// MustEncodeFromFormerKmer encodes from former kmer,
+// assuming the kmer and leftKmer are both OK.
+func MustEncodeFromFormerKmer(kmer []byte, leftKmer []byte, leftCode uint64) (uint64, error) {
+ leftCode = leftCode & ((1 << (uint(len(kmer)-1) << 1)) - 1) << 2
switch kmer[len(kmer)-1] {
case 'G', 'g', 'K', 'k':
- preCode += 2
+ leftCode |= 2
case 'T', 't', 'U', 'u':
- preCode += 3
+ leftCode |= 3
case 'C', 'c', 'S', 's', 'B', 'b', 'Y', 'y':
- preCode += 1
+ leftCode |= 1
case 'A', 'a', 'N', 'n', 'M', 'm', 'V', 'v', 'H', 'h', 'R', 'r', 'D', 'd', 'W', 'w':
- preCode += 0
+ // leftCode |= 0
default:
- return preCode, ErrIllegalBase
+ return leftCode, ErrIllegalBase
}
- return preCode, nil
+ return leftCode, nil
}
-// EncodeFromPrevious encodes from previous kmer, inspired by ntHash
-func EncodeFromPreviousKmer(kmer []byte, preKmer []byte, preCode uint64) (uint64, error) {
+// EncodeFromFormerKmer encodes from former kmer, inspired by ntHash
+func EncodeFromFormerKmer(kmer []byte, leftKmer []byte, leftCode uint64) (uint64, error) {
if len(kmer) == 0 {
return 0, ErrKOverflow
}
- if len(kmer) != len(preKmer) {
+ if len(kmer) != len(leftKmer) {
return 0, ErrKMismatch
}
- if !bytes.Equal(kmer[0:len(kmer)-1], preKmer[1:len(preKmer)]) {
+ if !bytes.Equal(kmer[0:len(kmer)-1], leftKmer[1:len(leftKmer)]) {
return 0, ErrNotConsecutiveKmers
}
- return MustEncodeFromPreviousKmer(kmer, preKmer, preCode)
+ return MustEncodeFromFormerKmer(kmer, leftKmer, leftCode)
+}
+
+// MustEncodeFromLatterKmer encodes from latter kmer,
+// assuming the kmer and rightKmer are both OK.
+func MustEncodeFromLatterKmer(kmer []byte, rightKmer []byte, rightCode uint64) (uint64, error) {
+ rightCode >>= 2
+ switch kmer[0] {
+ case 'G', 'g', 'K', 'k':
+ rightCode |= 2 << (uint(len(kmer)-1) << 1)
+ case 'T', 't', 'U', 'u':
+ rightCode |= 3 << (uint(len(kmer)-1) << 1)
+ case 'C', 'c', 'S', 's', 'B', 'b', 'Y', 'y':
+ rightCode |= 1 << (uint(len(kmer)-1) << 1)
+ case 'A', 'a', 'N', 'n', 'M', 'm', 'V', 'v', 'H', 'h', 'R', 'r', 'D', 'd', 'W', 'w':
+ // rightCode |= 0 << (uint(len(kmer)-1) << 1)
+ default:
+ return rightCode, ErrIllegalBase
+ }
+ return rightCode, nil
+}
+
+// EncodeFromLatterKmer encodes from former kmer.
+func EncodeFromLatterKmer(kmer []byte, rightKmer []byte, rightCode uint64) (uint64, error) {
+ if len(kmer) == 0 {
+ return 0, ErrKOverflow
+ }
+ if len(kmer) != len(rightKmer) {
+ return 0, ErrKMismatch
+ }
+ if !bytes.Equal(rightKmer[0:len(kmer)-1], kmer[1:len(rightKmer)]) {
+ return 0, ErrNotConsecutiveKmers
+ }
+ return MustEncodeFromLatterKmer(kmer, rightKmer, rightCode)
}
// Reverse returns code of the reversed sequence.
@@ -182,19 +215,19 @@ func NewKmerCode(kmer []byte) (KmerCode, error) {
return KmerCode{code, len(kmer)}, err
}
-// NewKmerCodeFromPreviousOne computes KmerCode from the previous consecutive kmer.
-func NewKmerCodeFromPreviousOne(kmer []byte, preKmer []byte, preKcode KmerCode) (KmerCode, error) {
- code, err := EncodeFromPreviousKmer(kmer, preKmer, preKcode.Code)
+// NewKmerCodeFromFormerOne computes KmerCode from the Former consecutive kmer.
+func NewKmerCodeFromFormerOne(kmer []byte, leftKmer []byte, preKcode KmerCode) (KmerCode, error) {
+ code, err := EncodeFromFormerKmer(kmer, leftKmer, preKcode.Code)
if err != nil {
return KmerCode{}, err
}
return KmerCode{code, len(kmer)}, err
}
-// NewKmerCodeMustFromPreviousOne computes KmerCode from the previous consecutive kmer,
-// assuming the kmer and preKmer are both OK.
-func NewKmerCodeMustFromPreviousOne(kmer []byte, preKmer []byte, preKcode KmerCode) (KmerCode, error) {
- code, err := MustEncodeFromPreviousKmer(kmer, preKmer, preKcode.Code)
+// NewKmerCodeMustFromFormerOne computes KmerCode from the Former consecutive kmer,
+// assuming the kmer and leftKmer are both OK.
+func NewKmerCodeMustFromFormerOne(kmer []byte, leftKmer []byte, preKcode KmerCode) (KmerCode, error) {
+ code, err := MustEncodeFromFormerKmer(kmer, leftKmer, preKcode.Code)
if err != nil {
return KmerCode{}, err
}
@@ -221,6 +254,15 @@ func (kcode KmerCode) RevComp() KmerCode {
return KmerCode{RevComp(kcode.Code, kcode.K), kcode.K}
}
+// Canonical returns its canonical kmer
+func (kcode KmerCode) Canonical() KmerCode {
+ rcKcode := kcode.RevComp()
+ if rcKcode.Code < kcode.Code {
+ return rcKcode
+ }
+ return kcode
+}
+
// Bytes returns kmer in []byte.
func (kcode KmerCode) Bytes() []byte {
return Decode(kcode.Code, kcode.K)
diff --git a/kmer_test.go b/kmer_test.go
index a6b3fdb..0d071c2 100644
--- a/kmer_test.go
+++ b/kmer_test.go
@@ -75,8 +75,8 @@ func TestEncodeDecode(t *testing.T) {
}
}
-// TestEncodeFromPreviousKmer tests TestEncodeFromPreviousKmer
-func TestEncodeFromPreviousKmer(t *testing.T) {
+// TestEncodeFromFormerKmer tests TestEncodeFromFormerKmer
+func TestEncodeFromFormerKmer(t *testing.T) {
var err error
k := 5
first := true
@@ -95,7 +95,7 @@ func TestEncodeFromPreviousKmer(t *testing.T) {
continue
}
pKmer = benchMer[i-1 : i+k-1]
- code, err = EncodeFromPreviousKmer(kmer, pKmer, pCode)
+ code, err = EncodeFromFormerKmer(kmer, pKmer, pCode)
if err != nil {
t.Errorf("Encode error: %s", kmer)
}
@@ -105,7 +105,43 @@ func TestEncodeFromPreviousKmer(t *testing.T) {
t.Errorf("Encode error: %s", kmer)
}
if code0 != code {
- t.Errorf("EncodeFromPreviousKmer error for %s: wrong %d != right %d", kmer, code, code0)
+ t.Errorf("EncodeFromFormerKmer error for %s: wrong %d != right %d", kmer, code, code0)
+ }
+
+ pCode = code
+ }
+}
+
+func TestEncodeFromLatterKmer(t *testing.T) {
+ var err error
+ k := 5
+ first := true
+ var code, code0, pCode uint64
+ var kmer, pKmer []byte
+ for i := len(benchMer) - k - 1; i >= 0; i-- {
+ kmer = benchMer[i : i+k]
+ if first {
+ code, err = Encode(kmer)
+ if err != nil {
+ t.Errorf("Encode error: %s", kmer)
+ }
+
+ pCode = code
+ first = false
+ continue
+ }
+ pKmer = benchMer[i+1 : i+k+1]
+ code, err = EncodeFromLatterKmer(kmer, pKmer, pCode)
+ if err != nil {
+ t.Errorf("Encode error: %s", kmer)
+ }
+
+ code0, err = Encode(kmer)
+ if err != nil {
+ t.Errorf("Encode error: %s", kmer)
+ }
+ if code0 != code {
+ t.Errorf("EncodeFromLatterKmer error for %s: wrong %d != right %d", kmer, code, code0)
}
pCode = code
@@ -154,12 +190,12 @@ func BenchmarkEncodeK32(b *testing.B) {
}
}
-// BenchmarkEncode tests speed of EncodeFromPreviousKmer
-func BenchmarkEncodeFromPreviousKmerK32(b *testing.B) {
+// BenchmarkEncode tests speed of EncodeFromFormerKmer
+func BenchmarkEncodeFromFormerKmerK32(b *testing.B) {
var code uint64
var err error
for i := 0; i < b.N; i++ {
- code, err = EncodeFromPreviousKmer(benchMer2, benchMer, benchCode)
+ code, err = EncodeFromFormerKmer(benchMer2, benchMer, benchCode)
if err != nil {
b.Errorf("Encode error: %s", benchMer)
}
@@ -169,12 +205,12 @@ func BenchmarkEncodeFromPreviousKmerK32(b *testing.B) {
}
}
-// BenchmarkEncode tests speed of MustEncodeFromPreviousKmer
-func BenchmarkMustEncodeFromPreviousKmerK32(b *testing.B) {
+// BenchmarkEncode tests speed of MustEncodeFromFormerKmer
+func BenchmarkMustEncodeFromFormerKmerK32(b *testing.B) {
var code uint64
var err error
for i := 0; i < b.N; i++ {
- code, err = MustEncodeFromPreviousKmer(benchMer2, benchMer, benchCode)
+ code, err = MustEncodeFromFormerKmer(benchMer2, benchMer, benchCode)
if err != nil {
b.Errorf("Encode error: %s", benchMer)
}
diff --git a/unikmer/cmd/count.go b/unikmer/cmd/count.go
index 135030d..c5e6701 100644
--- a/unikmer/cmd/count.go
+++ b/unikmer/cmd/count.go
@@ -51,6 +51,8 @@ var countCmd = &cobra.Command{
checkError(fmt.Errorf("k > 32 not supported"))
}
+ canonical := getFlagBool(cmd, "canonical")
+
if !isStdout(outFile) {
outFile += extDataFile
}
@@ -75,7 +77,7 @@ var countCmd = &cobra.Command{
var fastxReader *fastx.Reader
var kcode, preKcode unikmer.KmerCode
var first bool
- var i, j int
+ var i, j, iters int
var ok bool
var n int64
for _, file := range files {
@@ -94,7 +96,13 @@ var countCmd = &cobra.Command{
break
}
- for j = 0; j < 2; j++ {
+ if canonical {
+ iters = 1
+ } else {
+ iters = 2
+ }
+
+ for j = 0; j < iters; j++ {
if j == 0 { // sequence
sequence = record.Seq.Seq
@@ -135,12 +143,16 @@ var countCmd = &cobra.Command{
kcode, err = unikmer.NewKmerCode(kmer)
first = false
} else {
- kcode, err = unikmer.NewKmerCodeMustFromPreviousOne(kmer, preKmer, preKcode)
+ kcode, err = unikmer.NewKmerCodeMustFromFormerOne(kmer, preKmer, preKcode)
}
- preKmer, preKcode = kmer, kcode
if err != nil {
checkError(fmt.Errorf("encoding '%s': %s", kmer, err))
}
+ preKmer, preKcode = kmer, kcode
+
+ if canonical {
+ kcode = kcode.Canonical()
+ }
if _, ok = m[kcode.Code]; !ok {
m[kcode.Code] = struct{}{}
@@ -162,6 +174,7 @@ func init() {
RootCmd.AddCommand(countCmd)
countCmd.Flags().StringP("out-prefix", "o", "-", `out file prefix ("-" for stdout)`)
- countCmd.Flags().IntP("kmer-len", "k", 0, "kmer length")
+ countCmd.Flags().IntP("kmer-len", "k", 0, "Kmer length")
countCmd.Flags().BoolP("circular", "", false, "circular genome")
+ countCmd.Flags().BoolP("canonical", "", false, "only keep the canonical Kmers")
}
diff --git a/unikmer/cmd/version.go b/unikmer/cmd/version.go
index cdf2ee9..a2d8f7e 100644
--- a/unikmer/cmd/version.go
+++ b/unikmer/cmd/version.go
@@ -29,7 +29,7 @@ import (
)
// VERSION is the version
-var VERSION = "v0.2.1"
+var VERSION = "v0.2.2"
// versionCmd represents the version command
var versionCmd = &cobra.Command{