Skip to content

Commit

Permalink
unikmer count: new option --canonical
Browse files Browse the repository at this point in the history
  • Loading branch information
shenwei356 committed Sep 22, 2018
1 parent 420e681 commit 26a30c6
Show file tree
Hide file tree
Showing 6 changed files with 157 additions and 46 deletions.
11 changes: 10 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
@@ -1 +1,10 @@

- v0.2.1
- `unikmer count`: performance improvement and new option `--canonical` for only keeping canonical Kmers.
- v0.2
- new command `unikmer sample`: sample Kmers from binary files.
- new global options:
- `-c, --compact`: write more compact binary file with little loss of speed.
- `-C, --no-compress`: do not compress binary file (not recommended).
- some improvements.
- v0.1.0
- first release
25 changes: 18 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,13 @@ Kmer frequencies) and provides serialization methods.
goos: linux
goarch: amd64
pkg: github.com/shenwei356/unikmer
BenchmarkEncodeK32-16 20000000 51.5 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeFromPreviousKmerK32-16 200000000 9.28 ns/op 0 B/op 0 allocs/op
BenchmarkMustEncodeFromPreviousKmerK32-16 1000000000 1.81 ns/op 0 B/op 0 allocs/op
BenchmarkDecodeK32-16 20000000 82.5 ns/op 32 B/op 1 allocs/op
BenchmarkRevK32-16 50000000 26.1 ns/op 0 B/op 0 allocs/op
BenchmarkCompK32-16 50000000 28.4 ns/op 0 B/op 0 allocs/op
BenchmarkRevCompK32-16 50000000 22.4 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeK32-16 20000000 50.1 ns/op 0 B/op 0 allocs/op
BenchmarkEncodeFromFormerKmerK32-16 200000000 9.30 ns/op 0 B/op 0 allocs/op
BenchmarkMustEncodeFromFormerKmerK32-16 2000000000 1.95 ns/op 0 B/op 0 allocs/op
BenchmarkDecodeK32-16 20000000 79.5 ns/op 32 B/op 1 allocs/op
BenchmarkRevK32-16 50000000 20.7 ns/op 0 B/op 0 allocs/op
BenchmarkCompK32-16 50000000 32.7 ns/op 0 B/op 0 allocs/op
BenchmarkRevCompK32-16 100000000 22.4 ns/op 0 B/op 0 allocs/op

## The toolkit

Expand Down Expand Up @@ -139,6 +139,17 @@ label |encoded-kmer<sup>a</sup>|gzip-compressed<sup>b</sup>|compact-fo
-rw-rw-r--. 1 shenwei shenwei 1.4M Aug 9 23:19 Ecoli-MG1655.fasta.gz
-rw-rw-r--. 1 shenwei shenwei 23M Aug 9 23:29 Ecoli-MG1655.fasta.gz.k31.unik

# counting (only keep the canonical kmers)
$ memusg -t unikmer count -k 31 Ecoli-MG1655.fasta.gz -o Ecoli-MG1655.fasta.gz.k31 --canonical

elapsed time: 3.428s
peak rss: 236.14 MB

$ ls -lh Ecoli-MG1655.fasta.gz*
-rw-rw-r--. 1 shenwei shenwei 1.4M Aug 9 23:19 Ecoli-MG1655.fasta.gz
-rw-rw-r--. 1 shenwei shenwei 19M Aug 9 23:29 Ecoli-MG1655.fasta.gz.k31.unik



# view
$ unikmer view Ecoli-MG1655.fasta.gz.k31.unik | head -n 3
Expand Down
86 changes: 64 additions & 22 deletions kmer.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,37 +80,70 @@ func Encode(kmer []byte) (code uint64, err error) {
// ErrNotConsecutiveKmers means the two Kmers are not consecutive
var ErrNotConsecutiveKmers = errors.New("unikmer: not consecutive Kmers")

// MustEncodeFromPrevious encodes from previous kmer,
// assuming the kmer and preKmer are both OK.
func MustEncodeFromPreviousKmer(kmer []byte, preKmer []byte, preCode uint64) (uint64, error) {
preCode = preCode & ((1 << (uint(len(kmer)-1) << 1)) - 1) << 2
// MustEncodeFromFormerKmer encodes from former kmer,
// assuming the kmer and leftKmer are both OK.
func MustEncodeFromFormerKmer(kmer []byte, leftKmer []byte, leftCode uint64) (uint64, error) {
leftCode = leftCode & ((1 << (uint(len(kmer)-1) << 1)) - 1) << 2
switch kmer[len(kmer)-1] {
case 'G', 'g', 'K', 'k':
preCode += 2
leftCode |= 2
case 'T', 't', 'U', 'u':
preCode += 3
leftCode |= 3
case 'C', 'c', 'S', 's', 'B', 'b', 'Y', 'y':
preCode += 1
leftCode |= 1
case 'A', 'a', 'N', 'n', 'M', 'm', 'V', 'v', 'H', 'h', 'R', 'r', 'D', 'd', 'W', 'w':
preCode += 0
// leftCode |= 0
default:
return preCode, ErrIllegalBase
return leftCode, ErrIllegalBase
}
return preCode, nil
return leftCode, nil
}

// EncodeFromPrevious encodes from previous kmer, inspired by ntHash
func EncodeFromPreviousKmer(kmer []byte, preKmer []byte, preCode uint64) (uint64, error) {
// EncodeFromFormerKmer encodes from former kmer, inspired by ntHash
func EncodeFromFormerKmer(kmer []byte, leftKmer []byte, leftCode uint64) (uint64, error) {
if len(kmer) == 0 {
return 0, ErrKOverflow
}
if len(kmer) != len(preKmer) {
if len(kmer) != len(leftKmer) {
return 0, ErrKMismatch
}
if !bytes.Equal(kmer[0:len(kmer)-1], preKmer[1:len(preKmer)]) {
if !bytes.Equal(kmer[0:len(kmer)-1], leftKmer[1:len(leftKmer)]) {
return 0, ErrNotConsecutiveKmers
}
return MustEncodeFromPreviousKmer(kmer, preKmer, preCode)
return MustEncodeFromFormerKmer(kmer, leftKmer, leftCode)
}

// MustEncodeFromLatterKmer encodes from latter kmer,
// assuming the kmer and rightKmer are both OK.
func MustEncodeFromLatterKmer(kmer []byte, rightKmer []byte, rightCode uint64) (uint64, error) {
rightCode >>= 2
switch kmer[0] {
case 'G', 'g', 'K', 'k':
rightCode |= 2 << (uint(len(kmer)-1) << 1)
case 'T', 't', 'U', 'u':
rightCode |= 3 << (uint(len(kmer)-1) << 1)
case 'C', 'c', 'S', 's', 'B', 'b', 'Y', 'y':
rightCode |= 1 << (uint(len(kmer)-1) << 1)
case 'A', 'a', 'N', 'n', 'M', 'm', 'V', 'v', 'H', 'h', 'R', 'r', 'D', 'd', 'W', 'w':
// rightCode |= 0 << (uint(len(kmer)-1) << 1)
default:
return rightCode, ErrIllegalBase
}
return rightCode, nil
}

// EncodeFromLatterKmer encodes from former kmer.
func EncodeFromLatterKmer(kmer []byte, rightKmer []byte, rightCode uint64) (uint64, error) {
if len(kmer) == 0 {
return 0, ErrKOverflow
}
if len(kmer) != len(rightKmer) {
return 0, ErrKMismatch
}
if !bytes.Equal(rightKmer[0:len(kmer)-1], kmer[1:len(rightKmer)]) {
return 0, ErrNotConsecutiveKmers
}
return MustEncodeFromLatterKmer(kmer, rightKmer, rightCode)
}

// Reverse returns code of the reversed sequence.
Expand Down Expand Up @@ -182,19 +215,19 @@ func NewKmerCode(kmer []byte) (KmerCode, error) {
return KmerCode{code, len(kmer)}, err
}

// NewKmerCodeFromPreviousOne computes KmerCode from the previous consecutive kmer.
func NewKmerCodeFromPreviousOne(kmer []byte, preKmer []byte, preKcode KmerCode) (KmerCode, error) {
code, err := EncodeFromPreviousKmer(kmer, preKmer, preKcode.Code)
// NewKmerCodeFromFormerOne computes KmerCode from the Former consecutive kmer.
func NewKmerCodeFromFormerOne(kmer []byte, leftKmer []byte, preKcode KmerCode) (KmerCode, error) {
code, err := EncodeFromFormerKmer(kmer, leftKmer, preKcode.Code)
if err != nil {
return KmerCode{}, err
}
return KmerCode{code, len(kmer)}, err
}

// NewKmerCodeMustFromPreviousOne computes KmerCode from the previous consecutive kmer,
// assuming the kmer and preKmer are both OK.
func NewKmerCodeMustFromPreviousOne(kmer []byte, preKmer []byte, preKcode KmerCode) (KmerCode, error) {
code, err := MustEncodeFromPreviousKmer(kmer, preKmer, preKcode.Code)
// NewKmerCodeMustFromFormerOne computes KmerCode from the Former consecutive kmer,
// assuming the kmer and leftKmer are both OK.
func NewKmerCodeMustFromFormerOne(kmer []byte, leftKmer []byte, preKcode KmerCode) (KmerCode, error) {
code, err := MustEncodeFromFormerKmer(kmer, leftKmer, preKcode.Code)
if err != nil {
return KmerCode{}, err
}
Expand All @@ -221,6 +254,15 @@ func (kcode KmerCode) RevComp() KmerCode {
return KmerCode{RevComp(kcode.Code, kcode.K), kcode.K}
}

// Canonical returns its canonical kmer
func (kcode KmerCode) Canonical() KmerCode {
rcKcode := kcode.RevComp()
if rcKcode.Code < kcode.Code {
return rcKcode
}
return kcode
}

// Bytes returns kmer in []byte.
func (kcode KmerCode) Bytes() []byte {
return Decode(kcode.Code, kcode.K)
Expand Down
56 changes: 46 additions & 10 deletions kmer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@ func TestEncodeDecode(t *testing.T) {
}
}

// TestEncodeFromPreviousKmer tests TestEncodeFromPreviousKmer
func TestEncodeFromPreviousKmer(t *testing.T) {
// TestEncodeFromFormerKmer tests TestEncodeFromFormerKmer
func TestEncodeFromFormerKmer(t *testing.T) {
var err error
k := 5
first := true
Expand All @@ -95,7 +95,7 @@ func TestEncodeFromPreviousKmer(t *testing.T) {
continue
}
pKmer = benchMer[i-1 : i+k-1]
code, err = EncodeFromPreviousKmer(kmer, pKmer, pCode)
code, err = EncodeFromFormerKmer(kmer, pKmer, pCode)
if err != nil {
t.Errorf("Encode error: %s", kmer)
}
Expand All @@ -105,7 +105,43 @@ func TestEncodeFromPreviousKmer(t *testing.T) {
t.Errorf("Encode error: %s", kmer)
}
if code0 != code {
t.Errorf("EncodeFromPreviousKmer error for %s: wrong %d != right %d", kmer, code, code0)
t.Errorf("EncodeFromFormerKmer error for %s: wrong %d != right %d", kmer, code, code0)
}

pCode = code
}
}

func TestEncodeFromLatterKmer(t *testing.T) {
var err error
k := 5
first := true
var code, code0, pCode uint64
var kmer, pKmer []byte
for i := len(benchMer) - k - 1; i >= 0; i-- {
kmer = benchMer[i : i+k]
if first {
code, err = Encode(kmer)
if err != nil {
t.Errorf("Encode error: %s", kmer)
}

pCode = code
first = false
continue
}
pKmer = benchMer[i+1 : i+k+1]
code, err = EncodeFromLatterKmer(kmer, pKmer, pCode)
if err != nil {
t.Errorf("Encode error: %s", kmer)
}

code0, err = Encode(kmer)
if err != nil {
t.Errorf("Encode error: %s", kmer)
}
if code0 != code {
t.Errorf("EncodeFromLatterKmer error for %s: wrong %d != right %d", kmer, code, code0)
}

pCode = code
Expand Down Expand Up @@ -154,12 +190,12 @@ func BenchmarkEncodeK32(b *testing.B) {
}
}

// BenchmarkEncode tests speed of EncodeFromPreviousKmer
func BenchmarkEncodeFromPreviousKmerK32(b *testing.B) {
// BenchmarkEncode tests speed of EncodeFromFormerKmer
func BenchmarkEncodeFromFormerKmerK32(b *testing.B) {
var code uint64
var err error
for i := 0; i < b.N; i++ {
code, err = EncodeFromPreviousKmer(benchMer2, benchMer, benchCode)
code, err = EncodeFromFormerKmer(benchMer2, benchMer, benchCode)
if err != nil {
b.Errorf("Encode error: %s", benchMer)
}
Expand All @@ -169,12 +205,12 @@ func BenchmarkEncodeFromPreviousKmerK32(b *testing.B) {
}
}

// BenchmarkEncode tests speed of MustEncodeFromPreviousKmer
func BenchmarkMustEncodeFromPreviousKmerK32(b *testing.B) {
// BenchmarkEncode tests speed of MustEncodeFromFormerKmer
func BenchmarkMustEncodeFromFormerKmerK32(b *testing.B) {
var code uint64
var err error
for i := 0; i < b.N; i++ {
code, err = MustEncodeFromPreviousKmer(benchMer2, benchMer, benchCode)
code, err = MustEncodeFromFormerKmer(benchMer2, benchMer, benchCode)
if err != nil {
b.Errorf("Encode error: %s", benchMer)
}
Expand Down
23 changes: 18 additions & 5 deletions unikmer/cmd/count.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ var countCmd = &cobra.Command{
checkError(fmt.Errorf("k > 32 not supported"))
}

canonical := getFlagBool(cmd, "canonical")

if !isStdout(outFile) {
outFile += extDataFile
}
Expand All @@ -75,7 +77,7 @@ var countCmd = &cobra.Command{
var fastxReader *fastx.Reader
var kcode, preKcode unikmer.KmerCode
var first bool
var i, j int
var i, j, iters int
var ok bool
var n int64
for _, file := range files {
Expand All @@ -94,7 +96,13 @@ var countCmd = &cobra.Command{
break
}

for j = 0; j < 2; j++ {
if canonical {
iters = 1
} else {
iters = 2
}

for j = 0; j < iters; j++ {
if j == 0 { // sequence
sequence = record.Seq.Seq

Expand Down Expand Up @@ -135,12 +143,16 @@ var countCmd = &cobra.Command{
kcode, err = unikmer.NewKmerCode(kmer)
first = false
} else {
kcode, err = unikmer.NewKmerCodeMustFromPreviousOne(kmer, preKmer, preKcode)
kcode, err = unikmer.NewKmerCodeMustFromFormerOne(kmer, preKmer, preKcode)
}
preKmer, preKcode = kmer, kcode
if err != nil {
checkError(fmt.Errorf("encoding '%s': %s", kmer, err))
}
preKmer, preKcode = kmer, kcode

if canonical {
kcode = kcode.Canonical()
}

if _, ok = m[kcode.Code]; !ok {
m[kcode.Code] = struct{}{}
Expand All @@ -162,6 +174,7 @@ func init() {
RootCmd.AddCommand(countCmd)

countCmd.Flags().StringP("out-prefix", "o", "-", `out file prefix ("-" for stdout)`)
countCmd.Flags().IntP("kmer-len", "k", 0, "kmer length")
countCmd.Flags().IntP("kmer-len", "k", 0, "Kmer length")
countCmd.Flags().BoolP("circular", "", false, "circular genome")
countCmd.Flags().BoolP("canonical", "", false, "only keep the canonical Kmers")
}
2 changes: 1 addition & 1 deletion unikmer/cmd/version.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ import (
)

// VERSION is the version
var VERSION = "v0.2.1"
var VERSION = "v0.2.2"

// versionCmd represents the version command
var versionCmd = &cobra.Command{
Expand Down

0 comments on commit 26a30c6

Please sign in to comment.