diff --git a/.gitignore b/.gitignore index 813740c..2b9854a 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,9 @@ *.out *.directory -unikmer/unikmer +unikmer/unikmer* +unikmer/binaries* +doc/site/* + +*ssshtest +testdata/*.unik diff --git a/README.md b/README.md index 8d1c8b6..cc4dfb7 100644 --- a/README.md +++ b/README.md @@ -1 +1,138 @@ - +# unikmer + +unikmer (unique Kmer) is a golang package and a command-line toolkit for +manipulating [Kmers](https://en.wikipedia.org/wiki/K-mer) while NOT recording +Kmer frequencies. + +## The package + +[![GoDoc](https://godoc.org/github.com/shenwei356/unikmer?status.svg)](https://godoc.org/github.com/shenwei356/unikmer) +[![Go Report Card](https://goreportcard.com/badge/github.com/shenwei356/unikmer)](https://goreportcard.com/report/github.com/shenwei356/unikmer) + +The unikmer package provides basic manipulations of unique Kmers (NOT including +Kmer frequencies) and its binary file. + +### Installation + + go get -u github.com/shenwei356/unikmer + +### Benchmark + + $ go test . -bench=Bench* + goos: linux + goarch: amd64 + pkg: github.com/shenwei356/unikmer + BenchmarkEncodeK32-4 20000000 98.1 ns/op + BenchmarkDecodeK32-4 20000000 102 ns/op + BenchmarkRevK32-4 20000000 64.2 ns/op + BenchmarkCompK32-4 20000000 54.8 ns/op + BenchmarkRevCompK32-4 10000000 116 ns/op + + +## The toolkit + +`unikmer` is a command-line toolkit that provides some functions including counting, +format conversion, set operations and searching on unique Kmers. + +### Installation + +1. Download [binary files](https://github.com/shenwei356/unikmer/releases). + +1. Bioconda (not available now) + + conda install unikmer + +### Commands + +1. Counting + + count count Kmer from FASTA/Q sequences + subset extract smaller Kmers from binary file + +1. Format conversion + + view read and output binary format to plain text + dump convert plain Kmer text to binary format + +1. 
Set operations + + concat concatenate multiple binary files + diff set difference of multiple binary files + inter intersection of multiple binary files + union union of multiple binary files + +1. Searching + + grep search Kmer from binary file + +1. Misc + + genautocomplete generate shell autocompletion script + help Help about any command + version print version information and check for update + +### Quick Start + + # counting + $ time unikmer count -k 31 Ecoli-MG1655.fasta.gz -o Ecoli-MG1655.fasta.gz + real 0m5.209s + user 0m6.864s + sys 0m0.169s + + $ ls -lh Ecoli-MG1655.fasta.gz* + -rw-rw-r--. 1 shenwei shenwei 1.4M Aug 9 23:19 Ecoli-MG1655.fasta.gz + -rw-rw-r--. 1 shenwei shenwei 23M Aug 9 23:29 Ecoli-MG1655.fasta.gz.unik + + + # view + $ unikmer view Ecoli-MG1655.fasta.gz.unik | head -n 3 + AGCTTTTCATTCTGACTGCAACGGGCAATAT + GCTTTTCATTCTGACTGCAACGGGCAATATG + CTTTTCATTCTGACTGCAACGGGCAATATGT + + $ unikmer view Ecoli-MG1655.fasta.gz.unik | wc -l + 9108538 + + + # union + $ unikmer union Ecoli-MG1655.fasta.gz.unik Ecoli-IAI39.fasta.gz.unik -o union + + + # intersection + $ unikmer inter Ecoli-MG1655.fasta.gz.unik Ecoli-IAI39.fasta.gz.unik -o inter + + + # difference + $ unikmer diff -t 4 Ecoli-MG1655.fasta.gz.unik Ecoli-IAI39.fasta.gz.unik -o diff + + + # ------------------------------------------------------------------------- + + $ ls -lh + -rw-rw-r--. 1 shenwei shenwei 1.6M Aug 9 23:19 Ecoli-IAI39.fasta.gz + -rw-rw-r--. 1 shenwei shenwei 25M Aug 9 23:29 Ecoli-IAI39.fasta.gz.unik + -rw-rw-r--. 1 shenwei shenwei 1.4M Aug 9 23:19 Ecoli-MG1655.fasta.gz + -rw-rw-r--. 1 shenwei shenwei 23M Aug 9 23:29 Ecoli-MG1655.fasta.gz.unik + -rw-rw-r--. 1 shenwei shenwei 38M Aug 9 23:32 union.unik + -rw-rw-r--. 1 shenwei shenwei 35M Aug 9 23:33 inter.unik + -rw-rw-r--. 
1 shenwei shenwei 35M Aug 9 23:34 diff.unik + + $ unikmer view Ecoli-MG1655.fasta.gz.unik | wc -l + 9108538 + $ unikmer view Ecoli-IAI39.fasta.gz.unik | wc -l + 9821960 + $ unikmer view union.unik | wc -l + 14402956 + $ unikmer view inter.unik | wc -l + 4527542 + $ unikmer view diff.unik | wc -l + 4580996 + + +## Contributing + +We welcome pull requests, bug fixes and issue reports. + +## License + +[MIT License](https://github.com/shenwei356/unikmer/blob/master/LICENSE) diff --git a/file.go b/file.go index 5648062..b7786be 100644 --- a/file.go +++ b/file.go @@ -27,22 +27,22 @@ import ( "io" ) -// MainVersion is the main version number +// MainVersion is the main version number. const MainVersion int64 = 0 -// MinorVersion is the minor version number +// MinorVersion is the minor version number. const MinorVersion int64 = 1 -// Magic number of binary file +// Magic number of binary file. var Magic = [8]byte{'.', 'u', 'n', 'i', 'k', 'm', 'e', 'r'} -// ErrInvalidFileFormat means invalid file format +// ErrInvalidFileFormat means invalid file format. var ErrInvalidFileFormat = errors.New("unikmer: invalid binary format") -// ErrBrokenFile means the file is not complete +// ErrBrokenFile means the file is not complete. // var ErrBrokenFile = errors.New("unikmer: broken file") -// ErrKMismatch means K size mismatch +// ErrKMismatch means K size mismatch. var ErrKMismatch = errors.New("unikmer: K mismatch") var be = binary.BigEndian @@ -57,7 +57,7 @@ func (h Header) String() string { return fmt.Sprintf("unikmer binary kmer data file v%s, K=%d", h.Version, h.K) } -// Reader is for reading KmerCode +// Reader is for reading KmerCode. type Reader struct { Header r io.Reader @@ -66,7 +66,7 @@ type Reader struct { size uint64 } -// NewReader returns a Reader +// NewReader returns a Reader. 
func NewReader(r io.Reader) (*Reader, error) { reader := &Reader{r: r} reader.err = reader.readHeader() @@ -106,7 +106,7 @@ func (reader *Reader) readHeader() error { return nil } -// Read reads one KmerCode +// Read reads one KmerCode. func (reader *Reader) Read() (KmerCode, error) { reader.err = binary.Read(reader.r, be, &reader.code) if reader.err != nil { @@ -116,7 +116,7 @@ func (reader *Reader) Read() (KmerCode, error) { return KmerCode{Code: reader.code, K: reader.Header.K}, nil } -// Writer writes KmerCode +// Writer writes KmerCode. type Writer struct { Header w io.Writer @@ -126,7 +126,7 @@ type Writer struct { size int64 } -// NewWriter creates a Writer +// NewWriter creates a Writer. func NewWriter(w io.Writer, k int) *Writer { return &Writer{ Header: Header{Version: fmt.Sprintf("%d.%d", MainVersion, MinorVersion), K: k}, @@ -148,7 +148,7 @@ func (writer *Writer) writeHeader() error { return nil } -// WriteKmer writes one Kmer +// WriteKmer writes one Kmer. func (writer *Writer) WriteKmer(mer []byte) error { writer.kcode, writer.err = NewKmerCode(mer) if writer.err != nil { @@ -157,7 +157,7 @@ func (writer *Writer) WriteKmer(mer []byte) error { return writer.Write(writer.kcode) } -// Write writes one KmerCode +// Write writes one KmerCode. func (writer *Writer) Write(kcode KmerCode) error { if writer.Header.K != kcode.K { writer.err = ErrKMismatch @@ -181,7 +181,7 @@ func (writer *Writer) Write(kcode KmerCode) error { return nil } -// Flush writes the size to the end +// Flush is not used actually. 
func (writer *Writer) Flush() error { // writer.err = binary.Write(writer.w, be, writer.size) // if writer.err != nil { diff --git a/file_test.go b/file_test.go index 9c36891..8ef5dad 100644 --- a/file_test.go +++ b/file_test.go @@ -36,7 +36,7 @@ func genKmers(k int, num int) [][]byte { for i := 0; i < num; i++ { mers[i] = make([]byte, k) for j = 0; j < k; j++ { - mers[i][j] = code2base[rand.Intn(4)] + mers[i][j] = bit2base[rand.Intn(4)] } } return mers diff --git a/kmer.go b/kmer.go index bedbe4d..00a1bd0 100644 --- a/kmer.go +++ b/kmer.go @@ -24,25 +24,34 @@ import ( "errors" ) -// ErrIllegalBase means that base beyond "ACGTU" was detected +// ErrIllegalBase means that a base beyond the IUPAC symbols was detected. var ErrIllegalBase = errors.New("unikmer: illegal base") -// ErrKOverflow means K > 32 +// ErrKOverflow means K > 32. var ErrKOverflow = errors.New("unikmer: K (1-32) overflow") // Encode converts byte slice to bits. // -// M AC -// V ACG -// H ACT -// R AG -// D AGT -// W AT -// S CG -// B CGT -// Y CT -// K GT +// Codes: // +// A 00 +// C 01 +// G 10 +// T 11 +// +// For degenerate bases, only the first base is kept. +// +// M AC A +// V ACG A +// H ACT A +// R AG A +// D AGT A +// W AT A +// S CG C +// B CGT C +// Y CT C +// K GT G +// N ACGT A // func Encode(mer []byte) (code uint64, err error) { size := len(mer) @@ -66,7 +75,7 @@ func Encode(mer []byte) (code uint64, err error) { return code, nil } -// Reverse returns code of the reversed sequence +// Reverse returns code of the reversed sequence. func Reverse(code uint64, k int) (c uint64) { if k <= 0 || k > 32 { panic(ErrKOverflow) @@ -78,7 +87,7 @@ func Reverse(code uint64, k int) (c uint64) { return } -// Complement return code of complement sequence +// Complement returns code of complement sequence. 
func Complement(code uint64, k int) (c uint64) { if k <= 0 || k > 32 { panic(ErrKOverflow) @@ -90,29 +99,29 @@ func Complement(code uint64, k int) (c uint64) { return } -// code2base is for mapping code to base -var code2base = [4]byte{'A', 'C', 'G', 'T'} +// bit2base is for mapping bit to base. +var bit2base = [4]byte{'A', 'C', 'G', 'T'} -// Decode converts the bits to origional seq +// Decode converts the code to the original sequence. func Decode(code uint64, k int) []byte { if k <= 0 || k > 32 { panic(ErrKOverflow) } mer := make([]byte, k) for i := 0; i < k; i++ { - mer[k-1-i] = code2base[code&3] + mer[k-1-i] = bit2base[code&3] code >>= 2 } return mer } -// KmerCode is a struct representing a kmer in 64-bits +// KmerCode is a struct representing a kmer in 64-bits. type KmerCode struct { Code uint64 K int } -// NewKmerCode returns a new KmerCode from byte slice +// NewKmerCode returns a new KmerCode struct from byte slice. func NewKmerCode(mer []byte) (KmerCode, error) { code, err := Encode(mer) if err != nil { @@ -121,27 +130,27 @@ func NewKmerCode(mer []byte) (KmerCode, error) { return KmerCode{code, len(mer)}, err } -// Equal checks wether two KmerCodes are the same +// Equal checks whether two KmerCodes are the same. func (kcode KmerCode) Equal(kcode2 KmerCode) bool { return kcode.K == kcode2.K && kcode.Code == kcode2.Code } -// Rev returns KmerCode of the reverse sequence +// Rev returns KmerCode of the reverse sequence. func (kcode KmerCode) Rev() KmerCode { return KmerCode{Reverse(kcode.Code, kcode.K), kcode.K} } -// Comp returns KmerCode of the complement sequence +// Comp returns KmerCode of the complement sequence. func (kcode KmerCode) Comp() KmerCode { return KmerCode{Complement(kcode.Code, kcode.K), kcode.K} } -// RevComp returns KmerCode of the reverse complement sequence +// RevComp returns KmerCode of the reverse complement sequence. 
func (kcode KmerCode) RevComp() KmerCode { return kcode.Rev().Comp() } -// Bytes returns kmer in []byte +// Bytes returns kmer in []byte. func (kcode KmerCode) Bytes() []byte { return Decode(kcode.Code, kcode.K) } diff --git a/kmer_test.go b/kmer_test.go index fb66619..06a2664 100644 --- a/kmer_test.go +++ b/kmer_test.go @@ -39,7 +39,7 @@ func init() { for i := 0; i < randomMersN; i++ { randomMers[i] = make([]byte, rand.Intn(32)+1) for j := range randomMers[i] { - randomMers[i][j] = code2base[rand.Intn(4)] + randomMers[i][j] = bit2base[rand.Intn(4)] } } diff --git a/testdata/Ecoli-IAI39.fasta.gz b/testdata/Ecoli-IAI39.fasta.gz new file mode 100644 index 0000000..f8be76c Binary files /dev/null and b/testdata/Ecoli-IAI39.fasta.gz differ diff --git a/testdata/Ecoli-MG1655.fasta.gz b/testdata/Ecoli-MG1655.fasta.gz new file mode 100644 index 0000000..19f2a65 Binary files /dev/null and b/testdata/Ecoli-MG1655.fasta.gz differ diff --git a/unikmer/cmd/subset.go b/unikmer/cmd/subset.go index 0d7f150..96f114d 100644 --- a/unikmer/cmd/subset.go +++ b/unikmer/cmd/subset.go @@ -34,8 +34,8 @@ import ( // subsetCmd represents var subsetCmd = &cobra.Command{ Use: "subset", - Short: "extract smaller kmers from binary file", - Long: `extract smaller kmers from binary file + Short: "extract smaller Kmers from binary file", + Long: `extract smaller Kmers from binary file Attention: - It's faster than re-counting from sequence file but in cost of losing diff --git a/unikmer/packaging.sh b/unikmer/packaging.sh new file mode 100755 index 0000000..9a84939 --- /dev/null +++ b/unikmer/packaging.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env sh + +CGO_ENABLED=0 gox -os="windows darwin linux" -arch="386 amd64" -tags netgo -ldflags '-w -s' + +dir=binaries +mkdir -p $dir; +rm -rf $dir/$f; + +for f in unikmer_*; do + mkdir -p $dir/$f; + mv $f $dir/$f; + cd $dir/$f; + mv $f $(echo $f | perl -pe 's/_[^\.]+//g'); + tar -zcf $f.tar.gz unikmer*; + mv *.tar.gz ../; + cd ..; + rm -rf $f; + cd ..; +done;