Skip to content

Commit

Permalink
Even more docs to finalize v0.3
Browse files Browse the repository at this point in the history
  • Loading branch information
Roderick Bovee authored and bovee committed Sep 12, 2019
1 parent d831789 commit 6e92dc9
Show file tree
Hide file tree
Showing 10 changed files with 345 additions and 232 deletions.
1 change: 1 addition & 0 deletions src/bitkmer.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! Compact binary representations of nucleic acid kmers
/// The packed binary sequence of a kmer's bases.
/// NOTE(review): presumably 2 bits per base packed into the `u64` — confirm
/// against the encoding/decoding functions in this module.
pub type BitKmerSeq = u64;
/// A packed kmer sequence paired with a `u8`.
/// NOTE(review): the `u8` is presumably the kmer length `k` — confirm at call sites.
pub type BitKmer = (BitKmerSeq, u8);

Expand Down
17 changes: 8 additions & 9 deletions src/formats/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@ use safemem::copy_over;

use crate::util::ParseError;

/// A buffer that wraps an object with the `Read` trait and allows extracting
/// a set of slices to data. Acts as a lower-level primitive for our FASTX
/// readers.
pub struct RecBuffer<'a> {
    /// The underlying reader that bytes are pulled from.
    file: &'a mut dyn io::Read,
    /// Bytes read from `file` so far that have not yet been consumed.
    pub buf: Vec<u8>,
    /// NOTE(review): presumably true once `file` has been fully exhausted —
    /// confirm against the refill logic (not visible in this chunk).
    pub last: bool,
}

/// A buffer that wraps an object with the `Read` trait and allows extracting
/// a set of slices to data. Acts as a lower-level primitive for our FASTX
/// readers.
impl<'a> RecBuffer<'a> {
/// Instantiate a new buffer.
///
Expand Down Expand Up @@ -56,12 +56,11 @@ impl<'a> RecBuffer<'a> {
}
}

/// RecParser is an adaptor trait that allows new file format parsers to be
/// defined. It takes a chunk from a RecBuffer (`from_reader`), optionally
/// parses an initial header out (`header`) and then provides an iterator
/// interface to parse a record stream. When finished, it provides a `eof`
/// function to determine if the stream is completely exhausted.
///
/// [⚠️Unstable] RecParser is an adaptor trait that allows new file format
/// parsers to be defined. It takes a chunk from a RecBuffer (`from_reader`),
/// optionally parses an initial header out (`header`) and then provides an
/// iterator interface to parse a record stream. When finished, it provides an
/// `eof` function to determine if the stream is completely exhausted.
pub trait RecParser<'s>: Sized + Iterator {
type Header;

Expand Down
2 changes: 2 additions & 0 deletions src/formats/fasta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use crate::sequence::Sequence;
use crate::sequence_record::SequenceRecord;
use crate::util::{memchr_both_last, ParseError, ParseErrorType};

/// A zero-copy reference to a FASTA record in a buffer.
#[derive(Debug)]
pub struct FastaRecord<'a> {
pub id: &'a [u8],
Expand All @@ -27,6 +28,7 @@ impl<'a> From<FastaRecord<'a>> for SequenceRecord<'a> {
}
}

/// An iterator that parses a buffer into a sequence of FASTARecords
pub struct FastaParser<'a> {
buf: &'a [u8],
last: bool,
Expand Down
2 changes: 2 additions & 0 deletions src/formats/fastq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use crate::sequence::Sequence;
use crate::sequence_record::SequenceRecord;
use crate::util::{memchr_both, ParseError, ParseErrorType};

/// A zero-copy reference to a FASTQ record in a buffer.
#[derive(Debug)]
pub struct FastqRecord<'a> {
pub id: &'a [u8],
Expand All @@ -28,6 +29,7 @@ impl<'a> From<FastqRecord<'a>> for SequenceRecord<'a> {
}
}

/// An iterator that parses a buffer into a sequence of FASTQRecords
pub struct FastqParser<'a> {
buf: &'a [u8],
last: bool,
Expand Down
14 changes: 6 additions & 8 deletions src/formats/mod.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
//! This module contains functions for reading FASTA data from either within
//! memory or from files.
//! Functions for reading sequence data from FASTA and FASTQ formats.
//!
//! # Design Note
//!
//! These functions are designed to take callbacks to process the FASTX records
//! they read. It would be nice to present a FASTX Iterator that downstream users
//! can use at some point, but this is only possible for in-memory data (e.g. using
//! MemProducer from nom) because otherwise the lifetime of each record is linked
//! to what part of the file we're reading and Rust doesn't support "streaming
//! iterators" otherwise. Maybe some day.
//! they read. It would be nice to present a FASTX Iterator that downstream
//! users can use at some point, but this is complicated because the lifetimes
//! of the records are linked to the slice of buffer that's being parsed (the
//! records are "zero-copy" so they mustn't outlive the buffer). Perhaps we'll
//! figure out a "streaming iterator" strategy here someday.
//!
//! See: /~https://github.com/emk/rust-streaming
Expand Down Expand Up @@ -37,7 +36,6 @@ use crate::util::{ParseError, ParseErrorType};

static BUF_SIZE: usize = 256 * 1024;

#[macro_export]
macro_rules! parse_stream {
($reader:expr, $first:expr, $reader_type: ty, $rec: ident, $handler: block) => {{
use $crate::formats::{RecBuffer, RecParser};
Expand Down
231 changes: 113 additions & 118 deletions src/kmer.rs
Original file line number Diff line number Diff line change
@@ -1,114 +1,26 @@
//! This module contains functions for kmerizing a longer sequence and various
//! utilities for dealing with these kmers.
//!
//!
use std::borrow::Cow;

/// Returns the complementary base for a given IUPAC base code.
///
/// Lowercase input maps to a lowercase complement and uppercase to uppercase.
/// Does not work for RNA sequences (maybe we should raise an error or something?)
/// Any byte without a defined complement (including `u`/`U` and `n`/`N`) is
/// passed through unchanged.
#[inline]
pub fn complement(n: u8) -> u8 {
    match n {
        b'a' => b't',
        b'A' => b'T',
        b'c' => b'g',
        b'C' => b'G',
        b'g' => b'c',
        b'G' => b'C',
        b't' => b'a',
        b'T' => b'A',

        // IUPAC ambiguity codes
        b'r' => b'y',
        b'y' => b'r',
        b'k' => b'm',
        b'm' => b'k',
        b'b' => b'v',
        b'v' => b'b',
        b'd' => b'h',
        b'h' => b'd',
        b's' => b's',
        b'w' => b'w',
        b'R' => b'Y',
        b'Y' => b'R',
        b'K' => b'M',
        b'M' => b'K',
        b'B' => b'V',
        b'V' => b'B',
        b'D' => b'H',
        b'H' => b'D',
        b'S' => b'S',
        b'W' => b'W',

        // anything else just passes through
        // 'u' | 'U' => panic!("Does not support complements of U"),
        x => x,
    }
}

/// Taking in a sequence string, return the canonical form of the sequence
/// (e.g. the lexicographically lowest of either the original sequence or its
/// reverse complement).
///
/// Returns `Cow::Borrowed` when the original sequence is already canonical
/// (or equal to its reverse complement) and `Cow::Owned` of the reverse
/// complement otherwise.
pub fn canonical(seq: &[u8]) -> Cow<[u8]> {
    let mut buf: Vec<u8> = Vec::with_capacity(seq.len());
    // `enough` just keeps our comparisons from happening after they need to
    let mut enough = false;
    let mut original_was_canonical = false;

    // loop through the kmer and its reverse complement simultaneously
    for (rn, n) in seq.iter().rev().map(|n| complement(*n)).zip(seq.iter()) {
        buf.push(rn);
        if !enough && (*n < rn) {
            original_was_canonical = true;
            break;
        } else if !enough && (rn < *n) {
            enough = true;
        }
        // unstated else branch: if rn == n, keep comparing
    }
    match (original_was_canonical, enough) {
        (true, true) => panic!("Bug: should never set original_was_canonical if enough == true"),
        (true, false) => seq.into(),
        (false, true) => buf.into(),
        // the sequences were completely equal, return the ref
        (false, false) => seq.into(),
    }
}

/// Find the lexicographically smallest substring of `seq` of length `length`.
///
/// Both the forward sequence and its reverse complement are scanned (the
/// reverse-complement windows are visited in reverse order relative to the
/// forward windows, since the whole sequence is reverse-complemented first).
///
/// # Panics
///
/// Panics if `seq.len() < length` (the initial `&seq[..length]` slice is out
/// of bounds).
///
/// There's probably a faster algorithm for this somewhere...
pub fn minimizer(seq: &[u8], length: usize) -> Cow<[u8]> {
    let reverse_complement: Vec<u8> = seq.iter().rev().map(|n| complement(*n)).collect();
    let mut minmer = Cow::Borrowed(&seq[..length]);

    for (kmer, rc_kmer) in seq.windows(length).zip(reverse_complement.windows(length)) {
        if *kmer < minmer[..] {
            minmer = kmer.into();
        }
        // rc_kmer must be copied out (`to_vec`): it borrows the local
        // `reverse_complement` vector, which can't outlive this function
        if *rc_kmer < minmer[..] {
            minmer = rc_kmer.to_vec().into();
        }
    }
    minmer
}
//! Functions for splitting sequences into fixed-width moving windows (kmers)
//! and utilities for dealing with these kmers.
/// Returns true if the base is an unambiguous nucleic acid base
/// (i.e. A, C, G, or T in either case) and false otherwise.
///
/// Note: the diff residue here carried two signatures (`pub fn` and `fn`);
/// the public one is kept so existing external callers keep working.
pub fn is_good_base(chr: u8) -> bool {
    match chr as char {
        'a' | 'c' | 'g' | 't' | 'A' | 'C' | 'G' | 'T' => true,
        _ => false,
    }
}

/// Generic moving window iterator over sequences to return k-mers
///
/// Iterator returns slices to the original data.
pub struct Kmers<'a> {
    /// The window (kmer) length.
    k: u8,
    /// Offset of the current window's first byte within `buffer`.
    start_pos: usize,
    /// The sequence being windowed over; yielded kmers borrow from it.
    buffer: &'a [u8],
}

impl<'a> Kmers<'a> {
//! A kmer-izer for a nucleotide/amino acid sequence; returning slices to the original data
/// Creates a new kmer-izer for a nucleotide/amino acid sequence.
pub fn new(buffer: &'a [u8], k: u8) -> Self {
Kmers {
k,
Expand All @@ -131,18 +43,36 @@ impl<'a> Iterator for Kmers<'a> {
}
}

/// A kmer-izer for nucleic acid sequences that returns canonical kmers.
///
/// Iterator returns the position of the kmer, a slice to the original data,
/// and a boolean indicating if the kmer returned is the original or the
/// reverse complement.
pub struct CanonicalKmers<'a> {
    /// The window (kmer) length.
    k: u8,
    /// Offset of the current window's first byte within `buffer`.
    start_pos: usize,
    /// The forward sequence.
    buffer: &'a [u8],
    /// The reverse complement of `buffer`, supplied by the caller.
    rc_buffer: &'a [u8],
}

/// A kmer-izer for nucleic acid sequences to return canonical kmers.
/// Returns the position of the kmer, a slice to the original data, and
/// a boolean indicating if the kmer returned is the original or the reverse
/// complement.
impl<'a> CanonicalKmers<'a> {
/// Creates a new iterator.
///
/// It's generally more useful to use this directly from a sequences (e.g.
/// `seq.canonical_kmers`. Requires a reference to the reverse complement
/// of the sequence it's created on, e.g.
/// ```
/// use needletail::Sequence;
/// use needletail::kmer::CanonicalKmers;
///
/// let seq = b"ACGT";
/// let rc = seq.reverse_complement();
/// let c_iter = CanonicalKmers::new(seq, &rc, 3);
/// for (pos, kmer, canonical) in c_iter {
/// // process data in here
/// }
///
/// ```
pub fn new(buffer: &'a [u8], rc_buffer: &'a [u8], k: u8) -> Self {
let mut nucl_kmers = CanonicalKmers {
k,
Expand Down Expand Up @@ -202,31 +132,96 @@ impl<'a> Iterator for CanonicalKmers<'a> {
}
}

// Fixed: `#[cfg(tests)]` (note the `s`) matches no configuration, so the
// whole module was silently compiled out and none of these tests ever ran.
#[cfg(test)]
mod tests {
    use super::*;
    use std::borrow::Cow;

    /// Build the reverse complement of a sequence using this module's own
    /// `complement`, so the tests don't depend on the `Sequence` trait.
    fn revcomp(seq: &[u8]) -> Vec<u8> {
        seq.iter().rev().map(|n| complement(*n)).collect()
    }

    #[test]
    fn test_complement() {
        assert_eq!(complement(b'a'), b't');
        assert_eq!(complement(b'c'), b'g');
        assert_eq!(complement(b'g'), b'c');
        assert_eq!(complement(b'n'), b'n');
    }

    #[test]
    fn can_kmerize() {
        // test general function
        let k_iter = Kmers::new(b"AGCT", 1);
        for (i, k) in k_iter.enumerate() {
            match i {
                0 => assert_eq!(k, &b"A"[..]),
                1 => assert_eq!(k, &b"G"[..]),
                2 => assert_eq!(k, &b"C"[..]),
                3 => assert_eq!(k, &b"T"[..]),
                _ => unreachable!("Too many kmers"),
            }
        }

        // test that we handle length 2 (and don't drop Ns)
        // NOTE(review): input corrected to b"ACNGT" — the expected windows
        // (AC, CN, NG, GT) only line up with that sequence, not b"AGNCT".
        let k_iter = Kmers::new(b"ACNGT", 2);
        for (i, k) in k_iter.enumerate() {
            match i {
                0 => assert_eq!(k, &b"AC"[..]),
                1 => assert_eq!(k, &b"CN"[..]),
                2 => assert_eq!(k, &b"NG"[..]),
                3 => assert_eq!(k, &b"GT"[..]),
                _ => unreachable!("Too many kmers"),
            }
        }

        // test that the minimum length works
        let k_iter = Kmers::new(b"AC", 2);
        for k in k_iter {
            assert_eq!(k, &b"AC"[..]);
        }
    }

    #[test]
    fn can_canonicalize() {
        // `canonical` on whole sequences
        assert!(canonical(b"A") == Cow::Borrowed(b"A"));
        assert!(canonical(b"T") == Cow::Owned::<[u8]>(b"A".to_vec()));
        assert!(canonical(b"AAGT") == Cow::Borrowed(b"AAGT"));
        assert!(canonical(b"ACTT") == Cow::Owned::<[u8]>(b"AAGT".to_vec()));
        assert!(canonical(b"GC") == Cow::Borrowed(b"GC"));

        // canonical kmer iteration. Fixed: `CanonicalKmers` is not a tuple
        // struct — it must be constructed with `CanonicalKmers::new`.
        let seq = b"AGCT";
        let rc = revcomp(seq);
        let c_iter = CanonicalKmers::new(seq, &rc, 1);
        for (i, (_, k, is_c)) in c_iter.enumerate() {
            match i {
                0 => {
                    assert_eq!(k, &b"A"[..]);
                    assert_eq!(is_c, false);
                }
                1 => {
                    assert_eq!(k, &b"C"[..]);
                    assert_eq!(is_c, true);
                }
                2 => {
                    assert_eq!(k, &b"C"[..]);
                    assert_eq!(is_c, false);
                }
                3 => {
                    assert_eq!(k, &b"A"[..]);
                    assert_eq!(is_c, true);
                }
                _ => unreachable!("Too many kmers"),
            }
        }

        let seq = b"AGCTA";
        let rc = revcomp(seq);
        let c_iter = CanonicalKmers::new(seq, &rc, 2);
        for (i, (_, k, _)) in c_iter.enumerate() {
            match i {
                0 => assert_eq!(k, &b"AG"[..]),
                1 => assert_eq!(k, &b"GC"[..]),
                2 => assert_eq!(k, &b"AG"[..]),
                3 => assert_eq!(k, &b"TA"[..]),
                _ => unreachable!("Too many kmers"),
            }
        }

        // kmers overlapping an N are skipped, but the reported positions are
        // still relative to the original sequence
        let seq = b"AGNTA";
        let rc = revcomp(seq);
        let c_iter = CanonicalKmers::new(seq, &rc, 2);
        for (i, (ix, k, _)) in c_iter.enumerate() {
            match i {
                0 => {
                    assert_eq!(ix, 0);
                    assert_eq!(k, &b"AG"[..]);
                }
                1 => {
                    assert_eq!(ix, 3);
                    assert_eq!(k, &b"TA"[..]);
                }
                _ => unreachable!("Too many kmers"),
            }
        }
    }

    #[test]
    fn can_minimize() {
        let minmer = minimizer(&b"ATTTCG"[..], 3);
        assert_eq!(&minmer[..], b"AAA");
    }
}
Loading

0 comments on commit 6e92dc9

Please sign in to comment.