Skip to content

Commit

Permalink
Even more docs to finalize v0.3
Browse files Browse the repository at this point in the history
  • Loading branch information
Roderick Bovee authored and bovee committed Sep 12, 2019
1 parent d831789 commit 6e92dc9
Show file tree
Hide file tree
Showing 10 changed files with 345 additions and 232 deletions.
1 change: 1 addition & 0 deletions src/bitkmer.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! Compact binary representations of nucleic acid kmers
/// The packed binary sequence of a kmer's bases.
/// NOTE(review): presumably 2 bits per base packed into the `u64` — confirm
/// against the encoding/decoding functions in this module.
pub type BitKmerSeq = u64;
/// A packed kmer sequence paired with a `u8`.
/// NOTE(review): the `u8` is presumably the kmer length `k` — confirm at call sites.
pub type BitKmer = (BitKmerSeq, u8);

Expand Down
17 changes: 8 additions & 9 deletions src/formats/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@ use safemem::copy_over;

use crate::util::ParseError;

/// A buffer that wraps an object with the `Read` trait and allows extracting
/// a set of slices to data. Acts as a lower-level primitive for our FASTX
/// readers.
pub struct RecBuffer<'a> {
    /// The underlying reader that bytes are pulled from.
    file: &'a mut dyn io::Read,
    /// Bytes read from `file` so far that have not yet been consumed.
    pub buf: Vec<u8>,
    /// NOTE(review): presumably true once `file` has been fully exhausted —
    /// confirm against the refill logic (not visible in this chunk).
    pub last: bool,
}

/// A buffer that wraps an object with the `Read` trait and allows extracting
/// a set of slices to data. Acts as a lower-level primitive for our FASTX
/// readers.
impl<'a> RecBuffer<'a> {
/// Instantiate a new buffer.
///
Expand Down Expand Up @@ -56,12 +56,11 @@ impl<'a> RecBuffer<'a> {
}
}

/// RecParser is an adaptor trait that allows new file format parsers to be
/// defined. It takes a chunk from a RecBuffer (`from_reader`), optionally
/// parses an initial header out (`header`) and then provides an iterator
/// interface to parse a record stream. When finished, it provides a `eof`
/// function to determine if the stream is completely exhausted.
///
/// [⚠️Unstable] RecParser is an adaptor trait that allows new file format
/// parsers to be defined. It takes a chunk from a RecBuffer (`from_reader`),
/// optionally parses an initial header out (`header`) and then provides an
/// iterator interface to parse a record stream. When finished, it provides an
/// `eof` function to determine if the stream is completely exhausted.
pub trait RecParser<'s>: Sized + Iterator {
type Header;

Expand Down
2 changes: 2 additions & 0 deletions src/formats/fasta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use crate::sequence::Sequence;
use crate::sequence_record::SequenceRecord;
use crate::util::{memchr_both_last, ParseError, ParseErrorType};

/// A zero-copy reference to a FASTA record in a buffer.
#[derive(Debug)]
pub struct FastaRecord<'a> {
pub id: &'a [u8],
Expand All @@ -27,6 +28,7 @@ impl<'a> From<FastaRecord<'a>> for SequenceRecord<'a> {
}
}

/// An iterator that parses a buffer into a sequence of FASTARecords
pub struct FastaParser<'a> {
buf: &'a [u8],
last: bool,
Expand Down
2 changes: 2 additions & 0 deletions src/formats/fastq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use crate::sequence::Sequence;
use crate::sequence_record::SequenceRecord;
use crate::util::{memchr_both, ParseError, ParseErrorType};

/// A zero-copy reference to a FASTQ record in a buffer.
#[derive(Debug)]
pub struct FastqRecord<'a> {
pub id: &'a [u8],
Expand All @@ -28,6 +29,7 @@ impl<'a> From<FastqRecord<'a>> for SequenceRecord<'a> {
}
}

/// An iterator that parses a buffer into a sequence of FASTQRecords
pub struct FastqParser<'a> {
buf: &'a [u8],
last: bool,
Expand Down
14 changes: 6 additions & 8 deletions src/formats/mod.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
//! This module contains functions for reading FASTA data from either within
//! memory or from files.
//! Functions for reading sequence data from FASTA and FASTQ formats.
//!
//! # Design Note
//!
//! These functions are designed to take callbacks to process the FASTX records
//! they read. It would be nice to present a FASTX Iterator that downstream users
//! can use at some point, but this is only possible for in-memory data (e.g. using
//! MemProducer from nom) because otherwise the lifetime of each record is linked
//! to what part of the file we're reading and Rust doesn't support "streaming
//! iterators" otherwise. Maybe some day.
//! they read. It would be nice to present a FASTX Iterator that downstream
//! users can use at some point, but this is complicated because the lifetimes
//! of the records are linked to the slice of buffer that's being parsed (the
//! records are "zero-copy" so they mustn't outlive the buffer). Perhaps we'll
//! figure out a "streaming iterator" strategy here someday.
//!
//! See: /~https://github.com/emk/rust-streaming
Expand Down Expand Up @@ -37,7 +36,6 @@ use crate::util::{ParseError, ParseErrorType};

static BUF_SIZE: usize = 256 * 1024;

#[macro_export]
macro_rules! parse_stream {
($reader:expr, $first:expr, $reader_type: ty, $rec: ident, $handler: block) => {{
use $crate::formats::{RecBuffer, RecParser};
Expand Down
231 changes: 113 additions & 118 deletions src/kmer.rs
Original file line number Diff line number Diff line change
@@ -1,114 +1,26 @@
//! This module contains functions for kmerizing a longer sequence and various
//! utilities for dealing with these kmers.
//!
//!
use std::borrow::Cow;

/// Returns the complementary base for a given IUPAC base code.
///
/// Lowercase input maps to a lowercase complement and uppercase to uppercase.
/// Does not work for RNA sequences (maybe we should raise an error or something?)
/// Any byte without a defined complement (including `u`/`U` and `n`/`N`) is
/// passed through unchanged.
#[inline]
pub fn complement(n: u8) -> u8 {
    match n {
        b'a' => b't',
        b'A' => b'T',
        b'c' => b'g',
        b'C' => b'G',
        b'g' => b'c',
        b'G' => b'C',
        b't' => b'a',
        b'T' => b'A',

        // IUPAC ambiguity codes
        b'r' => b'y',
        b'y' => b'r',
        b'k' => b'm',
        b'm' => b'k',
        b'b' => b'v',
        b'v' => b'b',
        b'd' => b'h',
        b'h' => b'd',
        b's' => b's',
        b'w' => b'w',
        b'R' => b'Y',
        b'Y' => b'R',
        b'K' => b'M',
        b'M' => b'K',
        b'B' => b'V',
        b'V' => b'B',
        b'D' => b'H',
        b'H' => b'D',
        b'S' => b'S',
        b'W' => b'W',

        // anything else just passes through
        // 'u' | 'U' => panic!("Does not support complements of U"),
        x => x,
    }
}

/// Taking in a sequence string, return the canonical form of the sequence
/// (e.g. the lexicographically lowest of either the original sequence or its
/// reverse complement).
///
/// Returns `Cow::Borrowed` when the original sequence is already canonical
/// (or equal to its reverse complement) and `Cow::Owned` of the reverse
/// complement otherwise.
pub fn canonical(seq: &[u8]) -> Cow<[u8]> {
    let mut buf: Vec<u8> = Vec::with_capacity(seq.len());
    // `enough` just keeps our comparisons from happening after they need to
    let mut enough = false;
    let mut original_was_canonical = false;

    // loop through the kmer and its reverse complement simultaneously
    for (rn, n) in seq.iter().rev().map(|n| complement(*n)).zip(seq.iter()) {
        buf.push(rn);
        if !enough && (*n < rn) {
            original_was_canonical = true;
            break;
        } else if !enough && (rn < *n) {
            enough = true;
        }
        // unstated else branch: if rn == n, keep comparing
    }
    match (original_was_canonical, enough) {
        (true, true) => panic!("Bug: should never set original_was_canonical if enough == true"),
        (true, false) => seq.into(),
        (false, true) => buf.into(),
        // the sequences were completely equal, return the ref
        (false, false) => seq.into(),
    }
}

/// Find the lexicographically smallest substring of `seq` of length `length`.
///
/// Both the forward sequence and its reverse complement are scanned (the
/// reverse-complement windows are visited in reverse order relative to the
/// forward windows, since the whole sequence is reverse-complemented first).
///
/// # Panics
///
/// Panics if `seq.len() < length` (the initial `&seq[..length]` slice is out
/// of bounds).
///
/// There's probably a faster algorithm for this somewhere...
pub fn minimizer(seq: &[u8], length: usize) -> Cow<[u8]> {
    let reverse_complement: Vec<u8> = seq.iter().rev().map(|n| complement(*n)).collect();
    let mut minmer = Cow::Borrowed(&seq[..length]);

    for (kmer, rc_kmer) in seq.windows(length).zip(reverse_complement.windows(length)) {
        if *kmer < minmer[..] {
            minmer = kmer.into();
        }
        // rc_kmer must be copied out (`to_vec`): it borrows the local
        // `reverse_complement` vector, which can't outlive this function
        if *rc_kmer < minmer[..] {
            minmer = rc_kmer.to_vec().into();
        }
    }
    minmer
}
//! Functions for splitting sequences into fixed-width moving windows (kmers)
//! and utilities for dealing with these kmers.
/// Returns true if the base is an unambiguous nucleic acid base
/// (i.e. A, C, G, or T in either case) and false otherwise.
///
/// Note: the diff residue here carried two signatures (`pub fn` and `fn`);
/// the public one is kept so existing external callers keep working.
pub fn is_good_base(chr: u8) -> bool {
    match chr as char {
        'a' | 'c' | 'g' | 't' | 'A' | 'C' | 'G' | 'T' => true,
        _ => false,
    }
}

/// Generic moving window iterator over sequences to return k-mers
///
/// Iterator returns slices to the original data.
pub struct Kmers<'a> {
    /// The window (kmer) length.
    k: u8,
    /// Offset of the current window's first byte within `buffer`.
    start_pos: usize,
    /// The sequence being windowed over; yielded kmers borrow from it.
    buffer: &'a [u8],
}

impl<'a> Kmers<'a> {
//! A kmer-izer for a nucleotide/amino acid sequence; returning slices to the original data
/// Creates a new kmer-izer for a nucleotide/amino acid sequence.
pub fn new(buffer: &'a [u8], k: u8) -> Self {
Kmers {
k,
Expand All @@ -131,18 +43,36 @@ impl<'a> Iterator for Kmers<'a> {
}
}

/// A kmer-izer for nucleic acid sequences that returns canonical kmers.
///
/// Iterator returns the position of the kmer, a slice to the original data,
/// and a boolean indicating if the kmer returned is the original or the
/// reverse complement.
pub struct CanonicalKmers<'a> {
    /// The window (kmer) length.
    k: u8,
    /// Offset of the current window's first byte within `buffer`.
    start_pos: usize,
    /// The forward sequence.
    buffer: &'a [u8],
    /// The reverse complement of `buffer`, supplied by the caller.
    rc_buffer: &'a [u8],
}

/// A kmer-izer for nucleic acid sequences to return canonical kmers.
/// Returns the position of the kmer, a slice to the original data, and
/// a boolean indicating if the kmer returned is the original or the reverse
/// complement.
impl<'a> CanonicalKmers<'a> {
/// Creates a new iterator.
///
/// It's generally more useful to use this directly from a sequences (e.g.
/// `seq.canonical_kmers`. Requires a reference to the reverse complement
/// of the sequence it's created on, e.g.
/// ```
/// use needletail::Sequence;
/// use needletail::kmer::CanonicalKmers;
///
/// let seq = b"ACGT";
/// let rc = seq.reverse_complement();
/// let c_iter = CanonicalKmers::new(seq, &rc, 3);
/// for (pos, kmer, canonical) in c_iter {
/// // process data in here
/// }
///
/// ```
pub fn new(buffer: &'a [u8], rc_buffer: &'a [u8], k: u8) -> Self {
let mut nucl_kmers = CanonicalKmers {
k,
Expand Down Expand Up @@ -202,31 +132,96 @@ impl<'a> Iterator for CanonicalKmers<'a> {
}
}

// Fixed: `#[cfg(tests)]` (note the `s`) matches no configuration, so the
// whole module was silently compiled out and none of these tests ever ran.
#[cfg(test)]
mod tests {
    use super::*;
    use std::borrow::Cow;

    /// Build the reverse complement of a sequence using this module's own
    /// `complement`, so the tests don't depend on the `Sequence` trait.
    fn revcomp(seq: &[u8]) -> Vec<u8> {
        seq.iter().rev().map(|n| complement(*n)).collect()
    }

    #[test]
    fn test_complement() {
        assert_eq!(complement(b'a'), b't');
        assert_eq!(complement(b'c'), b'g');
        assert_eq!(complement(b'g'), b'c');
        assert_eq!(complement(b'n'), b'n');
    }

    #[test]
    fn can_kmerize() {
        // test general function
        let k_iter = Kmers::new(b"AGCT", 1);
        for (i, k) in k_iter.enumerate() {
            match i {
                0 => assert_eq!(k, &b"A"[..]),
                1 => assert_eq!(k, &b"G"[..]),
                2 => assert_eq!(k, &b"C"[..]),
                3 => assert_eq!(k, &b"T"[..]),
                _ => unreachable!("Too many kmers"),
            }
        }

        // test that we handle length 2 (and don't drop Ns)
        // NOTE(review): input corrected to b"ACNGT" — the expected windows
        // (AC, CN, NG, GT) only line up with that sequence, not b"AGNCT".
        let k_iter = Kmers::new(b"ACNGT", 2);
        for (i, k) in k_iter.enumerate() {
            match i {
                0 => assert_eq!(k, &b"AC"[..]),
                1 => assert_eq!(k, &b"CN"[..]),
                2 => assert_eq!(k, &b"NG"[..]),
                3 => assert_eq!(k, &b"GT"[..]),
                _ => unreachable!("Too many kmers"),
            }
        }

        // test that the minimum length works
        let k_iter = Kmers::new(b"AC", 2);
        for k in k_iter {
            assert_eq!(k, &b"AC"[..]);
        }
    }

    #[test]
    fn can_canonicalize() {
        // `canonical` on whole sequences
        assert!(canonical(b"A") == Cow::Borrowed(b"A"));
        assert!(canonical(b"T") == Cow::Owned::<[u8]>(b"A".to_vec()));
        assert!(canonical(b"AAGT") == Cow::Borrowed(b"AAGT"));
        assert!(canonical(b"ACTT") == Cow::Owned::<[u8]>(b"AAGT".to_vec()));
        assert!(canonical(b"GC") == Cow::Borrowed(b"GC"));

        // canonical kmer iteration. Fixed: `CanonicalKmers` is not a tuple
        // struct — it must be constructed with `CanonicalKmers::new`.
        let seq = b"AGCT";
        let rc = revcomp(seq);
        let c_iter = CanonicalKmers::new(seq, &rc, 1);
        for (i, (_, k, is_c)) in c_iter.enumerate() {
            match i {
                0 => {
                    assert_eq!(k, &b"A"[..]);
                    assert_eq!(is_c, false);
                }
                1 => {
                    assert_eq!(k, &b"C"[..]);
                    assert_eq!(is_c, true);
                }
                2 => {
                    assert_eq!(k, &b"C"[..]);
                    assert_eq!(is_c, false);
                }
                3 => {
                    assert_eq!(k, &b"A"[..]);
                    assert_eq!(is_c, true);
                }
                _ => unreachable!("Too many kmers"),
            }
        }

        let seq = b"AGCTA";
        let rc = revcomp(seq);
        let c_iter = CanonicalKmers::new(seq, &rc, 2);
        for (i, (_, k, _)) in c_iter.enumerate() {
            match i {
                0 => assert_eq!(k, &b"AG"[..]),
                1 => assert_eq!(k, &b"GC"[..]),
                2 => assert_eq!(k, &b"AG"[..]),
                3 => assert_eq!(k, &b"TA"[..]),
                _ => unreachable!("Too many kmers"),
            }
        }

        // kmers overlapping an N are skipped, but the reported positions are
        // still relative to the original sequence
        let seq = b"AGNTA";
        let rc = revcomp(seq);
        let c_iter = CanonicalKmers::new(seq, &rc, 2);
        for (i, (ix, k, _)) in c_iter.enumerate() {
            match i {
                0 => {
                    assert_eq!(ix, 0);
                    assert_eq!(k, &b"AG"[..]);
                }
                1 => {
                    assert_eq!(ix, 3);
                    assert_eq!(k, &b"TA"[..]);
                }
                _ => unreachable!("Too many kmers"),
            }
        }
    }

    #[test]
    fn can_minimize() {
        let minmer = minimizer(&b"ATTTCG"[..], 3);
        assert_eq!(&minmer[..], b"AAA");
    }
}
Loading

0 comments on commit 6e92dc9

Please sign in to comment.