From 4aef928bee0d83c8df6bdf52e838399818e12444 Mon Sep 17 00:00:00 2001 From: Roderick Bovee Date: Tue, 27 Aug 2019 13:59:20 -0700 Subject: [PATCH] Refactor format code in module, relax UTF8 id requirement, and add writers. Closes #29, closes #13 --- benches/benchmark.rs | 14 +- src/formats/fasta.rs | 84 ++++++++ src/formats/fastq.rs | 142 +++++++++++++ src/{fastx.rs => formats/mod.rs} | 350 ++++++++----------------------- src/lib.rs | 4 +- src/seq.rs | 48 +++-- src/util.rs | 4 +- 7 files changed, 354 insertions(+), 292 deletions(-) create mode 100644 src/formats/fasta.rs create mode 100644 src/formats/fastq.rs rename src/{fastx.rs => formats/mod.rs} (62%) diff --git a/benches/benchmark.rs b/benches/benchmark.rs index 4d6ba5d..0ce6861 100644 --- a/benches/benchmark.rs +++ b/benches/benchmark.rs @@ -3,7 +3,7 @@ extern crate bencher; extern crate needletail; use bencher::Bencher; -use needletail::fastx; +use needletail::parse_sequences; use std::fs::File; use std::io::{Cursor, Read}; @@ -18,7 +18,7 @@ fn bench_kmer_speed(bench: &mut Bencher) { let mut n_total = 0; let mut n_canonical = 0; let file = File::open(filename).unwrap(); - fastx::parse_sequences( + parse_sequences( file, |_| {}, |seq| { @@ -44,7 +44,7 @@ fn bench_bitkmer_speed(bench: &mut Bencher) { let mut n_total = 0; let mut n_canonical = 0; let file = File::open(filename).unwrap(); - fastx::parse_sequences( + parse_sequences( file, |_| {}, |seq| { @@ -71,7 +71,7 @@ fn bench_fastq_bytes(bench: &mut Bencher) { bench.iter(|| { let mut n_bases = 0; - fastx::parse_sequences( + parse_sequences( Cursor::new(&data), |_| {}, |seq| { @@ -90,7 +90,7 @@ fn bench_fastq_file(bench: &mut Bencher) { // fastx::fastx_file(&filename[..], |seq| { assert!(seq.1.len() > 0) }).unwrap(); bench.iter(|| { let mut n_bases = 0; - fastx::parse_sequences( + parse_sequences( File::open(filename).unwrap(), |_| {}, |seq| { @@ -111,7 +111,7 @@ fn bench_fasta_bytes(bench: &mut Bencher) { bench.iter(|| { let mut n_bases = 0; - fastx::parse_sequences( + parse_sequences( Cursor::new(&data), |_| {}, |seq| { @@ -130,7 +130,7 @@ fn bench_fasta_file(bench: &mut Bencher) { // fastx::fastx_file(&filename[..], |seq| { assert!(seq.1.len() > 0) }).unwrap(); bench.iter(|| { let mut n_bases = 0; - fastx::parse_sequences( + parse_sequences( File::open(filename).unwrap(), |_| {}, |seq| { diff --git a/src/formats/fasta.rs b/src/formats/fasta.rs new file mode 100644 index 0000000..17436d9 --- /dev/null +++ b/src/formats/fasta.rs @@ -0,0 +1,84 @@ +use std::io::Write; + +use memchr::memchr; + +use crate::buffer::RecBuffer; +use crate::seq::Sequence; +use crate::util::{memchr_both, strip_whitespace, ParseError, ParseErrorType}; + +#[derive(Debug)] +pub struct FASTA<'a> { + pub id: &'a [u8], + pub seq: &'a [u8], +} + +impl<'a> FASTA<'a> { + pub fn write(&self, mut writer: W) -> Result<(), ParseError> where W: Write { + writer.write(b">")?; + writer.write(&self.id)?; + writer.write(b"\n")?; + writer.write(&self.seq)?; + writer.write(b"\n")?; + Ok(()) + } +} + +impl<'a> Iterator for RecBuffer<'a, FASTA<'static>> { + type Item = Result, ParseError>; + + fn next(&mut self) -> Option { + let buf = &self.buf[self.pos..]; + if buf.is_empty() { + return None; + } + + let id_end; + match memchr(b'\n', &buf) { + Some(i) => id_end = i + 1, + None => return None, + }; + let mut id = &buf[1..id_end - 1]; + if !id.is_empty() && id[id.len() - 1] == b'\r' { + id = &id[..id.len() - 1]; + } + + let seq_end; + match (memchr_both(b'\n', b'>', &buf[id_end..]), self.last) { + (Some(i), _) => seq_end = id_end + i + 1, + (None, true) => seq_end = buf.len(), + (None, false) => return None, + }; + if id_end == seq_end { + let context = String::from_utf8_lossy(id); + return Some(Err(ParseError::new( + "Sequence completely empty", + ParseErrorType::PrematureEOF, + ) + .record(self.count + 1) + .context(context))); + } + let mut seq = &buf[id_end..seq_end]; + if seq[seq.len() - 1] == b'\r' { + seq = &seq[..seq.len()]; + } + + self.pos += seq_end; + self.count += 1; + Some(Ok(FASTA { id, seq })) + } +} + +impl<'a> From> for Sequence<'a> { + fn from(fasta: FASTA<'a>) -> Sequence<'a> { + Sequence::new(fasta.id, strip_whitespace(fasta.seq), None) + } +} + +impl<'a> From<&'a Sequence<'a>> for FASTA<'a> { + fn from(seq: &'a Sequence<'a>) -> FASTA<'a> { + FASTA { + id: &seq.id, + seq: &seq.seq, + } + } +} diff --git a/src/formats/fastq.rs b/src/formats/fastq.rs new file mode 100644 index 0000000..3f20167 --- /dev/null +++ b/src/formats/fastq.rs @@ -0,0 +1,142 @@ +use std::borrow::Cow; +use std::cmp::min; +use std::io::Write; + +use memchr::memchr; + +use crate::buffer::RecBuffer; +use crate::seq::Sequence; +use crate::util::{memchr_both, ParseError, ParseErrorType}; + + +#[derive(Debug)] +pub struct FASTQ<'a> { + pub id: &'a [u8], + pub seq: &'a [u8], + pub id2: &'a [u8], + pub qual: &'a [u8], +} + +impl<'a> FASTQ<'a> { + pub fn write(&self, mut writer: W) -> Result<(), ParseError> where W: Write { + writer.write(b"@")?; + writer.write(&self.id)?; + writer.write(b"\n")?; + writer.write(&self.seq)?; + writer.write(b"+\n")?; + if self.seq.len() != self.qual.len() { + writer.write(&vec![b'I'; self.seq.len()])?; + } else { + writer.write(&self.qual)?; + } + writer.write(b"\n")?; + Ok(()) + } +} + +impl<'a> Iterator for RecBuffer<'a, FASTQ<'a>> { + type Item = Result, ParseError>; + + fn next(&mut self) -> Option { + if self.pos >= self.buf.len() { + return None; + } + let buf = &self.buf[self.pos..]; + + if buf[0] != b'@' { + // sometimes there are extra returns at the end of a file so we shouldn't blow up + if buf[0] == b'\r' || buf[0] == b'\n' { + return None; + } else { + let context = String::from_utf8_lossy(&buf[0..min(16, buf.len())]); + let e = + ParseError::new("Record must start with '@'", ParseErrorType::InvalidHeader) + .record(self.count) + .context(context); + return Some(Err(e)); + } + } + + let id_end; + match memchr(b'\n', &buf) { + Some(i) => id_end = i + 1, + None => return None, + }; + let mut id = &buf[1..id_end - 1]; + + let seq_end; + match memchr_both(b'\n', b'+', &buf[id_end..]) { + Some(i) => seq_end = id_end + i + 1, + None => return None, + }; + let mut seq = &buf[id_end..seq_end - 1]; + + let id2_end; + match memchr(b'\n', &buf[seq_end..]) { + Some(i) => id2_end = seq_end + i + 1, + None => return None, + }; + let id2 = &buf[seq_end..id2_end - 1]; + + // we know the qual scores must be the same length as the sequence + // so we can just do some arithmatic instead of memchr'ing + let mut qual_end = id2_end + seq.len() + 1; + let mut buffer_used = qual_end; + if qual_end > buf.len() { + if !self.last { + // we need to pull more into the buffer + return None; + } + // now do some math to figure out if the file doesn't end with a newline + let windows_ending = if seq.last() == Some(&b'\r') { 1 } else { 0 }; + if qual_end != buf.len() + 1 + windows_ending { + return None; + } + buffer_used -= 1 + windows_ending; + qual_end -= windows_ending; + } + let mut qual = &buf[id2_end..qual_end - 1]; + + // clean up any extra '\r' from the id and seq + if !id.is_empty() && id[id.len() - 1] == b'\r' { + id = &id[..id.len() - 1]; + } + if !seq.is_empty() && seq[seq.len() - 1] == b'\r' { + seq = &seq[..seq.len() - 1]; + } + // we do qual separately in case this is the end of the file + if !qual.is_empty() && qual[qual.len() - 1] == b'\r' { + qual = &qual[..qual.len() - 1]; + } + + self.pos += buffer_used; + self.count += 1; + Some(Ok(FASTQ { id, seq, id2, qual })) + } +} + +impl<'a> From> for Sequence<'a> { + fn from(fastq: FASTQ<'a>) -> Sequence<'a> { + let qual = if fastq.seq.len() != fastq.qual.len() { + None + } else { + Some(fastq.qual) + }; + Sequence::new(fastq.id, Cow::from(fastq.seq), qual) + } +} + +impl<'a> From<&'a Sequence<'a>> for FASTQ<'a> { + fn from(seq: &'a Sequence<'a>) -> FASTQ<'a> { + let qual = match &seq.qual { + None => &b""[..], + Some(q) => &q, + }; + FASTQ { + id: &seq.id, + seq: &seq.seq, + id2: b"", + qual: qual, + } + } +} diff --git a/src/fastx.rs b/src/formats/mod.rs similarity index 62% rename from src/fastx.rs rename to src/formats/mod.rs index db606e3..b57d610 100644 --- a/src/fastx.rs +++ b/src/formats/mod.rs @@ -12,13 +12,13 @@ //! //! See: /~https://github.com/emk/rust-streaming -use std::borrow::Cow; +mod fasta; +mod fastq; + use std::cmp::min; use std::io::{Cursor, Read}; use std::str; -use memchr::memchr; - #[cfg(feature = "compression")] use bzip2::read::BzDecoder; #[cfg(feature = "compression")] @@ -26,188 +26,21 @@ use flate2::read::MultiGzDecoder; #[cfg(feature = "compression")] use xz2::read::XzDecoder; -use crate::buffer::{RecBuffer, RecReader}; -use crate::seq::SeqRecord; -use crate::util::{memchr_both, strip_whitespace, ParseError, ParseErrorType}; - -#[derive(Debug)] -struct FASTA<'a> { - id: &'a str, - seq: &'a [u8], -} - -#[derive(Debug)] -struct FASTQ<'a> { - id: &'a str, - seq: &'a [u8], - id2: &'a [u8], - qual: &'a [u8], -} - -impl<'a> Iterator for RecBuffer<'a, FASTA<'static>> { - type Item = Result, ParseError>; - - fn next(&mut self) -> Option { - let buf = &self.buf[self.pos..]; - if buf.is_empty() { - return None; - } - - let id_end; - match memchr(b'\n', &buf) { - Some(i) => id_end = i + 1, - None => return None, - }; - let mut raw_id = &buf[1..id_end - 1]; - if !raw_id.is_empty() && raw_id[raw_id.len() - 1] == b'\r' { - raw_id = &raw_id[..raw_id.len() - 1]; - } - let id; - match str::from_utf8(raw_id) { - Ok(i) => id = i, - Err(e) => { - let e = ParseError::from(e) - .record(self.count) - .context(String::from_utf8_lossy(raw_id)); - return Some(Err(e)); - }, - } - - let seq_end; - match (memchr_both(b'\n', b'>', &buf[id_end..]), self.last) { - (Some(i), _) => seq_end = id_end + i + 1, - (None, true) => seq_end = buf.len(), - (None, false) => return None, - }; - if id_end == seq_end { - let context = String::from_utf8_lossy(raw_id); - return Some(Err(ParseError::new( - "Sequence completely empty", - ParseErrorType::PrematureEOF, - ).record(self.count + 1).context(context))); - } - let mut seq = &buf[id_end..seq_end]; - if seq[seq.len() - 1] == b'\r' { - seq = &seq[..seq.len()]; - } - - self.pos += seq_end; - self.count += 1; - Some(Ok(FASTA { id, seq })) - } -} - -impl<'a> Iterator for RecBuffer<'a, FASTQ<'a>> { - type Item = Result, ParseError>; +use crate::buffer::RecReader; +pub use crate::formats::fasta::FASTA; +pub use crate::formats::fastq::FASTQ; +use crate::seq::Sequence; +use crate::util::{ParseError, ParseErrorType}; - fn next(&mut self) -> Option { - if self.pos >= self.buf.len() { - return None; - } - let buf = &self.buf[self.pos..]; - - if buf[0] != b'@' { - // sometimes there are extra returns at the end of a file so we shouldn't blow up - if buf[0] == b'\r' || buf[0] == b'\n' { - return None; - } else { - let context = String::from_utf8_lossy(&buf[0..min(16, buf.len())]); - let e = - ParseError::new("Record must start with '@'", ParseErrorType::InvalidHeader) - .record(self.count) - .context(context); - return Some(Err(e)); - } - } - - let id_end; - match memchr(b'\n', &buf) { - Some(i) => id_end = i + 1, - None => return None, - }; - let mut raw_id = &buf[1..id_end - 1]; - - let seq_end; - match memchr_both(b'\n', b'+', &buf[id_end..]) { - Some(i) => seq_end = id_end + i + 1, - None => return None, - }; - let mut seq = &buf[id_end..seq_end - 1]; - - let id2_end; - match memchr(b'\n', &buf[seq_end..]) { - Some(i) => id2_end = seq_end + i + 1, - None => return None, - }; - let id2 = &buf[seq_end..id2_end - 1]; - - // we know the qual scores must be the same length as the sequence - // so we can just do some arithmatic instead of memchr'ing - let mut qual_end = id2_end + seq.len() + 1; - let mut buffer_used = qual_end; - if qual_end > buf.len() { - if !self.last { - // we need to pull more into the buffer - return None; - } - // now do some math to figure out if the file doesn't end with a newline - let windows_ending = if seq.last() == Some(&b'\r') { 1 } else { 0 }; - if qual_end != buf.len() + 1 + windows_ending { - return None; - } - buffer_used -= 1 + windows_ending; - qual_end -= windows_ending; - } - let mut qual = &buf[id2_end..qual_end - 1]; - - // clean up any extra '\r' from the id and seq - if !raw_id.is_empty() && raw_id[raw_id.len() - 1] == b'\r' { - raw_id = &raw_id[..raw_id.len() - 1]; - } - if !seq.is_empty() && seq[seq.len() - 1] == b'\r' { - seq = &seq[..seq.len() - 1]; - } - // we do qual separately in case this is the end of the file - if !qual.is_empty() && qual[qual.len() - 1] == b'\r' { - qual = &qual[..qual.len() - 1]; - } - - let id; - match str::from_utf8(raw_id) { - Ok(i) => id = i, - Err(e) => { - let e = ParseError::from(e) - .record(self.count) - .context(String::from_utf8_lossy(raw_id)); - return Some(Err(e)); - }, - } - self.pos += buffer_used; - self.count += 1; - Some(Ok(FASTQ { id, seq, id2, qual })) - } -} - -impl<'a> From> for SeqRecord<'a> { - fn from(fasta: FASTA<'a>) -> SeqRecord<'a> { - SeqRecord::new(fasta.id, strip_whitespace(fasta.seq), None) - } -} - -impl<'a> From> for SeqRecord<'a> { - fn from(fastq: FASTQ<'a>) -> SeqRecord<'a> { - SeqRecord::new(fastq.id, Cow::from(fastq.seq), Some(fastq.qual)) - } -} /// Internal function abstracting over byte and file FASTX parsing -fn fastx_reader( +fn seq_reader( reader: &mut R, mut callback: F, type_callback: &mut T, ) -> Result<(), ParseError> where - F: for<'a> FnMut(SeqRecord<'a>) -> (), + F: for<'a> FnMut(Sequence<'a>) -> (), R: Read, T: ?Sized + FnMut(&'static str) -> (), { @@ -225,7 +58,7 @@ where b'>' => { let mut rec_buffer = rec_reader.get_buffer::(record_count); for s in rec_buffer.by_ref() { - callback(SeqRecord::from(s?)); + callback(Sequence::from(s?)); } record_count += rec_buffer.count; rec_buffer.pos @@ -233,16 +66,17 @@ where b'@' => { let mut rec_buffer = rec_reader.get_buffer::(record_count); for s in rec_buffer.by_ref() { - callback(SeqRecord::from(s?)); + callback(Sequence::from(s?)); } record_count += rec_buffer.count; rec_buffer.pos }, _ => { - return Err(ParseError::new( - "Bad starting byte", - ParseErrorType::InvalidHeader, - ).record(0).context(String::from_utf8_lossy(&first))) + return Err( + ParseError::new("Bad starting byte", ParseErrorType::InvalidHeader) + .record(0) + .context(String::from_utf8_lossy(&first)), + ) }, }; if rec_reader.refill(used)? { @@ -252,21 +86,21 @@ where // check if there's anything left stuff in the buffer (besides returns) let rec_buffer = rec_reader.get_buffer::(record_count); if !rec_buffer.last { - return Err(ParseError::new( - "File ended abruptly", - ParseErrorType::PrematureEOF, - ).record(record_count)); + return Err( + ParseError::new("File ended abruptly", ParseErrorType::PrematureEOF) + .record(record_count), + ); } for c in &rec_buffer.buf[rec_buffer.pos..] { if c != &b'\r' && c != &b'\n' { let end = min(rec_buffer.pos + 16, rec_buffer.buf.len()); - let context = String::from_utf8_lossy( - &rec_buffer.buf[rec_buffer.pos..end] - ); + let context = String::from_utf8_lossy(&rec_buffer.buf[rec_buffer.pos..end]); return Err(ParseError::new( "File had extra data past end of records", ParseErrorType::PrematureEOF, - ).record(record_count).context(context)); + ) + .record(record_count) + .context(context)); } } Ok(()) @@ -279,7 +113,7 @@ pub fn parse_sequences( callback: F, ) -> Result<(), ParseError> where - F: for<'a> FnMut(SeqRecord<'a>) -> (), + F: for<'a> FnMut(Sequence<'a>) -> (), R: Read, T: FnMut(&'static str) -> (), { @@ -295,7 +129,7 @@ pub fn parse_sequences( callback: F, ) -> Result<(), ParseError> where - F: for<'a> FnMut(SeqRecord<'a>) -> (), + F: for<'a> FnMut(Sequence<'a>) -> (), R: Read, T: FnMut(&'static str) -> (), { @@ -316,7 +150,7 @@ where let cursor = Cursor::new(vec![0x1F, 0x8B]); let mut gz_reader = MultiGzDecoder::new(cursor.chain(reader)); - fastx_reader(&mut gz_reader, callback, &mut type_callback) + seq_reader(&mut gz_reader, callback, &mut type_callback) } else if first[0] == 0x42 { // bz files reader.read_exact(&mut first)?; @@ -329,7 +163,7 @@ where let cursor = Cursor::new(vec![0x42, 0x5A]); let mut bz_reader = BzDecoder::new(cursor.chain(reader)); - fastx_reader(&mut bz_reader, callback, &mut type_callback) + seq_reader(&mut bz_reader, callback, &mut type_callback) } else if first[0] == 0xFD { // xz files reader.read_exact(&mut first)?; @@ -342,11 +176,11 @@ where let cursor = Cursor::new(vec![0xFD, 0x37]); let mut xz_reader = XzDecoder::new(cursor.chain(reader)); - fastx_reader(&mut xz_reader, callback, &mut type_callback) + seq_reader(&mut xz_reader, callback, &mut type_callback) } else { let cursor = Cursor::new(first); let mut reader = cursor.chain(reader); - fastx_reader(&mut reader, callback, &mut type_callback) + seq_reader(&mut reader, callback, &mut type_callback) } } @@ -357,7 +191,7 @@ mod test { use std::path::Path; use crate::buffer::{RecBuffer, RecReader}; - use crate::fastx::{parse_sequences, FASTA, FASTQ}; + use crate::formats::{parse_sequences, FASTA, FASTQ}; use crate::util::ParseErrorType; fn seq(s: &[u8]) -> Cursor<&[u8]> { @@ -373,13 +207,13 @@ mod test { |seq| { match i { 0 => { - assert_eq!(seq.id, "test"); - assert_eq!(&seq.seq[..], &b"AGCT"[..]); + assert_eq!(&seq.id[..], b"test"); + assert_eq!(&seq.seq[..], b"AGCT"); assert_eq!(seq.qual, None); }, 1 => { - assert_eq!(seq.id, "test2"); - assert_eq!(&seq.seq[..], &b"GATC"[..]); + assert_eq!(&seq.id[..], b"test2"); + assert_eq!(&seq.seq[..], b"GATC"); assert_eq!(seq.qual, None); }, _ => unreachable!("Too many records"), @@ -400,12 +234,12 @@ mod test { |seq| { match i { 0 => { - assert_eq!(seq.id, "test"); + assert_eq!(&seq.id[..], b"test"); assert_eq!(&seq.seq[..], b"AGCTGATCGA"); assert_eq!(seq.qual, None); }, 1 => { - assert_eq!(seq.id, "test2"); + assert_eq!(&seq.id[..], b"test2"); assert_eq!(&seq.seq[..], b"TAGC"); assert_eq!(seq.qual, None); }, @@ -453,12 +287,12 @@ mod test { |seq| { match i { 0 => { - assert_eq!(seq.id, "test"); + assert_eq!(&seq.id[..], b"test"); assert_eq!(&seq.seq[..], b"AGCTGATCGA"); assert_eq!(seq.qual, None); }, 1 => { - assert_eq!(seq.id, "test2"); + assert_eq!(&seq.id[..], b"test2"); assert_eq!(&seq.seq[..], b"TAGC"); assert_eq!(seq.qual, None); }, @@ -481,14 +315,14 @@ mod test { |seq| { match i { 0 => { - assert_eq!(seq.id, "test"); - assert_eq!(&seq.seq[..], &b"AGCT"[..]); - assert_eq!(&seq.qual.unwrap()[..], &b"~~a!"[..]); + assert_eq!(&seq.id[..], b"test"); + assert_eq!(&seq.seq[..], b"AGCT"); + assert_eq!(&seq.qual.unwrap()[..], b"~~a!"); }, 1 => { - assert_eq!(seq.id, "test2"); - assert_eq!(&seq.seq[..], &b"TGCA"[..]); - assert_eq!(&seq.qual.unwrap()[..], &b"WUI9"[..]); + assert_eq!(&seq.id[..], b"test2"); + assert_eq!(&seq.seq[..], b"TGCA"); + assert_eq!(&seq.qual.unwrap()[..], b"WUI9"); }, _ => unreachable!("Too many records"), } @@ -505,14 +339,14 @@ mod test { |seq| { match i { 0 => { - assert_eq!(seq.id, "test"); - assert_eq!(&seq.seq[..], &b"AGCT"[..]); - assert_eq!(&seq.qual.unwrap()[..], &b"~~a!"[..]); + assert_eq!(&seq.id[..], b"test"); + assert_eq!(&seq.seq[..], b"AGCT"); + assert_eq!(&seq.qual.unwrap()[..], b"~~a!"); }, 1 => { - assert_eq!(seq.id, "test2"); - assert_eq!(&seq.seq[..], &b"TGCA"[..]); - assert_eq!(&seq.qual.unwrap()[..], &b"WUI9"[..]); + assert_eq!(&seq.id[..], b"test2"); + assert_eq!(&seq.seq[..], b"TGCA"); + assert_eq!(&seq.qual.unwrap()[..], b"WUI9"); }, _ => unreachable!("Too many records"), } @@ -528,11 +362,7 @@ mod test { //! Check for the absence of a panic. The parser previously assumed //! if the ID ended with an `\r\n` then the sequence did also. //! (Discovered via fuzzing) - let res = parse_sequences( - seq(b"@\r\n\n+A\n@"), - |_| (), - |_seq| {}, - ); + let res = parse_sequences(seq(b"@\r\n\n+A\n@"), |_| (), |_seq| {}); assert_eq!(res, Ok(())); } @@ -545,13 +375,13 @@ mod test { |seq| { match i { 0 => { - assert_eq!(seq.id, "test"); - assert_eq!(&seq.seq[..], &b"AGCTTCG"[..]); + assert_eq!(&seq.id[..], b"test"); + assert_eq!(&seq.seq[..], b"AGCTTCG"); assert_eq!(seq.qual, None); }, 1 => { - assert_eq!(seq.id, "test2"); - assert_eq!(&seq.seq[..], &b"G"[..]); + assert_eq!(&seq.id[..], b"test2"); + assert_eq!(&seq.seq[..], b"G"); assert_eq!(seq.qual, None); }, _ => unreachable!("Too many records"), @@ -569,13 +399,13 @@ mod test { |seq| { match i { 0 => { - assert_eq!(seq.id, "test"); - assert_eq!(&seq.seq[..], &b"AGCTTCG"[..]); + assert_eq!(&seq.id[..], b"test"); + assert_eq!(&seq.seq[..], b"AGCTTCG"); assert_eq!(seq.qual, None); }, 1 => { - assert_eq!(seq.id, "test2"); - assert_eq!(&seq.seq[..], &b"G"[..]); + assert_eq!(&seq.id[..], b"test2"); + assert_eq!(&seq.seq[..], b"G"); assert_eq!(seq.qual, None); }, _ => unreachable!("Too many records"), @@ -596,8 +426,8 @@ mod test { |seq| { match i { 0 => { - assert_eq!(seq.id, "test"); - assert_eq!(&seq.seq[..], &b"AGCT"[..]); + assert_eq!(&seq.id[..], b"test"); + assert_eq!(&seq.seq[..], b"AGCT"); assert_eq!(seq.qual, None); }, _ => unreachable!("Too many records"), @@ -617,9 +447,9 @@ mod test { |seq| { match i { 0 => { - assert_eq!(seq.id, "test"); - assert_eq!(&seq.seq[..], &b"AGCT"[..]); - assert_eq!(&seq.qual.unwrap()[..], &b"~~a!"[..]); + assert_eq!(&seq.id[..], b"test"); + assert_eq!(&seq.seq[..], b"AGCT"); + assert_eq!(&seq.qual.unwrap()[..], b"~~a!"); }, _ => unreachable!("Too many records"), } @@ -639,9 +469,9 @@ mod test { |seq| { match i { 0 => { - assert_eq!(seq.id, "test"); - assert_eq!(&seq.seq[..], &b"AGCT"[..]); - assert_eq!(&seq.qual.unwrap()[..], &b"~~a!"[..]); + assert_eq!(&seq.id[..], b"test"); + assert_eq!(&seq.seq[..], b"AGCT"); + assert_eq!(&seq.qual.unwrap()[..], b"~~a!"); }, _ => unreachable!("Too many records"), } @@ -659,9 +489,9 @@ mod test { |seq| { match i { 0 => { - assert_eq!(seq.id, "test"); - assert_eq!(&seq.seq[..], &b"AGCT"[..]); - assert_eq!(&seq.qual.unwrap()[..], &b"~~a!"[..]); + assert_eq!(&seq.id[..], b"test"); + assert_eq!(&seq.seq[..], b"AGCT"); + assert_eq!(&seq.qual.unwrap()[..], b"~~a!"); }, _ => unreachable!("Too many records"), } @@ -681,8 +511,8 @@ mod test { |seq| { match i { 0 => { - assert_eq!(seq.id, "test"); - assert_eq!(&seq.seq[..], &b"ACGT"[..]); + assert_eq!(&seq.id[..], b"test"); + assert_eq!(&seq.seq[..], b"ACGT"); }, _ => unreachable!("Too many records"), } @@ -706,14 +536,14 @@ mod test { |seq| { match i { 0 => { - assert_eq!(seq.id, ""); - assert_eq!(&seq.seq[..], &b""[..]); - assert_eq!(&seq.qual.unwrap()[..], &b""[..]); + assert_eq!(&seq.id[..], b""); + assert_eq!(&seq.seq[..], b""); + assert_eq!(&seq.qual.unwrap()[..], b""); }, 1 => { - assert_eq!(seq.id, "test2"); - assert_eq!(&seq.seq[..], &b"TGCA"[..]); - assert_eq!(&seq.qual.unwrap()[..], &b"~~~~"[..]); + assert_eq!(&seq.id[..], b"test2"); + assert_eq!(&seq.seq[..], b"TGCA"); + assert_eq!(&seq.qual.unwrap()[..], b"~~~~"); }, _ => unreachable!("Too many records"), } @@ -730,13 +560,13 @@ mod test { |seq| { match i { 0 => { - assert_eq!(seq.id, ""); - assert_eq!(&seq.seq[..], &b""[..]); + assert_eq!(&seq.id[..], b""); + assert_eq!(&seq.seq[..], b""); assert_eq!(seq.qual, None); }, 1 => { - assert_eq!(seq.id, "shine"); - assert_eq!(&seq.seq[..], &b"AGGAGGU"[..]); + assert_eq!(&seq.id[..], b"shine"); + assert_eq!(&seq.seq[..], b"AGGAGGU"); assert_eq!(seq.qual, None); }, _ => unreachable!("Too many records"), @@ -754,13 +584,13 @@ mod test { |seq| { match i { 0 => { - assert_eq!(seq.id, ""); - assert_eq!(&seq.seq[..], &b""[..]); + assert_eq!(&seq.id[..], b""); + assert_eq!(&seq.seq[..], b""); assert_eq!(seq.qual, None); }, 1 => { - assert_eq!(seq.id, "shine"); - assert_eq!(&seq.seq[..], &b"AGGAGGU"[..]); + assert_eq!(&seq.id[..], b"shine"); + assert_eq!(&seq.seq[..], b"AGGAGGU"); assert_eq!(seq.qual, None); }, _ => unreachable!("Too many records"), @@ -776,7 +606,7 @@ mod test { fn test_buffer() { let mut buf: RecBuffer = RecBuffer::from_bytes(b">test\nACGT"); let rec = buf.next().unwrap().unwrap(); - assert_eq!(rec.id, "test", "Record has the right ID"); + assert_eq!(rec.id, b"test", "Record has the right ID"); assert_eq!(rec.seq, b"ACGT", "Record has the right sequence"); let mut buf: RecBuffer = RecBuffer::from_bytes(b">test"); @@ -809,7 +639,7 @@ mod test { // handled the buffer boundary let iterated_seq = rec_buffer.by_ref().next(); let seq = iterated_seq.unwrap(); - assert_eq!(seq.unwrap().id, "A"); + assert_eq!(seq.unwrap().id, b"A"); // but not another because the buffer's too short let iterated_seq = rec_buffer.by_ref().next(); diff --git a/src/lib.rs b/src/lib.rs index 1b85e51..4739e10 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,9 +1,9 @@ #![crate_name = "needletail"] pub mod bitkmer; mod buffer; -pub mod fastx; +pub mod formats; pub mod kmer; pub mod seq; mod util; -pub use fastx::parse_sequences; +pub use formats::parse_sequences; diff --git a/src/seq.rs b/src/seq.rs index 1946453..032a6cb 100644 --- a/src/seq.rs +++ b/src/seq.rs @@ -93,16 +93,16 @@ fn test_normalize() { /// A generic FASTX record that also abstracts over several logical operations /// that can be performed on nucleic acid sequences. #[derive(Clone, Debug)] -pub struct SeqRecord<'a> { - pub id: Cow<'a, str>, +pub struct Sequence<'a> { + pub id: Cow<'a, [u8]>, pub seq: Cow<'a, [u8]>, pub qual: Option>, rev_seq: Option>, } -impl<'a> SeqRecord<'a> { - pub fn new(id: &'a str, seq: Cow<'a, [u8]>, qual: Option<&'a [u8]>) -> Self { - SeqRecord { +impl<'a> Sequence<'a> { + pub fn new(id: &'a [u8], seq: Cow<'a, [u8]>, qual: Option<&'a [u8]>) -> Self { + Sequence { id: id.into(), seq, qual: qual.map(Cow::Borrowed), @@ -111,8 +111,8 @@ impl<'a> SeqRecord<'a> { } pub fn from_bytes(seq: &'a [u8]) -> Self { - SeqRecord { - id: "".into(), + Sequence { + id: b""[..].into(), seq: seq.into(), qual: None, rev_seq: None, @@ -136,7 +136,7 @@ impl<'a> SeqRecord<'a> { .zip(qual.iter()) .map(|(base, qual)| if *qual < score { b'N' } else { *base }) .collect(); - SeqRecord { + Sequence { id: self.id, seq, qual: Some(Cow::Owned(qual)), @@ -157,8 +157,14 @@ impl<'a> SeqRecord<'a> { /// /// Returns `true` if the header was masked pub fn mask_header(mut self) -> Self { - if memchr(b'\t', self.id.as_ref().as_bytes()).is_some() { - self.id = self.id.as_ref().replace("\t", "|").into(); + if memchr(b'\t', self.id.as_ref()).is_some() { + self.id = self.id.iter().map(|x| { + if *x == b'\t' { + b'|' + } else { + *x + } + }).collect(); } self } @@ -185,8 +191,8 @@ impl<'a> SeqRecord<'a> { /// Construct an owned version of `self` to, e.g. pass across threads /// (it's not clear why this can't be the `impl for Clone`, but the /// 'static lifetime doesn't work there for some reason) - pub fn into_owned(self) -> SeqRecord<'static> { - SeqRecord { + pub fn into_owned(self) -> Sequence<'static> { + Sequence { id: Cow::Owned(self.id.clone().into_owned()), seq: Cow::Owned(self.seq.clone().into_owned()), qual: self.qual.clone().map(Cow::into_owned).map(Cow::Owned), @@ -197,8 +203,8 @@ impl<'a> SeqRecord<'a> { #[test] fn test_quality_mask() { - let seq_rec = SeqRecord { - id: "".into(), + let seq_rec = Sequence { + id: b""[..].into(), // seq: Cow::Borrowed(&b"AGCT"[..]), seq: b"AGCT"[..].into(), qual: Some(b"AAA0"[..].into()), @@ -212,7 +218,7 @@ fn test_quality_mask() { fn can_kmerize() { // test general function let mut i = 0; - for (_, k, _) in SeqRecord::from_bytes(b"AGCT").kmers(1, false) { + for (_, k, _) in Sequence::from_bytes(b"AGCT").kmers(1, false) { match i { 0 => assert_eq!(k, &b"A"[..]), 1 => assert_eq!(k, &b"G"[..]), @@ -225,7 +231,7 @@ fn can_kmerize() { // test that we skip over N's i = 0; - for (_, k, _) in SeqRecord::from_bytes(b"ACNGT").kmers(2, false) { + for (_, k, _) in Sequence::from_bytes(b"ACNGT").kmers(2, false) { match i { 0 => assert_eq!(k, &b"AC"[..]), 1 => assert_eq!(k, &b"GT"[..]), @@ -236,7 +242,7 @@ fn can_kmerize() { // test that we skip over N's and handle short kmers i = 0; - for (ix, k, _) in SeqRecord::from_bytes(b"ACNG").kmers(2, false) { + for (ix, k, _) in Sequence::from_bytes(b"ACNG").kmers(2, false) { match i { 0 => { assert_eq!(ix, 0); @@ -248,7 +254,7 @@ fn can_kmerize() { } // test that the minimum length works - for (_, k, _) in SeqRecord::from_bytes(b"AC").kmers(2, false) { + for (_, k, _) in Sequence::from_bytes(b"AC").kmers(2, false) { assert_eq!(k, &b"AC"[..]); } } @@ -257,7 +263,7 @@ fn can_kmerize() { fn can_canonicalize() { // test general function let mut i = 0; - for (_, k, is_c) in SeqRecord::from_bytes(b"AGCT").kmers(1, true) { + for (_, k, is_c) in Sequence::from_bytes(b"AGCT").kmers(1, true) { match i { 0 => { assert_eq!(k, &b"A"[..]); @@ -281,7 +287,7 @@ fn can_canonicalize() { } let mut i = 0; - for (_, k, _) in SeqRecord::from_bytes(b"AGCTA").kmers(2, true) { + for (_, k, _) in Sequence::from_bytes(b"AGCTA").kmers(2, true) { match i { 0 => assert_eq!(k, &b"AG"[..]), 1 => assert_eq!(k, &b"GC"[..]), @@ -293,7 +299,7 @@ fn can_canonicalize() { } let mut i = 0; - for (ix, k, _) in SeqRecord::from_bytes(b"AGNTA").kmers(2, true) { + for (ix, k, _) in Sequence::from_bytes(b"AGNTA").kmers(2, true) { match i { 0 => { assert_eq!(ix, 0); diff --git a/src/util.rs b/src/util.rs index a38b297..5608c54 100644 --- a/src/util.rs +++ b/src/util.rs @@ -44,9 +44,9 @@ impl ParseError { pub fn context(mut self, context: S) -> Self where - S: Into, + S: ToString, { - self.context = context.into(); + self.context = context.to_string(); self } }