Skip to content

Commit

Permalink
Refactor format code in module, relax UTF8 id requirement, and add wr…
Browse files Browse the repository at this point in the history
…iters. Closes #29, closes #13
  • Loading branch information
Roderick Bovee committed Aug 27, 2019
1 parent 942a760 commit 4aef928
Show file tree
Hide file tree
Showing 7 changed files with 354 additions and 292 deletions.
14 changes: 7 additions & 7 deletions benches/benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ extern crate bencher;
extern crate needletail;

use bencher::Bencher;
use needletail::fastx;
use needletail::parse_sequences;
use std::fs::File;
use std::io::{Cursor, Read};

Expand All @@ -18,7 +18,7 @@ fn bench_kmer_speed(bench: &mut Bencher) {
let mut n_total = 0;
let mut n_canonical = 0;
let file = File::open(filename).unwrap();
fastx::parse_sequences(
parse_sequences(
file,
|_| {},
|seq| {
Expand All @@ -44,7 +44,7 @@ fn bench_bitkmer_speed(bench: &mut Bencher) {
let mut n_total = 0;
let mut n_canonical = 0;
let file = File::open(filename).unwrap();
fastx::parse_sequences(
parse_sequences(
file,
|_| {},
|seq| {
Expand All @@ -71,7 +71,7 @@ fn bench_fastq_bytes(bench: &mut Bencher) {

bench.iter(|| {
let mut n_bases = 0;
fastx::parse_sequences(
parse_sequences(
Cursor::new(&data),
|_| {},
|seq| {
Expand All @@ -90,7 +90,7 @@ fn bench_fastq_file(bench: &mut Bencher) {
// fastx::fastx_file(&filename[..], |seq| { assert!(seq.1.len() > 0) }).unwrap();
bench.iter(|| {
let mut n_bases = 0;
fastx::parse_sequences(
parse_sequences(
File::open(filename).unwrap(),
|_| {},
|seq| {
Expand All @@ -111,7 +111,7 @@ fn bench_fasta_bytes(bench: &mut Bencher) {

bench.iter(|| {
let mut n_bases = 0;
fastx::parse_sequences(
parse_sequences(
Cursor::new(&data),
|_| {},
|seq| {
Expand All @@ -130,7 +130,7 @@ fn bench_fasta_file(bench: &mut Bencher) {
// fastx::fastx_file(&filename[..], |seq| { assert!(seq.1.len() > 0) }).unwrap();
bench.iter(|| {
let mut n_bases = 0;
fastx::parse_sequences(
parse_sequences(
File::open(filename).unwrap(),
|_| {},
|seq| {
Expand Down
84 changes: 84 additions & 0 deletions src/formats/fasta.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
use std::io::Write;

use memchr::memchr;

use crate::buffer::RecBuffer;
use crate::seq::Sequence;
use crate::util::{memchr_both, strip_whitespace, ParseError, ParseErrorType};

/// A single FASTA record whose fields borrow from an underlying byte buffer.
#[derive(Debug)]
pub struct FASTA<'a> {
    /// Header bytes, written after the leading `>` marker.
    pub id: &'a [u8],
    /// Sequence bytes, written verbatim on the line(s) after the header.
    pub seq: &'a [u8],
}

impl<'a> FASTA<'a> {
    /// Serializes the record as `>` + `id` + `\n` + `seq` + `\n`.
    ///
    /// `seq` is emitted exactly as stored; no line wrapping is applied.
    ///
    /// # Errors
    ///
    /// Any I/O error reported by `writer` is converted into a `ParseError`
    /// and returned.
    pub fn write<W>(&self, mut writer: W) -> Result<(), ParseError>
    where
        W: Write,
    {
        // `write_all` is required here: plain `write` may accept only part of
        // the slice, and ignoring its return value would truncate the record.
        writer.write_all(b">")?;
        writer.write_all(self.id)?;
        writer.write_all(b"\n")?;
        writer.write_all(self.seq)?;
        writer.write_all(b"\n")?;
        Ok(())
    }
}

/// Yields FASTA records parsed from the bytes currently held by the buffer.
///
/// Every call parses one record starting at `self.pos`. `None` is returned
/// when no bytes remain or when the record at `self.pos` cannot be delimited
/// with the bytes available; `self.pos` and `self.count` are only advanced
/// when a record is successfully returned.
impl<'a> Iterator for RecBuffer<'a, FASTA<'static>> {
    type Item = Result<FASTA<'a>, ParseError>;

    fn next(&mut self) -> Option<Self::Item> {
        let buf = &self.buf[self.pos..];
        if buf.is_empty() {
            return None;
        }

        // The header line runs up to and including the first '\n'.
        let id_end = memchr(b'\n', buf)? + 1;
        // Drop the first byte (the record marker) and the trailing '\n'.
        // NOTE(review): the first byte is not checked to be '>', and a buffer
        // that begins with a bare '\n' makes this range inverted (panic);
        // presumably callers only hand over data that starts with '>' — confirm.
        let mut id = &buf[1..id_end - 1];
        if id.last() == Some(&b'\r') {
            id = &id[..id.len() - 1];
        }

        // `memchr_both` presumably reports the offset of a '\n' that is
        // directly followed by '>' (the start of the next record). Without
        // such a boundary, the rest of the buffer is the sequence only if
        // `self.last` is set; otherwise the record cannot be delimited yet.
        let seq_end = match (memchr_both(b'\n', b'>', &buf[id_end..]), self.last) {
            (Some(i), _) => id_end + i + 1,
            (None, true) => buf.len(),
            (None, false) => return None,
        };
        if id_end == seq_end {
            let context = String::from_utf8_lossy(id);
            return Some(Err(ParseError::new(
                "Sequence completely empty",
                ParseErrorType::PrematureEOF,
            )
            .record(self.count + 1)
            .context(context)));
        }

        // `seq` is non-empty here because `id_end != seq_end`.
        let mut seq = &buf[id_end..seq_end];
        if seq.last() == Some(&b'\r') {
            // Fixed: the previous slice `..seq.len()` kept the '\r' in place.
            seq = &seq[..seq.len() - 1];
        }

        self.pos += seq_end;
        self.count += 1;
        Some(Ok(FASTA { id, seq }))
    }
}

impl<'a> From<FASTA<'a>> for Sequence<'a> {
fn from(fasta: FASTA<'a>) -> Sequence<'a> {
Sequence::new(fasta.id, strip_whitespace(fasta.seq), None)
}
}

/// Builds a FASTA view that borrows the id and sequence bytes of a `Sequence`.
impl<'a> From<&'a Sequence<'a>> for FASTA<'a> {
    fn from(seq: &'a Sequence<'a>) -> FASTA<'a> {
        let id: &'a [u8] = &seq.id;
        let bases: &'a [u8] = &seq.seq;
        FASTA { id, seq: bases }
    }
}
142 changes: 142 additions & 0 deletions src/formats/fastq.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
use std::borrow::Cow;
use std::cmp::min;
use std::io::Write;

use memchr::memchr;

use crate::buffer::RecBuffer;
use crate::seq::Sequence;
use crate::util::{memchr_both, ParseError, ParseErrorType};


/// A single FASTQ record whose fields borrow from an underlying byte buffer.
#[derive(Debug)]
pub struct FASTQ<'a> {
    /// Header bytes, written after the leading `@` marker.
    pub id: &'a [u8],
    /// Sequence bytes.
    pub seq: &'a [u8],
    /// Contents of the separator line; `write` does not emit it and always
    /// writes a bare `+`.
    pub id2: &'a [u8],
    /// Quality bytes, expected to be as long as `seq`.
    pub qual: &'a [u8],
}

impl<'a> FASTQ<'a> {
    /// Serializes the record as four lines: `@id`, `seq`, `+`, and quality.
    ///
    /// When `qual` and `seq` differ in length, the quality line is filled with
    /// one `I` per sequence byte so that the output stays well formed.
    ///
    /// # Errors
    ///
    /// Any I/O error reported by `writer` is converted into a `ParseError`
    /// and returned.
    pub fn write<W>(&self, mut writer: W) -> Result<(), ParseError>
    where
        W: Write,
    {
        // `write_all` is required here: plain `write` may accept only part of
        // the slice, and ignoring its return value would truncate the record.
        writer.write_all(b"@")?;
        writer.write_all(self.id)?;
        writer.write_all(b"\n")?;
        writer.write_all(self.seq)?;
        // The separator must start its own line, so the sequence line is
        // terminated before the '+'.
        writer.write_all(b"\n+\n")?;
        if self.seq.len() != self.qual.len() {
            writer.write_all(&vec![b'I'; self.seq.len()])?;
        } else {
            writer.write_all(self.qual)?;
        }
        writer.write_all(b"\n")?;
        Ok(())
    }
}

/// Yields FASTQ records parsed from the bytes currently held by the buffer.
///
/// Every call parses one four-line record starting at `self.pos`. `None` is
/// returned when no bytes remain, when the next byte is a stray line ending,
/// or when the record cannot be completed with the bytes available.
impl<'a> Iterator for RecBuffer<'a, FASTQ<'a>> {
    type Item = Result<FASTQ<'a>, ParseError>;

    fn next(&mut self) -> Option<Self::Item> {
        if self.pos >= self.buf.len() {
            return None;
        }
        let data = &self.buf[self.pos..];

        match data[0] {
            b'@' => {}
            // Files sometimes end with extra line endings; treat them as the
            // end of input rather than as an error.
            b'\r' | b'\n' => return None,
            _ => {
                let snippet = String::from_utf8_lossy(&data[0..min(16, data.len())]);
                let err =
                    ParseError::new("Record must start with '@'", ParseErrorType::InvalidHeader)
                        .record(self.count)
                        .context(snippet);
                return Some(Err(err));
            }
        }

        // Line 1: header, minus the '@' marker and the trailing '\n'.
        let header_stop = memchr(b'\n', data)? + 1;
        let mut id = &data[1..header_stop - 1];

        // Line 2: sequence, ending at the '\n' located by `memchr_both`.
        let seq_stop = header_stop + memchr_both(b'\n', b'+', &data[header_stop..])? + 1;
        let mut seq = &data[header_stop..seq_stop - 1];

        // Line 3: separator line, minus its trailing '\n'.
        let sep_stop = seq_stop + memchr(b'\n', &data[seq_stop..])? + 1;
        let id2 = &data[seq_stop..sep_stop - 1];

        // Line 4: the quality string has to be as long as the sequence line,
        // so its end is computed with arithmetic instead of another search.
        let mut qual_stop = sep_stop + seq.len() + 1;
        let mut consumed = qual_stop;
        if qual_stop > data.len() {
            if !self.last {
                // More data has to be pulled into the buffer first.
                return None;
            }
            // Work out whether the input simply lacks a final line ending.
            let cr_len = match seq.last() {
                Some(&b'\r') => 1,
                _ => 0,
            };
            if qual_stop != data.len() + 1 + cr_len {
                return None;
            }
            consumed -= 1 + cr_len;
            qual_stop -= cr_len;
        }
        let mut qual = &data[sep_stop..qual_stop - 1];

        // Trim a trailing '\r' from the header and the sequence.
        if id.last() == Some(&b'\r') {
            id = &id[..id.len() - 1];
        }
        if seq.last() == Some(&b'\r') {
            seq = &seq[..seq.len() - 1];
        }
        // Quality is trimmed on its own because it may sit at the very end of
        // the input.
        if qual.last() == Some(&b'\r') {
            qual = &qual[..qual.len() - 1];
        }

        self.pos += consumed;
        self.count += 1;
        Some(Ok(FASTQ { id, seq, id2, qual }))
    }
}

/// Converts a FASTQ record into a `Sequence`.
///
/// Quality scores are kept only when there is exactly one per sequence byte.
impl<'a> From<FASTQ<'a>> for Sequence<'a> {
    fn from(fastq: FASTQ<'a>) -> Sequence<'a> {
        let lengths_agree = fastq.seq.len() == fastq.qual.len();
        let qual = if lengths_agree {
            Some(fastq.qual)
        } else {
            None
        };
        let bases = Cow::Borrowed(fastq.seq);
        Sequence::new(fastq.id, bases, qual)
    }
}

impl<'a> From<&'a Sequence<'a>> for FASTQ<'a> {
fn from(seq: &'a Sequence<'a>) -> FASTQ<'a> {
let qual = match &seq.qual {
None => &b""[..],
Some(q) => &q,
};
FASTQ {
id: &seq.id,
seq: &seq.seq,
id2: b"",
qual: qual,
}
}
}
Loading

0 comments on commit 4aef928

Please sign in to comment.