-
Notifications
You must be signed in to change notification settings - Fork 25
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Refactor format code in module, relax UTF8 id requirement, and add wr…
- Loading branch information
Roderick Bovee
committed
Aug 27, 2019
1 parent
942a760
commit 4aef928
Showing
7 changed files
with
354 additions
and
292 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
use std::io::Write; | ||
|
||
use memchr::memchr; | ||
|
||
use crate::buffer::RecBuffer; | ||
use crate::seq::Sequence; | ||
use crate::util::{memchr_both, strip_whitespace, ParseError, ParseErrorType}; | ||
|
||
#[derive(Debug)] | ||
pub struct FASTA<'a> { | ||
pub id: &'a [u8], | ||
pub seq: &'a [u8], | ||
} | ||
|
||
impl<'a> FASTA<'a> { | ||
pub fn write<W>(&self, mut writer: W) -> Result<(), ParseError> where W: Write { | ||
writer.write(b">")?; | ||
writer.write(&self.id)?; | ||
writer.write(b"\n")?; | ||
writer.write(&self.seq)?; | ||
writer.write(b"\n")?; | ||
Ok(()) | ||
} | ||
} | ||
|
||
impl<'a> Iterator for RecBuffer<'a, FASTA<'static>> { | ||
type Item = Result<FASTA<'a>, ParseError>; | ||
|
||
fn next(&mut self) -> Option<Self::Item> { | ||
let buf = &self.buf[self.pos..]; | ||
if buf.is_empty() { | ||
return None; | ||
} | ||
|
||
let id_end; | ||
match memchr(b'\n', &buf) { | ||
Some(i) => id_end = i + 1, | ||
None => return None, | ||
}; | ||
let mut id = &buf[1..id_end - 1]; | ||
if !id.is_empty() && id[id.len() - 1] == b'\r' { | ||
id = &id[..id.len() - 1]; | ||
} | ||
|
||
let seq_end; | ||
match (memchr_both(b'\n', b'>', &buf[id_end..]), self.last) { | ||
(Some(i), _) => seq_end = id_end + i + 1, | ||
(None, true) => seq_end = buf.len(), | ||
(None, false) => return None, | ||
}; | ||
if id_end == seq_end { | ||
let context = String::from_utf8_lossy(id); | ||
return Some(Err(ParseError::new( | ||
"Sequence completely empty", | ||
ParseErrorType::PrematureEOF, | ||
) | ||
.record(self.count + 1) | ||
.context(context))); | ||
} | ||
let mut seq = &buf[id_end..seq_end]; | ||
if seq[seq.len() - 1] == b'\r' { | ||
seq = &seq[..seq.len()]; | ||
} | ||
|
||
self.pos += seq_end; | ||
self.count += 1; | ||
Some(Ok(FASTA { id, seq })) | ||
} | ||
} | ||
|
||
impl<'a> From<FASTA<'a>> for Sequence<'a> { | ||
fn from(fasta: FASTA<'a>) -> Sequence<'a> { | ||
Sequence::new(fasta.id, strip_whitespace(fasta.seq), None) | ||
} | ||
} | ||
|
||
impl<'a> From<&'a Sequence<'a>> for FASTA<'a> { | ||
fn from(seq: &'a Sequence<'a>) -> FASTA<'a> { | ||
FASTA { | ||
id: &seq.id, | ||
seq: &seq.seq, | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
use std::borrow::Cow; | ||
use std::cmp::min; | ||
use std::io::Write; | ||
|
||
use memchr::memchr; | ||
|
||
use crate::buffer::RecBuffer; | ||
use crate::seq::Sequence; | ||
use crate::util::{memchr_both, ParseError, ParseErrorType}; | ||
|
||
|
||
#[derive(Debug)] | ||
pub struct FASTQ<'a> { | ||
pub id: &'a [u8], | ||
pub seq: &'a [u8], | ||
pub id2: &'a [u8], | ||
pub qual: &'a [u8], | ||
} | ||
|
||
impl<'a> FASTQ<'a> { | ||
pub fn write<W>(&self, mut writer: W) -> Result<(), ParseError> where W: Write { | ||
writer.write(b"@")?; | ||
writer.write(&self.id)?; | ||
writer.write(b"\n")?; | ||
writer.write(&self.seq)?; | ||
writer.write(b"+\n")?; | ||
if self.seq.len() != self.qual.len() { | ||
writer.write(&vec![b'I'; self.seq.len()])?; | ||
} else { | ||
writer.write(&self.qual)?; | ||
} | ||
writer.write(b"\n")?; | ||
Ok(()) | ||
} | ||
} | ||
|
||
impl<'a> Iterator for RecBuffer<'a, FASTQ<'a>> { | ||
type Item = Result<FASTQ<'a>, ParseError>; | ||
|
||
fn next(&mut self) -> Option<Self::Item> { | ||
if self.pos >= self.buf.len() { | ||
return None; | ||
} | ||
let buf = &self.buf[self.pos..]; | ||
|
||
if buf[0] != b'@' { | ||
// sometimes there are extra returns at the end of a file so we shouldn't blow up | ||
if buf[0] == b'\r' || buf[0] == b'\n' { | ||
return None; | ||
} else { | ||
let context = String::from_utf8_lossy(&buf[0..min(16, buf.len())]); | ||
let e = | ||
ParseError::new("Record must start with '@'", ParseErrorType::InvalidHeader) | ||
.record(self.count) | ||
.context(context); | ||
return Some(Err(e)); | ||
} | ||
} | ||
|
||
let id_end; | ||
match memchr(b'\n', &buf) { | ||
Some(i) => id_end = i + 1, | ||
None => return None, | ||
}; | ||
let mut id = &buf[1..id_end - 1]; | ||
|
||
let seq_end; | ||
match memchr_both(b'\n', b'+', &buf[id_end..]) { | ||
Some(i) => seq_end = id_end + i + 1, | ||
None => return None, | ||
}; | ||
let mut seq = &buf[id_end..seq_end - 1]; | ||
|
||
let id2_end; | ||
match memchr(b'\n', &buf[seq_end..]) { | ||
Some(i) => id2_end = seq_end + i + 1, | ||
None => return None, | ||
}; | ||
let id2 = &buf[seq_end..id2_end - 1]; | ||
|
||
// we know the qual scores must be the same length as the sequence | ||
// so we can just do some arithmatic instead of memchr'ing | ||
let mut qual_end = id2_end + seq.len() + 1; | ||
let mut buffer_used = qual_end; | ||
if qual_end > buf.len() { | ||
if !self.last { | ||
// we need to pull more into the buffer | ||
return None; | ||
} | ||
// now do some math to figure out if the file doesn't end with a newline | ||
let windows_ending = if seq.last() == Some(&b'\r') { 1 } else { 0 }; | ||
if qual_end != buf.len() + 1 + windows_ending { | ||
return None; | ||
} | ||
buffer_used -= 1 + windows_ending; | ||
qual_end -= windows_ending; | ||
} | ||
let mut qual = &buf[id2_end..qual_end - 1]; | ||
|
||
// clean up any extra '\r' from the id and seq | ||
if !id.is_empty() && id[id.len() - 1] == b'\r' { | ||
id = &id[..id.len() - 1]; | ||
} | ||
if !seq.is_empty() && seq[seq.len() - 1] == b'\r' { | ||
seq = &seq[..seq.len() - 1]; | ||
} | ||
// we do qual separately in case this is the end of the file | ||
if !qual.is_empty() && qual[qual.len() - 1] == b'\r' { | ||
qual = &qual[..qual.len() - 1]; | ||
} | ||
|
||
self.pos += buffer_used; | ||
self.count += 1; | ||
Some(Ok(FASTQ { id, seq, id2, qual })) | ||
} | ||
} | ||
|
||
impl<'a> From<FASTQ<'a>> for Sequence<'a> { | ||
fn from(fastq: FASTQ<'a>) -> Sequence<'a> { | ||
let qual = if fastq.seq.len() != fastq.qual.len() { | ||
None | ||
} else { | ||
Some(fastq.qual) | ||
}; | ||
Sequence::new(fastq.id, Cow::from(fastq.seq), qual) | ||
} | ||
} | ||
|
||
impl<'a> From<&'a Sequence<'a>> for FASTQ<'a> { | ||
fn from(seq: &'a Sequence<'a>) -> FASTQ<'a> { | ||
let qual = match &seq.qual { | ||
None => &b""[..], | ||
Some(q) => &q, | ||
}; | ||
FASTQ { | ||
id: &seq.id, | ||
seq: &seq.seq, | ||
id2: b"", | ||
qual: qual, | ||
} | ||
} | ||
} |
Oops, something went wrong.