Skip to content

Commit

Permalink
vcf/header/parser: Add partial parser
Browse files Browse the repository at this point in the history
This allows headers to be parsed line by line.
  • Loading branch information
zaeleus committed Oct 25, 2023
1 parent 89b9acd commit 1f5fa65
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 56 deletions.
4 changes: 4 additions & 0 deletions noodles-vcf/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
When the inner writer is buffered, a call to `AsyncWriter::shutdown` is
required prior to drop.

* vcf/header/parser: Add partial parser.

This allows headers to be parsed line by line.

* vcf/reader/builder: Add a compression method setter.

This allows the compression method to be overridden using
Expand Down
168 changes: 112 additions & 56 deletions noodles-vcf/src/header/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,38 @@ mod builder;
mod file_format_option;
pub(crate) mod record;

pub use self::{builder::Builder, file_format_option::FileFormatOption, record::parse_record};

use std::error;

use indexmap::IndexSet;
use indexmap::IndexMap;

pub use self::{builder::Builder, file_format_option::FileFormatOption, record::parse_record};
use super::{
file_format::{self, FileFormat},
record::Record,
Header,
AlternativeAlleles, Contigs, Filters, Formats, Header, Infos, OtherRecords, SampleNames,
};

/// The progress of a [`Parser`] through a header that is fed line by line.
#[derive(Debug, Default, Eq, PartialEq)]
enum State {
/// No line has been accepted yet; the next line is parsed as the file format line.
#[default]
Empty,
/// The file format line has been read; record lines and the `#CHROM` line are accepted.
Ready,
/// The `#CHROM` line has been read; any further line is rejected with `ExpectedEof`.
Done,
}

/// A VCF header parser.
///
/// Besides the file format option, the parser holds everything accumulated by
/// `parse_partial` until `finish` moves it into a `Header`.
#[derive(Debug, Default, Eq, PartialEq)]
pub struct Parser {
// Whether to use the file format read from the first line or a fixed override.
file_format_option: FileFormatOption,
// How far line-by-line parsing has progressed.
state: State,
// The file format in effect; set when the first line is parsed and passed to the
// record parser for every following record line.
file_format: FileFormat,
// INFO records collected so far, inserted by ID.
infos: Infos,
// FILTER records collected so far, inserted by ID.
filters: Filters,
// FORMAT records collected so far, inserted by ID.
formats: Formats,
// ALT records collected so far, inserted by ID.
alternative_alleles: AlternativeAlleles,
// Contig records collected so far, inserted by ID.
contigs: Contigs,
// Sample names filled in from the `#CHROM` line.
sample_names: SampleNames,
// All remaining records, grouped by key.
other_records: OtherRecords,
}

impl Parser {
Expand All @@ -30,47 +46,94 @@ impl Parser {

/// Parses a raw VCF header.
pub fn parse(&self, s: &str) -> Result<Header, ParseError> {
let mut builder = Header::builder();
let mut lines = s.lines();

let line = lines.next().ok_or(ParseError::MissingFileFormat)?;
let file_format = match parse_file_format(line) {
Ok(f) => match self.file_format_option {
FileFormatOption::Auto => f,
FileFormatOption::FileFormat(g) => g,
},
Err(e) => return Err(e),
};
// Carry the configured file format option over to the working parser. A parser
// built with `Self::default()` always uses `FileFormatOption::default()`, so an
// override set through the builder would be ignored by `parse`.
// The option is rebuilt variant by variant so this relies only on `FileFormat`
// being `Copy`, which `parse_partial` already requires.
let mut parser = Self {
    file_format_option: match self.file_format_option {
        FileFormatOption::Auto => FileFormatOption::Auto,
        FileFormatOption::FileFormat(file_format) => FileFormatOption::FileFormat(file_format),
    },
    ..Self::default()
};

builder = builder.set_file_format(file_format);
for line in s.lines() {
parser.parse_partial(line)?;
}

let mut has_header = false;
parser.finish()
}

for line in &mut lines {
if line.starts_with("#CHROM") {
builder = parse_header(builder, line)?;
has_header = true;
break;
}
/// Parses and adds a raw record to the header.
pub fn parse_partial(&mut self, s: &str) -> Result<(), ParseError> {
if self.state == State::Done {
return Err(ParseError::ExpectedEof);
}

if self.state == State::Empty {
let file_format = match parse_file_format(s) {
Ok(f) => match self.file_format_option {
FileFormatOption::Auto => f,
FileFormatOption::FileFormat(g) => g,
},
Err(e) => return Err(e),
};

self.file_format = file_format;
self.state = State::Ready;

builder = add_record(file_format, builder, line)?;
return Ok(());
}

if !has_header {
return Err(ParseError::MissingHeader);
if s.starts_with("#CHROM") {
parse_header(s, &mut self.sample_names)?;
self.state = State::Done;
return Ok(());
}

if lines.next().is_some() {
return Err(ParseError::ExpectedEof);
let record = record::parse_record(s.as_bytes(), self.file_format)
.map_err(ParseError::InvalidRecord)?;

match record {
Record::FileFormat(_) => return Err(ParseError::UnexpectedFileFormat),
Record::Info(id, info) => {
self.infos.insert(id, info);
}
Record::Filter(id, filter) => {
self.filters.insert(id, filter);
}
Record::Format(id, format) => {
self.formats.insert(id, format);
}
Record::AlternativeAllele(id, alternative_allele) => {
self.alternative_alleles.insert(id, alternative_allele);
}
Record::Contig(id, contig) => {
self.contigs.insert(id, contig);
}
Record::Other(key, value) => {
insert_other_record(&mut self.other_records, key, value)?;
}
}

Ok(builder.build())
Ok(())
}

/// Builds the VCF header.
pub fn finish(self) -> Result<Header, ParseError> {
match self.state {
State::Empty => Err(ParseError::Empty),
State::Ready => Err(ParseError::MissingHeader),
State::Done => Ok(Header {
file_format: self.file_format,
infos: self.infos,
filters: self.filters,
formats: self.formats,
alternative_alleles: self.alternative_alleles,
contigs: self.contigs,
sample_names: self.sample_names,
other_records: self.other_records,
}),
}
}
}

/// An error returned when a raw VCF header fails to parse.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ParseError {
/// The input is empty.
Empty,
/// The file format (`fileformat`) is missing.
MissingFileFormat,
/// The file format (`fileformat`) appears other than the first line.
Expand Down Expand Up @@ -110,6 +173,7 @@ impl error::Error for ParseError {
impl std::fmt::Display for ParseError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Empty => f.write_str("empty input"),
Self::MissingFileFormat => f.write_str("missing fileformat"),
Self::UnexpectedFileFormat => f.write_str("unexpected file format"),
Self::InvalidFileFormat(_) => f.write_str("invalid file format"),
Expand Down Expand Up @@ -142,32 +206,28 @@ fn parse_file_format(s: &str) -> Result<FileFormat, ParseError> {
}
}

fn add_record(
file_format: FileFormat,
mut builder: super::Builder,
line: &str,
) -> Result<super::Builder, ParseError> {
let record =
record::parse_record(line.as_bytes(), file_format).map_err(ParseError::InvalidRecord)?;

builder = match record {
Record::FileFormat(_) => return Err(ParseError::UnexpectedFileFormat),
Record::Info(id, info) => builder.add_info(id, info),
Record::Filter(id, filter) => builder.add_filter(id, filter),
Record::Format(id, format) => builder.add_format(id, format),
Record::AlternativeAllele(id, alternative_allele) => {
builder.add_alternative_allele(id, alternative_allele)
fn insert_other_record(
other_records: &mut OtherRecords,
key: super::record::key::Other,
value: super::record::Value,
) -> Result<(), ParseError> {
let collection = other_records.entry(key).or_insert_with(|| match value {
super::record::Value::String(_) => {
super::record::value::Collection::Unstructured(Vec::new())
}
Record::Contig(id, contig) => builder.add_contig(id, contig),
Record::Other(key, value) => builder
.insert(key, value)
.map_err(ParseError::InvalidRecordValue)?,
};
super::record::Value::Map(..) => {
super::record::value::Collection::Structured(IndexMap::new())
}
});

collection
.add(value)
.map_err(ParseError::InvalidRecordValue)?;

Ok(builder)
Ok(())
}

fn parse_header(mut builder: super::Builder, line: &str) -> Result<super::Builder, ParseError> {
fn parse_header(line: &str, sample_names: &mut SampleNames) -> Result<(), ParseError> {
static HEADERS: &[&str] = &[
"#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO",
];
Expand All @@ -193,18 +253,14 @@ fn parse_header(mut builder: super::Builder, line: &str) -> Result<super::Builde
));
}

let mut sample_names = IndexSet::new();

for sample_name in fields {
if !sample_names.insert(sample_name.into()) {
return Err(ParseError::DuplicateSampleName(sample_name.into()));
}
}

builder = builder.set_sample_names(sample_names);
}

Ok(builder)
Ok(())
}

#[cfg(test)]
Expand Down
1 change: 1 addition & 0 deletions noodles-vcf/src/header/parser/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ impl Builder {
pub fn build(self) -> Parser {
Parser {
file_format_option: self.file_format_option,
..Default::default()
}
}
}
Expand Down

0 comments on commit 1f5fa65

Please sign in to comment.