diff --git a/noodles-vcf/CHANGELOG.md b/noodles-vcf/CHANGELOG.md index b575e9d4a..bccfe3bb2 100644 --- a/noodles-vcf/CHANGELOG.md +++ b/noodles-vcf/CHANGELOG.md @@ -9,6 +9,10 @@ When the inner writer is buffered, a call to `AsyncWriter::shutdown` is required prior to drop. + * vcf/header/parser: Add partial parser. + + This allows headers to be parsed line by line. + * vcf/reader/builder: Add a compression method setter. This allows the compression method to be overridden using diff --git a/noodles-vcf/src/header/parser.rs b/noodles-vcf/src/header/parser.rs index 27f03e105..ad331754f 100644 --- a/noodles-vcf/src/header/parser.rs +++ b/noodles-vcf/src/header/parser.rs @@ -4,22 +4,38 @@ mod builder; mod file_format_option; pub(crate) mod record; -pub use self::{builder::Builder, file_format_option::FileFormatOption, record::parse_record}; - use std::error; -use indexmap::IndexSet; +use indexmap::IndexMap; +pub use self::{builder::Builder, file_format_option::FileFormatOption, record::parse_record}; use super::{ file_format::{self, FileFormat}, record::Record, - Header, + AlternativeAlleles, Contigs, Filters, Formats, Header, Infos, OtherRecords, SampleNames, }; +#[derive(Debug, Default, Eq, PartialEq)] +enum State { + #[default] + Empty, + Ready, + Done, +} + /// A VCF header parser. #[derive(Debug, Default, Eq, PartialEq)] pub struct Parser { file_format_option: FileFormatOption, + state: State, + file_format: FileFormat, + infos: Infos, + filters: Filters, + formats: Formats, + alternative_alleles: AlternativeAlleles, + contigs: Contigs, + sample_names: SampleNames, + other_records: OtherRecords, } impl Parser { @@ -30,47 +46,94 @@ impl Parser { /// Parses a raw VCF header. pub fn parse(&self, s: &str) -> Result { - let mut builder = Header::builder(); - let mut lines = s.lines(); - - let line = lines.next().ok_or(ParseError::MissingFileFormat)?; - let file_format = match parse_file_format(line) { - Ok(f) => match self.file_format_option { - FileFormatOption::Auto => f, - FileFormatOption::FileFormat(g) => g, - }, - Err(e) => return Err(e), - }; + let mut parser = Self::default(); - builder = builder.set_file_format(file_format); + for line in s.lines() { + parser.parse_partial(line)?; + } - let mut has_header = false; + parser.finish() + } - for line in &mut lines { - if line.starts_with("#CHROM") { - builder = parse_header(builder, line)?; - has_header = true; - break; - } + /// Parses and adds a raw record to the header. + pub fn parse_partial(&mut self, s: &str) -> Result<(), ParseError> { + if self.state == State::Done { + return Err(ParseError::ExpectedEof); + } + + if self.state == State::Empty { + let file_format = match parse_file_format(s) { + Ok(f) => match self.file_format_option { + FileFormatOption::Auto => f, + FileFormatOption::FileFormat(g) => g, + }, + Err(e) => return Err(e), + }; + + self.file_format = file_format; + self.state = State::Ready; - builder = add_record(file_format, builder, line)?; + return Ok(()); } - if !has_header { - return Err(ParseError::MissingHeader); + if s.starts_with("#CHROM") { + parse_header(s, &mut self.sample_names)?; + self.state = State::Done; + return Ok(()); } - if lines.next().is_some() { - return Err(ParseError::ExpectedEof); + let record = record::parse_record(s.as_bytes(), self.file_format) + .map_err(ParseError::InvalidRecord)?; + + match record { + Record::FileFormat(_) => return Err(ParseError::UnexpectedFileFormat), + Record::Info(id, info) => { + self.infos.insert(id, info); + } + Record::Filter(id, filter) => { + self.filters.insert(id, filter); + } + Record::Format(id, format) => { + self.formats.insert(id, format); + } + Record::AlternativeAllele(id, alternative_allele) => { + self.alternative_alleles.insert(id, alternative_allele); + } + Record::Contig(id, contig) => { + self.contigs.insert(id, contig); + } + Record::Other(key, value) => { + insert_other_record(&mut self.other_records, key, value)?; + } } - Ok(builder.build()) + Ok(()) + } + + /// Builds the VCF header. + pub fn finish(self) -> Result { + match self.state { + State::Empty => Err(ParseError::Empty), + State::Ready => Err(ParseError::MissingHeader), + State::Done => Ok(Header { + file_format: self.file_format, + infos: self.infos, + filters: self.filters, + formats: self.formats, + alternative_alleles: self.alternative_alleles, + contigs: self.contigs, + sample_names: self.sample_names, + other_records: self.other_records, + }), + } } } /// An error returned when a raw VCF header fails to parse. #[derive(Clone, Debug, Eq, PartialEq)] pub enum ParseError { + /// The input is empty. + Empty, /// The file format (`fileformat`) is missing. MissingFileFormat, /// The file format (`fileformat`) appears other than the first line. @@ -110,6 +173,7 @@ impl error::Error for ParseError { impl std::fmt::Display for ParseError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { + Self::Empty => f.write_str("empty input"), Self::MissingFileFormat => f.write_str("missing fileformat"), Self::UnexpectedFileFormat => f.write_str("unexpected file format"), Self::InvalidFileFormat(_) => f.write_str("invalid file format"), @@ -142,32 +206,28 @@ fn parse_file_format(s: &str) -> Result { } } -fn add_record( - file_format: FileFormat, - mut builder: super::Builder, - line: &str, -) -> Result { - let record = - record::parse_record(line.as_bytes(), file_format).map_err(ParseError::InvalidRecord)?; - - builder = match record { - Record::FileFormat(_) => return Err(ParseError::UnexpectedFileFormat), - Record::Info(id, info) => builder.add_info(id, info), - Record::Filter(id, filter) => builder.add_filter(id, filter), - Record::Format(id, format) => builder.add_format(id, format), - Record::AlternativeAllele(id, alternative_allele) => { - builder.add_alternative_allele(id, alternative_allele) +fn insert_other_record( + other_records: &mut OtherRecords, + key: super::record::key::Other, + value: super::record::Value, +) -> Result<(), ParseError> { + let collection = other_records.entry(key).or_insert_with(|| match value { + super::record::Value::String(_) => { + super::record::value::Collection::Unstructured(Vec::new()) } - Record::Contig(id, contig) => builder.add_contig(id, contig), - Record::Other(key, value) => builder - .insert(key, value) - .map_err(ParseError::InvalidRecordValue)?, - }; + super::record::Value::Map(..) => { + super::record::value::Collection::Structured(IndexMap::new()) + } + }); + + collection + .add(value) + .map_err(ParseError::InvalidRecordValue)?; - Ok(builder) + Ok(()) } -fn parse_header(mut builder: super::Builder, line: &str) -> Result { +fn parse_header(line: &str, sample_names: &mut SampleNames) -> Result<(), ParseError> { static HEADERS: &[&str] = &[ "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", ]; @@ -193,18 +253,14 @@ fn parse_header(mut builder: super::Builder, line: &str) -> Result Parser { Parser { file_format_option: self.file_format_option, + ..Default::default() } } }