From ebd26e9bbed4407c5e460772d9e79cb43dac25e2 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 2 May 2016 19:12:53 -0400 Subject: [PATCH] Update Replacer trait for Unicode regexes. This uses the new Replacer trait essentially as defined in the `bytes` sub-module and described in #151. Fixes #151 --- src/expand.rs | 142 +++++++++++++++++++++++++++++---- src/re_bytes.rs | 23 +++++- src/re_unicode.rs | 177 +++++++++++++++++++++++------------------- tests/macros_bytes.rs | 3 - tests/macros_str.rs | 3 +- 5 files changed, 248 insertions(+), 100 deletions(-) diff --git a/src/expand.rs b/src/expand.rs index 9bea703881..40c4c87152 100644 --- a/src/expand.rs +++ b/src/expand.rs @@ -2,9 +2,50 @@ use std::str; use memchr::memchr; -use bytes::Captures; +use re_bytes; +use re_unicode; -pub fn expand(caps: &Captures, mut replacement: &[u8], dst: &mut Vec) { +pub fn expand_str( + caps: &re_unicode::Captures, + mut replacement: &str, + dst: &mut String, +) { + while !replacement.is_empty() { + match memchr(b'$', replacement.as_bytes()) { + None => break, + Some(i) => { + dst.push_str(&replacement[..i]); + replacement = &replacement[i..]; + } + } + if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') { + dst.push_str("$"); + replacement = &replacement[2..]; + continue; + } + debug_assert!(!replacement.is_empty()); + let cap_ref = match find_cap_ref(replacement) { + Some(cap_ref) => cap_ref, + None => { + dst.push_str("$"); + replacement = &replacement[1..]; + continue; + } + }; + replacement = &replacement[cap_ref.end..]; + match cap_ref.cap { + Ref::Number(i) => dst.push_str(caps.at(i).unwrap_or("")), + Ref::Named(name) => dst.push_str(caps.name(name).unwrap_or("")), + } + } + dst.push_str(replacement); +} + +pub fn expand_bytes( + caps: &re_bytes::Captures, + mut replacement: &[u8], + dst: &mut Vec, +) { while !replacement.is_empty() { match memchr(b'$', replacement) { None => break, @@ -27,7 +68,7 @@ pub fn expand(caps: &Captures, mut replacement: &[u8], dst: 
&mut Vec) { continue; } }; - replacement = cap_ref.rest; + replacement = &replacement[cap_ref.end..]; match cap_ref.cap { Ref::Number(i) => dst.extend(caps.at(i).unwrap_or(b"")), Ref::Named(name) => dst.extend(caps.name(name).unwrap_or(b"")), @@ -36,56 +77,127 @@ pub fn expand(caps: &Captures, mut replacement: &[u8], dst: &mut Vec) { dst.extend(replacement); } +/// CaptureRef represents a reference to a capture group inside some text. The +/// reference is either a capture group name or a number. +/// +/// It is also tagged with the position in the text immediately following the +/// capture reference. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] struct CaptureRef<'a> { - rest: &'a [u8], cap: Ref<'a>, + end: usize, } +/// A reference to a capture group in some text. +/// +/// e.g., `$2`, `$foo`, `${foo}`. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] enum Ref<'a> { Named(&'a str), Number(usize), } -fn find_cap_ref(mut replacement: &[u8]) -> Option { - if replacement.len() <= 1 || replacement[0] != b'$' { +impl<'a> From<&'a str> for Ref<'a> { + fn from(x: &'a str) -> Ref<'a> { + Ref::Named(x) + } +} + +impl From for Ref<'static> { + fn from(x: usize) -> Ref<'static> { + Ref::Number(x) + } +} + +/// Parses a possible reference to a capture group name in the given text, +/// starting at the beginning of `replacement`. +/// +/// If no such valid reference could be found, None is returned. 
+fn find_cap_ref>( + replacement: &T, +) -> Option { + let mut i = 0; + let rep: &[u8] = replacement.as_ref(); + if rep.len() <= 1 || rep[0] != b'$' { return None; } let mut brace = false; - replacement = &replacement[1..]; - if replacement[0] == b'{' { + i += 1; + if rep[i] == b'{' { brace = true; - replacement = &replacement[1..]; + i += 1; } - let mut cap_end = 0; - while replacement.get(cap_end).map_or(false, is_valid_cap_letter) { + let mut cap_end = i; + while rep.get(cap_end).map_or(false, is_valid_cap_letter) { cap_end += 1; } - if cap_end == 0 { + if cap_end == i { return None; } // We just verified that the range 0..cap_end is valid ASCII, so it must // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 // check with either unsafe or by parsing the number straight from &[u8]. - let cap = str::from_utf8(&replacement[..cap_end]) + let cap = str::from_utf8(&rep[i..cap_end]) .ok().expect("valid UTF-8 capture name"); if brace { - if !replacement.get(cap_end).map_or(false, |&b| b == b'}') { + if !rep.get(cap_end).map_or(false, |&b| b == b'}') { return None; } cap_end += 1; } Some(CaptureRef { - rest: &replacement[cap_end..], cap: match cap.parse::() { Ok(i) => Ref::Number(i as usize), Err(_) => Ref::Named(cap), }, + end: cap_end, }) } +/// Returns true if and only if the given byte is allowed in a capture name. fn is_valid_cap_letter(b: &u8) -> bool { match *b { b'0' ... b'9' | b'a' ... b'z' | b'A' ... b'Z' | b'_' => true, _ => false, } } + +#[cfg(test)] +mod tests { + use super::{CaptureRef, find_cap_ref}; + + macro_rules! find { + ($name:ident, $text:expr) => { + #[test] + fn $name() { + assert_eq!(None, find_cap_ref($text)); + } + }; + ($name:ident, $text:expr, $capref:expr) => { + #[test] + fn $name() { + assert_eq!(Some($capref), find_cap_ref($text)); + } + }; + } + + macro_rules! 
c { + ($name_or_number:expr, $pos:expr) => { + CaptureRef { cap: $name_or_number.into(), end: $pos } + }; + } + + find!(find_cap_ref1, "$foo", c!("foo", 4)); + find!(find_cap_ref2, "${foo}", c!("foo", 6)); + find!(find_cap_ref3, "$0", c!(0, 2)); + find!(find_cap_ref4, "$5", c!(5, 2)); + find!(find_cap_ref5, "$10", c!(10, 3)); + find!(find_cap_ref6, "$42a", c!("42a", 4)); + find!(find_cap_ref7, "${42}a", c!(42, 5)); + find!(find_cap_ref8, "${42"); + find!(find_cap_ref9, "${42 "); + find!(find_cap_ref10, " $0 "); + find!(find_cap_ref11, "$"); + find!(find_cap_ref12, " "); + find!(find_cap_ref13, ""); +} diff --git a/src/re_bytes.rs b/src/re_bytes.rs index 97ac5b923a..ed517364a7 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -19,7 +19,7 @@ use std::sync::Arc; use memchr::memchr; use exec::{Exec, ExecNoSync}; -use expand::expand; +use expand::expand_bytes; use error::Error; use re_builder::bytes::RegexBuilder; use re_trait::{self, RegularExpression, Slot}; @@ -375,6 +375,25 @@ impl Regex { /// If no match is found, then a copy of the byte string is returned /// unchanged. /// + /// # Replacement string syntax + /// + /// All instances of `$name` in the replacement text are replaced with the + /// corresponding capture group `name`. + /// + /// `name` may be an integer corresponding to the index of the + /// capture group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. e.g., `$1a` looks up the capture + /// group named `1a` and not the capture group at index `1`. To exert more + /// precise control over the name, use braces, e.g., `${1}a`. + /// + /// To write a literal `$` use `$$`. 
+ /// /// # Examples /// /// Note that this function is polymorphic with respect to the replacement. @@ -768,7 +787,7 @@ impl<'t> Captures<'t> { /// /// To write a literal `$` use `$$`. pub fn expand(&self, replacement: &[u8], dst: &mut Vec) { - expand(self, replacement, dst) + expand_bytes(self, replacement, dst) } /// Returns the number of captured groups. diff --git a/src/re_unicode.rs b/src/re_unicode.rs index ed3c6b5bde..359b6e0736 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -15,10 +15,12 @@ use std::ops::Index; use std::str::FromStr; use std::sync::Arc; +use memchr::memchr; use syntax; use error::Error; use exec::{Exec, ExecNoSyncStr}; +use expand::expand_str; use re_builder::unicode::RegexBuilder; use re_plugin::Plugin; use re_trait::{self, RegularExpression, Slot}; @@ -478,6 +480,25 @@ impl Regex { /// /// If no match is found, then a copy of the string is returned unchanged. /// + /// # Replacement string syntax + /// + /// All instances of `$name` in the replacement text are replaced with the + /// corresponding capture group `name`. + /// + /// `name` may be an integer corresponding to the index of the + /// capture group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. e.g., `$1a` looks up the capture + /// group named `1a` and not the capture group at index `1`. To exert more + /// precise control over the name, use braces, e.g., `${1}a`. + /// + /// To write a literal `$` use `$$`. + /// /// # Examples /// /// Note that this function is polymorphic with respect to the replacement. 
@@ -574,7 +595,7 @@ impl Regex { // 2) We don't need to look up all of the capture groups and do // replacements inside the replacement string. We just push it // at each match and be done with it. - if let Some(rep) = rep.no_expand() { + if let Some(rep) = rep.no_expansion() { let mut new = String::with_capacity(text.len()); let mut last_match = 0; for (i, (s, e)) in self.find_iter(text).enumerate() { @@ -600,7 +621,7 @@ impl Regex { // unwrap on 0 is OK because captures only reports matches let (s, e) = cap.pos(0).unwrap(); new.push_str(&text[last_match..s]); - new.push_str(&rep.reg_replace(&cap)); + rep.replace_append(&cap, &mut new); last_match = e; } new.push_str(&text[last_match..]); @@ -714,58 +735,6 @@ impl<'r> Iterator for CaptureNames<'r> { } } -/// NoExpand indicates literal string replacement. -/// -/// It can be used with `replace` and `replace_all` to do a literal -/// string replacement without expanding `$name` to their corresponding -/// capture groups. -/// -/// `'t` is the lifetime of the literal text. -pub struct NoExpand<'t>(pub &'t str); - -/// Replacer describes types that can be used to replace matches in a string. -pub trait Replacer { - /// Returns a possibly owned string that is used to replace the match - /// corresponding to the `caps` capture group. - /// - /// The `'a` lifetime refers to the lifetime of a borrowed string when - /// a new owned string isn't needed (e.g., for `NoExpand`). - fn reg_replace(&mut self, caps: &Captures) -> Cow; - - /// Returns a possibly owned string that never needs expansion. 
- fn no_expand(&mut self) -> Option> { None } -} - -impl<'t> Replacer for NoExpand<'t> { - fn reg_replace(&mut self, _: &Captures) -> Cow { - self.0.into() - } - - fn no_expand(&mut self) -> Option> { - Some(self.0.into()) - } -} - -impl<'t> Replacer for &'t str { - fn reg_replace<'a>(&'a mut self, caps: &Captures) -> Cow<'a, str> { - caps.expand(*self).into() - } - - fn no_expand(&mut self) -> Option> { - // if there is a $ there may be an expansion - match self.find('$') { - Some(_) => None, - None => Some((*self).into()), - } - } -} - -impl Replacer for F where F: FnMut(&Captures) -> String { - fn reg_replace<'a>(&'a mut self, caps: &Captures) -> Cow<'a, str> { - (*self)(caps).into() - } -} - /// Yields all substrings delimited by a regular expression match. /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the @@ -951,39 +920,23 @@ impl<'t> Captures<'t> { } /// Expands all instances of `$name` in `text` to the corresponding capture - /// group `name`. + /// group `name`, and writes them to the `dst` buffer given. /// /// `name` may be an integer corresponding to the index of the /// capture group (counted by order of opening parenthesis where `0` is the /// entire match) or it can be a name (consisting of letters, digits or /// underscores) corresponding to a named capture group. /// - /// If `name` isn't a valid capture group (whether the name doesn't exist or - /// isn't a valid index), then it is replaced with the empty string. + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. e.g., `$1a` looks up the capture + /// group named `1a` and not the capture group at index `1`. To exert more + /// precise control over the name, use braces, e.g., `${1}a`. /// /// To write a literal `$` use `$$`. 
- pub fn expand(&self, text: &str) -> String { - const REPLACE_EXPAND: &'static str = r"(?x) - (?P^|\b|[^$]) # Ignore `$$name`. - \$ - (?P # Match the actual capture name. Can be... - [0-9]+ # A sequence of digits (for indexed captures), or... - | - [_a-zA-Z][_0-9a-zA-Z]* # A name for named captures. - ) - "; - // How evil can you get? - let re = Regex::new(REPLACE_EXPAND).unwrap(); - let text = re.replace_all(text, |refs: &Captures| -> String { - let before = refs.name("before").unwrap_or(""); - let name = refs.name("name").unwrap_or(""); - format!("{}{}", before, match name.parse::() { - Err(_) => self.name(name).unwrap_or("").to_owned(), - Ok(i) => self.at(i).unwrap_or("").to_owned(), - }) - }); - let re = Regex::new(r"\$\$").unwrap(); - re.replace_all(&text, NoExpand("$")) + pub fn expand(&self, replacement: &str, dst: &mut String) { + expand_str(self, replacement, dst) } /// Returns the number of captured groups. @@ -1204,3 +1157,69 @@ impl<'r, 't> Iterator for FindMatches<'r, 't> { } } } + +/// Replacer describes types that can be used to replace matches in a string. +/// +/// In general, users of this crate shouldn't need to implement this trait, +/// since implementations are already provided for `&str` and +/// `FnMut(&Captures) -> String`, which covers most use cases. +pub trait Replacer { + /// Appends text to `dst` to replace the current match. + /// + /// The current match is represented by `caps`, which is guaranteed to + /// have a match at capture group `0`. + /// + /// For example, a no-op replacement would be + /// `dst.push_str(caps.at(0).unwrap())`. + fn replace_append(&mut self, caps: &Captures, dst: &mut String); + + /// Return a fixed unchanging replacement string. + /// + /// When doing replacements, if access to `Captures` is not needed (e.g., + /// the replacement string does not need `$` expansion), then it can + /// be beneficial to avoid finding sub-captures. + /// + /// In general, this is called once for every call to `replacen`. 
+ fn no_expansion<'r>(&'r mut self) -> Option> { + None + } +} + +impl<'a> Replacer for &'a str { + fn replace_append(&mut self, caps: &Captures, dst: &mut String) { + caps.expand(*self, dst); + } + + fn no_expansion<'r>(&'r mut self) -> Option> { + match memchr(b'$', self.as_bytes()) { + Some(_) => None, + None => Some(Cow::Borrowed(*self)), + } + } +} + +impl Replacer for F where F: FnMut(&Captures) -> String { + fn replace_append(&mut self, caps: &Captures, dst: &mut String) { + dst.push_str(&(*self)(caps)); + } +} + +/// NoExpand indicates literal string replacement. +/// +/// It can be used with `replace` and `replace_all` to do a literal string +/// replacement without expanding `$name` to their corresponding capture +/// groups. This can be both convenient (to avoid escaping `$`, for example) +/// and performant (since capture groups don't need to be found). +/// +/// `'r` is the lifetime of the literal text. +pub struct NoExpand<'r>(pub &'r str); + +impl<'a> Replacer for NoExpand<'a> { + fn replace_append(&mut self, _: &Captures, dst: &mut String) { + dst.push_str(self.0); + } + + fn no_expansion<'r>(&'r mut self) -> Option> { + Some(Cow::Borrowed(self.0)) + } +} diff --git a/tests/macros_bytes.rs b/tests/macros_bytes.rs index 89c236ff31..c0875ab074 100644 --- a/tests/macros_bytes.rs +++ b/tests/macros_bytes.rs @@ -25,9 +25,6 @@ macro_rules! show { }} } -// N.B. The expansion API for &str and &[u8] APIs differs slightly for now, -// but they should be unified in 1.0. Then we can move this macro back into -// tests/api.rs where it is used. ---AG macro_rules! expand { ($name:ident, $re:expr, $text:expr, $expand:expr, $expected:expr) => { #[test] diff --git a/tests/macros_str.rs b/tests/macros_str.rs index c419ee90dd..5acbe282b6 100644 --- a/tests/macros_str.rs +++ b/tests/macros_str.rs @@ -26,7 +26,8 @@ macro_rules! 
expand { let re = regex!($re); let cap = re.captures(t!($text)).unwrap(); - let got = cap.expand(t!($expand)); + let mut got = String::new(); + cap.expand(t!($expand), &mut got); assert_eq!(show!(t!($expected)), show!(&*got)); } }