Skip to content

Commit

Permalink
Update Replacer trait for Unicode regexes.
Browse files Browse the repository at this point in the history
This uses the new Replacer trait essentially as defined in the `bytes`
sub-module and described in #151.

Fixes #151
  • Loading branch information
BurntSushi committed Dec 30, 2016
1 parent d44a9f9 commit ebd26e9
Show file tree
Hide file tree
Showing 5 changed files with 248 additions and 100 deletions.
142 changes: 127 additions & 15 deletions src/expand.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,50 @@ use std::str;

use memchr::memchr;

use bytes::Captures;
use re_bytes;
use re_unicode;

pub fn expand(caps: &Captures, mut replacement: &[u8], dst: &mut Vec<u8>) {
/// Expands every capture group reference found in `replacement` and appends
/// the result to `dst`.
///
/// Text outside of references is copied verbatim. `$$` is written as a single
/// literal `$`. A `$` that does not begin a valid reference (as decided by
/// `find_cap_ref`) is also copied literally. A reference to a group that is
/// missing from `caps` expands to the empty string.
pub fn expand_str(
    caps: &re_unicode::Captures,
    mut replacement: &str,
    dst: &mut String,
) {
    // `$` is a single ASCII byte, so the byte offsets used for slicing below
    // always sit next to it and never split a multi-byte character.
    while let Some(dollar) = memchr(b'$', replacement.as_bytes()) {
        // Copy the literal text in front of the `$`, then resume at the `$`.
        dst.push_str(&replacement[..dollar]);
        replacement = &replacement[dollar..];

        // An escaped dollar sign: emit one `$` and skip both bytes.
        if replacement.as_bytes().get(1) == Some(&b'$') {
            dst.push_str("$");
            replacement = &replacement[2..];
            continue;
        }

        match find_cap_ref(replacement) {
            // Not a capture reference after all; keep the `$` as-is.
            None => {
                dst.push_str("$");
                replacement = &replacement[1..];
            }
            Some(cap_ref) => {
                // Step past the whole reference before substituting it.
                replacement = &replacement[cap_ref.end..];
                let matched = match cap_ref.cap {
                    Ref::Number(i) => caps.at(i),
                    Ref::Named(name) => caps.name(name),
                };
                dst.push_str(matched.unwrap_or(""));
            }
        }
    }
    // No `$` remains, so whatever is left is plain text.
    dst.push_str(replacement);
}

pub fn expand_bytes(
caps: &re_bytes::Captures,
mut replacement: &[u8],
dst: &mut Vec<u8>,
) {
while !replacement.is_empty() {
match memchr(b'$', replacement) {
None => break,
Expand All @@ -27,7 +68,7 @@ pub fn expand(caps: &Captures, mut replacement: &[u8], dst: &mut Vec<u8>) {
continue;
}
};
replacement = cap_ref.rest;
replacement = &replacement[cap_ref.end..];
match cap_ref.cap {
Ref::Number(i) => dst.extend(caps.at(i).unwrap_or(b"")),
Ref::Named(name) => dst.extend(caps.name(name).unwrap_or(b"")),
Expand All @@ -36,56 +77,127 @@ pub fn expand(caps: &Captures, mut replacement: &[u8], dst: &mut Vec<u8>) {
dst.extend(replacement);
}

/// CaptureRef represents a reference to a capture group inside some text. The
/// reference is either a capture group name or a number.
///
/// It is also tagged with the position in the text immediately following the
/// capture reference.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct CaptureRef<'a> {
rest: &'a [u8],
cap: Ref<'a>,
end: usize,
}

/// A reference to a capture group in some text.
///
/// e.g., `$2`, `$foo`, `${foo}`.
///
/// `find_cap_ref` produces `Number` when the referenced text parses as an
/// integer and `Named` otherwise.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Ref<'a> {
/// A capture group referred to by name, borrowed from the replacement text.
Named(&'a str),
/// A capture group referred to by its index.
Number(usize),
}

fn find_cap_ref(mut replacement: &[u8]) -> Option<CaptureRef> {
if replacement.len() <= 1 || replacement[0] != b'$' {
/// Converts a string slice into a reference to the capture group with that
/// name.
impl<'a> From<&'a str> for Ref<'a> {
    fn from(name: &'a str) -> Self {
        Ref::Named(name)
    }
}

/// Converts an index into a reference to the capture group at that index.
impl From<usize> for Ref<'static> {
    fn from(index: usize) -> Self {
        Ref::Number(index)
    }
}

/// Parses a possible reference to a capture group name in the given text,
/// starting at the beginning of `replacement`.
///
/// If no such valid reference could be found, None is returned.
fn find_cap_ref<T: ?Sized + AsRef<[u8]>>(
replacement: &T,
) -> Option<CaptureRef> {
let mut i = 0;
let rep: &[u8] = replacement.as_ref();
if rep.len() <= 1 || rep[0] != b'$' {
return None;
}
let mut brace = false;
replacement = &replacement[1..];
if replacement[0] == b'{' {
i += 1;
if rep[i] == b'{' {
brace = true;
replacement = &replacement[1..];
i += 1;
}
let mut cap_end = 0;
while replacement.get(cap_end).map_or(false, is_valid_cap_letter) {
let mut cap_end = i;
while rep.get(cap_end).map_or(false, is_valid_cap_letter) {
cap_end += 1;
}
if cap_end == 0 {
if cap_end == i {
return None;
}
// We just verified that the range i..cap_end is valid ASCII, so it must
// therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
// check with either unsafe or by parsing the number straight from &[u8].
let cap = str::from_utf8(&replacement[..cap_end])
let cap = str::from_utf8(&rep[i..cap_end])
.ok().expect("valid UTF-8 capture name");
if brace {
if !replacement.get(cap_end).map_or(false, |&b| b == b'}') {
if !rep.get(cap_end).map_or(false, |&b| b == b'}') {
return None;
}
cap_end += 1;
}
Some(CaptureRef {
rest: &replacement[cap_end..],
cap: match cap.parse::<u32>() {
Ok(i) => Ref::Number(i as usize),
Err(_) => Ref::Named(cap),
},
end: cap_end,
})
}

/// Returns true if and only if the given byte is allowed in a capture name.
///
/// The allowed bytes are the ASCII digits `0-9`, the ASCII letters `a-z` and
/// `A-Z`, and the underscore `_`.
///
/// The argument is taken by reference so that the function can be passed
/// directly to `Option<&u8>::map_or`, as is done when scanning a slice with
/// `get`.
fn is_valid_cap_letter(b: &u8) -> bool {
    // `is_ascii_alphanumeric` covers exactly 0-9, a-z and A-Z. Using it avoids
    // the deprecated `...` range-pattern syntax, which newer editions reject.
    b.is_ascii_alphanumeric() || *b == b'_'
}

// Unit tests for `find_cap_ref`, which parses a single capture reference at
// the very start of its input.
#[cfg(test)]
mod tests {
use super::{CaptureRef, find_cap_ref};

// Generates a `#[test]` function named `$name` that runs `find_cap_ref` on
// `$text`.
//
// The two-argument form asserts that no capture reference is found; the
// three-argument form asserts that the result equals `Some($capref)`.
macro_rules! find {
($name:ident, $text:expr) => {
#[test]
fn $name() {
assert_eq!(None, find_cap_ref($text));
}
};
($name:ident, $text:expr, $capref:expr) => {
#[test]
fn $name() {
assert_eq!(Some($capref), find_cap_ref($text));
}
};
}

// Builds the expected `CaptureRef`. `$name_or_number` is converted into the
// `cap` field via `.into()` (a string becomes a named reference, an integer
// becomes a numbered one) and `$pos` is stored as `end`.
macro_rules! c {
($name_or_number:expr, $pos:expr) => {
CaptureRef { cap: $name_or_number.into(), end: $pos }
};
}

// Plain and braced references; `end` counts the `$` and any braces.
find!(find_cap_ref1, "$foo", c!("foo", 4));
find!(find_cap_ref2, "${foo}", c!("foo", 6));
find!(find_cap_ref3, "$0", c!(0, 2));
find!(find_cap_ref4, "$5", c!(5, 2));
find!(find_cap_ref5, "$10", c!(10, 3));
// The longest possible name wins, so `42a` is a name rather than index 42.
find!(find_cap_ref6, "$42a", c!("42a", 4));
// Braces delimit the reference explicitly; the trailing `a` is not consumed.
find!(find_cap_ref7, "${42}a", c!(42, 5));
// An opening brace without a directly following closing brace is rejected.
find!(find_cap_ref8, "${42");
find!(find_cap_ref9, "${42 ");
// The reference must start at the first byte of the input.
find!(find_cap_ref10, " $0 ");
// Inputs too short to hold a reference.
find!(find_cap_ref11, "$");
find!(find_cap_ref12, " ");
find!(find_cap_ref13, "");
}
23 changes: 21 additions & 2 deletions src/re_bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use std::sync::Arc;
use memchr::memchr;

use exec::{Exec, ExecNoSync};
use expand::expand;
use expand::expand_bytes;
use error::Error;
use re_builder::bytes::RegexBuilder;
use re_trait::{self, RegularExpression, Slot};
Expand Down Expand Up @@ -375,6 +375,25 @@ impl Regex {
/// If no match is found, then a copy of the byte string is returned
/// unchanged.
///
/// # Replacement string syntax
///
/// All instances of `$name` in the replacement text are replaced with the
/// corresponding capture group `name`.
///
/// `name` may be an integer corresponding to the index of the
/// capture group (counted by order of opening parenthesis where `0` is the
/// entire match) or it can be a name (consisting of letters, digits or
/// underscores) corresponding to a named capture group.
///
/// If `name` isn't a valid capture group (whether the name doesn't exist
/// or isn't a valid index), then it is replaced with the empty string.
///
/// The longest possible name is used. e.g., `$1a` looks up the capture
/// group named `1a` and not the capture group at index `1`. To exert more
/// precise control over the name, use braces, e.g., `${1}a`.
///
/// To write a literal `$` use `$$`.
///
/// # Examples
///
/// Note that this function is polymorphic with respect to the replacement.
Expand Down Expand Up @@ -768,7 +787,7 @@ impl<'t> Captures<'t> {
///
/// To write a literal `$` use `$$`.
pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) {
expand(self, replacement, dst)
expand_bytes(self, replacement, dst)
}

/// Returns the number of captured groups.
Expand Down
Loading

0 comments on commit ebd26e9

Please sign in to comment.