Skip to content

Commit

Permalink
Switch bytes::Regex to using Unicode mode by default.
Browse files Browse the repository at this point in the history
  • Loading branch information
BurntSushi committed Dec 30, 2016
1 parent e2f0850 commit d44a9f9
Show file tree
Hide file tree
Showing 13 changed files with 77 additions and 98 deletions.
11 changes: 0 additions & 11 deletions bench/src/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,22 +59,11 @@ pub use ffi::tcl::Regex;
// Due to macro scoping rules, this definition only applies for the modules
// defined below. Effectively, it allows us to use the same tests for both
// native and dynamic regexes.
#[cfg(not(feature = "re-rust-bytes"))]
#[cfg(not(feature = "re-rust-plugin"))]
macro_rules! regex {
($re:expr) => { ::Regex::new(&$re.to_owned()).unwrap() }
}

#[cfg(feature = "re-rust-bytes")]
macro_rules! regex {
($re:expr) => {{
// Always enable the Unicode flag for byte based regexes.
// Really, this should have been enabled by default. *sigh*
use regex::bytes::RegexBuilder;
RegexBuilder::new(&$re.to_owned()).unicode(true).compile().unwrap()
}}
}

// Usage: text!(haystack)
//
// Builds a ::Text from an owned string.
Expand Down
34 changes: 14 additions & 20 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -244,16 +244,11 @@
//! # Opt out of Unicode support
//!
//! The `bytes` sub-module provides a `Regex` type that can be used to match
//! on `&[u8]`. By default, text is interpreted as ASCII compatible text with
//! all Unicode support disabled (e.g., `.` matches any byte instead of any
//! Unicode codepoint). Unicode support can be selectively enabled with the
//! `u` flag. See the `bytes` module documentation for more details.
//!
//! Unicode support can also be selectively *disabled* with the main `Regex`
//! type that matches on `&str`. For example, `(?-u:\b)` will match an ASCII
//! word boundary. Note though that invalid UTF-8 is not allowed to be matched
//! even when the `u` flag is disabled. For example, `(?-u:.)` will return an
//! error, since `.` matches *any byte* when Unicode support is disabled.
//! on `&[u8]`. By default, text is interpreted as UTF-8 just like it is with
//! the main `Regex` type. However, this behavior can be disabled by turning
//! off the `u` flag, even if doing so could result in matching invalid UTF-8.
//! For example, when the `u` flag is disabled, `.` will match any byte instead
//! of any Unicode codepoint.
//!
//! # Syntax
//!
Expand Down Expand Up @@ -480,19 +475,16 @@ top-level of this crate. There are two important differences:
1. Matching is done on `&[u8]` instead of `&str`. Additionally, `Vec<u8>`
is used where `String` would have been used.
2. Regular expressions are compiled with Unicode support *disabled* by
default. This means that while Unicode regular expressions can only match valid
UTF-8, regular expressions in this module can match arbitrary bytes. Unicode
support can be selectively enabled via the `u` flag in regular expressions
provided by this sub-module.
2. Unicode support can be disabled even when disabling it would result in
matching invalid UTF-8 bytes.
# Example: match null terminated string
This shows how to find all null-terminated strings in a slice of bytes:
```rust
# use regex::bytes::Regex;
let re = Regex::new(r"(?P<cstr>[^\x00]+)\x00").unwrap();
let re = Regex::new(r"(?-u)(?P<cstr>[^\x00]+)\x00").unwrap();
let text = b"foo\x00bar\x00baz\x00";
// Extract all of the strings without the null terminator from each match.
Expand All @@ -512,7 +504,9 @@ string (e.g., to extract a title from a Matroska file):
```rust
# use std::str;
# use regex::bytes::Regex;
let re = Regex::new(r"\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))").unwrap();
let re = Regex::new(
r"(?-u)\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))"
).unwrap();
let text = b"\x12\xd0\x3b\x5f\x7b\xa9\x85\xe2\x98\x83\x80\x98\x54\x76\x68\x65";
let caps = re.captures(text).unwrap();
Expand All @@ -536,9 +530,9 @@ The supported syntax is pretty much the same as the syntax for Unicode
regular expressions with a few changes that make sense for matching arbitrary
bytes:
1. The `u` flag is *disabled* by default, but can be selectively enabled. (The
opposite is true for the main `Regex` type.) Disabling the `u` flag is said to
invoke "ASCII compatible" mode.
1. The `u` flag can be disabled even when disabling it might cause the regex to
match invalid UTF-8. When the `u` flag is disabled, the regex is said to be in
"ASCII compatible" mode.
2. In ASCII compatible mode, neither Unicode codepoints nor Unicode character
classes are allowed.
3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`)
Expand Down
7 changes: 3 additions & 4 deletions src/re_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ impl Default for RegexOptions {
}

macro_rules! define_builder {
($name:ident, $regex_mod:ident, $unicode:expr, $only_utf8:expr) => {
($name:ident, $regex_mod:ident, $only_utf8:expr) => {
pub mod $name {
use error::Error;
use exec::ExecBuilder;
Expand All @@ -63,7 +63,6 @@ impl RegexBuilder {
pub fn new(pattern: &str) -> RegexBuilder {
let mut builder = RegexBuilder(RegexOptions::default());
builder.0.pats.push(pattern.to_owned());
builder.0.unicode = $unicode;
builder
}

Expand Down Expand Up @@ -151,5 +150,5 @@ impl RegexBuilder {
}
}

define_builder!(bytes, re_bytes, false, false);
define_builder!(unicode, re_unicode, true, true);
define_builder!(bytes, re_bytes, false);
define_builder!(unicode, re_unicode, true);
4 changes: 2 additions & 2 deletions tests/api_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
fn empty_match_unicode_find_iter() {
// Tests that we still yield byte ranges at valid UTF-8 sequence boundaries
// even when we're susceptible to empty width matches.
let re = regex!(u!(r".*?"));
let re = regex!(r".*?");
assert_eq!(vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)],
findall!(re, "Ⅰ1Ⅱ2"));
}

#[test]
fn empty_match_unicode_captures_iter() {
// Same as empty_match_unicode_find_iter, but tests capture iteration.
let re = regex!(u!(r".*?"));
let re = regex!(r".*?");
let ms: Vec<_> = re.captures_iter(text!("Ⅰ1Ⅱ2"))
.map(|c| c.pos(0).unwrap())
.collect();
Expand Down
41 changes: 21 additions & 20 deletions tests/bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,38 +5,39 @@
struct R<'a>(&'a [u8]);
impl<'a> R<'a> { fn as_bytes(&self) -> &'a [u8] { &self.0 } }

mat!(word_boundary, r" \b", " δ", None);
mat!(word_boundary_unicode, r"(?u) \b", " δ", Some((0, 1)));
mat!(word_not_boundary, r" \B", " δ", Some((0, 1)));
mat!(word_not_boundary_unicode, r"(?u) \B", " δ", None);

mat!(perl_w_ascii, r"\w+", "aδ", Some((0, 1)));
mat!(perl_w_unicode, r"(?u)\w+", "aδ", Some((0, 3)));
mat!(perl_d_ascii, r"\d+", "1२३9", Some((0, 1)));
mat!(perl_d_unicode, r"(?u)\d+", "1२३9", Some((0, 8)));
mat!(perl_s_ascii, r"\s+", " \u{1680}", Some((0, 1)));
mat!(perl_s_unicode, r"(?u)\s+", " \u{1680}", Some((0, 4)));
mat!(word_boundary, r"(?-u) \b", " δ", None);
mat!(word_boundary_unicode, r" \b", " δ", Some((0, 1)));
mat!(word_not_boundary, r"(?-u) \B", " δ", Some((0, 1)));
mat!(word_not_boundary_unicode, r" \B", " δ", None);

mat!(perl_w_ascii, r"(?-u)\w+", "aδ", Some((0, 1)));
mat!(perl_w_unicode, r"\w+", "aδ", Some((0, 3)));
mat!(perl_d_ascii, r"(?-u)\d+", "1२३9", Some((0, 1)));
mat!(perl_d_unicode, r"\d+", "1२३9", Some((0, 8)));
mat!(perl_s_ascii, r"(?-u)\s+", " \u{1680}", Some((0, 1)));
mat!(perl_s_unicode, r"\s+", " \u{1680}", Some((0, 4)));

// The first `(.+)` matches two Unicode codepoints, but can't match the 5th
// byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and
// matches.
mat!(mixed1, r"(?u)(.+)(?-u)(.+)", R(b"\xCE\x93\xCE\x94\xFF"),
mat!(mixed1, r"(.+)(?-u)(.+)", R(b"\xCE\x93\xCE\x94\xFF"),
Some((0, 5)), Some((0, 4)), Some((4, 5)));

mat!(case_ascii_one, r"(?i)a", "A", Some((0, 1)));
mat!(case_ascii_class, r"(?i)[a-z]+", "AaAaA", Some((0, 5)));
mat!(case_unicode, r"(?iu)[a-z]+", "aA\u{212A}aA", Some((0, 7)));
mat!(case_not_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 2)));
mat!(case_ascii_one, r"(?i-u)a", "A", Some((0, 1)));
mat!(case_ascii_class, r"(?i-u)[a-z]+", "AaAaA", Some((0, 5)));
mat!(case_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 7)));
mat!(case_not_unicode, r"(?i-u)[a-z]+", "aA\u{212A}aA", Some((0, 2)));

mat!(negate_unicode, r"(?u)[^a]", "δ", Some((0, 2)));
mat!(negate_not_unicode, r"[^a]", "δ", Some((0, 1)));
mat!(negate_unicode, r"[^a]", "δ", Some((0, 2)));
mat!(negate_not_unicode, r"(?-u)[^a]", "δ", Some((0, 1)));

// This doesn't match in a normal Unicode regex because the implicit preceding
// `.*?` is Unicode aware.
mat!(dotstar_prefix_not_unicode, r"a", R(b"\xFFa"), Some((1, 2)));
mat!(dotstar_prefix_not_unicode1, r"(?-u)a", R(b"\xFFa"), Some((1, 2)));
mat!(dotstar_prefix_not_unicode2, r"a", R(b"\xFFa"), Some((1, 2)));

// Have fun with null bytes.
mat!(null_bytes, r"(?P<cstr>[^\x00]+)\x00",
mat!(null_bytes, r"(?-u)(?P<cstr>[^\x00]+)\x00",
R(b"foo\x00"), Some((0, 4)), Some((0, 3)));

// Test that lookahead operators work properly in the face of invalid UTF-8.
Expand Down
2 changes: 1 addition & 1 deletion tests/crazy.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
mat!(ascii_literal, u!(r"a"), "a", Some((0, 1)));
mat!(ascii_literal, r"a", "a", Some((0, 1)));

// Some crazy expressions from regular-expressions.info.
mat!(match_ranges,
Expand Down
2 changes: 1 addition & 1 deletion tests/macros_bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ macro_rules! t { ($re:expr) => { text!($re) } }
macro_rules! bytes { ($text:expr) => { $text } }
macro_rules! b { ($text:expr) => { bytes!($text) } }

macro_rules! u { ($re:expr) => { concat!("(?u)", $re) } }
// macro_rules! u { ($re:expr) => { concat!("(?u)", $re) } }

macro_rules! no_expand {
($text:expr) => {{
Expand Down
2 changes: 1 addition & 1 deletion tests/macros_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ macro_rules! t { ($text:expr) => { text!($text) } }
macro_rules! bytes { ($text:expr) => { $text.as_bytes() } }
macro_rules! b { ($text:expr) => { bytes!($text) } }

macro_rules! u { ($re:expr) => { $re } }
// macro_rules! u { ($re:expr) => { $re } }

macro_rules! no_expand {
($text:expr) => {{
Expand Down
2 changes: 1 addition & 1 deletion tests/regression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ mat!(regression_alt_in_alt2, r"^(.*?)(\n|\r\n?|$)", "ab\rcd", Some((0, 3)));
mat!(regression_leftmost_first_prefix, r"z*azb", "azb", Some((0, 3)));

// See: https://github.com/rust-lang/regex/issues/76
mat!(uni_case_lower_nocase_flag, u!(r"(?i)\p{Ll}+"), "ΛΘΓΔα", Some((0, 10)));
mat!(uni_case_lower_nocase_flag, r"(?i)\p{Ll}+", "ΛΘΓΔα", Some((0, 10)));

// See: https://github.com/rust-lang-nursery/regex/issues/191
mat!(many_alternates, r"1|2|3|4|5|6|7|8|9|10|int", "int", Some((0, 3)));
Expand Down
2 changes: 0 additions & 2 deletions tests/test_backtrack_bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ macro_rules! regex_new {
use regex::internal::ExecBuilder;
ExecBuilder::new($re)
.bounded_backtracking()
.unicode(false)
.only_utf8(false)
.build()
.map(|e| e.into_byte_regex())
Expand All @@ -34,7 +33,6 @@ macro_rules! regex_set_new {
use regex::internal::ExecBuilder;
ExecBuilder::new_many($re)
.bounded_backtracking()
.unicode(false)
.only_utf8(false)
.build()
.map(|e| e.into_byte_regex_set())
Expand Down
2 changes: 0 additions & 2 deletions tests/test_nfa_bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ macro_rules! regex_new {
use regex::internal::ExecBuilder;
ExecBuilder::new($re)
.nfa()
.unicode(false)
.only_utf8(false)
.build()
.map(|e| e.into_byte_regex())
Expand All @@ -35,7 +34,6 @@ macro_rules! regex_set_new {
use regex::internal::ExecBuilder;
ExecBuilder::new_many($re)
.nfa()
.unicode(false)
.only_utf8(false)
.build()
.map(|e| e.into_byte_regex_set())
Expand Down
54 changes: 27 additions & 27 deletions tests/unicode.rs
Original file line number Diff line number Diff line change
@@ -1,31 +1,31 @@
mat!(uni_literal, u!(r"☃"), "☃", Some((0, 3)));
mat!(uni_literal_plus, u!(r"☃+"), "☃", Some((0, 3)));
mat!(uni_literal_casei_plus, u!(r"(?i)☃+"), "☃", Some((0, 3)));
mat!(uni_class_plus, u!(r"[☃Ⅰ]+"), "☃", Some((0, 3)));
mat!(uni_one, u!(r"\pN"), "Ⅰ", Some((0, 3)));
mat!(uni_mixed, u!(r"\pN+"), "Ⅰ1Ⅱ2", Some((0, 8)));
mat!(uni_not, u!(r"\PN+"), "abⅠ", Some((0, 2)));
mat!(uni_not_class, u!(r"[\PN]+"), "abⅠ", Some((0, 2)));
mat!(uni_not_class_neg, u!(r"[^\PN]+"), "abⅠ", Some((2, 5)));
mat!(uni_case, u!(r"(?i)Δ"), "δ", Some((0, 2)));
mat!(uni_case_upper, u!(r"\p{Lu}+"), "ΛΘΓΔα", Some((0, 8)));
mat!(uni_case_upper_nocase_flag, u!(r"(?i)\p{Lu}+"), "ΛΘΓΔα", Some((0, 10)));
mat!(uni_case_upper_nocase, u!(r"\p{L}+"), "ΛΘΓΔα", Some((0, 10)));
mat!(uni_case_lower, u!(r"\p{Ll}+"), "ΛΘΓΔα", Some((8, 10)));
mat!(uni_literal, r"☃", "☃", Some((0, 3)));
mat!(uni_literal_plus, r"☃+", "☃", Some((0, 3)));
mat!(uni_literal_casei_plus, r"(?i)☃+", "☃", Some((0, 3)));
mat!(uni_class_plus, r"[☃Ⅰ]+", "☃", Some((0, 3)));
mat!(uni_one, r"\pN", "Ⅰ", Some((0, 3)));
mat!(uni_mixed, r"\pN+", "Ⅰ1Ⅱ2", Some((0, 8)));
mat!(uni_not, r"\PN+", "abⅠ", Some((0, 2)));
mat!(uni_not_class, r"[\PN]+", "abⅠ", Some((0, 2)));
mat!(uni_not_class_neg, r"[^\PN]+", "abⅠ", Some((2, 5)));
mat!(uni_case, r"(?i)Δ", "δ", Some((0, 2)));
mat!(uni_case_upper, r"\p{Lu}+", "ΛΘΓΔα", Some((0, 8)));
mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10)));
mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10)));
mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10)));

// Test the Unicode friendliness of Perl character classes.
mat!(uni_perl_w, u!(r"\w+"), "dδd", Some((0, 4)));
mat!(uni_perl_w_not, u!(r"\w+"), "⥡", None);
mat!(uni_perl_w_neg, u!(r"\W+"), "⥡", Some((0, 3)));
mat!(uni_perl_d, u!(r"\d+"), "1२३9", Some((0, 8)));
mat!(uni_perl_d_not, u!(r"\d+"), "Ⅱ", None);
mat!(uni_perl_d_neg, u!(r"\D+"), "Ⅱ", Some((0, 3)));
mat!(uni_perl_s, u!(r"\s+"), " ", Some((0, 3)));
mat!(uni_perl_s_not, u!(r"\s+"), "☃", None);
mat!(uni_perl_s_neg, u!(r"\S+"), "☃", Some((0, 3)));
mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4)));
mat!(uni_perl_w_not, r"\w+", "⥡", None);
mat!(uni_perl_w_neg, r"\W+", "⥡", Some((0, 3)));
mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8)));
mat!(uni_perl_d_not, r"\d+", "Ⅱ", None);
mat!(uni_perl_d_neg, r"\D+", "Ⅱ", Some((0, 3)));
mat!(uni_perl_s, r"\s+", " ", Some((0, 3)));
mat!(uni_perl_s_not, r"\s+", "☃", None);
mat!(uni_perl_s_neg, r"\S+", "☃", Some((0, 3)));

// And do the same for word boundaries.
mat!(uni_boundary_none, u!(r"\d\b"), "6δ", None);
mat!(uni_boundary_ogham, u!(r"\d\b"), "6 ", Some((0, 1)));
mat!(uni_not_boundary_none, u!(r"\d\B"), "6δ", Some((0, 1)));
mat!(uni_not_boundary_ogham, u!(r"\d\B"), "6 ", None);
mat!(uni_boundary_none, r"\d\b", "6δ", None);
mat!(uni_boundary_ogham, r"\d\b", "6 ", Some((0, 1)));
mat!(uni_not_boundary_none, r"\d\B", "6δ", Some((0, 1)));
mat!(uni_not_boundary_ogham, r"\d\B", "6 ", None);
12 changes: 6 additions & 6 deletions tests/word_boundary_ascii.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
// ASCII word boundaries are completely oblivious to Unicode characters.
// For Unicode word boundaries, the tests are precisely inverted.
matiter!(ascii1, r"\bx\b", "áxβ", (2, 3));
matiter!(ascii2, r"\Bx\B", "áxβ");
matiter!(ascii3, r"\B", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5));
matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3));
matiter!(ascii2, r"(?-u:\B)x(?-u:\B)", "áxβ");
matiter!(ascii3, r"(?-u:\B)", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5));

// We can still get Unicode mode in byte regexes.
matiter!(unicode1, r"(?u:\b)x(?u:\b)", "áxβ");
matiter!(unicode2, r"(?u:\B)x(?u:\B)", "áxβ", (2, 3));
// We still get Unicode word boundaries by default in byte regexes.
matiter!(unicode1, r"\bx\b", "áxβ");
matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3));

0 comments on commit d44a9f9

Please sign in to comment.