Switch bytes::Regex to using Unicode mode by default.

rust-lang · Dec 30, 2016 · d44a9f9 · d44a9f9
1 parent e2f0850
commit d44a9f9
Show file tree

Hide file tree

Showing 13 changed files with 77 additions and 98 deletions.
diff --git a/bench/src/bench.rs b/bench/src/bench.rs
@@ -59,22 +59,11 @@ pub use ffi::tcl::Regex;
 // Due to macro scoping rules, this definition only applies for the modules
 // defined below. Effectively, it allows us to use the same tests for both
 // native and dynamic regexes.
-#[cfg(not(feature = "re-rust-bytes"))]
 #[cfg(not(feature = "re-rust-plugin"))]
 macro_rules! regex {
     ($re:expr) => { ::Regex::new(&$re.to_owned()).unwrap() }
 }
 
-#[cfg(feature = "re-rust-bytes")]
-macro_rules! regex {
-    ($re:expr) => {{
-        // Always enable the Unicode flag for byte based regexes.
-        // Really, this should have been enabled by default. *sigh*
-        use regex::bytes::RegexBuilder;
-        RegexBuilder::new(&$re.to_owned()).unicode(true).compile().unwrap()
-    }}
-}
-
 // Usage: text!(haystack)
 //
 // Builds a ::Text from an owned string.

diff --git a/src/lib.rs b/src/lib.rs
@@ -244,16 +244,11 @@
 //! # Opt out of Unicode support
 //!
 //! The `bytes` sub-module provides a `Regex` type that can be used to match
-//! on `&[u8]`. By default, text is interpreted as ASCII compatible text with
-//! all Unicode support disabled (e.g., `.` matches any byte instead of any
-//! Unicode codepoint). Unicode support can be selectively enabled with the
-//! `u` flag. See the `bytes` module documentation for more details.
-//!
-//! Unicode support can also be selectively *disabled* with the main `Regex`
-//! type that matches on `&str`. For example, `(?-u:\b)` will match an ASCII
-//! word boundary. Note though that invalid UTF-8 is not allowed to be matched
-//! even when the `u` flag is disabled. For example, `(?-u:.)` will return an
-//! error, since `.` matches *any byte* when Unicode support is disabled.
+//! on `&[u8]`. By default, text is interpreted as UTF-8 just like it is with
+//! the main `Regex` type. However, this behavior can be disabled by turning
+//! off the `u` flag, even if doing so could result in matching invalid UTF-8.
+//! For example, when the `u` flag is disabled, `.` will match any byte instead
+//! of any Unicode codepoint.
 //!
 //! # Syntax
 //!
@@ -480,19 +475,16 @@ top-level of this crate. There are two important differences:
 
 1. Matching is done on `&[u8]` instead of `&str`. Additionally, `Vec<u8>`
 is used where `String` would have been used.
-2. Regular expressions are compiled with Unicode support *disabled* by
-default. This means that while Unicode regular expressions can only match valid
-UTF-8, regular expressions in this module can match arbitrary bytes. Unicode
-support can be selectively enabled via the `u` flag in regular expressions
-provided by this sub-module.
+2. Unicode support can be disabled even when disabling it would result in
+matching invalid UTF-8 bytes.
 
 # Example: match null terminated string
 
 This shows how to find all null-terminated strings in a slice of bytes:
 
 ```rust
 # use regex::bytes::Regex;
-let re = Regex::new(r"(?P<cstr>[^\x00]+)\x00").unwrap();
+let re = Regex::new(r"(?-u)(?P<cstr>[^\x00]+)\x00").unwrap();
 let text = b"foo\x00bar\x00baz\x00";
 
 // Extract all of the strings without the null terminator from each match.
@@ -512,7 +504,9 @@ string (e.g., to extract a title from a Matroska file):
 ```rust
 # use std::str;
 # use regex::bytes::Regex;
-let re = Regex::new(r"\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))").unwrap();
+let re = Regex::new(
+    r"(?-u)\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))"
+).unwrap();
 let text = b"\x12\xd0\x3b\x5f\x7b\xa9\x85\xe2\x98\x83\x80\x98\x54\x76\x68\x65";
 let caps = re.captures(text).unwrap();
 
@@ -536,9 +530,9 @@ The supported syntax is pretty much the same as the syntax for Unicode
 regular expressions with a few changes that make sense for matching arbitrary
 bytes:
 
-1. The `u` flag is *disabled* by default, but can be selectively enabled. (The
-opposite is true for the main `Regex` type.) Disabling the `u` flag is said to
-invoke "ASCII compatible" mode.
+1. The `u` flag can be disabled even when disabling it might cause the regex to
+match invalid UTF-8. When the `u` flag is disabled, the regex is said to be in
+"ASCII compatible" mode.
 2. In ASCII compatible mode, neither Unicode codepoints nor Unicode character
 classes are allowed.
 3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`)

diff --git a/src/re_builder.rs b/src/re_builder.rs
@@ -40,7 +40,7 @@ impl Default for RegexOptions {
 }
 
 macro_rules! define_builder {
-    ($name:ident, $regex_mod:ident, $unicode:expr, $only_utf8:expr) => {
+    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
         pub mod $name {
             use error::Error;
             use exec::ExecBuilder;
@@ -63,7 +63,6 @@ impl RegexBuilder {
     pub fn new(pattern: &str) -> RegexBuilder {
         let mut builder = RegexBuilder(RegexOptions::default());
         builder.0.pats.push(pattern.to_owned());
-        builder.0.unicode = $unicode;
         builder
     }
 
@@ -151,5 +150,5 @@ impl RegexBuilder {
     }
 }
 
-define_builder!(bytes, re_bytes, false, false);
-define_builder!(unicode, re_unicode, true, true);
+define_builder!(bytes, re_bytes, false);
+define_builder!(unicode, re_unicode, true);
diff --git a/tests/api_str.rs b/tests/api_str.rs
@@ -5,15 +5,15 @@
 fn empty_match_unicode_find_iter() {
     // Tests that we still yield byte ranges at valid UTF-8 sequence boundaries
     // even when we're susceptible to empty width matches.
-    let re = regex!(u!(r".*?"));
+    let re = regex!(r".*?");
     assert_eq!(vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)],
                findall!(re, "Ⅰ1Ⅱ2"));
 }
 
 #[test]
 fn empty_match_unicode_captures_iter() {
     // Same as empty_match_unicode_find_iter, but tests capture iteration.
-    let re = regex!(u!(r".*?"));
+    let re = regex!(r".*?");
     let ms: Vec<_> = re.captures_iter(text!("Ⅰ1Ⅱ2"))
                        .map(|c| c.pos(0).unwrap())
                        .collect();

diff --git a/tests/bytes.rs b/tests/bytes.rs
@@ -5,38 +5,39 @@
 struct R<'a>(&'a [u8]);
 impl<'a> R<'a> { fn as_bytes(&self) -> &'a [u8] { &self.0 } }
 
-mat!(word_boundary, r" \b", " δ", None);
-mat!(word_boundary_unicode, r"(?u) \b", " δ", Some((0, 1)));
-mat!(word_not_boundary, r" \B", " δ", Some((0, 1)));
-mat!(word_not_boundary_unicode, r"(?u) \B", " δ", None);
-
-mat!(perl_w_ascii, r"\w+", "aδ", Some((0, 1)));
-mat!(perl_w_unicode, r"(?u)\w+", "aδ", Some((0, 3)));
-mat!(perl_d_ascii, r"\d+", "1२३9", Some((0, 1)));
-mat!(perl_d_unicode, r"(?u)\d+", "1२३9", Some((0, 8)));
-mat!(perl_s_ascii, r"\s+", " \u{1680}", Some((0, 1)));
-mat!(perl_s_unicode, r"(?u)\s+", " \u{1680}", Some((0, 4)));
+mat!(word_boundary, r"(?-u) \b", " δ", None);
+mat!(word_boundary_unicode, r" \b", " δ", Some((0, 1)));
+mat!(word_not_boundary, r"(?-u) \B", " δ", Some((0, 1)));
+mat!(word_not_boundary_unicode, r" \B", " δ", None);
+
+mat!(perl_w_ascii, r"(?-u)\w+", "aδ", Some((0, 1)));
+mat!(perl_w_unicode, r"\w+", "aδ", Some((0, 3)));
+mat!(perl_d_ascii, r"(?-u)\d+", "1२३9", Some((0, 1)));
+mat!(perl_d_unicode, r"\d+", "1२३9", Some((0, 8)));
+mat!(perl_s_ascii, r"(?-u)\s+", " \u{1680}", Some((0, 1)));
+mat!(perl_s_unicode, r"\s+", " \u{1680}", Some((0, 4)));
 
 // The first `(.+)` matches two Unicode codepoints, but can't match the 5th
 // byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and
 // matches.
-mat!(mixed1, r"(?u)(.+)(?-u)(.+)", R(b"\xCE\x93\xCE\x94\xFF"),
+mat!(mixed1, r"(.+)(?-u)(.+)", R(b"\xCE\x93\xCE\x94\xFF"),
      Some((0, 5)), Some((0, 4)), Some((4, 5)));
 
-mat!(case_ascii_one, r"(?i)a", "A", Some((0, 1)));
-mat!(case_ascii_class, r"(?i)[a-z]+", "AaAaA", Some((0, 5)));
-mat!(case_unicode, r"(?iu)[a-z]+", "aA\u{212A}aA", Some((0, 7)));
-mat!(case_not_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 2)));
+mat!(case_ascii_one, r"(?i-u)a", "A", Some((0, 1)));
+mat!(case_ascii_class, r"(?i-u)[a-z]+", "AaAaA", Some((0, 5)));
+mat!(case_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 7)));
+mat!(case_not_unicode, r"(?i-u)[a-z]+", "aA\u{212A}aA", Some((0, 2)));
 
-mat!(negate_unicode, r"(?u)[^a]", "δ", Some((0, 2)));
-mat!(negate_not_unicode, r"[^a]", "δ", Some((0, 1)));
+mat!(negate_unicode, r"[^a]", "δ", Some((0, 2)));
+mat!(negate_not_unicode, r"(?-u)[^a]", "δ", Some((0, 1)));
 
 // This doesn't match in a normal Unicode regex because the implicit preceding
 // `.*?` is Unicode aware.
-mat!(dotstar_prefix_not_unicode, r"a", R(b"\xFFa"), Some((1, 2)));
+mat!(dotstar_prefix_not_unicode1, r"(?-u)a", R(b"\xFFa"), Some((1, 2)));
+mat!(dotstar_prefix_not_unicode2, r"a", R(b"\xFFa"), Some((1, 2)));
 
 // Have fun with null bytes.
-mat!(null_bytes, r"(?P<cstr>[^\x00]+)\x00",
+mat!(null_bytes, r"(?-u)(?P<cstr>[^\x00]+)\x00",
      R(b"foo\x00"), Some((0, 4)), Some((0, 3)));
 
 // Test that lookahead operators work properly in the face of invalid UTF-8.

diff --git a/tests/crazy.rs b/tests/crazy.rs
@@ -1,4 +1,4 @@
-mat!(ascii_literal, u!(r"a"), "a", Some((0, 1)));
+mat!(ascii_literal, r"a", "a", Some((0, 1)));
 
 // Some crazy expressions from regular-expressions.info.
 mat!(match_ranges,

diff --git a/tests/macros_bytes.rs b/tests/macros_bytes.rs
@@ -5,7 +5,7 @@ macro_rules! t { ($re:expr) => { text!($re) } }
 macro_rules! bytes { ($text:expr) => { $text } }
 macro_rules! b { ($text:expr) => { bytes!($text) } }
 
-macro_rules! u { ($re:expr) => { concat!("(?u)", $re) } }
+// macro_rules! u { ($re:expr) => { concat!("(?u)", $re) } }
 
 macro_rules! no_expand {
     ($text:expr) => {{

diff --git a/tests/macros_str.rs b/tests/macros_str.rs
@@ -5,7 +5,7 @@ macro_rules! t { ($text:expr) => { text!($text) } }
 macro_rules! bytes { ($text:expr) => { $text.as_bytes() } }
 macro_rules! b { ($text:expr) => { bytes!($text) } }
 
-macro_rules! u { ($re:expr) => { $re } }
+// macro_rules! u { ($re:expr) => { $re } }
 
 macro_rules! no_expand {
     ($text:expr) => {{

diff --git a/tests/regression.rs b/tests/regression.rs
@@ -41,7 +41,7 @@ mat!(regression_alt_in_alt2, r"^(.*?)(\n|\r\n?|$)", "ab\rcd", Some((0, 3)));
 mat!(regression_leftmost_first_prefix, r"z*azb", "azb", Some((0, 3)));
 
 // See: https://github.com/rust-lang/regex/issues/76
-mat!(uni_case_lower_nocase_flag, u!(r"(?i)\p{Ll}+"), "ΛΘΓΔα", Some((0, 10)));
+mat!(uni_case_lower_nocase_flag, r"(?i)\p{Ll}+", "ΛΘΓΔα", Some((0, 10)));
 
 // See: https://github.com/rust-lang-nursery/regex/issues/191
 mat!(many_alternates, r"1|2|3|4|5|6|7|8|9|10|int", "int", Some((0, 3)));

diff --git a/tests/test_backtrack_bytes.rs b/tests/test_backtrack_bytes.rs
@@ -16,7 +16,6 @@ macro_rules! regex_new {
         use regex::internal::ExecBuilder;
         ExecBuilder::new($re)
             .bounded_backtracking()
-            .unicode(false)
             .only_utf8(false)
             .build()
             .map(|e| e.into_byte_regex())
@@ -34,7 +33,6 @@ macro_rules! regex_set_new {
         use regex::internal::ExecBuilder;
         ExecBuilder::new_many($re)
             .bounded_backtracking()
-            .unicode(false)
             .only_utf8(false)
             .build()
             .map(|e| e.into_byte_regex_set())

diff --git a/tests/test_nfa_bytes.rs b/tests/test_nfa_bytes.rs
@@ -17,7 +17,6 @@ macro_rules! regex_new {
         use regex::internal::ExecBuilder;
         ExecBuilder::new($re)
             .nfa()
-            .unicode(false)
             .only_utf8(false)
             .build()
             .map(|e| e.into_byte_regex())
@@ -35,7 +34,6 @@ macro_rules! regex_set_new {
         use regex::internal::ExecBuilder;
         ExecBuilder::new_many($re)
             .nfa()
-            .unicode(false)
             .only_utf8(false)
             .build()
             .map(|e| e.into_byte_regex_set())

diff --git a/tests/unicode.rs b/tests/unicode.rs
@@ -1,31 +1,31 @@
-mat!(uni_literal, u!(r"☃"), "☃", Some((0, 3)));
-mat!(uni_literal_plus, u!(r"☃+"), "☃", Some((0, 3)));
-mat!(uni_literal_casei_plus, u!(r"(?i)☃+"), "☃", Some((0, 3)));
-mat!(uni_class_plus, u!(r"[☃Ⅰ]+"), "☃", Some((0, 3)));
-mat!(uni_one, u!(r"\pN"), "Ⅰ", Some((0, 3)));
-mat!(uni_mixed, u!(r"\pN+"), "Ⅰ1Ⅱ2", Some((0, 8)));
-mat!(uni_not, u!(r"\PN+"), "abⅠ", Some((0, 2)));
-mat!(uni_not_class, u!(r"[\PN]+"), "abⅠ", Some((0, 2)));
-mat!(uni_not_class_neg, u!(r"[^\PN]+"), "abⅠ", Some((2, 5)));
-mat!(uni_case, u!(r"(?i)Δ"), "δ", Some((0, 2)));
-mat!(uni_case_upper, u!(r"\p{Lu}+"), "ΛΘΓΔα", Some((0, 8)));
-mat!(uni_case_upper_nocase_flag, u!(r"(?i)\p{Lu}+"), "ΛΘΓΔα", Some((0, 10)));
-mat!(uni_case_upper_nocase, u!(r"\p{L}+"), "ΛΘΓΔα", Some((0, 10)));
-mat!(uni_case_lower, u!(r"\p{Ll}+"), "ΛΘΓΔα", Some((8, 10)));
+mat!(uni_literal, r"☃", "☃", Some((0, 3)));
+mat!(uni_literal_plus, r"☃+", "☃", Some((0, 3)));
+mat!(uni_literal_casei_plus, r"(?i)☃+", "☃", Some((0, 3)));
+mat!(uni_class_plus, r"[☃Ⅰ]+", "☃", Some((0, 3)));
+mat!(uni_one, r"\pN", "Ⅰ", Some((0, 3)));
+mat!(uni_mixed, r"\pN+", "Ⅰ1Ⅱ2", Some((0, 8)));
+mat!(uni_not, r"\PN+", "abⅠ", Some((0, 2)));
+mat!(uni_not_class, r"[\PN]+", "abⅠ", Some((0, 2)));
+mat!(uni_not_class_neg, r"[^\PN]+", "abⅠ", Some((2, 5)));
+mat!(uni_case, r"(?i)Δ", "δ", Some((0, 2)));
+mat!(uni_case_upper, r"\p{Lu}+", "ΛΘΓΔα", Some((0, 8)));
+mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10)));
+mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10)));
+mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10)));
 
 // Test the Unicode friendliness of Perl character classes.
-mat!(uni_perl_w, u!(r"\w+"), "dδd", Some((0, 4)));
-mat!(uni_perl_w_not, u!(r"\w+"), "⥡", None);
-mat!(uni_perl_w_neg, u!(r"\W+"), "⥡", Some((0, 3)));
-mat!(uni_perl_d, u!(r"\d+"), "1२३9", Some((0, 8)));
-mat!(uni_perl_d_not, u!(r"\d+"), "Ⅱ", None);
-mat!(uni_perl_d_neg, u!(r"\D+"), "Ⅱ", Some((0, 3)));
-mat!(uni_perl_s, u!(r"\s+"), " ", Some((0, 3)));
-mat!(uni_perl_s_not, u!(r"\s+"), "☃", None);
-mat!(uni_perl_s_neg, u!(r"\S+"), "☃", Some((0, 3)));
+mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4)));
+mat!(uni_perl_w_not, r"\w+", "⥡", None);
+mat!(uni_perl_w_neg, r"\W+", "⥡", Some((0, 3)));
+mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8)));
+mat!(uni_perl_d_not, r"\d+", "Ⅱ", None);
+mat!(uni_perl_d_neg, r"\D+", "Ⅱ", Some((0, 3)));
+mat!(uni_perl_s, r"\s+", " ", Some((0, 3)));
+mat!(uni_perl_s_not, r"\s+", "☃", None);
+mat!(uni_perl_s_neg, r"\S+", "☃", Some((0, 3)));
 
 // And do the same for word boundaries.
-mat!(uni_boundary_none, u!(r"\d\b"), "6δ", None);
-mat!(uni_boundary_ogham, u!(r"\d\b"), "6 ", Some((0, 1)));
-mat!(uni_not_boundary_none, u!(r"\d\B"), "6δ", Some((0, 1)));
-mat!(uni_not_boundary_ogham, u!(r"\d\B"), "6 ", None);
+mat!(uni_boundary_none, r"\d\b", "6δ", None);
+mat!(uni_boundary_ogham, r"\d\b", "6 ", Some((0, 1)));
+mat!(uni_not_boundary_none, r"\d\B", "6δ", Some((0, 1)));
+mat!(uni_not_boundary_ogham, r"\d\B", "6 ", None);
diff --git a/tests/word_boundary_ascii.rs b/tests/word_boundary_ascii.rs
@@ -1,9 +1,9 @@
 // ASCII word boundaries are completely oblivious to Unicode characters.
 // For Unicode word boundaries, the tests are precisely inverted.
-matiter!(ascii1, r"\bx\b", "áxβ", (2, 3));
-matiter!(ascii2, r"\Bx\B", "áxβ");
-matiter!(ascii3, r"\B", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5));
+matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3));
+matiter!(ascii2, r"(?-u:\B)x(?-u:\B)", "áxβ");
+matiter!(ascii3, r"(?-u:\B)", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5));
 
-// We can still get Unicode mode in byte regexes.
-matiter!(unicode1, r"(?u:\b)x(?u:\b)", "áxβ");
-matiter!(unicode2, r"(?u:\B)x(?u:\B)", "áxβ", (2, 3));
+// We still get Unicode word boundaries by default in byte regexes.
+matiter!(unicode1, r"\bx\b", "áxβ");
+matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3));