diff --git a/bench/src/bench.rs b/bench/src/bench.rs
index 9c8a924746..a45079edc0 100644
--- a/bench/src/bench.rs
+++ b/bench/src/bench.rs
@@ -59,22 +59,11 @@ pub use ffi::tcl::Regex;
 // Due to macro scoping rules, this definition only applies for the modules
 // defined below. Effectively, it allows us to use the same tests for both
 // native and dynamic regexes.
-#[cfg(not(feature = "re-rust-bytes"))]
 #[cfg(not(feature = "re-rust-plugin"))]
 macro_rules! regex {
     ($re:expr) => { ::Regex::new(&$re.to_owned()).unwrap() }
 }
 
-#[cfg(feature = "re-rust-bytes")]
-macro_rules! regex {
-    ($re:expr) => {{
-        // Always enable the Unicode flag for byte based regexes.
-        // Really, this should have been enabled by default. *sigh*
-        use regex::bytes::RegexBuilder;
-        RegexBuilder::new(&$re.to_owned()).unicode(true).compile().unwrap()
-    }}
-}
-
 // Usage: text!(haystack)
 //
 // Builds a ::Text from an owned string.
diff --git a/src/lib.rs b/src/lib.rs
index 7e4b24cb03..458f2d91ca 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -244,16 +244,11 @@
 //! # Opt out of Unicode support
 //!
 //! The `bytes` sub-module provides a `Regex` type that can be used to match
-//! on `&[u8]`. By default, text is interpreted as ASCII compatible text with
-//! all Unicode support disabled (e.g., `.` matches any byte instead of any
-//! Unicode codepoint). Unicode support can be selectively enabled with the
-//! `u` flag. See the `bytes` module documentation for more details.
-//!
-//! Unicode support can also be selectively *disabled* with the main `Regex`
-//! type that matches on `&str`. For example, `(?-u:\b)` will match an ASCII
-//! word boundary. Note though that invalid UTF-8 is not allowed to be matched
-//! even when the `u` flag is disabled. For example, `(?-u:.)` will return an
-//! error, since `.` matches *any byte* when Unicode support is disabled.
+//! on `&[u8]`. By default, text is interpreted as UTF-8 just like it is with
+//! the main `Regex` type. However, this behavior can be disabled by turning
+//! off the `u` flag, even if doing so could result in matching invalid UTF-8.
+//! For example, when the `u` flag is disabled, `.` will match any byte instead
+//! of any Unicode codepoint.
 //!
 //! # Syntax
 //!
@@ -480,11 +475,8 @@ top-level of this crate. There are two important differences:
 
 1. Matching is done on `&[u8]` instead of `&str`. Additionally, `Vec<u8>` is
 used where `String` would have been used.
 
-2. Regular expressions are compiled with Unicode support *disabled* by
-default. This means that while Unicode regular expressions can only match valid
-UTF-8, regular expressions in this module can match arbitrary bytes. Unicode
-support can be selectively enabled via the `u` flag in regular expressions
-provided by this sub-module.
+2. Unicode support can be disabled even when disabling it would result in
+matching invalid UTF-8 bytes.
 
 # Example: match null terminated string
@@ -492,7 +484,7 @@ This shows how to find all null-terminated strings in a slice of bytes:
 
 ```rust
 # use regex::bytes::Regex;
-let re = Regex::new(r"(?P<cstr>[^\x00]+)\x00").unwrap();
+let re = Regex::new(r"(?-u)(?P<cstr>[^\x00]+)\x00").unwrap();
 let text = b"foo\x00bar\x00baz\x00";
 
 // Extract all of the strings without the null terminator from each match.
@@ -512,7 +504,9 @@ string (e.g., to extract a title from a Matroska file):
 
 ```rust
 # use std::str;
 # use regex::bytes::Regex;
-let re = Regex::new(r"\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))").unwrap();
+let re = Regex::new(
+    r"(?-u)\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))"
+).unwrap();
 let text = b"\x12\xd0\x3b\x5f\x7b\xa9\x85\xe2\x98\x83\x80\x98\x54\x76\x68\x65";
 let caps = re.captures(text).unwrap();
@@ -536,9 +530,9 @@ The supported syntax is pretty much the same as the syntax for Unicode regular
 expressions with a few changes that make sense for matching arbitrary
 bytes:
 
-1. The `u` flag is *disabled* by default, but can be selectively enabled. (The
-opposite is true for the main `Regex` type.) Disabling the `u` flag is said to
-invoke "ASCII compatible" mode.
+1. The `u` flag can be disabled even when disabling it might cause the regex to
+match invalid UTF-8. When the `u` flag is disabled, the regex is said to be in
+"ASCII compatible" mode.
 2. In ASCII compatible mode, neither Unicode codepoints nor Unicode character
 classes are allowed.
 3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`)
diff --git a/src/re_builder.rs b/src/re_builder.rs
index a5ea341125..98be31408c 100644
--- a/src/re_builder.rs
+++ b/src/re_builder.rs
@@ -40,7 +40,7 @@ impl Default for RegexOptions {
 }
 
 macro_rules! define_builder {
-    ($name:ident, $regex_mod:ident, $unicode:expr, $only_utf8:expr) => {
+    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
         pub mod $name {
             use error::Error;
             use exec::ExecBuilder;
@@ -63,7 +63,6 @@ impl RegexBuilder {
     pub fn new(pattern: &str) -> RegexBuilder {
         let mut builder = RegexBuilder(RegexOptions::default());
         builder.0.pats.push(pattern.to_owned());
-        builder.0.unicode = $unicode;
         builder
     }
 
@@ -151,5 +150,5 @@ impl RegexBuilder {
     }
 }
 
-define_builder!(bytes, re_bytes, false, false);
-define_builder!(unicode, re_unicode, true, true);
+define_builder!(bytes, re_bytes, false);
+define_builder!(unicode, re_unicode, true);
diff --git a/tests/api_str.rs b/tests/api_str.rs
index 266b6455b2..e5e667863d 100644
--- a/tests/api_str.rs
+++ b/tests/api_str.rs
@@ -5,7 +5,7 @@ fn empty_match_unicode_find_iter() {
     // Tests that we still yield byte ranges at valid UTF-8 sequence boundaries
    // even when we're susceptible to empty width matches.
-    let re = regex!(u!(r".*?"));
+    let re = regex!(r".*?");
     assert_eq!(vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)],
                findall!(re, "Ⅰ1Ⅱ2"));
 }
@@ -13,7 +13,7 @@
 #[test]
 fn empty_match_unicode_captures_iter() {
     // Same as empty_match_unicode_find_iter, but tests capture iteration.
-    let re = regex!(u!(r".*?"));
+    let re = regex!(r".*?");
     let ms: Vec<_> = re.captures_iter(text!("Ⅰ1Ⅱ2"))
                        .map(|c| c.pos(0).unwrap())
                        .collect();
diff --git a/tests/bytes.rs b/tests/bytes.rs
index c950688fae..fc391b1a5e 100644
--- a/tests/bytes.rs
+++ b/tests/bytes.rs
@@ -5,38 +5,39 @@ struct R<'a>(&'a [u8]);
 impl<'a> R<'a> {
     fn as_bytes(&self) -> &'a [u8] { &self.0 }
 }
 
-mat!(word_boundary, r" \b", " δ", None);
-mat!(word_boundary_unicode, r"(?u) \b", " δ", Some((0, 1)));
-mat!(word_not_boundary, r" \B", " δ", Some((0, 1)));
-mat!(word_not_boundary_unicode, r"(?u) \B", " δ", None);
-
-mat!(perl_w_ascii, r"\w+", "aδ", Some((0, 1)));
-mat!(perl_w_unicode, r"(?u)\w+", "aδ", Some((0, 3)));
-mat!(perl_d_ascii, r"\d+", "1२३9", Some((0, 1)));
-mat!(perl_d_unicode, r"(?u)\d+", "1२३9", Some((0, 8)));
-mat!(perl_s_ascii, r"\s+", " \u{1680}", Some((0, 1)));
-mat!(perl_s_unicode, r"(?u)\s+", " \u{1680}", Some((0, 4)));
+mat!(word_boundary, r"(?-u) \b", " δ", None);
+mat!(word_boundary_unicode, r" \b", " δ", Some((0, 1)));
+mat!(word_not_boundary, r"(?-u) \B", " δ", Some((0, 1)));
+mat!(word_not_boundary_unicode, r" \B", " δ", None);
+
+mat!(perl_w_ascii, r"(?-u)\w+", "aδ", Some((0, 1)));
+mat!(perl_w_unicode, r"\w+", "aδ", Some((0, 3)));
+mat!(perl_d_ascii, r"(?-u)\d+", "1२३9", Some((0, 1)));
+mat!(perl_d_unicode, r"\d+", "1२३9", Some((0, 8)));
+mat!(perl_s_ascii, r"(?-u)\s+", " \u{1680}", Some((0, 1)));
+mat!(perl_s_unicode, r"\s+", " \u{1680}", Some((0, 4)));
 
 // The first `(.+)` matches two Unicode codepoints, but can't match the 5th
 // byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and
 // matches.
-mat!(mixed1, r"(?u)(.+)(?-u)(.+)", R(b"\xCE\x93\xCE\x94\xFF"),
+mat!(mixed1, r"(.+)(?-u)(.+)", R(b"\xCE\x93\xCE\x94\xFF"),
      Some((0, 5)), Some((0, 4)), Some((4, 5)));
 
-mat!(case_ascii_one, r"(?i)a", "A", Some((0, 1)));
-mat!(case_ascii_class, r"(?i)[a-z]+", "AaAaA", Some((0, 5)));
-mat!(case_unicode, r"(?iu)[a-z]+", "aA\u{212A}aA", Some((0, 7)));
-mat!(case_not_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 2)));
+mat!(case_ascii_one, r"(?i-u)a", "A", Some((0, 1)));
+mat!(case_ascii_class, r"(?i-u)[a-z]+", "AaAaA", Some((0, 5)));
+mat!(case_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 7)));
+mat!(case_not_unicode, r"(?i-u)[a-z]+", "aA\u{212A}aA", Some((0, 2)));
 
-mat!(negate_unicode, r"(?u)[^a]", "δ", Some((0, 2)));
-mat!(negate_not_unicode, r"[^a]", "δ", Some((0, 1)));
+mat!(negate_unicode, r"[^a]", "δ", Some((0, 2)));
+mat!(negate_not_unicode, r"(?-u)[^a]", "δ", Some((0, 1)));
 
 // This doesn't match in a normal Unicode regex because the implicit preceding
 // `.*?` is Unicode aware.
-mat!(dotstar_prefix_not_unicode, r"a", R(b"\xFFa"), Some((1, 2)));
+mat!(dotstar_prefix_not_unicode1, r"(?-u)a", R(b"\xFFa"), Some((1, 2)));
+mat!(dotstar_prefix_not_unicode2, r"a", R(b"\xFFa"), Some((1, 2)));
 
 // Have fun with null bytes.
-mat!(null_bytes, r"(?P<cstr>[^\x00]+)\x00",
+mat!(null_bytes, r"(?-u)(?P<cstr>[^\x00]+)\x00",
     R(b"foo\x00"), Some((0, 4)), Some((0, 3)));
 
 // Test that lookahead operators work properly in the face of invalid UTF-8.
diff --git a/tests/crazy.rs b/tests/crazy.rs
index bed66277e5..ade839ade1 100644
--- a/tests/crazy.rs
+++ b/tests/crazy.rs
@@ -1,4 +1,4 @@
-mat!(ascii_literal, u!(r"a"), "a", Some((0, 1)));
+mat!(ascii_literal, r"a", "a", Some((0, 1)));
 
 // Some crazy expressions from regular-expressions.info.
 mat!(match_ranges,
diff --git a/tests/macros_bytes.rs b/tests/macros_bytes.rs
index a68fada744..89c236ff31 100644
--- a/tests/macros_bytes.rs
+++ b/tests/macros_bytes.rs
@@ -5,7 +5,7 @@
 macro_rules! t { ($re:expr) => { text!($re) } }
 macro_rules! bytes { ($text:expr) => { $text } }
 macro_rules! b { ($text:expr) => { bytes!($text) } }
-macro_rules! u { ($re:expr) => { concat!("(?u)", $re) } }
+// macro_rules! u { ($re:expr) => { concat!("(?u)", $re) } }
 
 macro_rules! no_expand {
     ($text:expr) => {{
diff --git a/tests/macros_str.rs b/tests/macros_str.rs
index 7ea29335de..c419ee90dd 100644
--- a/tests/macros_str.rs
+++ b/tests/macros_str.rs
@@ -5,7 +5,7 @@
 macro_rules! t { ($text:expr) => { text!($text) } }
 macro_rules! bytes { ($text:expr) => { $text.as_bytes() } }
 macro_rules! b { ($text:expr) => { bytes!($text) } }
-macro_rules! u { ($re:expr) => { $re } }
+// macro_rules! u { ($re:expr) => { $re } }
 
 macro_rules! no_expand {
     ($text:expr) => {{
diff --git a/tests/regression.rs b/tests/regression.rs
index 05717ea1bc..d96f78aadb 100644
--- a/tests/regression.rs
+++ b/tests/regression.rs
@@ -41,7 +41,7 @@ mat!(regression_alt_in_alt2, r"^(.*?)(\n|\r\n?|$)", "ab\rcd", Some((0, 3)));
 mat!(regression_leftmost_first_prefix, r"z*azb", "azb", Some((0, 3)));
 
 // See: /~https://github.com/rust-lang/regex/issues/76
-mat!(uni_case_lower_nocase_flag, u!(r"(?i)\p{Ll}+"), "ΛΘΓΔα", Some((0, 10)));
+mat!(uni_case_lower_nocase_flag, r"(?i)\p{Ll}+", "ΛΘΓΔα", Some((0, 10)));
 
 // See: /~https://github.com/rust-lang-nursery/regex/issues/191
 mat!(many_alternates, r"1|2|3|4|5|6|7|8|9|10|int", "int", Some((0, 3)));
diff --git a/tests/test_backtrack_bytes.rs b/tests/test_backtrack_bytes.rs
index 57074f1870..4ea60e7d0f 100644
--- a/tests/test_backtrack_bytes.rs
+++ b/tests/test_backtrack_bytes.rs
@@ -16,7 +16,6 @@ macro_rules! regex_new {
         use regex::internal::ExecBuilder;
         ExecBuilder::new($re)
             .bounded_backtracking()
-            .unicode(false)
             .only_utf8(false)
             .build()
             .map(|e| e.into_byte_regex())
@@ -34,7 +33,6 @@ macro_rules! regex_set_new {
         use regex::internal::ExecBuilder;
         ExecBuilder::new_many($re)
             .bounded_backtracking()
-            .unicode(false)
             .only_utf8(false)
             .build()
             .map(|e| e.into_byte_regex_set())
diff --git a/tests/test_nfa_bytes.rs b/tests/test_nfa_bytes.rs
index 83eea01a2d..a084c804fe 100644
--- a/tests/test_nfa_bytes.rs
+++ b/tests/test_nfa_bytes.rs
@@ -17,7 +17,6 @@ macro_rules! regex_new {
         use regex::internal::ExecBuilder;
         ExecBuilder::new($re)
             .nfa()
-            .unicode(false)
             .only_utf8(false)
             .build()
             .map(|e| e.into_byte_regex())
@@ -35,7 +34,6 @@ macro_rules! regex_set_new {
         use regex::internal::ExecBuilder;
         ExecBuilder::new_many($re)
             .nfa()
-            .unicode(false)
             .only_utf8(false)
             .build()
             .map(|e| e.into_byte_regex_set())
diff --git a/tests/unicode.rs b/tests/unicode.rs
index 5357a18c96..48e9a95aaf 100644
--- a/tests/unicode.rs
+++ b/tests/unicode.rs
@@ -1,31 +1,31 @@
-mat!(uni_literal, u!(r"☃"), "☃", Some((0, 3)));
-mat!(uni_literal_plus, u!(r"☃+"), "☃", Some((0, 3)));
-mat!(uni_literal_casei_plus, u!(r"(?i)☃+"), "☃", Some((0, 3)));
-mat!(uni_class_plus, u!(r"[☃Ⅰ]+"), "☃", Some((0, 3)));
-mat!(uni_one, u!(r"\pN"), "Ⅰ", Some((0, 3)));
-mat!(uni_mixed, u!(r"\pN+"), "Ⅰ1Ⅱ2", Some((0, 8)));
-mat!(uni_not, u!(r"\PN+"), "abⅠ", Some((0, 2)));
-mat!(uni_not_class, u!(r"[\PN]+"), "abⅠ", Some((0, 2)));
-mat!(uni_not_class_neg, u!(r"[^\PN]+"), "abⅠ", Some((2, 5)));
-mat!(uni_case, u!(r"(?i)Δ"), "δ", Some((0, 2)));
-mat!(uni_case_upper, u!(r"\p{Lu}+"), "ΛΘΓΔα", Some((0, 8)));
-mat!(uni_case_upper_nocase_flag, u!(r"(?i)\p{Lu}+"), "ΛΘΓΔα", Some((0, 10)));
-mat!(uni_case_upper_nocase, u!(r"\p{L}+"), "ΛΘΓΔα", Some((0, 10)));
-mat!(uni_case_lower, u!(r"\p{Ll}+"), "ΛΘΓΔα", Some((8, 10)));
+mat!(uni_literal, r"☃", "☃", Some((0, 3)));
+mat!(uni_literal_plus, r"☃+", "☃", Some((0, 3)));
+mat!(uni_literal_casei_plus, r"(?i)☃+", "☃", Some((0, 3)));
+mat!(uni_class_plus, r"[☃Ⅰ]+", "☃", Some((0, 3)));
+mat!(uni_one, r"\pN", "Ⅰ", Some((0, 3)));
+mat!(uni_mixed, r"\pN+", "Ⅰ1Ⅱ2", Some((0, 8)));
+mat!(uni_not, r"\PN+", "abⅠ", Some((0, 2)));
+mat!(uni_not_class, r"[\PN]+", "abⅠ", Some((0, 2)));
+mat!(uni_not_class_neg, r"[^\PN]+", "abⅠ", Some((2, 5)));
+mat!(uni_case, r"(?i)Δ", "δ", Some((0, 2)));
+mat!(uni_case_upper, r"\p{Lu}+", "ΛΘΓΔα", Some((0, 8)));
+mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10)));
+mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10)));
+mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10)));
 
 // Test the Unicode friendliness of Perl character classes.
-mat!(uni_perl_w, u!(r"\w+"), "dδd", Some((0, 4)));
-mat!(uni_perl_w_not, u!(r"\w+"), "⥡", None);
-mat!(uni_perl_w_neg, u!(r"\W+"), "⥡", Some((0, 3)));
-mat!(uni_perl_d, u!(r"\d+"), "1२३9", Some((0, 8)));
-mat!(uni_perl_d_not, u!(r"\d+"), "Ⅱ", None);
-mat!(uni_perl_d_neg, u!(r"\D+"), "Ⅱ", Some((0, 3)));
-mat!(uni_perl_s, u!(r"\s+"), " ", Some((0, 3)));
-mat!(uni_perl_s_not, u!(r"\s+"), "☃", None);
-mat!(uni_perl_s_neg, u!(r"\S+"), "☃", Some((0, 3)));
+mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4)));
+mat!(uni_perl_w_not, r"\w+", "⥡", None);
+mat!(uni_perl_w_neg, r"\W+", "⥡", Some((0, 3)));
+mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8)));
+mat!(uni_perl_d_not, r"\d+", "Ⅱ", None);
+mat!(uni_perl_d_neg, r"\D+", "Ⅱ", Some((0, 3)));
+mat!(uni_perl_s, r"\s+", " ", Some((0, 3)));
+mat!(uni_perl_s_not, r"\s+", "☃", None);
+mat!(uni_perl_s_neg, r"\S+", "☃", Some((0, 3)));
 
 // And do the same for word boundaries.
-mat!(uni_boundary_none, u!(r"\d\b"), "6δ", None);
-mat!(uni_boundary_ogham, u!(r"\d\b"), "6 ", Some((0, 1)));
-mat!(uni_not_boundary_none, u!(r"\d\B"), "6δ", Some((0, 1)));
-mat!(uni_not_boundary_ogham, u!(r"\d\B"), "6 ", None);
+mat!(uni_boundary_none, r"\d\b", "6δ", None);
+mat!(uni_boundary_ogham, r"\d\b", "6 ", Some((0, 1)));
+mat!(uni_not_boundary_none, r"\d\B", "6δ", Some((0, 1)));
+mat!(uni_not_boundary_ogham, r"\d\B", "6 ", None);
diff --git a/tests/word_boundary_ascii.rs b/tests/word_boundary_ascii.rs
index 9beb7c0cb1..5a3cf1166c 100644
--- a/tests/word_boundary_ascii.rs
+++ b/tests/word_boundary_ascii.rs
@@ -1,9 +1,9 @@
 // ASCII word boundaries are completely oblivious to Unicode characters.
 // For Unicode word boundaries, the tests are precisely inverted.
-matiter!(ascii1, r"\bx\b", "áxβ", (2, 3));
-matiter!(ascii2, r"\Bx\B", "áxβ");
-matiter!(ascii3, r"\B", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5));
+matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3));
+matiter!(ascii2, r"(?-u:\B)x(?-u:\B)", "áxβ");
+matiter!(ascii3, r"(?-u:\B)", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5));
 
-// We can still get Unicode mode in byte regexes.
-matiter!(unicode1, r"(?u:\b)x(?u:\b)", "áxβ");
-matiter!(unicode2, r"(?u:\B)x(?u:\B)", "áxβ", (2, 3));
+// We still get Unicode word boundaries by default in byte regexes.
+matiter!(unicode1, r"\bx\b", "áxβ");
+matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3));