diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5d5b24c4c..060a2226c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -122,6 +122,8 @@ jobs: - name: Run subset of regex-automata tests if: matrix.build != 'win-gnu' # Just horrifically slow. run: ${{ env.CARGO }} test --verbose --manifest-path regex-automata/Cargo.toml $TARGET + - name: Run regex-lite tests + run: ${{ env.CARGO }} test --verbose --manifest-path regex-lite/Cargo.toml $TARGET - if: matrix.build == 'nightly' name: Run benchmarks as tests run: | diff --git a/Cargo.toml b/Cargo.toml index 50f6ca6de..c8781f39f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ members = [ "regex-automata", "regex-capi", "regex-cli", + "regex-lite", "regex-syntax", "regex-test", ] diff --git a/regex-lite/Cargo.toml b/regex-lite/Cargo.toml new file mode 100644 index 000000000..6724f39e9 --- /dev/null +++ b/regex-lite/Cargo.toml @@ -0,0 +1,36 @@ +[package] +name = "regex-lite" +version = "0.1.0" #:version +authors = ["The Rust Project Developers", "Andrew Gallant "] +license = "MIT OR Apache-2.0" +repository = "https://github.com/rust-lang/regex/tree/master/regex-lite" +documentation = "https://docs.rs/regex-lite" +description = """ +A lightweight regex engine that optimizes for binary size and compilation time. +""" +workspace = ".." +edition = "2021" +rust-version = "1.60.0" +autotests = false + +# Features are documented in the "Crate features" section of the crate docs: +# https://docs.rs/regex-lite/*/#crate-features +[features] +default = ["std"] +std = [] + +[dev-dependencies] +anyhow = "1.0.69" +regex-test = { path = "../regex-test", version = "0.1.0" } + +[[test]] +path = "tests/lib.rs" +name = "integration" + +[package.metadata.docs.rs] +# We want to document all features. 
+all-features = true +# To test this locally, run: +# +# RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features +rustdoc-args = ["--cfg", "docsrs"] diff --git a/regex-lite/LICENSE-APACHE b/regex-lite/LICENSE-APACHE new file mode 100644 index 000000000..16fe87b06 --- /dev/null +++ b/regex-lite/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/regex-lite/LICENSE-MIT b/regex-lite/LICENSE-MIT new file mode 100644 index 000000000..39d4bdb5a --- /dev/null +++ b/regex-lite/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2014 The Rust Project Developers + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
diff --git a/regex-lite/README.md b/regex-lite/README.md new file mode 100644 index 000000000..00d7bdd40 --- /dev/null +++ b/regex-lite/README.md @@ -0,0 +1 @@ +WIP diff --git a/regex-lite/src/error.rs b/regex-lite/src/error.rs new file mode 100644 index 000000000..a6313aa8a --- /dev/null +++ b/regex-lite/src/error.rs @@ -0,0 +1,19 @@ +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Error { + msg: &'static str, +} + +impl Error { + pub(crate) fn new(msg: &'static str) -> Error { + Error { msg } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for Error {} + +impl core::fmt::Display for Error { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "{}", self.msg) + } +} diff --git a/regex-lite/src/hir/mod.rs b/regex-lite/src/hir/mod.rs new file mode 100644 index 000000000..bb05e690b --- /dev/null +++ b/regex-lite/src/hir/mod.rs @@ -0,0 +1,662 @@ +use alloc::{boxed::Box, vec, vec::Vec}; + +use crate::{error::Error, utf8}; + +mod parse; + +/// Returns true if the given character has significance in a regex. +/// +/// Generally speaking, these are the only characters which _must_ be escaped +/// in order to match their literal meaning. For example, to match a literal +/// `|`, one could write `\|`. Sometimes escaping isn't always necessary. For +/// example, `-` is treated as a meta character because of its significance +/// for writing ranges inside of character classes, but the regex `-` will +/// match a literal `-` because `-` has no special meaning outside of character +/// classes. +/// +/// In order to determine whether a character may be escaped at all, the +/// [`is_escapeable_character`] routine should be used. The difference between +/// `is_meta_character` and `is_escapeable_character` is that the latter will +/// return true for some characters that are _not_ meta characters. For +/// example, `%` and `\%` both match a literal `%` in all contexts. 
In other +/// words, `is_escapeable_character` includes "superfluous" escapes. +/// +/// Note that the set of characters for which this function returns `true` or +/// `false` is fixed and won't change in a semver compatible release. (In this +/// case, "semver compatible release" actually refers to the `regex` crate +/// itself, since reducing or expanding the set of meta characters would be a +/// breaking change for not just `regex-syntax` but also `regex` itself.) +/// +/// # Example +/// +/// ``` +/// use regex_lite::is_meta_character; +/// +/// assert!(is_meta_character('?')); +/// assert!(is_meta_character('-')); +/// assert!(is_meta_character('&')); +/// assert!(is_meta_character('#')); +/// +/// assert!(!is_meta_character('%')); +/// assert!(!is_meta_character('/')); +/// assert!(!is_meta_character('!')); +/// assert!(!is_meta_character('"')); +/// assert!(!is_meta_character('e')); +/// ``` +pub fn is_meta_character(c: char) -> bool { + match c { + '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' + | '}' | '^' | '$' | '#' | '&' | '-' | '~' => true, + _ => false, + } +} + +/// Returns true if the given character can be escaped in a regex. +/// +/// This returns true in all cases that `is_meta_character` returns true, but +/// also returns true in some cases where `is_meta_character` returns false. +/// For example, `%` is not a meta character, but it is escapeable. That is, +/// `%` and `\%` both match a literal `%` in all contexts. +/// +/// The purpose of this routine is to provide knowledge about what characters +/// may be escaped. Namely, most regex engines permit "superfluous" escapes +/// where characters without any special significance may be escaped even +/// though there is no actual _need_ to do so. +/// +/// This will return false for some characters. For example, `e` is not +/// escapeable. 
Therefore, `\e` will either result in a parse error (which is +/// true today), or it could backwards compatibly evolve into a new construct +/// with its own meaning. Indeed, that is the purpose of banning _some_ +/// superfluous escapes: it provides a way to evolve the syntax in a compatible +/// manner. +/// +/// # Example +/// +/// ``` +/// use regex_lite::is_escapeable_character; +/// +/// assert!(is_escapeable_character('?')); +/// assert!(is_escapeable_character('-')); +/// assert!(is_escapeable_character('&')); +/// assert!(is_escapeable_character('#')); +/// assert!(is_escapeable_character('%')); +/// assert!(is_escapeable_character('/')); +/// assert!(is_escapeable_character('!')); +/// assert!(is_escapeable_character('"')); +/// +/// assert!(!is_escapeable_character('e')); +/// ``` +pub fn is_escapeable_character(c: char) -> bool { + // Certainly escapeable if it's a meta character. + if is_meta_character(c) { + return true; + } + // Any character that isn't ASCII is definitely not escapeable. There's + // no real need to allow things like \☃ right? + if !c.is_ascii() { + return false; + } + // Otherwise, we basically say that everything is escapeable unless it's a + // letter or digit. Things like \3 are either octal (when enabled) or an + // error, and we should keep it that way. Otherwise, letters are reserved + // for adding new syntax in a backwards compatible way. + match c { + '0'..='9' | 'A'..='Z' | 'a'..='z' => false, + // While not currently supported, we keep these as not escapeable to + // give us some flexibility with respect to supporting the \< and + // \> word boundary assertions in the future. By rejecting them as + // escapeable, \< and \> will result in a parse error. Thus, we can + // turn them into something else in the future without it being a + // backwards incompatible change. + '<' | '>' => false, + _ => true, + } +} + +/// The configuration for a regex parser. 
+#[derive(Clone, Copy, Debug)] +pub(crate) struct Config { + /// The maximum number of times we're allowed to recurse. + /// + /// Note that unlike the regex-syntax parser, we actually use recursion in + /// this parser for simplicity. My hope is that by setting a conservative + /// default call limit and providing a way to configure it, that we can + /// keep this simplification. But if we must, we can re-work the parser to + /// put the call stack on the heap like regex-syntax does. + pub(crate) nest_limit: u32, + /// Various flags that control how a pattern is interpreted. + pub(crate) flags: Flags, +} + +impl Default for Config { + fn default() -> Config { + Config { nest_limit: 50, flags: Flags::default() } + } +} + +/// Various flags that control the interpretation of the pattern. +/// +/// These can be set via explicit configuration in code, or change dynamically +/// during parsing via inline flags. For example, `foo(?i:bar)baz` will match +/// `foo` and `baz` case sensitively and `bar` case insensitively (assuming a +/// default configuration). +#[derive(Clone, Copy, Debug, Default)] +pub(crate) struct Flags { + /// Whether to match case insensitively. + /// + /// This is the `i` flag. + pub(crate) case_insensitive: bool, + /// Whether `^` and `$` should be treated as line anchors or not. + /// + /// This is the `m` flag. + pub(crate) multi_line: bool, + /// Whether `.` should match line terminators or not. + /// + /// This is the `s` flag. + pub(crate) dot_matches_new_line: bool, + /// Whether to swap the meaning of greedy and non-greedy operators. + /// + /// This is the `U` flag. + pub(crate) swap_greed: bool, + /// Whether to enable CRLF mode. + /// + /// This is the `R` flag. + pub(crate) crlf: bool, + /// Whether to ignore whitespace. i.e., verbose mode. + /// + /// This is the `x` flag. 
+ pub(crate) ignore_whitespace: bool, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) struct Hir { + kind: HirKind, + is_start_anchored: bool, + is_match_empty: bool, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) enum HirKind { + Empty, + Char(char), + Class(Class), + Look(Look), + Repetition(Repetition), + Capture(Capture), + Concat(Vec), + Alternation(Vec), +} + +impl Hir { + /// Parses the given pattern string with the given configuration into a + /// structured representation. If the pattern is invalid, then an error + /// is returned. + pub(crate) fn parse(config: Config, pattern: &str) -> Result { + self::parse::Parser::new(config, pattern).parse() + } + + /// Returns the underlying kind of this high-level intermediate + /// representation. + /// + /// Note that there is explicitly no way to build an `Hir` directly from + /// an `HirKind`. If you need to do that, then you must do case analysis + /// on the `HirKind` and call the appropriate smart constructor on `Hir`. + pub(crate) fn kind(&self) -> &HirKind { + &self.kind + } + + /// Returns true if and only if this Hir expression can only match at the + /// beginning of a haystack. + pub(crate) fn is_start_anchored(&self) -> bool { + self.is_start_anchored + } + + /// Returns true if and only if this Hir expression can match the empty + /// string. 
+ pub(crate) fn is_match_empty(&self) -> bool { + self.is_match_empty + } + + fn fail() -> Hir { + let kind = HirKind::Class(Class { ranges: vec![] }); + Hir { kind, is_start_anchored: false, is_match_empty: false } + } + + fn empty() -> Hir { + let kind = HirKind::Empty; + Hir { kind, is_start_anchored: false, is_match_empty: true } + } + + fn char(ch: char) -> Hir { + let kind = HirKind::Char(ch); + Hir { kind, is_start_anchored: false, is_match_empty: false } + } + + fn class(class: Class) -> Hir { + let kind = HirKind::Class(class); + Hir { kind, is_start_anchored: false, is_match_empty: false } + } + + fn look(look: Look) -> Hir { + let kind = HirKind::Look(look); + Hir { + kind, + is_start_anchored: matches!(look, Look::Start), + is_match_empty: true, + } + } + + fn repetition(rep: Repetition) -> Hir { + if rep.min == 0 && rep.max == Some(0) { + return Hir::empty(); + } else if rep.min == 1 && rep.max == Some(1) { + return *rep.sub; + } + let is_start_anchored = rep.min > 0 && rep.sub.is_start_anchored; + let is_match_empty = rep.min == 0 || rep.sub.is_match_empty; + let kind = HirKind::Repetition(rep); + Hir { kind, is_start_anchored, is_match_empty } + } + + fn capture(cap: Capture) -> Hir { + let is_start_anchored = cap.sub.is_start_anchored; + let is_match_empty = cap.sub.is_match_empty; + let kind = HirKind::Capture(cap); + Hir { kind, is_start_anchored, is_match_empty } + } + + fn concat(mut subs: Vec) -> Hir { + if subs.is_empty() { + Hir::empty() + } else if subs.len() == 1 { + subs.pop().unwrap() + } else { + let is_start_anchored = subs[0].is_start_anchored; + let is_match_empty = subs.iter().all(|s| s.is_match_empty); + let kind = HirKind::Concat(subs); + Hir { kind, is_start_anchored, is_match_empty } + } + } + + fn alternation(mut subs: Vec) -> Hir { + if subs.is_empty() { + Hir::fail() + } else if subs.len() == 1 { + subs.pop().unwrap() + } else { + let is_start_anchored = subs.iter().all(|s| s.is_start_anchored); + let is_match_empty = 
subs.iter().any(|s| s.is_match_empty); + let kind = HirKind::Alternation(subs); + Hir { kind, is_start_anchored, is_match_empty } + } + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) struct Class { + pub(crate) ranges: Vec, +} + +impl Class { + /// Create a new class from the given ranges. The ranges may be provided + /// in any order or may even overlap. They will be automatically + /// canonicalized. + fn new>(ranges: I) -> Class { + let mut class = Class { ranges: ranges.into_iter().collect() }; + class.canonicalize(); + class + } + + /// Add a new range to this set. + fn push(&mut self, range: ClassRange) { + self.ranges.push(range); + self.canonicalize(); + } + + /// Expand this class such that it matches the ASCII codepoints in this set + /// case insensitively. + fn ascii_case_fold(&mut self) { + let len = self.ranges.len(); + for i in 0..len { + if let Some(folded) = self.ranges[i].ascii_case_fold() { + self.ranges.push(folded); + } + } + self.canonicalize(); + } + + /// Negate this set. + /// + /// For all `x` where `x` is any element, if `x` was in this set, then it + /// will not be in this set after negation. + fn negate(&mut self) { + const MIN: char = '\x00'; + const MAX: char = char::MAX; + + if self.ranges.is_empty() { + self.ranges.push(ClassRange { start: MIN, end: MAX }); + return; + } + + // There should be a way to do this in-place with constant memory, + // but I couldn't figure out a simple way to do it. So just append + // the negation to the end of this range, and then drain it before + // we're done. + let drain_end = self.ranges.len(); + + // If our class doesn't start the minimum possible char, then negation + // needs to include all codepoints up to the minimum in this set. + if self.ranges[0].start > MIN { + self.ranges.push(ClassRange { + start: MIN, + // OK because we know it's bigger than MIN. 
+ end: prev_char(self.ranges[0].start).unwrap(), + }); + } + for i in 1..drain_end { + // let lower = self.ranges[i - 1].upper().increment(); + // let upper = self.ranges[i].lower().decrement(); + // self.ranges.push(I::create(lower, upper)); + self.ranges.push(ClassRange { + // OK because we know i-1 is never the last range and therefore + // there must be a range greater than it. It therefore follows + // that 'end' can never be char::MAX, and thus there must be + // a next char. + start: next_char(self.ranges[i - 1].end).unwrap(), + // Since 'i' is guaranteed to never be the first range, it + // follows that there is always a range before this and thus + // 'start' can never be '\x00'. Thus, there must be a previous + // char. + end: prev_char(self.ranges[i].start).unwrap(), + }); + } + if self.ranges[drain_end - 1].end < MAX { + // let lower = self.ranges[drain_end - 1].upper().increment(); + // self.ranges.push(I::create(lower, I::Bound::max_value())); + self.ranges.push(ClassRange { + // OK because we know 'end' is less than char::MAX, and thus + // there is a next char. + start: next_char(self.ranges[drain_end - 1].end).unwrap(), + end: MAX, + }); + } + self.ranges.drain(..drain_end); + // We don't need to canonicalize because we processed the ranges above + // in canonical order and the new ranges we added based on those are + // also necessarily in canonical order. + } + + /// Union this set with the given set, in place. + fn union(&mut self, other: &Class) { + if other.ranges.is_empty() || self.ranges == other.ranges { + return; + } + // This could almost certainly be done more efficiently. + self.ranges.extend(&other.ranges); + self.canonicalize(); + } + + /// Converts this set into a canonical ordering. + fn canonicalize(&mut self) { + if self.is_canonical() { + return; + } + self.ranges.sort(); + assert!(!self.ranges.is_empty()); + + // Is there a way to do this in-place with constant memory? I couldn't + // figure out a way to do it. 
So just append the canonicalization to + // the end of this range, and then drain it before we're done. + let drain_end = self.ranges.len(); + for oldi in 0..drain_end { + // If we've added at least one new range, then check if we can + // merge this range in the previously added range. + if self.ranges.len() > drain_end { + let (last, rest) = self.ranges.split_last_mut().unwrap(); + if let Some(union) = last.union(&rest[oldi]) { + *last = union; + continue; + } + } + self.ranges.push(self.ranges[oldi]); + } + self.ranges.drain(..drain_end); + } + + /// Returns true if and only if this class is in a canonical ordering. + fn is_canonical(&self) -> bool { + for pair in self.ranges.windows(2) { + if pair[0] >= pair[1] { + return false; + } + if pair[0].is_contiguous(&pair[1]) { + return false; + } + } + true + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd, Ord)] +pub(crate) struct ClassRange { + pub(crate) start: char, + pub(crate) end: char, +} + +impl ClassRange { + /// Apply simple case folding to this byte range. Only ASCII case mappings + /// (for A-Za-z) are applied. + /// + /// Additional ranges are appended to the given vector. Canonical ordering + /// is *not* maintained in the given vector. + fn ascii_case_fold(&self) -> Option { + if !(ClassRange { start: 'a', end: 'z' }).is_intersection_empty(self) { + let start = core::cmp::max(self.start, 'a'); + let end = core::cmp::min(self.end, 'z'); + return Some(ClassRange { + start: char::try_from(u32::from(start) - 32).unwrap(), + end: char::try_from(u32::from(end) - 32).unwrap(), + }); + } + if !(ClassRange { start: 'A', end: 'Z' }).is_intersection_empty(self) { + let start = core::cmp::max(self.start, 'A'); + let end = core::cmp::min(self.end, 'Z'); + return Some(ClassRange { + start: char::try_from(u32::from(start) + 32).unwrap(), + end: char::try_from(u32::from(end) + 32).unwrap(), + }); + } + None + } + + /// Union the given overlapping range into this range. 
+ /// + /// If the two ranges aren't contiguous, then this returns `None`. + fn union(&self, other: &ClassRange) -> Option { + if !self.is_contiguous(other) { + return None; + } + let start = core::cmp::min(self.start, other.start); + let end = core::cmp::max(self.end, other.end); + Some(ClassRange { start, end }) + } + + /// Returns true if and only if the two ranges are contiguous. Two ranges + /// are contiguous if and only if the ranges are either overlapping or + /// adjacent. + fn is_contiguous(&self, other: &ClassRange) -> bool { + let (s1, e1) = (u32::from(self.start), u32::from(self.end)); + let (s2, e2) = (u32::from(other.start), u32::from(other.end)); + core::cmp::max(s1, s2) <= core::cmp::min(e1, e2).saturating_add(1) + } + + /// Returns true if and only if the intersection of this range and the + /// other range is empty. + fn is_intersection_empty(&self, other: &ClassRange) -> bool { + let (s1, e1) = (self.start, self.end); + let (s2, e2) = (other.start, other.end); + core::cmp::max(s1, s2) > core::cmp::min(e1, e2) + } +} + +/// The high-level intermediate representation for a look-around assertion. +/// +/// An assertion match is always zero-length. Also called an "empty match." +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) enum Look { + /// Match the beginning of text. Specifically, this matches at the starting + /// position of the input. + Start = 1 << 0, + /// Match the end of text. Specifically, this matches at the ending + /// position of the input. + End = 1 << 1, + /// Match the beginning of a line or the beginning of text. Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following a `\n` character. + StartLF = 1 << 2, + /// Match the end of a line or the end of text. Specifically, this matches + /// at the end position of the input, or at the position immediately + /// preceding a `\n` character. 
+ EndLF = 1 << 3, + /// Match the beginning of a line or the beginning of text. Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following either a `\r` or `\n` character, but never after + /// a `\r` when a `\n` follows. + StartCRLF = 1 << 4, + /// Match the end of a line or the end of text. Specifically, this matches + /// at the end position of the input, or at the position immediately + /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r` + /// precedes it. + EndCRLF = 1 << 5, + /// Match an ASCII-only word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + Word = 1 << 6, + /// Match an ASCII-only negation of a word boundary. + WordNegate = 1 << 7, +} + +impl Look { + /// Returns true if the given position in the given haystack matches this + /// look-around assertion. 
+ pub(crate) fn is_match(&self, haystack: &[u8], at: usize) -> bool { + use self::Look::*; + + match *self { + Start => at == 0, + End => at == haystack.len(), + StartLF => at == 0 || haystack[at - 1] == b'\n', + EndLF => at == haystack.len() || haystack[at] == b'\n', + StartCRLF => { + at == 0 + || haystack[at - 1] == b'\n' + || (haystack[at - 1] == b'\r' + && (at >= haystack.len() || haystack[at] != b'\n')) + } + EndCRLF => { + at == haystack.len() + || haystack[at] == b'\r' + || (haystack[at] == b'\n' + && (at == 0 || haystack[at - 1] != b'\r')) + } + Word => { + let word_before = + at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + word_before != word_after + } + WordNegate => { + let word_before = + at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + word_before == word_after + } + } + } +} + +/// The high-level intermediate representation of a repetition operator. +/// +/// A repetition operator permits the repetition of an arbitrary +/// sub-expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) struct Repetition { + /// The minimum range of the repetition. + /// + /// Note that special cases like `?`, `+` and `*` all get translated into + /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively. + /// + /// When `min` is zero, this expression can match the empty string + /// regardless of what its sub-expression is. + pub(crate) min: u32, + /// The maximum range of the repetition. + /// + /// Note that when `max` is `None`, `min` acts as a lower bound but where + /// there is no upper bound. For something like `x{5}` where the min and + /// max are equivalent, `min` will be set to `5` and `max` will be set to + /// `Some(5)`. + pub(crate) max: Option, + /// Whether this repetition operator is greedy or not. A greedy operator + /// will match as much as it can. 
/// Returns the Unicode scalar value immediately before `ch`.
///
/// The surrogate gap is skipped, so the predecessor of U+E000 is U+D7FF.
/// Returns `None` when `ch` is U+0000, which has no predecessor.
fn prev_char(ch: char) -> Option<char> {
    // Skip over the surrogate range.
    if ch == '\u{E000}' {
        return Some('\u{D7FF}');
    }
    // `checked_sub` yields `None` only for U+0000. For every other input the
    // result is a valid scalar value because U+E000 was handled above.
    char::from_u32(u32::from(ch).checked_sub(1)?)
}
// Reported when the pattern ends where a capture group name should begin.
// (Spelling of "expected" corrected in the message text.)
const ERR_MISSING_GROUP_NAME: &str =
    "expected capture group name, but got end of pattern";
hexadecimal number, but saw end of pattern first"; +const ERR_HEX_FIXED_INVALID_DIGIT: &str = + "expected fixed length hexadecimal number, but got non-hex digit"; +const ERR_HEX_FIXED_INVALID: &str = + "got invalid fixed length hexadecimal number"; +const ERR_HEX_UNEXPECTED_EOF: &str = + "expected hexadecimal number, but saw end of pattern first"; +const ERR_ESCAPE_UNEXPECTED_EOF: &str = + "saw start of escape sequence, but saw end of pattern before it finished"; +const ERR_BACKREF_UNSUPPORTED: &str = "backreferences are not supported"; +const ERR_UNICODE_CLASS_UNSUPPORTED: &str = + "Unicode character classes are not supported"; +const ERR_ESCAPE_UNRECOGNIZED: &str = "unrecognized escape sequence"; +const ERR_POSIX_CLASS_UNRECOGNIZED: &str = + "unrecognized POSIX character class"; +const ERR_UNCOUNTED_REP_SUB_MISSING: &str = + "uncounted repetition operator must be applied to a sub-expression"; +const ERR_COUNTED_REP_SUB_MISSING: &str = + "counted repetition operator must be applied to a sub-expression"; +const ERR_COUNTED_REP_UNCLOSED: &str = + "found unclosed counted repetition operator"; +const ERR_COUNTED_REP_MIN_UNCLOSED: &str = + "found incomplete and unclosed counted repetition operator"; +const ERR_COUNTED_REP_COMMA_UNCLOSED: &str = + "found counted repetition operator with a comma that is unclosed"; +const ERR_COUNTED_REP_MIN_MAX_UNCLOSED: &str = + "found counted repetition with min and max that is unclosed"; +const ERR_COUNTED_REP_INVALID: &str = + "expected closing brace for counted repetition, but got something else"; +const ERR_COUNTED_REP_INVALID_RANGE: &str = + "found counted repetition with a min bigger than its max"; +const ERR_CLASS_UNCLOSED_AFTER_ITEM: &str = + "non-empty character class has no closing bracket"; +const ERR_CLASS_INVALID_RANGE_ITEM: &str = + "character class ranges must start and end with a single character"; +const ERR_CLASS_INVALID_ITEM: &str = + "invalid escape sequence in character class"; +const ERR_CLASS_UNCLOSED_AFTER_DASH: 
&str = + "non-empty character class has no closing bracket after dash"; +const ERR_CLASS_UNCLOSED_AFTER_NEGATION: &str = + "negated character class has no closing bracket"; +const ERR_CLASS_UNCLOSED_AFTER_CLOSING: &str = + "character class begins with literal ']' but has no closing bracket"; +const ERR_CLASS_INVALID_RANGE: &str = "invalid range in character class"; +const ERR_CLASS_UNCLOSED: &str = "found unclosed character class"; +const ERR_CLASS_NEST_UNSUPPORTED: &str = + "nested character classes are not supported"; +const ERR_CLASS_INTERSECTION_UNSUPPORTED: &str = + "character class intersection is not supported"; +const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str = + "character class difference is not supported"; +const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED: &str = + "character class symmetric difference is not supported"; + +/// A regular expression parser. +/// +/// This parses a string representation of a regular expression into an +/// abstract syntax tree. The size of the tree is proportional to the length +/// of the regular expression pattern. +/// +/// A `Parser` can be configured in more detail via a [`ParserBuilder`]. +#[derive(Clone, Debug)] +pub(super) struct Parser<'a> { + /// The configuration of the parser as given by the caller. + config: Config, + /// The pattern we're parsing as given by the caller. + pattern: &'a str, + /// The call depth of the parser. This is incremented for each + /// sub-expression parsed. Its peak value is the maximum nesting of the + /// pattern. + depth: Cell, + /// The current position of the parser. + pos: Cell, + /// The current codepoint of the parser. The codepoint corresponds to the + /// codepoint encoded in `pattern` beginning at `pos`. + /// + /// This is `None` if and only if `pos == pattern.len()`. + char: Cell>, + /// The current capture index. + capture_index: Cell, + /// The flags that are currently set. + flags: RefCell, + /// A sorted sequence of capture names. 
This is used to detect duplicate + /// capture names and report an error if one is detected. + capture_names: RefCell>, + /// A scratch buffer used in various places. Mostly this is used to + /// accumulate relevant characters from parts of a pattern. + scratch: RefCell, +} + +/// The constructor and a variety of helper routines. +impl<'a> Parser<'a> { + /// Build a parser from this configuration with the given pattern. + pub(super) fn new(config: Config, pattern: &'a str) -> Parser<'a> { + Parser { + config, + pattern, + depth: Cell::new(0), + pos: Cell::new(0), + char: Cell::new(pattern.chars().next()), + capture_index: Cell::new(0), + flags: RefCell::new(config.flags), + capture_names: RefCell::new(vec![]), + scratch: RefCell::new(String::new()), + } + } + + /// Returns the full pattern string that we're parsing. + fn pattern(&self) -> &str { + self.pattern + } + + /// Return the current byte offset of the parser. + /// + /// The offset starts at `0` from the beginning of the regular expression + /// pattern string. + fn pos(&self) -> usize { + self.pos.get() + } + + /// Increments the call depth of the parser. + /// + /// If the call depth would exceed the configured nest limit, then this + /// returns an error. + /// + /// This returns the old depth. + fn increment_depth(&self) -> Result { + let old = self.depth.get(); + // OK because our depth starts at 0, and we return an error if it + // ever reaches the limit. So the call depth can never exceed u32::MAX. + let new = old.checked_add(1).unwrap(); + if new >= self.config.nest_limit { + return Err(Error::new(ERR_TOO_MUCH_NESTING)); + } + self.depth.set(new); + Ok(old) + } + + /// Decrements the call depth of the parser. + /// + /// This panics if the current depth is 0. + fn decrement_depth(&self) { + let old = self.depth.get(); + // If this fails then the caller has a bug in how they're incrementing + // and decrementing the depth of the parser's call stack. 
+ let new = old.checked_sub(1).unwrap(); + self.depth.set(new); + } + + /// Return the codepoint at the current position of the parser. + /// + /// This panics if the parser is positioned at the end of the pattern. + fn char(&self) -> char { + self.char.get().expect("codepoint, but parser is done") + } + + /// Return the character at the given position. + /// + /// This panics if the given position does not point to a valid char. + fn char_at(&self, i: usize) -> char { + self.pattern()[i..] + .chars() + .next() + .unwrap_or_else(|| panic!("expected char at offset {}", i)) + } + + /// Returns true if the next call to `bump` would return false. + fn is_done(&self) -> bool { + self.pos() == self.pattern.len() + } + + /// Returns the flags that are current set for this regex. + fn flags(&self) -> Flags { + *self.flags.borrow() + } + + /// Bump the parser to the next Unicode scalar value. + /// + /// If the end of the input has been reached, then `false` is returned. + fn bump(&self) -> bool { + if self.is_done() { + return false; + } + self.pos.set(self.pos() + self.char().len_utf8()); + self.char.set(self.pattern()[self.pos()..].chars().next()); + self.char.get().is_some() + } + + /// If the substring starting at the current position of the parser has + /// the given prefix, then bump the parser to the character immediately + /// following the prefix and return true. Otherwise, don't bump the parser + /// and return false. + fn bump_if(&self, prefix: &str) -> bool { + if self.pattern()[self.pos()..].starts_with(prefix) { + for _ in 0..prefix.chars().count() { + self.bump(); + } + true + } else { + false + } + } + + /// Bump the parser, and if the `x` flag is enabled, bump through any + /// subsequent spaces. Return true if and only if the parser is not done. 
+ fn bump_and_bump_space(&self) -> bool { + if !self.bump() { + return false; + } + self.bump_space(); + !self.is_done() + } + + /// If the `x` flag is enabled (i.e., whitespace insensitivity with + /// comments), then this will advance the parser through all whitespace + /// and comments to the next non-whitespace non-comment byte. + /// + /// If the `x` flag is disabled, then this is a no-op. + /// + /// This should be used selectively throughout the parser where + /// arbitrary whitespace is permitted when the `x` flag is enabled. For + /// example, `{ 5 , 6}` is equivalent to `{5,6}`. + fn bump_space(&self) { + if !self.flags().ignore_whitespace { + return; + } + while !self.is_done() { + if self.char().is_whitespace() { + self.bump(); + } else if self.char() == '#' { + self.bump(); + while !self.is_done() { + let c = self.char(); + self.bump(); + if c == '\n' { + break; + } + } + } else { + break; + } + } + } + + /// Peek at the next character in the input without advancing the parser. + /// + /// If the input has been exhausted, then this returns `None`. + fn peek(&self) -> Option { + if self.is_done() { + return None; + } + self.pattern()[self.pos() + self.char().len_utf8()..].chars().next() + } + + /// Peeks at the next character in the pattern from the current offset, and + /// will ignore spaces when the parser is in whitespace insensitive mode. + fn peek_space(&self) -> Option { + if !self.flags().ignore_whitespace { + return self.peek(); + } + if self.is_done() { + return None; + } + let mut start = self.pos() + self.char().len_utf8(); + let mut in_comment = false; + for (i, ch) in self.pattern()[start..].char_indices() { + if ch.is_whitespace() { + continue; + } else if !in_comment && ch == '#' { + in_comment = true; + } else if in_comment && ch == '\n' { + in_comment = false; + } else { + start += i; + break; + } + } + self.pattern()[start..].chars().next() + } + + /// Return the next capturing index. 
Each subsequent call increments the + /// internal index. Since the way capture indices are computed is a public + /// API guarantee, use of this routine depends on the parser being depth + /// first and left-to-right. + /// + /// If the capture limit is exceeded, then an error is returned. + fn next_capture_index(&self) -> Result { + let current = self.capture_index.get(); + let next = current + .checked_add(1) + .ok_or_else(|| Error::new(ERR_TOO_MANY_CAPTURES))?; + self.capture_index.set(next); + Ok(next) + } + + /// Adds the given capture name to this parser. If this capture name has + /// already been used, then an error is returned. + fn add_capture_name(&self, name: &str) -> Result<(), Error> { + let mut names = self.capture_names.borrow_mut(); + match names.binary_search_by(|n| name.cmp(n)) { + Ok(i) => Err(Error::new(ERR_DUPLICATE_CAPTURE_NAME)), + Err(i) => { + names.insert(i, name.to_string()); + Ok(()) + } + } + } + + /// Returns true if and only if the parser is positioned at a look-around + /// prefix. The conditions under which this returns true must always + /// correspond to a regular expression that would otherwise be consider + /// invalid. + /// + /// This should only be called immediately after parsing the opening of + /// a group or a set of flags. + fn is_lookaround_prefix(&self) -> bool { + self.bump_if("?=") + || self.bump_if("?!") + || self.bump_if("?<=") + || self.bump_if("? Parser<'a> { + pub(super) fn parse(&self) -> Result { + let depth = self.increment_depth()?; + let mut alternates = vec![]; + let mut concat = vec![]; + loop { + self.bump_space(); + if self.is_done() { + break; + } + match self.char() { + '(' => { + // Save the old flags and reset them only when we close + // the group. + let oldflags = *self.flags.borrow(); + if let Some(sub) = self.parse_group()? { + concat.push(sub); + // We only reset them here because if 'parse_group' + // returns None, then that means it handled a flag + // directive, e.g., '(?ism)'. 
And the whole point is + // that those flags remain active until either disabled + // or the end of the pattern or current group. + *self.flags.borrow_mut() = oldflags; + } + if self.char.get() != Some(')') { + return Err(Error::new(ERR_UNCLOSED_GROUP)); + } + self.bump(); + } + ')' => { + if depth == 0 { + return Err(Error::new(ERR_UNOPENED_GROUP)); + } + break; + } + '|' => { + alternates.push(Hir::concat(core::mem::take(&mut concat))); + self.bump(); + } + '[' => concat.push(self.parse_class()?), + '?' | '*' | '+' => { + concat = self.parse_uncounted_repetition(concat)?; + } + '{' => { + concat = self.parse_counted_repetition(concat)?; + } + _ => concat.push(self.parse_primitive()?), + } + } + self.decrement_depth(); + alternates.push(Hir::concat(concat)); + // N.B. This strips off the "alternation" if there's only one branch. + Ok(Hir::alternation(alternates)) + } + + /// Parses a "primitive" pattern. A primitive is any expression that does + /// not contain any sub-expressions. + /// + /// This assumes the parser is pointing at the beginning of the primitive. + fn parse_primitive(&self) -> Result { + let ch = self.char(); + self.bump(); + match ch { + '\\' => self.parse_escape(), + '.' => Ok(self.hir_dot()), + '^' => Ok(self.hir_anchor_start()), + '$' => Ok(self.hir_anchor_end()), + ch => Ok(self.hir_char(ch)), + } + } + + /// Parse an escape sequence. This always results in a "primitive" HIR, + /// that is, an HIR with no sub-expressions. + /// + /// This assumes the parser is positioned at the start of the sequence, + /// immediately *after* the `\`. It advances the parser to the first + /// position immediately following the escape sequence. + fn parse_escape(&self) -> Result { + if self.is_done() { + return Err(Error::new(ERR_ESCAPE_UNEXPECTED_EOF)); + } + let ch = self.char(); + // Put some of the more complicated routines into helpers. 
+ match ch { + '0'..='9' => return Err(Error::new(ERR_BACKREF_UNSUPPORTED)), + 'p' | 'P' => { + return Err(Error::new(ERR_UNICODE_CLASS_UNSUPPORTED)) + } + 'x' | 'u' | 'U' => return self.parse_hex(), + 'd' | 's' | 'w' | 'D' | 'S' | 'W' => { + return Ok(self.parse_perl_class()); + } + _ => {} + } + + // Handle all of the one letter sequences inline. + self.bump(); + if hir::is_meta_character(ch) || hir::is_escapeable_character(ch) { + return Ok(self.hir_char(ch)); + } + let special = |ch| Ok(self.hir_char(ch)); + match ch { + 'a' => special('\x07'), + 'f' => special('\x0C'), + 't' => special('\t'), + 'n' => special('\n'), + 'r' => special('\r'), + 'v' => special('\x0B'), + 'A' => Ok(Hir::look(hir::Look::Start)), + 'z' => Ok(Hir::look(hir::Look::End)), + 'b' => Ok(Hir::look(hir::Look::Word)), + 'B' => Ok(Hir::look(hir::Look::WordNegate)), + _ => Err(Error::new(ERR_ESCAPE_UNRECOGNIZED)), + } + } + + /// Parse a hex representation of a Unicode codepoint. This handles both + /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to + /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to + /// the first character immediately following the hexadecimal literal. + fn parse_hex(&self) -> Result { + let digit_len = match self.char() { + 'x' => 2, + 'u' => 4, + 'U' => 8, + unk => unreachable!( + "invalid start of fixed length hexadecimal number {}", + unk + ), + }; + if !self.bump_and_bump_space() { + return Err(Error::new(ERR_HEX_UNEXPECTED_EOF)); + } + if self.char() == '{' { + self.parse_hex_brace() + } else { + self.parse_hex_digits(digit_len) + } + } + + /// Parse an N-digit hex representation of a Unicode codepoint. This + /// expects the parser to be positioned at the first digit and will advance + /// the parser to the first character immediately following the escape + /// sequence. + /// + /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`) + /// or 8 (for `\UNNNNNNNN`). 
+ fn parse_hex_digits(&self, digit_len: usize) -> Result { + let mut scratch = String::new(); + for i in 0..digit_len { + if i > 0 && !self.bump_and_bump_space() { + return Err(Error::new(ERR_HEX_FIXED_UNEXPECTED_EOF)); + } + if !is_hex(self.char()) { + return Err(Error::new(ERR_HEX_FIXED_INVALID_DIGIT)); + } + scratch.push(self.char()); + } + // The final bump just moves the parser past the literal, which may + // be EOF. + self.bump_and_bump_space(); + match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32) { + None => Err(Error::new(ERR_HEX_FIXED_INVALID)), + Some(ch) => Ok(self.hir_char(ch)), + } + } + + /// Parse a hex representation of any Unicode scalar value. This expects + /// the parser to be positioned at the opening brace `{` and will advance + /// the parser to the first character following the closing brace `}`. + fn parse_hex_brace(&self) -> Result { + let mut scratch = String::new(); + while self.bump_and_bump_space() && self.char() != '}' { + if !is_hex(self.char()) { + return Err(Error::new(ERR_HEX_BRACE_INVALID_DIGIT)); + } + scratch.push(self.char()); + } + if self.is_done() { + return Err(Error::new(ERR_HEX_BRACE_UNEXPECTED_EOF)); + } + assert_eq!(self.char(), '}'); + self.bump_and_bump_space(); + + if scratch.is_empty() { + return Err(Error::new(ERR_HEX_BRACE_EMPTY)); + } + match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32) { + None => Err(Error::new(ERR_HEX_BRACE_INVALID)), + Some(ch) => Ok(self.hir_char(ch)), + } + } + + /// Parse a decimal number into a u32 while trimming leading and trailing + /// whitespace. + /// + /// This expects the parser to be positioned at the first position where + /// a decimal digit could occur. This will advance the parser to the byte + /// immediately following the last contiguous decimal digit. + /// + /// If no decimal digit could be found or if there was a problem parsing + /// the complete set of digits into a u32, then an error is returned. 
+ fn parse_decimal(&self) -> Result { + let mut scratch = String::new(); + while !self.is_done() && self.char().is_whitespace() { + self.bump(); + } + let start = self.pos(); + while !self.is_done() && '0' <= self.char() && self.char() <= '9' { + scratch.push(self.char()); + self.bump_and_bump_space(); + } + while !self.is_done() && self.char().is_whitespace() { + self.bump_and_bump_space(); + } + let digits = scratch.as_str(); + if digits.is_empty() { + return Err(Error::new(ERR_DECIMAL_NO_DIGITS)); + } + match u32::from_str_radix(digits, 10).ok() { + Some(n) => Ok(n), + None => Err(Error::new(ERR_DECIMAL_INVALID)), + } + } + + /// Parses an uncounted repetition operator. An uncounted repetition + /// operator includes `?`, `*` and `+`, but does not include the `{m,n}` + /// syntax. The current character should be one of `?`, `*` or `+`. Any + /// other character will result in a panic. + /// + /// This assumes that the parser is currently positioned at the repetition + /// operator and advances the parser to the first character after the + /// operator. (Note that the operator may include a single additional `?`, + /// which makes the operator ungreedy.) + /// + /// The caller should include the concatenation that is being built. The + /// concatenation returned includes the repetition operator applied to the + /// last expression in the given concatenation. + /// + /// If the concatenation is empty, then this returns an error. + fn parse_uncounted_repetition( + &self, + mut concat: Vec, + ) -> Result, Error> { + let sub = match concat.pop() { + Some(hir) => Box::new(hir), + None => { + return Err(Error::new(ERR_UNCOUNTED_REP_SUB_MISSING)); + } + }; + let (min, max) = match self.char() { + '?' => (0, Some(1)), + '*' => (0, None), + '+' => (1, None), + unk => unreachable!("unrecognized repetition operator '{}'", unk), + }; + let mut greedy = true; + if self.bump() && self.char() == '?' 
{ + greedy = false; + self.bump(); + } + if self.flags().swap_greed { + greedy = !greedy; + } + concat.push(Hir::repetition(hir::Repetition { + min, + max, + greedy, + sub, + })); + Ok(concat) + } + + /// Parses a counted repetition operation. A counted repetition operator + /// corresponds to the `{m,n}` syntax, and does not include the `?`, `*` or + /// `+` operators. + /// + /// This assumes that the parser is currently at the opening `{` and + /// advances the parser to the first character after the operator. (Note + /// that the operator may include a single additional `?`, which makes the + /// operator ungreedy.) + /// + /// The caller should include the concatenation that is being built. The + /// concatenation returned includes the repetition operator applied to the + /// last expression in the given concatenation. + /// + /// If the concatenation is empty, then this returns an error. + fn parse_counted_repetition( + &self, + mut concat: Vec, + ) -> Result, Error> { + assert_eq!(self.char(), '{', "expected opening brace"); + let sub = match concat.pop() { + Some(hir) => Box::new(hir), + None => { + return Err(Error::new(ERR_COUNTED_REP_SUB_MISSING)); + } + }; + if !self.bump_and_bump_space() { + return Err(Error::new(ERR_COUNTED_REP_UNCLOSED)); + } + let min = self.parse_decimal()?; + let mut max = Some(min); + if self.is_done() { + return Err(Error::new(ERR_COUNTED_REP_MIN_UNCLOSED)); + } + if self.char() == ',' { + if !self.bump_and_bump_space() { + return Err(Error::new(ERR_COUNTED_REP_COMMA_UNCLOSED)); + } + if self.char() != '}' { + max = Some(self.parse_decimal()?); + } else { + max = None; + } + if self.is_done() { + return Err(Error::new(ERR_COUNTED_REP_MIN_MAX_UNCLOSED)); + } + } + if self.char() != '}' { + return Err(Error::new(ERR_COUNTED_REP_INVALID)); + } + + let mut greedy = true; + if self.bump_and_bump_space() && self.char() == '?' 
{ + greedy = false; + self.bump(); + } + if self.flags().swap_greed { + greedy = !greedy; + } + + if max.map_or(false, |max| min > max) { + return Err(Error::new(ERR_COUNTED_REP_INVALID_RANGE)); + } + concat.push(Hir::repetition(hir::Repetition { + min, + max, + greedy, + sub, + })); + Ok(concat) + } + + /// Parses the part of a pattern that starts with a `(`. This is usually + /// a group sub-expression, but might just be a directive that enables + /// (or disables) certain flags. + /// + /// This assumes the parser is pointing at the opening `(`. + fn parse_group(&self) -> Result, Error> { + assert_eq!(self.char(), '('); + self.bump_and_bump_space(); + if self.is_lookaround_prefix() { + return Err(Error::new(ERR_LOOK_UNSUPPORTED)); + } + if self.bump_if("?P<") || self.bump_if("?<") { + let index = self.next_capture_index()?; + let name = Some(Box::from(self.parse_capture_name(index)?)); + let sub = Box::new(self.parse()?); + let cap = hir::Capture { index, name, sub }; + Ok(Some(Hir::capture(cap))) + } else if self.bump_if("?") { + if self.is_done() { + return Err(Error::new(ERR_UNCLOSED_GROUP_QUESTION)); + } + let start = self.pos(); + // The flags get reset in the top-level 'parse' routine. + *self.flags.borrow_mut() = self.parse_flags()?; + let consumed = self.pos() - start; + if self.char() == ')' { + // We don't allow empty flags, e.g., `(?)`. + if consumed == 0 { + return Err(Error::new(ERR_EMPTY_FLAGS)); + } + Ok(None) + } else { + assert_eq!(':', self.char()); + self.bump(); + self.parse().map(Some) + } + } else { + let index = self.next_capture_index()?; + let sub = Box::new(self.parse()?); + let cap = hir::Capture { index, name: None, sub }; + Ok(Some(Hir::capture(cap))) + } + } + + /// Parses a capture group name. Assumes that the parser is positioned at + /// the first character in the name following the opening `<` (and may + /// possibly be EOF). This advances the parser to the first character + /// following the closing `>`. 
+ /// + /// The caller must provide the capture index of the group for this name. + fn parse_capture_name(&self, capture_index: u32) -> Result<&str, Error> { + if self.is_done() { + return Err(Error::new(ERR_MISSING_GROUP_NAME)); + } + let start = self.pos(); + loop { + if self.char() == '>' { + break; + } + if !is_capture_char(self.char(), self.pos() == start) { + return Err(Error::new(ERR_INVALID_GROUP_NAME)); + } + if !self.bump() { + break; + } + } + let end = self.pos(); + if self.is_done() { + return Err(Error::new(ERR_UNCLOSED_GROUP_NAME)); + } + assert_eq!(self.char(), '>'); + self.bump(); + let name = &self.pattern()[start..end]; + if name.is_empty() { + return Err(Error::new(ERR_EMPTY_GROUP_NAME)); + } + self.add_capture_name(name)?; + Ok(name) + } + + /// Parse a sequence of flags starting at the current character. + /// + /// This advances the parser to the character immediately following the + /// flags, which is guaranteed to be either `:` or `)`. + /// + /// # Errors + /// + /// If any flags are duplicated, then an error is returned. + /// + /// If the negation operator is used more than once, then an error is + /// returned. + /// + /// If no flags could be found or if the negation operation is not followed + /// by any flags, then an error is returned. + fn parse_flags(&self) -> Result { + let mut flags = *self.flags.borrow(); + let mut negate = false; + // Keeps track of whether the previous flag item was a '-'. We use this + // to detect whether there is a dangling '-', which is invalid. + let mut last_was_negation = false; + // A set to keep track of the flags we've seen. Since all flags are + // ASCII, we only need 128 bytes. 
+ let mut seen = [false; 128]; + while self.char() != ':' && self.char() != ')' { + if self.char() == '-' { + last_was_negation = true; + if negate { + return Err(Error::new(ERR_FLAG_REPEATED_NEGATION)); + } + negate = true; + } else { + last_was_negation = false; + self.parse_flag(&mut flags, negate)?; + // OK because every valid flag is ASCII, and we're only here if + // the flag is valid. + let flag_byte = u8::try_from(self.char()).unwrap(); + if seen[usize::from(flag_byte)] { + return Err(Error::new(ERR_FLAG_DUPLICATE)); + } + seen[usize::from(flag_byte)] = true; + } + if !self.bump() { + return Err(Error::new(ERR_FLAG_UNEXPECTED_EOF)); + } + } + if last_was_negation { + return Err(Error::new(ERR_FLAG_DANGLING_NEGATION)); + } + Ok(flags) + } + + /// Parse the current character as a flag. Do not advance the parser. + /// + /// This sets the appropriate boolean value in place on the set of flags + /// given. The boolean is inverted when `negate` is true. + /// + /// # Errors + /// + /// If the flag is not recognized, then an error is returned. + fn parse_flag( + &self, + flags: &mut Flags, + negate: bool, + ) -> Result<(), Error> { + let enabled = !negate; + match self.char() { + 'i' => flags.case_insensitive = enabled, + 'm' => flags.multi_line = enabled, + 's' => flags.dot_matches_new_line = enabled, + 'U' => flags.swap_greed = enabled, + 'R' => flags.crlf = enabled, + 'x' => flags.ignore_whitespace = enabled, + // We make a special exception for this flag where we let it + // through as a recognized flag, but treat it as a no-op. This in + // practice retains some compatibility with the regex crate. It is + // a little suspect to do this, but for example, '(?-u:\b).+' in + // the regex crate is equivalent to '\b.+' in regex-lite. + 'u' => {} + _ => return Err(Error::new(ERR_FLAG_UNRECOGNIZED)), + } + Ok(()) + } + + /// Parse a standard character class consisting primarily of characters or + /// character ranges. 
+ /// + /// This assumes the parser is positioned at the opening `[`. If parsing + /// is successful, then the parser is advanced to the position immediately + /// following the closing `]`. + fn parse_class(&self) -> Result { + assert_eq!(self.char(), '['); + + let mut union = vec![]; + if !self.bump_and_bump_space() { + return Err(Error::new(ERR_CLASS_UNCLOSED)); + } + // Determine whether the class is negated or not. + let negate = if self.char() != '^' { + false + } else { + if !self.bump_and_bump_space() { + return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_NEGATION)); + } + true + }; + // Accept any number of `-` as literal `-`. + while self.char() == '-' { + union.push(hir::ClassRange { start: '-', end: '-' }); + if !self.bump_and_bump_space() { + return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH)); + } + } + // If `]` is the *first* char in a set, then interpret it as a literal + // `]`. That is, an empty class is impossible to write. + if union.is_empty() && self.char() == ']' { + union.push(hir::ClassRange { start: ']', end: ']' }); + if !self.bump_and_bump_space() { + return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_CLOSING)); + } + } + loop { + self.bump_space(); + if self.is_done() { + return Err(Error::new(ERR_CLASS_UNCLOSED)); + } + match self.char() { + '[' => { + // Attempt to treat this as the beginning of a POSIX class. + // If POSIX class parsing fails, then the parser backs up + // to `[`. + if let Some(class) = self.maybe_parse_posix_class() { + union.extend_from_slice(&class.ranges); + continue; + } + // ... otherwise we don't support nested classes. + return Err(Error::new(ERR_CLASS_NEST_UNSUPPORTED)); + } + ']' => { + self.bump(); + let mut class = hir::Class::new(union); + // Note that we must apply case folding before negation! + // Consider `(?i)[^x]`. If we applied negation first, then + // the result would be the character class that matched any + // Unicode scalar value. 
+ if self.flags().case_insensitive { + class.ascii_case_fold(); + } + if negate { + class.negate(); + } + return Ok(Hir::class(class)); + } + '&' if self.peek() == Some('&') => { + return Err(Error::new( + ERR_CLASS_INTERSECTION_UNSUPPORTED, + )); + } + '-' if self.peek() == Some('-') => { + return Err(Error::new(ERR_CLASS_DIFFERENCE_UNSUPPORTED)); + } + '~' if self.peek() == Some('~') => { + return Err(Error::new( + ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, + )); + } + _ => self.parse_class_range(&mut union)?, + } + } + } + + /// Parse a single primitive item in a character class set. The item to + /// be parsed can either be one of a simple literal character, a range + /// between two simple literal characters or a "primitive" character + /// class like `\w`. + /// + /// If an invalid escape is found, or if a character class is found where + /// a simple literal is expected (e.g., in a range), then an error is + /// returned. + /// + /// Otherwise, the range (or ranges) are appended to the given union of + /// ranges. + fn parse_class_range( + &self, + union: &mut Vec, + ) -> Result<(), Error> { + let prim1 = self.parse_class_item()?; + self.bump_space(); + if self.is_done() { + return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_ITEM)); + } + // If the next char isn't a `-`, then we don't have a range. + // There are two exceptions. If the char after a `-` is a `]`, then + // `-` is interpreted as a literal `-`. Alternatively, if the char + // after a `-` is a `-`, then `--` corresponds to a "difference" + // operation. (Which we don't support in regex-lite, but error about + // specifically in an effort to be loud about differences between the + // main regex crate where possible.) + if self.char() != '-' + || self.peek_space() == Some(']') + || self.peek_space() == Some('-') + { + union.extend_from_slice(&into_class_item_ranges(prim1)?); + return Ok(()); + } + // OK, now we're parsing a range, so bump past the `-` and parse the + // second half of the range. 
+ if !self.bump_and_bump_space() { + return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH)); + } + let prim2 = self.parse_class_item()?; + let range = hir::ClassRange { + start: into_class_item_range(prim1)?, + end: into_class_item_range(prim2)?, + }; + if range.start > range.end { + return Err(Error::new(ERR_CLASS_INVALID_RANGE)); + } + union.push(range); + Ok(()) + } + + /// Parse a single item in a character class as a primitive, where the + /// primitive either consists of a verbatim literal or a single escape + /// sequence. + /// + /// This assumes the parser is positioned at the beginning of a primitive, + /// and advances the parser to the first position after the primitive if + /// successful. + /// + /// Note that it is the caller's responsibility to report an error if an + /// illegal primitive was parsed. + fn parse_class_item(&self) -> Result { + let ch = self.char(); + self.bump(); + if ch == '\\' { + self.parse_escape() + } else { + Ok(Hir::char(ch)) + } + } + + /// Attempt to parse a POSIX character class, e.g., `[:alnum:]`. + /// + /// This assumes the parser is positioned at the opening `[`. + /// + /// If no valid POSIX character class could be found, then this does not + /// advance the parser and `None` is returned. Otherwise, the parser is + /// advanced to the first byte following the closing `]` and the + /// corresponding POSIX class is returned. + fn maybe_parse_posix_class(&self) -> Option { + // POSIX character classes are interesting from a parsing perspective + // because parsing cannot fail with any interesting error. For example, + // in order to use an POSIX character class, it must be enclosed in + // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think + // of it as "POSIX character classes have the syntax `[:NAME:]` which + // can only appear within character brackets." This means that things + // like `[[:lower:]A]` are legal constructs. 
+ // + // However, if one types an incorrect POSIX character class, e.g., + // `[[:loower:]]`, then we treat that as if it were normal nested + // character class containing the characters `:elorw`. (Which isn't + // supported and results in an error in regex-lite.) One might argue + // that we should return an error instead since the repeated colons + // give away the intent to write an POSIX class. But what if the user + // typed `[[:lower]]` instead? How can we tell that was intended to be + // a POSXI class and not just a normal nested class? + // + // Reasonable people can probably disagree over this, but for better + // or worse, we implement semantics that never fails at the expense of + // better failure modes. + assert_eq!(self.char(), '['); + + // If parsing fails, then we back up the parser to this starting point. + let start_pos = self.pos(); + let start_char = self.char.get(); + let reset = || { + self.pos.set(start_pos); + self.char.set(start_char); + }; + + let mut negated = false; + if !self.bump() || self.char() != ':' { + reset(); + return None; + } + if !self.bump() { + reset(); + return None; + } + if self.char() == '^' { + negated = true; + if !self.bump() { + reset(); + return None; + } + } + let name_start = self.pos(); + while self.char() != ':' && self.bump() {} + if self.is_done() { + reset(); + return None; + } + let name = &self.pattern()[name_start..self.pos()]; + if !self.bump_if(":]") { + reset(); + return None; + } + if let Ok(ranges) = posix_class(name) { + let mut class = hir::Class::new(ranges); + if negated { + class.negate(); + } + return Some(class); + } + reset(); + None + } + + /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the + /// parser is currently at a valid character class name and will be + /// advanced to the character immediately following the class. 
+ fn parse_perl_class(&self) -> Hir { + let ch = self.char(); + self.bump(); + let mut class = hir::Class::new(match ch { + 'd' | 'D' => posix_class("digit").unwrap(), + 's' | 'S' => posix_class("space").unwrap(), + 'w' | 'W' => posix_class("word").unwrap(), + unk => unreachable!("invalid Perl class \\{}", unk), + }); + if ch.is_ascii_uppercase() { + class.negate(); + } + Hir::class(class) + } + + fn hir_dot(&self) -> Hir { + if self.flags().dot_matches_new_line { + Hir::class(hir::Class::new([hir::ClassRange { + start: '\x00', + end: '\u{10FFFF}', + }])) + } else if self.flags().crlf { + Hir::class(hir::Class::new([ + hir::ClassRange { start: '\x00', end: '\x09' }, + hir::ClassRange { start: '\x0B', end: '\x0C' }, + hir::ClassRange { start: '\x0E', end: '\u{10FFFF}' }, + ])) + } else { + Hir::class(hir::Class::new([ + hir::ClassRange { start: '\x00', end: '\x09' }, + hir::ClassRange { start: '\x0B', end: '\u{10FFFF}' }, + ])) + } + } + + fn hir_anchor_start(&self) -> Hir { + let look = if self.flags().multi_line { + if self.flags().crlf { + hir::Look::StartCRLF + } else { + hir::Look::StartLF + } + } else { + hir::Look::Start + }; + Hir::look(look) + } + + fn hir_anchor_end(&self) -> Hir { + let look = if self.flags().multi_line { + if self.flags().crlf { + hir::Look::EndCRLF + } else { + hir::Look::EndLF + } + } else { + hir::Look::End + }; + Hir::look(look) + } + + fn hir_char(&self, ch: char) -> Hir { + if self.flags().case_insensitive { + let this = hir::ClassRange { start: ch, end: ch }; + if let Some(folded) = this.ascii_case_fold() { + return Hir::class(hir::Class::new([this, folded])); + } + } + Hir::char(ch) + } +} + +/// Converts the given Hir to a literal char if the Hir is just a single +/// character. Otherwise this returns an error. +/// +/// This is useful in contexts where you can only accept a single character, +/// but where it is convenient to parse something more general. For example, +/// parsing a single part of a character class range. 
It's useful to reuse +/// the literal parsing code, but that code can itself return entire classes +/// which can't be used as the start/end of a class range. +fn into_class_item_range(hir: Hir) -> Result { + match hir.kind { + HirKind::Char(ch) => Ok(ch), + _ => Err(Error::new(ERR_CLASS_INVALID_RANGE_ITEM)), + } +} + +fn into_class_item_ranges(hir: Hir) -> Result, Error> { + match hir.kind { + HirKind::Char(ch) => Ok(vec![hir::ClassRange { start: ch, end: ch }]), + HirKind::Class(hir::Class { ranges }) => Ok(ranges), + _ => Err(Error::new(ERR_CLASS_INVALID_ITEM)), + } +} + +/// Returns an iterator of character class ranges for the given named POSIX +/// character class. If no such character class exists for the name given, then +/// an error is returned. +fn posix_class( + kind: &str, +) -> Result, Error> { + let slice: &'static [(u8, u8)] = match kind { + "alnum" => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')], + "alpha" => &[(b'A', b'Z'), (b'a', b'z')], + "ascii" => &[(b'\x00', b'\x7F')], + "blank" => &[(b'\t', b'\t'), (b' ', b' ')], + "cntrl" => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')], + "digit" => &[(b'0', b'9')], + "graph" => &[(b'!', b'~')], + "lower" => &[(b'a', b'z')], + "print" => &[(b' ', b'~')], + "punct" => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')], + "space" => &[ + (b'\t', b'\t'), + (b'\n', b'\n'), + (b'\x0B', b'\x0B'), + (b'\x0C', b'\x0C'), + (b'\r', b'\r'), + (b' ', b' '), + ], + "upper" => &[(b'A', b'Z')], + "word" => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')], + "xdigit" => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')], + _ => return Err(Error::new(ERR_POSIX_CLASS_UNRECOGNIZED)), + }; + Ok(slice.iter().map(|&(start, end)| hir::ClassRange { + start: char::from(start), + end: char::from(end), + })) +} + +/// Returns true if the given character is a hexadecimal digit. 
/// Reports whether `c` is an ASCII hexadecimal digit, i.e., one of
/// `0-9`, `a-f` or `A-F`.
fn is_hex(c: char) -> bool {
    c.is_ascii_hexdigit()
}

/// Reports whether `c` may appear in a capture group name.
///
/// When `first` is true, `c` is judged as the leading character of the
/// name, which must be an underscore or alphabetic. In any later position,
/// an underscore, `.`, `[`, `]` or any alphanumeric character is accepted.
fn is_capture_char(c: char, first: bool) -> bool {
    match c {
        '_' => true,
        // Punctuation that is only permitted after the first character.
        '.' | '[' | ']' => !first,
        _ if first => c.is_alphabetic(),
        _ => c.is_alphanumeric(),
    }
}
Hir::char('[')); + assert_eq!(p(r"\]"), Hir::char(']')); + } + + #[test] + fn ok_special_escapes() { + assert_eq!(p(r"\a"), Hir::char('\x07')); + assert_eq!(p(r"\f"), Hir::char('\x0C')); + assert_eq!(p(r"\t"), Hir::char('\t')); + assert_eq!(p(r"\n"), Hir::char('\n')); + assert_eq!(p(r"\r"), Hir::char('\r')); + assert_eq!(p(r"\v"), Hir::char('\x0B')); + assert_eq!(p(r"\A"), Hir::look(hir::Look::Start)); + assert_eq!(p(r"\z"), Hir::look(hir::Look::End)); + assert_eq!(p(r"\b"), Hir::look(hir::Look::Word)); + assert_eq!(p(r"\B"), Hir::look(hir::Look::WordNegate)); + } + + #[test] + fn ok_hex() { + // fixed length + assert_eq!(p(r"\x41"), Hir::char('A')); + assert_eq!(p(r"\u2603"), Hir::char('☃')); + assert_eq!(p(r"\U0001F4A9"), Hir::char('💩')); + // braces + assert_eq!(p(r"\x{1F4A9}"), Hir::char('💩')); + assert_eq!(p(r"\u{1F4A9}"), Hir::char('💩')); + assert_eq!(p(r"\U{1F4A9}"), Hir::char('💩')); + } + + #[test] + fn ok_perl() { + assert_eq!(p(r"\d"), posix("digit")); + assert_eq!(p(r"\s"), posix("space")); + assert_eq!(p(r"\w"), posix("word")); + + let negated = |name| { + let mut class = hir::Class::new(posix_class(name).unwrap()); + class.negate(); + Hir::class(class) + }; + assert_eq!(p(r"\D"), negated("digit")); + assert_eq!(p(r"\S"), negated("space")); + assert_eq!(p(r"\W"), negated("word")); + } + + #[test] + fn ok_flags_and_primitives() { + assert_eq!(p(r"a"), Hir::char('a')); + assert_eq!(p(r"(?i:a)"), singles(['A', 'a'])); + + assert_eq!(p(r"^"), Hir::look(hir::Look::Start)); + assert_eq!(p(r"(?m:^)"), Hir::look(hir::Look::StartLF)); + assert_eq!(p(r"(?mR:^)"), Hir::look(hir::Look::StartCRLF)); + + assert_eq!(p(r"$"), Hir::look(hir::Look::End)); + assert_eq!(p(r"(?m:$)"), Hir::look(hir::Look::EndLF)); + assert_eq!(p(r"(?mR:$)"), Hir::look(hir::Look::EndCRLF)); + + assert_eq!(p(r"."), class([('\x00', '\x09'), ('\x0B', '\u{10FFFF}')])); + assert_eq!( + p(r"(?R:.)"), + class([ + ('\x00', '\x09'), + ('\x0B', '\x0C'), + ('\x0E', '\u{10FFFF}'), + ]) + ); + 
assert_eq!(p(r"(?s:.)"), class([('\x00', '\u{10FFFF}')])); + assert_eq!(p(r"(?sR:.)"), class([('\x00', '\u{10FFFF}')])); + } + + #[test] + fn ok_alternate() { + assert_eq!( + p(r"a|b"), + Hir::alternation(vec![Hir::char('a'), Hir::char('b')]) + ); + assert_eq!( + p(r"(?:a|b)"), + Hir::alternation(vec![Hir::char('a'), Hir::char('b')]) + ); + + assert_eq!( + p(r"(a|b)"), + cap(1, Hir::alternation(vec![Hir::char('a'), Hir::char('b')])) + ); + assert_eq!( + p(r"(?a|b)"), + named_cap( + 1, + "foo", + Hir::alternation(vec![Hir::char('a'), Hir::char('b')]) + ) + ); + + assert_eq!( + p(r"a|b|c"), + Hir::alternation(vec![ + Hir::char('a'), + Hir::char('b'), + Hir::char('c') + ]) + ); + + assert_eq!( + p(r"ax|by|cz"), + Hir::alternation(vec![ + Hir::concat(vec![Hir::char('a'), Hir::char('x')]), + Hir::concat(vec![Hir::char('b'), Hir::char('y')]), + Hir::concat(vec![Hir::char('c'), Hir::char('z')]), + ]) + ); + assert_eq!( + p(r"(ax|(by|(cz)))"), + cap( + 1, + Hir::alternation(vec![ + Hir::concat(vec![Hir::char('a'), Hir::char('x')]), + cap( + 2, + Hir::alternation(vec![ + Hir::concat(vec![Hir::char('b'), Hir::char('y')]), + cap( + 3, + Hir::concat(vec![ + Hir::char('c'), + Hir::char('z') + ]) + ), + ]) + ), + ]) + ) + ); + + assert_eq!( + p(r"|"), + Hir::alternation(vec![Hir::empty(), Hir::empty()]) + ); + assert_eq!( + p(r"||"), + Hir::alternation(vec![Hir::empty(), Hir::empty(), Hir::empty()]) + ); + + assert_eq!( + p(r"a|"), + Hir::alternation(vec![Hir::char('a'), Hir::empty()]) + ); + assert_eq!( + p(r"|a"), + Hir::alternation(vec![Hir::empty(), Hir::char('a')]) + ); + + assert_eq!( + p(r"(|)"), + cap(1, Hir::alternation(vec![Hir::empty(), Hir::empty()])) + ); + assert_eq!( + p(r"(a|)"), + cap(1, Hir::alternation(vec![Hir::char('a'), Hir::empty()])) + ); + assert_eq!( + p(r"(|a)"), + cap(1, Hir::alternation(vec![Hir::empty(), Hir::char('a')])) + ); + } + + #[test] + fn ok_flag_group() { + assert_eq!( + p("a(?i:b)"), + Hir::concat(vec![Hir::char('a'), singles(['B', 
'b'])]) + ); + } + + #[test] + fn ok_flag_directive() { + assert_eq!(p("(?i)a"), singles(['A', 'a'])); + assert_eq!(p("a(?i)"), Hir::char('a')); + assert_eq!( + p("a(?i)b"), + Hir::concat(vec![Hir::char('a'), singles(['B', 'b'])]) + ); + assert_eq!( + p("a(?i)a(?-i)a"), + Hir::concat(vec![ + Hir::char('a'), + singles(['A', 'a']), + Hir::char('a'), + ]) + ); + assert_eq!( + p("a(?:(?i)a)a"), + Hir::concat(vec![ + Hir::char('a'), + singles(['A', 'a']), + Hir::char('a'), + ]) + ); + assert_eq!( + p("a((?i)a)a"), + Hir::concat(vec![ + Hir::char('a'), + cap(1, singles(['A', 'a'])), + Hir::char('a'), + ]) + ); + } + + #[test] + fn ok_uncounted_repetition() { + assert_eq!( + p(r"a?"), + Hir::repetition(hir::Repetition { + min: 0, + max: Some(1), + greedy: true, + sub: Box::new(Hir::char('a')), + }), + ); + assert_eq!( + p(r"a*"), + Hir::repetition(hir::Repetition { + min: 0, + max: None, + greedy: true, + sub: Box::new(Hir::char('a')), + }), + ); + assert_eq!( + p(r"a+"), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: true, + sub: Box::new(Hir::char('a')), + }), + ); + + assert_eq!( + p(r"a??"), + Hir::repetition(hir::Repetition { + min: 0, + max: Some(1), + greedy: false, + sub: Box::new(Hir::char('a')), + }), + ); + assert_eq!( + p(r"a*?"), + Hir::repetition(hir::Repetition { + min: 0, + max: None, + greedy: false, + sub: Box::new(Hir::char('a')), + }), + ); + assert_eq!( + p(r"a+?"), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: false, + sub: Box::new(Hir::char('a')), + }), + ); + + assert_eq!( + p(r"a?b"), + Hir::concat(vec![ + Hir::repetition(hir::Repetition { + min: 0, + max: Some(1), + greedy: true, + sub: Box::new(Hir::char('a')), + }), + Hir::char('b'), + ]), + ); + + assert_eq!( + p(r"ab?"), + Hir::concat(vec![ + Hir::char('a'), + Hir::repetition(hir::Repetition { + min: 0, + max: Some(1), + greedy: true, + sub: Box::new(Hir::char('b')), + }), + ]), + ); + + assert_eq!( + p(r"(?:ab)?"), + 
// Checks that both named-group spellings, `(?P<name>...)` and
// `(?<name>...)`, produce a capture carrying the expected name, including
// names that use `_`, `.`, `[`, `]` and non-ASCII alphanumerics.
#[test]
fn ok_group_named() {
    assert_eq!(p("(?P<foo>a)"), named_cap(1, "foo", Hir::char('a')));
    assert_eq!(p("(?<foo>a)"), named_cap(1, "foo", Hir::char('a')));

    assert_eq!(
        p("(?P<foo>ab)"),
        named_cap(
            1,
            "foo",
            Hir::concat(vec![Hir::char('a'), Hir::char('b')])
        )
    );
    assert_eq!(
        p("(?<foo>ab)"),
        named_cap(
            1,
            "foo",
            Hir::concat(vec![Hir::char('a'), Hir::char('b')])
        )
    );

    assert_eq!(p(r"(?<a>z)"), named_cap(1, "a", Hir::char('z')));
    assert_eq!(p(r"(?P<a>z)"), named_cap(1, "a", Hir::char('z')));

    assert_eq!(p(r"(?<a_1>z)"), named_cap(1, "a_1", Hir::char('z')));
    assert_eq!(p(r"(?P<a_1>z)"), named_cap(1, "a_1", Hir::char('z')));

    assert_eq!(p(r"(?<a.1>z)"), named_cap(1, "a.1", Hir::char('z')));
    assert_eq!(p(r"(?P<a.1>z)"), named_cap(1, "a.1", Hir::char('z')));

    assert_eq!(p(r"(?<a[1]>z)"), named_cap(1, "a[1]", Hir::char('z')));
    assert_eq!(p(r"(?P<a[1]>z)"), named_cap(1, "a[1]", Hir::char('z')));

    assert_eq!(p(r"(?<a¾>z)"), named_cap(1, "a¾", Hir::char('z')));
    assert_eq!(p(r"(?P<a¾>z)"), named_cap(1, "a¾", Hir::char('z')));

    assert_eq!(p(r"(?<名字>z)"), named_cap(1, "名字", Hir::char('z')));
    assert_eq!(p(r"(?P<名字>z)"), named_cap(1, "名字", Hir::char('z')));
}
singles(['[', ']'])); + assert_eq!(p(r"[\[]"), singles(['['])); + + assert_eq!(p(r"(?i)[a]"), singles(['A', 'a'])); + assert_eq!(p(r"(?i)[A]"), singles(['A', 'a'])); + assert_eq!(p(r"(?i)[k]"), singles(['K', 'k'])); + assert_eq!(p(r"(?i)[s]"), singles(['S', 's'])); + assert_eq!(p(r"(?i)[β]"), singles(['β'])); + + assert_eq!(p(r"[^^]"), class([('\x00', ']'), ('_', '\u{10FFFF}')])); + assert_eq!( + p(r"[^-a]"), + class([('\x00', ','), ('.', '`'), ('b', '\u{10FFFF}')]) + ); + + assert_eq!( + p(r"[-]a]"), + Hir::concat(vec![singles(['-']), Hir::char('a'), Hir::char(']')]) + ); + } + + #[test] + fn ok_verbatim() { + assert_eq!( + p(r"(?x)a{5,9} ?"), + Hir::repetition(hir::Repetition { + min: 5, + max: Some(9), + greedy: false, + sub: Box::new(Hir::char('a')), + }) + ); + assert_eq!(p(r"(?x)[ a]"), singles(['a'])); + assert_eq!( + p(r"(?x)[ ^ a]"), + class([('\x00', '`'), ('b', '\u{10FFFF}')]) + ); + assert_eq!(p(r"(?x)[ - a]"), singles(['a', '-'])); + assert_eq!(p(r"(?x)[ ] a]"), singles([']', 'a'])); + + assert_eq!( + p(r"(?x)a b"), + Hir::concat(vec![Hir::char('a'), Hir::char('b')]) + ); + assert_eq!( + p(r"(?x)a b(?-x)a b"), + Hir::concat(vec![ + Hir::char('a'), + Hir::char('b'), + Hir::char('a'), + Hir::char(' '), + Hir::char('b'), + ]) + ); + assert_eq!( + p(r"a (?x:a )a "), + Hir::concat(vec![ + Hir::char('a'), + Hir::char(' '), + Hir::char('a'), + Hir::char('a'), + Hir::char(' '), + ]) + ); + assert_eq!( + p(r"(?x)( ?P a )"), + named_cap(1, "foo", Hir::char('a')), + ); + assert_eq!(p(r"(?x)( a )"), cap(1, Hir::char('a'))); + assert_eq!(p(r"(?x)( ?: a )"), Hir::char('a')); + assert_eq!(p(r"(?x)\x { 53 }"), Hir::char('\x53')); + assert_eq!(p(r"(?x)\ "), Hir::char(' ')); + } + + #[test] + fn ok_comments() { + let pat = "(?x) +# This is comment 1. +foo # This is comment 2. + # This is comment 3. 
+bar +# This is comment 4."; + assert_eq!( + p(pat), + Hir::concat(vec![ + Hir::char('f'), + Hir::char('o'), + Hir::char('o'), + Hir::char('b'), + Hir::char('a'), + Hir::char('r'), + ]) + ); + } + + #[test] + fn err_standard() { + assert_eq!( + ERR_TOO_MUCH_NESTING, + perr("(((((((((((((((((((((((((((((((((((((((((((((((((a)))))))))))))))))))))))))))))))))))))))))))))))))"), + ); + // This one is tricky, because the only way it can happen is if the + // number of captures overflows u32. Perhaps we should allow setting a + // lower limit? + // assert_eq!(ERR_TOO_MANY_CAPTURES, perr("")); + assert_eq!(ERR_DUPLICATE_CAPTURE_NAME, perr(r"(?Py)(?Pz)")); + assert_eq!(ERR_UNCLOSED_GROUP, perr("(")); + assert_eq!(ERR_UNCLOSED_GROUP_QUESTION, perr("(?")); + assert_eq!(ERR_UNOPENED_GROUP, perr(")")); + assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?=a)")); + assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?!a)")); + assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?<=a)")); + assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?z)")); + assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<1abc>z)")); + assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾>z)")); + assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾a>z)")); + assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<☃>z)")); + assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?z)")); + assert_eq!(ERR_UNCLOSED_GROUP_NAME, perr(r"(?Pz)")); + assert_eq!(ERR_EMPTY_GROUP_NAME, perr(r"(?<>z)")); + assert_eq!(ERR_FLAG_UNRECOGNIZED, perr(r"(?z:foo)")); + assert_eq!(ERR_FLAG_REPEATED_NEGATION, perr(r"(?s-i-R)")); + assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?isi)")); + assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?is-i)")); + assert_eq!(ERR_FLAG_UNEXPECTED_EOF, perr(r"(?is")); + assert_eq!(ERR_FLAG_DANGLING_NEGATION, perr(r"(?is-:foo)")); + assert_eq!(ERR_HEX_BRACE_INVALID_DIGIT, perr(r"\x{Z}")); + assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{")); + assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{A")); + assert_eq!(ERR_HEX_BRACE_EMPTY, perr(r"\x{}")); + assert_eq!(ERR_HEX_BRACE_INVALID, 
perr(r"\x{FFFFFFFFFFFFFFFFF}")); + assert_eq!(ERR_HEX_FIXED_UNEXPECTED_EOF, perr(r"\xA")); + assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZ")); + assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZA")); + assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xAZ")); + assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\uD800")); + assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\UFFFFFFFF")); + assert_eq!(ERR_HEX_UNEXPECTED_EOF, perr(r"\x")); + assert_eq!(ERR_ESCAPE_UNEXPECTED_EOF, perr(r"\")); + assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\0")); + assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\1")); + assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\8")); + assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\pL")); + assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\p{L}")); + assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\i")); + assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\<")); + assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\>")); + assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"?")); + assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"*")); + assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"+")); + assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(+)")); + assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"|?")); + assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(?i)?")); + assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"{5}")); + assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"({5})")); + assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"(?i){5}")); + assert_eq!(ERR_COUNTED_REP_UNCLOSED, perr(r"a{")); + assert_eq!(ERR_COUNTED_REP_MIN_UNCLOSED, perr(r"a{5")); + assert_eq!(ERR_COUNTED_REP_COMMA_UNCLOSED, perr(r"a{5,")); + assert_eq!(ERR_COUNTED_REP_MIN_MAX_UNCLOSED, perr(r"a{5,6")); + assert_eq!(ERR_COUNTED_REP_INVALID, perr(r"a{5,6Z")); + assert_eq!(ERR_COUNTED_REP_INVALID_RANGE, perr(r"a{6,5}")); + assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{}")); + assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{]}")); + assert_eq!(ERR_DECIMAL_INVALID, perr(r"a{999999999999999}")); + 
assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"[a")); + assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[\w-a]")); + assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[a-\w]")); + assert_eq!(ERR_CLASS_INVALID_ITEM, perr(r"[\b]")); + assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"[a-")); + assert_eq!(ERR_CLASS_UNCLOSED_AFTER_NEGATION, perr(r"[^")); + assert_eq!(ERR_CLASS_UNCLOSED_AFTER_CLOSING, perr(r"[]")); + assert_eq!(ERR_CLASS_INVALID_RANGE, perr(r"[z-a]")); + assert_eq!(ERR_CLASS_UNCLOSED, perr(r"[")); + assert_eq!(ERR_CLASS_UNCLOSED, perr(r"[a-z")); + assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[a-z[A-Z]]")); + assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[[:alnum]]")); + assert_eq!(ERR_CLASS_INTERSECTION_UNSUPPORTED, perr(r"[a&&b]")); + assert_eq!(ERR_CLASS_DIFFERENCE_UNSUPPORTED, perr(r"[a--b]")); + assert_eq!(ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, perr(r"[a~~b]")); + } + + #[test] + fn err_verbatim() { + // See: /~https://github.com/rust-lang/regex/issues/792 + assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[-#]")); + assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"(?x)[a ")); + assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[a- ")); + assert_eq!(ERR_CLASS_UNCLOSED, perr(r"(?x)[ ")); + } + + // This tests a bug fix where the nest limit checker wasn't decrementing + // its depth during post-traversal, which causes long regexes to trip + // the default limit too aggressively. + #[test] + fn regression_454_nest_too_big() { + let pattern = r#" + 2(?: + [45]\d{3}| + 7(?: + 1[0-267]| + 2[0-289]| + 3[0-29]| + 4[01]| + 5[1-3]| + 6[013]| + 7[0178]| + 91 + )| + 8(?: + 0[125]| + [139][1-6]| + 2[0157-9]| + 41| + 6[1-35]| + 7[1-5]| + 8[1-8]| + 90 + )| + 9(?: + 0[0-2]| + 1[0-4]| + 2[568]| + 3[3-6]| + 5[5-7]| + 6[0167]| + 7[15]| + 8[0146-9] + ) + )\d{4} + "#; + p(pattern); + } + + // This tests that we treat a trailing `-` in a character class as a + // literal `-` even when whitespace mode is enabled and there is whitespace + // after the trailing `-`. 
+ #[test] + fn regression_455_trailing_dash_ignore_whitespace() { + p("(?x)[ / - ]"); + p("(?x)[ a - ]"); + p("(?x)[ + a + - ] + "); + p("(?x)[ + a # wat + - ] + "); + + perr("(?x)[ / -"); + perr("(?x)[ / - "); + perr( + "(?x)[ + / - + ", + ); + perr( + "(?x)[ + / - # wat + ", + ); + } + + #[test] + fn regression_capture_indices() { + let got = p(r"(a|ab|c|bcd){4,10}(d*)"); + assert_eq!( + got, + Hir::concat(vec![ + Hir::repetition(hir::Repetition { + min: 4, + max: Some(10), + greedy: true, + sub: Box::new(cap( + 1, + Hir::alternation(vec![ + Hir::char('a'), + Hir::concat(vec![Hir::char('a'), Hir::char('b')]), + Hir::char('c'), + Hir::concat(vec![ + Hir::char('b'), + Hir::char('c'), + Hir::char('d') + ]), + ]) + )) + }), + cap( + 2, + Hir::repetition(hir::Repetition { + min: 0, + max: None, + greedy: true, + sub: Box::new(Hir::char('d')), + }) + ), + ]) + ); + } +} diff --git a/regex-lite/src/int.rs b/regex-lite/src/int.rs new file mode 100644 index 000000000..c369f0429 --- /dev/null +++ b/regex-lite/src/int.rs @@ -0,0 +1,56 @@ +use core::num::NonZeroUsize; + +/// An extension trait that adds routines to the `u32` primitive type. +pub(crate) trait U32 { + fn as_usize(self) -> usize; +} + +impl U32 for u32 { + fn as_usize(self) -> usize { + // OK because we require 32 or 64 bit targets. Therefore, every u32 + // necessarily fits into a usize. + self as usize + } +} + +/// A `usize` that can never be `usize::MAX`. +/// +/// This is similar to `core::num::NonZeroUsize`, but instead of not permitting +/// a zero value, this does not permit a max value. +/// +/// This is useful in certain contexts where one wants to optimize the memory +/// usage of things that contain match offsets. Namely, since Rust slices +/// are guaranteed to never have a length exceeding `isize::MAX`, we can use +/// `usize::MAX` as a sentinel to indicate that no match was found. Indeed, +/// types like `Option<NonMaxUsize>` have exactly the same size in memory as a +/// `usize`. 
+/// +/// This type is defined to be `repr(transparent)` for +/// `core::num::NonZeroUsize`, which is in turn defined to be +/// `repr(transparent)` for `usize`. +#[derive(Clone, Copy, Eq, Hash, PartialEq, PartialOrd, Ord)] +#[repr(transparent)] +pub(crate) struct NonMaxUsize(NonZeroUsize); + +impl NonMaxUsize { + /// Create a new `NonMaxUsize` from the given value. + /// + /// This returns `None` only when the given value is equal to `usize::MAX`. + pub(crate) fn new(value: usize) -> Option { + NonZeroUsize::new(value.wrapping_add(1)).map(NonMaxUsize) + } + + /// Return the underlying `usize` value. The returned value is guaranteed + /// to not equal `usize::MAX`. + pub(crate) fn get(self) -> usize { + self.0.get().wrapping_sub(1) + } +} + +// We provide our own Debug impl because seeing the internal repr can be quite +// surprising if you aren't expecting it. e.g., 'NonMaxUsize(5)' vs just '5'. +impl core::fmt::Debug for NonMaxUsize { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "{:?}", self.get()) + } +} diff --git a/regex-lite/src/interpolate.rs b/regex-lite/src/interpolate.rs new file mode 100644 index 000000000..a440738ab --- /dev/null +++ b/regex-lite/src/interpolate.rs @@ -0,0 +1,527 @@ +/*! +Provides routines for interpolating capture group references. + +That is, if a replacement string contains references like `$foo` or `${foo1}`, +then they are replaced with the corresponding capture values for the groups +named `foo` and `foo1`, respectively. Similarly, syntax like `$1` and `${1}` +is supported as well, with `1` corresponding to a capture group index and not +a name. + +This module provides the free functions [`string`] and [`bytes`], which +interpolate Rust Unicode strings and byte strings, respectively. + +# Format + +These routines support two different kinds of capture references: unbraced and +braced. 
+ +For the unbraced format, the format supported is `$ref` where `ref` is one or +more characters in the class `[0-9A-Za-z_]`. `ref` is always the longest +possible parse. So for example, `$1a` corresponds to the capture group named +`1a` and not the capture group at index `1`. If `ref` matches `^[0-9]+$`, then +it is treated as a capture group index itself and not a name. + +For the braced format, the format supported is `${ref}` where `ref` can be any +sequence of bytes except for `}`. If no closing brace occurs, then it is not +considered a capture reference. As with the unbraced format, if `ref` matches +`^[0-9]+$`, then it is treated as a capture group index and not a name. + +The braced format is useful for exerting precise control over the name of the +capture reference. For example, `${1}a` corresponds to the capture group +reference `1` followed by the letter `a`, whereas `$1a` (as mentioned above) +corresponds to the capture group reference `1a`. The braced format is also +useful for expressing capture group names that use characters not supported by +the unbraced format. For example, `${foo[bar].baz}` refers to the capture group +named `foo[bar].baz`. + +If a capture group reference is found and it does not refer to a valid capture +group, then it will be replaced with the empty string. + +To write a literal `$`, use `$$`. + +To be clear, and as exhibited via the type signatures in the routines in this +module, it is impossible for a replacement string to be invalid. A replacement +string may not have the intended semantics, but the interpolation procedure +itself can never fail. +*/ + +use alloc::{string::String, vec::Vec}; + +/// Accepts a replacement string and interpolates capture references with their +/// corresponding values. +/// +/// `append` should be a function that appends the string value of a capture +/// group at a particular index to the string given. If the capture group +/// index is invalid, then nothing should be appended. 
+/// +/// `name_to_index` should be a function that maps a capture group name to a +/// capture group index. If the given name doesn't exist, then `None` should +/// be returned. +/// +/// Finally, `dst` is where the final interpolated contents should be written. +/// If `replacement` contains no capture group references, then `dst` will be +/// equivalent to `replacement`. +/// +/// See the [module documentation](self) for details about the format +/// supported. +pub fn string( + mut replacement: &str, + mut append: impl FnMut(usize, &mut String), + mut name_to_index: impl FnMut(&str) -> Option, + dst: &mut String, +) { + while !replacement.is_empty() { + match replacement.find('$') { + None => break, + Some(i) => { + dst.push_str(&replacement[..i]); + replacement = &replacement[i..]; + } + } + // Handle escaping of '$'. + if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') { + dst.push_str("$"); + replacement = &replacement[2..]; + continue; + } + debug_assert!(!replacement.is_empty()); + let cap_ref = match find_cap_ref(replacement.as_bytes()) { + Some(cap_ref) => cap_ref, + None => { + dst.push_str("$"); + replacement = &replacement[1..]; + continue; + } + }; + replacement = &replacement[cap_ref.end..]; + match cap_ref.cap { + Ref::Number(i) => append(i, dst), + Ref::Named(name) => { + if let Some(i) = name_to_index(name) { + append(i, dst); + } + } + } + } + dst.push_str(replacement); +} + +/// Accepts a replacement byte string and interpolates capture references with +/// their corresponding values. +/// +/// `append` should be a function that appends the byte string value of a +/// capture group at a particular index to the byte string given. If the +/// capture group index is invalid, then nothing should be appended. +/// +/// `name_to_index` should be a function that maps a capture group name to a +/// capture group index. If the given name doesn't exist, then `None` should +/// be returned. 
+/// +/// Finally, `dst` is where the final interpolated contents should be written. +/// If `replacement` contains no capture group references, then `dst` will be +/// equivalent to `replacement`. +/// +/// See the [module documentation](self) for details about the format +/// supported. +pub fn bytes( + mut replacement: &[u8], + mut append: impl FnMut(usize, &mut Vec), + mut name_to_index: impl FnMut(&str) -> Option, + dst: &mut Vec, +) { + while !replacement.is_empty() { + match replacement.iter().position(|&b| b == b'$') { + None => break, + Some(i) => { + dst.extend_from_slice(&replacement[..i]); + replacement = &replacement[i..]; + } + } + // Handle escaping of '$'. + if replacement.get(1).map_or(false, |&b| b == b'$') { + dst.push(b'$'); + replacement = &replacement[2..]; + continue; + } + debug_assert!(!replacement.is_empty()); + let cap_ref = match find_cap_ref(replacement) { + Some(cap_ref) => cap_ref, + None => { + dst.push(b'$'); + replacement = &replacement[1..]; + continue; + } + }; + replacement = &replacement[cap_ref.end..]; + match cap_ref.cap { + Ref::Number(i) => append(i, dst), + Ref::Named(name) => { + if let Some(i) = name_to_index(name) { + append(i, dst); + } + } + } + } + dst.extend_from_slice(replacement); +} + +/// `CaptureRef` represents a reference to a capture group inside some text. +/// The reference is either a capture group name or a number. +/// +/// It is also tagged with the position in the text following the +/// capture reference. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct CaptureRef<'a> { + cap: Ref<'a>, + end: usize, +} + +/// A reference to a capture group in some text. +/// +/// e.g., `$2`, `$foo`, `${foo}`. 
+#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum Ref<'a> { + Named(&'a str), + Number(usize), +} + +impl<'a> From<&'a str> for Ref<'a> { + fn from(x: &'a str) -> Ref<'a> { + Ref::Named(x) + } +} + +impl From for Ref<'static> { + fn from(x: usize) -> Ref<'static> { + Ref::Number(x) + } +} + +/// Parses a possible reference to a capture group name in the given text, +/// starting at the beginning of `replacement`. +/// +/// If no such valid reference could be found, None is returned. +/// +/// Note that this returns a "possible" reference because this routine doesn't +/// know whether the reference is to a valid group or not. If it winds up not +/// being a valid reference, then it should be replaced with the empty string. +fn find_cap_ref(replacement: &[u8]) -> Option> { + let mut i = 0; + let rep: &[u8] = replacement; + if rep.len() <= 1 || rep[0] != b'$' { + return None; + } + i += 1; + if rep[i] == b'{' { + return find_cap_ref_braced(rep, i + 1); + } + let mut cap_end = i; + while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) { + cap_end += 1; + } + if cap_end == i { + return None; + } + // We just verified that the range 0..cap_end is valid ASCII, so it must + // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 + // check via an unchecked conversion or by parsing the number straight from + // &[u8]. + let cap = core::str::from_utf8(&rep[i..cap_end]) + .expect("valid UTF-8 capture name"); + Some(CaptureRef { + cap: match cap.parse::() { + Ok(i) => Ref::Number(i as usize), + Err(_) => Ref::Named(cap), + }, + end: cap_end, + }) +} + +/// Looks for a braced reference, e.g., `${foo1}`. This assumes that an opening +/// brace has been found at `i-1` in `rep`. This then looks for a closing +/// brace and returns the capture reference within the brace. 
+fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option> { + assert_eq!(b'{', rep[i.checked_sub(1).unwrap()]); + let start = i; + while rep.get(i).map_or(false, |&b| b != b'}') { + i += 1; + } + if !rep.get(i).map_or(false, |&b| b == b'}') { + return None; + } + // When looking at braced names, we don't put any restrictions on the name, + // so it's possible it could be invalid UTF-8. But a capture group name + // can never be invalid UTF-8, so if we have invalid UTF-8, then we can + // safely return None. + let cap = match core::str::from_utf8(&rep[start..i]) { + Err(_) => return None, + Ok(cap) => cap, + }; + Some(CaptureRef { + cap: match cap.parse::() { + Ok(i) => Ref::Number(i as usize), + Err(_) => Ref::Named(cap), + }, + end: i + 1, + }) +} + +/// Returns true if and only if the given byte is allowed in a capture name +/// written in non-brace form. +fn is_valid_cap_letter(b: u8) -> bool { + match b { + b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true, + _ => false, + } +} + +#[cfg(test)] +mod tests { + use alloc::{string::String, vec, vec::Vec}; + + use super::{find_cap_ref, CaptureRef}; + + macro_rules! find { + ($name:ident, $text:expr) => { + #[test] + fn $name() { + assert_eq!(None, find_cap_ref($text.as_bytes())); + } + }; + ($name:ident, $text:expr, $capref:expr) => { + #[test] + fn $name() { + assert_eq!(Some($capref), find_cap_ref($text.as_bytes())); + } + }; + } + + macro_rules! 
c { + ($name_or_number:expr, $pos:expr) => { + CaptureRef { cap: $name_or_number.into(), end: $pos } + }; + } + + find!(find_cap_ref1, "$foo", c!("foo", 4)); + find!(find_cap_ref2, "${foo}", c!("foo", 6)); + find!(find_cap_ref3, "$0", c!(0, 2)); + find!(find_cap_ref4, "$5", c!(5, 2)); + find!(find_cap_ref5, "$10", c!(10, 3)); + // See /~https://github.com/rust-lang/regex/pull/585 + // for more on characters following numbers + find!(find_cap_ref6, "$42a", c!("42a", 4)); + find!(find_cap_ref7, "${42}a", c!(42, 5)); + find!(find_cap_ref8, "${42"); + find!(find_cap_ref9, "${42 "); + find!(find_cap_ref10, " $0 "); + find!(find_cap_ref11, "$"); + find!(find_cap_ref12, " "); + find!(find_cap_ref13, ""); + find!(find_cap_ref14, "$1-$2", c!(1, 2)); + find!(find_cap_ref15, "$1_$2", c!("1_", 3)); + find!(find_cap_ref16, "$x-$y", c!("x", 2)); + find!(find_cap_ref17, "$x_$y", c!("x_", 3)); + find!(find_cap_ref18, "${#}", c!("#", 4)); + find!(find_cap_ref19, "${Z[}", c!("Z[", 5)); + find!(find_cap_ref20, "${¾}", c!("¾", 5)); + find!(find_cap_ref21, "${¾a}", c!("¾a", 6)); + find!(find_cap_ref22, "${a¾}", c!("a¾", 6)); + find!(find_cap_ref23, "${☃}", c!("☃", 6)); + find!(find_cap_ref24, "${a☃}", c!("a☃", 7)); + find!(find_cap_ref25, "${☃a}", c!("☃a", 7)); + find!(find_cap_ref26, "${名字}", c!("名字", 9)); + + fn interpolate_string( + mut name_to_index: Vec<(&'static str, usize)>, + caps: Vec<&'static str>, + replacement: &str, + ) -> String { + name_to_index.sort_by_key(|x| x.0); + + let mut dst = String::new(); + super::string( + replacement, + |i, dst| { + if let Some(&s) = caps.get(i) { + dst.push_str(s); + } + }, + |name| -> Option { + name_to_index + .binary_search_by_key(&name, |x| x.0) + .ok() + .map(|i| name_to_index[i].1) + }, + &mut dst, + ); + dst + } + + fn interpolate_bytes( + mut name_to_index: Vec<(&'static str, usize)>, + caps: Vec<&'static str>, + replacement: &str, + ) -> String { + name_to_index.sort_by_key(|x| x.0); + + let mut dst = vec![]; + super::bytes( + 
replacement.as_bytes(), + |i, dst| { + if let Some(&s) = caps.get(i) { + dst.extend_from_slice(s.as_bytes()); + } + }, + |name| -> Option { + name_to_index + .binary_search_by_key(&name, |x| x.0) + .ok() + .map(|i| name_to_index[i].1) + }, + &mut dst, + ); + String::from_utf8(dst).unwrap() + } + + macro_rules! interp { + ($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => { + #[test] + fn $name() { + assert_eq!( + $expected, + interpolate_string($map, $caps, $hay), + "interpolate::string failed", + ); + assert_eq!( + $expected, + interpolate_bytes($map, $caps, $hay), + "interpolate::bytes failed", + ); + } + }; + } + + interp!( + interp1, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test $foo test", + "test xxx test", + ); + + interp!( + interp2, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test$footest", + "test", + ); + + interp!( + interp3, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test${foo}test", + "testxxxtest", + ); + + interp!( + interp4, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test$2test", + "test", + ); + + interp!( + interp5, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test${2}test", + "testxxxtest", + ); + + interp!( + interp6, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test $$foo test", + "test $foo test", + ); + + interp!( + interp7, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test $foo", + "test xxx", + ); + + interp!( + interp8, + vec![("foo", 2)], + vec!["", "", "xxx"], + "$foo test", + "xxx test", + ); + + interp!( + interp9, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test $bar$foo", + "test yyyxxx", + ); + + interp!( + interp10, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test $ test", + "test $ test", + ); + + interp!( + interp11, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${} test", + "test test", + ); + + interp!( + interp12, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${ } test", + "test test", + ); + + interp!( + interp13, 
+ vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${a b} test", + "test test", + ); + + interp!( + interp14, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${a} test", + "test test", + ); + + // This is a funny case where a braced reference is never closed, but + // within the unclosed braced reference, there is an unbraced reference. + // In this case, the braced reference is just treated literally and the + // unbraced reference is found. + interp!( + interp15, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${wat $bar ok", + "test ${wat yyy ok", + ); +} diff --git a/regex-lite/src/lib.rs b/regex-lite/src/lib.rs new file mode 100644 index 000000000..7e1664dff --- /dev/null +++ b/regex-lite/src/lib.rs @@ -0,0 +1,32 @@ +/*! +TODO +*/ + +#![allow(warnings)] +#![no_std] +#![forbid(unsafe_code)] +// #![deny(missing_docs, rustdoc::broken_intra_doc_links)] +#![warn(missing_debug_implementations)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] + +#[cfg(not(any(target_pointer_width = "32", target_pointer_width = "64")))] +compile_error!("not supported on non-{32,64}, please file an issue"); + +extern crate alloc; +#[cfg(any(test, feature = "std"))] +extern crate std; + +pub use self::{ + hir::{is_escapeable_character, is_meta_character}, + string::*, +}; + +mod error; +mod hir; +mod int; +mod interpolate; +mod nfa; +mod pikevm; +mod pool; +mod string; +mod utf8; diff --git a/regex-lite/src/nfa.rs b/regex-lite/src/nfa.rs new file mode 100644 index 000000000..8f476ba59 --- /dev/null +++ b/regex-lite/src/nfa.rs @@ -0,0 +1,700 @@ +use core::{cell::RefCell, mem::size_of}; + +use alloc::{string::String, sync::Arc, vec, vec::Vec}; + +use crate::{ + error::Error, + hir::{self, Hir, HirKind}, + int::U32, +}; + +pub(crate) type StateID = u32; + +#[derive(Clone, Copy, Debug)] +pub(crate) struct Config { + pub(crate) size_limit: Option, +} + +impl Default for Config { + fn default() -> Config { + Config { size_limit: Some(10 * (1 << 
20)) } + } +} + +#[derive(Clone)] +pub(crate) struct NFA { + /// The pattern string this NFA was generated from. + /// + /// We put it here for lack of a better place to put it. ¯\_(ツ)_/¯ + pattern: String, + /// The states that make up this NFA. + states: Vec, + /// The ID of the start state. + start: StateID, + /// Whether this NFA can only match at the beginning of a haystack. + is_start_anchored: bool, + /// Whether this NFA can match the empty string. + is_match_empty: bool, + /// A map from capture group name to its corresponding index. + cap_name_to_index: CaptureNameMap, + /// A map from capture group index to the corresponding name, if one + /// exists. + cap_index_to_name: Vec>>, + /// Heap memory used indirectly by NFA states and other things (like the + /// various capturing group representations above). Since each state + /// might use a different amount of heap, we need to keep track of this + /// incrementally. + memory_extra: usize, +} + +impl NFA { + /// Creates a new NFA from the given configuration and HIR. + pub(crate) fn new( + config: Config, + pattern: String, + hir: &Hir, + ) -> Result { + Compiler::new(config, pattern).compile(hir) + } + + /// Returns the pattern string used to construct this NFA. + pub(crate) fn pattern(&self) -> &str { + &self.pattern + } + + /// Returns the state corresponding to the given ID. + /// + /// # Panics + /// + /// If the ID does not refer to a valid state, then this panics. + pub(crate) fn state(&self, id: StateID) -> &State { + &self.states[id.as_usize()] + } + + /// Returns the total number of states in this NFA. + pub(crate) fn len(&self) -> usize { + self.states.len() + } + + /// Returns the ID of the starting state for this NFA. + pub(crate) fn start(&self) -> StateID { + self.start + } + + /// Returns the capture group index for the corresponding named group. + /// If no such group with the given name exists, then `None` is returned. 
+ pub(crate) fn to_index(&self, name: &str) -> Option { + self.cap_name_to_index.get(name).cloned().map(|i| i.as_usize()) + } + + /// Returns the capture group name for the corresponding capture group + /// index. If no such group, then `None` is returned. + pub(crate) fn to_name(&self, index: usize) -> Option<&str> { + self.cap_index_to_name.get(index)?.as_deref() + } + + /// Returns an iterator over all of the capture groups, along with their + /// names if they exist, in this NFA. + pub(crate) fn capture_names(&self) -> CaptureNames<'_> { + CaptureNames { it: self.cap_index_to_name.iter() } + } + + /// Returns the total number of capture groups, including the first and + /// implicit group, in this NFA. + pub(crate) fn group_len(&self) -> usize { + self.cap_index_to_name.len() + } + + /// Returns true if and only if this NFA can only match at the beginning of + /// a haystack. + pub(crate) fn is_start_anchored(&self) -> bool { + self.is_start_anchored + } + + /// Returns true if and only if this NFA can match the empty string. + pub(crate) fn is_match_empty(&self) -> bool { + self.is_match_empty + } + + /// Returns the heap memory usage, in bytes, used by this NFA. + fn memory_usage(&self) -> usize { + (self.states.len() * size_of::()) + + (self.cap_index_to_name.len() * size_of::>>()) + + self.memory_extra + } +} + +impl core::fmt::Debug for NFA { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + writeln!(f, "NFA(")?; + writeln!(f, "pattern: {}", self.pattern)?; + for (sid, state) in self.states.iter().enumerate() { + writeln!(f, "{:07?}: {:?}", sid, state)?; + } + writeln!(f, ")")?; + Ok(()) + } +} + +/// An iterator over all capture groups in an NFA. +/// +/// If a particular group has a name, then it is yielded. Otherwise, `None` +/// is yielded. 
+#[derive(Clone, Debug)] +pub(crate) struct CaptureNames<'a> { + it: core::slice::Iter<'a, Option>>, +} + +impl<'a> Iterator for CaptureNames<'a> { + type Item = Option<&'a str>; + + fn next(&mut self) -> Option> { + self.it.next().map(|n| n.as_deref()) + } +} + +#[derive(Clone, Eq, PartialEq)] +pub(crate) enum State { + Char { target: StateID, ch: char }, + Ranges { target: StateID, ranges: Vec<(char, char)> }, + Splits { targets: Vec, reverse: bool }, + Goto { target: StateID, look: Option }, + Capture { target: StateID, slot: u32 }, + Fail, + Match, +} + +impl State { + /// Returns the heap memory usage of this NFA state in bytes. + fn memory_usage(&self) -> usize { + match *self { + State::Char { .. } + | State::Goto { .. } + | State::Capture { .. } + | State::Fail { .. } + | State::Match => 0, + State::Splits { ref targets, .. } => { + targets.len() * size_of::() + } + State::Ranges { ref ranges, .. } => { + ranges.len() * size_of::<(char, char)>() + } + } + } + + /// Returns an iterator over the given split targets. The order of the + /// iterator yields elements in reverse when `reverse` is true. 
+ pub(crate) fn iter_splits<'a>( + splits: &'a [StateID], + reverse: bool, + ) -> impl Iterator + 'a { + let mut it = splits.iter(); + core::iter::from_fn(move || { + if reverse { it.next_back() } else { it.next() }.copied() + }) + } +} + +impl core::fmt::Debug for State { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match *self { + State::Char { target, ch } => { + write!(f, "{:?} => {:?}", ch, target) + } + State::Ranges { target, ref ranges } => { + for (i, &(start, end)) in ranges.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{:?}-{:?} => {:?}", start, end, target)?; + } + Ok(()) + } + State::Splits { ref targets, reverse } => { + write!(f, "splits(")?; + for (i, sid) in + State::iter_splits(targets, reverse).enumerate() + { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{:?}", sid)?; + } + write!(f, ")") + } + State::Goto { target, look: None } => { + write!(f, "goto({:?})", target) + } + State::Goto { target, look: Some(look) } => { + write!(f, "{:?} => {:?}", look, target) + } + State::Capture { target, slot } => { + write!(f, "capture(slot={:?}) => {:?}", slot, target,) + } + State::Fail => write!(f, "FAIL"), + State::Match => { + write!(f, "MATCH") + } + } + } +} + +/// A map from capture group name to its corresponding capture group index. +/// +/// We define a type alias here so that we can transparently use a `HashMap` +/// whenever it's available. We do so presumably because it's faster, although +/// there are no benchmarks verifying this. 
+#[cfg(feature = "std")] +type CaptureNameMap = std::collections::HashMap, u32>; +#[cfg(not(feature = "std"))] +type CaptureNameMap = alloc::collections::BTreeMap, u32>; + +#[derive(Debug)] +struct Compiler { + config: Config, + nfa: RefCell, +} + +impl Compiler { + fn new(config: Config, pattern: String) -> Compiler { + let nfa = RefCell::new(NFA { + pattern, + states: vec![], + start: 0, + is_start_anchored: false, + is_match_empty: false, + cap_name_to_index: CaptureNameMap::default(), + cap_index_to_name: vec![], + memory_extra: 0, + }); + Compiler { config, nfa } + } + + fn compile(self, hir: &Hir) -> Result { + self.nfa.borrow_mut().is_start_anchored = hir.is_start_anchored(); + self.nfa.borrow_mut().is_match_empty = hir.is_match_empty(); + let compiled = self.c_capture(0, None, hir)?; + let mat = self.add(State::Match)?; + self.patch(compiled.end, mat); + self.nfa.borrow_mut().start = compiled.start; + Ok(self.nfa.into_inner()) + } + + fn c(&self, hir: &Hir) -> Result { + match *hir.kind() { + HirKind::Empty => self.c_empty(), + HirKind::Char(ch) => self.c_char(ch), + HirKind::Class(ref class) => self.c_class(class), + HirKind::Look(ref look) => self.c_look(look), + HirKind::Repetition(ref rep) => self.c_repetition(rep), + HirKind::Capture(ref cap) => { + self.c_capture(cap.index, cap.name.as_deref(), &cap.sub) + } + HirKind::Concat(ref subs) => { + self.c_concat(subs.iter().map(|s| self.c(s))) + } + HirKind::Alternation(ref subs) => { + self.c_alternation(subs.iter().map(|s| self.c(s))) + } + } + } + + /// Compile a "fail" state that can never be transitioned out of. + fn c_fail(&self) -> Result { + let id = self.add(State::Fail)?; + Ok(ThompsonRef { start: id, end: id }) + } + + /// Compile an "empty" state with one unconditional epsilon transition. + /// + /// Both the `start` and `end` locations point to the state created. + /// Callers will likely want to keep the `start`, but patch the `end` to + /// point to some other state. 
+ fn c_empty(&self) -> Result { + let id = self.add_empty()?; + Ok(ThompsonRef { start: id, end: id }) + } + + /// Compile the given literal char to an NFA. + fn c_char(&self, ch: char) -> Result { + let id = self.add(State::Char { target: 0, ch })?; + Ok(ThompsonRef { start: id, end: id }) + } + + /// Compile the given character class into an NFA. + /// + /// If the class is empty, then this compiles to a `Fail` state. + fn c_class(&self, class: &hir::Class) -> Result { + let id = if class.ranges.is_empty() { + // Technically using an explicit fail state probably isn't + // necessary. Because if you try to match against an empty Ranges, + // then it should turn up with nothing regardless of input, and + // thus "acts" like a Fail state. But it's better to be more + // explicit, and there's no real cost to doing so. + self.add(State::Fail) + } else { + let ranges = + class.ranges.iter().map(|r| (r.start, r.end)).collect(); + self.add(State::Ranges { target: 0, ranges }) + }?; + Ok(ThompsonRef { start: id, end: id }) + } + + /// Compile the given HIR look-around assertion to an NFA look-around + /// assertion. + fn c_look(&self, look: &hir::Look) -> Result { + let id = self.add(State::Goto { target: 0, look: Some(*look) })?; + Ok(ThompsonRef { start: id, end: id }) + } + + /// Compile the given repetition expression. This handles all types of + /// repetitions and greediness. + fn c_repetition( + &self, + rep: &hir::Repetition, + ) -> Result { + match (rep.min, rep.max) { + (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy), + (min, None) => self.c_at_least(&rep.sub, rep.greedy, min), + (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min), + (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max), + } + } + + /// Compile the given expression such that it matches at least `min` times, + /// but no more than `max` times. + /// + /// When `greedy` is true, then the preference is for the expression to + /// match as much as possible. 
Otherwise, it will match as little as + /// possible. + fn c_bounded( + &self, + hir: &Hir, + greedy: bool, + min: u32, + max: u32, + ) -> Result { + let prefix = self.c_exactly(hir, min)?; + if min == max { + return Ok(prefix); + } + + // It is tempting to compile the rest here as a concatenation + // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it + // were `aaa?a?a?`. The problem here is that it leads to this program: + // + // >000000: 61 => 01 + // 000001: 61 => 02 + // 000002: union(03, 04) + // 000003: 61 => 04 + // 000004: union(05, 06) + // 000005: 61 => 06 + // 000006: union(07, 08) + // 000007: 61 => 08 + // 000008: MATCH + // + // And effectively, once you hit state 2, the epsilon closure will + // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better + // to instead compile it like so: + // + // >000000: 61 => 01 + // 000001: 61 => 02 + // 000002: union(03, 08) + // 000003: 61 => 04 + // 000004: union(05, 08) + // 000005: 61 => 06 + // 000006: union(07, 08) + // 000007: 61 => 08 + // 000008: MATCH + // + // So that the epsilon closure of state 2 is now just 3 and 8. + let empty = self.add_empty()?; + let mut prev_end = prefix.end; + for _ in min..max { + let splits = + self.add(State::Splits { targets: vec![], reverse: !greedy })?; + let compiled = self.c(hir)?; + self.patch(prev_end, splits)?; + self.patch(splits, compiled.start)?; + self.patch(splits, empty)?; + prev_end = compiled.end; + } + self.patch(prev_end, empty)?; + Ok(ThompsonRef { start: prefix.start, end: empty }) + } + + /// Compile the given expression such that it may be matched `n` or more + /// times, where `n` can be any integer. (Although a particularly large + /// integer is likely to run afoul of any configured size limits.) + /// + /// When `greedy` is true, then the preference is for the expression to + /// match as much as possible. Otherwise, it will match as little as + /// possible. 
+    fn c_at_least(
+        &self,
+        hir: &Hir,
+        greedy: bool,
+        n: u32,
+    ) -> Result<ThompsonRef, Error> {
+        if n == 0 {
+            // When the expression cannot match the empty string, then we
+            // can get away with something much simpler: just one 'alt'
+            // instruction that optionally repeats itself. But if the expr
+            // can match the empty string... see below.
+            if !hir.is_match_empty() {
+                let splits = self.add(State::Splits {
+                    targets: vec![],
+                    reverse: !greedy,
+                })?;
+                let compiled = self.c(hir)?;
+                self.patch(splits, compiled.start)?;
+                self.patch(compiled.end, splits)?;
+                return Ok(ThompsonRef { start: splits, end: splits });
+            }
+
+            // What's going on here? Shouldn't x* be simpler than this? It
+            // turns out that when implementing leftmost-first (Perl-like)
+            // match semantics, x* results in an incorrect preference order
+            // when computing the transitive closure of states if and only if
+            // 'x' can match the empty string. So instead, we compile x* as
+            // (x+)?, which preserves the correct preference order.
+            //
+            // See: https://github.com/rust-lang/regex/issues/779
+            let compiled = self.c(hir)?;
+            let plus =
+                self.add(State::Splits { targets: vec![], reverse: !greedy })?;
+            self.patch(compiled.end, plus)?;
+            self.patch(plus, compiled.start)?;
+
+            let question =
+                self.add(State::Splits { targets: vec![], reverse: !greedy })?;
+            let empty = self.add_empty()?;
+            self.patch(question, compiled.start)?;
+            self.patch(question, empty)?;
+            self.patch(plus, empty)?;
+            Ok(ThompsonRef { start: question, end: empty })
+        } else if n == 1 {
+            let compiled = self.c(hir)?;
+            let splits =
+                self.add(State::Splits { targets: vec![], reverse: !greedy })?;
+            self.patch(compiled.end, splits)?;
+            self.patch(splits, compiled.start)?;
+            Ok(ThompsonRef { start: compiled.start, end: splits })
+        } else {
+            let prefix = self.c_exactly(hir, n - 1)?;
+            let last = self.c(hir)?;
+            let splits =
+                self.add(State::Splits { targets: vec![], reverse: !greedy })?;
+            self.patch(prefix.end, last.start)?;
+            self.patch(last.end, splits)?;
+            self.patch(splits, last.start)?;
+            Ok(ThompsonRef { start: prefix.start, end: splits })
+        }
+    }
+
+    /// Compile the given expression such that it may be matched zero or one
+    /// times.
+    ///
+    /// When `greedy` is true, then the preference is for the expression to
+    /// match as much as possible. Otherwise, it will match as little as
+    /// possible.
+    fn c_zero_or_one(
+        &self,
+        hir: &Hir,
+        greedy: bool,
+    ) -> Result<ThompsonRef, Error> {
+        let splits =
+            self.add(State::Splits { targets: vec![], reverse: !greedy })?;
+        let compiled = self.c(hir)?;
+        let empty = self.add_empty()?;
+        self.patch(splits, compiled.start)?;
+        self.patch(splits, empty)?;
+        self.patch(compiled.end, empty)?;
+        Ok(ThompsonRef { start: splits, end: empty })
+    }
+
+    /// Compile the given HIR expression exactly `n` times.
+    fn c_exactly(&self, hir: &Hir, n: u32) -> Result<ThompsonRef, Error> {
+        self.c_concat((0..n).map(|_| self.c(hir)))
+    }
+
+    /// Compile the given expression and insert capturing states at the
+    /// beginning and end of it. The slot for the capture states is computed
+    /// from the index.
+    fn c_capture(
+        &self,
+        index: u32,
+        name: Option<&str>,
+        hir: &Hir,
+    ) -> Result<ThompsonRef, Error> {
+        let existing_groups_len = self.nfa.borrow().cap_index_to_name.len();
+        assert!(
+            index.as_usize() <= existing_groups_len,
+            "captures compiled in wrong order"
+        );
+        if index.as_usize() == existing_groups_len {
+            if let Some(name) = name {
+                let name = Arc::from(name);
+                let mut nfa = self.nfa.borrow_mut();
+                nfa.cap_name_to_index.insert(Arc::clone(&name), index);
+                nfa.cap_index_to_name.push(Some(Arc::clone(&name)));
+                // This is an approximation.
+                nfa.memory_extra += name.len() + size_of::<u32>();
+            } else {
+                self.nfa.borrow_mut().cap_index_to_name.push(None);
+            }
+        }
+
+        let Some(slot) = index.checked_mul(2) else {
+            return Err(Error::new("capture group slots exhausted"));
+        };
+        let start = self.add(State::Capture { target: 0, slot })?;
+        let inner = self.c(hir)?;
+        let Some(slot) = slot.checked_add(1) else {
+            return Err(Error::new("capture group slots exhausted"));
+        };
+        let end = self.add(State::Capture { target: 0, slot })?;
+        self.patch(start, inner.start)?;
+        self.patch(inner.end, end)?;
+
+        Ok(ThompsonRef { start, end })
+    }
+
+    /// Compile a concatenation of the sub-expressions yielded by the given
+    /// iterator. If the iterator yields no elements, then this compiles down
+    /// to an "empty" state that always matches.
+    fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
+    where
+        I: Iterator<Item = Result<ThompsonRef, Error>>,
+    {
+        let ThompsonRef { start, mut end } = match it.next() {
+            Some(result) => result?,
+            None => return self.c_empty(),
+        };
+        for result in it {
+            let compiled = result?;
+            self.patch(end, compiled.start)?;
+            end = compiled.end;
+        }
+        Ok(ThompsonRef { start, end })
+    }
+
+    /// Compile an alternation, where each element yielded by the given
+    /// iterator represents an item in the alternation. If the iterator yields
+    /// no elements, then this compiles down to a "fail" state.
+    ///
+    /// In an alternation, expressions appearing earlier are "preferred" at
+    /// match time over expressions appearing later. (This is currently always
+    /// true, as this crate only supports leftmost-first semantics.)
+    fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
+    where
+        I: Iterator<Item = Result<ThompsonRef, Error>>,
+    {
+        let first = match it.next() {
+            None => return self.c_fail(),
+            Some(result) => result?,
+        };
+        let second = match it.next() {
+            None => return Ok(first),
+            Some(result) => result?,
+        };
+
+        let splits =
+            self.add(State::Splits { targets: vec![], reverse: false })?;
+        let end = self.add_empty()?;
+        self.patch(splits, first.start)?;
+        self.patch(first.end, end)?;
+        self.patch(splits, second.start)?;
+        self.patch(second.end, end)?;
+        for result in it {
+            let compiled = result?;
+            self.patch(splits, compiled.start)?;
+            self.patch(compiled.end, end)?;
+        }
+        Ok(ThompsonRef { start: splits, end })
+    }
+
+    /// A convenience routine for adding an empty state, also known as an
+    /// unconditional epsilon transition. These are quite useful for making
+    /// NFA construction simpler.
+    ///
+    /// (In the regex crate, we do a second pass to remove these, but don't
+    /// bother with that here.)
+    fn add_empty(&self) -> Result<StateID, Error> {
+        self.add(State::Goto { target: 0, look: None })
+    }
+
+    /// The common implementation of "add a state."
+    /// It handles the common
+    /// error cases of state ID exhaustion (by owning state ID allocation) and
+    /// whether the size limit has been exceeded.
+    fn add(&self, state: State) -> Result<StateID, Error> {
+        let id = u32::try_from(self.nfa.borrow().states.len())
+            .map_err(|_| Error::new("exhausted state IDs, too many states"))?;
+        self.nfa.borrow_mut().memory_extra += state.memory_usage();
+        self.nfa.borrow_mut().states.push(state);
+        self.check_size_limit()?;
+        Ok(id)
+    }
+
+    /// Add a transition from one state to another.
+    ///
+    /// This routine is called "patch" since it is very common to add the
+    /// states you want, typically with "dummy" state ID transitions, and then
+    /// "patch" in the real state IDs later. This is because you don't always
+    /// know all of the necessary state IDs to add because they might not
+    /// exist yet.
+    ///
+    /// # Errors
+    ///
+    /// This may error if patching leads to an increase in heap usage beyond
+    /// the configured size limit. Heap usage only grows when patching adds a
+    /// new transition (as in the case of a "splits" state).
+    fn patch(&self, from: StateID, to: StateID) -> Result<(), Error> {
+        let mut new_memory_extra = self.nfa.borrow().memory_extra;
+        match self.nfa.borrow_mut().states[from.as_usize()] {
+            State::Char { ref mut target, .. } => {
+                *target = to;
+            }
+            State::Ranges { ref mut target, .. } => {
+                *target = to;
+            }
+            State::Splits { ref mut targets, .. } => {
+                targets.push(to);
+                new_memory_extra += size_of::<StateID>();
+            }
+            State::Goto { ref mut target, .. } => {
+                *target = to;
+            }
+            State::Capture { ref mut target, .. } => {
+                *target = to;
+            }
+            State::Fail | State::Match => {}
+        }
+        if new_memory_extra != self.nfa.borrow().memory_extra {
+            self.nfa.borrow_mut().memory_extra = new_memory_extra;
+            self.check_size_limit()?;
+        }
+        Ok(())
+    }
+
+    /// Checks that the current heap memory usage of the NFA being compiled
+    /// doesn't exceed the configured size limit. If it does, an error is
+    /// returned.
+    fn check_size_limit(&self) -> Result<(), Error> {
+        if let Some(limit) = self.config.size_limit {
+            if self.nfa.borrow().memory_usage() > limit {
+                return Err(Error::new("compiled regex exceeded size limit"));
+            }
+        }
+        Ok(())
+    }
+}
+
+/// A value that represents the result of compiling a sub-expression of a
+/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
+/// has an initial state at `start` and a final state at `end`.
+#[derive(Clone, Copy, Debug)]
+struct ThompsonRef {
+    start: StateID,
+    end: StateID,
+}
diff --git a/regex-lite/src/pikevm.rs b/regex-lite/src/pikevm.rs
new file mode 100644
index 000000000..260e59c72
--- /dev/null
+++ b/regex-lite/src/pikevm.rs
@@ -0,0 +1,957 @@
+use alloc::{vec, vec::Vec};
+
+use crate::{
+    int::{NonMaxUsize, U32},
+    nfa::{State, StateID, NFA},
+    pool::CachePoolGuard,
+    utf8,
+};
+
+/// A PikeVM searcher.
+///
+/// A PikeVM uses the standard Thompson NFA linear time search algorithm, but
+/// augmented to support tracking the offsets of matching capture groups.
+#[derive(Clone, Debug)]
+pub(crate) struct PikeVM {
+    nfa: NFA,
+}
+
+impl PikeVM {
+    /// Create a new PikeVM searcher that uses the given NFA.
+    pub(crate) fn new(nfa: NFA) -> PikeVM {
+        PikeVM { nfa }
+    }
+
+    /// Return the underlying NFA used by this PikeVM.
+    pub(crate) fn nfa(&self) -> &NFA {
+        &self.nfa
+    }
+
+    /// Returns an iterator of non-overlapping matches in the given haystack.
+    pub(crate) fn find_iter<'r, 'h>(
+        &'r self,
+        cache: CachePoolGuard<'r>,
+        haystack: &'h [u8],
+    ) -> FindMatches<'r, 'h> {
+        FindMatches {
+            pikevm: self,
+            cache,
+            haystack,
+            at: 0,
+            slots: vec![None, None],
+            last_match_end: None,
+        }
+    }
+
+    /// Returns an iterator of non-overlapping capture matches in the given
+    /// haystack.
+    pub(crate) fn captures_iter<'r, 'h>(
+        &'r self,
+        cache: CachePoolGuard<'r>,
+        haystack: &'h [u8],
+    ) -> CapturesMatches<'r, 'h> {
+        // OK because the NFA wouldn't have compiled if this could overflow.
+        let len = self.nfa().group_len().checked_mul(2).unwrap();
+        CapturesMatches {
+            it: FindMatches {
+                pikevm: self,
+                cache,
+                haystack,
+                at: 0,
+                slots: vec![None; len],
+                last_match_end: None,
+            },
+        }
+    }
+
+    /// The implementation of standard leftmost search.
+    ///
+    /// Capturing group spans are written to `slots`, but only if requested.
+    /// `slots` can be any length. Any slot in the NFA that is activated but
+    /// which is out of bounds for the given `slots` is ignored.
+    pub(crate) fn search(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+        earliest: bool,
+        slots: &mut [Option<NonMaxUsize>],
+    ) -> bool {
+        cache.setup_search(slots.len());
+        if start > end {
+            return false;
+        }
+        // Why do we even care about this? Well, in our `slots` representation,
+        // we use usize::MAX as a sentinel to indicate "no match." This isn't
+        // problematic so long as our haystack doesn't have a maximal length.
+        // Byte slices are guaranteed by Rust to have a length that fits into
+        // isize, and so this assert should always pass. But we put it here to
+        // make our assumption explicit.
+        assert!(
+            haystack.len() < core::usize::MAX,
+            "byte slice lengths must be less than usize MAX",
+        );
+
+        let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
+        let start_id = self.nfa().start();
+        let anchored = self.nfa().is_start_anchored();
+        let mut matched = false;
+        // Yes, our search doesn't end at `end`, but includes it. This is
+        // necessary because matches are delayed by one byte. The delay is used
+        // to handle look-behind assertions. In the case of the PikeVM, the
+        // delay is implemented by not considering a match to exist until it
+        // is visited in `nexts`. Technically, we know a match exists in the
+        // previous iteration via `epsilon_closure`.
+        let mut at = start;
+        while at <= end {
+            // If we have no states left to visit, then there are some cases
+            // where we know we can quit early or even skip ahead.
+            if curr.set.is_empty() {
+                // We have a match so we can quit.
+                if matched {
+                    break;
+                }
+                // If we're running an anchored search and we've advanced
+                // beyond the start position with no other states to try, then
+                // we will never observe a match and thus can stop.
+                if anchored && at > start {
+                    break;
+                }
+            }
+            // Instead of using a hypothetical unanchored start state in the
+            // NFA (which doesn't exist, but we could add it), we actually
+            // always use its anchored starting state. As a result, when doing
+            // an unanchored search, we need to simulate our own '(?s:.)*?'
+            // prefix, to permit a match to appear anywhere.
+            //
+            // Now, we don't *have* to do things this way. We could create
+            // a proper unanchored start state in the NFA and do one
+            // `epsilon_closure` call from that starting state before the main
+            // loop here. And that is just as correct. However, it turns out to
+            // be slower than our approach here because it slightly increases
+            // the cost of processing each byte by requiring us to visit
+            // more NFA states to deal with the additional NFA states in the
+            // unanchored prefix. By simulating it explicitly here, we lower
+            // those costs substantially. The cost is itself small, but it adds
+            // up for large haystacks.
+            //
+            // In order to simulate the '(?s:.)*?' prefix---which is not
+            // greedy---we are careful not to perform an epsilon closure on
+            // the start state if we already have a match. Namely, if we
+            // did otherwise, we would never reach a terminating condition
+            // because there would always be additional states to process.
+            if !matched {
+                // Since we are adding to the 'curr' active states and since
+                // this is for the start ID, we use a slots slice that is
+                // guaranteed to have the right length but where every element
+                // is absent. This is exactly what we want, because this
+                // epsilon closure is responsible for simulating an unanchored
+                // '(?s:.)*?' prefix. It is specifically outside of any
+                // capturing groups, and thus, using slots that are always
+                // absent is correct.
+                //
+                // Note though that we can't just use `&mut []` here, since
+                // this epsilon closure may traverse through `Capture` states
+                // transitions, and thus must be able to write offsets to the
+                // slots given which are later copied to slot values in `curr`.
+                let slots = next.slot_table.all_absent();
+                self.epsilon_closure(
+                    stack, slots, curr, haystack, at, start_id,
+                );
+            }
+            let (ch, len) = utf8::decode_lossy(&haystack[at..]);
+            if self.nexts(stack, curr, next, haystack, at, ch, len, slots) {
+                matched = true;
+            }
+            // Unless the caller asked us to return early, we need to mush
+            // on to see if we can extend our match. (But note that 'nexts'
+            // will quit right after seeing a match, as is consistent with
+            // leftmost-first match priority.)
+            if (earliest && matched) || len == 0 {
+                break;
+            }
+            core::mem::swap(curr, next);
+            next.set.clear();
+            at += len;
+        }
+        matched
+    }
+
+    /// Process the active states in 'curr' to find the states (written to
+    /// 'next') we should process for the next byte in the haystack.
+    ///
+    /// 'stack' is used to perform a depth first traversal of the NFA when
+    /// computing an epsilon closure.
+    ///
+    /// When a match is found, the slots for that match state (in 'curr') are
+    /// copied to 'slots'. Moreover, once a match is seen, processing for
+    /// 'curr' stops.
+    ///
+    /// `at_ch` is the Unicode scalar value whose UTF-8 encoding begins at `at`
+    /// in `haystack`.
+    ///
+    /// `at_len` is the number of bytes consumed by `at_ch`. This is usually
+    /// equal to `at_ch.len_utf8()`, but not always. For example, in the case
+    /// where `at_ch` is the replacement codepoint that results from decoding
+    /// invalid UTF-8. In that case, `at_len` can be 1, 2 or 3.
+    fn nexts(
+        &self,
+        stack: &mut Vec<FollowEpsilon>,
+        curr: &mut ActiveStates,
+        next: &mut ActiveStates,
+        haystack: &[u8],
+        at: usize,
+        at_ch: char,
+        at_len: usize,
+        slots: &mut [Option<NonMaxUsize>],
+    ) -> bool {
+        let ActiveStates { ref set, ref mut slot_table } = *curr;
+        for sid in set.iter() {
+            if self.next(
+                stack, slot_table, next, haystack, at, at_ch, at_len, sid,
+            ) {
+                slots.copy_from_slice(slot_table.for_state(sid));
+                return true;
+            }
+        }
+        false
+    }
+
+    /// Starting from `sid`, if the position `at` in the `haystack` has a
+    /// transition defined out of `sid`, then add the state transitioned to and
+    /// its epsilon closure to the `next` set of states to explore.
+    ///
+    /// `stack` is used by the epsilon closure computation to perform a depth
+    /// first traversal of the NFA.
+    ///
+    /// `curr_slot_table` should be the table of slots for the current set of
+    /// states being explored. If there is a transition out of `sid`, then
+    /// sid's row in the slot table is used to perform the epsilon closure.
+    ///
+    /// `at_ch` is the Unicode scalar value whose UTF-8 encoding begins at `at`
+    /// in `haystack`. The caller provides it so that this routine doesn't
+    /// need to re-decode it. (Since it's expected that this routine is called
+    /// multiple times for each position.)
+    ///
+    /// `at_len` is the number of bytes consumed by `at_ch`. This is usually
+    /// equal to `at_ch.len_utf8()`, but not always. For example, in the case
+    /// where `at_ch` is the replacement codepoint that results from decoding
+    /// invalid UTF-8. In that case, `at_len` can be 1, 2 or 3.
+    fn next(
+        &self,
+        stack: &mut Vec<FollowEpsilon>,
+        curr_slot_table: &mut SlotTable,
+        next: &mut ActiveStates,
+        haystack: &[u8],
+        at: usize,
+        at_ch: char,
+        at_len: usize,
+        sid: StateID,
+    ) -> bool {
+        match *self.nfa.state(sid) {
+            State::Fail
+            | State::Goto { .. }
+            | State::Splits { .. }
+            | State::Capture { .. } => false,
+            State::Char { target, ch } => {
+                if at_ch == ch && at_len > 0 {
+                    let slots = curr_slot_table.for_state(sid);
+                    // OK because `at_len` is always derived from the number
+                    // of bytes read from `at` that make up `at_ch`. So this
+                    // will never wrap.
+                    let at = at.wrapping_add(at_len);
+                    self.epsilon_closure(
+                        stack, slots, next, haystack, at, target,
+                    );
+                }
+                false
+            }
+            State::Ranges { target, ref ranges } => {
+                for (start, end) in ranges.iter().copied() {
+                    if start > at_ch {
+                        break;
+                    } else if start <= at_ch && at_ch <= end {
+                        if at_len == 0 {
+                            return false;
+                        }
+                        let slots = curr_slot_table.for_state(sid);
+                        // OK because `at_len` is always derived from the
+                        // number of bytes read from `at` that make up `at_ch`.
+                        // So this will never wrap.
+                        let at = at.wrapping_add(at_len);
+                        self.epsilon_closure(
+                            stack, slots, next, haystack, at, target,
+                        );
+                    }
+                }
+                false
+            }
+            State::Match => true,
+        }
+    }
+
+    /// Compute the epsilon closure of `sid`, writing the closure into `next`
+    /// while copying slot values from `curr_slots` into corresponding states
+    /// in `next`. `curr_slots` should be the slot values corresponding to
+    /// `sid`.
+    ///
+    /// The given `stack` is used to perform a depth first traversal of the
+    /// NFA by recursively following all epsilon transitions out of `sid`.
+    /// Conditional epsilon transitions are followed if and only if they are
+    /// satisfied for the position `at` in the `input` haystack.
+    ///
+    /// While this routine may write to `curr_slots`, once it returns, any
+    /// writes are undone and the original values (even if absent) are
+    /// restored.
+    fn epsilon_closure(
+        &self,
+        stack: &mut Vec<FollowEpsilon>,
+        curr_slots: &mut [Option<NonMaxUsize>],
+        next: &mut ActiveStates,
+        haystack: &[u8],
+        at: usize,
+        sid: StateID,
+    ) {
+        stack.push(FollowEpsilon::Explore(sid));
+        while let Some(frame) = stack.pop() {
+            match frame {
+                FollowEpsilon::RestoreCapture { slot, offset } => {
+                    curr_slots[slot.as_usize()] = offset;
+                }
+                FollowEpsilon::Explore(sid) => {
+                    self.epsilon_closure_explore(
+                        stack, curr_slots, next, haystack, at, sid,
+                    );
+                }
+            }
+        }
+    }
+
+    /// Explore all of the epsilon transitions out of `sid`. This is mostly
+    /// split out from `epsilon_closure` in order to clearly delineate
+    /// the actual work of computing an epsilon closure from the stack
+    /// book-keeping.
+    ///
+    /// This will push any additional explorations needed on to `stack`.
+    ///
+    /// `curr_slots` should refer to the slots for the currently active NFA
+    /// state. That is, the current state we are stepping through. These
+    /// slots are mutated in place as new `Captures` states are traversed
+    /// during epsilon closure, but the slots are restored to their original
+    /// values once the full epsilon closure is completed. The ultimate use of
+    /// `curr_slots` is to copy them to the corresponding `next_slots`, so that
+    /// the capturing group spans are forwarded from the currently active state
+    /// to the next.
+    ///
+    /// `next` refers to the next set of active states. Computing an epsilon
+    /// closure may increase the next set of active states.
+    ///
+    /// `haystack` refers to what we're searching and `at` refers to the
+    /// current position in the haystack. These are used to check whether
+    /// conditional epsilon transitions (like look-around) are satisfied at
+    /// the current position. If they aren't, then the epsilon closure won't
+    /// include them.
+    fn epsilon_closure_explore(
+        &self,
+        stack: &mut Vec<FollowEpsilon>,
+        curr_slots: &mut [Option<NonMaxUsize>],
+        next: &mut ActiveStates,
+        haystack: &[u8],
+        at: usize,
+        mut sid: StateID,
+    ) {
+        // We can avoid pushing some state IDs on to our stack in precisely
+        // the cases where a 'push(x)' would be immediately followed by a 'x
+        // = pop()'. This is achieved by this outer-loop. We simply set 'sid'
+        // to be the next state ID we want to explore once we're done with
+        // our initial exploration. In practice, this avoids a lot of stack
+        // thrashing.
+        loop {
+            // Record this state as part of our next set of active states. If
+            // we've already explored it, then no need to do it again.
+            if !next.set.insert(sid) {
+                return;
+            }
+            match *self.nfa.state(sid) {
+                State::Fail
+                | State::Match { .. }
+                | State::Char { .. }
+                | State::Ranges { .. } => {
+                    next.slot_table.for_state(sid).copy_from_slice(curr_slots);
+                    return;
+                }
+                State::Goto { target, look: None } => {
+                    sid = target;
+                }
+                State::Goto { target, look: Some(look) } => {
+                    if !look.is_match(haystack, at) {
+                        return;
+                    }
+                    sid = target;
+                }
+                State::Splits { ref targets, reverse: false } => {
+                    sid = match targets.get(0) {
+                        None => return,
+                        Some(&sid) => sid,
+                    };
+                    stack.extend(
+                        targets[1..]
+                            .iter()
+                            .copied()
+                            .rev()
+                            .map(FollowEpsilon::Explore),
+                    );
+                }
+                State::Splits { ref targets, reverse: true } => {
+                    sid = match targets.last() {
+                        None => return,
+                        Some(&sid) => sid,
+                    };
+                    stack.extend(
+                        targets[..targets.len() - 1]
+                            .iter()
+                            .copied()
+                            .map(FollowEpsilon::Explore),
+                    );
+                }
+                State::Capture { target, slot } => {
+                    // There's no need to do anything with slots that
+                    // ultimately won't be copied into the caller-provided
+                    // 'Captures' value. So we just skip dealing with them at
+                    // all.
+                    if slot.as_usize() < curr_slots.len() {
+                        stack.push(FollowEpsilon::RestoreCapture {
+                            slot,
+                            offset: curr_slots[slot.as_usize()],
+                        });
+                        // OK because length of a slice must fit into an isize.
+                        curr_slots[slot.as_usize()] =
+                            Some(NonMaxUsize::new(at).unwrap());
+                    }
+                    sid = target;
+                }
+            }
+        }
+    }
+}
+
+/// An iterator over all successive non-overlapping matches in a particular
+/// haystack. `'r` represents the lifetime of the regex (the cache guard is
+/// borrowed for the same lifetime) and `'h` represents the lifetime of the
+/// haystack.
+#[derive(Debug)]
+pub(crate) struct FindMatches<'r, 'h> {
+    pikevm: &'r PikeVM,
+    cache: CachePoolGuard<'r>,
+    haystack: &'h [u8],
+    at: usize,
+    slots: Vec<Option<NonMaxUsize>>,
+    last_match_end: Option<usize>,
+}
+
+impl<'r, 'h> Iterator for FindMatches<'r, 'h> {
+    type Item = (usize, usize);
+
+    fn next(&mut self) -> Option<(usize, usize)> {
+        if !self.pikevm.search(
+            &mut self.cache,
+            self.haystack,
+            self.at,
+            self.haystack.len(),
+            false,
+            &mut self.slots,
+        ) {
+            return None;
+        }
+        let mut m =
+            (self.slots[0].unwrap().get(), self.slots[1].unwrap().get());
+        if m.0 >= m.1 {
+            m = self.handle_overlapping_empty_match(m)?;
+        }
+        self.at = m.1;
+        self.last_match_end = Some(m.1);
+        Some(m)
+    }
+}
+
+impl<'r, 'h> FindMatches<'r, 'h> {
+    /// Handles the special case of an empty match by ensuring that 1) the
+    /// iterator always advances and 2) empty matches never overlap with other
+    /// matches.
+    ///
+    /// Note that we mark this cold and forcefully prevent inlining because
+    /// handling empty matches like this is extremely rare and does require a
+    /// bit of code, comparatively. Keeping this code out of the main iterator
+    /// function keeps it smaller and more amenable to inlining itself.
+    #[cold]
+    #[inline(never)]
+    fn handle_overlapping_empty_match(
+        &mut self,
+        mut m: (usize, usize),
+    ) -> Option<(usize, usize)> {
+        assert!(m.0 >= m.1);
+        if Some(m.1) == self.last_match_end {
+            let len =
+                core::cmp::max(1, utf8::decode(&self.haystack[self.at..]).1);
+            self.at = self.at.checked_add(len).unwrap();
+            if !self.pikevm.search(
+                &mut self.cache,
+                self.haystack,
+                self.at,
+                self.haystack.len(),
+                false,
+                &mut self.slots,
+            ) {
+                return None;
+            }
+            m = (self.slots[0].unwrap().get(), self.slots[1].unwrap().get());
+        }
+        Some(m)
+    }
+}
+
+/// An iterator over all successive non-overlapping capture matches in a
+/// particular haystack. `'r` represents the lifetime of the regex (the cache
+/// guard is borrowed for the same lifetime) and `'h` represents the lifetime
+/// of the haystack.
+#[derive(Debug)]
+pub(crate) struct CapturesMatches<'r, 'h> {
+    it: FindMatches<'r, 'h>,
+}
+
+impl<'r, 'h> Iterator for CapturesMatches<'r, 'h> {
+    type Item = Vec<Option<NonMaxUsize>>;
+
+    fn next(&mut self) -> Option<Vec<Option<NonMaxUsize>>> {
+        self.it.next()?;
+        Some(self.it.slots.clone())
+    }
+}
+
+/// A cache represents mutable state that a `PikeVM` requires during a search.
+///
+/// For a given `PikeVM`, its corresponding cache may be created either via
+/// `PikeVM::create_cache`, or via `Cache::new`. They are equivalent in every
+/// way, except the former does not require explicitly importing `Cache`.
+///
+/// A particular `Cache` is coupled with the `PikeVM` from which it was
+/// created. It may only be used with that `PikeVM`. A cache and its
+/// allocations may be re-purposed via `Cache::reset`, in which case, it can
+/// only be used with the new `PikeVM` (and not the old one).
+#[derive(Clone, Debug)]
+pub(crate) struct Cache {
+    /// Stack used while computing epsilon closure. This effectively lets us
+    /// move what is more naturally expressed through recursion to a stack
+    /// on the heap.
+    stack: Vec<FollowEpsilon>,
+    /// The current active states being explored for the current byte in the
+    /// haystack.
+    curr: ActiveStates,
+    /// The next set of states we're building that will be explored for the
+    /// next byte in the haystack.
+    next: ActiveStates,
+}
+
+impl Cache {
+    /// Create a new `PikeVM` cache.
+    ///
+    /// A potentially more convenient routine to create a cache is
+    /// `PikeVM::create_cache`, as it does not require also importing the
+    /// `Cache` type.
+    ///
+    /// If you want to reuse the returned `Cache` with some other `PikeVM`,
+    /// then you must call `Cache::reset` with the desired `PikeVM`.
+    pub(crate) fn new(re: &PikeVM) -> Cache {
+        Cache {
+            stack: vec![],
+            curr: ActiveStates::new(re),
+            next: ActiveStates::new(re),
+        }
+    }
+
+    /// Reset this cache such that it can be used for searching with a
+    /// different `PikeVM`.
+    ///
+    /// A cache reset permits reusing memory already allocated in this cache
+    /// with a different `PikeVM`.
+    pub(crate) fn reset(&mut self, re: &PikeVM) {
+        self.curr.reset(re);
+        self.next.reset(re);
+    }
+
+    /// Returns the heap memory usage, in bytes, of this cache.
+    ///
+    /// This does **not** include the stack size used up by this cache. To
+    /// compute that, use `std::mem::size_of::<Cache>()`.
+    pub(crate) fn memory_usage(&self) -> usize {
+        (self.stack.len() * core::mem::size_of::<FollowEpsilon>())
+            + self.curr.memory_usage()
+            + self.next.memory_usage()
+    }
+
+    /// Clears this cache. This should be called at the start of every search
+    /// to ensure we start with a clean slate.
+    ///
+    /// This also sets the length of the capturing groups used in the current
+    /// search. This permits an optimization whereby 'SlotTable::for_state'
+    /// only returns the number of slots equivalent to the number of slots
+    /// given in the 'Captures' value. This may be less than the total number
+    /// of possible slots, e.g., when one only wants to track overall match
+    /// offsets. This in turn permits less copying of capturing group spans
+    /// in the PikeVM.
+    fn setup_search(&mut self, captures_slot_len: usize) {
+        self.stack.clear();
+        self.curr.setup_search(captures_slot_len);
+        self.next.setup_search(captures_slot_len);
+    }
+}
+
+/// A set of active states used to "simulate" the execution of an NFA via the
+/// PikeVM.
+///
+/// There are two sets of these used during NFA simulation. One set corresponds
+/// to the "current" set of states being traversed for the current position
+/// in a haystack. The other set corresponds to the "next" set of states being
+/// built, which will become the new "current" set for the next position in the
+/// haystack. These two sets correspond to CLIST and NLIST in Thompson's
+/// original regex paper: https://dl.acm.org/doi/pdf/10.1145/363347.363387
+///
+/// In addition to representing a set of NFA states, this also maintains slot
+/// values for each state. These slot values are what turn the NFA simulation
+/// into the "Pike VM." Namely, they track capturing group values for each
+/// state. During the computation of epsilon closure, we copy slot values from
+/// states in the "current" set to the "next" set. Eventually, once a match
+/// is found, the slot values for that match state are what we write to the
+/// caller provided slots.
+#[derive(Clone, Debug)]
+struct ActiveStates {
+    /// The set of active NFA states. This set preserves insertion order, which
+    /// is critical for simulating the match semantics of backtracking regex
+    /// engines.
+    set: SparseSet,
+    /// The slots for every NFA state, where each slot stores a (possibly
+    /// absent) offset. Every capturing group has two slots. One for a start
+    /// offset and one for an end offset.
+    slot_table: SlotTable,
+}
+
+impl ActiveStates {
+    /// Create a new set of active states for the given PikeVM. The active
+    /// states returned may only be used with the given PikeVM. (Use 'reset'
+    /// to re-purpose the allocation for a different PikeVM.)
+    fn new(re: &PikeVM) -> ActiveStates {
+        let mut active = ActiveStates {
+            set: SparseSet::new(0),
+            slot_table: SlotTable::new(),
+        };
+        active.reset(re);
+        active
+    }
+
+    /// Reset this set of active states such that it can be used with the given
+    /// PikeVM (and only that PikeVM).
+    fn reset(&mut self, re: &PikeVM) {
+        self.set.resize(re.nfa().len());
+        self.slot_table.reset(re);
+    }
+
+    /// Return the heap memory usage, in bytes, used by this set of active
+    /// states.
+    ///
+    /// This does not include the stack size of this value.
+    fn memory_usage(&self) -> usize {
+        self.set.memory_usage() + self.slot_table.memory_usage()
+    }
+
+    /// Setup this set of active states for a new search. The given slot
+    /// length should be the number of slots in a caller provided 'Captures'
+    /// (and may be zero).
+    fn setup_search(&mut self, captures_slot_len: usize) {
+        self.set.clear();
+        self.slot_table.setup_search(captures_slot_len);
+    }
+}
+
+/// A table of slots, where each row represents a state in an NFA. Thus, the
+/// table has room for storing slots for every single state in an NFA.
+///
+/// This table is represented with a single contiguous allocation. In general,
+/// the notion of "capturing group" doesn't really exist at this level of
+/// abstraction, hence the name "slot" instead. (Indeed, every capturing group
+/// maps to a pair of slots, one for the start offset and one for the end
+/// offset.) Slots are indexed by the `Captures` NFA state.
+#[derive(Clone, Debug)]
+struct SlotTable {
+    /// The actual table of offsets.
+    table: Vec<Option<NonMaxUsize>>,
+    /// The number of slots per state, i.e., the table's stride or the length
+    /// of each row.
+    slots_per_state: usize,
+    /// The number of slots in the caller-provided `Captures` value for the
+    /// current search. Setting this to `slots_per_state` is always correct,
+    /// but may be wasteful.
+    slots_for_captures: usize,
+}
+
+impl SlotTable {
+    /// Create a new slot table.
+    ///
+    /// One should call 'reset' with the corresponding PikeVM before use.
+    fn new() -> SlotTable {
+        SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 }
+    }
+
+    /// Reset this slot table such that it can be used with the given PikeVM
+    /// (and only that PikeVM).
+    fn reset(&mut self, re: &PikeVM) {
+        let nfa = re.nfa();
+        // OK because NFA construction would have failed if this overflowed.
+        self.slots_per_state = nfa.group_len().checked_mul(2).unwrap();
+        // This is always correct, but may be reduced for a particular search
+        // if fewer slots were given by the caller, e.g., none at all or only
+        // slots for tracking the overall match instead of all slots for every
+        // group.
+        self.slots_for_captures = self.slots_per_state;
+        let len = nfa
+            .len()
+            // We add 1 so that our last row is always empty. We use it as
+            // "scratch" space for computing the epsilon closure off of the
+            // starting state.
+            .checked_add(1)
+            .and_then(|x| x.checked_mul(self.slots_per_state))
+            // It seems like this could actually panic on legitimate inputs
+            // on 32-bit targets. Should we somehow convert this to an error?
+            // What about something similar for the lazy DFA cache? If you're
+            // tripping this assert, please file a bug.
+            .expect("slot table length doesn't overflow");
+        self.table.resize(len, None);
+    }
+
+    /// Return the heap memory usage, in bytes, used by this slot table.
+    ///
+    /// This does not include the stack size of this value.
+    fn memory_usage(&self) -> usize {
+        self.table.len() * core::mem::size_of::<Option<NonMaxUsize>>()
+    }
+
+    /// Perform any per-search setup for this slot table.
+    ///
+    /// In particular, this sets the length of the number of slots used in the
+    /// slots given by the caller (if any at all). This number may be smaller
+    /// than the total number of slots available, e.g., when the caller is only
+    /// interested in tracking the overall match and not the spans of every
+    /// matching capturing group. Only tracking the overall match can save a
+    /// substantial amount of time copying capturing spans during a search.
+    fn setup_search(&mut self, captures_slot_len: usize) {
+        self.slots_for_captures = captures_slot_len;
+    }
+
+    /// Return a mutable slice of the slots for the given state.
+    ///
+    /// Note that the length of the slice returned may be less than the total
+    /// number of slots available for this state. In particular, the length
+    /// always matches the number of slots indicated via `setup_search`.
+    fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] {
+        let i = sid.as_usize() * self.slots_per_state;
+        &mut self.table[i..i + self.slots_for_captures]
+    }
+
+    /// Return a slice of slots of appropriate length where every slot offset
+    /// is guaranteed to be absent. This is useful in cases where you need to
+    /// compute an epsilon closure outside of the user supplied regex, and thus
+    /// never want it to have any capturing slots set.
+    fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] {
+        let i = self.table.len() - self.slots_per_state;
+        &mut self.table[i..i + self.slots_for_captures]
+    }
+}
+
+/// Represents a stack frame for use while computing an epsilon closure.
+///
+/// (An "epsilon closure" refers to the set of reachable NFA states from a
+/// single state without consuming any input. That is, the set of all epsilon
+/// transitions not only from that single state, but from every other state
+/// reachable by an epsilon transition as well. This is why it's called a
+/// "closure.")
+///
+/// Computing the epsilon closure in a Thompson NFA proceeds via a depth
+/// first traversal over all epsilon transitions from a particular state.
+/// (A depth first traversal is important because it emulates the same priority
+/// of matches that is typically found in backtracking regex engines.)
This +/// depth first traversal is naturally expressed using recursion, but to avoid +/// a call stack size proportional to the size of a regex, we put our stack on +/// the heap instead. +/// +/// This stack thus consists of call frames. The typical call frame is +/// `Explore`, which instructs epsilon closure to explore the epsilon +/// transitions from that state. (Subsequent epsilon transitions are then +/// pushed on to the stack as more `Explore` frames.) If the state ID being +/// explored has no epsilon transitions, then the capturing group slots are +/// copied from the original state that sparked the epsilon closure (from the +/// 'step' routine) to the state ID being explored. This way, capturing group +/// slots are forwarded from the previous state to the next. +/// +/// The other stack frame, `RestoreCapture`, instructs the epsilon closure to +/// set the position for a particular slot back to some particular offset. This +/// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will +/// set the offset of the slot indicated in `Capture` to the current offset, +/// and then push the old offset on to the stack as a `RestoreCapture` frame. +/// Thus, the new offset is only used until the epsilon closure reverts back to +/// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon +/// transition its "scope" to only states that come "after" it during depth +/// first traversal. +#[derive(Clone, Debug)] +enum FollowEpsilon { + /// Explore the epsilon transitions from a state ID. + Explore(StateID), + /// Reset the given `slot` to the given `offset` (which might be `None`). + RestoreCapture { slot: u32, offset: Option }, +} + +/// A sparse set used for representing ordered NFA states. +/// +/// This supports constant time addition and membership testing. Clearing an +/// entire set can also be done in constant time. Iteration yields elements +/// in the order in which they were inserted. 
+/// +/// The data structure is based on: https://research.swtch.com/sparse +/// Note though that we don't actually use uninitialized memory. We generally +/// reuse sparse sets, so the initial allocation cost is bearable. However, its +/// other properties listed above are extremely useful. +#[derive(Clone)] +struct SparseSet { + /// The number of elements currently in this set. + len: usize, + /// Dense contains the ids in the order in which they were inserted. + dense: Vec, + /// Sparse maps ids to their location in dense. + /// + /// A state ID is in the set if and only if + /// sparse[id] < len && id == dense[sparse[id]]. + /// + /// Note that these are indices into 'dense'. It's a little weird to use + /// StateID here, but we know our length can never exceed the bounds of + /// StateID (enforced by 'resize') and StateID will be at most 4 bytes + /// whereas a usize is likely double that in most cases. + sparse: Vec, +} + +impl SparseSet { + /// Create a new sparse set with the given capacity. + /// + /// Sparse sets have a fixed size and they cannot grow. Attempting to + /// insert more distinct elements than the total capacity of the set will + /// result in a panic. + /// + /// This panics if the capacity given is bigger than `StateID::LIMIT`. + fn new(capacity: usize) -> SparseSet { + let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] }; + set.resize(capacity); + set + } + + /// Resizes this sparse set to have the new capacity given. + /// + /// This set is automatically cleared. + /// + /// This panics if the capacity given is bigger than `StateID::LIMIT`. + fn resize(&mut self, new_capacity: usize) { + assert!( + new_capacity <= u32::MAX.as_usize(), + "sparse set capacity cannot excced {:?}", + u32::MAX, + ); + self.clear(); + self.dense.resize(new_capacity, 0); + self.sparse.resize(new_capacity, 0); + } + + /// Returns the capacity of this set. 
+ /// + /// The capacity represents a fixed limit on the number of distinct + /// elements that are allowed in this set. The capacity cannot be changed. + fn capacity(&self) -> usize { + self.dense.len() + } + + /// Returns the number of elements in this set. + fn len(&self) -> usize { + self.len + } + + /// Returns true if and only if this set is empty. + fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Insert the state ID value into this set and return true if the given + /// state ID was not previously in this set. + /// + /// This operation is idempotent. If the given value is already in this + /// set, then this is a no-op. + /// + /// If more than `capacity` ids are inserted, then this panics. + fn insert(&mut self, id: StateID) -> bool { + if self.contains(id) { + return false; + } + + let index = self.len(); + assert!( + index < self.capacity(), + "{:?} exceeds capacity of {:?} when inserting {:?}", + index, + self.capacity(), + id, + ); + self.dense[index] = id; + // OK because we don't permit the capacity to be set higher than + // u32::MAX. + self.sparse[id.as_usize()] = u32::try_from(index).unwrap(); + self.len += 1; + true + } + + /// Returns true if and only if this set contains the given value. + fn contains(&self, id: StateID) -> bool { + let index = self.sparse[id.as_usize()]; + index.as_usize() < self.len() && self.dense[index.as_usize()] == id + } + + /// Clear this set such that it has no members. + fn clear(&mut self) { + self.len = 0; + } + + /// Returns an iterator over all the state IDs in this set in the order in + /// which they were inserted. + fn iter(&self) -> SparseSetIter<'_> { + SparseSetIter(self.dense[..self.len()].iter()) + } + + /// Returns the heap memory usage, in bytes, used by this sparse set. 
+ fn memory_usage(&self) -> usize { + let idsize = core::mem::size_of::(); + (self.dense.len() * idsize) + (self.sparse.len() * idsize) + } +} + +impl core::fmt::Debug for SparseSet { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let elements: Vec = self.iter().collect(); + f.debug_tuple("SparseSet").field(&elements).finish() + } +} + +/// An iterator over all elements in a sparse set. +/// +/// The lifetime `'a` refers to the lifetime of the set being iterated over. +#[derive(Debug)] +struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>); + +impl<'a> Iterator for SparseSetIter<'a> { + type Item = StateID; + + fn next(&mut self) -> Option { + self.0.next().map(|&id| id) + } +} diff --git a/regex-lite/src/pool.rs b/regex-lite/src/pool.rs new file mode 100644 index 000000000..75277469e --- /dev/null +++ b/regex-lite/src/pool.rs @@ -0,0 +1,160 @@ +use core::{ + cell::UnsafeCell, + panic::{RefUnwindSafe, UnwindSafe}, + sync::atomic::{AtomicBool, Ordering}, +}; + +use alloc::{boxed::Box, vec, vec::Vec}; + +use crate::pikevm; + +// Literally the only reason that this crate requires 'std' currently. +// +// In regex-automata, we support the no-std use case by rolling our own +// spin-lock based Mutex. That's questionable on its own, but it's not clear if +// we should be doing that here. It will require introducing non-safe code in a +// crate that is otherwise safe. But maybe it's worth doing? +use std::sync::Mutex; + +/// A type alias for our pool of pikevm::Cache that fixes the type parameters to +/// what we use for the regex. +pub(crate) type CachePool = Pool; + +/// Same as above, but for the guard returned by a pool. +pub(crate) type CachePoolGuard<'a> = PoolGuard<'a, pikevm::Cache, CachePoolFn>; + +/// The type of the closure we use to create new caches. We need to spell out +/// all of the marker traits or else we risk leaking !MARKER impls. 
+pub(crate) type CachePoolFn = + Box pikevm::Cache + Send + Sync + UnwindSafe + RefUnwindSafe>; + +/// A thread safe pool utilizing alloc-only features. +/// +/// Unlike the pool in regex-automata, this has no "fast path." We could add +/// it, but it's more code and requires reasoning about safety. +pub(super) struct Pool { + /// A stack of T values to hand out. Every call to 'get' pops from this + /// stack, since this pool has no owner-thread fast path. + stack: Mutex>>, + /// A function to create more T values when stack is empty and a caller + /// has requested a T. + create: F, +} + +// If T is UnwindSafe, then since we provide exclusive access to any +// particular value in the pool, it should therefore also be considered +// RefUnwindSafe. +impl RefUnwindSafe for Pool {} + +impl Pool { + /// Create a new pool. The given closure is used to create values in + /// the pool when necessary. + pub(super) const fn new(create: F) -> Pool { + Pool { stack: Mutex::new(vec![]), create } + } +} + +impl T> Pool { + /// Get a value from the pool. This may block if another thread is also + /// attempting to retrieve a value from the pool. + pub(super) fn get(&self) -> PoolGuard<'_, T, F> { + let mut stack = self.stack.lock().unwrap(); + let value = match stack.pop() { + None => Box::new((self.create)()), + Some(value) => value, + }; + PoolGuard { pool: self, value: Some(value) } + } + + fn put(&self, guard: PoolGuard<'_, T, F>) { + let mut guard = core::mem::ManuallyDrop::new(guard); + if let Some(value) = guard.value.take() { + self.put_value(value); + } + } + + /// Puts a value back into the pool. Callers don't need to call this. + /// Once the guard that's returned by 'get' is dropped, it is put back + /// into the pool automatically. 
+ fn put_value(&self, value: Box) { + let mut stack = self.stack.lock().unwrap(); + stack.push(value); + } +} + +impl core::fmt::Debug for Pool { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("Pool").field("stack", &self.stack).finish() + } +} + +/// A guard that is returned when a caller requests a value from the pool. +pub(super) struct PoolGuard<'a, T: Send, F: Fn() -> T> { + /// The pool that this guard is attached to. + pool: &'a Pool, + /// This is None after the guard has been put back into the pool. + value: Option>, +} + +impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { + /// Return the underlying value. + pub(super) fn value(&self) -> &T { + self.value.as_deref().unwrap() + } + + /// Return the underlying value as a mutable borrow. + pub(super) fn value_mut(&mut self) -> &mut T { + self.value.as_deref_mut().unwrap() + } + + /// Consumes this guard and puts it back into the pool. + pub(super) fn put(this: PoolGuard<'_, T, F>) { + // Since this is effectively consuming the guard and putting the + // value back into the pool, there's no reason to run its Drop + // impl after doing this. I don't believe there is a correctness + // problem with doing so, but there's definitely a perf problem + // by redoing this work. So we avoid it. + let mut this = core::mem::ManuallyDrop::new(this); + this.put_imp(); + } + + /// Puts this guard back into the pool by only borrowing the guard as + /// mutable. This should be called at most once. 
+ #[inline(always)] + fn put_imp(&mut self) { + if let Some(value) = self.value.take() { + self.pool.put_value(value); + } + } +} + +impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> { + fn drop(&mut self) { + self.put_imp(); + } +} + +impl<'a, T: Send, F: Fn() -> T> core::ops::Deref for PoolGuard<'a, T, F> { + type Target = T; + + fn deref(&self) -> &T { + self.value() + } +} + +impl<'a, T: Send, F: Fn() -> T> core::ops::DerefMut for PoolGuard<'a, T, F> { + fn deref_mut(&mut self) -> &mut T { + self.value_mut() + } +} + +impl<'a, T: Send + core::fmt::Debug, F: Fn() -> T> core::fmt::Debug + for PoolGuard<'a, T, F> +{ + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_struct("PoolGuard") + .field("pool", &self.pool) + .field("value", &self.value) + .finish() + } +} diff --git a/regex-lite/src/string.rs b/regex-lite/src/string.rs new file mode 100644 index 000000000..915c0b0b0 --- /dev/null +++ b/regex-lite/src/string.rs @@ -0,0 +1,865 @@ +use core::{ + cell::RefCell, + panic::{RefUnwindSafe, UnwindSafe}, +}; + +use alloc::{ + boxed::Box, string::String, string::ToString, sync::Arc, vec, vec::Vec, +}; + +use crate::{ + error::Error, + hir::{self, Hir}, + int::NonMaxUsize, + interpolate, + nfa::{self, NFA}, + pikevm::{self, Cache, PikeVM}, + pool::{CachePool, CachePoolFn, CachePoolGuard}, +}; + +#[derive(Debug)] +pub struct Regex { + pikevm: Arc, + pool: CachePool, +} + +impl Regex { + pub fn new(pattern: &str) -> Result { + RegexBuilder::new(pattern).build() + } + + pub fn is_match(&self, haystack: &str) -> bool { + self.is_match_at(haystack, 0) + } + + pub fn find<'h>(&self, haystack: &'h str) -> Option> { + self.find_at(haystack, 0) + } + + pub fn find_iter<'r, 'h>(&'r self, haystack: &'h str) -> Matches<'r, 'h> { + Matches { + haystack, + it: self.pikevm.find_iter(self.pool.get(), haystack.as_bytes()), + } + } + + pub fn captures<'h>(&self, haystack: &'h str) -> Option> { + self.captures_at(haystack, 0) + } + + pub fn 
captures_iter<'r, 'h>( + &'r self, + haystack: &'h str, + ) -> CaptureMatches<'r, 'h> { + CaptureMatches { + haystack, + re: self, + it: self + .pikevm + .captures_iter(self.pool.get(), haystack.as_bytes()), + } + } +} + +/// Advanced or "lower level" search methods. +impl Regex { + /// Returns the end location of a match in the haystack given. + /// + /// This method may have the same performance characteristics as + /// `is_match`, except it provides an end location for a match. In + /// particular, the location returned *may be shorter* than the proper end + /// of the leftmost-first match that you would find via `Regex::find`. + /// + /// Note that it is not guaranteed that this routine finds the shortest or + /// "earliest" possible match. Instead, the main idea of this API is that + /// it returns the offset at the point at which the internal regex engine + /// has determined that a match has occurred. This may vary depending on + /// which internal regex engine is used, and thus, the offset itself may + /// change. + /// + /// # Example + /// + /// Typically, `a+` would match the entire first sequence of `a` in some + /// haystack, but `shortest_match` can give up as soon as it sees the first + /// `a`. + /// + /// ``` + /// + /// use regex_lite::Regex; + /// let haystack = "aaaaa"; + /// let pos = Regex::new(r"a+").unwrap().shortest_match(haystack); + /// assert_eq!(pos, Some(1)); + /// ``` + pub fn shortest_match(&self, haystack: &str) -> Option { + self.shortest_match_at(haystack, 0) + } + + /// Returns the same as `shortest_match`, but starts the search at the + /// given offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only match + /// when `start == 0`. 
+ pub fn shortest_match_at( + &self, + haystack: &str, + start: usize, + ) -> Option { + let mut cache = self.pool.get(); + let mut slots = [None, None]; + let matched = self.pikevm.search( + &mut cache, + haystack.as_bytes(), + start, + haystack.len(), + true, + &mut slots, + ); + if !matched { + return None; + } + Some(slots[1].unwrap().get()) + } + + /// Returns the same as is_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn is_match_at(&self, haystack: &str, start: usize) -> bool { + let mut cache = self.pool.get(); + self.pikevm.search( + &mut cache, + haystack.as_bytes(), + start, + haystack.len(), + true, + &mut [], + ) + } + + /// Returns the same as find, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn find_at<'h>( + &self, + haystack: &'h str, + start: usize, + ) -> Option> { + let mut cache = self.pool.get(); + let mut slots = [None, None]; + let matched = self.pikevm.search( + &mut cache, + haystack.as_bytes(), + start, + haystack.len(), + false, + &mut slots, + ); + if !matched { + return None; + } + let (start, end) = (slots[0].unwrap().get(), slots[1].unwrap().get()); + Some(Match::new(haystack, start, end)) + } + + /// Returns the same as [`Regex::captures`], but starts the search at the + /// given offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. 
+ #[inline] + pub fn captures_at<'h>( + &self, + haystack: &'h str, + start: usize, + ) -> Option> { + let mut caps = Captures { + haystack, + slots: self.capture_locations(), + pikevm: Arc::clone(&self.pikevm), + }; + let mut cache = self.pool.get(); + let matched = self.pikevm.search( + &mut cache, + haystack.as_bytes(), + start, + haystack.len(), + false, + &mut caps.slots.0, + ); + if !matched { + return None; + } + Some(caps) + } + + /// This is like `captures`, but uses + /// [`CaptureLocations`](struct.CaptureLocations.html) + /// instead of + /// [`Captures`](struct.Captures.html) in order to amortize allocations. + /// + /// To create a `CaptureLocations` value, use the + /// `Regex::capture_locations` method. + /// + /// This returns the overall match if this was successful, which is always + /// equivalent to the `0`th capture group. + #[inline] + pub fn captures_read<'h>( + &self, + locs: &mut CaptureLocations, + haystack: &'h str, + ) -> Option> { + self.captures_read_at(locs, haystack, 0) + } + + /// Returns the same as captures, but starts the search at the given + /// offset and populates the capture locations given. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[inline] + pub fn captures_read_at<'h>( + &self, + locs: &mut CaptureLocations, + haystack: &'h str, + start: usize, + ) -> Option> { + let mut cache = self.pool.get(); + let matched = self.pikevm.search( + &mut cache, + haystack.as_bytes(), + start, + haystack.len(), + false, + &mut locs.0, + ); + if !matched { + return None; + } + let (start, end) = locs.get(0).unwrap(); + Some(Match::new(haystack, start, end)) + } + + /// An undocumented alias for `captures_read_at`. + /// + /// The `regex-capi` crate previously used this routine, so to avoid + /// breaking that crate, we continue to provide the name as an undocumented + /// alias. 
+ #[doc(hidden)] + #[inline] + pub fn read_captures_at<'h>( + &self, + locs: &mut CaptureLocations, + haystack: &'h str, + start: usize, + ) -> Option> { + self.captures_read_at(locs, haystack, start) + } +} + +/// Auxiliary methods. +impl Regex { + /// Returns the original string of this regex. + #[inline] + pub fn as_str(&self) -> &str { + &self.pikevm.nfa().pattern() + } + + /// Returns an iterator over the capture names. + pub fn capture_names(&self) -> CaptureNames<'_> { + CaptureNames(self.pikevm.nfa().capture_names()) + } + + /// Returns the number of captures. + pub fn captures_len(&self) -> usize { + self.pikevm.nfa().group_len() + } + + /// Returns an empty set of capture locations that can be reused in + /// multiple calls to `captures_read` or `captures_read_at`. + #[inline] + pub fn capture_locations(&self) -> CaptureLocations { + // OK because NFA construction would have failed if this overflowed. + let len = self.pikevm.nfa().group_len().checked_mul(2).unwrap(); + CaptureLocations(vec![None; len]) + } + + /// An alias for `capture_locations` to preserve backward compatibility. + /// + /// The `regex-capi` crate uses this method, so to avoid breaking that + /// crate, we continue to export it as an undocumented API. + #[doc(hidden)] + #[inline] + pub fn locations(&self) -> CaptureLocations { + self.capture_locations() + } +} + +impl Clone for Regex { + fn clone(&self) -> Regex { + let pikevm = Arc::clone(&self.pikevm); + let pool = { + let pikevm = Arc::clone(&self.pikevm); + let create = Box::new(move || Cache::new(&pikevm)); + CachePool::new(create) + }; + Regex { pikevm, pool } + } +} + +/// Match represents a single match of a regex in a haystack. +/// +/// The lifetime parameter `'h` refers to the lifetime of the haystack. +#[derive(Copy, Clone, Eq, PartialEq)] +pub struct Match<'h> { + haystack: &'h str, + start: usize, + end: usize, +} + +impl<'h> Match<'h> { + /// Creates a new match from the given haystack and byte offsets. 
+ fn new(haystack: &'h str, start: usize, end: usize) -> Match<'h> { + Match { haystack, start, end } + } + + /// Returns the starting byte offset of the match in the haystack. + #[inline] + pub fn start(&self) -> usize { + self.start + } + + /// Returns the ending byte offset of the match in the haystack. + #[inline] + pub fn end(&self) -> usize { + self.end + } + + /// Returns true if and only if this match has a length of zero. + #[inline] + pub fn is_empty(&self) -> bool { + self.start == self.end + } + + /// Returns the length, in bytes, of this match. + #[inline] + pub fn len(&self) -> usize { + self.end - self.start + } + + /// Returns the range over the starting and ending byte offsets of the + /// match in the haystack. + #[inline] + pub fn range(&self) -> core::ops::Range { + self.start..self.end + } + + /// Returns the matched portion of the haystack. + #[inline] + pub fn as_str(&self) -> &'h str { + &self.haystack[self.range()] + } +} + +impl<'h> std::fmt::Debug for Match<'h> { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.debug_struct("Match") + .field("start", &self.start) + .field("end", &self.end) + .field("haystack", &self.as_str()) + .finish() + } +} + +impl<'h> From> for &'h str { + fn from(m: Match<'h>) -> &'h str { + m.as_str() + } +} + +impl<'h> From> for core::ops::Range { + fn from(m: Match<'h>) -> core::ops::Range { + m.range() + } +} + +/// Captures represents a group of captured strings for a single match. +/// +/// The 0th capture always corresponds to the entire match. Each subsequent +/// index corresponds to the next capture group in the regex. If a capture +/// group is named, then the matched string is *also* available via the `name` +/// method. (Note that the 0th capture is always unnamed and so must be +/// accessed with the `get` method.) +/// +/// Positions returned from a capture group are always byte indices. +/// +/// `'h` is the lifetime of the matched haystack. 
+pub struct Captures<'h> { + haystack: &'h str, + slots: CaptureLocations, + // It's a little weird to put the PikeVM in our Captures, but it's the + // simplest thing to do and is cheap. The PikeVM gives us access to the + // NFA and the NFA gives us access to the capture name<->index mapping. + pikevm: Arc, +} + +impl<'h> Captures<'h> { + /// Returns the match associated with the capture group at index `i`. If + /// `i` does not correspond to a capture group, or if the capture group + /// did not participate in the match, then `None` is returned. + /// + /// # Examples + /// + /// Get the haystack of the match with a default of an empty string if this + /// group didn't participate in the match: + /// + /// ```rust + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap(); + /// let caps = re.captures("abc123").unwrap(); + /// + /// let hay1 = caps.get(1).map_or("", |m| m.as_str()); + /// let hay2 = caps.get(2).map_or("", |m| m.as_str()); + /// assert_eq!(hay1, "123"); + /// assert_eq!(hay2, ""); + /// ``` + #[inline] + pub fn get(&self, i: usize) -> Option> { + self.slots.get(i).map(|(s, e)| Match::new(self.haystack, s, e)) + } + + /// Returns the match for the capture group named `name`. If `name` isn't a + /// valid capture group or didn't match anything, then `None` is returned. + #[inline] + pub fn name(&self, name: &str) -> Option> { + let i = self.pikevm.nfa().to_index(name)?; + self.get(i) + } + + /// An iterator that yields all capturing matches in the order in which + /// they appear in the regex. If a particular capture group didn't + /// participate in the match, then `None` is yielded for that capture. + /// + /// The first match always corresponds to the overall match of the regex. 
+ #[inline] + pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 'h> { + SubCaptureMatches { + haystack: self.haystack, + caps: self, + it: self.pikevm.nfa().capture_names().enumerate(), + } + } + + /// Expands all instances of `$name` in `replacement` to the corresponding + /// capture group `name`, and writes them to the `dst` buffer given. + /// + /// `name` may be an integer corresponding to the index of the capture + /// group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name consisting of the characters `[_0-9A-Za-z]` + /// is used. e.g., `$1a` looks up the capture group named `1a` and not the + /// capture group at index `1`. To exert more precise control over the + /// name, or to refer to a capture group name that uses characters outside + /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When + /// using braces, any sequence of characters is permitted. If the sequence + /// does not refer to a capture group name in the corresponding regex, then + /// it is replaced with an empty string. + /// + /// To write a literal `$` use `$$`. + #[inline] + pub fn expand(&self, replacement: &str, dst: &mut String) { + interpolate::string( + replacement, + |index, dst| { + let m = match self.get(index) { + None => return, + Some(m) => m, + }; + dst.push_str(&self.haystack[m.range()]); + }, + |name| self.pikevm.nfa().to_index(name), + dst, + ); + } + + /// Returns the total number of capture groups (even if they didn't match). + /// + /// This is always at least `1`, since every regex has at least one capture + /// group that corresponds to the full match. 
+ #[inline] + pub fn len(&self) -> usize { + self.pikevm.nfa().group_len() + } +} + +impl<'h> core::fmt::Debug for Captures<'h> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + // TODO: Make this better. + f.debug_tuple("Captures").field(&self.slots).finish() + } +} + +/// Get a group by index. +/// +/// `'h` is the lifetime of the matched portion of the haystack. +/// +/// The haystack can't outlive the `Captures` object if this method is used, +/// because of how `Index` is defined (normally `a[i]` is part of `a` and can't +/// outlive it); to do that, use `get()` instead. +/// +/// # Panics +/// +/// If there is no group at the given index. +impl<'h> core::ops::Index for Captures<'h> { + type Output = str; + + fn index(&self, i: usize) -> &str { + self.get(i) + .map(|m| m.as_str()) + .unwrap_or_else(|| panic!("no group at index '{}'", i)) + } +} + +/// Get a group by name. +/// +/// `'h` is the lifetime of the matched portion of the haystack and `'n` is the +/// lifetime of the group name that is used as the lookup key. +/// +/// The haystack can't outlive the `Captures` object if this method is used, +/// because of how `Index` is defined (normally `a[i]` is part of `a` and can't +/// outlive it); to do that, use `name` instead. +/// +/// # Panics +/// +/// If there is no group named by the given value. +impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { + type Output = str; + + fn index<'a>(&'a self, name: &'n str) -> &'a str { + self.name(name) + .map(|m| m.as_str()) + .unwrap_or_else(|| panic!("no group named '{}'", name)) + } +} + +/// An iterator that yields all capturing matches in the order in which they +/// appear in the regex. +/// +/// If a particular capture group didn't participate in the match, then `None` +/// is yielded for that capture. The first match always corresponds to the +/// overall match of the regex. 
+/// +/// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and +/// the lifetime `'h` corresponds to the originally matched haystack. +#[derive(Clone, Debug)] +pub struct SubCaptureMatches<'c, 'h> { + haystack: &'h str, + caps: &'c Captures<'h>, + it: core::iter::Enumerate>, +} + +impl<'c, 'h> Iterator for SubCaptureMatches<'c, 'h> { + type Item = Option>; + + #[inline] + fn next(&mut self) -> Option>> { + let (group_index, _) = self.it.next()?; + Some(self.caps.get(group_index)) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.it.size_hint() + } + + #[inline] + fn count(self) -> usize { + self.it.count() + } +} + +impl<'c, 'h> ExactSizeIterator for SubCaptureMatches<'c, 'h> {} + +impl<'c, 'h> core::iter::FusedIterator for SubCaptureMatches<'c, 'h> {} + +/// CaptureLocations is a low level representation of the raw offsets of each +/// submatch. +/// +/// You can think of this as a lower level [`Captures`], where this type does +/// not support named capturing groups directly and it does not borrow the +/// haystack that these offsets were matched on. +/// +/// Primarily, this type is useful when using the lower level `Regex` APIs +/// such as `captures_read`, which permits amortizing the allocation in which +/// capture match locations are stored. +/// +/// In order to build a value of this type, you'll need to call the +/// `capture_locations` method on the `Regex` being used to execute the search. +/// The value returned can then be reused in subsequent searches. +/// +/// # Example +/// +/// This example shows how to create and use `CaptureLocations` in a search. 
+/// +/// ``` +/// use regex_lite::Regex; +/// +/// let re = Regex::new(r"(?\w+)\s+(?\w+)").unwrap(); +/// let mut locs = re.capture_locations(); +/// let m = re.captures_read(&mut locs, "Bruce Springsteen").unwrap(); +/// assert_eq!(0..17, m.range()); +/// assert_eq!(Some((0, 17)), locs.get(0)); +/// assert_eq!(Some((0, 5)), locs.get(1)); +/// assert_eq!(Some((6, 17)), locs.get(2)); +/// +/// // Asking for an invalid capture group always returns None. +/// assert_eq!(None, locs.get(3)); +/// assert_eq!(None, locs.get(34973498648)); +/// assert_eq!(None, locs.get(9944060567225171988)); +/// ``` +#[derive(Clone, Debug)] +pub struct CaptureLocations(Vec>); + +/// A type alias for `CaptureLocations` for backwards compatibility. +/// +/// Previously, we exported `CaptureLocations` as `Locations` in an +/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`), +/// we continue re-exporting the same undocumented API. +#[doc(hidden)] +pub type Locations = CaptureLocations; + +impl CaptureLocations { + /// Returns the start and end positions of the Nth capture group. Returns + /// `None` if `i` is not a valid capture group or if the capture group did + /// not match anything. The positions returned are *always* byte indices + /// with respect to the original string matched. + #[inline] + pub fn get(&self, i: usize) -> Option<(usize, usize)> { + let slot = i.checked_mul(2)?; + let start = self.0.get(slot).copied()??.get(); + let slot = slot.checked_add(1)?; + let end = self.0.get(slot).copied()??.get(); + Some((start, end)) + } + + /// Returns the total number of capture groups (even if they didn't match). + /// + /// This is always at least `1` since every regex has at least `1` + /// capturing group that corresponds to the entire match. + #[inline] + pub fn len(&self) -> usize { + // We always have twice as many slots as groups. + self.0.len().checked_shr(1).unwrap() + } + + /// An alias for the `get` method for backwards compatibility. 
+ /// + /// Previously, we exported `get` as `pos` in an undocumented API. To + /// prevent breaking that code (e.g., in `regex-capi`), we continue + /// re-exporting the same undocumented API. + #[doc(hidden)] + #[inline] + pub fn pos(&self, i: usize) -> Option<(usize, usize)> { + self.get(i) + } +} + +/// An iterator over the names of all possible captures. +/// +/// `None` indicates an unnamed capture; the first element (capture 0, the +/// whole matched region) is always unnamed. +/// +/// `'r` is the lifetime of the compiled regular expression. +#[derive(Clone, Debug)] +pub struct CaptureNames<'r>(nfa::CaptureNames<'r>); + +impl<'r> Iterator for CaptureNames<'r> { + type Item = Option<&'r str>; + + #[inline] + fn next(&mut self) -> Option<Option<&'r str>> { + self.0.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option<usize>) { + self.0.size_hint() + } + + #[inline] + fn count(self) -> usize { + self.0.count() + } +} + +impl<'r> ExactSizeIterator for CaptureNames<'r> {} + +impl<'r> core::iter::FusedIterator for CaptureNames<'r> {} + +/// An iterator over all non-overlapping matches for a particular string. +/// +/// The iterator yields a `Match` value. The iterator stops when no more +/// matches can be found. +/// +/// `'r` is the lifetime of the compiled regular expression and `'h` is the +/// lifetime of the matched string. +#[derive(Debug)] +pub struct Matches<'r, 'h> { + haystack: &'h str, + it: pikevm::FindMatches<'r, 'h>, +} + +impl<'r, 'h> Iterator for Matches<'r, 'h> { + type Item = Match<'h>; + + #[inline] + fn next(&mut self) -> Option<Match<'h>> { + self.it.next().map(|(s, e)| Match::new(self.haystack, s, e)) + } + + #[inline] + fn count(self) -> usize { + self.it.count() + } +} + +impl<'r, 'h> core::iter::FusedIterator for Matches<'r, 'h> {} + +/// An iterator that yields all non-overlapping capture groups matching a +/// particular regular expression. +/// +/// The iterator stops when no more matches can be found.
+/// +/// `'r` is the lifetime of the compiled regular expression and `'h` is the +/// lifetime of the matched string. +#[derive(Debug)] +pub struct CaptureMatches<'r, 'h> { + haystack: &'h str, + re: &'r Regex, + it: pikevm::CapturesMatches<'r, 'h>, +} + +impl<'r, 'h> Iterator for CaptureMatches<'r, 'h> { + type Item = Captures<'h>; + + #[inline] + fn next(&mut self) -> Option<Captures<'h>> { + self.it.next().map(|slots| Captures { + haystack: self.haystack, + slots: CaptureLocations(slots), + pikevm: Arc::clone(&self.re.pikevm), + }) + } + + #[inline] + fn count(self) -> usize { + self.it.count() + } +} + +impl<'r, 'h> core::iter::FusedIterator for CaptureMatches<'r, 'h> {} + +#[derive(Debug)] +pub struct RegexBuilder { + pattern: String, + hir_config: hir::Config, + nfa_config: nfa::Config, +} + +impl RegexBuilder { + pub fn new(pattern: &str) -> RegexBuilder { + RegexBuilder { + pattern: pattern.to_string(), + hir_config: hir::Config::default(), + nfa_config: nfa::Config::default(), + } + } + + pub fn build(&self) -> Result<Regex, Error> { + let hir = Hir::parse(self.hir_config, &self.pattern)?; + let nfa = NFA::new(self.nfa_config, self.pattern.clone(), &hir)?; + let pikevm = Arc::new(PikeVM::new(nfa)); + let pool = { + let pikevm = Arc::clone(&pikevm); + let create = Box::new(move || Cache::new(&pikevm)); + CachePool::new(create) + }; + Ok(Regex { pikevm, pool }) + } + + pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder { + self.hir_config.flags.case_insensitive = yes; + self + } + + pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder { + self.hir_config.flags.multi_line = yes; + self + } + + pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder { + self.hir_config.flags.crlf = yes; + self + } + + pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder { + self.hir_config.flags.dot_matches_new_line = yes; + self + } + + pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { + self.hir_config.flags.swap_greed = yes; + self + } + + pub fn
ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder { + self.hir_config.flags.ignore_whitespace = yes; + self + } + + pub fn size_limit(&mut self, limit: usize) -> &mut RegexBuilder { + self.nfa_config.size_limit = Some(limit); + self + } + + pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder { + self.hir_config.nest_limit = limit; + self + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn scratch() { + let re = Regex::new("abc").unwrap(); + assert_eq!(Some(0..3), re.find("abc").map(|m| m.range())); + + let re = Regex::new("abc").unwrap(); + assert_eq!(Some(4..7), re.find("foo abc").map(|m| m.range())); + + let re = Regex::new("^abc").unwrap(); + assert_eq!(Some(0..3), re.find("abc").map(|m| m.range())); + + let re = Regex::new("^abc").unwrap(); + assert_eq!(None, re.find("foo abc").map(|m| m.range())); + + let re = Regex::new("(?Rm)^foo$").unwrap(); + assert_eq!(Some(2..5), re.find("\r\nfoo\r\n").map(|m| m.range())); + } +} diff --git a/regex-lite/src/utf8.rs b/regex-lite/src/utf8.rs new file mode 100644 index 000000000..cb361ac5a --- /dev/null +++ b/regex-lite/src/utf8.rs @@ -0,0 +1,445 @@ +/// Returns true if and only if the given byte is considered a word character. +/// This only applies to ASCII. +pub(crate) fn is_word_byte(b: u8) -> bool { + const fn mkwordset() -> [bool; 256] { + // FIXME: Use as_usize() once const functions in traits are stable. + let mut set = [false; 256]; + set[b'_' as usize] = true; + + let mut byte = b'0'; + while byte <= b'9' { + set[byte as usize] = true; + byte += 1; + } + byte = b'A'; + while byte <= b'Z' { + set[byte as usize] = true; + byte += 1; + } + byte = b'a'; + while byte <= b'z' { + set[byte as usize] = true; + byte += 1; + } + set + } + const WORD: [bool; 256] = mkwordset(); + WORD[b as usize] +} + +/// The accept state index. When we enter this state, we know we've found a +/// valid Unicode scalar value. +const ACCEPT: usize = 12; +/// The reject state index. 
When we enter this state, we know that we've found +/// invalid UTF-8. +const REJECT: usize = 0; + +/// Like `decode`, but automatically converts the `None` case to the +/// replacement codepoint. +pub(crate) fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) { + match decode(slice) { + (Some(ch), size) => (ch, size), + (None, size) => ('\u{FFFD}', size), + } +} + +/// Like `decode_last`, but automatically converts the `None` case to the +/// replacement codepoint. +pub(crate) fn decode_last_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) { + match decode_last(slice) { + (Some(ch), size) => (ch, size), + (None, size) => ('\u{FFFD}', size), + } +} + +/// UTF-8 decode a single Unicode scalar value from the beginning of a slice. +/// +/// When successful, the corresponding Unicode scalar value is returned along +/// with the number of bytes it was encoded with. The number of bytes consumed +/// for a successful decode is always between 1 and 4, inclusive. +/// +/// When unsuccessful, `None` is returned along with the number of bytes that +/// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case, +/// the number of bytes consumed is always between 0 and 3, inclusive, where +/// 0 is only returned when `slice` is empty. +pub(crate) fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) { + let slice = slice.as_ref(); + match slice.get(0) { + None => return (None, 0), + Some(&b) if b <= 0x7F => return (Some(b as char), 1), + _ => {} + } + + let (mut state, mut cp, mut i) = (ACCEPT, 0, 0); + while i < slice.len() { + decode_step(&mut state, &mut cp, slice[i]); + i += 1; + + if state == ACCEPT { + // OK since `decode_step` guarantees that `cp` is a valid Unicode + // scalar value in an ACCEPT state. + // + // We don't have to use safe code here, but do so because perf + // isn't our primary objective in regex-lite. + let ch = char::from_u32(cp).unwrap(); + return (Some(ch), i); + } else if state == REJECT { + // At this point, we always want to advance at least one byte.
+ return (None, core::cmp::max(1, i.saturating_sub(1))); + } + } + (None, i) +} + +/// Like `decode`, but in reverse from the end of the given slice. +pub(crate) fn decode_last<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) { + // TODO: We could implement this by reversing the UTF-8 automaton, but for + // now, we do it the slow way by using the forward automaton. + + let slice = slice.as_ref(); + if slice.is_empty() { + return (None, 0); + } + let mut start = slice.len() - 1; + let limit = slice.len().saturating_sub(4); + while start > limit && !is_leading_or_invalid_utf8_byte(slice[start]) { + start -= 1; + } + let (ch, size) = decode(&slice[start..]); + // If we didn't consume all of the bytes, then that means there's at least + // one stray byte that never occurs in a valid code unit prefix, so we can + // advance by one byte. + if start + size != slice.len() { + (None, 1) + } else { + (ch, size) + } +} + +/// Transitions to the next state and updates `cp` while it does. +fn decode_step(state: &mut usize, cp: &mut u32, b: u8) { + // Splits the space of all bytes into equivalence classes, such that + // any byte in the same class can never discriminate between whether a + // particular sequence is valid UTF-8 or not.
+ #[cfg_attr(rustfmt, rustfmt::skip)] + const CLASSES: [u8; 256] = [ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, + ]; + + // A state machine taken from `bstr` which was in turn adapted from: + // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + #[cfg_attr(rustfmt, rustfmt::skip)] + const STATES_FORWARD: &'static [u8] = &[ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72, + 0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0, + 0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0, + 0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0, + 0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0, + 0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]; + + let class = CLASSES[b as usize]; + if *state == ACCEPT { + *cp = (0xFF >> class) & (b as u32); + } else { + *cp = (b as u32 & 0b111111) | (*cp << 6); + } + *state = STATES_FORWARD[*state + class as usize] as usize; +} + +/// Returns true if and only if the given byte is either a valid leading UTF-8 +/// byte, or is otherwise an invalid byte that can never appear anywhere in a +/// valid UTF-8 sequence. +fn is_leading_or_invalid_utf8_byte(b: u8) -> bool { + // In the ASCII case, the most significant bit is never set. The leading + // byte of a 2/3/4-byte sequence always has the top two most significant + // bits set. 
For bytes that can never appear anywhere in valid UTF-8, this + // also returns true, since every such byte has its two most significant + // bits set: + // + // \xC0 :: 11000000 + // \xC1 :: 11000001 + // \xF5 :: 11110101 + // \xF6 :: 11110110 + // \xF7 :: 11110111 + // \xF8 :: 11111000 + // \xF9 :: 11111001 + // \xFA :: 11111010 + // \xFB :: 11111011 + // \xFC :: 11111100 + // \xFD :: 11111101 + // \xFE :: 11111110 + // \xFF :: 11111111 + (b & 0b1100_0000) != 0b1000_0000 +} + +#[cfg(test)] +mod tests { + use alloc::{vec, vec::Vec}; + + use super::*; + + #[test] + fn decode_valid() { + fn d(mut s: &str) -> Vec<char> { + let mut chars = vec![]; + while !s.is_empty() { + let (ch, size) = decode(s.as_bytes()); + s = &s[size..]; + chars.push(ch.unwrap()); + } + chars + } + + assert_eq!(vec!['☃'], d("☃")); + assert_eq!(vec!['☃', '☃'], d("☃☃")); + assert_eq!(vec!['α', 'β', 'γ', 'δ', 'ε'], d("αβγδε")); + assert_eq!(vec!['☃', '⛄', '⛇'], d("☃⛄⛇")); + assert_eq!(vec!['𝗮', '𝗯', '𝗰', '𝗱', '𝗲'], d("𝗮𝗯𝗰𝗱𝗲")); + } + + #[test] + fn decode_invalid() { + let (ch, size) = decode(b""); + assert_eq!(None, ch); + assert_eq!(0, size); + + let (ch, size) = decode(b"\xFF"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode(b"\xCE\xF0"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode(b"\xE2\x98\xF0"); + assert_eq!(None, ch); + assert_eq!(2, size); + + let (ch, size) = decode(b"\xF0\x9D\x9D"); + assert_eq!(None, ch); + assert_eq!(3, size); + + let (ch, size) = decode(b"\xF0\x9D\x9D\xF0"); + assert_eq!(None, ch); + assert_eq!(3, size); + + let (ch, size) = decode(b"\xF0\x82\x82\xAC"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode(b"\xED\xA0\x80"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode(b"\xCEa"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode(b"\xE2\x98a"); + assert_eq!(None, ch); + assert_eq!(2, size); + + let (ch, size) = decode(b"\xF0\x9D\x9Ca"); +
assert_eq!(None, ch); + assert_eq!(3, size); + } + + #[test] + fn decode_lossily() { + let (ch, size) = decode_lossy(b""); + assert_eq!('\u{FFFD}', ch); + assert_eq!(0, size); + + let (ch, size) = decode_lossy(b"\xFF"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_lossy(b"\xCE\xF0"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_lossy(b"\xE2\x98\xF0"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(2, size); + + let (ch, size) = decode_lossy(b"\xF0\x9D\x9D\xF0"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(3, size); + + let (ch, size) = decode_lossy(b"\xF0\x82\x82\xAC"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_lossy(b"\xED\xA0\x80"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_lossy(b"\xCEa"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_lossy(b"\xE2\x98a"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(2, size); + + let (ch, size) = decode_lossy(b"\xF0\x9D\x9Ca"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(3, size); + } + + #[test] + fn decode_last_valid() { + fn d(mut s: &str) -> Vec<char> { + let mut chars = vec![]; + while !s.is_empty() { + let (ch, size) = decode_last(s.as_bytes()); + s = &s[..s.len() - size]; + chars.push(ch.unwrap()); + } + chars + } + + assert_eq!(vec!['☃'], d("☃")); + assert_eq!(vec!['☃', '☃'], d("☃☃")); + assert_eq!(vec!['ε', 'δ', 'γ', 'β', 'α'], d("αβγδε")); + assert_eq!(vec!['⛇', '⛄', '☃'], d("☃⛄⛇")); + assert_eq!(vec!['𝗲', '𝗱', '𝗰', '𝗯', '𝗮'], d("𝗮𝗯𝗰𝗱𝗲")); + } + + #[test] + fn decode_last_invalid() { + let (ch, size) = decode_last(b""); + assert_eq!(None, ch); + assert_eq!(0, size); + + let (ch, size) = decode_last(b"\xFF"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode_last(b"\xCE\xF0"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode_last(b"\xCE"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch,
size) = decode_last(b"\xE2\x98\xF0"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode_last(b"\xE2\x98"); + assert_eq!(None, ch); + assert_eq!(2, size); + + let (ch, size) = decode_last(b"\xF0\x9D\x9D\xF0"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode_last(b"\xF0\x9D\x9D"); + assert_eq!(None, ch); + assert_eq!(3, size); + + let (ch, size) = decode_last(b"\xF0\x82\x82\xAC"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode_last(b"\xED\xA0\x80"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode_last(b"\xED\xA0"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode_last(b"\xED"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode_last(b"a\xCE"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode_last(b"a\xE2\x98"); + assert_eq!(None, ch); + assert_eq!(2, size); + + let (ch, size) = decode_last(b"a\xF0\x9D\x9C"); + assert_eq!(None, ch); + assert_eq!(3, size); + } + + #[test] + fn decode_last_lossily() { + let (ch, size) = decode_last_lossy(b""); + assert_eq!('\u{FFFD}', ch); + assert_eq!(0, size); + + let (ch, size) = decode_last_lossy(b"\xFF"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_last_lossy(b"\xCE\xF0"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_last_lossy(b"\xCE"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_last_lossy(b"\xE2\x98\xF0"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_last_lossy(b"\xE2\x98"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(2, size); + + let (ch, size) = decode_last_lossy(b"\xF0\x9D\x9D\xF0"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_last_lossy(b"\xF0\x9D\x9D"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(3, size); + + let (ch, size) = decode_last_lossy(b"\xF0\x82\x82\xAC"); 
+ assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_last_lossy(b"\xED\xA0\x80"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_last_lossy(b"\xED\xA0"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_last_lossy(b"\xED"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_last_lossy(b"a\xCE"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_last_lossy(b"a\xE2\x98"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(2, size); + + let (ch, size) = decode_last_lossy(b"a\xF0\x9D\x9C"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(3, size); + } +} diff --git a/regex-lite/tests/lib.rs b/regex-lite/tests/lib.rs new file mode 100644 index 000000000..3b9a18643 --- /dev/null +++ b/regex-lite/tests/lib.rs @@ -0,0 +1,47 @@ +mod string; + +const BLACKLIST: &[&str] = &[ + // CRLF-aware line anchors aren't supported in regex API yet. + "crlf", + // Custom line terminators aren't supported in regex-lite. We could add it, + // but it didn't seem worth it. + "line-terminator", +]; + +fn suite() -> anyhow::Result<regex_test::RegexTests> { + let mut tests = regex_test::RegexTests::new(); + macro_rules!
load { + ($name:expr) => {{ + const DATA: &[u8] = + include_bytes!(concat!("../../testdata/", $name, ".toml")); + tests.load_slice($name, DATA)?; + }}; + } + + load!("anchored"); + load!("bytes"); + load!("crazy"); + load!("crlf"); + load!("earliest"); + load!("empty"); + load!("expensive"); + load!("flags"); + load!("iter"); + load!("leftmost-all"); + load!("line-terminator"); + load!("misc"); + load!("multiline"); + load!("no-unicode"); + load!("overlapping"); + load!("regression"); + load!("set"); + load!("substring"); + load!("unicode"); + load!("utf8"); + load!("word-boundary"); + load!("fowler/basic"); + load!("fowler/nullsubexpr"); + load!("fowler/repetition"); + + Ok(tests) +} diff --git a/regex-lite/tests/string.rs b/regex-lite/tests/string.rs new file mode 100644 index 000000000..98fbbbe3d --- /dev/null +++ b/regex-lite/tests/string.rs @@ -0,0 +1,137 @@ +use { + anyhow::Result, + regex_lite::{Regex, RegexBuilder}, + regex_test::{ + CompiledRegex, Match, RegexTest, Span, TestResult, TestRunner, + }, +}; + +/// Tests the default configuration of the `regex_lite::Regex` API.
+#[test] +fn default() -> Result<()> { + let mut runner = TestRunner::new()?; + runner + .expand(&["is_match", "find", "captures"], |test| test.compiles()) + .blacklist_iter(super::BLACKLIST) + .test_iter(crate::suite()?.iter(), compiler) + .assert(); + Ok(()) +} + +fn run_test(re: &Regex, test: &RegexTest) -> TestResult { + let hay = match std::str::from_utf8(test.haystack()) { + Ok(hay) => hay, + Err(err) => { + return TestResult::fail(&format!( + "haystack is not valid UTF-8: {}", + err + )); + } + }; + match test.additional_name() { + "is_match" => TestResult::matched(re.is_match(hay)), + "find" => TestResult::matches( + re.find_iter(hay) + .take(test.match_limit().unwrap_or(std::usize::MAX)) + .map(|m| Match { + id: 0, + span: Span { start: m.start(), end: m.end() }, + }), + ), + "captures" => { + let it = re + .captures_iter(hay) + .take(test.match_limit().unwrap_or(std::usize::MAX)) + .map(|caps| testify_captures(&caps)); + TestResult::captures(it) + } + name => TestResult::fail(&format!("unrecognized test name: {}", name)), + } +} + +/// Converts the given regex test to a closure that searches with a +/// `regex_lite::Regex`. If the test configuration is unsupported, then a +/// `CompiledRegex` that skips the test is returned. +fn compiler( + test: &RegexTest, + _patterns: &[String], +) -> anyhow::Result<CompiledRegex> { + let skip = Ok(CompiledRegex::skip()); + + // We're only testing Regex here, which supports one pattern only. + let pattern = match test.regexes().len() { + 1 => &test.regexes()[0], + _ => return skip, + }; + // If the pattern has a \p in it, then we almost certainly don't support + // it. This probably skips more than we intend, but there are likely very + // few tests that contain a \p that isn't also a Unicode class. + if pattern.contains(r"\p") || pattern.contains(r"\P") { + return skip; + } + // Similar deal for Perl classes, but we can abide them if the haystack + // is ASCII-only.
+ if !test.haystack().is_ascii() { + if pattern.contains(r"\d") || pattern.contains(r"\D") { + return skip; + } + if pattern.contains(r"\s") || pattern.contains(r"\S") { + return skip; + } + if pattern.contains(r"\w") || pattern.contains(r"\W") { + return skip; + } + } + // And also same deal for word boundaries. + if !test.haystack().is_ascii() { + if pattern.contains(r"\b") || pattern.contains(r"\B") { + return skip; + } + } + // We only test is_match, find_iter and captures_iter. All of those are + // leftmost searches. + if !matches!(test.search_kind(), regex_test::SearchKind::Leftmost) { + return skip; + } + // The top-level single-pattern regex API always uses leftmost-first. + if !matches!(test.match_kind(), regex_test::MatchKind::LeftmostFirst) { + return skip; + } + // The top-level regex API always runs unanchored searches. ... But we can + // handle tests that are anchored but have only one match. + if test.anchored() && test.match_limit() != Some(1) { + return skip; + } + // We don't support tests with explicit search bounds. We could probably + // support this by using the 'find_at' (and such) APIs. + let bounds = test.bounds(); + if !(bounds.start == 0 && bounds.end == test.haystack().len()) { + return skip; + } + // The Regex API specifically does not support disabling UTF-8 mode because + // it can only search &str which is always valid UTF-8. + if !test.utf8() { + return skip; + } + // regex-lite doesn't support Unicode-aware case insensitive matching. + if test.case_insensitive() + && (!pattern.is_ascii() || !test.haystack().is_ascii()) + { + return skip; + } + let re = RegexBuilder::new(pattern) + .case_insensitive(test.case_insensitive()) + .build()?; + Ok(CompiledRegex::compiled(move |test| run_test(&re, test))) +} + +/// Convert `Captures` into the test suite's capture values. 
+fn testify_captures(caps: &regex_lite::Captures<'_>) -> regex_test::Captures { + let spans = caps.iter().map(|group| { + group.map(|m| regex_test::Span { start: m.start(), end: m.end() }) + }); + // This unwrap is OK because we assume our 'caps' represents a match, and + // a match always gives a non-zero number of groups with the first group + // being non-None. + regex_test::Captures::new(0, spans).unwrap() +}