diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5d5b24c4c..060a2226c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -122,6 +122,8 @@ jobs: - name: Run subset of regex-automata tests if: matrix.build != 'win-gnu' # Just horrifically slow. run: ${{ env.CARGO }} test --verbose --manifest-path regex-automata/Cargo.toml $TARGET + - name: Run regex-lite tests + run: ${{ env.CARGO }} test --verbose --manifest-path regex-lite/Cargo.toml $TARGET - if: matrix.build == 'nightly' name: Run benchmarks as tests run: | diff --git a/Cargo.toml b/Cargo.toml index 50f6ca6de..c8781f39f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ members = [ "regex-automata", "regex-capi", "regex-cli", + "regex-lite", "regex-syntax", "regex-test", ] diff --git a/regex-lite/Cargo.toml b/regex-lite/Cargo.toml new file mode 100644 index 000000000..642e7dd64 --- /dev/null +++ b/regex-lite/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "regex-lite" +version = "0.1.0" #:version +authors = ["The Rust Project Developers", "Andrew Gallant "] +license = "MIT OR Apache-2.0" +repository = "https://github.com/rust-lang/regex/tree/master/regex-lite" +documentation = "https://docs.rs/regex-lite" +description = """ +A lightweight regex engine that optimizes for binary size and compilation time. +""" +workspace = ".." +edition = "2021" +rust-version = "1.60.0" + +# Features are documented in the "Crate features" section of the crate docs: +# https://docs.rs/regex-lite/*/#crate-features +[features] +default = ["std"] +std = [] + +[package.metadata.docs.rs] +# We want to document all features. 
+all-features = true +# To test this locally, run: +# +# RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features +rustdoc-args = ["--cfg", "docsrs"] diff --git a/regex-lite/LICENSE-APACHE b/regex-lite/LICENSE-APACHE new file mode 100644 index 000000000..16fe87b06 --- /dev/null +++ b/regex-lite/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/regex-lite/LICENSE-MIT b/regex-lite/LICENSE-MIT new file mode 100644 index 000000000..39d4bdb5a --- /dev/null +++ b/regex-lite/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2014 The Rust Project Developers + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
diff --git a/regex-lite/README.md b/regex-lite/README.md new file mode 100644 index 000000000..00d7bdd40 --- /dev/null +++ b/regex-lite/README.md @@ -0,0 +1 @@ +WIP diff --git a/regex-lite/src/error.rs b/regex-lite/src/error.rs new file mode 100644 index 000000000..a6313aa8a --- /dev/null +++ b/regex-lite/src/error.rs @@ -0,0 +1,19 @@ +/// An error carrying a static, human-readable message. +/// +/// The `Display` implementation writes exactly that message. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Error { + msg: &'static str, +} + +impl Error { + /// Creates a new error with the given message. + pub(crate) fn new(msg: &'static str) -> Error { + Error { msg } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for Error {} + +impl core::fmt::Display for Error { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "{}", self.msg) + } +} diff --git a/regex-lite/src/hir/mod.rs b/regex-lite/src/hir/mod.rs new file mode 100644 index 000000000..6a9935029 --- /dev/null +++ b/regex-lite/src/hir/mod.rs @@ -0,0 +1,553 @@ +use alloc::{boxed::Box, vec, vec::Vec}; + +use crate::{error::Error, utf8}; + +mod parse; + +/// The configuration for a regex parser. +#[derive(Clone, Copy, Debug)] +pub(crate) struct Config { + /// The maximum number of times we're allowed to recurse. + /// + /// Note that unlike the regex-syntax parser, we actually use recursion in + /// this parser for simplicity. My hope is that by setting a conservative + /// default call limit and providing a way to configure it, we can + /// keep this simplification. But if we must, we can re-work the parser to + /// put the call stack on the heap like regex-syntax does. + pub(crate) nest_limit: u32, + /// Various flags that control how a pattern is interpreted. + pub(crate) flags: Flags, +} + +impl Default for Config { + fn default() -> Config { + Config { nest_limit: 50, flags: Flags::default() } + } +} + +/// Various flags that control the interpretation of the pattern. +/// +/// These can be set via explicit configuration in code, or change dynamically +/// during parsing via inline flags. 
For example, `foo(?i:bar)baz` will match +/// `foo` and `baz` case sensitively and `bar` case insensitively (assuming a +/// default configuration). +#[derive(Clone, Copy, Debug, Default)] +pub(crate) struct Flags { + /// Whether to match case insensitively. + /// + /// This is the `i` flag. + pub(crate) case_insensitive: bool, + /// Whether `^` and `$` should be treated as line anchors or not. + /// + /// This is the `m` flag. + pub(crate) multi_line: bool, + /// Whether `.` should match line terminators or not. + /// + /// This is the `s` flag. + pub(crate) dot_matches_new_line: bool, + /// Whether to swap the meaning of greedy and non-greedy operators. + /// + /// This is the `U` flag. + pub(crate) swap_greed: bool, + /// Whether to enable CRLF mode. + /// + /// This is the `R` flag. + pub(crate) crlf: bool, + /// Whether to ignore whitespace. i.e., verbose mode. + /// + /// This is the `x` flag. + pub(crate) ignore_whitespace: bool, +} + +/// The high-level intermediate representation of a parsed pattern. +/// +/// In addition to its kind, every `Hir` records two properties that are +/// computed when the expression is constructed. +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) struct Hir { + /// The kind of this expression. + kind: HirKind, + /// Whether this expression can only match at the beginning of a haystack. + is_start_anchored: bool, + /// Whether this expression can match the empty string. + is_match_empty: bool, +} + +/// The kind of an `Hir` expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) enum HirKind { + /// The empty expression. + Empty, + /// A single character. + Char(char), + /// A set of characters, given as ranges. + Class(Class), + /// A look-around assertion. + Look(Look), + /// A repetition of a sub-expression. + Repetition(Repetition), + /// A capturing group. + Capture(Capture), + /// A concatenation of sub-expressions. + Concat(Vec), + /// An alternation of sub-expressions. + Alternation(Vec), +} + +impl Hir { + /// Parses the given pattern string with the given configuration into a + /// structured representation. If the pattern is invalid, then an error + /// is returned. + pub(crate) fn parse(config: Config, pattern: &str) -> Result { + self::parse::Parser::new(config, pattern).parse() + } + + /// Returns the underlying kind of this high-level intermediate + /// representation. + /// + /// Note that there is explicitly no way to build an `Hir` directly from + /// an `HirKind`. If you need to do that, then you must do case analysis + /// on the `HirKind` and call the appropriate smart constructor on `Hir`. 
+ pub(crate) fn kind(&self) -> &HirKind { + &self.kind + } + + /// Returns true if and only if this Hir expression can only match at the + /// beginning of a haystack. + pub(crate) fn is_start_anchored(&self) -> bool { + self.is_start_anchored + } + + /// Returns true if and only if this Hir expression can match the empty + /// string. + pub(crate) fn is_match_empty(&self) -> bool { + self.is_match_empty + } + + /// Returns an expression built from a character class with no ranges, + /// i.e., one that has no character it could match. + fn fail() -> Hir { + let kind = HirKind::Class(Class { ranges: vec![] }); + Hir { kind, is_start_anchored: false, is_match_empty: false } + } + + /// Returns the empty expression, which can match the empty string. + fn empty() -> Hir { + let kind = HirKind::Empty; + Hir { kind, is_start_anchored: false, is_match_empty: true } + } + + /// Returns an expression for the single character given. + fn char(ch: char) -> Hir { + let kind = HirKind::Char(ch); + Hir { kind, is_start_anchored: false, is_match_empty: false } + } + + /// Returns an expression for the character class given. + fn class(class: Class) -> Hir { + let kind = HirKind::Class(class); + Hir { kind, is_start_anchored: false, is_match_empty: false } + } + + /// Returns an expression for the look-around assertion given. + /// + /// Only `Look::Start` is considered start anchored. Every assertion can + /// match the empty string. + fn look(look: Look) -> Hir { + let kind = HirKind::Look(look); + Hir { + kind, + is_start_anchored: matches!(look, Look::Start), + is_match_empty: true, + } + } + + /// Returns an expression for the repetition given. + /// + /// A `{0,0}` repetition simplifies to the empty expression and a `{1,1}` + /// repetition simplifies to its sub-expression. Otherwise, the result is + /// start anchored only if at least one repetition is required and the + /// sub-expression is start anchored, and it can match the empty string + /// if `min` is zero or the sub-expression can. + fn repetition(rep: Repetition) -> Hir { + if rep.min == 0 && rep.max == Some(0) { + return Hir::empty(); + } else if rep.min == 1 && rep.max == Some(1) { + return *rep.sub; + } + let is_start_anchored = rep.min > 0 && rep.sub.is_start_anchored; + let is_match_empty = rep.min == 0 || rep.sub.is_match_empty; + let kind = HirKind::Repetition(rep); + Hir { kind, is_start_anchored, is_match_empty } + } + + /// Returns an expression for the capturing group given. Both properties + /// are inherited from the group's sub-expression. + fn capture(cap: Capture) -> Hir { + let is_start_anchored = cap.sub.is_start_anchored; + let is_match_empty = cap.sub.is_match_empty; + let kind = HirKind::Capture(cap); + Hir { kind, is_start_anchored, is_match_empty } + } + + /// Returns an expression matching the given sub-expressions in sequence. + /// + /// With no sub-expressions this is the empty expression, and a single + /// sub-expression is returned as-is. Otherwise, the result is start + /// anchored if its first sub-expression is, and can match the empty + /// string only if every sub-expression can. + fn concat(mut subs: Vec) -> Hir { + if subs.is_empty() { + Hir::empty() + } else if subs.len() == 1 { + subs.pop().unwrap() + } else { + let is_start_anchored = subs[0].is_start_anchored; + let is_match_empty = subs.iter().all(|s| 
s.is_match_empty); + let kind = HirKind::Concat(subs); + Hir { kind, is_start_anchored, is_match_empty } + } + } + + /// Returns an expression matching any one of the given sub-expressions. + /// + /// With no sub-expressions this is `Hir::fail()`, and a single + /// sub-expression is returned as-is. Otherwise, the result is start + /// anchored only if every branch is, and can match the empty string if + /// any branch can. + fn alternation(mut subs: Vec<Hir>) -> Hir { + if subs.is_empty() { + Hir::fail() + } else if subs.len() == 1 { + subs.pop().unwrap() + } else { + let is_start_anchored = subs.iter().all(|s| s.is_start_anchored); + let is_match_empty = subs.iter().any(|s| s.is_match_empty); + let kind = HirKind::Alternation(subs); + Hir { kind, is_start_anchored, is_match_empty } + } + } +} + +/// A set of characters, represented as a sequence of character ranges. +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) struct Class { + pub(crate) ranges: Vec<ClassRange>, +} + +impl Class { + /// Create a new class from the given ranges. The ranges may be provided + /// in any order or may even overlap. They will be automatically + /// canonicalized. + fn new<I: IntoIterator<Item = ClassRange>>(ranges: I) -> Class { + let mut class = Class { ranges: ranges.into_iter().collect() }; + class.canonicalize(); + class + } + + /// Add a new range to this set. + fn push(&mut self, range: ClassRange) { + self.ranges.push(range); + self.canonicalize(); + } + + /// Expand this class such that it matches the ASCII codepoints in this set + /// case insensitively. + fn ascii_case_fold(&mut self) { + let len = self.ranges.len(); + for i in 0..len { + let range = self.ranges[i]; + // A single range may overlap both `a-z` and `A-Z` (e.g., `[Y-b]`), + // but `ClassRange::ascii_case_fold` returns at most one folded + // range. So clamp the range to each case block and fold the two + // pieces separately, otherwise the uppercase piece is never folded. + for (lo, hi) in [('a', 'z'), ('A', 'Z')] { + let start = core::cmp::max(range.start, lo); + let end = core::cmp::min(range.end, hi); + if start > end { + // This range doesn't overlap this case block at all. + continue; + } + let piece = ClassRange { start, end }; + if let Some(folded) = piece.ascii_case_fold() { + self.ranges.push(folded); + } + } + } + self.canonicalize(); + } + + /// Negate this set. + /// + /// For all `x` where `x` is any element, if `x` was in this set, then it + /// will not be in this set after negation. + fn negate(&mut self) { + const MIN: char = '\x00'; + const MAX: char = char::MAX; + + if self.ranges.is_empty() { + self.ranges.push(ClassRange { start: MIN, end: MAX }); + return; + } + + // There should be a way to do this in-place with constant memory, + // but I couldn't figure out a simple way to do it. So just append + // the negation to the end of this range, and then drain it before + // we're done. 
+ let drain_end = self.ranges.len(); + + // If our class doesn't start at the minimum possible char, then negation + // needs to include all codepoints up to the minimum in this set. + if self.ranges[0].start > MIN { + self.ranges.push(ClassRange { + start: MIN, + // OK because we know it's bigger than MIN. + end: prev_char(self.ranges[0].start).unwrap(), + }); + } + // Add the gap between each pair of neighboring ranges. + for i in 1..drain_end { + self.ranges.push(ClassRange { + // OK because we know i-1 is never the last range and therefore + // there must be a range greater than it. It therefore follows + // that 'end' can never be char::MAX, and thus there must be + // a next char. + start: next_char(self.ranges[i - 1].end).unwrap(), + // Since 'i' is guaranteed to never be the first range, it + // follows that there is always a range before this and thus + // 'start' can never be '\x00'. Thus, there must be a previous + // char. + end: prev_char(self.ranges[i].start).unwrap(), + }); + } + // If our class doesn't end at the maximum possible char, then negation + // needs to include all codepoints after the maximum in this set. + if self.ranges[drain_end - 1].end < MAX { + self.ranges.push(ClassRange { + // OK because we know 'end' is less than char::MAX, and thus + // there is a next char. + start: next_char(self.ranges[drain_end - 1].end).unwrap(), + end: MAX, + }); + } + self.ranges.drain(..drain_end); + // We don't need to canonicalize because we processed the ranges above + // in canonical order and the new ranges we added based on those are + // also necessarily in canonical order. + } + + /// Union this set with the given set, in place. + fn union(&mut self, other: &Class) { + if other.ranges.is_empty() || self.ranges == other.ranges { + return; + } + // This could almost certainly be done more efficiently. 
+ self.ranges.extend(&other.ranges); + self.canonicalize(); + } + + /// Converts this set into a canonical ordering: the ranges are sorted and + /// any overlapping or adjacent ranges are merged. + fn canonicalize(&mut self) { + if self.is_canonical() { + return; + } + self.ranges.sort(); + // A set with fewer than two ranges is always canonical and was handled + // by the early return above. + assert!(!self.ranges.is_empty()); + + // Is there a way to do this in-place with constant memory? I couldn't + // figure out a way to do it. So just append the canonicalization to + // the end of this range, and then drain it before we're done. + let drain_end = self.ranges.len(); + for oldi in 0..drain_end { + // If we've added at least one new range, then check if we can + // merge this range in the previously added range. + if self.ranges.len() > drain_end { + let (last, rest) = self.ranges.split_last_mut().unwrap(); + if let Some(union) = last.union(&rest[oldi]) { + *last = union; + continue; + } + } + self.ranges.push(self.ranges[oldi]); + } + self.ranges.drain(..drain_end); + } + + /// Returns true if and only if this class is in a canonical ordering. + /// + /// That is, the ranges are strictly ascending and no two neighboring + /// ranges overlap or are adjacent. + fn is_canonical(&self) -> bool { + for pair in self.ranges.windows(2) { + if pair[0] >= pair[1] { + return false; + } + if pair[0].is_contiguous(&pair[1]) { + return false; + } + } + true + } +} + +/// A range of characters from `start` to `end`, with both ends included. +#[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd, Ord)] +pub(crate) struct ClassRange { + pub(crate) start: char, + pub(crate) end: char, +} + +impl ClassRange { + /// Apply simple case folding to this character range. Only ASCII case + /// mappings (for A-Za-z) are applied. + /// + /// If this range overlaps `a-z`, the uppercase counterpart of that overlap + /// is returned. Otherwise, if it overlaps `A-Z`, the lowercase counterpart + /// of that overlap is returned. If it overlaps neither, `None` is + /// returned. At most one range is returned, so for a range overlapping + /// both `a-z` and `A-Z`, only its lowercase portion is folded. 
+ fn ascii_case_fold(&self) -> Option { + if !(ClassRange { start: 'a', end: 'z' }).is_intersection_empty(self) { + let start = core::cmp::max(self.start, 'a'); + let end = core::cmp::min(self.end, 'z'); + return Some(ClassRange { + start: char::try_from(u32::from(start) - 32).unwrap(), + end: char::try_from(u32::from(end) - 32).unwrap(), + }); + } + if !(ClassRange { start: 'A', end: 'Z' }).is_intersection_empty(self) { + let start = core::cmp::max(self.start, 'A'); + let end = core::cmp::min(self.end, 'Z'); + return Some(ClassRange { + start: char::try_from(u32::from(start) + 32).unwrap(), + end: char::try_from(u32::from(end) + 32).unwrap(), + }); + } + None + } + + /// Union the given overlapping range into this range. + /// + /// If the two ranges aren't contiguous, then this returns `None`. + fn union(&self, other: &ClassRange) -> Option { + if !self.is_contiguous(other) { + return None; + } + let start = core::cmp::min(self.start, other.start); + let end = core::cmp::max(self.end, other.end); + Some(ClassRange { start, end }) + } + + /// Returns true if and only if the two ranges are contiguous. Two ranges + /// are contiguous if and only if the ranges are either overlapping or + /// adjacent. + fn is_contiguous(&self, other: &ClassRange) -> bool { + let (s1, e1) = (u32::from(self.start), u32::from(self.end)); + let (s2, e2) = (u32::from(other.start), u32::from(other.end)); + core::cmp::max(s1, s2) <= core::cmp::min(e1, e2).saturating_add(1) + } + + /// Returns true if and only if the intersection of this range and the + /// other range is empty. + fn is_intersection_empty(&self, other: &ClassRange) -> bool { + let (s1, e1) = (self.start, self.end); + let (s2, e2) = (other.start, other.end); + core::cmp::max(s1, s2) > core::cmp::min(e1, e2) + } +} + +/// The high-level intermediate representation for a look-around assertion. +/// +/// An assertion match is always zero-length. Also called an "empty match." 
+#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) enum Look { + /// Match the beginning of text. Specifically, this matches at the starting + /// position of the input. + Start = 1 << 0, + /// Match the end of text. Specifically, this matches at the ending + /// position of the input. + End = 1 << 1, + /// Match the beginning of a line or the beginning of text. Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following a `\n` character. + StartLF = 1 << 2, + /// Match the end of a line or the end of text. Specifically, this matches + /// at the end position of the input, or at the position immediately + /// preceding a `\n` character. + EndLF = 1 << 3, + /// Match the beginning of a line or the beginning of text. Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following either a `\r` or `\n` character, but never after + /// a `\r` when a `\n` follows. + StartCRLF = 1 << 4, + /// Match the end of a line or the end of text. Specifically, this matches + /// at the end position of the input, or at the position immediately + /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r` + /// precedes it. + EndCRLF = 1 << 5, + /// Match an ASCII-only word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + Word = 1 << 6, + /// Match an ASCII-only negation of a word boundary. + WordNegate = 1 << 7, +} + +impl Look { + /// Returns true if the given position in the given haystack matches this + /// look-around assertion. 
+ pub(crate) fn is_match(&self, haystack: &[u8], at: usize) -> bool { + use self::Look::*; + + match *self { + Start => at == 0, + End => at == haystack.len(), + StartLF => at == 0 || haystack[at - 1] == b'\n', + EndLF => at == haystack.len() || haystack[at] == b'\n', + StartCRLF => { + at == 0 + || haystack[at - 1] == b'\n' + || (haystack[at - 1] == b'\r' + && (at >= haystack.len() || haystack[at] != b'\n')) + } + EndCRLF => { + at == haystack.len() + || haystack[at] == b'\r' + || (haystack[at] == b'\n' + && (at == 0 || haystack[at - 1] != b'\r')) + } + Word => { + let word_before = + at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + word_before != word_after + } + WordNegate => { + let word_before = + at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + word_before == word_after + } + } + } +} + +/// The high-level intermediate representation of a repetition operator. +/// +/// A repetition operator permits the repetition of an arbitrary +/// sub-expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) struct Repetition { + /// The minimum range of the repetition. + /// + /// Note that special cases like `?`, `+` and `*` all get translated into + /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively. + /// + /// When `min` is zero, this expression can match the empty string + /// regardless of what its sub-expression is. + pub(crate) min: u32, + /// The maximum range of the repetition. + /// + /// Note that when `max` is `None`, `min` acts as a lower bound but where + /// there is no upper bound. For something like `x{5}` where the min and + /// max are equivalent, `min` will be set to `5` and `max` will be set to + /// `Some(5)`. + pub(crate) max: Option, + /// Whether this repetition operator is greedy or not. A greedy operator + /// will match as much as it can. 
A non-greedy operator will match as + /// little as it can. + /// + /// Typically, operators are greedy by default and are only non-greedy when + /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is + /// not. However, this can be inverted via the `U` "ungreedy" flag. + pub(crate) greedy: bool, + /// The expression being repeated. + pub(crate) sub: Box, +} + +impl Repetition { + /// Returns a new repetition with the same `min`, `max` and `greedy` + /// values, but with its sub-expression replaced with the one given. + pub(crate) fn with(&self, sub: Hir) -> Repetition { + Repetition { + min: self.min, + max: self.max, + greedy: self.greedy, + sub: Box::new(sub), + } + } +} + +/// The high-level intermediate representation for a capturing group. +/// +/// A capturing group always has an index and a child expression. It may +/// also have a name associated with it (e.g., `(?P<name>\w)`), but it's not +/// necessary. +/// +/// Note that there is no explicit representation of a non-capturing group +/// in a `Hir`. Instead, non-capturing grouping is handled automatically by +/// the recursive structure of the `Hir` itself. +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) struct Capture { + /// The capture index of the capture. + pub(crate) index: u32, + /// The name of the capture, if it exists. + pub(crate) name: Option>, + /// The expression inside the capturing group, which may be empty. + pub(crate) sub: Box, +} + +/// Returns the Unicode scalar value immediately following `ch`, skipping over +/// the surrogate range, or `None` if `ch` is `char::MAX`. +fn next_char(ch: char) -> Option { + // Skip over the surrogate range. + if ch == '\u{D7FF}' { + return Some('\u{E000}'); + } + // OK because char::MAX < u32::MAX and we handle U+D7FF above. + char::from_u32(u32::from(ch).checked_add(1).unwrap()) +} + +/// Returns the Unicode scalar value immediately preceding `ch`, skipping over +/// the surrogate range, or `None` if `ch` is `'\x00'`. +fn prev_char(ch: char) -> Option { + // Skip over the surrogate range. + if ch == '\u{E000}' { + return Some('\u{D7FF}'); + } + // OK because subtracting 1 from any valid scalar value other than 0 + // and U+E000 yields a valid scalar value. 
+ Some(char::from_u32(u32::from(ch).checked_sub(1)?).unwrap()) +} diff --git a/regex-lite/src/hir/parse.rs b/regex-lite/src/hir/parse.rs new file mode 100644 index 000000000..4ce2895f8 --- /dev/null +++ b/regex-lite/src/hir/parse.rs @@ -0,0 +1,2073 @@ +use core::cell::{Cell, RefCell}; + +use alloc::{ + boxed::Box, + string::{String, ToString}, + vec, + vec::Vec, +}; + +use crate::{ + error::Error, + hir::{self, Config, Flags, Hir, HirKind}, + util, +}; + +// These are all of the errors that can occur while parsing a regex. Unlike +// regex-syntax, our errors are not particularly great. They are just enough +// to get a general sense of what went wrong. But in exchange, the error +// reporting mechanism is *much* simpler than what's in regex-syntax. +// +// By convention, we use each of these messages in exactly one place. That +// way, every branch that leads to an error has a unique message. This in turn +// means that given a message, one can precisely identify which part of the +// parser reported it. +// +// Finally, we give names to each message so that we can reference them in +// tests. 
+const ERR_TOO_MUCH_NESTING: &str = "pattern has too much nesting"; +const ERR_TOO_MANY_CAPTURES: &str = "too many capture groups"; +const ERR_DUPLICATE_CAPTURE_NAME: &str = "duplicate capture group name"; +const ERR_UNCLOSED_GROUP: &str = "found open group without closing ')'"; +const ERR_UNCLOSED_GROUP_QUESTION: &str = + "expected closing ')', but got end of pattern"; +const ERR_UNOPENED_GROUP: &str = "found closing ')' without matching '('"; +const ERR_LOOK_UNSUPPORTED: &str = "look-around is not supported"; +const ERR_EMPTY_FLAGS: &str = "empty flag directive '(?)' is not allowed"; +const ERR_MISSING_GROUP_NAME: &str = + "expected capture group name, but got end of pattern"; +const ERR_INVALID_GROUP_NAME: &str = "invalid group name"; +const ERR_UNCLOSED_GROUP_NAME: &str = + "expected end of capture group name, but got end of pattern"; +const ERR_EMPTY_GROUP_NAME: &str = "empty capture group names are not allowed"; +const ERR_FLAG_UNRECOGNIZED: &str = "unrecognized inline flag"; +const ERR_FLAG_REPEATED_NEGATION: &str = + "inline flag negation cannot be repeated"; +const ERR_FLAG_DUPLICATE: &str = "duplicate inline flag is not allowed"; +const ERR_FLAG_UNEXPECTED_EOF: &str = + "expected ':' or ')' to end inline flags, but got end of pattern"; +const ERR_FLAG_DANGLING_NEGATION: &str = + "inline flags cannot end with negation directive"; +const ERR_DECIMAL_NO_DIGITS: &str = + "expected decimal number, but found no digits"; +const ERR_DECIMAL_INVALID: &str = "got invalid decimal number"; +const ERR_HEX_BRACE_INVALID_DIGIT: &str = + "expected hexadecimal number in braces, but got non-hex digit"; +const ERR_HEX_BRACE_UNEXPECTED_EOF: &str = + "expected hexadecimal number, but saw end of pattern before closing brace"; +const ERR_HEX_BRACE_EMPTY: &str = + "expected hexadecimal number in braces, but got no digits"; +const ERR_HEX_BRACE_INVALID: &str = "got invalid hexadecimal number in braces"; +const ERR_HEX_FIXED_UNEXPECTED_EOF: &str = + "expected fixed length
hexadecimal number, but saw end of pattern first"; +const ERR_HEX_FIXED_INVALID_DIGIT: &str = + "expected fixed length hexadecimal number, but got non-hex digit"; +const ERR_HEX_FIXED_INVALID: &str = + "got invalid fixed length hexadecimal number"; +const ERR_HEX_UNEXPECTED_EOF: &str = + "expected hexadecimal number, but saw end of pattern first"; +const ERR_ESCAPE_UNEXPECTED_EOF: &str = + "saw start of escape sequence, but saw end of pattern before it finished"; +const ERR_BACKREF_UNSUPPORTED: &str = "backreferences are not supported"; +const ERR_UNICODE_CLASS_UNSUPPORTED: &str = + "Unicode character classes are not supported"; +const ERR_ESCAPE_UNRECOGNIZED: &str = "unrecognized escape sequence"; +const ERR_POSIX_CLASS_UNRECOGNIZED: &str = + "unrecognized POSIX character class"; +const ERR_UNCOUNTED_REP_SUB_MISSING: &str = + "uncounted repetition operator must be applied to a sub-expression"; +const ERR_COUNTED_REP_SUB_MISSING: &str = + "counted repetition operator must be applied to a sub-expression"; +const ERR_COUNTED_REP_UNCLOSED: &str = + "found unclosed counted repetition operator"; +const ERR_COUNTED_REP_MIN_UNCLOSED: &str = + "found incomplete and unclosed counted repetition operator"; +const ERR_COUNTED_REP_COMMA_UNCLOSED: &str = + "found counted repetition operator with a comma that is unclosed"; +const ERR_COUNTED_REP_MIN_MAX_UNCLOSED: &str = + "found counted repetition with min and max that is unclosed"; +const ERR_COUNTED_REP_INVALID: &str = + "expected closing brace for counted repetition, but got something else"; +const ERR_COUNTED_REP_INVALID_RANGE: &str = + "found counted repetition with a min bigger than its max"; +const ERR_CLASS_UNCLOSED_AFTER_ITEM: &str = + "non-empty character class has no closing bracket"; +const ERR_CLASS_INVALID_RANGE_ITEM: &str = + "character class ranges must start and end with a single character"; +const ERR_CLASS_INVALID_ITEM: &str = + "invalid escape sequence in character class"; +const ERR_CLASS_UNCLOSED_AFTER_DASH: 
&str = + "non-empty character class has no closing bracket after dash"; +const ERR_CLASS_UNCLOSED_AFTER_NEGATION: &str = + "negated character class has no closing bracket"; +const ERR_CLASS_UNCLOSED_AFTER_CLOSING: &str = + "character class begins with literal ']' but has no closing bracket"; +const ERR_CLASS_INVALID_RANGE: &str = "invalid range in character class"; +const ERR_CLASS_UNCLOSED: &str = "found unclosed character class"; +const ERR_CLASS_NEST_UNSUPPORTED: &str = + "nested character classes are not supported"; +const ERR_CLASS_INTERSECTION_UNSUPPORTED: &str = + "character class intersection is not supported"; +const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str = + "character class difference is not supported"; +const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED: &str = + "character class symmetric difference is not supported"; + +/// A regular expression parser. +/// +/// This parses a string representation of a regular expression into an +/// abstract syntax tree. The size of the tree is proportional to the length +/// of the regular expression pattern. +/// +/// A `Parser` can be configured in more detail via a [`ParserBuilder`]. +#[derive(Clone, Debug)] +pub(super) struct Parser<'a> { + /// The configuration of the parser as given by the caller. + config: Config, + /// The pattern we're parsing as given by the caller. + pattern: &'a str, + /// The call depth of the parser. This is incremented for each + /// sub-expression parsed. Its peak value is the maximum nesting of the + /// pattern. + depth: Cell, + /// The current position of the parser. + pos: Cell, + /// The current codepoint of the parser. The codepoint corresponds to the + /// codepoint encoded in `pattern` beginning at `pos`. + /// + /// This is `None` if and only if `pos == pattern.len()`. + char: Cell>, + /// The current capture index. + capture_index: Cell, + /// The flags that are currently set. + flags: RefCell, + /// A sorted sequence of capture names. 
This is used to detect duplicate + /// capture names and report an error if one is detected. + capture_names: RefCell>, + /// A scratch buffer used in various places. Mostly this is used to + /// accumulate relevant characters from parts of a pattern. + scratch: RefCell, +} + +/// The constructor and a variety of helper routines. +impl<'a> Parser<'a> { + /// Build a parser from this configuration with the given pattern. + pub(super) fn new(config: Config, pattern: &'a str) -> Parser<'a> { + Parser { + config, + pattern, + depth: Cell::new(0), + pos: Cell::new(0), + char: Cell::new(pattern.chars().next()), + capture_index: Cell::new(0), + flags: RefCell::new(Flags::default()), + capture_names: RefCell::new(vec![]), + scratch: RefCell::new(String::new()), + } + } + + /// Returns the full pattern string that we're parsing. + fn pattern(&self) -> &str { + self.pattern + } + + /// Return the current byte offset of the parser. + /// + /// The offset starts at `0` from the beginning of the regular expression + /// pattern string. + fn pos(&self) -> usize { + self.pos.get() + } + + /// Increments the call depth of the parser. + /// + /// If the call depth would exceed the configured nest limit, then this + /// returns an error. + /// + /// This returns the old depth. + fn increment_depth(&self) -> Result { + let old = self.depth.get(); + // OK because our depth starts at 0, and we return an error if it + // ever reaches the limit. So the call depth can never exceed u32::MAX. + let new = old.checked_add(1).unwrap(); + if new >= self.config.nest_limit { + return Err(Error::new(ERR_TOO_MUCH_NESTING)); + } + self.depth.set(new); + Ok(old) + } + + /// Decrements the call depth of the parser. + /// + /// This panics if the current depth is 0. + fn decrement_depth(&self) { + let old = self.depth.get(); + // If this fails then the caller has a bug in how they're incrementing + // and decrementing the depth of the parser's call stack. 
+ let new = old.checked_sub(1).unwrap(); + self.depth.set(new); + } + + /// Return the codepoint at the current position of the parser. + /// + /// This panics if the parser is positioned at the end of the pattern. + fn char(&self) -> char { + self.char.get().expect("codepoint, but parser is done") + } + + /// Return the character at the given position. + /// + /// This panics if the given position does not point to a valid char. + fn char_at(&self, i: usize) -> char { + self.pattern()[i..] + .chars() + .next() + .unwrap_or_else(|| panic!("expected char at offset {}", i)) + } + + /// Returns true if and only if the parser is positioned at the end of the + /// pattern. When this is true, `bump` does not move the parser and + /// returns false. + fn is_done(&self) -> bool { + self.pos() == self.pattern.len() + } + + /// Returns the flags that are currently set for this regex. + fn flags(&self) -> Flags { + *self.flags.borrow() + } + + /// Bump the parser to the next Unicode scalar value. + /// + /// This returns `true` only if a codepoint exists at the new position. + /// `false` is returned both when the parser was already at the end of the + /// pattern (in which case it is not moved) and when this bump consumed + /// the last codepoint of the pattern. + fn bump(&self) -> bool { + if self.is_done() { + return false; + } + self.pos.set(self.pos() + self.char().len_utf8()); + self.char.set(self.pattern()[self.pos()..].chars().next()); + self.char.get().is_some() + } + + /// If the substring starting at the current position of the parser has + /// the given prefix, then bump the parser to the character immediately + /// following the prefix and return true. Otherwise, don't bump the parser + /// and return false. + fn bump_if(&self, prefix: &str) -> bool { + if self.pattern()[self.pos()..].starts_with(prefix) { + for _ in 0..prefix.chars().count() { + self.bump(); + } + true + } else { + false + } + } + + /// Bump the parser, and if the `x` flag is enabled, bump through any + /// subsequent spaces. Return true if and only if the parser is not done.
+ fn bump_and_bump_space(&self) -> bool { + if !self.bump() { + return false; + } + self.bump_space(); + !self.is_done() + } + + /// If the `x` flag is enabled (i.e., whitespace insensitivity with + /// comments), then this will advance the parser through all whitespace + /// and comments to the next non-whitespace non-comment byte. + /// + /// If the `x` flag is disabled, then this is a no-op. + /// + /// This should be used selectively throughout the parser where + /// arbitrary whitespace is permitted when the `x` flag is enabled. For + /// example, `{ 5 , 6}` is equivalent to `{5,6}`. + fn bump_space(&self) { + if !self.flags().ignore_whitespace { + return; + } + while !self.is_done() { + if self.char().is_whitespace() { + self.bump(); + } else if self.char() == '#' { + self.bump(); + while !self.is_done() { + let c = self.char(); + self.bump(); + if c == '\n' { + break; + } + } + } else { + break; + } + } + } + + /// Peek at the next character in the input without advancing the parser. + /// + /// If the input has been exhausted, then this returns `None`. + fn peek(&self) -> Option { + if self.is_done() { + return None; + } + self.pattern()[self.pos() + self.char().len_utf8()..].chars().next() + } + + /// Peeks at the next character in the pattern from the current offset, and + /// will ignore spaces when the parser is in whitespace insensitive mode. + /// + /// As with `bump_space`, a `#` starts a comment that runs through the + /// next line feed, and everything inside a comment is skipped as well. + fn peek_space(&self) -> Option { + if !self.flags().ignore_whitespace { + return self.peek(); + } + if self.is_done() { + return None; + } + let mut start = self.pos() + self.char().len_utf8(); + let mut in_comment = false; + for (i, ch) in self.pattern()[start..].char_indices() { + if in_comment { + // `\n` is itself whitespace, so the end of a comment has to + // be detected before the whitespace check below. + in_comment = ch != '\n'; + } else if ch == '#' { + in_comment = true; + } else if !ch.is_whitespace() { + start += i; + break; + } + } + self.pattern()[start..].chars().next() + } + + /// Return the next capturing index.
Each subsequent call increments the + /// internal index. Since the way capture indices are computed is a public + /// API guarantee, use of this routine depends on the parser being depth + /// first and left-to-right. + /// + /// If the capture limit is exceeded, then an error is returned. + fn next_capture_index(&self) -> Result { + let current = self.capture_index.get(); + let next = current + .checked_add(1) + .ok_or_else(|| Error::new(ERR_TOO_MANY_CAPTURES))?; + self.capture_index.set(next); + Ok(next) + } + + /// Adds the given capture name to this parser. If this capture name has + /// already been used, then an error is returned. + fn add_capture_name(&self, name: &str) -> Result<(), Error> { + let mut names = self.capture_names.borrow_mut(); + // The comparator reports how each stored name orders relative to + // `name`, which keeps the list in ascending order. + match names.binary_search_by(|n| n.as_str().cmp(name)) { + Ok(_) => Err(Error::new(ERR_DUPLICATE_CAPTURE_NAME)), + Err(i) => { + names.insert(i, name.to_string()); + Ok(()) + } + } + } + + /// Returns true if and only if the parser is positioned at a look-around + /// prefix. The conditions under which this returns true must always + /// correspond to a regular expression that would otherwise be considered + /// invalid. + /// + /// This should only be called immediately after parsing the opening of + /// a group or a set of flags. + fn is_lookaround_prefix(&self) -> bool { + self.bump_if("?=") + || self.bump_if("?!") + || self.bump_if("?<=") + || self.bump_if("? Parser<'a> { + pub(super) fn parse(&self) -> Result { + let depth = self.increment_depth()?; + let mut alternates = vec![]; + let mut concat = vec![]; + loop { + self.bump_space(); + if self.is_done() { + break; + } + match self.char() { + '(' => { + // Save the old flags and reset them only when we close + // the group. + let oldflags = *self.flags.borrow(); + if let Some(sub) = self.parse_group()? { + concat.push(sub); + // We only reset them here because if 'parse_group' + // returns None, then that means it handled a flag + // directive, e.g., '(?ism)'.
And the whole point is + // that those flags remain active until either disabled + // or the end of the pattern or current group. + *self.flags.borrow_mut() = oldflags; + } + if self.char.get() != Some(')') { + return Err(Error::new(ERR_UNCLOSED_GROUP)); + } + self.bump(); + } + ')' => { + if depth == 0 { + return Err(Error::new(ERR_UNOPENED_GROUP)); + } + break; + } + '|' => { + alternates.push(Hir::concat(core::mem::take(&mut concat))); + self.bump(); + } + '[' => concat.push(self.parse_class()?), + '?' | '*' | '+' => { + concat = self.parse_uncounted_repetition(concat)?; + } + '{' => { + concat = self.parse_counted_repetition(concat)?; + } + _ => concat.push(self.parse_primitive()?), + } + } + self.decrement_depth(); + alternates.push(Hir::concat(concat)); + // N.B. This strips off the "alternation" if there's only one branch. + Ok(Hir::alternation(alternates)) + } + + /// Parses a "primitive" pattern. A primitive is any expression that does + /// not contain any sub-expressions. + /// + /// This assumes the parser is pointing at the beginning of the primitive. + fn parse_primitive(&self) -> Result { + let ch = self.char(); + self.bump(); + match ch { + '\\' => self.parse_escape(), + '.' => Ok(self.hir_dot()), + '^' => Ok(self.hir_anchor_start()), + '$' => Ok(self.hir_anchor_end()), + ch => Ok(self.hir_char(ch)), + } + } + + /// Parse an escape sequence. This always results in a "primitive" HIR, + /// that is, an HIR with no sub-expressions. + /// + /// This assumes the parser is positioned at the start of the sequence, + /// immediately *after* the `\`. It advances the parser to the first + /// position immediately following the escape sequence. + fn parse_escape(&self) -> Result { + if self.is_done() { + return Err(Error::new(ERR_ESCAPE_UNEXPECTED_EOF)); + } + let ch = self.char(); + // Put some of the more complicated routines into helpers. 
+ match ch { + '0'..='9' => return Err(Error::new(ERR_BACKREF_UNSUPPORTED)), + 'p' | 'P' => { + return Err(Error::new(ERR_UNICODE_CLASS_UNSUPPORTED)) + } + 'x' | 'u' | 'U' => return self.parse_hex(), + 'd' | 's' | 'w' | 'D' | 'S' | 'W' => { + return Ok(self.parse_perl_class()); + } + _ => {} + } + + // Handle all of the one letter sequences inline. + self.bump(); + if util::is_meta_character(ch) || util::is_escapeable_character(ch) { + return Ok(self.hir_char(ch)); + } + let special = |ch| Ok(self.hir_char(ch)); + match ch { + 'a' => special('\x07'), + 'f' => special('\x0C'), + 't' => special('\t'), + 'n' => special('\n'), + 'r' => special('\r'), + 'v' => special('\x0B'), + 'A' => Ok(Hir::look(hir::Look::Start)), + 'z' => Ok(Hir::look(hir::Look::End)), + 'b' => Ok(Hir::look(hir::Look::Word)), + 'B' => Ok(Hir::look(hir::Look::WordNegate)), + _ => Err(Error::new(ERR_ESCAPE_UNRECOGNIZED)), + } + } + + /// Parse a hex representation of a Unicode codepoint. This handles both + /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to + /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to + /// the first character immediately following the hexadecimal literal. + fn parse_hex(&self) -> Result { + let digit_len = match self.char() { + 'x' => 2, + 'u' => 4, + 'U' => 8, + unk => unreachable!( + "invalid start of fixed length hexadecimal number {}", + unk + ), + }; + if !self.bump_and_bump_space() { + return Err(Error::new(ERR_HEX_UNEXPECTED_EOF)); + } + if self.char() == '{' { + self.parse_hex_brace() + } else { + self.parse_hex_digits(digit_len) + } + } + + /// Parse an N-digit hex representation of a Unicode codepoint. This + /// expects the parser to be positioned at the first digit and will advance + /// the parser to the first character immediately following the escape + /// sequence. + /// + /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`) + /// or 8 (for `\UNNNNNNNN`). 
+ fn parse_hex_digits(&self, digit_len: usize) -> Result { + let mut scratch = String::new(); + for i in 0..digit_len { + if i > 0 && !self.bump_and_bump_space() { + return Err(Error::new(ERR_HEX_FIXED_UNEXPECTED_EOF)); + } + if !is_hex(self.char()) { + return Err(Error::new(ERR_HEX_FIXED_INVALID_DIGIT)); + } + scratch.push(self.char()); + } + // The final bump just moves the parser past the literal, which may + // be EOF. + self.bump_and_bump_space(); + match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32) { + None => Err(Error::new(ERR_HEX_FIXED_INVALID)), + Some(ch) => Ok(self.hir_char(ch)), + } + } + + /// Parse a hex representation of any Unicode scalar value. This expects + /// the parser to be positioned at the opening brace `{` and will advance + /// the parser to the first character following the closing brace `}`. + fn parse_hex_brace(&self) -> Result { + let mut scratch = String::new(); + while self.bump_and_bump_space() && self.char() != '}' { + if !is_hex(self.char()) { + return Err(Error::new(ERR_HEX_BRACE_INVALID_DIGIT)); + } + scratch.push(self.char()); + } + if self.is_done() { + return Err(Error::new(ERR_HEX_BRACE_UNEXPECTED_EOF)); + } + assert_eq!(self.char(), '}'); + self.bump_and_bump_space(); + + if scratch.is_empty() { + return Err(Error::new(ERR_HEX_BRACE_EMPTY)); + } + match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32) { + None => Err(Error::new(ERR_HEX_BRACE_INVALID)), + Some(ch) => Ok(self.hir_char(ch)), + } + } + + /// Parse a decimal number into a u32 while trimming leading and trailing + /// whitespace. + /// + /// This expects the parser to be positioned at the first position where + /// a decimal digit could occur. This will advance the parser to the byte + /// immediately following the last contiguous decimal digit. + /// + /// If no decimal digit could be found or if there was a problem parsing + /// the complete set of digits into a u32, then an error is returned. 
+ fn parse_decimal(&self) -> Result { + let mut scratch = String::new(); + // Leading whitespace is skipped regardless of whether the `x` flag + // is enabled. + while !self.is_done() && self.char().is_whitespace() { + self.bump(); + } + while !self.is_done() && self.char().is_ascii_digit() { + scratch.push(self.char()); + self.bump_and_bump_space(); + } + while !self.is_done() && self.char().is_whitespace() { + self.bump_and_bump_space(); + } + let digits = scratch.as_str(); + if digits.is_empty() { + return Err(Error::new(ERR_DECIMAL_NO_DIGITS)); + } + // This fails only when the digits do not fit into a u32. + u32::from_str_radix(digits, 10) + .map_err(|_| Error::new(ERR_DECIMAL_INVALID)) + } + + /// Parses an uncounted repetition operator. An uncounted repetition + /// operator includes `?`, `*` and `+`, but does not include the `{m,n}` + /// syntax. The current character should be one of `?`, `*` or `+`. Any + /// other character will result in a panic. + /// + /// This assumes that the parser is currently positioned at the repetition + /// operator and advances the parser to the first character after the + /// operator. (Note that the operator may include a single additional `?`, + /// which makes the operator ungreedy.) + /// + /// The caller should include the concatenation that is being built. The + /// concatenation returned includes the repetition operator applied to the + /// last expression in the given concatenation. + /// + /// If the concatenation is empty, then this returns an error. + fn parse_uncounted_repetition( + &self, + mut concat: Vec, + ) -> Result, Error> { + let sub = match concat.pop() { + Some(hir) => Box::new(hir), + None => { + return Err(Error::new(ERR_UNCOUNTED_REP_SUB_MISSING)); + } + }; + let (min, max) = match self.char() { + '?' => (0, Some(1)), + '*' => (0, None), + '+' => (1, None), + unk => unreachable!("unrecognized repetition operator '{}'", unk), + }; + let mut greedy = true; + if self.bump() && self.char() == '?'
{ + greedy = false; + self.bump(); + } + concat.push(Hir::repetition(hir::Repetition { + min, + max, + greedy, + sub, + })); + Ok(concat) + } + + /// Parses a counted repetition operation. A counted repetition operator + /// corresponds to the `{m,n}` syntax, and does not include the `?`, `*` or + /// `+` operators. + /// + /// This assumes that the parser is currently at the opening `{` and + /// advances the parser to the first character after the operator. (Note + /// that the operator may include a single additional `?`, which makes the + /// operator ungreedy.) + /// + /// The caller should include the concatenation that is being built. The + /// concatenation returned includes the repetition operator applied to the + /// last expression in the given concatenation. + /// + /// If the concatenation is empty, then this returns an error. + fn parse_counted_repetition( + &self, + mut concat: Vec, + ) -> Result, Error> { + assert_eq!(self.char(), '{', "expected opening brace"); + let sub = match concat.pop() { + Some(hir) => Box::new(hir), + None => { + return Err(Error::new(ERR_COUNTED_REP_SUB_MISSING)); + } + }; + if !self.bump_and_bump_space() { + return Err(Error::new(ERR_COUNTED_REP_UNCLOSED)); + } + let min = self.parse_decimal()?; + let mut max = Some(min); + if self.is_done() { + return Err(Error::new(ERR_COUNTED_REP_MIN_UNCLOSED)); + } + if self.char() == ',' { + if !self.bump_and_bump_space() { + return Err(Error::new(ERR_COUNTED_REP_COMMA_UNCLOSED)); + } + if self.char() != '}' { + max = Some(self.parse_decimal()?); + } else { + max = None; + } + if self.is_done() { + return Err(Error::new(ERR_COUNTED_REP_MIN_MAX_UNCLOSED)); + } + } + if self.char() != '}' { + return Err(Error::new(ERR_COUNTED_REP_INVALID)); + } + + let mut greedy = true; + if self.bump_and_bump_space() && self.char() == '?' 
{ + greedy = false; + self.bump(); + } + + if max.map_or(false, |max| min > max) { + return Err(Error::new(ERR_COUNTED_REP_INVALID_RANGE)); + } + concat.push(Hir::repetition(hir::Repetition { + min, + max, + greedy, + sub, + })); + Ok(concat) + } + + /// Parses the part of a pattern that starts with a `(`. This is usually + /// a group sub-expression, but might just be a directive that enables + /// (or disables) certain flags. + /// + /// This assumes the parser is pointing at the opening `(`. + fn parse_group(&self) -> Result, Error> { + assert_eq!(self.char(), '('); + self.bump_and_bump_space(); + if self.is_lookaround_prefix() { + return Err(Error::new(ERR_LOOK_UNSUPPORTED)); + } + if self.bump_if("?P<") || self.bump_if("?<") { + let index = self.next_capture_index()?; + let name = Some(Box::from(self.parse_capture_name(index)?)); + let sub = Box::new(self.parse()?); + let cap = hir::Capture { index, name, sub }; + Ok(Some(Hir::capture(cap))) + } else if self.bump_if("?") { + if self.is_done() { + return Err(Error::new(ERR_UNCLOSED_GROUP_QUESTION)); + } + let start = self.pos(); + // The flags get reset in the top-level 'parse' routine. + *self.flags.borrow_mut() = self.parse_flags()?; + let consumed = self.pos() - start; + if self.char() == ')' { + // We don't allow empty flags, e.g., `(?)`. + if consumed == 0 { + return Err(Error::new(ERR_EMPTY_FLAGS)); + } + Ok(None) + } else { + assert_eq!(':', self.char()); + self.bump(); + self.parse().map(Some) + } + } else { + let index = self.next_capture_index()?; + let sub = Box::new(self.parse()?); + let cap = hir::Capture { index, name: None, sub }; + Ok(Some(Hir::capture(cap))) + } + } + + /// Parses a capture group name. Assumes that the parser is positioned at + /// the first character in the name following the opening `<` (and may + /// possibly be EOF). This advances the parser to the first character + /// following the closing `>`. 
+ /// + /// The caller must provide the capture index of the group for this name. + fn parse_capture_name(&self, capture_index: u32) -> Result<&str, Error> { + if self.is_done() { + return Err(Error::new(ERR_MISSING_GROUP_NAME)); + } + let start = self.pos(); + loop { + if self.char() == '>' { + break; + } + if !is_capture_char(self.char(), self.pos() == start) { + return Err(Error::new(ERR_INVALID_GROUP_NAME)); + } + if !self.bump() { + break; + } + } + let end = self.pos(); + if self.is_done() { + return Err(Error::new(ERR_UNCLOSED_GROUP_NAME)); + } + assert_eq!(self.char(), '>'); + self.bump(); + let name = &self.pattern()[start..end]; + if name.is_empty() { + return Err(Error::new(ERR_EMPTY_GROUP_NAME)); + } + self.add_capture_name(name)?; + Ok(name) + } + + /// Parse a sequence of flags starting at the current character. + /// + /// This advances the parser to the character immediately following the + /// flags, which is guaranteed to be either `:` or `)`. + /// + /// # Errors + /// + /// If any flags are duplicated, then an error is returned. + /// + /// If the negation operator is used more than once, then an error is + /// returned. + /// + /// If no flags could be found or if the negation operation is not followed + /// by any flags, then an error is returned. + fn parse_flags(&self) -> Result { + let mut flags = *self.flags.borrow(); + let mut negate = false; + // Keeps track of whether the previous flag item was a '-'. We use this + // to detect whether there is a dangling '-', which is invalid. + let mut last_was_negation = false; + // A set to keep track of the flags we've seen. Since all flags are + // ASCII, we only need 128 bytes. 
+ let mut seen = [false; 128]; + while self.char() != ':' && self.char() != ')' { + if self.char() == '-' { + last_was_negation = true; + if negate { + return Err(Error::new(ERR_FLAG_REPEATED_NEGATION)); + } + negate = true; + } else { + last_was_negation = false; + self.parse_flag(&mut flags, negate)?; + // OK because every valid flag is ASCII, and we're only here if + // the flag is valid. + let flag_byte = u8::try_from(self.char()).unwrap(); + if seen[usize::from(flag_byte)] { + return Err(Error::new(ERR_FLAG_DUPLICATE)); + } + seen[usize::from(flag_byte)] = true; + } + if !self.bump() { + return Err(Error::new(ERR_FLAG_UNEXPECTED_EOF)); + } + } + if last_was_negation { + return Err(Error::new(ERR_FLAG_DANGLING_NEGATION)); + } + Ok(flags) + } + + /// Parse the current character as a flag. Do not advance the parser. + /// + /// This sets the appropriate boolean value in place on the set of flags + /// given. The boolean is inverted when `negate` is true. + /// + /// # Errors + /// + /// If the flag is not recognized, then an error is returned. + fn parse_flag( + &self, + flags: &mut Flags, + negate: bool, + ) -> Result<(), Error> { + let enabled = !negate; + match self.char() { + 'i' => flags.case_insensitive = enabled, + 'm' => flags.multi_line = enabled, + 's' => flags.dot_matches_new_line = enabled, + 'U' => flags.swap_greed = enabled, + 'R' => flags.crlf = enabled, + 'x' => flags.ignore_whitespace = enabled, + _ => return Err(Error::new(ERR_FLAG_UNRECOGNIZED)), + } + Ok(()) + } + + /// Parse a standard character class consisting primarily of characters or + /// character ranges. + /// + /// This assumes the parser is positioned at the opening `[`. If parsing + /// is successful, then the parser is advanced to the position immediately + /// following the closing `]`. 
+ fn parse_class(&self) -> Result { + assert_eq!(self.char(), '['); + + let mut union = vec![]; + if !self.bump_and_bump_space() { + return Err(Error::new(ERR_CLASS_UNCLOSED)); + } + // Determine whether the class is negated or not. + let negate = if self.char() != '^' { + false + } else { + if !self.bump_and_bump_space() { + return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_NEGATION)); + } + true + }; + // Accept any number of `-` as literal `-`. + while self.char() == '-' { + union.push(hir::ClassRange { start: '-', end: '-' }); + if !self.bump_and_bump_space() { + return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH)); + } + } + // If `]` is the *first* char in a set, then interpret it as a literal + // `]`. That is, an empty class is impossible to write. + if union.is_empty() && self.char() == ']' { + union.push(hir::ClassRange { start: ']', end: ']' }); + if !self.bump_and_bump_space() { + return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_CLOSING)); + } + } + loop { + self.bump_space(); + if self.is_done() { + return Err(Error::new(ERR_CLASS_UNCLOSED)); + } + match self.char() { + '[' => { + // Attempt to treat this as the beginning of a POSIX class. + // If POSIX class parsing fails, then the parser backs up + // to `[`. + if let Some(ranges) = self.maybe_parse_posix_class() { + union.extend(ranges); + continue; + } + // ... otherwise we don't support nested classes. + return Err(Error::new(ERR_CLASS_NEST_UNSUPPORTED)); + } + ']' => { + self.bump(); + let mut class = hir::Class::new(union); + // Note that we must apply case folding before negation! + // Consider `(?i)[^x]`. If we applied negation first, then + // the result would be the character class that matched any + // Unicode scalar value. 
+ if self.flags().case_insensitive { + class.ascii_case_fold(); + } + if negate { + class.negate(); + } + return Ok(Hir::class(class)); + } + '&' if self.peek() == Some('&') => { + return Err(Error::new( + ERR_CLASS_INTERSECTION_UNSUPPORTED, + )); + } + '-' if self.peek() == Some('-') => { + return Err(Error::new(ERR_CLASS_DIFFERENCE_UNSUPPORTED)); + } + '~' if self.peek() == Some('~') => { + return Err(Error::new( + ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, + )); + } + _ => self.parse_class_range(&mut union)?, + } + } + } + + /// Parse a single primitive item in a character class set. The item to + /// be parsed can either be one of a simple literal character, a range + /// between two simple literal characters or a "primitive" character + /// class like `\w`. + /// + /// If an invalid escape is found, or if a character class is found where + /// a simple literal is expected (e.g., in a range), then an error is + /// returned. + /// + /// Otherwise, the range (or ranges) are appended to the given union of + /// ranges. + fn parse_class_range( + &self, + union: &mut Vec, + ) -> Result<(), Error> { + let prim1 = self.parse_class_item()?; + self.bump_space(); + if self.is_done() { + return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_ITEM)); + } + // If the next char isn't a `-`, then we don't have a range. + // There are two exceptions. If the char after a `-` is a `]`, then + // `-` is interpreted as a literal `-`. Alternatively, if the char + // after a `-` is a `-`, then `--` corresponds to a "difference" + // operation. (Which we don't support in regex-lite, but error about + // specifically in an effort to be loud about differences between the + // main regex crate where possible.) + if self.char() != '-' + || self.peek_space() == Some(']') + || self.peek_space() == Some('-') + { + union.extend_from_slice(&into_class_item_ranges(prim1)?); + return Ok(()); + } + // OK, now we're parsing a range, so bump past the `-` and parse the + // second half of the range. 
+ if !self.bump_and_bump_space() { + return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH)); + } + let prim2 = self.parse_class_item()?; + let range = hir::ClassRange { + start: into_class_item_range(prim1)?, + end: into_class_item_range(prim2)?, + }; + if range.start > range.end { + return Err(Error::new(ERR_CLASS_INVALID_RANGE)); + } + union.push(range); + Ok(()) + } + + /// Parse a single item in a character class as a primitive, where the + /// primitive either consists of a verbatim literal or a single escape + /// sequence. + /// + /// This assumes the parser is positioned at the beginning of a primitive, + /// and advances the parser to the first position after the primitive if + /// successful. + /// + /// Note that it is the caller's responsibility to report an error if an + /// illegal primitive was parsed. + fn parse_class_item(&self) -> Result { + let ch = self.char(); + self.bump(); + if ch == '\\' { + self.parse_escape() + } else { + Ok(self.hir_char(ch)) + } + } + + /// Attempt to parse a POSIX character class, e.g., `[:alnum:]`. + /// + /// This assumes the parser is positioned at the opening `[`. + /// + /// If no valid POSIX character class could be found, then this does not + /// advance the parser and `None` is returned. Otherwise, the parser is + /// advanced to the first byte following the closing `]` and the + /// corresponding POSIX class is returned. + fn maybe_parse_posix_class( + &self, + ) -> Option> { + // POSIX character classes are interesting from a parsing perspective + // because parsing cannot fail with any interesting error. For example, + // in order to use an POSIX character class, it must be enclosed in + // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think + // of it as "POSIX character classes have the syntax `[:NAME:]` which + // can only appear within character brackets." This means that things + // like `[[:lower:]A]` are legal constructs. 
+ // + // However, if one types an incorrect POSIX character class, e.g., + // `[[:loower:]]`, then we treat that as if it were normal nested + // character class containing the characters `:elorw`. (Which isn't + // supported and results in an error in regex-lite.) One might argue + // that we should return an error instead since the repeated colons + // give away the intent to write an POSIX class. But what if the user + // typed `[[:lower]]` instead? How can we tell that was intended to be + // a POSXI class and not just a normal nested class? + // + // Reasonable people can probably disagree over this, but for better + // or worse, we implement semantics that never fails at the expense of + // better failure modes. + assert_eq!(self.char(), '['); + + // If parsing fails, then we back up the parser to this starting point. + let start_pos = self.pos(); + let start_char = self.char.get(); + let reset = || { + self.pos.set(start_pos); + self.char.set(start_char); + }; + + let mut negated = false; + if !self.bump() || self.char() != ':' { + reset(); + return None; + } + if !self.bump() { + reset(); + return None; + } + if self.char() == '^' { + negated = true; + if !self.bump() { + reset(); + return None; + } + } + let name_start = self.pos(); + while self.char() != ':' && self.bump() {} + if self.is_done() { + reset(); + return None; + } + let name = &self.pattern()[name_start..self.pos()]; + if !self.bump_if(":]") { + reset(); + return None; + } + if let Ok(ranges) = posix_class(name) { + return Some(ranges); + } + reset(); + None + } + + /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the + /// parser is currently at a valid character class name and will be + /// advanced to the character immediately following the class. 
+ fn parse_perl_class(&self) -> Hir { + let ch = self.char(); + self.bump(); + let mut class = hir::Class::new(match ch { + 'd' | 'D' => posix_class("digit").unwrap(), + 's' | 'S' => posix_class("space").unwrap(), + 'w' | 'W' => posix_class("word").unwrap(), + unk => unreachable!("invalid Perl class \\{}", unk), + }); + if ch.is_ascii_uppercase() { + class.negate(); + } + Hir::class(class) + } + + fn hir_dot(&self) -> Hir { + if self.flags().dot_matches_new_line { + Hir::class(hir::Class::new([hir::ClassRange { + start: '\x00', + end: '\u{10FFFF}', + }])) + } else if self.flags().crlf { + Hir::class(hir::Class::new([ + hir::ClassRange { start: '\x00', end: '\x09' }, + hir::ClassRange { start: '\x0B', end: '\x0C' }, + hir::ClassRange { start: '\x0E', end: '\u{10FFFF}' }, + ])) + } else { + Hir::class(hir::Class::new([ + hir::ClassRange { start: '\x00', end: '\x09' }, + hir::ClassRange { start: '\x0B', end: '\u{10FFFF}' }, + ])) + } + } + + fn hir_anchor_start(&self) -> Hir { + let look = if self.flags().multi_line { + if self.flags().crlf { + hir::Look::StartCRLF + } else { + hir::Look::StartLF + } + } else { + hir::Look::Start + }; + Hir::look(look) + } + + fn hir_anchor_end(&self) -> Hir { + let look = if self.flags().multi_line { + if self.flags().crlf { + hir::Look::EndCRLF + } else { + hir::Look::EndLF + } + } else { + hir::Look::End + }; + Hir::look(look) + } + + fn hir_char(&self, ch: char) -> Hir { + if self.flags().case_insensitive { + let this = hir::ClassRange { start: ch, end: ch }; + if let Some(folded) = this.ascii_case_fold() { + return Hir::class(hir::Class::new([this, folded])); + } + } + Hir::char(ch) + } +} + +/// Converts the given Hir to a literal char if the Hir is just a single +/// character. Otherwise this returns an error. +/// +/// This is useful in contexts where you can only accept a single character, +/// but where it is convenient to parse something more general. For example, +/// parsing a single part of a character class range. 
It's useful to reuse +/// the literal parsing code, but that code can itself return entire classes +/// which can't be used as the start/end of a class range. +fn into_class_item_range(hir: Hir) -> Result { + match hir.kind { + HirKind::Char(ch) => Ok(ch), + _ => Err(Error::new(ERR_CLASS_INVALID_RANGE_ITEM)), + } +} + +fn into_class_item_ranges(hir: Hir) -> Result, Error> { + match hir.kind { + HirKind::Char(ch) => Ok(vec![hir::ClassRange { start: ch, end: ch }]), + HirKind::Class(hir::Class { ranges }) => Ok(ranges), + _ => Err(Error::new(ERR_CLASS_INVALID_ITEM)), + } +} + +/// Returns an iterator of character class ranges for the given named POSIX +/// character class. If no such character class exists for the name given, then +/// an error is returned. +fn posix_class( + kind: &str, +) -> Result, Error> { + let slice: &'static [(u8, u8)] = match kind { + "alnum" => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')], + "alpha" => &[(b'A', b'Z'), (b'a', b'z')], + "ascii" => &[(b'\x00', b'\x7F')], + "blank" => &[(b'\t', b'\t'), (b' ', b' ')], + "cntrl" => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')], + "digit" => &[(b'0', b'9')], + "graph" => &[(b'!', b'~')], + "lower" => &[(b'a', b'z')], + "print" => &[(b' ', b'~')], + "punct" => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')], + "space" => &[ + (b'\t', b'\t'), + (b'\n', b'\n'), + (b'\x0B', b'\x0B'), + (b'\x0C', b'\x0C'), + (b'\r', b'\r'), + (b' ', b' '), + ], + "upper" => &[(b'A', b'Z')], + "word" => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')], + "xdigit" => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')], + _ => return Err(Error::new(ERR_POSIX_CLASS_UNRECOGNIZED)), + }; + Ok(slice.iter().map(|&(start, end)| hir::ClassRange { + start: char::from(start), + end: char::from(end), + })) +} + +/// Returns true if the given character is a hexadecimal digit. 
+fn is_hex(c: char) -> bool { + ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') +} + +/// Returns true if the given character is a valid in a capture group name. +/// +/// If `first` is true, then `c` is treated as the first character in the +/// group name (which must be alphabetic or underscore). +fn is_capture_char(c: char, first: bool) -> bool { + if first { + c == '_' || c.is_alphabetic() + } else { + c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn p(pattern: &str) -> Hir { + Parser::new(Config::default(), pattern).parse().unwrap() + } + + fn perr(pattern: &str) -> String { + Parser::new(Config::default(), pattern) + .parse() + .unwrap_err() + .to_string() + } + + fn class>(it: I) -> Hir { + Hir::class(hir::Class::new( + it.into_iter().map(|(start, end)| hir::ClassRange { start, end }), + )) + } + + fn singles>(it: I) -> Hir { + Hir::class(hir::Class::new( + it.into_iter().map(|ch| hir::ClassRange { start: ch, end: ch }), + )) + } + + fn posix(name: &str) -> Hir { + Hir::class(hir::Class::new(posix_class(name).unwrap())) + } + + fn cap(index: u32, sub: Hir) -> Hir { + Hir::capture(hir::Capture { index, name: None, sub: Box::new(sub) }) + } + + fn named_cap(index: u32, name: &str, sub: Hir) -> Hir { + Hir::capture(hir::Capture { + index, + name: Some(Box::from(name)), + sub: Box::new(sub), + }) + } + + #[test] + fn ok_literal() { + assert_eq!(p("a"), Hir::char('a')); + assert_eq!(p("ab"), Hir::concat(vec![Hir::char('a'), Hir::char('b')])); + assert_eq!(p("💩"), Hir::char('💩')); + } + + #[test] + fn ok_meta_escapes() { + assert_eq!(p(r"\*"), Hir::char('*')); + assert_eq!(p(r"\+"), Hir::char('+')); + assert_eq!(p(r"\?"), Hir::char('?')); + assert_eq!(p(r"\|"), Hir::char('|')); + assert_eq!(p(r"\("), Hir::char('(')); + assert_eq!(p(r"\)"), Hir::char(')')); + assert_eq!(p(r"\^"), Hir::char('^')); + assert_eq!(p(r"\$"), Hir::char('$')); + assert_eq!(p(r"\["), 
Hir::char('[')); + assert_eq!(p(r"\]"), Hir::char(']')); + } + + #[test] + fn ok_special_escapes() { + assert_eq!(p(r"\a"), Hir::char('\x07')); + assert_eq!(p(r"\f"), Hir::char('\x0C')); + assert_eq!(p(r"\t"), Hir::char('\t')); + assert_eq!(p(r"\n"), Hir::char('\n')); + assert_eq!(p(r"\r"), Hir::char('\r')); + assert_eq!(p(r"\v"), Hir::char('\x0B')); + assert_eq!(p(r"\A"), Hir::look(hir::Look::Start)); + assert_eq!(p(r"\z"), Hir::look(hir::Look::End)); + assert_eq!(p(r"\b"), Hir::look(hir::Look::Word)); + assert_eq!(p(r"\B"), Hir::look(hir::Look::WordNegate)); + } + + #[test] + fn ok_hex() { + // fixed length + assert_eq!(p(r"\x41"), Hir::char('A')); + assert_eq!(p(r"\u2603"), Hir::char('☃')); + assert_eq!(p(r"\U0001F4A9"), Hir::char('💩')); + // braces + assert_eq!(p(r"\x{1F4A9}"), Hir::char('💩')); + assert_eq!(p(r"\u{1F4A9}"), Hir::char('💩')); + assert_eq!(p(r"\U{1F4A9}"), Hir::char('💩')); + } + + #[test] + fn ok_perl() { + assert_eq!(p(r"\d"), posix("digit")); + assert_eq!(p(r"\s"), posix("space")); + assert_eq!(p(r"\w"), posix("word")); + + let negated = |name| { + let mut class = hir::Class::new(posix_class(name).unwrap()); + class.negate(); + Hir::class(class) + }; + assert_eq!(p(r"\D"), negated("digit")); + assert_eq!(p(r"\S"), negated("space")); + assert_eq!(p(r"\W"), negated("word")); + } + + #[test] + fn ok_flags_and_primitives() { + assert_eq!(p(r"a"), Hir::char('a')); + assert_eq!(p(r"(?i:a)"), singles(['A', 'a'])); + + assert_eq!(p(r"^"), Hir::look(hir::Look::Start)); + assert_eq!(p(r"(?m:^)"), Hir::look(hir::Look::StartLF)); + assert_eq!(p(r"(?mR:^)"), Hir::look(hir::Look::StartCRLF)); + + assert_eq!(p(r"$"), Hir::look(hir::Look::End)); + assert_eq!(p(r"(?m:$)"), Hir::look(hir::Look::EndLF)); + assert_eq!(p(r"(?mR:$)"), Hir::look(hir::Look::EndCRLF)); + + assert_eq!(p(r"."), class([('\x00', '\x09'), ('\x0B', '\u{10FFFF}')])); + assert_eq!( + p(r"(?R:.)"), + class([ + ('\x00', '\x09'), + ('\x0B', '\x0C'), + ('\x0E', '\u{10FFFF}'), + ]) + ); + 
assert_eq!(p(r"(?s:.)"), class([('\x00', '\u{10FFFF}')])); + assert_eq!(p(r"(?sR:.)"), class([('\x00', '\u{10FFFF}')])); + } + + #[test] + fn ok_alternate() { + assert_eq!( + p(r"a|b"), + Hir::alternation(vec![Hir::char('a'), Hir::char('b')]) + ); + assert_eq!( + p(r"(?:a|b)"), + Hir::alternation(vec![Hir::char('a'), Hir::char('b')]) + ); + + assert_eq!( + p(r"(a|b)"), + cap(1, Hir::alternation(vec![Hir::char('a'), Hir::char('b')])) + ); + assert_eq!( + p(r"(?a|b)"), + named_cap( + 1, + "foo", + Hir::alternation(vec![Hir::char('a'), Hir::char('b')]) + ) + ); + + assert_eq!( + p(r"a|b|c"), + Hir::alternation(vec![ + Hir::char('a'), + Hir::char('b'), + Hir::char('c') + ]) + ); + + assert_eq!( + p(r"ax|by|cz"), + Hir::alternation(vec![ + Hir::concat(vec![Hir::char('a'), Hir::char('x')]), + Hir::concat(vec![Hir::char('b'), Hir::char('y')]), + Hir::concat(vec![Hir::char('c'), Hir::char('z')]), + ]) + ); + assert_eq!( + p(r"(ax|(by|(cz)))"), + cap( + 1, + Hir::alternation(vec![ + Hir::concat(vec![Hir::char('a'), Hir::char('x')]), + cap( + 2, + Hir::alternation(vec![ + Hir::concat(vec![Hir::char('b'), Hir::char('y')]), + cap( + 3, + Hir::concat(vec![ + Hir::char('c'), + Hir::char('z') + ]) + ), + ]) + ), + ]) + ) + ); + + assert_eq!( + p(r"|"), + Hir::alternation(vec![Hir::empty(), Hir::empty()]) + ); + assert_eq!( + p(r"||"), + Hir::alternation(vec![Hir::empty(), Hir::empty(), Hir::empty()]) + ); + + assert_eq!( + p(r"a|"), + Hir::alternation(vec![Hir::char('a'), Hir::empty()]) + ); + assert_eq!( + p(r"|a"), + Hir::alternation(vec![Hir::empty(), Hir::char('a')]) + ); + + assert_eq!( + p(r"(|)"), + cap(1, Hir::alternation(vec![Hir::empty(), Hir::empty()])) + ); + assert_eq!( + p(r"(a|)"), + cap(1, Hir::alternation(vec![Hir::char('a'), Hir::empty()])) + ); + assert_eq!( + p(r"(|a)"), + cap(1, Hir::alternation(vec![Hir::empty(), Hir::char('a')])) + ); + } + + #[test] + fn ok_flag_group() { + assert_eq!( + p("a(?i:b)"), + Hir::concat(vec![Hir::char('a'), singles(['B', 
'b'])]) + ); + } + + #[test] + fn ok_flag_directive() { + assert_eq!(p("(?i)a"), singles(['A', 'a'])); + assert_eq!(p("a(?i)"), Hir::char('a')); + assert_eq!( + p("a(?i)b"), + Hir::concat(vec![Hir::char('a'), singles(['B', 'b'])]) + ); + assert_eq!( + p("a(?i)a(?-i)a"), + Hir::concat(vec![ + Hir::char('a'), + singles(['A', 'a']), + Hir::char('a'), + ]) + ); + assert_eq!( + p("a(?:(?i)a)a"), + Hir::concat(vec![ + Hir::char('a'), + singles(['A', 'a']), + Hir::char('a'), + ]) + ); + assert_eq!( + p("a((?i)a)a"), + Hir::concat(vec![ + Hir::char('a'), + cap(1, singles(['A', 'a'])), + Hir::char('a'), + ]) + ); + } + + #[test] + fn ok_uncounted_repetition() { + assert_eq!( + p(r"a?"), + Hir::repetition(hir::Repetition { + min: 0, + max: Some(1), + greedy: true, + sub: Box::new(Hir::char('a')), + }), + ); + assert_eq!( + p(r"a*"), + Hir::repetition(hir::Repetition { + min: 0, + max: None, + greedy: true, + sub: Box::new(Hir::char('a')), + }), + ); + assert_eq!( + p(r"a+"), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: true, + sub: Box::new(Hir::char('a')), + }), + ); + + assert_eq!( + p(r"a??"), + Hir::repetition(hir::Repetition { + min: 0, + max: Some(1), + greedy: false, + sub: Box::new(Hir::char('a')), + }), + ); + assert_eq!( + p(r"a*?"), + Hir::repetition(hir::Repetition { + min: 0, + max: None, + greedy: false, + sub: Box::new(Hir::char('a')), + }), + ); + assert_eq!( + p(r"a+?"), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: false, + sub: Box::new(Hir::char('a')), + }), + ); + + assert_eq!( + p(r"a?b"), + Hir::concat(vec![ + Hir::repetition(hir::Repetition { + min: 0, + max: Some(1), + greedy: true, + sub: Box::new(Hir::char('a')), + }), + Hir::char('b'), + ]), + ); + + assert_eq!( + p(r"ab?"), + Hir::concat(vec![ + Hir::char('a'), + Hir::repetition(hir::Repetition { + min: 0, + max: Some(1), + greedy: true, + sub: Box::new(Hir::char('b')), + }), + ]), + ); + + assert_eq!( + p(r"(?:ab)?"), + 
Hir::repetition(hir::Repetition { + min: 0, + max: Some(1), + greedy: true, + sub: Box::new(Hir::concat(vec![ + Hir::char('a'), + Hir::char('b') + ])), + }), + ); + + assert_eq!( + p(r"(ab)?"), + Hir::repetition(hir::Repetition { + min: 0, + max: Some(1), + greedy: true, + sub: Box::new(cap( + 1, + Hir::concat(vec![Hir::char('a'), Hir::char('b')]) + )), + }), + ); + + assert_eq!( + p(r"|a?"), + Hir::alternation(vec![ + Hir::empty(), + Hir::repetition(hir::Repetition { + min: 0, + max: Some(1), + greedy: true, + sub: Box::new(Hir::char('a')), + }) + ]), + ); + } + + #[test] + fn ok_counted_repetition() { + assert_eq!( + p(r"a{5}"), + Hir::repetition(hir::Repetition { + min: 5, + max: Some(5), + greedy: true, + sub: Box::new(Hir::char('a')), + }), + ); + assert_eq!( + p(r"a{5}?"), + Hir::repetition(hir::Repetition { + min: 5, + max: Some(5), + greedy: false, + sub: Box::new(Hir::char('a')), + }), + ); + + assert_eq!( + p(r"a{5,}"), + Hir::repetition(hir::Repetition { + min: 5, + max: None, + greedy: true, + sub: Box::new(Hir::char('a')), + }), + ); + + assert_eq!( + p(r"a{5,9}"), + Hir::repetition(hir::Repetition { + min: 5, + max: Some(9), + greedy: true, + sub: Box::new(Hir::char('a')), + }), + ); + + assert_eq!( + p(r"ab{5}c"), + Hir::concat(vec![ + Hir::char('a'), + Hir::repetition(hir::Repetition { + min: 5, + max: Some(5), + greedy: true, + sub: Box::new(Hir::char('b')), + }), + Hir::char('c'), + ]), + ); + + assert_eq!( + p(r"a{ 5 }"), + Hir::repetition(hir::Repetition { + min: 5, + max: Some(5), + greedy: true, + sub: Box::new(Hir::char('a')), + }), + ); + assert_eq!( + p(r"a{ 5 , 9 }"), + Hir::repetition(hir::Repetition { + min: 5, + max: Some(9), + greedy: true, + sub: Box::new(Hir::char('a')), + }), + ); + } + + #[test] + fn ok_group_unnamed() { + assert_eq!(p("(a)"), cap(1, Hir::char('a'))); + assert_eq!( + p("(ab)"), + cap(1, Hir::concat(vec![Hir::char('a'), Hir::char('b')])) + ); + } + + #[test] + fn ok_group_named() { + assert_eq!(p("(?Pa)"), 
named_cap(1, "foo", Hir::char('a'))); + assert_eq!(p("(?a)"), named_cap(1, "foo", Hir::char('a'))); + + assert_eq!( + p("(?Pab)"), + named_cap( + 1, + "foo", + Hir::concat(vec![Hir::char('a'), Hir::char('b')]) + ) + ); + assert_eq!( + p("(?ab)"), + named_cap( + 1, + "foo", + Hir::concat(vec![Hir::char('a'), Hir::char('b')]) + ) + ); + + assert_eq!(p(r"(?z)"), named_cap(1, "a", Hir::char('z'))); + assert_eq!(p(r"(?Pz)"), named_cap(1, "a", Hir::char('z'))); + + assert_eq!(p(r"(?z)"), named_cap(1, "a_1", Hir::char('z'))); + assert_eq!(p(r"(?Pz)"), named_cap(1, "a_1", Hir::char('z'))); + + assert_eq!(p(r"(?z)"), named_cap(1, "a.1", Hir::char('z'))); + assert_eq!(p(r"(?Pz)"), named_cap(1, "a.1", Hir::char('z'))); + + assert_eq!(p(r"(?z)"), named_cap(1, "a[1]", Hir::char('z'))); + assert_eq!(p(r"(?Pz)"), named_cap(1, "a[1]", Hir::char('z'))); + + assert_eq!(p(r"(?z)"), named_cap(1, "a¾", Hir::char('z'))); + assert_eq!(p(r"(?Pz)"), named_cap(1, "a¾", Hir::char('z'))); + + assert_eq!(p(r"(?<名字>z)"), named_cap(1, "名字", Hir::char('z'))); + assert_eq!(p(r"(?P<名字>z)"), named_cap(1, "名字", Hir::char('z'))); + } + + #[test] + fn ok_class() { + assert_eq!(p(r"[a]"), singles(['a'])); + assert_eq!(p(r"[a\]]"), singles(['a', ']'])); + assert_eq!(p(r"[a\-z]"), singles(['a', '-', 'z'])); + assert_eq!(p(r"[ab]"), class([('a', 'b')])); + assert_eq!(p(r"[a-]"), singles(['a', '-'])); + assert_eq!(p(r"[-a]"), singles(['a', '-'])); + assert_eq!(p(r"[--a]"), singles(['a', '-'])); + assert_eq!(p(r"[---a]"), singles(['a', '-'])); + assert_eq!(p(r"[[:alnum:]]"), posix("alnum")); + assert_eq!(p(r"[\w]"), posix("word")); + assert_eq!(p(r"[a\wz]"), posix("word")); + assert_eq!(p(r"[\s\S]"), class([('\x00', '\u{10FFFF}')])); + assert_eq!(p(r"[^\s\S]"), Hir::fail()); + assert_eq!(p(r"[a-cx-z]"), class([('a', 'c'), ('x', 'z')])); + assert_eq!(p(r"[☃-⛄]"), class([('☃', '⛄')])); + assert_eq!(p(r"[]]"), singles([']'])); + assert_eq!(p(r"[]a]"), singles([']', 'a'])); + assert_eq!(p(r"[]\[]"), 
singles(['[', ']'])); + assert_eq!(p(r"[\[]"), singles(['['])); + + assert_eq!(p(r"(?i)[a]"), singles(['A', 'a'])); + assert_eq!(p(r"(?i)[A]"), singles(['A', 'a'])); + assert_eq!(p(r"(?i)[k]"), singles(['K', 'k'])); + assert_eq!(p(r"(?i)[s]"), singles(['S', 's'])); + assert_eq!(p(r"(?i)[β]"), singles(['β'])); + + assert_eq!(p(r"[^^]"), class([('\x00', ']'), ('_', '\u{10FFFF}')])); + assert_eq!( + p(r"[^-a]"), + class([('\x00', ','), ('.', '`'), ('b', '\u{10FFFF}')]) + ); + + assert_eq!( + p(r"[-]a]"), + Hir::concat(vec![singles(['-']), Hir::char('a'), Hir::char(']')]) + ); + } + + #[test] + fn ok_verbatim() { + assert_eq!( + p(r"(?x)a{5,9} ?"), + Hir::repetition(hir::Repetition { + min: 5, + max: Some(9), + greedy: false, + sub: Box::new(Hir::char('a')), + }) + ); + assert_eq!(p(r"(?x)[ a]"), singles(['a'])); + assert_eq!( + p(r"(?x)[ ^ a]"), + class([('\x00', '`'), ('b', '\u{10FFFF}')]) + ); + assert_eq!(p(r"(?x)[ - a]"), singles(['a', '-'])); + assert_eq!(p(r"(?x)[ ] a]"), singles([']', 'a'])); + + assert_eq!( + p(r"(?x)a b"), + Hir::concat(vec![Hir::char('a'), Hir::char('b')]) + ); + assert_eq!( + p(r"(?x)a b(?-x)a b"), + Hir::concat(vec![ + Hir::char('a'), + Hir::char('b'), + Hir::char('a'), + Hir::char(' '), + Hir::char('b'), + ]) + ); + assert_eq!( + p(r"a (?x:a )a "), + Hir::concat(vec![ + Hir::char('a'), + Hir::char(' '), + Hir::char('a'), + Hir::char('a'), + Hir::char(' '), + ]) + ); + assert_eq!( + p(r"(?x)( ?P a )"), + named_cap(1, "foo", Hir::char('a')), + ); + assert_eq!(p(r"(?x)( a )"), cap(1, Hir::char('a'))); + assert_eq!(p(r"(?x)( ?: a )"), Hir::char('a')); + assert_eq!(p(r"(?x)\x { 53 }"), Hir::char('\x53')); + assert_eq!(p(r"(?x)\ "), Hir::char(' ')); + } + + #[test] + fn ok_comments() { + let pat = "(?x) +# This is comment 1. +foo # This is comment 2. + # This is comment 3. 
+bar +# This is comment 4."; + assert_eq!( + p(pat), + Hir::concat(vec![ + Hir::char('f'), + Hir::char('o'), + Hir::char('o'), + Hir::char('b'), + Hir::char('a'), + Hir::char('r'), + ]) + ); + } + + #[test] + fn err_standard() { + assert_eq!( + ERR_TOO_MUCH_NESTING, + perr("(((((((((((((((((((((((((((((((((((((((((((((((((a)))))))))))))))))))))))))))))))))))))))))))))))))"), + ); + // This one is tricky, because the only way it can happen is if the + // number of captures overflows u32. Perhaps we should allow setting a + // lower limit? + // assert_eq!(ERR_TOO_MANY_CAPTURES, perr("")); + assert_eq!(ERR_DUPLICATE_CAPTURE_NAME, perr(r"(?Py)(?Pz)")); + assert_eq!(ERR_UNCLOSED_GROUP, perr("(")); + assert_eq!(ERR_UNCLOSED_GROUP_QUESTION, perr("(?")); + assert_eq!(ERR_UNOPENED_GROUP, perr(")")); + assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?=a)")); + assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?!a)")); + assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?<=a)")); + assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?z)")); + assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<1abc>z)")); + assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾>z)")); + assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾a>z)")); + assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<☃>z)")); + assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?z)")); + assert_eq!(ERR_UNCLOSED_GROUP_NAME, perr(r"(?Pz)")); + assert_eq!(ERR_EMPTY_GROUP_NAME, perr(r"(?<>z)")); + assert_eq!(ERR_FLAG_UNRECOGNIZED, perr(r"(?z:foo)")); + assert_eq!(ERR_FLAG_REPEATED_NEGATION, perr(r"(?s-i-R)")); + assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?isi)")); + assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?is-i)")); + assert_eq!(ERR_FLAG_UNEXPECTED_EOF, perr(r"(?is")); + assert_eq!(ERR_FLAG_DANGLING_NEGATION, perr(r"(?is-:foo)")); + assert_eq!(ERR_HEX_BRACE_INVALID_DIGIT, perr(r"\x{Z}")); + assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{")); + assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{A")); + assert_eq!(ERR_HEX_BRACE_EMPTY, perr(r"\x{}")); + assert_eq!(ERR_HEX_BRACE_INVALID, 
perr(r"\x{FFFFFFFFFFFFFFFFF}")); + assert_eq!(ERR_HEX_FIXED_UNEXPECTED_EOF, perr(r"\xA")); + assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZ")); + assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZA")); + assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xAZ")); + assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\uD800")); + assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\UFFFFFFFF")); + assert_eq!(ERR_HEX_UNEXPECTED_EOF, perr(r"\x")); + assert_eq!(ERR_ESCAPE_UNEXPECTED_EOF, perr(r"\")); + assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\0")); + assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\1")); + assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\8")); + assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\pL")); + assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\p{L}")); + assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\i")); + assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\<")); + assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\>")); + assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"?")); + assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"*")); + assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"+")); + assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(+)")); + assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"|?")); + assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(?i)?")); + assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"{5}")); + assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"({5})")); + assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"(?i){5}")); + assert_eq!(ERR_COUNTED_REP_UNCLOSED, perr(r"a{")); + assert_eq!(ERR_COUNTED_REP_MIN_UNCLOSED, perr(r"a{5")); + assert_eq!(ERR_COUNTED_REP_COMMA_UNCLOSED, perr(r"a{5,")); + assert_eq!(ERR_COUNTED_REP_MIN_MAX_UNCLOSED, perr(r"a{5,6")); + assert_eq!(ERR_COUNTED_REP_INVALID, perr(r"a{5,6Z")); + assert_eq!(ERR_COUNTED_REP_INVALID_RANGE, perr(r"a{6,5}")); + assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{}")); + assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{]}")); + assert_eq!(ERR_DECIMAL_INVALID, perr(r"a{999999999999999}")); + 
assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"[a")); + assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[\w-a]")); + assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[a-\w]")); + assert_eq!(ERR_CLASS_INVALID_ITEM, perr(r"[\b]")); + assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"[a-")); + assert_eq!(ERR_CLASS_UNCLOSED_AFTER_NEGATION, perr(r"[^")); + assert_eq!(ERR_CLASS_UNCLOSED_AFTER_CLOSING, perr(r"[]")); + assert_eq!(ERR_CLASS_INVALID_RANGE, perr(r"[z-a]")); + assert_eq!(ERR_CLASS_UNCLOSED, perr(r"[")); + assert_eq!(ERR_CLASS_UNCLOSED, perr(r"[a-z")); + assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[a-z[A-Z]]")); + assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[[:alnum]]")); + assert_eq!(ERR_CLASS_INTERSECTION_UNSUPPORTED, perr(r"[a&&b]")); + assert_eq!(ERR_CLASS_DIFFERENCE_UNSUPPORTED, perr(r"[a--b]")); + assert_eq!(ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, perr(r"[a~~b]")); + } + + #[test] + fn err_verbatim() { + // See: /~https://github.com/rust-lang/regex/issues/792 + assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[-#]")); + assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"(?x)[a ")); + assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[a- ")); + assert_eq!(ERR_CLASS_UNCLOSED, perr(r"(?x)[ ")); + } + + // This tests a bug fix where the nest limit checker wasn't decrementing + // its depth during post-traversal, which causes long regexes to trip + // the default limit too aggressively. + #[test] + fn regression_454_nest_too_big() { + let pattern = r#" + 2(?: + [45]\d{3}| + 7(?: + 1[0-267]| + 2[0-289]| + 3[0-29]| + 4[01]| + 5[1-3]| + 6[013]| + 7[0178]| + 91 + )| + 8(?: + 0[125]| + [139][1-6]| + 2[0157-9]| + 41| + 6[1-35]| + 7[1-5]| + 8[1-8]| + 90 + )| + 9(?: + 0[0-2]| + 1[0-4]| + 2[568]| + 3[3-6]| + 5[5-7]| + 6[0167]| + 7[15]| + 8[0146-9] + ) + )\d{4} + "#; + p(pattern); + } + + // This tests that we treat a trailing `-` in a character class as a + // literal `-` even when whitespace mode is enabled and there is whitespace + // after the trailing `-`. 
+ #[test] + fn regression_455_trailing_dash_ignore_whitespace() { + p("(?x)[ / - ]"); + p("(?x)[ a - ]"); + p("(?x)[ + a + - ] + "); + p("(?x)[ + a # wat + - ] + "); + + perr("(?x)[ / -"); + perr("(?x)[ / - "); + perr( + "(?x)[ + / - + ", + ); + perr( + "(?x)[ + / - # wat + ", + ); + } +} diff --git a/regex-lite/src/int.rs b/regex-lite/src/int.rs new file mode 100644 index 000000000..c369f0429 --- /dev/null +++ b/regex-lite/src/int.rs @@ -0,0 +1,56 @@ +use core::num::NonZeroUsize; + +/// An extension trait that adds routines to the `u32` primitive type. +pub(crate) trait U32 { + fn as_usize(self) -> usize; +} + +impl U32 for u32 { + fn as_usize(self) -> usize { + // OK because we require 32 or 64 bit targets. Therefore, every u32 + // necessarily fits into a usize. + self as usize + } +} + +/// A `usize` that can never be `usize::MAX`. +/// +/// This is similar to `core::num::NonZeroUsize`, but instead of not permitting +/// a zero value, this does not permit a max value. +/// +/// This is useful in certain contexts where one wants to optimize the memory +/// usage of things that contain match offsets. Namely, since Rust slices +/// are guaranteed to never have a length exceeding `isize::MAX`, we can use +/// `usize::MAX` as a sentinel to indicate that no match was found. Indeed, +/// types like `Option` have exactly the same size in memory as a +/// `usize`. +/// +/// This type is defined to be `repr(transparent)` for +/// `core::num::NonZeroUsize`, which is in turn defined to be +/// `repr(transparent)` for `usize`. +#[derive(Clone, Copy, Eq, Hash, PartialEq, PartialOrd, Ord)] +#[repr(transparent)] +pub(crate) struct NonMaxUsize(NonZeroUsize); + +impl NonMaxUsize { + /// Create a new `NonMaxUsize` from the given value. + /// + /// This returns `None` only when the given value is equal to `usize::MAX`. + pub(crate) fn new(value: usize) -> Option { + NonZeroUsize::new(value.wrapping_add(1)).map(NonMaxUsize) + } + + /// Return the underlying `usize` value. 
The returned value is guaranteed + /// to not equal `usize::MAX`. + pub(crate) fn get(self) -> usize { + self.0.get().wrapping_sub(1) + } +} + +// We provide our own Debug impl because seeing the internal repr can be quite +// surprising if you aren't expecting it. e.g., 'NonMaxUsize(5)' vs just '5'. +impl core::fmt::Debug for NonMaxUsize { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "{:?}", self.get()) + } +} diff --git a/regex-lite/src/lib.rs b/regex-lite/src/lib.rs new file mode 100644 index 000000000..a5747a20f --- /dev/null +++ b/regex-lite/src/lib.rs @@ -0,0 +1,31 @@ +/*! +TODO +*/ + +#![allow(warnings)] +#![no_std] +#![forbid(unsafe_code)] +// #![deny(missing_docs, rustdoc::broken_intra_doc_links)] +#![warn(missing_debug_implementations)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] + +#[cfg(not(any(target_pointer_width = "32", target_pointer_width = "64")))] +compile_error!("not supported on non-{32,64}, please file an issue"); + +extern crate alloc; +#[cfg(any(test, feature = "std"))] +extern crate std; + +pub use self::{ + string::Regex, + util::{is_escapeable_character, is_meta_character}, +}; + +mod error; +mod hir; +mod int; +mod nfa; +mod pikevm; +mod string; +mod utf8; +mod util; diff --git a/regex-lite/src/nfa.rs b/regex-lite/src/nfa.rs new file mode 100644 index 000000000..30c680f2a --- /dev/null +++ b/regex-lite/src/nfa.rs @@ -0,0 +1,678 @@ +use core::{cell::RefCell, mem::size_of}; + +use alloc::{sync::Arc, vec, vec::Vec}; + +use crate::{ + error::Error, + hir::{self, Hir, HirKind}, + int::U32, +}; + +pub(crate) type StateID = u32; + +#[derive(Clone, Copy, Debug)] +pub(crate) struct Config { + size_limit: Option, +} + +impl Default for Config { + fn default() -> Config { + Config { size_limit: Some(10 * (1 << 20)) } + } +} + +#[derive(Clone)] +pub(crate) struct NFA { + /// The states that make up this NFA. + states: Vec, + /// The ID of the start state. 
+ start: StateID, + /// Whether this NFA can only match at the beginning of a haystack. + is_start_anchored: bool, + /// Whether this NFA can match the empty string. + is_match_empty: bool, + /// A map from capture group name to its corresponding index. + cap_name_to_index: CaptureNameMap, + /// A map from capture group index to the corresponding name, if one + /// exists. + cap_index_to_name: Vec>>, + /// Heap memory used indirectly by NFA states and other things (like the + /// various capturing group representations above). Since each state + /// might use a different amount of heap, we need to keep track of this + /// incrementally. + memory_extra: usize, +} + +impl NFA { + /// Creates a new NFA from the given configuration and HIR. + pub(crate) fn new(config: Config, hir: &Hir) -> Result { + Compiler::new(config).compile(hir) + } + + /// Returns the state corresponding to the given ID. + /// + /// # Panics + /// + /// If the ID does not refer to a valid state, then this panics. + pub(crate) fn state(&self, id: StateID) -> &State { + &self.states[id.as_usize()] + } + + /// Returns the total number of states in this NFA. + pub(crate) fn len(&self) -> usize { + self.states.len() + } + + /// Returns the ID of the starting state for this NFA. + pub(crate) fn start(&self) -> StateID { + self.start + } + + /// Returns the capture group index for the corresponding named group. + /// If no such group with the given name exists, then `None` is returned. + pub(crate) fn to_index(&self, name: &str) -> Option { + self.cap_name_to_index.get(name).cloned().map(|i| i.as_usize()) + } + + /// Returns the capture group name for the corresponding capture group + /// index. If no such group, then `None` is returned. + pub(crate) fn to_name(&self, index: usize) -> Option<&str> { + self.cap_index_to_name.get(index)?.as_deref() + } + + /// Returns an iterator over all of the capture groups, along with their + /// names if they exist, in this NFA. 
+ pub(crate) fn capture_names(&self) -> CaptureNames<'_> { + CaptureNames { it: self.cap_index_to_name.iter() } + } + + /// Returns the total number of capture groups, including the first and + /// implicit group, in this NFA. + pub(crate) fn group_len(&self) -> usize { + self.cap_index_to_name.len() + } + + /// Returns true if and only if this NFA can only match at the beginning of + /// a haystack. + pub(crate) fn is_start_anchored(&self) -> bool { + self.is_start_anchored + } + + /// Returns true if and only if this NFA can match the empty string. + pub(crate) fn is_match_empty(&self) -> bool { + self.is_match_empty + } + + /// Returns the heap memory usage, in bytes, used by this NFA. + fn memory_usage(&self) -> usize { + (self.states.len() * size_of::()) + + (self.cap_index_to_name.len() * size_of::>>()) + + self.memory_extra + } +} + +impl core::fmt::Debug for NFA { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + writeln!(f, "NFA(")?; + for (sid, state) in self.states.iter().enumerate() { + writeln!(f, "{:06?}: {:?}", sid, state)?; + } + writeln!(f, ")")?; + Ok(()) + } +} + +/// An iterator over all capture groups in an NFA. +/// +/// If a particular group has a name, then it is yielded. Otherwise, `None` +/// is yielded. +#[derive(Clone, Debug)] +pub(crate) struct CaptureNames<'a> { + it: core::slice::Iter<'a, Option>>, +} + +impl<'a> Iterator for CaptureNames<'a> { + type Item = Option<&'a str>; + + fn next(&mut self) -> Option> { + self.it.next().map(|n| n.as_deref()) + } +} + +#[derive(Clone, Eq, PartialEq)] +pub(crate) enum State { + Char { target: StateID, ch: char }, + Ranges { target: StateID, ranges: Vec<(char, char)> }, + Splits { targets: Vec, reverse: bool }, + Goto { target: StateID, look: Option }, + Capture { target: StateID, slot: u32 }, + Fail, + Match, +} + +impl State { + /// Returns the heap memory usage of this NFA state in bytes. + fn memory_usage(&self) -> usize { + match *self { + State::Char { .. 
} + | State::Goto { .. } + | State::Capture { .. } + | State::Fail { .. } + | State::Match => 0, + State::Splits { ref targets, .. } => { + targets.len() * size_of::() + } + State::Ranges { ref ranges, .. } => { + ranges.len() * size_of::<(char, char)>() + } + } + } + + /// Returns an iterator over the given split targets. The order of the + /// iterator yields elements in reverse when `reverse` is true. + pub(crate) fn iter_splits<'a>( + splits: &'a [StateID], + reverse: bool, + ) -> impl Iterator + 'a { + let mut it = splits.iter(); + core::iter::from_fn(move || { + if reverse { it.next_back() } else { it.next() }.copied() + }) + } +} + +impl core::fmt::Debug for State { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match *self { + State::Char { target, ch } => { + write!(f, "{:?} => {:?}", ch, target) + } + State::Ranges { target, ref ranges } => { + for (i, &(start, end)) in ranges.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{:?}-{:?} => {:?}", start, end, target)?; + } + Ok(()) + } + State::Splits { ref targets, reverse } => { + write!(f, "splits(")?; + for (i, sid) in + State::iter_splits(targets, reverse).enumerate() + { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{:?}", sid)?; + } + write!(f, ")") + } + State::Goto { target, look } => { + write!(f, "{:?} => {:?}", look, target) + } + State::Capture { target, slot } => { + write!(f, "capture(slot={:?}) => {:?}", slot, target,) + } + State::Fail => write!(f, "FAIL"), + State::Match => { + write!(f, "MATCH") + } + } + } +} + +/// A map from capture group name to its corresponding capture group index. +/// +/// We define a type alias here so that we can transparently use a `HashMap` +/// whenever it's available. We do so presumably because it's faster, although +/// there are no benchmarks verifying this. 
+#[cfg(feature = "std")] +type CaptureNameMap = std::collections::HashMap, u32>; +#[cfg(not(feature = "std"))] +type CaptureNameMap = alloc::collections::BTreeMap, u32>; + +#[derive(Debug)] +struct Compiler { + config: Config, + nfa: RefCell, +} + +impl Compiler { + fn new(config: Config) -> Compiler { + let nfa = RefCell::new(NFA { + states: vec![], + start: 0, + is_start_anchored: false, + is_match_empty: false, + cap_name_to_index: CaptureNameMap::default(), + cap_index_to_name: vec![], + memory_extra: 0, + }); + Compiler { config, nfa } + } + + fn compile(self, hir: &Hir) -> Result { + self.nfa.borrow_mut().is_start_anchored = hir.is_start_anchored(); + self.nfa.borrow_mut().is_match_empty = hir.is_match_empty(); + let compiled = self.c_capture(0, None, hir)?; + let mat = self.add(State::Match)?; + self.patch(compiled.end, mat); + self.nfa.borrow_mut().start = compiled.start; + Ok(self.nfa.into_inner()) + } + + fn c(&self, hir: &Hir) -> Result { + match *hir.kind() { + HirKind::Empty => self.c_empty(), + HirKind::Char(ch) => self.c_char(ch), + HirKind::Class(ref class) => self.c_class(class), + HirKind::Look(ref look) => self.c_look(look), + HirKind::Repetition(ref rep) => self.c_repetition(rep), + HirKind::Capture(ref cap) => { + self.c_capture(cap.index, cap.name.as_deref(), &cap.sub) + } + HirKind::Concat(ref subs) => { + self.c_concat(subs.iter().map(|s| self.c(s))) + } + HirKind::Alternation(ref subs) => { + self.c_alternation(subs.iter().map(|s| self.c(s))) + } + } + } + + /// Compile a "fail" state that can never be transitioned out of. + fn c_fail(&self) -> Result { + let id = self.add(State::Fail)?; + Ok(ThompsonRef { start: id, end: id }) + } + + /// Compile an "empty" state with one unconditional epsilon transition. + /// + /// Both the `start` and `end` locations point to the state created. + /// Callers will likely want to keep the `start`, but patch the `end` to + /// point to some other state. 
+ fn c_empty(&self) -> Result { + let id = self.add_empty()?; + Ok(ThompsonRef { start: id, end: id }) + } + + /// Compile the given literal char to an NFA. + fn c_char(&self, ch: char) -> Result { + let id = self.add(State::Char { target: 0, ch })?; + Ok(ThompsonRef { start: id, end: id }) + } + + /// Compile the given character class into an NFA. + /// + /// If the class is empty, then this compiles to a `Fail` state. + fn c_class(&self, class: &hir::Class) -> Result { + let id = if class.ranges.is_empty() { + // Technically using an explicit fail state probably isn't + // necessary. Because if you try to match against an empty Ranges, + // then it should turn up with nothing regardless of input, and + // thus "acts" like a Fail state. But it's better to be more + // explicit, and there's no real cost to doing so. + self.add(State::Fail) + } else { + let ranges = + class.ranges.iter().map(|r| (r.start, r.end)).collect(); + self.add(State::Ranges { target: 0, ranges }) + }?; + Ok(ThompsonRef { start: id, end: id }) + } + + /// Compile the given HIR look-around assertion to an NFA look-around + /// assertion. + fn c_look(&self, look: &hir::Look) -> Result { + let id = self.add(State::Goto { target: 0, look: Some(*look) })?; + Ok(ThompsonRef { start: id, end: id }) + } + + /// Compile the given repetition expression. This handles all types of + /// repetitions and greediness. + fn c_repetition( + &self, + rep: &hir::Repetition, + ) -> Result { + match (rep.min, rep.max) { + (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy), + (min, None) => self.c_at_least(&rep.sub, rep.greedy, min), + (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min), + (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max), + } + } + + /// Compile the given expression such that it matches at least `min` times, + /// but no more than `max` times. + /// + /// When `greedy` is true, then the preference is for the expression to + /// match as much as possible. 
Otheriwse, it will match as little as + /// possible. + fn c_bounded( + &self, + hir: &Hir, + greedy: bool, + min: u32, + max: u32, + ) -> Result { + let prefix = self.c_exactly(hir, min)?; + if min == max { + return Ok(prefix); + } + + // It is tempting here to compile the rest here as a concatenation + // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it + // were `aaa?a?a?`. The problem here is that it leads to this program: + // + // >000000: 61 => 01 + // 000001: 61 => 02 + // 000002: union(03, 04) + // 000003: 61 => 04 + // 000004: union(05, 06) + // 000005: 61 => 06 + // 000006: union(07, 08) + // 000007: 61 => 08 + // 000008: MATCH + // + // And effectively, once you hit state 2, the epsilon closure will + // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better + // to instead compile it like so: + // + // >000000: 61 => 01 + // 000001: 61 => 02 + // 000002: union(03, 08) + // 000003: 61 => 04 + // 000004: union(05, 08) + // 000005: 61 => 06 + // 000006: union(07, 08) + // 000007: 61 => 08 + // 000008: MATCH + // + // So that the epsilon closure of state 2 is now just 3 and 8. + let empty = self.add_empty()?; + let mut prev_end = prefix.end; + for _ in min..max { + let splits = + self.add(State::Splits { targets: vec![], reverse: !greedy })?; + let compiled = self.c(hir)?; + self.patch(prev_end, splits)?; + self.patch(splits, compiled.start)?; + self.patch(splits, empty)?; + prev_end = compiled.end; + } + self.patch(prev_end, empty)?; + Ok(ThompsonRef { start: prefix.start, end: empty }) + } + + /// Compile the given expression such that it may be matched `n` or more + /// times, where `n` can be any integer. (Although a particularly large + /// integer is likely to run afoul of any configured size limits.) + /// + /// When `greedy` is true, then the preference is for the expression to + /// match as much as possible. Otheriwse, it will match as little as + /// possible. 
+ fn c_at_least( + &self, + hir: &Hir, + greedy: bool, + n: u32, + ) -> Result { + if n == 0 { + // When the expression cannot match the empty string, then we + // can get away with something much simpler: just one 'alt' + // instruction that optionally repeats itself. But if the expr + // can match the empty string... see below. + if !hir.is_match_empty() { + let splits = self.add(State::Splits { + targets: vec![], + reverse: !greedy, + })?; + let compiled = self.c(hir)?; + self.patch(splits, compiled.start)?; + self.patch(compiled.end, splits)?; + return Ok(ThompsonRef { start: splits, end: splits }); + } + + // What's going on here? Shouldn't x* be simpler than this? It + // turns out that when implementing leftmost-first (Perl-like) + // match semantics, x* results in an incorrect preference order + // when computing the transitive closure of states if and only if + // 'x' can match the empty string. So instead, we compile x* as + // (x+)?, which preserves the correct preference order. 
+ // + // See: /~https://github.com/rust-lang/regex/issues/779 + let compiled = self.c(hir)?; + let plus = + self.add(State::Splits { targets: vec![], reverse: !greedy })?; + self.patch(compiled.end, plus)?; + self.patch(plus, compiled.start)?; + + let question = + self.add(State::Splits { targets: vec![], reverse: !greedy })?; + let empty = self.add_empty()?; + self.patch(question, compiled.start)?; + self.patch(question, empty)?; + self.patch(plus, empty)?; + Ok(ThompsonRef { start: question, end: empty }) + } else if n == 1 { + let compiled = self.c(hir)?; + let splits = + self.add(State::Splits { targets: vec![], reverse: !greedy })?; + self.patch(compiled.end, splits)?; + self.patch(splits, compiled.start)?; + Ok(ThompsonRef { start: compiled.start, end: splits }) + } else { + let prefix = self.c_exactly(hir, n - 1)?; + let last = self.c(hir)?; + let splits = + self.add(State::Splits { targets: vec![], reverse: !greedy })?; + self.patch(prefix.end, last.start)?; + self.patch(last.end, splits)?; + self.patch(splits, last.start)?; + Ok(ThompsonRef { start: prefix.start, end: splits }) + } + } + + /// Compile the given expression such that it may be matched zero or one + /// times. + /// + /// When `greedy` is true, then the preference is for the expression to + /// match as much as possible. Otheriwse, it will match as little as + /// possible. + fn c_zero_or_one( + &self, + hir: &Hir, + greedy: bool, + ) -> Result { + let splits = + self.add(State::Splits { targets: vec![], reverse: !greedy })?; + let compiled = self.c(hir)?; + let empty = self.add_empty()?; + self.patch(splits, compiled.start)?; + self.patch(splits, empty)?; + self.patch(compiled.end, empty)?; + Ok(ThompsonRef { start: splits, end: empty }) + } + + /// Compile the given HIR expression exactly `n` times. 
+ fn c_exactly(&self, hir: &Hir, n: u32) -> Result { + self.c_concat((0..n).map(|_| self.c(hir))) + } + + /// Compile the given expression and insert capturing states at the + /// beginning and end of it. The slot for the capture states is computed + /// from the index. + fn c_capture( + &self, + index: u32, + name: Option<&str>, + hir: &Hir, + ) -> Result { + let Some(slot) = index.checked_mul(1) else { + return Err(Error::new("capture group slots exhausted")); + }; + let start = self.add(State::Capture { target: 0, slot })?; + let inner = self.c(hir)?; + let Some(slot) = slot.checked_add(1) else { + return Err(Error::new("capture group slots exhausted")); + }; + let end = self.add(State::Capture { target: 0, slot })?; + self.patch(start, inner.start)?; + self.patch(inner.end, end)?; + + assert_eq!( + index.as_usize(), + self.nfa.borrow().cap_index_to_name.len(), + "captures compiled in wrong order" + ); + if let Some(name) = name { + let name = Arc::from(name); + let mut nfa = self.nfa.borrow_mut(); + nfa.cap_name_to_index.insert(Arc::clone(&name), index); + nfa.cap_index_to_name.push(Some(Arc::clone(&name))); + nfa.memory_extra += name.len() + size_of::(); + } else { + self.nfa.borrow_mut().cap_index_to_name.push(None); + } + Ok(ThompsonRef { start, end }) + } + + /// Compile a concatenation of the sub-expressions yielded by the given + /// iterator. If the iterator yields no elements, then this compiles down + /// to an "empty" state that always matches. + fn c_concat(&self, mut it: I) -> Result + where + I: Iterator>, + { + let ThompsonRef { start, mut end } = match it.next() { + Some(result) => result?, + None => return self.c_empty(), + }; + for result in it { + let compiled = result?; + self.patch(end, compiled.start)?; + end = compiled.end; + } + Ok(ThompsonRef { start, end }) + } + + /// Compile an alternation, where each element yielded by the given + /// iterator represents an item in the alternation. 
If the iterator yields + /// no elements, then this compiles down to a "fail" state. + /// + /// In an alternation, expressions appearing earlier are "preferred" at + /// match time over expressions appearing later. (This is currently always + /// true, as this crate only supports leftmost-first semantics.) + fn c_alternation(&self, mut it: I) -> Result + where + I: Iterator>, + { + let first = match it.next() { + None => return self.c_fail(), + Some(result) => result?, + }; + let second = match it.next() { + None => return Ok(first), + Some(result) => result?, + }; + + let splits = + self.add(State::Splits { targets: vec![], reverse: false })?; + let end = self.add_empty()?; + self.patch(splits, first.start)?; + self.patch(first.end, end)?; + self.patch(splits, second.start)?; + self.patch(second.end, end)?; + for result in it { + let compiled = result?; + self.patch(splits, compiled.start)?; + self.patch(compiled.end, end)?; + } + Ok(ThompsonRef { start: splits, end }) + } + + /// A convenience routine for adding an empty state, also known as an + /// unconditional epsilon transition. These are quite useful for making + /// NFA construction simpler. + /// + /// (In the regex crate, we do a second pass to remove these, but don't + /// bother with that here.) + fn add_empty(&self) -> Result { + self.add(State::Goto { target: 0, look: None }) + } + + /// The common implementation of "add a state." It handles the common + /// error cases of state ID exhausting (by owning state ID allocation) and + /// whether the size limit has been exceeded. + fn add(&self, state: State) -> Result { + let id = u32::try_from(self.nfa.borrow().states.len()) + .map_err(|_| Error::new("exhausted state IDs, too many states"))?; + self.nfa.borrow_mut().memory_extra += state.memory_usage(); + self.nfa.borrow_mut().states.push(state); + self.check_size_limit()?; + Ok(id) + } + + /// Add a transition from one state to another. 
+ /// + /// This routine is called "patch" since it is very common to add the + /// states you want, typically with "dummy" state ID transitions, and then + /// "patch" in the real state IDs later. This is because you don't always + /// know all of the necessary state IDs to add because they might not + /// exist yet. + /// + /// # Errors + /// + /// This may error if patching leads to an increase in heap usage beyond + /// the configured size limit. Heap usage only grows when patching adds a + /// new transition (as in the case of a "splits" state). + fn patch(&self, from: StateID, to: StateID) -> Result<(), Error> { + let mut new_memory_extra = self.nfa.borrow().memory_extra; + match self.nfa.borrow_mut().states[from.as_usize()] { + State::Char { ref mut target, .. } => { + *target = to; + } + State::Ranges { ref mut target, .. } => { + *target = to; + } + State::Splits { ref mut targets, .. } => { + targets.push(to); + new_memory_extra += size_of::(); + } + State::Goto { ref mut target, .. } => { + *target = to; + } + State::Capture { ref mut target, .. } => { + *target = to; + } + State::Fail | State::Match => {} + } + if new_memory_extra != self.nfa.borrow().memory_extra { + self.nfa.borrow_mut().memory_extra = new_memory_extra; + self.check_size_limit()?; + } + Ok(()) + } + + /// Checks that the current heap memory usage of the NFA being compiled + /// doesn't exceed the configured size limit. If it does, an error is + /// returned. + fn check_size_limit(&self) -> Result<(), Error> { + if let Some(limit) = self.config.size_limit { + if self.nfa.borrow().memory_usage() > limit { + return Err(Error::new("compiled regex exceeded size limit")); + } + } + Ok(()) + } +} + +/// A value that represents the result of compiling a sub-expression of a +/// regex's HIR. Specifically, this represents a sub-graph of the NFA that +/// has an initial state at `start` and a final state at `end`. 
+#[derive(Clone, Copy, Debug)] +struct ThompsonRef { + start: StateID, + end: StateID, +} diff --git a/regex-lite/src/pikevm.rs b/regex-lite/src/pikevm.rs new file mode 100644 index 000000000..5af4d6198 --- /dev/null +++ b/regex-lite/src/pikevm.rs @@ -0,0 +1,831 @@ +use alloc::{vec, vec::Vec}; + +use crate::{ + int::{NonMaxUsize, U32}, + nfa::{State, StateID, NFA}, + utf8, +}; + +/// A PikeVM searcher. +/// +/// A PikeVM uses the standard Thompson NFA linear time search algorithm, but +/// augmented to support tracking the offsets of matching capture groups. +#[derive(Clone, Debug)] +pub(crate) struct PikeVM { + nfa: NFA, +} + +impl PikeVM { + /// Create a new PikeVM searcher that uses the given NFA. + pub(crate) fn new(nfa: NFA) -> PikeVM { + PikeVM { nfa } + } + + /// Return the underlying NFA used by this PikeVM. + pub(crate) fn nfa(&self) -> &NFA { + &self.nfa + } + + /// The implementation of standard leftmost search. + /// + /// Capturing group spans are written to `slots`, but only if requested. + /// `slots` can be any length. Any slot in the NFA that is activated but + /// which is out of bounds for the given `slots` is ignored. + pub(crate) fn search( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + earliest: bool, + slots: &mut [Option], + ) -> bool { + cache.setup_search(slots.len()); + if start > end { + return false; + } + // Why do we even care about this? Well, in our `slots` representation, + // we use usize::MAX as a sentinel to indicate "no match." This isn't + // problematic so long as our haystack doesn't have a maximal length. + // Byte slices are guaranteed by Rust to have a length that fits into + // isize, and so this assert should always pass. But we put it here to + // make our assumption explicit. 
+ assert!( + haystack.len() < core::usize::MAX, + "byte slice lengths must be less than usize MAX", + ); + + let start_id = self.nfa().start(); + let anchored = self.nfa().is_start_anchored(); + + let Cache { ref mut stack, ref mut curr, ref mut next } = cache; + let mut matched = false; + // Yes, our search doesn't end at `end`, but includes it. This is + // necessary because matches are delayed by one byte. The delay is used + // to handle look-behind assertions. In the case of the PikeVM, the + // delay is implemented by not considering a match to exist until it + // is visited in `nexts`. Technically, we know a match exists in the + // previous iteration via `epsilon_closure`. + let mut at = start; + while at <= end { + // If we have no states left to visit, then there are some cases + // where we know we can quit early or even skip ahead. + if curr.set.is_empty() { + // We have a match so we can quit. + if matched { + break; + } + // If we're running an anchored search and we've advanced + // beyond the start position with no other states to try, then + // we will never observe a match and thus can stop. + if anchored && at > start { + break; + } + } + // Instead of using a hypothetical unanchored start state in the + // NFA (which doesn't exist, but we could add it), we actually + // always use its anchored starting state. As a result, when doing + // an unanchored search, we need to simulate our own '(?s:.)*?' + // prefix, to permit a match to appear anywhere. + // + // Now, we don't *have* to do things this way. We could create + // a proper unanchored start state in the NFA and do one + // `epsilon_closure` call from that starting state before the main + // loop here. And that is just as correct. However, it turns out to + // be slower than our approach here because it slightly increases + // the cost of processing each byte by requiring us to visit + // more NFA states to deal with the additional NFA states in the + // unanchored prefix. 
By simulating it explicitly here, we lower + // those costs substantially. The cost is itself small, but it adds + // up for large haystacks. + // + // In order to simulate the '(?s:.)*?' prefix---which is not + // greedy---we are careful not to perform an epsilon closure on + // the start state if we already have a match. Namely, if we + // did otherwise, we would never reach a terminating condition + // because there would always be additional states to process. + if !matched { + // Since we are adding to the 'curr' active states and since + // this is for the start ID, we use a slots slice that is + // guaranteed to have the right length but where every element + // is absent. This is exactly what we want, because this + // epsilon closure is responsible for simulating an unanchored + // '(?s:.)*?' prefix. It is specifically outside of any + // capturing groups, and thus, using slots that are always + // absent is correct. + // + // Note though that we can't just use `&mut []` here, since + // this epsilon closure may traverse through `Capture` states + // transitions, and thus must be able to write offsets to the + // slots given which are later copied to slot values in `curr`. + let slots = next.slot_table.all_absent(); + self.epsilon_closure( + stack, slots, curr, haystack, at, start_id, + ); + } + let (ch, len) = utf8::decode_lossy(&haystack[at..]); + matched = + self.nexts(stack, curr, next, haystack, at, ch, len, slots); + // Unless the caller asked us to return early, we need to mush + // on to see if we can extend our match. (But note that 'nexts' + // will quit right after seeing a match, as is consistent with + // leftmost-first match priority.) + if (earliest && matched) || len == 0 { + break; + } + core::mem::swap(curr, next); + next.set.clear(); + at += len; + } + matched + } + + /// Process the active states in 'curr' to find the states (written to + /// 'next') we should process for the next byte in the haystack. 
+ /// + /// 'stack' is used to perform a depth first traversal of the NFA when + /// computing an epsilon closure. + /// + /// When a match is found, the slots for that match state (in 'curr') are + /// copied to 'caps'. Moreover, once a match is seen, processing for 'curr' + /// stops (unless the PikeVM was configured with MatchKind::All semantics). + /// + /// `at_ch` is the Unicode scalar value whose UTF-8 encoding begins at `at` + /// in `haystack`. + /// + /// `at_len` is the number of bytes consumed by `at_ch`. This is usually + /// equal to `at_ch.len_utf8()`, but not always. For example, in the case + /// where `at_ch` is the replacement codepoint that results from decoding + /// invalid UTF-8. In that case, `at_len` can be 1, 2 or 3. + fn nexts( + &self, + stack: &mut Vec, + curr: &mut ActiveStates, + next: &mut ActiveStates, + haystack: &[u8], + at: usize, + at_ch: char, + at_len: usize, + slots: &mut [Option], + ) -> bool { + let ActiveStates { ref set, ref mut slot_table } = *curr; + for sid in set.iter() { + if self.next( + stack, slot_table, next, haystack, at, at_ch, at_len, sid, + ) { + slots.copy_from_slice(slot_table.for_state(sid)); + return true; + } + } + false + } + + /// Starting from `sid`, if the position `at` in the `haystack` has a + /// transition defined out of `sid`, then add the state transitioned to and + /// its epsilon closure to the `next` set of states to explore. + /// + /// `stack` is used by the epsilon closure computation to perform a depth + /// first traversal of the NFA. + /// + /// `curr_slot_table` should be the table of slots for the current set of + /// states being explored. If there is a transition out of `sid`, then + /// sid's row in the slot table is used to perform the epsilon closure. + /// + /// `at_ch` is the Unicode scalar value whose UTF-8 encoding begins at `at` + /// in `haystack`. The caller provides it so that this routine doesn't + /// need to re-decode it. 
(Since it's expected that this routine is called + /// multiple times for each position.) + /// + /// `at_len` is the number of bytes consumed by `at_ch`. This is usually + /// equal to `at_ch.len_utf8()`, but not always. For example, in the case + /// where `at_ch` is the replacement codepoint that results from decoding + /// invalid UTF-8. In that case, `at_len` can be 1, 2 or 3. + fn next( + &self, + stack: &mut Vec, + curr_slot_table: &mut SlotTable, + next: &mut ActiveStates, + haystack: &[u8], + at: usize, + at_ch: char, + at_len: usize, + sid: StateID, + ) -> bool { + match *self.nfa.state(sid) { + State::Fail + | State::Goto { .. } + | State::Splits { .. } + | State::Capture { .. } => false, + State::Char { target, ch } => { + if at_ch == ch && at_len > 0 { + let slots = curr_slot_table.for_state(sid); + // OK because `at_len` is always derived from the number + // of bytes read from `at` that make up `at_ch`. So this + // will never wrap. + let at = at.wrapping_add(at_len); + self.epsilon_closure( + stack, slots, next, haystack, at, target, + ); + } + false + } + State::Ranges { target, ref ranges } => { + for (start, end) in ranges.iter().copied() { + if start > at_ch { + break; + } else if start <= at_ch && at_ch <= end { + if at_len == 0 { + return false; + } + let slots = curr_slot_table.for_state(sid); + // OK because `at_len` is always derived from the + // number of bytes read from `at` that make up `at_ch`. + // So this will never wrap. + let at = at.wrapping_add(at_len); + self.epsilon_closure( + stack, slots, next, haystack, at, target, + ); + } + } + false + } + State::Match => true, + } + } + + /// Compute the epsilon closure of `sid`, writing the closure into `next` + /// while copying slot values from `curr_slots` into corresponding states + /// in `next`. `curr_slots` should be the slot values corresponding to + /// `sid`. 
+ /// + /// The given `stack` is used to perform a depth first traversal of the + /// NFA by recursively following all epsilon transitions out of `sid`. + /// Conditional epsilon transitions are followed if and only if they are + /// satisfied for the position `at` in the `input` haystack. + /// + /// While this routine may write to `curr_slots`, once it returns, any + /// writes are undone and the original values (even if absent) are + /// restored. + fn epsilon_closure( + &self, + stack: &mut Vec, + curr_slots: &mut [Option], + next: &mut ActiveStates, + haystack: &[u8], + at: usize, + sid: StateID, + ) { + stack.push(FollowEpsilon::Explore(sid)); + while let Some(frame) = stack.pop() { + match frame { + FollowEpsilon::RestoreCapture { slot, offset } => { + curr_slots[slot.as_usize()] = offset; + } + FollowEpsilon::Explore(sid) => { + self.epsilon_closure_explore( + stack, curr_slots, next, haystack, at, sid, + ); + } + } + } + } + + /// Explore all of the epsilon transitions out of `sid`. This is mostly + /// split out from `epsilon_closure` in order to clearly delineate + /// the actual work of computing an epsilon closure from the stack + /// book-keeping. + /// + /// This will push any additional explorations needed on to `stack`. + /// + /// `curr_slots` should refer to the slots for the currently active NFA + /// state. That is, the current state we are stepping through. These + /// slots are mutated in place as new `Captures` states are traversed + /// during epsilon closure, but the slots are restored to their original + /// values once the full epsilon closure is completed. The ultimate use of + /// `curr_slots` is to copy them to the corresponding `next_slots`, so that + /// the capturing group spans are forwarded from the currently active state + /// to the next. + /// + /// `next` refers to the next set of active states. Computing an epsilon + /// closure may increase the next set of active states. 
+ /// + /// `haystack` refers to the what we're searching and `at` refers to the + /// current position in the haystack. These are used to check whether + /// conditional epsilon transitions (like look-around) are satisfied at + /// the current position. If they aren't, then the epsilon closure won't + /// include them. + fn epsilon_closure_explore( + &self, + stack: &mut Vec, + curr_slots: &mut [Option], + next: &mut ActiveStates, + haystack: &[u8], + at: usize, + mut sid: StateID, + ) { + // We can avoid pushing some state IDs on to our stack in precisely + // the cases where a 'push(x)' would be immediately followed by a 'x + // = pop()'. This is achieved by this outer-loop. We simply set 'sid' + // to be the next state ID we want to explore once we're done with + // our initial exploration. In practice, this avoids a lot of stack + // thrashing. + loop { + // Record this state as part of our next set of active states. If + // we've already explored it, then no need to do it again. + if !next.set.insert(sid) { + return; + } + match *self.nfa.state(sid) { + State::Fail + | State::Match { .. } + | State::Char { .. } + | State::Ranges { .. } => { + next.slot_table.for_state(sid).copy_from_slice(curr_slots); + return; + } + State::Goto { target, look: None } => { + sid = target; + } + State::Goto { target, look: Some(look) } => { + if !look.is_match(haystack, at) { + return; + } + sid = target; + } + State::Splits { ref targets, reverse: false } => { + sid = match targets.get(0) { + None => return, + Some(&sid) => sid, + }; + stack.extend( + targets[1..] 
+ .iter() + .copied() + .rev() + .map(FollowEpsilon::Explore), + ); + } + State::Splits { ref targets, reverse: true } => { + sid = match targets.last() { + None => return, + Some(&sid) => sid, + }; + stack.extend( + targets[..targets.len() - 1] + .iter() + .copied() + .map(FollowEpsilon::Explore), + ); + } + State::Capture { target, slot } => { + // There's no need to do anything with slots that + // ultimately won't be copied into the caller-provided + // 'Captures' value. So we just skip dealing with them at + // all. + if slot.as_usize() < curr_slots.len() { + stack.push(FollowEpsilon::RestoreCapture { + slot, + offset: curr_slots[slot.as_usize()], + }); + // OK because length of a slice must fit into an isize. + curr_slots[slot.as_usize()] = + Some(NonMaxUsize::new(at).unwrap()); + } + sid = target; + } + } + } + } +} + +/// A cache represents mutable state that a `PikeVM` requires during a search. +/// +/// For a given `PikeVM`, its corresponding cache may be created either via +/// `PikeVM::create_cache`, or via `Cache::new`. They are equivalent in every +/// way, except the former does not require explicitly importing `Cache`. +/// +/// A particular `Cache` is coupled with the `PikeVM` from which it was +/// created. It may only be used with that `PikeVM`. A cache and its +/// allocations may be re-purposed via `Cache::reset`, in which case, it can +/// only be used with the new `PikeVM` (and not the old one). +#[derive(Clone, Debug)] +pub(crate) struct Cache { + /// Stack used while computing epsilon closure. This effectively lets us + /// move what is more naturally expressed through recursion to a stack + /// on the heap. + stack: Vec, + /// The current active states being explored for the current byte in the + /// haystack. + curr: ActiveStates, + /// The next set of states we're building that will be explored for the + /// next byte in the haystack. + next: ActiveStates, +} + +impl Cache { + /// Create a new `PikeVM` cache. 
+ /// + /// A potentially more convenient routine to create a cache is + /// `PikeVM::create_cache`, as it does not require also importing the + /// `Cache` type. + /// + /// If you want to reuse the returned `Cache` with some other `PikeVM`, + /// then you must call `Cache::reset` with the desired `PikeVM`. + pub(crate) fn new(re: &PikeVM) -> Cache { + Cache { + stack: vec![], + curr: ActiveStates::new(re), + next: ActiveStates::new(re), + } + } + + /// Reset this cache such that it can be used for searching with a + /// different `PikeVM`. + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different `PikeVM`. + pub(crate) fn reset(&mut self, re: &PikeVM) { + self.curr.reset(re); + self.next.reset(re); + } + + /// Returns the heap memory usage, in bytes, of this cache. + /// + /// This does **not** include the stack size used up by this cache. To + /// compute that, use `std::mem::size_of::()`. + pub(crate) fn memory_usage(&self) -> usize { + (self.stack.len() * core::mem::size_of::()) + + self.curr.memory_usage() + + self.next.memory_usage() + } + + /// Clears this cache. This should be called at the start of every search + /// to ensure we start with a clean slate. + /// + /// This also sets the length of the capturing groups used in the current + /// search. This permits an optimization where by 'SlotTable::for_state' + /// only returns the number of slots equivalent to the number of slots + /// given in the 'Captures' value. This may be less than the total number + /// of possible slots, e.g., when one only wants to track overall match + /// offsets. This in turn permits less copying of capturing group spans + /// in the PikeVM. + fn setup_search(&mut self, captures_slot_len: usize) { + self.stack.clear(); + self.curr.setup_search(captures_slot_len); + self.next.setup_search(captures_slot_len); + } +} + +/// A set of active states used to "simulate" the execution of an NFA via the +/// PikeVM. 
+/// +/// There are two sets of these used during NFA simulation. One set corresponds +/// to the "current" set of states being traversed for the current position +/// in a haystack. The other set corresponds to the "next" set of states being +/// built, which will become the new "current" set for the next position in the +/// haystack. These two sets correspond to CLIST and NLIST in Thompson's +/// original paper regexes: https://dl.acm.org/doi/pdf/10.1145/363347.363387 +/// +/// In addition to representing a set of NFA states, this also maintains slot +/// values for each state. These slot values are what turn the NFA simulation +/// into the "Pike VM." Namely, they track capturing group values for each +/// state. During the computation of epsilon closure, we copy slot values from +/// states in the "current" set to the "next" set. Eventually, once a match +/// is found, the slot values for that match state are what we write to the +/// caller provided slots. +#[derive(Clone, Debug)] +struct ActiveStates { + /// The set of active NFA states. This set preserves insertion order, which + /// is critical for simulating the match semantics of backtracking regex + /// engines. + set: SparseSet, + /// The slots for every NFA state, where each slot stores a (possibly + /// absent) offset. Every capturing group has two slots. One for a start + /// offset and one for an end offset. + slot_table: SlotTable, +} + +impl ActiveStates { + /// Create a new set of active states for the given PikeVM. The active + /// states returned may only be used with the given PikeVM. (Use 'reset' + /// to re-purpose the allocation for a different PikeVM.) + fn new(re: &PikeVM) -> ActiveStates { + let mut active = ActiveStates { + set: SparseSet::new(0), + slot_table: SlotTable::new(), + }; + active.reset(re); + active + } + + /// Reset this set of active states such that it can be used with the given + /// PikeVM (and only that PikeVM). 
+ fn reset(&mut self, re: &PikeVM) { + self.set.resize(re.nfa().len()); + self.slot_table.reset(re); + } + + /// Return the heap memory usage, in bytes, used by this set of active + /// states. + /// + /// This does not include the stack size of this value. + fn memory_usage(&self) -> usize { + self.set.memory_usage() + self.slot_table.memory_usage() + } + + /// Setup this set of active states for a new search. The given slot + /// length should be the number of slots in a caller provided 'Captures' + /// (and may be zero). + fn setup_search(&mut self, captures_slot_len: usize) { + self.set.clear(); + self.slot_table.setup_search(captures_slot_len); + } +} + +/// A table of slots, where each row represent a state in an NFA. Thus, the +/// table has room for storing slots for every single state in an NFA. +/// +/// This table is represented with a single contiguous allocation. In general, +/// the notion of "capturing group" doesn't really exist at this level of +/// abstraction, hence the name "slot" instead. (Indeed, every capturing group +/// maps to a pair of slots, one for the start offset and one for the end +/// offset.) Slots are indexed by the `Captures` NFA state. +#[derive(Clone, Debug)] +struct SlotTable { + /// The actual table of offsets. + table: Vec>, + /// The number of slots per state, i.e., the table's stride or the length + /// of each row. + slots_per_state: usize, + /// The number of slots in the caller-provided `Captures` value for the + /// current search. Setting this to `slots_per_state` is always correct, + /// but may be wasteful. + slots_for_captures: usize, +} + +impl SlotTable { + /// Create a new slot table. + /// + /// One should call 'reset' with the corresponding PikeVM before use. + fn new() -> SlotTable { + SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 } + } + + /// Reset this slot table such that it can be used with the given PikeVM + /// (and only that PikeVM). 
+ fn reset(&mut self, re: &PikeVM) { + let nfa = re.nfa(); + // OK because NFA construction would have failed if this overflowed. + self.slots_per_state = nfa.group_len().checked_mul(2).unwrap(); + // This is always correct, but may be reduced for a particular search + // if fewer slots were given by the caller, e.g., none at all or only + // slots for tracking the overall match instead of all slots for every + // group. + self.slots_for_captures = self.slots_per_state; + let len = nfa + .len() + // We add 1 so that our last row is always empty. We use it as + // "scratch" space for computing the epsilon closure off of the + // starting state. + .checked_add(1) + .and_then(|x| x.checked_mul(self.slots_per_state)) + // It seems like this could actually panic on legitimate inputs + // on 32-bit targets. Should we somehow convert this to an error? + // What about something similar for the lazy DFA cache? If you're + // tripping this assert, please file a bug. + .expect("slot table length doesn't overflow"); + self.table.resize(len, None); + } + + /// Return the heap memory usage, in bytes, used by this slot table. + /// + /// This does not include the stack size of this value. + fn memory_usage(&self) -> usize { + self.table.len() * core::mem::size_of::>() + } + + /// Perform any per-search setup for this slot table. + /// + /// In particular, this sets the length of the number of slots used in the + /// slots given by the caller (if any at all). This number may be smaller + /// than the total number of slots available, e.g., when the caller is only + /// interested in tracking the overall match and not the spans of every + /// matching capturing group. Only tracking the overall match can save a + /// substantial amount of time copying capturing spans during a search. + fn setup_search(&mut self, captures_slot_len: usize) { + self.slots_for_captures = captures_slot_len; + } + + /// Return a mutable slice of the slots for the given state. 
+ /// + /// Note that the length of the slice returned may be less than the total + /// number of slots available for this state. In particular, the length + /// always matches the number of slots indicated via `setup_search`. + fn for_state(&mut self, sid: StateID) -> &mut [Option] { + let i = sid.as_usize() * self.slots_per_state; + &mut self.table[i..i + self.slots_for_captures] + } + + /// Return a slice of slots of appropriate length where every slot offset + /// is guaranteed to be absent. This is useful in cases where you need to + /// compute an epsilon closure outside of the user supplied regex, and thus + /// never want it to have any capturing slots set. + fn all_absent(&mut self) -> &mut [Option] { + let i = self.table.len() - self.slots_per_state; + &mut self.table[i..i + self.slots_for_captures] + } +} + +/// Represents a stack frame for use while computing an epsilon closure. +/// +/// (An "epsilon closure" refers to the set of reachable NFA states from a +/// single state without consuming any input. That is, the set of all epsilon +/// transitions not only from that single state, but from every other state +/// reachable by an epsilon transition as well. This is why it's called a +/// "closure.") +/// +/// Computing the epsilon closure in a Thompson NFA proceeds via a depth +/// first traversal over all epsilon transitions from a particular state. +/// (A depth first traversal is important because it emulates the same priority +/// of matches that is typically found in backtracking regex engines.) This +/// depth first traversal is naturally expressed using recursion, but to avoid +/// a call stack size proportional to the size of a regex, we put our stack on +/// the heap instead. +/// +/// This stack thus consists of call frames. The typical call frame is +/// `Explore`, which instructs epsilon closure to explore the epsilon +/// transitions from that state. 
(Subsequent epsilon transitions are then +/// pushed on to the stack as more `Explore` frames.) If the state ID being +/// explored has no epsilon transitions, then the capturing group slots are +/// copied from the original state that sparked the epsilon closure (from the +/// 'step' routine) to the state ID being explored. This way, capturing group +/// slots are forwarded from the previous state to the next. +/// +/// The other stack frame, `RestoreCaptures`, instructs the epsilon closure to +/// set the position for a particular slot back to some particular offset. This +/// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will +/// set the offset of the slot indicated in `Capture` to the current offset, +/// and then push the old offset on to the stack as a `RestoreCapture` frame. +/// Thus, the new offset is only used until the epsilon closure reverts back to +/// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon +/// transition its "scope" to only states that come "after" it during depth +/// first traversal. +#[derive(Clone, Debug)] +enum FollowEpsilon { + /// Explore the epsilon transitions from a state ID. + Explore(StateID), + /// Reset the given `slot` to the given `offset` (which might be `None`). + RestoreCapture { slot: u32, offset: Option }, +} + +/// A sparse set used for representing ordered NFA states. +/// +/// This supports constant time addition and membership testing. Clearing an +/// entire set can also be done in constant time. Iteration yields elements +/// in the order in which they were inserted. +/// +/// The data structure is based on: https://research.swtch.com/sparse +/// Note though that we don't actually use uninitialized memory. We generally +/// reuse sparse sets, so the initial allocation cost is bareable. However, its +/// other properties listed above are extremely useful. +#[derive(Clone)] +struct SparseSet { + /// The number of elements currently in this set. 
+ len: usize, + /// Dense contains the ids in the order in which they were inserted. + dense: Vec, + /// Sparse maps ids to their location in dense. + /// + /// A state ID is in the set if and only if + /// sparse[id] < len && id == dense[sparse[id]]. + /// + /// Note that these are indices into 'dense'. It's a little weird to use + /// StateID here, but we know our length can never exceed the bounds of + /// StateID (enforced by 'resize') and StateID will be at most 4 bytes + /// where as a usize is likely double that in most cases. + sparse: Vec, +} + +impl SparseSet { + /// Create a new sparse set with the given capacity. + /// + /// Sparse sets have a fixed size and they cannot grow. Attempting to + /// insert more distinct elements than the total capacity of the set will + /// result in a panic. + /// + /// This panics if the capacity given is bigger than `StateID::LIMIT`. + fn new(capacity: usize) -> SparseSet { + let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] }; + set.resize(capacity); + set + } + + /// Resizes this sparse set to have the new capacity given. + /// + /// This set is automatically cleared. + /// + /// This panics if the capacity given is bigger than `StateID::LIMIT`. + fn resize(&mut self, new_capacity: usize) { + assert!( + new_capacity <= u32::MAX.as_usize(), + "sparse set capacity cannot excced {:?}", + u32::MAX, + ); + self.clear(); + self.dense.resize(new_capacity, 0); + self.sparse.resize(new_capacity, 0); + } + + /// Returns the capacity of this set. + /// + /// The capacity represents a fixed limit on the number of distinct + /// elements that are allowed in this set. The capacity cannot be changed. + fn capacity(&self) -> usize { + self.dense.len() + } + + /// Returns the number of elements in this set. + fn len(&self) -> usize { + self.len + } + + /// Returns true if and only if this set is empty. 
+ fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Insert the state ID value into this set and return true if the given + /// state ID was not previously in this set. + /// + /// This operation is idempotent. If the given value is already in this + /// set, then this is a no-op. + /// + /// If more than `capacity` ids are inserted, then this panics. + /// + /// This is marked as inline(always) since the compiler won't inline it + /// otherwise, and it's a fairly hot piece of code in DFA determinization. + fn insert(&mut self, id: StateID) -> bool { + if self.contains(id) { + return false; + } + + let index = self.len(); + assert!( + index < self.capacity(), + "{:?} exceeds capacity of {:?} when inserting {:?}", + index, + self.capacity(), + id, + ); + self.dense[index] = id; + // OK because we don't permit the capacity to be set higher than + // u32::MAX. + self.sparse[id.as_usize()] = u32::try_from(index).unwrap(); + self.len += 1; + true + } + + /// Returns true if and only if this set contains the given value. + fn contains(&self, id: StateID) -> bool { + let index = self.sparse[id.as_usize()]; + index.as_usize() < self.len() && self.dense[index.as_usize()] == id + } + + /// Clear this set such that it has no members. + fn clear(&mut self) { + self.len = 0; + } + + /// Returns an iterator over all the state IDs in this set in the order in + /// which they were inserted. + fn iter(&self) -> SparseSetIter<'_> { + SparseSetIter(self.dense[..self.len()].iter()) + } + + /// Returns the heap memory usage, in bytes, used by this sparse set. + fn memory_usage(&self) -> usize { + let idsize = core::mem::size_of::(); + (self.dense.len() * idsize) + (self.sparse.len() * idsize) + } +} + +impl core::fmt::Debug for SparseSet { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let elements: Vec = self.iter().collect(); + f.debug_tuple("SparseSet").field(&elements).finish() + } +} + +/// An iterator over all elements in a sparse set. 
+/// +/// The lifetime `'a` refers to the lifetime of the set being iterated over. +#[derive(Debug)] +struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>); + +impl<'a> Iterator for SparseSetIter<'a> { + type Item = StateID; + + fn next(&mut self) -> Option { + self.0.next().map(|&id| id) + } +} diff --git a/regex-lite/src/string.rs b/regex-lite/src/string.rs new file mode 100644 index 000000000..e358db9be --- /dev/null +++ b/regex-lite/src/string.rs @@ -0,0 +1,80 @@ +use core::cell::RefCell; + +use alloc::sync::Arc; + +use crate::{ + error::Error, + hir::{self, Hir}, + nfa::{self, NFA}, + pikevm::{Cache, PikeVM}, +}; + +#[derive(Clone, Debug)] +pub struct Regex { + pikevm: Arc, + // TODO: Replace with pool. + cache: RefCell, +} + +impl Regex { + pub fn new(pattern: &str) -> Result { + let hir = Hir::parse(hir::Config::default(), pattern)?; + let nfa = NFA::new(nfa::Config::default(), &hir)?; + std::dbg!(&nfa); + let pikevm = PikeVM::new(nfa); + let cache = Cache::new(&pikevm); + Ok(Regex { pikevm: Arc::new(pikevm), cache: RefCell::new(cache) }) + } + + pub fn is_match(&self, haystack: &str) -> bool { + let mut cache = self.cache.borrow_mut(); + self.pikevm.search( + &mut cache, + haystack.as_bytes(), + 0, + haystack.len(), + true, + &mut [], + ) + } + + pub fn find(&self, haystack: &str) -> Option<(usize, usize)> { + let mut cache = self.cache.borrow_mut(); + let mut slots = [None, None]; + let matched = self.pikevm.search( + &mut cache, + haystack.as_bytes(), + 0, + haystack.len(), + false, + &mut slots, + ); + if !matched { + return None; + } + Some((slots[0].unwrap().get(), slots[1].unwrap().get())) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn scratch() { + let re = Regex::new("abc").unwrap(); + assert_eq!(Some((0, 3)), re.find("abc")); + + let re = Regex::new("abc").unwrap(); + assert_eq!(Some((4, 7)), re.find("foo abc")); + + let re = Regex::new("^abc").unwrap(); + assert_eq!(Some((0, 3)), re.find("abc")); + + let re = 
Regex::new("^abc").unwrap(); + assert_eq!(None, re.find("foo abc")); + + let re = Regex::new("(?Rm)^foo$").unwrap(); + assert_eq!(Some((2, 5)), re.find("\r\nfoo\r\n")); + } +} diff --git a/regex-lite/src/utf8.rs b/regex-lite/src/utf8.rs new file mode 100644 index 000000000..cb361ac5a --- /dev/null +++ b/regex-lite/src/utf8.rs @@ -0,0 +1,445 @@ +/// Returns true if and only if the given byte is considered a word character. +/// This only applies to ASCII. +pub(crate) fn is_word_byte(b: u8) -> bool { + const fn mkwordset() -> [bool; 256] { + // FIXME: Use as_usize() once const functions in traits are stable. + let mut set = [false; 256]; + set[b'_' as usize] = true; + + let mut byte = b'0'; + while byte <= b'9' { + set[byte as usize] = true; + byte += 1; + } + byte = b'A'; + while byte <= b'Z' { + set[byte as usize] = true; + byte += 1; + } + byte = b'a'; + while byte <= b'z' { + set[byte as usize] = true; + byte += 1; + } + set + } + const WORD: [bool; 256] = mkwordset(); + WORD[b as usize] +} + +/// The accept state index. When we enter this state, we know we've found a +/// valid Unicode scalar value. +const ACCEPT: usize = 12; +/// The reject state index. When we enter this state, we know that we've found +/// invalid UTF-8. +const REJECT: usize = 0; + +/// Like `decode`, but automatically converts the `None` case to the +/// replacement codepoint. +pub(crate) fn decode_lossy>(slice: B) -> (char, usize) { + match decode(slice) { + (Some(ch), size) => (ch, size), + (None, size) => ('\u{FFFD}', size), + } +} + +/// Like `decode_last`, but automatically converts the `None` case to the +/// replacement codepoint. +pub(crate) fn decode_last_lossy>(slice: B) -> (char, usize) { + match decode_last(slice) { + (Some(ch), size) => (ch, size), + (None, size) => ('\u{FFFD}', size), + } +} + +/// UTF-8 decode a single Unicode scalar value from the beginning of a slice. 
+/// +/// When successful, the corresponding Unicode scalar value is returned along +/// with the number of bytes it was encoded with. The number of bytes consumed +/// for a successful decode is always between 1 and 4, inclusive. +/// +/// When unsuccessful, `None` is returned along with the number of bytes that +/// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case, +/// the number of bytes consumed is always between 0 and 3, inclusive, where +/// 0 is only returned when `slice` is empty. +pub(crate) fn decode>(slice: B) -> (Option, usize) { + let slice = slice.as_ref(); + match slice.get(0) { + None => return (None, 0), + Some(&b) if b <= 0x7F => return (Some(b as char), 1), + _ => {} + } + + let (mut state, mut cp, mut i) = (ACCEPT, 0, 0); + while i < slice.len() { + decode_step(&mut state, &mut cp, slice[i]); + i += 1; + + if state == ACCEPT { + // OK since `decode_step` guarantees that `cp` is a valid Unicode + // scalar value in an ACCEPT state. + // + // We don't have to use safe code here, but do so because perf + // isn't our primary objective in regex-lite. + let ch = char::from_u32(cp).unwrap(); + return (Some(ch), i); + } else if state == REJECT { + // At this point, we always want to advance at least one byte. + return (None, core::cmp::max(1, i.saturating_sub(1))); + } + } + (None, i) +} + +/// Like `decode`, but in reverse from the end of the given slice. +pub(crate) fn decode_last>(slice: B) -> (Option, usize) { + // TODO: We could implement this by reversing the UTF-8 automaton, but for + // now, we do it the slow way by using the forward automaton. 
+ + let slice = slice.as_ref(); + if slice.is_empty() { + return (None, 0); + } + let mut start = slice.len() - 1; + let limit = slice.len().saturating_sub(4); + while start > limit && !is_leading_or_invalid_utf8_byte(slice[start]) { + start -= 1; + } + let (ch, size) = decode(&slice[start..]); + // If we didn't consume all of the bytes, then that means there's at least + // one stray byte that never occurs in a valid code unit prefix, so we can + // advance by one byte. + if start + size != slice.len() { + (None, 1) + } else { + (ch, size) + } +} + +/// Transitions to the next state and updates `cp` while it does. +fn decode_step(state: &mut usize, cp: &mut u32, b: u8) { + // Splits the space of all bytes into equivalence classes, such that + // any byte in the same class can never discriminate between whether a + // particular sequence is valid UTF-8 or not. + #[cfg_attr(rustfmt, rustfmt::skip)] + const CLASSES: [u8; 256] = [ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, + ]; + + // A state machine taken from `bstr` which was in turn adapted from: + // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + #[cfg_attr(rustfmt, rustfmt::skip)] + const STATES_FORWARD: &'static [u8] = &[ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72, + 0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0, + 0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0, + 0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0, + 0, 36, 0, 0, 
0, 0, 0, 36, 0, 36, 0, 0, + 0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]; + + let class = CLASSES[b as usize]; + if *state == ACCEPT { + *cp = (0xFF >> class) & (b as u32); + } else { + *cp = (b as u32 & 0b111111) | (*cp << 6); + } + *state = STATES_FORWARD[*state + class as usize] as usize; +} + +/// Returns true if and only if the given byte is either a valid leading UTF-8 +/// byte, or is otherwise an invalid byte that can never appear anywhere in a +/// valid UTF-8 sequence. +fn is_leading_or_invalid_utf8_byte(b: u8) -> bool { + // In the ASCII case, the most significant bit is never set. The leading + // byte of a 2/3/4-byte sequence always has the top two most significant + // bits set. For bytes that can never appear anywhere in valid UTF-8, this + // also returns true, since every such byte has its two most significant + // bits set: + // + // \xC0 :: 11000000 + // \xC1 :: 11000001 + // \xF5 :: 11110101 + // \xF6 :: 11110110 + // \xF7 :: 11110111 + // \xF8 :: 11111000 + // \xF9 :: 11111001 + // \xFA :: 11111010 + // \xFB :: 11111011 + // \xFC :: 11111100 + // \xFD :: 11111101 + // \xFE :: 11111110 + // \xFF :: 11111111 + (b & 0b1100_0000) != 0b1000_0000 +} + +#[cfg(test)] +mod tests { + use alloc::{vec, vec::Vec}; + + use super::*; + + #[test] + fn decode_valid() { + fn d(mut s: &str) -> Vec { + let mut chars = vec![]; + while !s.is_empty() { + let (ch, size) = decode(s.as_bytes()); + s = &s[size..]; + chars.push(ch.unwrap()); + } + chars + } + + assert_eq!(vec!['☃'], d("☃")); + assert_eq!(vec!['☃', '☃'], d("☃☃")); + assert_eq!(vec!['α', 'β', 'γ', 'δ', 'ε'], d("αβγδε")); + assert_eq!(vec!['☃', '⛄', '⛇'], d("☃⛄⛇")); + assert_eq!(vec!['𝗮', '𝗯', '𝗰', '𝗱', '𝗲'], d("𝗮𝗯𝗰𝗱𝗲")); + } + + #[test] + fn decode_invalid() { + let (ch, size) = decode(b""); + assert_eq!(None, ch); + assert_eq!(0, size); + + let (ch, size) = decode(b"\xFF"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode(b"\xCE\xF0"); + assert_eq!(None, ch); + assert_eq!(1, size); + 
+ let (ch, size) = decode(b"\xE2\x98\xF0"); + assert_eq!(None, ch); + assert_eq!(2, size); + + let (ch, size) = decode(b"\xF0\x9D\x9D"); + assert_eq!(None, ch); + assert_eq!(3, size); + + let (ch, size) = decode(b"\xF0\x9D\x9D\xF0"); + assert_eq!(None, ch); + assert_eq!(3, size); + + let (ch, size) = decode(b"\xF0\x82\x82\xAC"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode(b"\xED\xA0\x80"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode(b"\xCEa"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode(b"\xE2\x98a"); + assert_eq!(None, ch); + assert_eq!(2, size); + + let (ch, size) = decode(b"\xF0\x9D\x9Ca"); + assert_eq!(None, ch); + assert_eq!(3, size); + } + + #[test] + fn decode_lossily() { + let (ch, size) = decode_lossy(b""); + assert_eq!('\u{FFFD}', ch); + assert_eq!(0, size); + + let (ch, size) = decode_lossy(b"\xFF"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_lossy(b"\xCE\xF0"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_lossy(b"\xE2\x98\xF0"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(2, size); + + let (ch, size) = decode_lossy(b"\xF0\x9D\x9D\xF0"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(3, size); + + let (ch, size) = decode_lossy(b"\xF0\x82\x82\xAC"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_lossy(b"\xED\xA0\x80"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_lossy(b"\xCEa"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_lossy(b"\xE2\x98a"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(2, size); + + let (ch, size) = decode_lossy(b"\xF0\x9D\x9Ca"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(3, size); + } + + #[test] + fn decode_last_valid() { + fn d(mut s: &str) -> Vec { + let mut chars = vec![]; + while !s.is_empty() { + let (ch, size) = decode_last(s.as_bytes()); + s = &s[..s.len() - size]; 
+ chars.push(ch.unwrap()); + } + chars + } + + assert_eq!(vec!['☃'], d("☃")); + assert_eq!(vec!['☃', '☃'], d("☃☃")); + assert_eq!(vec!['ε', 'δ', 'γ', 'β', 'α'], d("αβγδε")); + assert_eq!(vec!['⛇', '⛄', '☃'], d("☃⛄⛇")); + assert_eq!(vec!['𝗲', '𝗱', '𝗰', '𝗯', '𝗮'], d("𝗮𝗯𝗰𝗱𝗲")); + } + + #[test] + fn decode_last_invalid() { + let (ch, size) = decode_last(b""); + assert_eq!(None, ch); + assert_eq!(0, size); + + let (ch, size) = decode_last(b"\xFF"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode_last(b"\xCE\xF0"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode_last(b"\xCE"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode_last(b"\xE2\x98\xF0"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode_last(b"\xE2\x98"); + assert_eq!(None, ch); + assert_eq!(2, size); + + let (ch, size) = decode_last(b"\xF0\x9D\x9D\xF0"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode_last(b"\xF0\x9D\x9D"); + assert_eq!(None, ch); + assert_eq!(3, size); + + let (ch, size) = decode_last(b"\xF0\x82\x82\xAC"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode_last(b"\xED\xA0\x80"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode_last(b"\xED\xA0"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode_last(b"\xED"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode_last(b"a\xCE"); + assert_eq!(None, ch); + assert_eq!(1, size); + + let (ch, size) = decode_last(b"a\xE2\x98"); + assert_eq!(None, ch); + assert_eq!(2, size); + + let (ch, size) = decode_last(b"a\xF0\x9D\x9C"); + assert_eq!(None, ch); + assert_eq!(3, size); + } + + #[test] + fn decode_last_lossily() { + let (ch, size) = decode_last_lossy(b""); + assert_eq!('\u{FFFD}', ch); + assert_eq!(0, size); + + let (ch, size) = decode_last_lossy(b"\xFF"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, 
size) = decode_last_lossy(b"\xCE\xF0"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_last_lossy(b"\xCE"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_last_lossy(b"\xE2\x98\xF0"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_last_lossy(b"\xE2\x98"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(2, size); + + let (ch, size) = decode_last_lossy(b"\xF0\x9D\x9D\xF0"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_last_lossy(b"\xF0\x9D\x9D"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(3, size); + + let (ch, size) = decode_last_lossy(b"\xF0\x82\x82\xAC"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_last_lossy(b"\xED\xA0\x80"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_last_lossy(b"\xED\xA0"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_last_lossy(b"\xED"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_last_lossy(b"a\xCE"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(1, size); + + let (ch, size) = decode_last_lossy(b"a\xE2\x98"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(2, size); + + let (ch, size) = decode_last_lossy(b"a\xF0\x9D\x9C"); + assert_eq!('\u{FFFD}', ch); + assert_eq!(3, size); + } +} diff --git a/regex-lite/src/util.rs b/regex-lite/src/util.rs new file mode 100644 index 000000000..f186b7a95 --- /dev/null +++ b/regex-lite/src/util.rs @@ -0,0 +1,108 @@ +/// Returns true if the given character has significance in a regex. +/// +/// Generally speaking, these are the only characters which _must_ be escaped +/// in order to match their literal meaning. For example, to match a literal +/// `|`, one could write `\|`. Sometimes escaping isn't always necessary. 
/// For example, `-` is treated as a meta character because of its
/// significance for writing ranges inside of character classes, but the regex
/// `-` will match a literal `-` because `-` has no special meaning outside of
/// character classes.
///
/// In order to determine whether a character may be escaped at all, the
/// [`is_escapeable_character`] routine should be used. The difference between
/// `is_meta_character` and `is_escapeable_character` is that the latter will
/// return true for some characters that are _not_ meta characters. For
/// example, `%` and `\%` both match a literal `%` in all contexts. In other
/// words, `is_escapeable_character` includes "superfluous" escapes.
///
/// Note that the set of characters for which this function returns `true` or
/// `false` is fixed and won't change in a semver compatible release. (In this
/// case, "semver compatible release" actually refers to the `regex` crate
/// itself, since reducing or expanding the set of meta characters would be a
/// breaking change for not just `regex-syntax` but also `regex` itself.)
///
/// # Example
///
/// ```
/// use regex_lite::is_meta_character;
///
/// assert!(is_meta_character('?'));
/// assert!(is_meta_character('-'));
/// assert!(is_meta_character('&'));
/// assert!(is_meta_character('#'));
///
/// assert!(!is_meta_character('%'));
/// assert!(!is_meta_character('/'));
/// assert!(!is_meta_character('!'));
/// assert!(!is_meta_character('"'));
/// assert!(!is_meta_character('e'));
/// ```
pub fn is_meta_character(c: char) -> bool {
    // NOTE(review): the semver parenthetical in the docs above names the
    // `regex` and `regex-syntax` crates, while the doc example imports this
    // function from `regex_lite`. Confirm which crate the guarantee is meant
    // to describe.
    matches!(
        c,
        '\\' | '.'
            | '+'
            | '*'
            | '?'
            | '('
            | ')'
            | '|'
            | '['
            | ']'
            | '{'
            | '}'
            | '^'
            | '$'
            | '#'
            | '&'
            | '-'
            | '~'
    )
}

/// Returns true if the given character can be escaped in a regex.
///
/// This returns true in all cases that `is_meta_character` returns true, but
/// also returns true in some cases where `is_meta_character` returns false.
/// For example, `%` is not a meta character, but it is escapeable. That is,
/// `%` and `\%` both match a literal `%` in all contexts.
///
/// The purpose of this routine is to provide knowledge about what characters
/// may be escaped. Namely, most regex engines permit "superfluous" escapes
/// where characters without any special significance may be escaped even
/// though there is no actual _need_ to do so.
///
/// This will return false for some characters. For example, `e` is not
/// escapeable. Therefore, `\e` will either result in a parse error (which is
/// true today), or it could backwards compatibly evolve into a new construct
/// with its own meaning. Indeed, that is the purpose of banning _some_
/// superfluous escapes: it provides a way to evolve the syntax in a compatible
/// manner.
///
/// # Example
///
/// ```
/// use regex_lite::is_escapeable_character;
///
/// assert!(is_escapeable_character('?'));
/// assert!(is_escapeable_character('-'));
/// assert!(is_escapeable_character('&'));
/// assert!(is_escapeable_character('#'));
/// assert!(is_escapeable_character('%'));
/// assert!(is_escapeable_character('/'));
/// assert!(is_escapeable_character('!'));
/// assert!(is_escapeable_character('"'));
///
/// assert!(!is_escapeable_character('e'));
/// ```
pub fn is_escapeable_character(c: char) -> bool {
    // Certainly escapeable if it's a meta character.
    if is_meta_character(c) {
        return true;
    }
    // Any character that isn't ASCII is definitely not escapeable. There's
    // no real need to allow things like \☃ right?
    if !c.is_ascii() {
        return false;
    }
    // Otherwise, we basically say that everything is escapeable unless it's a
    // letter or digit. Things like \3 are either octal (when enabled) or an
    // error, and we should keep it that way. Otherwise, letters are reserved
    // for adding new syntax in a backwards compatible way.
    //
    // While not currently supported, `<` and `>` are also kept as not
    // escapeable to give us some flexibility with respect to supporting the
    // \< and \> word boundary assertions in the future. By rejecting them as
    // escapeable, \< and \> will result in a parse error. Thus, we can turn
    // them into something else in the future without it being a backwards
    // incompatible change.
    let reserved = c.is_ascii_alphanumeric() || matches!(c, '<' | '>');
    !reserved
}