diff --git a/regex-automata/src/util/prefilter/aho_corasick.rs b/regex-automata/src/util/prefilter/aho_corasick.rs index a7474d29a..50cce827e 100644 --- a/regex-automata/src/util/prefilter/aho_corasick.rs +++ b/regex-automata/src/util/prefilter/aho_corasick.rs @@ -22,11 +22,20 @@ impl AhoCorasick { } #[cfg(feature = "perf-literal-multisubstring")] { + // We used to use `aho_corasick::MatchKind::Standard` here when + // `kind` was `MatchKind::All`, but this is not correct. The + // "standard" Aho-Corasick match semantics are to report a match + // immediately as soon as it is seen, but `All` isn't like that. + // In particular, with "standard" semantics, given the needles + // "abc" and "b" and the haystack "abc," it would report a match + // at offset 1 before a match at offset 0. This is never what we + // want in the context of the regex engine, regardless of whether + // we have leftmost-first or 'all' semantics. Namely, we always + // want the leftmost match. let ac_match_kind = match kind { - MatchKind::LeftmostFirst => { + MatchKind::LeftmostFirst | MatchKind::All => { aho_corasick::MatchKind::LeftmostFirst } - MatchKind::All => aho_corasick::MatchKind::Standard, }; // This is kind of just an arbitrary number, but basically, if we // have a small enough set of literals, then we try to use the VERY diff --git a/regex-automata/src/util/prefilter/mod.rs b/regex-automata/src/util/prefilter/mod.rs index ea3eb73d8..51fc92233 100644 --- a/regex-automata/src/util/prefilter/mod.rs +++ b/regex-automata/src/util/prefilter/mod.rs @@ -195,15 +195,6 @@ impl Prefilter { /// Some(Span::from(6..9)), /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), /// ); - /// // Now we put 'samwise' back before 'sam', but change the match - /// // semantics to 'All'. In this case, there is no preference - /// // order semantics and the first match detected is returned. - /// let pre = Prefilter::new(MatchKind::All, &["samwise", "sam"]) - /// .expect("a prefilter"); - /// assert_eq!( - /// Some(Span::from(6..9)), - /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), - /// ); /// /// # Ok::<(), Box>(()) /// ``` diff --git a/regex-automata/src/util/prefilter/teddy.rs b/regex-automata/src/util/prefilter/teddy.rs index 02210a5ec..fc79f2b2f 100644 --- a/regex-automata/src/util/prefilter/teddy.rs +++ b/regex-automata/src/util/prefilter/teddy.rs @@ -50,12 +50,17 @@ impl Teddy { // theory we could at least support leftmost-longest, as the // aho-corasick crate does, but regex-automata doesn't know about // leftmost-longest currently. + // + // And like the aho-corasick prefilter, if we're using `All` + // semantics, then we can still use leftmost semantics for a + // prefilter. (This might be a suspicious choice for the literal + // engine, which uses a prefilter as a regex engine directly, but + // that only happens when using leftmost-first semantics.) let (packed_match_kind, ac_match_kind) = match kind { - MatchKind::LeftmostFirst => ( + MatchKind::LeftmostFirst | MatchKind::All => ( aho_corasick::packed::MatchKind::LeftmostFirst, aho_corasick::MatchKind::LeftmostFirst, ), - _ => return None, }; let minimum_len = needles.iter().map(|n| n.as_ref().len()).min().unwrap_or(0); diff --git a/testdata/regression.toml b/testdata/regression.toml index a2efa2ad3..03b15d6d5 100644 --- a/testdata/regression.toml +++ b/testdata/regression.toml @@ -756,3 +756,29 @@ name = "reverse-inner-short" regex = '(?:([0-9][0-9][0-9]):)?([0-9][0-9]):([0-9][0-9])' haystack = '102:12:39' matches = [[[0, 9], [0, 3], [4, 6], [7, 9]]] + +# This regression test was found via the RegexSet APIs. It triggered a +# particular code path where a regex was compiled with 'All' match semantics +# (to support overlapping search), but got funneled down into a standard +# leftmost search when calling 'is_match'. This is fine on its own, but the +# leftmost search will use a prefilter and that's where this went awry. +# +# Namely, since 'All' semantics were used, the aho-corasick prefilter was +# incorrectly compiled with 'Standard' semantics. This was wrong because +# 'Standard' immediately attempts to report a match at every position, even if +# that would mean reporting a match past the leftmost match before reporting +# the leftmost match. This breaks the prefilter contract of never having false +# negatives and leads overall to the engine not finding a match. +# +# See: /~https://github.com/rust-lang/regex/issues/1070 +[[test]] +name = "prefilter-with-aho-corasick-standard-semantics" +regex = '(?m)^ *v [0-9]' +haystack = 'v 0' +matches = [ + { id = 0, spans = [[0, 3]] }, +] +match-kind = "all" +search-kind = "overlapping" +unicode = true +utf8 = true