Skip to content

Commit

Permalink
Fix a bug in capture with match
Browse files Browse the repository at this point in the history
When performing "EndText" matching, it is necessary to check whether
the current position matches the input text length.
However, when capturing a submatch using the matching result of DFA, "EndText" matching
wasn't actually performed correctly because the input character string sliced.

This patch resolve this problem by specifying the end position of the capture target
match by the argument "end", not using slice when performing capture with the matching result of DFA.

Fixes rust-lang#557
  • Loading branch information
pipopa committed Mar 10, 2019
1 parent 9b951a6 commit 011b2be
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 44 deletions.
7 changes: 4 additions & 3 deletions src/backtrack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
slots: &'s mut [Slot],
input: I,
start: usize,
end: usize,
) -> bool {
let mut cache = cache.borrow_mut();
let cache = &mut cache.backtrack;
Expand All @@ -109,7 +110,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
slots: slots,
m: cache,
};
b.exec_(start)
b.exec_(start, end)
}

/// Clears the cache such that the backtracking engine can be executed
Expand Down Expand Up @@ -147,7 +148,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {

/// Start backtracking at the given position in the input, but also look
/// for literal prefixes.
fn exec_(&mut self, mut at: InputAt) -> bool {
fn exec_(&mut self, mut at: InputAt, end: usize) -> bool {
self.clear();
// If this is an anchored regex at the beginning of the input, then
// we're either already done or we only need to try backtracking once.
Expand All @@ -170,7 +171,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
if matched && self.prog.matches.len() == 1 {
return true;
}
if at.is_end() {
if at.pos() == end {
break;
}
at = self.input.at(at.next_pos());
Expand Down
66 changes: 27 additions & 39 deletions src/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

use std::cell::RefCell;
use std::collections::HashMap;
use std::cmp;
use std::sync::Arc;

use thread_local::CachedThreadLocal;
Expand Down Expand Up @@ -557,7 +556,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
match self.ro.match_type {
MatchType::Literal(ty) => {
self.find_literals(ty, text, start).and_then(|(s, e)| {
self.captures_nfa_with_match(slots, text, s, e)
self.captures_nfa_type(MatchNfaType::Auto, slots, text, s, e)
})
}
MatchType::Dfa => {
Expand All @@ -566,7 +565,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
} else {
match self.find_dfa_forward(text, start) {
dfa::Result::Match((s, e)) => {
self.captures_nfa_with_match(slots, text, s, e)
self.captures_nfa_type(MatchNfaType::Auto, slots, text, s, e)
}
dfa::Result::NoMatch(_) => None,
dfa::Result::Quit => self.captures_nfa(slots, text, start),
Expand All @@ -576,7 +575,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
MatchType::DfaAnchoredReverse => {
match self.find_dfa_anchored_reverse(text, start) {
dfa::Result::Match((s, e)) => {
self.captures_nfa_with_match(slots, text, s, e)
self.captures_nfa_type(MatchNfaType::Auto, slots, text, s, e)
}
dfa::Result::NoMatch(_) => None,
dfa::Result::Quit => self.captures_nfa(slots, text, start),
Expand All @@ -585,14 +584,14 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
MatchType::DfaSuffix => {
match self.find_dfa_reverse_suffix(text, start) {
dfa::Result::Match((s, e)) => {
self.captures_nfa_with_match(slots, text, s, e)
self.captures_nfa_type(MatchNfaType::Auto, slots, text, s, e)
}
dfa::Result::NoMatch(_) => None,
dfa::Result::Quit => self.captures_nfa(slots, text, start),
}
}
MatchType::Nfa(ty) => {
self.captures_nfa_type(ty, slots, text, start)
self.captures_nfa_type(ty, slots, text, start, text.len())
}
MatchType::Nothing => None,
MatchType::DfaMany => {
Expand Down Expand Up @@ -830,7 +829,7 @@ impl<'c> ExecNoSync<'c> {
text: &[u8],
start: usize,
) -> bool {
self.exec_nfa(ty, &mut [false], &mut [], true, text, start)
self.exec_nfa(ty, &mut [false], &mut [], true, text, start, text.len())
}

/// Finds the shortest match using an NFA.
Expand All @@ -846,7 +845,7 @@ impl<'c> ExecNoSync<'c> {
start: usize,
) -> Option<usize> {
let mut slots = [None, None];
if self.exec_nfa(ty, &mut [false], &mut slots, true, text, start) {
if self.exec_nfa(ty, &mut [false], &mut slots, true, text, start, text.len()) {
slots[1]
} else {
None
Expand All @@ -861,7 +860,7 @@ impl<'c> ExecNoSync<'c> {
start: usize,
) -> Option<(usize, usize)> {
let mut slots = [None, None];
if self.exec_nfa(ty, &mut [false], &mut slots, false, text, start) {
if self.exec_nfa(ty, &mut [false], &mut slots, false, text, start, text.len()) {
match (slots[0], slots[1]) {
(Some(s), Some(e)) => Some((s, e)),
_ => None,
Expand All @@ -871,26 +870,6 @@ impl<'c> ExecNoSync<'c> {
}
}

/// Like find_nfa, but fills in captures and restricts the search space
/// using previously found match information.
///
/// `slots` should have length equal to `2 * nfa.captures.len()`.
fn captures_nfa_with_match(
&self,
slots: &mut [Slot],
text: &[u8],
match_start: usize,
match_end: usize,
) -> Option<(usize, usize)> {
// We can't use match_end directly, because we may need to examine one
// "character" after the end of a match for lookahead operators. We
// need to move two characters beyond the end, since some look-around
// operations may falsely assume a premature end of text otherwise.
let e = cmp::min(
next_utf8(text, next_utf8(text, match_end)), text.len());
self.captures_nfa(slots, &text[..e], match_start)
}

/// Like find_nfa, but fills in captures.
///
/// `slots` should have length equal to `2 * nfa.captures.len()`.
Expand All @@ -900,7 +879,7 @@ impl<'c> ExecNoSync<'c> {
text: &[u8],
start: usize,
) -> Option<(usize, usize)> {
self.captures_nfa_type(MatchNfaType::Auto, slots, text, start)
self.captures_nfa_type(MatchNfaType::Auto, slots, text, start, text.len())
}

/// Like captures_nfa, but allows specification of type of NFA engine.
Expand All @@ -910,8 +889,9 @@ impl<'c> ExecNoSync<'c> {
slots: &mut [Slot],
text: &[u8],
start: usize,
end: usize,
) -> Option<(usize, usize)> {
if self.exec_nfa(ty, &mut [false], slots, false, text, start) {
if self.exec_nfa(ty, &mut [false], slots, false, text, start, end) {
match (slots[0], slots[1]) {
(Some(s), Some(e)) => Some((s, e)),
_ => None,
Expand All @@ -929,6 +909,7 @@ impl<'c> ExecNoSync<'c> {
quit_after_match: bool,
text: &[u8],
start: usize,
end: usize,
) -> bool {
use self::MatchNfaType::*;
if let Auto = ty {
Expand All @@ -940,10 +921,10 @@ impl<'c> ExecNoSync<'c> {
}
match ty {
Auto => unreachable!(),
Backtrack => self.exec_backtrack(matches, slots, text, start),
Backtrack => self.exec_backtrack(matches, slots, text, start, end),
PikeVM => {
self.exec_pikevm(
matches, slots, quit_after_match, text, start)
matches, slots, quit_after_match, text, start, end)
}
}
}
Expand All @@ -956,6 +937,7 @@ impl<'c> ExecNoSync<'c> {
quit_after_match: bool,
text: &[u8],
start: usize,
end: usize,
) -> bool {
if self.ro.nfa.uses_bytes() {
pikevm::Fsm::exec(
Expand All @@ -965,7 +947,8 @@ impl<'c> ExecNoSync<'c> {
slots,
quit_after_match,
ByteInput::new(text, self.ro.nfa.only_utf8),
start)
start,
end)
} else {
pikevm::Fsm::exec(
&self.ro.nfa,
Expand All @@ -974,7 +957,8 @@ impl<'c> ExecNoSync<'c> {
slots,
quit_after_match,
CharInput::new(text),
start)
start,
end)
}
}

Expand All @@ -985,6 +969,7 @@ impl<'c> ExecNoSync<'c> {
slots: &mut [Slot],
text: &[u8],
start: usize,
end: usize
) -> bool {
if self.ro.nfa.uses_bytes() {
backtrack::Bounded::exec(
Expand All @@ -993,15 +978,17 @@ impl<'c> ExecNoSync<'c> {
matches,
slots,
ByteInput::new(text, self.ro.nfa.only_utf8),
start)
start,
end)
} else {
backtrack::Bounded::exec(
&self.ro.nfa,
self.cache,
matches,
slots,
CharInput::new(text),
start)
start,
end)
}
}

Expand Down Expand Up @@ -1045,11 +1032,12 @@ impl<'c> ExecNoSync<'c> {
&mut [],
false,
text,
start)
start,
text.len())
}
}
}
Nfa(ty) => self.exec_nfa(ty, matches, &mut [], false, text, start),
Nfa(ty) => self.exec_nfa(ty, matches, &mut [], false, text, start, text.len()),
Nothing => false,
}
}
Expand Down
5 changes: 4 additions & 1 deletion src/pikevm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ impl<'r, I: Input> Fsm<'r, I> {
quit_after_match: bool,
input: I,
start: usize,
end: usize,
) -> bool {
let mut cache = cache.borrow_mut();
let cache = &mut cache.pikevm;
Expand All @@ -124,6 +125,7 @@ impl<'r, I: Input> Fsm<'r, I> {
slots,
quit_after_match,
at,
end,
)
}

Expand All @@ -135,6 +137,7 @@ impl<'r, I: Input> Fsm<'r, I> {
slots: &mut [Slot],
quit_after_match: bool,
mut at: InputAt,
end: usize,
) -> bool {
let mut matched = false;
let mut all_matched = false;
Expand Down Expand Up @@ -212,7 +215,7 @@ impl<'r, I: Input> Fsm<'r, I> {
}
}
}
if at.is_end() {
if at.pos() == end {
break;
}
at = at_next;
Expand Down
7 changes: 6 additions & 1 deletion tests/regression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,13 @@ ismatch!(reverse_suffix2, r"\d\d\d000", "153.230000\n", true);
matiter!(reverse_suffix3, r"\d\d\d000", "153.230000\n", (4, 10));

// See: /~https://github.com/rust-lang/regex/issues/334
mat!(captures_after_dfa_premature_end, r"a(b*(X|$))?", "abcbX",
// See: /~https://github.com/rust-lang/regex/issues/557
mat!(captures_after_dfa_premature_end1, r"a(b*(X|$))?", "abcbX",
Some((0, 1)), None, None);
mat!(captures_after_dfa_premature_end2, r"a(bc*(X|$))?", "abcbX",
Some((0, 1)), None, None);
mat!(captures_after_dfa_premature_end3, r"(aa$)?", "aaz",
Some((0, 0)));

// See: /~https://github.com/rust-lang/regex/issues/437
ismatch!(
Expand Down

0 comments on commit 011b2be

Please sign in to comment.