Skip to content

Commit

Permalink
Auto merge of #308 - BurntSushi:fix-271, r=BurntSushi
Browse files Browse the repository at this point in the history
Compute word boundary flags in start state.

At some point, I think I had convinced myself that we didn't need to
compute word boundary flags for the initial state, but it turns out that
we do.

Fixes #271
  • Loading branch information
bors committed Dec 29, 2016
2 parents 5233b14 + 8d764ea commit e2f0850
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 3 deletions.
25 changes: 22 additions & 3 deletions src/dfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1378,7 +1378,9 @@ impl<'a> Fsm<'a> {
((empty_flags.end as u8) << 1) |
((empty_flags.start_line as u8) << 2) |
((empty_flags.end_line as u8) << 3) |
((state_flags.is_word() as u8) << 4))
((empty_flags.word_boundary as u8) << 4) |
((empty_flags.not_word_boundary as u8) << 5) |
((state_flags.is_word() as u8) << 6))
as usize
};
match self.cache.start_states[flagi] {
Expand Down Expand Up @@ -1412,9 +1414,17 @@ impl<'a> Fsm<'a> {
empty_flags.end = text.len() == 0;
empty_flags.start_line = at == 0 || text[at - 1] == b'\n';
empty_flags.end_line = text.len() == 0;
if at > 0 && Byte::byte(text[at - 1]).is_ascii_word() {

let is_word_last = at > 0 && Byte::byte(text[at - 1]).is_ascii_word();
let is_word = at < text.len() && Byte::byte(text[at]).is_ascii_word();
if is_word_last {
state_flags.set_word();
}
if is_word == is_word_last {
empty_flags.not_word_boundary = true;
} else {
empty_flags.word_boundary = true;
}
(empty_flags, state_flags)
}

Expand All @@ -1433,9 +1443,18 @@ impl<'a> Fsm<'a> {
empty_flags.end = text.len() == 0;
empty_flags.start_line = at == text.len() || text[at] == b'\n';
empty_flags.end_line = text.len() == 0;
if at < text.len() && Byte::byte(text[at]).is_ascii_word() {

let is_word_last =
at < text.len() && Byte::byte(text[at]).is_ascii_word();
let is_word = at > 0 && Byte::byte(text[at - 1]).is_ascii_word();
if is_word_last {
state_flags.set_word();
}
if is_word == is_word_last {
empty_flags.not_word_boundary = true;
} else {
empty_flags.word_boundary = true;
}
(empty_flags, state_flags)
}

Expand Down
7 changes: 7 additions & 0 deletions tests/regression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,10 @@ mat!(lits_unambiguous1, u!(r"(ABC|CDA|BC)X"), "CDAX", Some((0, 4)));
// See: https://github.com/rust-lang-nursery/regex/issues/291
mat!(lits_unambiguous2, u!(r"((IMG|CAM|MG|MB2)_|(DSCN|CIMG))(?P<n>[0-9]+)$"),
"CIMG2341", Some((0, 8)), Some((0, 4)), None, Some((0, 4)), Some((4, 8)));

// See: https://github.com/rust-lang-nursery/regex/issues/271
mat!(end_not_wb, u!(r"$(?-u:\B)"), "\u{5c124}\u{b576c}", Some((8, 8)));
mat!(endl_or_wb, u!(r"(?m:$)|(?-u:\b)"), "\u{6084e}", Some((4, 4)));
mat!(zero_or_end, u!(r"(?i-u:\x00)|$"), "\u{e682f}", Some((4, 4)));
mat!(y_or_endl, u!(r"(?i-u:y)|(?m:$)"), "\u{b4331}", Some((4, 4)));
mat!(wb_start_x, u!(r"(?u:\b)^(?-u:X)"), "X", Some((0, 1)));

0 comments on commit e2f0850

Please sign in to comment.