Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Fix escaped like wildcards
Browse files Browse the repository at this point in the history
Added a new function that replaces the LIKE wildcards '%' and '_' with
their regex counterparts before the pattern is executed. It also takes into
account that the wildcards can be escaped: in that case, it removes the
escape characters and keeps the wildcards so that they are matched
as literal characters.

This is implemented by iterating over all the characters of the pattern to
figure out which ones need to be transformed.
  • Loading branch information
daniel-martinez-maqueda-sap committed Aug 3, 2022
1 parent 3f3febf commit e430c9b
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 4 deletions.
4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ csv-core = { version = "0.1", optional = true }
csv-async = { version = "^1.1", optional = true }

regex = { version = "^1.3", optional = true }
regex-syntax = { version = "^0.6", optional = true }
streaming-iterator = { version = "0.1", optional = true }
fallible-streaming-iterator = { version = "0.1", optional = true }

Expand Down Expand Up @@ -135,6 +136,7 @@ full = [
"io_avro_compression",
"io_avro_async",
"regex",
"regex-syntax",
"compute",
# parses timezones used in timestamp conversions
"chrono-tz",
Expand Down Expand Up @@ -190,7 +192,7 @@ compute_filter = []
compute_hash = ["multiversion"]
compute_if_then_else = []
compute_length = []
compute_like = ["regex"]
compute_like = ["regex", "regex-syntax"]
compute_limit = []
compute_merge_sort = ["itertools", "compute_sort"]
compute_nullif = ["compute_comparison"]
Expand Down
40 changes: 37 additions & 3 deletions src/compute/like.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,42 @@ fn is_like_pattern(c: char) -> bool {
c == '%' || c == '_'
}

/// Transforms a like `pattern` into a regex compatible pattern.
///
/// The pattern is scanned once, character by character, applying these rules:
///
/// 1. An escaped like wildcard loses its escape character so that it is matched literally.
///    For example: `\%` => `%` and `\_` => `_`.
/// 2. A backslash that does not precede a like wildcard is emitted as an escaped backslash
///    (`\\`), so it matches a literal backslash. The character after it is then processed
///    on its own by the remaining rules.
/// 3. Regex meta characters are escaped so that they are matched literally instead of being
///    evaluated as regex special characters. For example: `.` => `\.`.
/// 4. Unescaped like wildcards are replaced by their regex counterparts:
///    `%` => `.*` and `_` => `.`.
///
/// Every other character is copied unchanged. The examples above show the actual characters
/// of the pattern, not Rust string-literal spelling.
fn replace_pattern(pattern: &str) -> String {
    // The output is at least as long as the input except for escaped wildcards,
    // so the input length is a reasonable initial capacity.
    let mut result = String::with_capacity(pattern.len());
    let mut chars_iter = pattern.chars().peekable();
    while let Some(c) = chars_iter.next() {
        match c {
            // `next_if` consumes the following character only when it is a like wildcard,
            // which is then appended as-is (it is not a regex meta character).
            '\\' => match chars_iter.next_if(|&next| is_like_pattern(next)) {
                Some(wildcard) => result.push(wildcard),
                None => result.push_str("\\\\"),
            },
            // Checked before the wildcard arms to keep the original evaluation order.
            c if regex_syntax::is_meta_character(c) => {
                result.push('\\');
                result.push(c);
            }
            '%' => result.push_str(".*"),
            '_' => result.push('.'),
            c => result.push(c),
        }
    }
    result
}

#[inline]
Expand Down Expand Up @@ -108,7 +142,7 @@ fn a_like_utf8_scalar<O: Offset, F: Fn(bool) -> bool>(

let values = if !rhs.contains(is_like_pattern) {
Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x == rhs)))
} else if rhs.ends_with('%') && !rhs[..rhs.len() - 1].contains(is_like_pattern) {
} else if rhs.ends_with('%') && !rhs.ends_with("\\%") && !rhs[..rhs.len() - 1].contains(is_like_pattern) {
// fast path, can use starts_with
let starts_with = &rhs[..rhs.len() - 1];
Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x.starts_with(starts_with))))
Expand Down Expand Up @@ -260,7 +294,7 @@ fn a_like_binary_scalar<O: Offset, F: Fn(bool) -> bool>(

let values = if !pattern.contains(is_like_pattern) {
Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x == rhs)))
} else if pattern.ends_with('%') && !pattern[..pattern.len() - 1].contains(is_like_pattern) {
} else if pattern.ends_with('%') && !pattern.ends_with("\\%") && !pattern[..pattern.len() - 1].contains(is_like_pattern) {
// fast path, can use starts_with
let starts_with = &rhs[..rhs.len() - 1];
Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x.starts_with(starts_with))))
Expand Down
23 changes: 23 additions & 0 deletions tests/it/compute/like.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,29 @@ fn test_like_binary_scalar() -> Result<()> {
Ok(())
}

/// Checks `like_utf8_scalar` against plain patterns, a trailing `%` wildcard,
/// and escaped `%` / `_` wildcards that must be matched literally.
#[test]
fn test_like_utf8_scalar() -> Result<()> {
    // A trailing-wildcard pattern and a wildcard-free pattern select the same rows here.
    let words = Utf8Array::<i32>::from_slice(&["Arrow", "Arrow", "Arrow", "BA"]);
    for pattern in ["A%", "Arrow"].iter() {
        let matched = like_utf8_scalar(&words, pattern).unwrap();
        assert_eq!(
            matched,
            BooleanArray::from_slice(&[true, true, true, false])
        );
    }

    // An escaped `%` only matches a literal percent sign.
    let with_percent = Utf8Array::<i32>::from_slice(&["A%", "Arrow"]);
    let matched = like_utf8_scalar(&with_percent, "A\\%").unwrap();
    assert_eq!(matched, BooleanArray::from_slice(&[true, false]));

    // An escaped `_` only matches a literal underscore.
    let with_underscore = Utf8Array::<i32>::from_slice(&["A_row", "Arrow"]);
    let matched = like_utf8_scalar(&with_underscore, "A\\_row").unwrap();
    assert_eq!(matched, BooleanArray::from_slice(&[true, false]));

    Ok(())
}

#[test]
fn test_nlike_binary_scalar() -> Result<()> {
let array = BinaryArray::<i32>::from_slice(&["Arrow", "Arrow", "Arrow", "BA"]);
Expand Down

0 comments on commit e430c9b

Please sign in to comment.