Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Fix escaped like wildcards (#1204)
Browse files Browse the repository at this point in the history
  • Loading branch information
daniel-martinez-maqueda-sap authored Aug 3, 2022
1 parent 9916716 commit 4df28c9
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 4 deletions.
4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ csv-core = { version = "0.1", optional = true }
csv-async = { version = "^1.1", optional = true }

regex = { version = "^1.3", optional = true }
regex-syntax = { version = "^0.6", optional = true }
streaming-iterator = { version = "0.1", optional = true }
fallible-streaming-iterator = { version = "0.1", optional = true }

Expand Down Expand Up @@ -135,6 +136,7 @@ full = [
"io_avro_compression",
"io_avro_async",
"regex",
"regex-syntax",
"compute",
# parses timezones used in timestamp conversions
"chrono-tz",
Expand Down Expand Up @@ -190,7 +192,7 @@ compute_filter = []
compute_hash = ["multiversion"]
compute_if_then_else = []
compute_length = []
compute_like = ["regex"]
compute_like = ["regex", "regex-syntax"]
compute_limit = []
compute_merge_sort = ["itertools", "compute_sort"]
compute_nullif = ["compute_comparison"]
Expand Down
45 changes: 42 additions & 3 deletions src/compute/like.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,41 @@ fn is_like_pattern(c: char) -> bool {
c == '%' || c == '_'
}

/// Transforms a SQL `LIKE` `pattern` into a regex-compatible pattern.
///
/// The pattern is scanned once, left to right, and each character is rewritten as follows
/// (the right-hand sides below are shown as raw regex text):
///
/// 1. Unescaped like wildcards are replaced with their regex counterparts, because the
///    result is evaluated with a regex match: `%` => `.*` and `_` => `.`
/// 2. Regex meta characters are escaped so they match literally instead of being
///    interpreted as regex syntax. For example: `.` => `\.`
/// 3. Escaped like wildcards lose their escape character so they match literally.
///    For example: `\%` => `%` and `\_` => `_`
/// 4. A backslash that is not followed by a like wildcard is emitted as an escaped
///    backslash (`\\`), i.e. it matches a literal backslash.
fn replace_pattern(pattern: &str) -> String {
    // Reserve roughly the size of the output up front to limit reallocation while pushing.
    let mut result = String::with_capacity(pattern.len());
    let mut chars_iter = pattern.chars().peekable();
    // `while let` instead of `for`: the backslash branch may consume the following char.
    while let Some(c) = chars_iter.next() {
        if c == '\\' {
            // Copy the peeked char so the iterator is free to be advanced in the arm below.
            match chars_iter.peek().copied() {
                Some(next) if is_like_pattern(next) => {
                    // Escaped wildcard: emit it literally (neither `%` nor `_` is a regex
                    // meta character) and skip it, as it is already appended.
                    result.push(next);
                    chars_iter.next();
                }
                // Lone backslash (or trailing one): match a literal backslash.
                _ => result.push_str("\\\\"),
            }
        } else if regex_syntax::is_meta_character(c) {
            result.push('\\');
            result.push(c);
        } else if c == '%' {
            result.push_str(".*");
        } else if c == '_' {
            result.push('.');
        } else {
            result.push(c);
        }
    }
    result
}

#[inline]
Expand Down Expand Up @@ -108,7 +141,10 @@ fn a_like_utf8_scalar<O: Offset, F: Fn(bool) -> bool>(

let values = if !rhs.contains(is_like_pattern) {
Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x == rhs)))
} else if rhs.ends_with('%') && !rhs[..rhs.len() - 1].contains(is_like_pattern) {
} else if rhs.ends_with('%')
&& !rhs.ends_with("\\%")
&& !rhs[..rhs.len() - 1].contains(is_like_pattern)
{
// fast path, can use starts_with
let starts_with = &rhs[..rhs.len() - 1];
Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x.starts_with(starts_with))))
Expand Down Expand Up @@ -260,7 +296,10 @@ fn a_like_binary_scalar<O: Offset, F: Fn(bool) -> bool>(

let values = if !pattern.contains(is_like_pattern) {
Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x == rhs)))
} else if pattern.ends_with('%') && !pattern[..pattern.len() - 1].contains(is_like_pattern) {
} else if pattern.ends_with('%')
&& !pattern.ends_with("\\%")
&& !pattern[..pattern.len() - 1].contains(is_like_pattern)
{
// fast path, can use starts_with
let starts_with = &rhs[..rhs.len() - 1];
Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x.starts_with(starts_with))))
Expand Down
22 changes: 22 additions & 0 deletions tests/it/compute/like.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,28 @@ fn test_like_binary_scalar() -> Result<()> {
Ok(())
}

/// Exercises `like_utf8_scalar` with a trailing-`%` pattern, a wildcard-free pattern,
/// and patterns whose wildcards are escaped with a backslash.
#[test]
fn test_like_utf8_scalar() -> Result<()> {
    // Plain patterns: a trailing `%` wildcard, then a pattern without any wildcard.
    let words = Utf8Array::<i32>::from_slice(&["Arrow", "Arrow", "Arrow", "BA"]);

    let prefix_matches = like_utf8_scalar(&words, "A%").unwrap();
    let expected = BooleanArray::from_slice(&[true, true, true, false]);
    assert_eq!(prefix_matches, expected);

    let exact_matches = like_utf8_scalar(&words, "Arrow").unwrap();
    let expected = BooleanArray::from_slice(&[true, true, true, false]);
    assert_eq!(exact_matches, expected);

    // Escaped `%`: only the value containing a literal percent sign is expected to match.
    let with_percent = Utf8Array::<i32>::from_slice(&["A%", "Arrow"]);
    let escaped_percent = like_utf8_scalar(&with_percent, "A\\%").unwrap();
    assert_eq!(escaped_percent, BooleanArray::from_slice(&[true, false]));

    // Escaped `_`: only the value containing a literal underscore is expected to match.
    let with_underscore = Utf8Array::<i32>::from_slice(&["A_row", "Arrow"]);
    let escaped_underscore = like_utf8_scalar(&with_underscore, "A\\_row").unwrap();
    assert_eq!(escaped_underscore, BooleanArray::from_slice(&[true, false]));

    Ok(())
}

#[test]
fn test_nlike_binary_scalar() -> Result<()> {
let array = BinaryArray::<i32>::from_slice(&["Arrow", "Arrow", "Arrow", "BA"]);
Expand Down

0 comments on commit 4df28c9

Please sign in to comment.