Escape-aware LIKE patterns for the `compute_like` kernels.

What this patch does:
  * Cargo.toml: adds the optional `regex-syntax` dependency and enables it
    from the `full` and `compute_like` features.
  * src/compute/like.rs: `replace_pattern` walks the pattern character by
    character instead of doing two blanket `replace` calls, so that regex
    meta characters are escaped, `\%` / `\_` match a literal `%` / `_`, and
    unescaped `%` / `_` still become `.*` / `.`. Both scalar kernels skip the
    `starts_with` fast path when the pattern ends in an escaped `\%`.
  * tests/it/compute/like.rs: adds `test_like_utf8_scalar`, covering an exact
    pattern, a trailing wildcard and the two escaped wildcards.

Review notes:
  * The generic parameter lists (`::<i32>`, `<O: Offset, F: Fn(bool) -> bool>`)
    were missing from the copy under review and have been filled back in;
    confirm them against the repository before applying.
  * `replace_pattern` no longer copies `pattern` into a temporary `String`
    and preallocates its output; behaviour is unchanged. Because of this the
    post-image blob id on the `index` line of src/compute/like.rs is no longer
    exact, and the hunk offsets of that file were recomputed.

diff --git a/Cargo.toml b/Cargo.toml
index 3fe3cb06c78..1d246a80be5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -38,6 +38,7 @@ csv-core = { version = "0.1", optional = true }
 csv-async = { version = "^1.1", optional = true }
 
 regex = { version = "^1.3", optional = true }
+regex-syntax = { version = "^0.6", optional = true }
 streaming-iterator = { version = "0.1", optional = true }
 fallible-streaming-iterator = { version = "0.1", optional = true }
 
@@ -135,6 +136,7 @@ full = [
     "io_avro_compression",
     "io_avro_async",
     "regex",
+    "regex-syntax",
     "compute",
     # parses timezones used in timestamp conversions
     "chrono-tz",
@@ -190,7 +192,7 @@ compute_filter = []
 compute_hash = ["multiversion"]
 compute_if_then_else = []
 compute_length = []
-compute_like = ["regex"]
+compute_like = ["regex", "regex-syntax"]
 compute_limit = []
 compute_merge_sort = ["itertools", "compute_sort"]
 compute_nullif = ["compute_comparison"]
diff --git a/src/compute/like.rs b/src/compute/like.rs
index cc7517e823f..bf363972a1a 100644
--- a/src/compute/like.rs
+++ b/src/compute/like.rs
@@ -17,8 +17,40 @@ fn is_like_pattern(c: char) -> bool {
     c == '%' || c == '_'
 }
 
+/// Transforms a like `pattern` into an equivalent regex pattern by:
+///
+/// 1. Replacing like wildcards with their regex counterparts: `%` => `.*` and `_` => `.`
+/// 2. Escaping regex meta characters so they match literally. For example: `.` => `\.`
+/// 3. Unescaping escaped like wildcards so they match literally. For example: `\%` => `%`
+///
+/// A backslash that does not precede a like wildcard is emitted as an escaped (literal) backslash.
 fn replace_pattern(pattern: &str) -> String {
-    pattern.replace('%', ".*").replace('_', ".")
+    // `pattern.len()` is a reasonable initial estimate for the size of the output.
+    let mut result = String::with_capacity(pattern.len());
+    let mut chars_iter = pattern.chars().peekable();
+    while let Some(c) = chars_iter.next() {
+        if c == '\\' {
+            match chars_iter.peek() {
+                Some(&next) if is_like_pattern(next) => {
+                    result.push(next);
+                    // Skipping the next char as it is already appended
+                    chars_iter.next();
+                }
+                // Not a like escape: match the backslash itself
+                _ => result.push_str("\\\\"),
+            }
+        } else if regex_syntax::is_meta_character(c) {
+            result.push('\\');
+            result.push(c);
+        } else if c == '%' {
+            result.push_str(".*");
+        } else if c == '_' {
+            result.push('.');
+        } else {
+            result.push(c);
+        }
+    }
+    result
 }
 
 #[inline]
@@ -108,7 +140,10 @@ fn a_like_utf8_scalar<O: Offset, F: Fn(bool) -> bool>(
 
     let values = if !rhs.contains(is_like_pattern) {
         Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x == rhs)))
-    } else if rhs.ends_with('%') && !rhs[..rhs.len() - 1].contains(is_like_pattern) {
+    } else if rhs.ends_with('%')
+        && !rhs.ends_with("\\%")
+        && !rhs[..rhs.len() - 1].contains(is_like_pattern)
+    {
         // fast path, can use starts_with
         let starts_with = &rhs[..rhs.len() - 1];
         Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x.starts_with(starts_with))))
@@ -260,7 +295,10 @@ fn a_like_binary_scalar<O: Offset, F: Fn(bool) -> bool>(
 
     let values = if !pattern.contains(is_like_pattern) {
         Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x == rhs)))
-    } else if pattern.ends_with('%') && !pattern[..pattern.len() - 1].contains(is_like_pattern) {
+    } else if pattern.ends_with('%')
+        && !pattern.ends_with("\\%")
+        && !pattern[..pattern.len() - 1].contains(is_like_pattern)
+    {
         // fast path, can use starts_with
         let starts_with = &rhs[..rhs.len() - 1];
         Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x.starts_with(starts_with))))
diff --git a/tests/it/compute/like.rs b/tests/it/compute/like.rs
index 61cb01cc13f..3bd4c6aaf3d 100644
--- a/tests/it/compute/like.rs
+++ b/tests/it/compute/like.rs
@@ -39,6 +39,28 @@ fn test_like_binary_scalar() -> Result<()> {
     Ok(())
 }
 
+#[test]
+fn test_like_utf8_scalar() -> Result<()> {
+    let array = Utf8Array::<i32>::from_slice(&["Arrow", "Arrow", "Arrow", "BA"]);
+
+    let result = like_utf8_scalar(&array, "A%").unwrap();
+    assert_eq!(result, BooleanArray::from_slice(&[true, true, true, false]));
+
+    let result = like_utf8_scalar(&array, "Arrow").unwrap();
+    assert_eq!(result, BooleanArray::from_slice(&[true, true, true, false]));
+
+    let array = Utf8Array::<i32>::from_slice(&["A%", "Arrow"]);
+
+    let result = like_utf8_scalar(&array, "A\\%").unwrap();
+    assert_eq!(result, BooleanArray::from_slice(&[true, false]));
+
+    let array = Utf8Array::<i32>::from_slice(&["A_row", "Arrow"]);
+    let result = like_utf8_scalar(&array, "A\\_row").unwrap();
+    assert_eq!(result, BooleanArray::from_slice(&[true, false]));
+
+    Ok(())
+}
+
 #[test]
 fn test_nlike_binary_scalar() -> Result<()> {
     let array = BinaryArray::<i32>::from_slice(&["Arrow", "Arrow", "Arrow", "BA"]);