Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Simplified compute (lower/upper) #847

Merged
merged 1 commit into from
Feb 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,7 @@ compute_substring = []
compute_take = []
compute_temporal = []
compute_window = ["compute_concatenate"]
compute_lower = []
compute_upper = []
compute_utf8 = []
compute = [
"compute_aggregate",
"compute_arithmetics",
Expand All @@ -207,9 +206,8 @@ compute = [
"compute_substring",
"compute_take",
"compute_temporal",
"compute_window",
"compute_lower",
"compute_upper"
"compute_utf8",
"compute_window"
]
benchmarks = ["rand"]
simd = ["packed_simd"]
Expand Down
67 changes: 0 additions & 67 deletions src/compute/lower.rs

This file was deleted.

9 changes: 3 additions & 6 deletions src/compute/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,6 @@ pub mod like;
#[cfg(feature = "compute_limit")]
#[cfg_attr(docsrs, doc(cfg(feature = "compute_limit")))]
pub mod limit;
#[cfg(feature = "compute_lower")]
#[cfg_attr(docsrs, doc(cfg(feature = "compute_lower")))]
pub mod lower;
#[cfg(feature = "compute_merge_sort")]
#[cfg_attr(docsrs, doc(cfg(feature = "compute_merge_sort")))]
pub mod merge_sort;
Expand All @@ -84,9 +81,9 @@ pub mod take;
#[cfg(feature = "compute_temporal")]
#[cfg_attr(docsrs, doc(cfg(feature = "compute_temporal")))]
pub mod temporal;
#[cfg(feature = "compute_upper")]
#[cfg_attr(docsrs, doc(cfg(feature = "compute_upper")))]
pub mod upper;
#[cfg(feature = "compute_utf8")]
#[cfg_attr(docsrs, doc(cfg(feature = "compute_utf8")))]
pub mod utf8;
mod utils;
#[cfg(feature = "compute_window")]
#[cfg_attr(docsrs, doc(cfg(feature = "compute_window")))]
Expand Down
18 changes: 17 additions & 1 deletion src/compute/regex_match.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use std::collections::HashMap;

use regex::Regex;

use super::utils::{combine_validities, unary_utf8_boolean};
use super::utils::combine_validities;
use crate::array::{BooleanArray, Offset, Utf8Array};
use crate::bitmap::Bitmap;
use crate::datatypes::DataType;
Expand Down Expand Up @@ -69,3 +69,19 @@ pub fn regex_match_scalar<O: Offset>(values: &Utf8Array<O>, regex: &str) -> Resu
.map_err(|e| ArrowError::InvalidArgumentError(format!("Unable to compile regex: {}", e)))?;
Ok(unary_utf8_boolean(values, |x| regex.is_match(x)))
}

/// Evaluates the predicate `op` on every slot of `values` and collects the
/// answers into a [`BooleanArray`].
///
/// Null slots contribute `false` to the values bitmap. The input validity is
/// cloned onto the result, so those slots stay null in the output.
fn unary_utf8_boolean<O: Offset, F: Fn(&str) -> bool>(
    values: &Utf8Array<O>,
    op: F,
) -> BooleanArray {
    let validity = values.validity().cloned();

    // `map_or` expresses "false for null, otherwise ask the predicate" without
    // the `is_none()` check followed by `unwrap()`.
    let iterator = values
        .iter()
        .map(|value| value.map_or(false, |text| op(text)));
    let values = Bitmap::from_trusted_len_iter(iterator);
    BooleanArray::from_data(DataType::Boolean, values, validity)
}
67 changes: 0 additions & 67 deletions src/compute/upper.rs

This file was deleted.

99 changes: 99 additions & 0 deletions src/compute/utf8.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
//! Defines common element-wise maps over a [`Utf8Array`]

use crate::{
array::{Array, Offset, Utf8Array},
datatypes::DataType,
error::{ArrowError, Result},
};

/// Maps every value of a [`Utf8Array`] through `f` and returns the resulting array.
///
/// `f` is invoked on the items yielded by `values_iter()`; the validity of
/// `array` is cloned and attached to the result unchanged.
pub fn utf8_apply<O: Offset, F: Fn(&str) -> String>(f: F, array: &Utf8Array<O>) -> Utf8Array<O> {
    let validity = array.validity().cloned();
    let mapped = Utf8Array::<O>::from_trusted_len_values_iter(array.values_iter().map(f));
    mapped.with_validity(validity)
}

/// Returns a new `Array` in which every element is upper-cased.
///
/// # Errors
/// Returns [`ArrowError::InvalidArgumentError`] when the data type of `array`
/// is neither `Utf8` nor `LargeUtf8`.
///
/// # Panics
/// Panics if `array` reports a string data type but cannot be downcast to the
/// matching [`Utf8Array`].
pub fn upper(array: &dyn Array) -> Result<Box<dyn Array>> {
    let any = array.as_any();
    match array.data_type() {
        DataType::Utf8 => {
            let strings = any
                .downcast_ref::<Utf8Array<i32>>()
                .expect("A string is expected");
            Ok(Box::new(utf8_apply(str::to_uppercase, strings)))
        }
        DataType::LargeUtf8 => {
            let strings = any
                .downcast_ref::<Utf8Array<i64>>()
                .expect("A large string is expected");
            Ok(Box::new(utf8_apply(str::to_uppercase, strings)))
        }
        unsupported => Err(ArrowError::InvalidArgumentError(format!(
            "upper does not support type {:?}",
            unsupported
        ))),
    }
}

/// Reports whether an array of type `data_type` is accepted by the upper-case
/// operation, i.e. whether it is `Utf8` or `LargeUtf8`.
///
/// # Examples
/// ```
/// use arrow2::compute::utf8::can_upper;
/// use arrow2::datatypes::DataType;
///
/// assert!(can_upper(&DataType::Utf8));
/// assert!(can_upper(&DataType::LargeUtf8));
/// assert!(!can_upper(&DataType::Null));
/// ```
pub fn can_upper(data_type: &DataType) -> bool {
    matches!(data_type, DataType::Utf8 | DataType::LargeUtf8)
}

/// Returns a new `Array` in which every element is lower-cased.
///
/// # Errors
/// Returns [`ArrowError::InvalidArgumentError`] when the data type of `array`
/// is neither `Utf8` nor `LargeUtf8`.
///
/// # Panics
/// Panics if `array` reports a string data type but cannot be downcast to the
/// matching [`Utf8Array`].
pub fn lower(array: &dyn Array) -> Result<Box<dyn Array>> {
    let any = array.as_any();
    match array.data_type() {
        DataType::Utf8 => {
            let strings = any
                .downcast_ref::<Utf8Array<i32>>()
                .expect("A string is expected");
            Ok(Box::new(utf8_apply(str::to_lowercase, strings)))
        }
        DataType::LargeUtf8 => {
            let strings = any
                .downcast_ref::<Utf8Array<i64>>()
                .expect("A large string is expected");
            Ok(Box::new(utf8_apply(str::to_lowercase, strings)))
        }
        unsupported => Err(ArrowError::InvalidArgumentError(format!(
            "lower does not support type {:?}",
            unsupported
        ))),
    }
}

/// Reports whether an array of type `data_type` is accepted by the lower-case
/// operation, i.e. whether it is `Utf8` or `LargeUtf8`.
///
/// # Examples
/// ```
/// use arrow2::compute::utf8::can_lower;
/// use arrow2::datatypes::DataType;
///
/// assert!(can_lower(&DataType::Utf8));
/// assert!(can_lower(&DataType::LargeUtf8));
/// assert!(!can_lower(&DataType::Null));
/// ```
pub fn can_lower(data_type: &DataType) -> bool {
    matches!(data_type, DataType::Utf8 | DataType::LargeUtf8)
}
27 changes: 1 addition & 26 deletions src/compute/utils.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use crate::{
array::{Array, BooleanArray, Offset, Utf8Array},
array::Array,
bitmap::Bitmap,
datatypes::DataType,
error::{ArrowError, Result},
};

Expand All @@ -14,30 +13,6 @@ pub fn combine_validities(lhs: Option<&Bitmap>, rhs: Option<&Bitmap>) -> Option<
}
}

/// Evaluates the predicate `op` on every slot of `values` and collects the
/// answers into a [`BooleanArray`].
///
/// Null slots contribute `false` to the values bitmap. The input validity is
/// cloned onto the result, so those slots stay null in the output.
pub fn unary_utf8_boolean<O: Offset, F: Fn(&str) -> bool>(
    values: &Utf8Array<O>,
    op: F,
) -> BooleanArray {
    let validity = values.validity().cloned();

    // `map_or` expresses "false for null, otherwise ask the predicate" without
    // the `is_none()` check followed by `unwrap()`.
    let iterator = values
        .iter()
        .map(|value| value.map_or(false, |text| op(text)));
    let values = Bitmap::from_trusted_len_iter(iterator);
    BooleanArray::from_data(DataType::Boolean, values, validity)
}

/// Maps every value of a [`Utf8Array`] through `f` and returns the resulting array.
///
/// `f` is invoked on the items yielded by `values_iter()`; the validity of
/// `array` is cloned and attached to the result unchanged.
pub fn utf8_apply<O: Offset, F: Fn(&str) -> String>(f: F, array: &Utf8Array<O>) -> Utf8Array<O> {
    let validity = array.validity().cloned();
    let mapped = Utf8Array::<O>::from_trusted_len_values_iter(array.values_iter().map(f));
    mapped.with_validity(validity)
}

// Errors iff the two arrays have a different length.
#[inline]
pub fn check_same_len(lhs: &dyn Array, rhs: &dyn Array) -> Result<()> {
Expand Down
1 change: 1 addition & 0 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ impl ArrowError {
Self::OutOfSpec(msg.into())
}

/// Builds a `NotYetImplemented` error from anything convertible into a `String`.
// NOTE(review): presumably some feature combinations never call this helper,
// hence the lint allowance — confirm.
#[allow(dead_code)]
pub(crate) fn nyi<A: Into<String>>(msg: A) -> Self {
    let message: String = msg.into();
    Self::NotYetImplemented(message)
}
Expand Down
Loading