diff --git a/Cargo.lock b/Cargo.lock index 51bd8320c..e0ad7ecb6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5217,6 +5217,7 @@ name = "vortex-mask" version = "0.22.1" dependencies = [ "arrow-buffer", + "itertools 0.14.0", "vortex-error", ] diff --git a/vortex-mask/Cargo.toml b/vortex-mask/Cargo.toml index 8eab4fd58..bbc0c95ac 100644 --- a/vortex-mask/Cargo.toml +++ b/vortex-mask/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "vortex-mask" -description = "Vortex Mask - sorted, unique, positive integers" +description = "Vortex Mask - sorted, unique, non-negative integers" version.workspace = true homepage.workspace = true repository.workspace = true @@ -15,6 +15,7 @@ categories.workspace = true [dependencies] arrow-buffer = { workspace = true } +itertools = { workspace = true } vortex-error = { workspace = true } [lints] diff --git a/vortex-mask/src/intersect_by_rank.rs b/vortex-mask/src/intersect_by_rank.rs index cce12dee5..805df5607 100644 --- a/vortex-mask/src/intersect_by_rank.rs +++ b/vortex-mask/src/intersect_by_rank.rs @@ -1,10 +1,24 @@ use crate::Mask; impl Mask { - /// take the intersection of the `mask` with the set of true values in `self`. + /// Take the intersection of the `mask` with the set of true values in `self`. /// /// We are more interested in low selectivity `self` (as indices) with a boolean buffer mask, /// so we don't optimize for other cases, yet. + /// + /// # Examples + /// + /// Keep the third and fifth set values from mask `m1`: + /// ``` + /// use vortex_mask::Mask; + /// + /// let m1 = Mask::from_iter([true, false, false, true, true, true, false, true]); + /// let m2 = Mask::from_iter([false, false, true, false, true]); + /// assert_eq!( + /// m1.intersect_by_rank(&m2), + /// Mask::from_iter([false, false, false, false, true, false, false, true]) + /// ); + /// ``` pub fn intersect_by_rank(&self, mask: &Mask) -> Mask { assert_eq!(self.true_count(), mask.len()); diff --git a/vortex-mask/src/lib.rs b/vortex-mask/src/lib.rs index 77d3171e2..bf1dc3c91 100644 --- a/vortex-mask/src/lib.rs +++ b/vortex-mask/src/lib.rs @@ -6,6 +6,7 @@ use std::cmp::Ordering; use std::sync::{Arc, OnceLock}; use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder}; +use itertools::Itertools; use vortex_error::vortex_panic; /// If the mask selects more than this fraction of rows, iterate over slices instead of indices. @@ -31,6 +32,7 @@ struct Inner { // Pre-computed values. len: usize, true_count: usize, + // i.e., the fraction of values that are true selectivity: f64, } @@ -51,6 +53,7 @@ impl Inner { // TODO(ngates): for dense indices, we can do better by collecting into u64s. buf.append_n(self.len, false); indices.iter().for_each(|idx| buf.set_bit(*idx, true)); + debug_assert_eq!(buf.len(), self.len); return BooleanBuffer::from(buf); } @@ -85,12 +88,16 @@ impl Inner { if let Some(buffer) = self.buffer.get() { let mut indices = Vec::with_capacity(self.true_count); indices.extend(buffer.set_indices()); + debug_assert!(indices.is_sorted()); + assert_eq!(indices.len(), self.true_count); return indices; } if let Some(slices) = self.slices.get() { let mut indices = Vec::with_capacity(self.true_count); indices.extend(slices.iter().flat_map(|(start, end)| *start..*end)); + debug_assert!(indices.is_sorted()); + assert_eq!(indices.len(), self.true_count); return indices; } @@ -99,6 +106,7 @@ impl Inner { } /// Constructs a slices vector from one of the other representations. + #[allow(clippy::cast_possible_truncation)] fn slices(&self) -> &[(usize, usize)] { self.slices.get_or_init(|| { if self.true_count == self.len { @@ -110,7 +118,10 @@ impl Inner { } if let Some(indices) = self.indices.get() { - let mut slices = Vec::with_capacity(self.true_count); // Upper bound + // Expected number of contiguous slices assuming a uniform distribution of true values. + let expected_num_slices = + (self.selectivity * (self.len - self.true_count + 1) as f64).round() as usize; + let mut slices = Vec::with_capacity(expected_num_slices); let mut iter = indices.iter().copied(); // Handle empty input @@ -200,7 +211,11 @@ impl Mask { /// Create a new [`Mask`] from a [`Vec`]. pub fn from_indices(len: usize, vec: Vec) -> Self { let true_count = vec.len(); - assert!(vec.iter().all(|&idx| idx < len)); + assert!(vec.is_sorted(), "Mask indices must be sorted"); + assert!( + vec.last().is_none_or(|&idx| idx < len), + "Mask indices must be in bounds (len={len})" + ); Self(Arc::new(Inner { buffer: Default::default(), indices: OnceLock::from(vec), @@ -214,7 +229,14 @@ impl Mask { /// Create a new [`Mask`] from a [`Vec<(usize, usize)>`] where each range /// represents a contiguous range of true values. pub fn from_slices(len: usize, vec: Vec<(usize, usize)>) -> Self { - assert!(vec.iter().all(|&(b, e)| b < e && e <= len)); + Self::check_slices(len, &vec); + Self::from_slices_unchecked(len, vec) + } + + fn from_slices_unchecked(len: usize, vec: Vec<(usize, usize)>) -> Self { + #[cfg(debug_assertions)] + Self::check_slices(len, &vec); + let true_count = vec.iter().map(|(b, e)| e - b).sum(); Self(Arc::new(Inner { buffer: Default::default(), @@ -226,6 +248,25 @@ impl Mask { })) } + #[inline(always)] + fn check_slices(len: usize, vec: &[(usize, usize)]) { + assert!(vec.iter().all(|&(b, e)| b < e && e <= len)); + for (first, second) in vec.iter().tuple_windows() { + assert!( + first.0 < second.0, + "Slices must be sorted, got {:?} and {:?}", + first, + second + ); + assert!( + first.1 <= second.0, + "Slices must be non-overlapping, got {:?} and {:?}", + first, + second + ); + } + } + /// Create a new [`Mask`] from the intersection of two indices slices. pub fn from_intersection_indices( len: usize, @@ -254,7 +295,7 @@ impl Mask { } #[inline] - // There is good definition of is_empty, does it mean len == 0 or true_count == 0? + // There is no good definition of is_empty, does it mean len == 0 or true_count == 0? #[allow(clippy::len_without_is_empty)] pub fn len(&self) -> usize { self.0.len @@ -328,7 +369,8 @@ impl Mask { let indices = indices .iter() .copied() - .filter(|&idx| offset <= idx && idx < end) + .skip_while(|idx| *idx < offset) + .take_while(|idx| *idx < end) .map(|idx| idx - offset) .collect(); return Self::from_indices(length, indices); @@ -338,10 +380,11 @@ impl Mask { let slices = slices .iter() .copied() - .filter(|(s, e)| *s < end && *e > offset) + .skip_while(|(_, e)| *e <= offset) + .take_while(|(s, _)| *s < end) .map(|(s, e)| (s.max(offset), e.min(end))) .collect(); - return Self::from_slices(length, slices); + return Self::from_slices_unchecked(length, slices); } vortex_panic!("No mask representation found")