Skip to content

Commit

Permalink
feat: small mask improvements (#2035)
Browse files Browse the repository at this point in the history
fixes #2030
  • Loading branch information
lwwmanning authored Jan 21, 2025
1 parent 64b6087 commit a091281
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 9 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion vortex-mask/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "vortex-mask"
description = "Vortex Mask - sorted, unique, positive integers"
description = "Vortex Mask - sorted, unique, non-negative integers"
version.workspace = true
homepage.workspace = true
repository.workspace = true
Expand All @@ -15,6 +15,7 @@ categories.workspace = true

[dependencies]
arrow-buffer = { workspace = true }
itertools = { workspace = true }
vortex-error = { workspace = true }

[lints]
Expand Down
16 changes: 15 additions & 1 deletion vortex-mask/src/intersect_by_rank.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,24 @@
use crate::Mask;

impl Mask {
/// take the intersection of the `mask` with the set of true values in `self`.
/// Take the intersection of the `mask` with the set of true values in `self`.
///
/// We are more interested in low selectivity `self` (as indices) with a boolean buffer mask,
/// so we don't optimize for other cases, yet.
///
/// # Examples
///
/// Keep the third and fifth set values from mask `m1`:
/// ```
/// use vortex_mask::Mask;
///
/// let m1 = Mask::from_iter([true, false, false, true, true, true, false, true]);
/// let m2 = Mask::from_iter([false, false, true, false, true]);
/// assert_eq!(
/// m1.intersect_by_rank(&m2),
/// Mask::from_iter([false, false, false, false, true, false, false, true])
/// );
/// ```
pub fn intersect_by_rank(&self, mask: &Mask) -> Mask {
assert_eq!(self.true_count(), mask.len());

Expand Down
57 changes: 50 additions & 7 deletions vortex-mask/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use std::cmp::Ordering;
use std::sync::{Arc, OnceLock};

use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder};
use itertools::Itertools;
use vortex_error::vortex_panic;

/// If the mask selects more than this fraction of rows, iterate over slices instead of indices.
Expand All @@ -31,6 +32,7 @@ struct Inner {
// Pre-computed values.
len: usize,
true_count: usize,
// i.e., the fraction of values that are true
selectivity: f64,
}

Expand All @@ -51,6 +53,7 @@ impl Inner {
// TODO(ngates): for dense indices, we can do better by collecting into u64s.
buf.append_n(self.len, false);
indices.iter().for_each(|idx| buf.set_bit(*idx, true));
debug_assert_eq!(buf.len(), self.len);
return BooleanBuffer::from(buf);
}

Expand Down Expand Up @@ -85,12 +88,16 @@ impl Inner {
if let Some(buffer) = self.buffer.get() {
let mut indices = Vec::with_capacity(self.true_count);
indices.extend(buffer.set_indices());
debug_assert!(indices.is_sorted());
assert_eq!(indices.len(), self.true_count);
return indices;
}

if let Some(slices) = self.slices.get() {
let mut indices = Vec::with_capacity(self.true_count);
indices.extend(slices.iter().flat_map(|(start, end)| *start..*end));
debug_assert!(indices.is_sorted());
assert_eq!(indices.len(), self.true_count);
return indices;
}

Expand All @@ -99,6 +106,7 @@ impl Inner {
}

/// Constructs a slices vector from one of the other representations.
#[allow(clippy::cast_possible_truncation)]
fn slices(&self) -> &[(usize, usize)] {
self.slices.get_or_init(|| {
if self.true_count == self.len {
Expand All @@ -110,7 +118,10 @@ impl Inner {
}

if let Some(indices) = self.indices.get() {
let mut slices = Vec::with_capacity(self.true_count); // Upper bound
// Expected number of contiguous slices assuming a uniform distribution of true values.
let expected_num_slices =
(self.selectivity * (self.len - self.true_count + 1) as f64).round() as usize;
let mut slices = Vec::with_capacity(expected_num_slices);
let mut iter = indices.iter().copied();

// Handle empty input
Expand Down Expand Up @@ -200,7 +211,11 @@ impl Mask {
/// Create a new [`Mask`] from a [`Vec<usize>`].
pub fn from_indices(len: usize, vec: Vec<usize>) -> Self {
let true_count = vec.len();
assert!(vec.iter().all(|&idx| idx < len));
assert!(vec.is_sorted(), "Mask indices must be sorted");
assert!(
vec.last().is_none_or(|&idx| idx < len),
"Mask indices must be in bounds (len={len})"
);
Self(Arc::new(Inner {
buffer: Default::default(),
indices: OnceLock::from(vec),
Expand All @@ -214,7 +229,14 @@ impl Mask {
/// Create a new [`Mask`] from a [`Vec<(usize, usize)>`] where each range
/// represents a contiguous range of true values.
pub fn from_slices(len: usize, vec: Vec<(usize, usize)>) -> Self {
assert!(vec.iter().all(|&(b, e)| b < e && e <= len));
Self::check_slices(len, &vec);
Self::from_slices_unchecked(len, vec)
}

fn from_slices_unchecked(len: usize, vec: Vec<(usize, usize)>) -> Self {
#[cfg(debug_assertions)]
Self::check_slices(len, &vec);

let true_count = vec.iter().map(|(b, e)| e - b).sum();
Self(Arc::new(Inner {
buffer: Default::default(),
Expand All @@ -226,6 +248,25 @@ impl Mask {
}))
}

#[inline(always)]
fn check_slices(len: usize, vec: &[(usize, usize)]) {
assert!(vec.iter().all(|&(b, e)| b < e && e <= len));
for (first, second) in vec.iter().tuple_windows() {
assert!(
first.0 < second.0,
"Slices must be sorted, got {:?} and {:?}",
first,
second
);
assert!(
first.1 <= second.0,
"Slices must be non-overlapping, got {:?} and {:?}",
first,
second
);
}
}

/// Create a new [`Mask`] from the intersection of two indices slices.
pub fn from_intersection_indices(
len: usize,
Expand Down Expand Up @@ -254,7 +295,7 @@ impl Mask {
}

#[inline]
// There is good definition of is_empty, does it mean len == 0 or true_count == 0?
// There is no good definition of is_empty, does it mean len == 0 or true_count == 0?
#[allow(clippy::len_without_is_empty)]
pub fn len(&self) -> usize {
self.0.len
Expand Down Expand Up @@ -328,7 +369,8 @@ impl Mask {
let indices = indices
.iter()
.copied()
.filter(|&idx| offset <= idx && idx < end)
.skip_while(|idx| *idx < offset)
.take_while(|idx| *idx < end)
.map(|idx| idx - offset)
.collect();
return Self::from_indices(length, indices);
Expand All @@ -338,10 +380,11 @@ impl Mask {
let slices = slices
.iter()
.copied()
.filter(|(s, e)| *s < end && *e > offset)
.skip_while(|(_, e)| *e <= offset)
.take_while(|(s, _)| *s < end)
.map(|(s, e)| (s.max(offset), e.min(end)))
.collect();
return Self::from_slices(length, slices);
return Self::from_slices_unchecked(length, slices);
}

vortex_panic!("No mask representation found")
Expand Down

0 comments on commit a091281

Please sign in to comment.