Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[char/range] Add CharRange and CharIter #112

Merged
merged 21 commits into from
Aug 13, 2017
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions unic/char/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ travis-ci = { repository = "behnam/rust-unic", branch = "master" }

[dependencies]
unic-char-property = { path = "property/", version = "0.5.0" }
unic-char-range = { path = "range/", version = "0.5.0" }
24 changes: 24 additions & 0 deletions unic/char/range/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
[package]
name = "unic-char-range"
version = "0.5.0"
authors = ["The UNIC Project Developers"]
repository = "https://github.com/behnam/rust-unic/"
license = "MIT/Apache-2.0"
keywords = ["text", "unicode", "iteration"]
description = "UNIC - Unicode Characters - Character Range and Iteration"
categories = ["text-processing"]

# No tests/benches that depend on /data/
exclude = []

[features]
default = []

# Unstable features
unstable = [ "fused", "trusted-len" ]
fused = []
trusted-len = []


[badges]
travis-ci = { repository = "behnam/rust-unic", branch = "master" }
32 changes: 32 additions & 0 deletions unic/char/range/benches/benchmarks.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#![feature(test)]

extern crate test;
extern crate unic_char_range;

use std::char;
use unic_char_range::CharRange;

// Measures walking `CharRange::all()` front to back by counting every item it yields.
#[bench]
fn forward_iteration(b: &mut test::Bencher) {
b.iter(|| CharRange::all().count())
}

// Reference point for `forward_iteration`: counts the values in `0..0x110000`
// for which `char::from_u32` returns `Some`.
#[bench]
fn forward_iteration_baseline(b: &mut test::Bencher) {
b.iter(|| (0..0x110000).filter_map(char::from_u32).count())
}

// Measures walking `CharRange::all()` back to front (via `rev()`) by counting every item.
#[bench]
fn reverse_iteration(b: &mut test::Bencher) {
b.iter(|| CharRange::all().rev().count())
}

// Reference point for `reverse_iteration`: same filter as the forward baseline,
// but over the reversed integer range.
#[bench]
fn reverse_iteration_baseline(b: &mut test::Bencher) {
b.iter(|| (0..0x110000).rev().filter_map(char::from_u32).count())
}

// Measures `len()` on the full range, which is computed without iterating.
#[bench]
fn range_length(b: &mut test::Bencher) {
b.iter(|| CharRange::all().len())
}
41 changes: 41 additions & 0 deletions unic/char/range/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
//! # Unic - Char - Range
//!
//! A simple way to control iteration over a range of characters.
//!
//! # Examples
//!
//! ```
//! # #[macro_use] extern crate unic_char_range;
//! # use unic_char_range::*;
//! # fn main() {
//! for character in chars!('a'..='z') {
//! // character is each character in the lowercase English alphabet in order
//! }
//!
//! for character in chars!(..) {
//! // character is every valid char from lowest codepoint to highest
//! }
//! # }
//! ```
//!
//! # Features
//!
//! None of these features are included by default; they rely on unstable Rust feature gates.
//!
//! - `unstable`: enables all features
//! - `fused`: impl the [`FusedIterator`] contract
//! - `trusted-len`: impl the [`TrustedLen`] contract
//!
//! [`FusedIterator`]: https://doc.rust-lang.org/std/iter/trait.FusedIterator.html
//! [`TrustedLen`]: https://doc.rust-lang.org/std/iter/trait.TrustedLen.html
//!
#![forbid(bad_style, missing_debug_implementations, unconditional_recursion)]
#![deny(missing_docs, unsafe_code, unused, future_incompatible)]
#![cfg_attr(feature = "fused", feature(fused))]
#![cfg_attr(feature = "trusted-len", feature(trusted_len))]

mod macros;
mod range;
mod step;

pub use range::CharRange;
25 changes: 25 additions & 0 deletions unic/char/range/src/macros.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#[macro_export]
/// Convenience macro for the initialization of `CharRange`s.
///
/// # Syntax
///
/// ```
/// # #[macro_use] extern crate unic_char_range;
/// # fn main() {
/// chars!('a'..'z'); // Iterate the half open range including 'a' and excluding 'z'
/// chars!('a'..='z'); // Iterate the closed range including 'a' and including 'z'
/// chars!(..); // Iterate all characters
/// # }
/// ```
///
/// `chars!('a'..='z')` and `chars!(..)` expand to a plain struct literal, so they are constant
/// expressions and can be used where such are required, such as in the initialization of
/// constant data structures. `chars!('a'..'z')` expands to a call to `CharRange::open_right`,
/// which is not a `const fn`, so it cannot be used in those positions.
///
/// Note that because an `expr` capture cannot be followed by a `..`/`..=`,
/// this macro captures token trees. This means that if you want to pass more than one token,
/// you must parenthesize it (e.g. `chars!('\0' ..= (char::MAX))`).
macro_rules! chars {
    // Half open: the upper bound is excluded, so let `open_right` step it back.
    ( $low:tt .. $high:tt ) => ( $crate::CharRange::open_right($low, $high) );
    // Closed: both bounds are included, which is exactly what the fields store.
    ( $low:tt ..= $high:tt ) => ( $crate::CharRange { low: $low, high: $high } );
    // Everything: the closed range from the lowest to the highest `char`.
    ( .. ) => ( chars!( '\0' ..= (::std::char::MAX) ) );
}
203 changes: 203 additions & 0 deletions unic/char/range/src/range.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
use std::char;
use std::collections::Bound;
use std::ops::Range;
use step;

// Code points in this half-open range are surrogates, which are never valid `char` values.
// `len` subtracts the size of this range when a `CharRange` spans the whole gap.
const SURROGATE_RANGE: Range<u32> = 0xD800..0xE000;

/// A range of unicode code points.
///
/// The members of this struct are public for const initialization by `chars!(..=)` only.
/// They should be considered unstable private API that may change at any time.
/// If you decide to use them anyway, make sure to note the safety notes.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think derived PartialEq doesn't work for us here, because any case with high < low is considered empty for us, but eq() would return false depending on the numbers.

For rustc RangeInclusive, there's been a talk about having an enum for the type with an Empty variant. I don't think we want that here, but need to well-define emptiness.

Also a reminder that CharRange also needs an is_empty(), to not enforce .len() == 0 to call-sites.

Another matter with empty case is the open question of if either of low and high matter in equality. IMHO, it doesn't matter, and we can have this:

fn eq(&self, other: Self) -> bool {
  if self.is_empty() {
    other.is_empty()
  } else {
    self.low == other.low && self.high == other.high
  }
}

What do you think?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually did a bit of reading on RangeInclusive while making this and I think the current design that's been landed on is having just two fields--a start and a stop--and creating Step fn to make a known-ended state.

And yes, it makes sense that two empty ranges no matter their internal state should compare the same.

pub struct CharRange {
/// The lowest uniterated character (inclusive).
///
/// Iteration is finished if this is higher than `high`.
///
/// # Safety
///
/// This is not guaranteed to always be a valid character. Check before using!
/// Stepping forward from `char::MAX` leaves a value here that is not a valid `char`.
/// Note that `high` _is_ guaranteed to be a valid character,
/// so this will always be a valid character when iteration is not yet finished.
#[doc(hidden)]
pub low: char,

/// The highest uniterated character (inclusive).
///
/// Iteration is finished if this is lower than `low`.
///
/// This is never stepped below `'\0'`; when that would be required,
/// `low` is stepped forward instead to mark the range as finished.
#[doc(hidden)]
pub high: char,
}

/// Constructors
impl CharRange {
/// Construct a closed range of characters.
///
/// Both `start` and `stop` are part of the range.
pub fn closed(start: char, stop: char) -> CharRange {
    let (low, high) = (start, stop);
    CharRange { low, high }
}

/// Construct a half open (right) range of characters.
pub fn open_right(start: char, stop: char) -> CharRange {
let mut range = CharRange::closed(start, stop);
range.step_backward();
range
}

/// Construct a half open (left) range of characters.
pub fn open_left(start: char, stop: char) -> CharRange {
let mut range = CharRange::closed(start, stop);
range.step_forward();
range
}

/// Construct a fully open range of characters.
pub fn open(start: char, stop: char) -> CharRange {
let mut range = CharRange::closed(start, stop);
range.step_forward();
range.step_backward();
range
}

/// Construct a range of characters from bounds.
pub fn bound(mut start: Bound<char>, mut stop: Bound<char>) -> CharRange {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a very good way to keep a generic API here. ❤️

But, why do you expect the input to be mut? I think what you need is to take a readonly bound, and construct a value inside:

let start = if start == Bound::Unbounded { Bound::Included('\0') } else { start };

I guess this is the kind of issues we usually catch with unit tests. Want to add one for this?

Copy link
Collaborator Author

@CAD97 CAD97 Aug 12, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Of note: you can pass an immutable owned value here, because it's taken by (copied) value.

But this formulation makes more sense than what I have. 👍

// Replace unbounded ends with the inclusive extremes of `char`,
// so that only `Included`/`Excluded` remain below.
if start == Bound::Unbounded {
start = Bound::Included('\0');
}
if stop == Bound::Unbounded {
stop = Bound::Included(char::MAX);
}
// Pick the constructor matching the inclusive/exclusive combination of the two ends.
match (start, stop) {
(Bound::Included(start), Bound::Included(stop)) => CharRange::closed(start, stop),
(Bound::Excluded(start), Bound::Excluded(stop)) => CharRange::open(start, stop),
(Bound::Included(start), Bound::Excluded(stop)) => CharRange::open_right(start, stop),
(Bound::Excluded(start), Bound::Included(stop)) => CharRange::open_left(start, stop),
// Both ends were rewritten above if they were `Unbounded`.
(Bound::Unbounded, _) | (_, Bound::Unbounded) => unreachable!(),
}
}

/// Construct a range over all characters.
pub fn all() -> CharRange {
CharRange::closed('\0', char::MAX)
}
}

impl CharRange {
    // Move the lower end of the range up by one character.
    //
    // The stepped value may not be a valid character (see `step::forward`).
    // That is tolerated because the iterator methods compare `self.low`
    // against `self.high` before they hand `self.low` out.
    #[inline]
    #[allow(unsafe_code)]
    fn step_forward(&mut self) {
        let stepped = unsafe { step::forward(self.low) };
        self.low = stepped;
    }

    // Move the upper end of the range down by one character.
    //
    // `self.high` cannot go below `'\0'` without underflowing, so in that
    // case the lower end is stepped forward instead. The effect is the same:
    // the last element is consumed and the range becomes empty.
    #[inline]
    #[allow(unsafe_code)]
    fn step_backward(&mut self) {
        if self.high != '\0' {
            self.high = unsafe { step::backward(self.high) };
            return;
        }
        self.step_forward();
    }
}

impl CharRange {
    /// Check whether a character lies within this range.
    ///
    /// A range whose lower end is above its upper end contains nothing.
    ///
    /// # Examples
    ///
    /// ```
    /// # use unic_char_range::CharRange;
    /// assert!( CharRange::closed('a', 'g').contains('d'));
    /// assert!( ! CharRange::closed('a', 'g').contains('z'));
    ///
    /// assert!( ! CharRange:: open ('a', 'a').contains('a'));
    /// assert!( ! CharRange::closed('z', 'a').contains('g'));
    /// ```
    pub fn contains(&self, ch: char) -> bool {
        let below = ch < self.low;
        let above = ch > self.high;
        !(below || above)
    }
}

impl Iterator for CharRange {
    type Item = char;

    /// Yield the lowest remaining character and advance the lower end.
    #[inline]
    fn next(&mut self) -> Option<char> {
        if self.low <= self.high {
            let current = self.low;
            self.step_forward();
            Some(current)
        } else {
            None
        }
    }

    /// The exact number of remaining characters, as both lower and upper bound.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.len();
        (remaining, Some(remaining))
    }

    /// The final character is the upper end, so no iteration is needed.
    fn last(self) -> Option<char> {
        if self.low <= self.high {
            Some(self.high)
        } else {
            None
        }
    }

    /// Characters come out in ascending order, so the maximum is the last one.
    fn max(self) -> Option<char> {
        self.last()
    }

    /// Characters come out in ascending order, so the minimum is the first one.
    fn min(mut self) -> Option<char> {
        self.next()
    }
}

impl DoubleEndedIterator for CharRange {
    /// Yield the highest remaining character and retreat the upper end.
    #[inline]
    fn next_back(&mut self) -> Option<Self::Item> {
        if self.low <= self.high {
            let current = self.high;
            self.step_backward();
            Some(current)
        } else {
            None
        }
    }
}

impl ExactSizeIterator for CharRange {
    /// Number of characters left in the range, computed without iterating.
    fn len(&self) -> usize {
        let (first, last) = (self.low as u32, self.high as u32);
        if first > last {
            // Finished (or constructed empty).
            return 0;
        }

        // Count of code points between the two ends, both inclusive.
        let span = self.high as usize - self.low as usize + 1;

        // Surrogates are not characters, so they must not be counted
        // when the range stretches across the whole surrogate gap.
        let covers_surrogates = first <= SURROGATE_RANGE.start && last >= SURROGATE_RANGE.end;
        if covers_surrogates {
            span - SURROGATE_RANGE.len()
        } else {
            span
        }
    }
}

#[cfg(any(feature = "fused", feature = "trusted-len"))]
use std::iter;

// Once `low > high`, `next` returns `None` without modifying the range,
// so every later call returns `None` as well.
#[cfg(feature = "fused")]
impl iter::FusedIterator for CharRange {}

// `size_hint` reports `(len, Some(len))`, with `len` taken from `ExactSizeIterator::len`.
// NOTE(review): soundness rests on `len` being exact for every reachable state — confirm.
#[allow(unsafe_code)]
#[cfg(feature = "trusted-len")]
unsafe impl iter::TrustedLen for CharRange {}
35 changes: 35 additions & 0 deletions unic/char/range/src/step.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
use std::char;

// The last character below and the first character above the surrogate gap.
// Stepping jumps directly from one to the other.
const BEFORE_SURROGATE: char = '\u{D7FF}';
const AFTER_SURROGATE: char = '\u{E000}';

#[inline]
#[allow(unsafe_code)]
/// Step a character one step towards `char::MAX`.
///
/// # Safety
///
/// If the given character is `char::MAX`, the return value is not a valid character.
pub unsafe fn forward(ch: char) -> char {
if ch == BEFORE_SURROGATE {
AFTER_SURROGATE
} else {
char::from_u32_unchecked(ch as u32 + 1)
}
}

#[inline]
#[allow(unsafe_code)]
/// Step a character one step towards `'\0'`.
///
/// # Safety
///
/// If the given character is `'\0'`, this will cause an underflow.
/// (Thus, it will panic in debug mode, undefined behavior in release mode.)
pub unsafe fn backward(ch: char) -> char {
if ch == AFTER_SURROGATE {
BEFORE_SURROGATE
} else {
char::from_u32_unchecked(ch as u32 - 1)
}
}
Loading