-
Notifications
You must be signed in to change notification settings - Fork 24
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[char/range] Add CharRange and CharIter #112
Changes from 16 commits
165cc3d
72b2000
ea8e6d9
56b3249
aee0479
8f0bd86
d1a28f6
641916f
d04365f
f5520de
0e0ba6b
ad6ce50
8153d22
34c01de
61a831c
eb22f35
bf5fabf
6c2d967
7c0c5dc
200e721
cf50802
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
[package] | ||
name = "unic-char-range" | ||
version = "0.5.0" | ||
authors = ["The UNIC Project Developers"] | ||
repository = "https://github.com/behnam/rust-unic/" | ||
license = "MIT/Apache-2.0" | ||
keywords = ["text", "unicode", "iteration"] | ||
description = "UNIC - Unicode Characters - Character Range and Iteration" | ||
categories = ["text-processing"] | ||
|
||
# No tests/benches that depend on /data/ | ||
exclude = [] | ||
|
||
[features] | ||
default = [] | ||
|
||
# Unstable features | ||
unstable = [ "fused", "trusted-len" ] | ||
fused = [] | ||
trusted-len = [] | ||
|
||
|
||
[badges] | ||
travis-ci = { repository = "behnam/rust-unic", branch = "master" } |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
#![feature(test)] | ||
|
||
extern crate test; | ||
extern crate unic_char_range; | ||
|
||
use std::char; | ||
use unic_char_range::CharRange; | ||
|
||
#[bench] | ||
fn forward_iteration(b: &mut test::Bencher) { | ||
b.iter(|| CharRange::all().count()) | ||
} | ||
|
||
#[bench] | ||
fn forward_iteration_baseline(b: &mut test::Bencher) { | ||
b.iter(|| (0..0x110000).filter_map(char::from_u32).count()) | ||
} | ||
|
||
#[bench] | ||
fn reverse_iteration(b: &mut test::Bencher) { | ||
b.iter(|| CharRange::all().rev().count()) | ||
} | ||
|
||
#[bench] | ||
fn reverse_iteration_baseline(b: &mut test::Bencher) { | ||
b.iter(|| (0..0x110000).rev().filter_map(char::from_u32).count()) | ||
} | ||
|
||
#[bench] | ||
fn range_length(b: &mut test::Bencher) { | ||
b.iter(|| CharRange::all().len()) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
//! # Unic - Char - Range | ||
//! | ||
//! A simple way to control iteration over a range of characters. | ||
//! | ||
//! # Examples | ||
//! | ||
//! ``` | ||
//! # #[macro_use] extern crate unic_char_range; | ||
//! # use unic_char_range::*; | ||
//! # fn main() { | ||
//! for character in chars!('a'..='z') { | ||
//! // character is each character in the lowercase english alphabet in order | ||
//! } | ||
//! | ||
//! for character in chars!(..) { | ||
//! // character is every valid char from lowest codepoint to highest | ||
//! } | ||
//! # } | ||
//! ``` | ||
//! | ||
//! # Features | ||
//! | ||
//! None of these features are included by default; they rely on unstable Rust feature gates. | ||
//! | ||
//! - `unstable`: enables all features | ||
//! - `fused`: impl the [`FusedIterator`] contract | ||
//! - `trusted-len`: impl the [`TrustedLen`] contract | ||
//! | ||
//! [`FusedIterator`]: https://doc.rust-lang.org/std/iter/trait.FusedIterator.html | ||
//! [`TrustedLen`]: https://doc.rust-lang.org/std/iter/trait.TrustedLen.html | ||
//! | ||
#![forbid(bad_style, missing_debug_implementations, unconditional_recursion)] | ||
#![deny(missing_docs, unsafe_code, unused, future_incompatible)] | ||
#![cfg_attr(feature = "fused", feature(fused))] | ||
#![cfg_attr(feature = "trusted-len", feature(trusted_len))] | ||
|
||
mod macros; | ||
mod range; | ||
mod step; | ||
|
||
pub use range::CharRange; |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
/// Convenience macro for the initialization of `CharRange`s.
///
/// # Syntax
///
/// ```
/// # #[macro_use] extern crate unic_char_range;
/// # fn main() {
/// chars!('a'..'z'); // Iterate the half open range including 'a' and excluding 'z'
/// chars!('a'..='z'); // Iterate the closed range including 'a' and including 'z'
/// chars!(..); // Iterate all characters
/// # }
/// ```
///
/// `chars!('a'..='z')` and `chars!(..)` expand to a plain struct literal, so they are
/// constant expressions and can be used where such are required, such as in the
/// initialization of constant data structures. `chars!('a'..'z')` expands to a call to
/// `CharRange::open_right` instead.
///
/// Note that because an `expr` capture cannot be followed by a `..`/`..=`,
/// this macro captures token trees. This means that if you want to pass more than one token,
/// you must parenthesize it (e.g. `chars!('\0' ..= (char::MAX))`).
#[macro_export]
macro_rules! chars {
    // Half-open: the upper bound is excluded.
    ( $low:tt .. $high:tt ) => ( $crate::CharRange::open_right($low, $high) );
    // Closed: both bounds are included, so the fields can be filled in directly.
    ( $low:tt ..= $high:tt ) => ( $crate::CharRange { low: $low, high: $high } );
    // Everything: delegate to the closed form so `char::MAX` is included.
    ( .. ) => ( chars!( '\0' ..= (::std::char::MAX) ) );
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,203 @@ | ||
use std::char; | ||
use std::collections::Bound; | ||
use std::ops::Range; | ||
use step; | ||
|
||
const SURROGATE_RANGE: Range<u32> = 0xD800..0xE000; | ||
|
||
/// A range of unicode code points.
///
/// The members of this struct are public for const initialization by `chars!(..=)` only.
/// They should be considered unstable private API that may change at any time.
/// If you decide to use them anyway, make sure to read the safety notes on the fields.
///
/// # Equality
///
/// A range is empty when `low > high`. All empty ranges compare equal to each other,
/// whatever values their fields happen to hold; non-empty ranges compare field by field.
#[derive(Copy, Clone, Debug, Eq)]
pub struct CharRange {
    /// The lowest uniterated character (inclusive).
    ///
    /// Iteration is finished if this is higher than `high`.
    ///
    /// # Safety
    ///
    /// This is not guaranteed to always be a valid character. Check before using!
    /// Note that `high` _is_ guaranteed to be a valid character,
    /// so this will always be a valid character when iteration is not yet finished.
    #[doc(hidden)]
    pub low: char,

    /// The highest uniterated character (inclusive).
    ///
    /// Iteration is finished if this is lower than `low`.
    #[doc(hidden)]
    pub high: char,
}

impl PartialEq for CharRange {
    /// Compare two ranges by the characters they still contain.
    ///
    /// The derived implementation would call two empty ranges unequal whenever their
    /// fields differ, even though neither yields any character.
    fn eq(&self, other: &CharRange) -> bool {
        let self_is_empty = self.low > self.high;
        let other_is_empty = other.low > other.high;
        (self_is_empty && other_is_empty) || (self.low == other.low && self.high == other.high)
    }
}
|
||
/// Constructors | ||
impl CharRange { | ||
/// Construct a closed range of characters. | ||
pub fn closed(start: char, stop: char) -> CharRange { | ||
CharRange { | ||
low: start, | ||
high: stop, | ||
} | ||
} | ||
|
||
/// Construct a half open (right) range of characters. | ||
pub fn open_right(start: char, stop: char) -> CharRange { | ||
let mut range = CharRange::closed(start, stop); | ||
range.step_backward(); | ||
range | ||
} | ||
|
||
/// Construct a half open (left) range of characters. | ||
pub fn open_left(start: char, stop: char) -> CharRange { | ||
let mut range = CharRange::closed(start, stop); | ||
range.step_forward(); | ||
range | ||
} | ||
|
||
/// Construct a fully open range of characters. | ||
pub fn open(start: char, stop: char) -> CharRange { | ||
let mut range = CharRange::closed(start, stop); | ||
range.step_forward(); | ||
range.step_backward(); | ||
range | ||
} | ||
|
||
/// Construct a range of characters from bounds. | ||
pub fn bound(mut start: Bound<char>, mut stop: Bound<char>) -> CharRange { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a very good way to keep a generic API here. ❤️ But, why do you expect the input to be
I guess this is the kind of issue we usually catch with unit tests. Want to add one for this? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Of note: you can pass an immutable owned value here, because it's taken by (copied) value. But this formulation makes more sense than what I have. 👍 |
||
if start == Bound::Unbounded { | ||
start = Bound::Included('\0'); | ||
} | ||
if stop == Bound::Unbounded { | ||
stop = Bound::Included(char::MAX); | ||
} | ||
match (start, stop) { | ||
(Bound::Included(start), Bound::Included(stop)) => CharRange::closed(start, stop), | ||
(Bound::Excluded(start), Bound::Excluded(stop)) => CharRange::open(start, stop), | ||
(Bound::Included(start), Bound::Excluded(stop)) => CharRange::open_right(start, stop), | ||
(Bound::Excluded(start), Bound::Included(stop)) => CharRange::open_left(start, stop), | ||
(Bound::Unbounded, _) | (_, Bound::Unbounded) => unreachable!(), | ||
} | ||
} | ||
|
||
/// Construct a range over all characters. | ||
pub fn all() -> CharRange { | ||
CharRange::closed('\0', char::MAX) | ||
} | ||
} | ||
|
||
impl CharRange { | ||
#[inline] | ||
#[allow(unsafe_code)] | ||
// It is always safe to step `self.low` forward because | ||
// `self.low` will only be used when less than `self.high`. | ||
fn step_forward(&mut self) { | ||
self.low = unsafe { step::forward(self.low) } | ||
} | ||
|
||
#[inline] | ||
#[allow(unsafe_code)] | ||
// When stepping `self.high` backward would cause underflow, | ||
// step `self.low` forward instead. It will have the same effect -- | ||
// consuming the last element from the iterator and ending iteration. | ||
fn step_backward(&mut self) { | ||
if self.high == '\0' { | ||
self.step_forward(); | ||
} else { | ||
self.high = unsafe { step::backward(self.high) } | ||
} | ||
} | ||
} | ||
|
||
impl CharRange { | ||
/// Does this range include a character? | ||
/// | ||
/// # Examples | ||
/// | ||
/// ``` | ||
/// # use unic_char_range::CharRange; | ||
/// assert!( CharRange::closed('a', 'g').contains('d')); | ||
/// assert!( ! CharRange::closed('a', 'g').contains('z')); | ||
/// | ||
/// assert!( ! CharRange:: open ('a', 'a').contains('a')); | ||
/// assert!( ! CharRange::closed('z', 'a').contains('g')); | ||
/// ``` | ||
pub fn contains(&self, ch: char) -> bool { | ||
self.low <= ch && ch <= self.high | ||
} | ||
} | ||
|
||
impl Iterator for CharRange { | ||
type Item = char; | ||
|
||
#[inline] | ||
fn next(&mut self) -> Option<char> { | ||
if self.low > self.high { | ||
return None; | ||
} | ||
|
||
let ch = self.low; | ||
self.step_forward(); | ||
Some(ch) | ||
} | ||
|
||
fn size_hint(&self) -> (usize, Option<usize>) { | ||
let len = self.len(); | ||
(len, Some(len)) | ||
} | ||
|
||
fn last(self) -> Option<char> { | ||
if self.low > self.high { | ||
None | ||
} else { | ||
Some(self.high) | ||
} | ||
} | ||
|
||
fn max(self) -> Option<char> { | ||
self.last() | ||
} | ||
|
||
fn min(mut self) -> Option<char> { | ||
self.next() | ||
} | ||
} | ||
|
||
impl DoubleEndedIterator for CharRange { | ||
#[inline] | ||
fn next_back(&mut self) -> Option<Self::Item> { | ||
if self.low > self.high { | ||
return None; | ||
} | ||
|
||
let ch = self.high; | ||
self.step_backward(); | ||
Some(ch) | ||
} | ||
} | ||
|
||
impl ExactSizeIterator for CharRange { | ||
fn len(&self) -> usize { | ||
if self.low > self.high { | ||
return 0; | ||
} | ||
let start = self.low as u32; | ||
let end = self.high as u32; | ||
let naive_len = self.high as usize - self.low as usize + 1; | ||
if start <= SURROGATE_RANGE.start && SURROGATE_RANGE.end <= end { | ||
naive_len - SURROGATE_RANGE.len() | ||
} else { | ||
naive_len | ||
} | ||
} | ||
} | ||
|
||
#[cfg(any(feature = "fused", feature = "trusted-len"))] | ||
use std::iter; | ||
|
||
// Opt-in marker impl, built only when the `fused` feature is enabled. | ||
// `next` and `next_back` both return `None` whenever `low > high`. | ||
#[cfg(feature = "fused")] | ||
impl iter::FusedIterator for CharRange {} | ||
|
||
// Opt-in marker impl, built only when the `trusted-len` feature is enabled. | ||
// `unsafe_code` is denied crate-wide, so the `unsafe impl` needs this explicit allowance. | ||
// `size_hint` reports the value of `len()` as both its lower and upper bound. | ||
#[allow(unsafe_code)] | ||
#[cfg(feature = "trusted-len")] | ||
unsafe impl iter::TrustedLen for CharRange {} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
use std::char; | ||
|
||
const BEFORE_SURROGATE: char = '\u{D7FF}'; | ||
const AFTER_SURROGATE: char = '\u{E000}'; | ||
|
||
#[inline] | ||
#[allow(unsafe_code)] | ||
/// Step a character one step towards `char::MAX`. | ||
/// | ||
/// # Safety | ||
/// | ||
/// If the given character is `char::MAX`, the return value is not a valid character. | ||
pub unsafe fn forward(ch: char) -> char { | ||
if ch == BEFORE_SURROGATE { | ||
AFTER_SURROGATE | ||
} else { | ||
char::from_u32_unchecked(ch as u32 + 1) | ||
} | ||
} | ||
|
||
#[inline] | ||
#[allow(unsafe_code)] | ||
/// Step a character one step towards `'\0'`. | ||
/// | ||
/// # Safety | ||
/// | ||
/// If the given character is `'\0'`, this will cause an underflow. | ||
/// (Thus, it will panic in debug mode, undefined behavior in release mode.) | ||
pub unsafe fn backward(ch: char) -> char { | ||
if ch == AFTER_SURROGATE { | ||
BEFORE_SURROGATE | ||
} else { | ||
char::from_u32_unchecked(ch as u32 - 1) | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think derived `PartialEq` doesn't work for us here, because any case with `high < low` is considered empty for us, but `eq()` would return false depending on the numbers.
For rustc `RangeInclusive`, there's been talk about having an `enum` for the type with an `Empty` variant. I don't think we want that here, but we need to define emptiness well.
Also a reminder that `CharRange` also needs an `is_empty()`, so that call sites are not forced to write `.len() == 0`.
Another matter with the empty case is the open question of whether either of `low` and `high` matters in equality. IMHO, it doesn't matter, and we can have this:
What do you think?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I actually did a bit of reading on `RangeInclusive` while making this, and I think the current design that's been landed on is having just two fields -- a start and a stop -- and creating a `Step` fn to make a known-ended state.
And yes, it makes sense that two empty ranges, no matter their internal state, should compare the same.