From d68eb3d24843d2e269989563d45ceda920391fe0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marvin=20L=C3=B6bel?= Date: Wed, 28 Jan 2015 02:19:40 +0100 Subject: [PATCH 1/9] Added benchmarks for string pattern matching functions --- src/libcoretest/str.rs | 107 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/src/libcoretest/str.rs b/src/libcoretest/str.rs index 375564c39bb5b..308c5282a92a3 100644 --- a/src/libcoretest/str.rs +++ b/src/libcoretest/str.rs @@ -121,3 +121,110 @@ fn test_utf16_code_units() { assert_eq!(Utf16Encoder::new(vec!['é', '\u{1F4A9}'].into_iter()).collect::>(), vec![0xE9, 0xD83D, 0xDCA9]) } + + +// rm x86_64-unknown-linux-gnu/stage1/test/coretesttest-x86_64-unknown-linux-gnu; env PLEASE_BENCH=1 make check-stage1-coretest TESTNAME=str::bench + +mod bench { + macro_rules! make_test_inner { + ($s:ident, $code:expr, $name:ident, $str:expr) => { + #[bench] + fn $name(bencher: &mut Bencher) { + let mut $s = $str; + black_box(&mut $s); + bencher.iter(|| $code); + } + } + } + + macro_rules! make_test { + ($name:ident, $s:ident, $code:expr) => { + mod $name { + use test::Bencher; + use test::black_box; + + // Short strings: 65 bytes each + make_test_inner!($s, $code, short_ascii, + "Mary had a little lamb, Little lamb Mary had a littl lamb, lamb!"); + make_test_inner!($s, $code, short_mixed, + "ศไทย中华Việt Nam; Mary had a little lamb, Little lam!"); + make_test_inner!($s, $code, short_pile_of_poo, + "💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩💩!"); + make_test_inner!($s, $code, long_lorem_ipsum,"\ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Suspendisse quis lorem sit amet dolor \ +ultricies condimentum. Praesent iaculis purus elit, ac malesuada quam malesuada in. Duis sed orci \ +eros. Suspendisse sit amet magna mollis, mollis nunc luctus, imperdiet mi. Integer fringilla non \ +sem ut lacinia. Fusce varius tortor a risus porttitor hendrerit. Morbi mauris dui, ultricies nec \ +tempus vel, gravida nec quam. 
+ +In est dui, tincidunt sed tempus interdum, adipiscing laoreet ante. Etiam tempor, tellus quis \ +sagittis interdum, nulla purus mattis sem, quis auctor erat odio ac tellus. In nec nunc sit amet \ +diam volutpat molestie at sed ipsum. Vestibulum laoreet consequat vulputate. Integer accumsan \ +lorem ac dignissim placerat. Suspendisse convallis faucibus lorem. Aliquam erat volutpat. In vel \ +eleifend felis. Sed suscipit nulla lorem, sed mollis est sollicitudin et. Nam fermentum egestas \ +interdum. Curabitur ut nisi justo. + +Sed sollicitudin ipsum tellus, ut condimentum leo eleifend nec. Cras ut velit ante. Phasellus nec \ +mollis odio. Mauris molestie erat in arcu mattis, at aliquet dolor vehicula. Quisque malesuada \ +lectus sit amet nisi pretium, a condimentum ipsum porta. Morbi at dapibus diam. Praesent egestas \ +est sed risus elementum, eu rutrum metus ultrices. Etiam fermentum consectetur magna, id rutrum \ +felis accumsan a. Aliquam ut pellentesque libero. Sed mi nulla, lobortis eu tortor id, suscipit \ +ultricies neque. Morbi iaculis sit amet risus at iaculis. Praesent eget ligula quis turpis \ +feugiat suscipit vel non arcu. Interdum et malesuada fames ac ante ipsum primis in faucibus. \ +Aliquam sit amet placerat lorem. + +Cras a lacus vel ante posuere elementum. Nunc est leo, bibendum ut facilisis vel, bibendum at \ +mauris. Nullam adipiscing diam vel odio ornare, luctus adipiscing mi luctus. Nulla facilisi. \ +Mauris adipiscing bibendum neque, quis adipiscing lectus tempus et. Sed feugiat erat et nisl \ +lobortis pharetra. Donec vitae erat enim. Nullam sit amet felis et quam lacinia tincidunt. Aliquam \ +suscipit dapibus urna. Sed volutpat urna in magna pulvinar volutpat. Phasellus nec tellus ac diam \ +cursus accumsan. + +Nam lectus enim, dapibus non nisi tempor, consectetur convallis massa. Maecenas eleifend dictum \ +feugiat. Etiam quis mauris vel risus luctus mattis a a nunc. Nullam orci quam, imperdiet id \ +vehicula in, porttitor ut nibh. 
Duis sagittis adipiscing nisl vitae congue. Donec mollis risus eu \ +leo suscipit, varius porttitor nulla porta. Pellentesque ut sem nec nisi euismod vehicula. Nulla \ +malesuada sollicitudin quam eu fermentum!"); + } + } + } + + make_test!(chars_count, s, s.chars().count()); + + make_test!(contains_bang_str, s, s.contains("!")); + make_test!(contains_bang_char, s, s.contains_char('!')); + + make_test!(match_indices_a_str, s, s.match_indices("a").count()); + + make_test!(split_str_a_str, s, s.split_str("a").count()); + + make_test!(trim_ascii_char, s, { + use std::ascii::AsciiExt; + s.trim_matches(|&mut: c: char| c.is_ascii()) + }); + make_test!(trim_left_ascii_char, s, { + use std::ascii::AsciiExt; + s.trim_left_matches(|&mut: c: char| c.is_ascii()) + }); + make_test!(trim_right_ascii_char, s, { + use std::ascii::AsciiExt; + s.trim_right_matches(|&mut: c: char| c.is_ascii()) + }); + + make_test!(find_underscore_char, s, s.find('_')); + make_test!(rfind_underscore_char, s, s.rfind('_')); + make_test!(find_underscore_str, s, s.find_str("_")); + + make_test!(find_zzz_char, s, s.find('\u{1F4A4}')); + make_test!(rfind_zzz_char, s, s.rfind('\u{1F4A4}')); + make_test!(find_zzz_str, s, s.find_str("\u{1F4A4}")); + + make_test!(split_space_char, s, s.split(' ').count()); + make_test!(split_terminator_space_char, s, s.split_terminator(' ').count()); + + make_test!(splitn_space_char, s, s.splitn(10, ' ').count()); + make_test!(rsplitn_space_char, s, s.rsplitn(10, ' ').count()); + + make_test!(split_str_space_str, s, s.split_str(" ").count()); + make_test!(split_str_ad_str, s, s.split_str("ad").count()); +} From 54f0bead8158eaf948c93d1cae93b60978937417 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marvin=20L=C3=B6bel?= Date: Tue, 30 Dec 2014 21:54:17 +0100 Subject: [PATCH 2/9] Added string pattern traits and basic implementantions --- src/libcore/str/mod.rs | 295 +++++++++++++++++++++---------------- src/libcore/str/pattern.rs | 113 ++++++++++++++ src/libcoretest/str.rs | 7 + 3 
files changed, 289 insertions(+), 126 deletions(-) create mode 100644 src/libcore/str/pattern.rs diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs index eec997b9f10fc..fb0c4c4f34f8a 100644 --- a/src/libcore/str/mod.rs +++ b/src/libcore/str/mod.rs @@ -36,12 +36,16 @@ use result::Result::{self, Ok, Err}; use slice::{self, SliceExt}; use usize; +pub use self::pattern::{Pattern, Matcher, ReverseMatcher, DoubleEndedMatcher}; + +mod pattern; + macro_rules! delegate_iter { (exact $te:ty : $ti:ty) => { delegate_iter!{$te : $ti} impl<'a> ExactSizeIterator for $ti { #[inline] - fn len(&self) -> usize { + fn len(&self) -> uint { self.0.len() } } @@ -56,7 +60,7 @@ macro_rules! delegate_iter { self.0.next() } #[inline] - fn size_hint(&self) -> (usize, Option) { + fn size_hint(&self) -> (uint, Option) { self.0.size_hint() } } @@ -78,7 +82,7 @@ macro_rules! delegate_iter { self.0.next() } #[inline] - fn size_hint(&self) -> (usize, Option) { + fn size_hint(&self) -> (uint, Option) { self.0.size_hint() } } @@ -100,7 +104,7 @@ macro_rules! delegate_iter { self.0.next() } #[inline] - fn size_hint(&self) -> (usize, Option) { + fn size_hint(&self) -> (uint, Option) { self.0.size_hint() } } @@ -149,6 +153,7 @@ impl FromStr for bool { /// An error returned when parsing a `bool` from a string fails. #[derive(Debug, Clone, PartialEq)] +#[allow(missing_copy_implementations)] #[stable(feature = "rust1", since = "1.0.0")] pub struct ParseBoolError { _priv: () } @@ -178,7 +183,7 @@ pub enum Utf8Error { /// The offset is guaranteed to be in bounds of the slice in question, and /// the byte at the specified offset was the first invalid byte in the /// sequence detected. - InvalidByte(usize), + InvalidByte(uint), /// The byte slice was invalid because more bytes were needed but no more /// bytes were available. 
@@ -227,7 +232,7 @@ pub unsafe fn from_utf8_unchecked<'a>(v: &'a [u8]) -> &'a str { pub unsafe fn from_c_str(s: *const i8) -> &'static str { let s = s as *const u8; let mut len = 0; - while *s.offset(len as isize) != 0 { + while *s.offset(len as int) != 0 { len += 1; } let v: &'static [u8] = ::mem::transmute(Slice { data: s, len: len }); @@ -250,7 +255,7 @@ impl CharEq for char { fn matches(&mut self, c: char) -> bool { *self == c } #[inline] - fn only_ascii(&self) -> bool { (*self as u32) < 128 } + fn only_ascii(&self) -> bool { (*self as uint) < 128 } } impl CharEq for F where F: FnMut(char) -> bool { @@ -383,7 +388,7 @@ impl<'a> Iterator for Chars<'a> { } #[inline] - fn size_hint(&self) -> (usize, Option) { + fn size_hint(&self) -> (uint, Option) { let (len, _) = self.iter.size_hint(); (len.saturating_add(3) / 4, Some(len)) } @@ -428,16 +433,16 @@ impl<'a> DoubleEndedIterator for Chars<'a> { #[derive(Clone)] #[stable(feature = "rust1", since = "1.0.0")] pub struct CharIndices<'a> { - front_offset: usize, + front_offset: uint, iter: Chars<'a>, } #[stable(feature = "rust1", since = "1.0.0")] impl<'a> Iterator for CharIndices<'a> { - type Item = (usize, char); + type Item = (uint, char); #[inline] - fn next(&mut self) -> Option<(usize, char)> { + fn next(&mut self) -> Option<(uint, char)> { let (pre_len, _) = self.iter.iter.size_hint(); match self.iter.next() { None => None, @@ -451,7 +456,7 @@ impl<'a> Iterator for CharIndices<'a> { } #[inline] - fn size_hint(&self) -> (usize, Option) { + fn size_hint(&self) -> (uint, Option) { self.iter.size_hint() } } @@ -459,7 +464,7 @@ impl<'a> Iterator for CharIndices<'a> { #[stable(feature = "rust1", since = "1.0.0")] impl<'a> DoubleEndedIterator for CharIndices<'a> { #[inline] - fn next_back(&mut self) -> Option<(usize, char)> { + fn next_back(&mut self) -> Option<(uint, char)> { match self.iter.next_back() { None => None, Some(ch) => { @@ -512,7 +517,7 @@ struct CharSplits<'a, Sep> { struct CharSplitsN<'a, Sep> { iter: 
CharSplits<'a, Sep>, /// The number of splits remaining - count: usize, + count: uint, invert: bool, } @@ -636,7 +641,7 @@ impl<'a, Sep: CharEq> Iterator for CharSplitsN<'a, Sep> { /// within a larger string using naive search #[derive(Clone)] struct NaiveSearcher { - position: usize + position: uint } impl NaiveSearcher { @@ -644,7 +649,7 @@ impl NaiveSearcher { NaiveSearcher { position: 0 } } - fn next(&mut self, haystack: &[u8], needle: &[u8]) -> Option<(usize, usize)> { + fn next(&mut self, haystack: &[u8], needle: &[u8]) -> Option<(uint, uint)> { while self.position + needle.len() <= haystack.len() { if &haystack[self.position .. self.position + needle.len()] == needle { let match_pos = self.position; @@ -663,13 +668,13 @@ impl NaiveSearcher { #[derive(Clone)] struct TwoWaySearcher { // constants - crit_pos: usize, - period: usize, + crit_pos: uint, + period: uint, byteset: u64, // variables - position: usize, - memory: usize + position: uint, + memory: uint } /* @@ -756,7 +761,7 @@ impl TwoWaySearcher { // This isn't in the original algorithm, as far as I'm aware. let byteset = needle.iter() - .fold(0, |a, &b| (1 << ((b & 0x3f) as usize)) | a); + .fold(0, |a, &b| (1 << ((b & 0x3f) as uint)) | a); // A particularly readable explanation of what's going on here can be found // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically @@ -794,8 +799,7 @@ impl TwoWaySearcher { // How far we can jump when we encounter a mismatch is all based on the fact // that (u, v) is a critical factorization for the needle. 
#[inline] - fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) - -> Option<(usize, usize)> { + fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> Option<(uint, uint)> { 'search: loop { // Check that we have room to search in if self.position + needle.len() > haystack.len() { @@ -805,7 +809,7 @@ impl TwoWaySearcher { // Quickly skip by large portions unrelated to our substring if (self.byteset >> ((haystack[self.position + needle.len() - 1] & 0x3f) - as usize)) & 1 == 0 { + as uint)) & 1 == 0 { self.position += needle.len(); if !long_period { self.memory = 0; @@ -852,7 +856,7 @@ impl TwoWaySearcher { // Specifically, returns (i, p), where i is the starting index of v in some // critical factorization (u, v) and p = period(v) #[inline] - fn maximal_suffix(arr: &[u8], reversed: bool) -> (usize, usize) { + fn maximal_suffix(arr: &[u8], reversed: bool) -> (uint, uint) { let mut left = -1; // Corresponds to i in the paper let mut right = 0; // Corresponds to j in the paper let mut offset = 1; // Corresponds to k in the paper @@ -897,6 +901,7 @@ impl TwoWaySearcher { /// within a larger string using a dynamically chosen search algorithm #[derive(Clone)] enum Searcher { + EmptyNeedle { pos: usize, done: bool }, Naive(NaiveSearcher), TwoWay(TwoWaySearcher), TwoWayLong(TwoWaySearcher) @@ -904,11 +909,16 @@ enum Searcher { impl Searcher { fn new(haystack: &[u8], needle: &[u8]) -> Searcher { + if needle.len() == 0 { + Searcher::EmptyNeedle { + pos: 0, + done: false + } // FIXME: Tune this. // FIXME(#16715): This unsigned integer addition will probably not // overflow because that would mean that the memory almost solely // consists of the needle. Needs #16715 to be formally fixed. 
- if needle.len() + 20 > haystack.len() { + } else if needle.len() + 20 > haystack.len() { Naive(NaiveSearcher::new()) } else { let searcher = TwoWaySearcher::new(needle); @@ -938,23 +948,37 @@ pub struct MatchIndices<'a> { #[unstable(feature = "core", reason = "type may be removed")] pub struct SplitStr<'a> { it: MatchIndices<'a>, - last_end: usize, + last_end: uint, finished: bool } #[stable(feature = "rust1", since = "1.0.0")] impl<'a> Iterator for MatchIndices<'a> { - type Item = (usize, usize); + type Item = (uint, uint); #[inline] - fn next(&mut self) -> Option<(usize, usize)> { + fn next(&mut self) -> Option<(uint, uint)> { match self.searcher { Naive(ref mut searcher) => searcher.next(self.haystack.as_bytes(), self.needle.as_bytes()), TwoWay(ref mut searcher) => searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), false), TwoWayLong(ref mut searcher) - => searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), true) + => searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), true), + Searcher::EmptyNeedle { ref mut pos, ref mut done } => { + if !*done { + let r = Some((*pos, *pos)); + if *pos == self.haystack.len() { + *done = true; + } else { + use char::CharExt; + *pos += self.haystack.char_at(*pos).len_utf8(); + } + r + } else { + None + } + } } } } @@ -994,7 +1018,7 @@ Section: Comparing strings fn eq_slice_(a: &str, b: &str) -> bool { // NOTE: In theory n should be libc::size_t and not usize, but libc is not available here #[allow(improper_ctypes)] - extern { fn memcmp(s1: *const i8, s2: *const i8, n: usize) -> i32; } + extern { fn memcmp(s1: *const i8, s2: *const i8, n: uint) -> i32; } a.len() == b.len() && unsafe { memcmp(a.as_ptr() as *const i8, b.as_ptr() as *const i8, @@ -1051,7 +1075,7 @@ fn run_utf8_validation_iterator(iter: &mut slice::Iter) // ASCII characters are always valid, so only large // bytes need more examination. 
if first >= 128 { - let w = UTF8_CHAR_WIDTH[first as usize] as usize; + let w = UTF8_CHAR_WIDTH[first as uint] as uint; let second = next!(); // 2-byte encoding is for codepoints \u{0080} to \u{07ff} // first C2 80 last DF BF @@ -1126,7 +1150,7 @@ pub struct CharRange { /// Current `char` pub ch: char, /// Index of the first byte of the next `char` - pub next: usize, + pub next: uint, } /// Mask of the value bits of a continuation byte @@ -1211,10 +1235,10 @@ mod traits { /// // &s[3 .. 100]; /// ``` #[stable(feature = "rust1", since = "1.0.0")] - impl ops::Index> for str { + impl ops::Index> for str { type Output = str; #[inline] - fn index(&self, index: &ops::Range) -> &str { + fn index(&self, index: &ops::Range) -> &str { // is_char_boundary checks that the index is in [0, .len()] if index.start <= index.end && self.is_char_boundary(index.start) && @@ -1234,10 +1258,10 @@ mod traits { /// Panics when `end` does not point to a valid character, or is /// out of bounds. #[stable(feature = "rust1", since = "1.0.0")] - impl ops::Index> for str { + impl ops::Index> for str { type Output = str; #[inline] - fn index(&self, index: &ops::RangeTo) -> &str { + fn index(&self, index: &ops::RangeTo) -> &str { // is_char_boundary checks that the index is in [0, .len()] if self.is_char_boundary(index.end) { unsafe { self.slice_unchecked(0, index.end) } @@ -1254,10 +1278,10 @@ mod traits { /// Panics when `begin` does not point to a valid character, or is /// out of bounds. 
#[stable(feature = "rust1", since = "1.0.0")] - impl ops::Index> for str { + impl ops::Index> for str { type Output = str; #[inline] - fn index(&self, index: &ops::RangeFrom) -> &str { + fn index(&self, index: &ops::RangeFrom) -> &str { // is_char_boundary checks that the index is in [0, .len()] if self.is_char_boundary(index.start) { unsafe { self.slice_unchecked(index.start, self.len()) } @@ -1328,46 +1352,49 @@ pub trait StrExt { // NB there are no docs here are they're all located on the StrExt trait in // libcollections, not here. - fn contains(&self, pat: &str) -> bool; - fn contains_char(&self, pat: P) -> bool; + fn contains<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool; + fn contains_char<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool; fn chars<'a>(&'a self) -> Chars<'a>; fn bytes<'a>(&'a self) -> Bytes<'a>; fn char_indices<'a>(&'a self) -> CharIndices<'a>; fn split<'a, P: CharEq>(&'a self, pat: P) -> Split<'a, P>; - fn splitn<'a, P: CharEq>(&'a self, count: usize, pat: P) -> SplitN<'a, P>; + fn splitn<'a, P: CharEq>(&'a self, count: uint, pat: P) -> SplitN<'a, P>; fn split_terminator<'a, P: CharEq>(&'a self, pat: P) -> SplitTerminator<'a, P>; - fn rsplitn<'a, P: CharEq>(&'a self, count: usize, pat: P) -> RSplitN<'a, P>; + fn rsplitn<'a, P: CharEq>(&'a self, count: uint, pat: P) -> RSplitN<'a, P>; fn match_indices<'a>(&'a self, sep: &'a str) -> MatchIndices<'a>; fn split_str<'a>(&'a self, pat: &'a str) -> SplitStr<'a>; fn lines<'a>(&'a self) -> Lines<'a>; fn lines_any<'a>(&'a self) -> LinesAny<'a>; - fn char_len(&self) -> usize; - fn slice_chars<'a>(&'a self, begin: usize, end: usize) -> &'a str; - unsafe fn slice_unchecked<'a>(&'a self, begin: usize, end: usize) -> &'a str; + fn char_len(&self) -> uint; + fn slice_chars<'a>(&'a self, begin: uint, end: uint) -> &'a str; + unsafe fn slice_unchecked<'a>(&'a self, begin: uint, end: uint) -> &'a str; fn starts_with(&self, pat: &str) -> bool; fn ends_with(&self, pat: &str) -> bool; - fn trim_matches<'a, P: 
CharEq>(&'a self, pat: P) -> &'a str; - fn trim_left_matches<'a, P: CharEq>(&'a self, pat: P) -> &'a str; - fn trim_right_matches<'a, P: CharEq>(&'a self, pat: P) -> &'a str; - fn is_char_boundary(&self, index: usize) -> bool; - fn char_range_at(&self, start: usize) -> CharRange; - fn char_range_at_reverse(&self, start: usize) -> CharRange; - fn char_at(&self, i: usize) -> char; - fn char_at_reverse(&self, i: usize) -> char; + fn trim_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str + where P::Matcher: DoubleEndedMatcher<'a>; + fn trim_left_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str; + fn trim_right_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str + where P::Matcher: ReverseMatcher<'a>; + fn is_char_boundary(&self, index: uint) -> bool; + fn char_range_at(&self, start: uint) -> CharRange; + fn char_range_at_reverse(&self, start: uint) -> CharRange; + fn char_at(&self, i: uint) -> char; + fn char_at_reverse(&self, i: uint) -> char; fn as_bytes<'a>(&'a self) -> &'a [u8]; - fn find(&self, pat: P) -> Option; - fn rfind(&self, pat: P) -> Option; - fn find_str(&self, pat: &str) -> Option; + fn find<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option; + fn rfind<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option + where P::Matcher: ReverseMatcher<'a>; + fn find_str(&self, pat: &str) -> Option; fn slice_shift_char<'a>(&'a self) -> Option<(char, &'a str)>; - fn subslice_offset(&self, inner: &str) -> usize; + fn subslice_offset(&self, inner: &str) -> uint; fn as_ptr(&self) -> *const u8; - fn len(&self) -> usize; + fn len(&self) -> uint; fn is_empty(&self) -> bool; fn parse(&self) -> Result; } #[inline(never)] -fn slice_error_fail(s: &str, begin: usize, end: usize) -> ! { +fn slice_error_fail(s: &str, begin: uint, end: uint) -> ! { assert!(begin <= end); panic!("index {} and/or {} in `{}` do not lie on character boundary", begin, end, s); @@ -1375,13 +1402,13 @@ fn slice_error_fail(s: &str, begin: usize, end: usize) -> ! 
{ impl StrExt for str { #[inline] - fn contains(&self, needle: &str) -> bool { - self.find_str(needle).is_some() + fn contains<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool { + pat.is_contained_in(self) } #[inline] - fn contains_char(&self, pat: P) -> bool { - self.find(pat).is_some() + fn contains_char<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool { + pat.is_contained_in(self) } #[inline] @@ -1411,7 +1438,7 @@ impl StrExt for str { } #[inline] - fn splitn(&self, count: usize, pat: P) -> SplitN

{ + fn splitn(&self, count: uint, pat: P) -> SplitN

{ SplitN(CharSplitsN { iter: self.split(pat).0, count: count, @@ -1428,7 +1455,7 @@ impl StrExt for str { } #[inline] - fn rsplitn(&self, count: usize, pat: P) -> RSplitN

{ + fn rsplitn(&self, count: uint, pat: P) -> RSplitN

{ RSplitN(CharSplitsN { iter: self.split(pat).0, count: count, @@ -1438,7 +1465,6 @@ impl StrExt for str { #[inline] fn match_indices<'a>(&'a self, sep: &'a str) -> MatchIndices<'a> { - assert!(!sep.is_empty()); MatchIndices { haystack: self, needle: sep, @@ -1472,9 +1498,9 @@ impl StrExt for str { } #[inline] - fn char_len(&self) -> usize { self.chars().count() } + fn char_len(&self) -> uint { self.chars().count() } - fn slice_chars(&self, begin: usize, end: usize) -> &str { + fn slice_chars(&self, begin: uint, end: uint) -> &str { assert!(begin <= end); let mut count = 0; let mut begin_byte = None; @@ -1498,9 +1524,9 @@ impl StrExt for str { } #[inline] - unsafe fn slice_unchecked(&self, begin: usize, end: usize) -> &str { + unsafe fn slice_unchecked(&self, begin: uint, end: uint) -> &str { mem::transmute(Slice { - data: self.as_ptr().offset(begin as isize), + data: self.as_ptr().offset(begin as int), len: end - begin, }) } @@ -1518,41 +1544,71 @@ impl StrExt for str { } #[inline] - fn trim_matches(&self, mut pat: P) -> &str { - let cur = match self.find(|c: char| !pat.matches(c)) { - None => "", - Some(i) => unsafe { self.slice_unchecked(i, self.len()) } - }; - match cur.rfind(|c: char| !pat.matches(c)) { - None => "", - Some(i) => { - let right = cur.char_range_at(i).next; - unsafe { cur.slice_unchecked(0, right) } + fn trim_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str + where P::Matcher: DoubleEndedMatcher<'a> { + let mut i = 0; + let mut matcher = pat.into_matcher(self); + let mut possible_end_match = None; + while let Some((a, b)) = Matcher::next(&mut matcher) { + if a == i { + i = b; + } else { + possible_end_match = Some((a, b)); + break; + } + } + let mut j = self.len(); + while let Some((a, b)) = ReverseMatcher::next_back(&mut matcher) + .or_else(|| possible_end_match.take()) { + if b == j { + j = a; + } else { + break; } } + unsafe { + // Matcher is known to return valid indices + self.slice_unchecked(i, j) + } } #[inline] - fn 
trim_left_matches(&self, mut pat: P) -> &str { - match self.find(|c: char| !pat.matches(c)) { - None => "", - Some(first) => unsafe { self.slice_unchecked(first, self.len()) } + fn trim_left_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &str { + let mut i = 0; + let mut matcher = pat.into_matcher(self); + while let Some((a, b)) = Matcher::next(&mut matcher) { + if a == i { + i = b; + } else { + break; + } + } + unsafe { + // Matcher is known to return valid indices + self.slice_unchecked(i, self.len()) } } #[inline] - fn trim_right_matches(&self, mut pat: P) -> &str { - match self.rfind(|c: char| !pat.matches(c)) { - None => "", - Some(last) => { - let next = self.char_range_at(last).next; - unsafe { self.slice_unchecked(0, next) } + fn trim_right_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &str + where P::Matcher: ReverseMatcher<'a> { + let mut i = self.len(); + let mut matcher = pat.into_matcher(self); + while let Some((a, b)) = ReverseMatcher::next_back(&mut matcher) { + if b == i { + i = a; + } else { + break; } } + unsafe { + // Matcher is known to return valid indices + self.slice_unchecked(0, i) + } } #[inline] - fn is_char_boundary(&self, index: usize) -> bool { + fn is_char_boundary(&self, index: uint) -> bool { if index == self.len() { return true; } match self.as_bytes().get(index) { None => false, @@ -1561,13 +1617,13 @@ impl StrExt for str { } #[inline] - fn char_range_at(&self, i: usize) -> CharRange { + fn char_range_at(&self, i: uint) -> CharRange { let (c, n) = char_range_at_raw(self.as_bytes(), i); CharRange { ch: unsafe { mem::transmute(c) }, next: n } } #[inline] - fn char_range_at_reverse(&self, start: usize) -> CharRange { + fn char_range_at_reverse(&self, start: uint) -> CharRange { let mut prev = start; prev = prev.saturating_sub(1); @@ -1576,14 +1632,14 @@ impl StrExt for str { } // Multibyte case is a fn to allow char_range_at_reverse to inline cleanly - fn multibyte_char_range_at_reverse(s: &str, mut i: usize) -> CharRange { + fn 
multibyte_char_range_at_reverse(s: &str, mut i: uint) -> CharRange { // while there is a previous byte == 10...... while i > 0 && s.as_bytes()[i] & !CONT_MASK == TAG_CONT_U8 { i -= 1; } let mut val = s.as_bytes()[i] as u32; - let w = UTF8_CHAR_WIDTH[val as usize] as usize; + let w = UTF8_CHAR_WIDTH[val as uint] as uint; assert!((w != 0)); val = utf8_first_byte!(val, w); @@ -1598,12 +1654,12 @@ impl StrExt for str { } #[inline] - fn char_at(&self, i: usize) -> char { + fn char_at(&self, i: uint) -> char { self.char_range_at(i).ch } #[inline] - fn char_at_reverse(&self, i: usize) -> char { + fn char_at_reverse(&self, i: uint) -> char { self.char_range_at_reverse(i).ch } @@ -1612,29 +1668,16 @@ impl StrExt for str { unsafe { mem::transmute(self) } } - fn find(&self, mut pat: P) -> Option { - if pat.only_ascii() { - self.bytes().position(|b| pat.matches(b as char)) - } else { - for (index, c) in self.char_indices() { - if pat.matches(c) { return Some(index); } - } - None - } + fn find<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option { + Matcher::next(&mut pat.into_matcher(self)).map(|(i, _)| i) } - fn rfind(&self, mut pat: P) -> Option { - if pat.only_ascii() { - self.bytes().rposition(|b| pat.matches(b as char)) - } else { - for (index, c) in self.char_indices().rev() { - if pat.matches(c) { return Some(index); } - } - None - } + fn rfind<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option + where P::Matcher: ReverseMatcher<'a> { + ReverseMatcher::next_back(&mut pat.into_matcher(self)).map(|(i, _)| i) } - fn find_str(&self, needle: &str) -> Option { + fn find_str(&self, needle: &str) -> Option { if needle.is_empty() { Some(0) } else { @@ -1655,10 +1698,10 @@ impl StrExt for str { } } - fn subslice_offset(&self, inner: &str) -> usize { - let a_start = self.as_ptr() as usize; + fn subslice_offset(&self, inner: &str) -> uint { + let a_start = self.as_ptr() as uint; let a_end = a_start + self.len(); - let b_start = inner.as_ptr() as usize; + let b_start = inner.as_ptr() as 
uint; let b_end = b_start + inner.len(); assert!(a_start <= b_start); @@ -1672,7 +1715,7 @@ impl StrExt for str { } #[inline] - fn len(&self) -> usize { self.repr().len } + fn len(&self) -> uint { self.repr().len } #[inline] fn is_empty(&self) -> bool { self.len() == 0 } @@ -1685,15 +1728,15 @@ impl StrExt for str { /// index of the next code point. #[inline] #[unstable(feature = "core")] -pub fn char_range_at_raw(bytes: &[u8], i: usize) -> (u32, usize) { +pub fn char_range_at_raw(bytes: &[u8], i: uint) -> (u32, usize) { if bytes[i] < 128u8 { return (bytes[i] as u32, i + 1); } // Multibyte case is a fn to allow char_range_at to inline cleanly - fn multibyte_char_range_at(bytes: &[u8], i: usize) -> (u32, usize) { + fn multibyte_char_range_at(bytes: &[u8], i: uint) -> (u32, usize) { let mut val = bytes[i] as u32; - let w = UTF8_CHAR_WIDTH[val as usize] as usize; + let w = UTF8_CHAR_WIDTH[val as uint] as uint; assert!((w != 0)); val = utf8_first_byte!(val, w); @@ -1720,7 +1763,7 @@ impl<'a> Iterator for Lines<'a> { #[inline] fn next(&mut self) -> Option<&'a str> { self.inner.next() } #[inline] - fn size_hint(&self) -> (usize, Option) { self.inner.size_hint() } + fn size_hint(&self) -> (uint, Option) { self.inner.size_hint() } } #[stable(feature = "rust1", since = "1.0.0")] @@ -1736,7 +1779,7 @@ impl<'a> Iterator for LinesAny<'a> { #[inline] fn next(&mut self) -> Option<&'a str> { self.inner.next() } #[inline] - fn size_hint(&self) -> (usize, Option) { self.inner.size_hint() } + fn size_hint(&self) -> (uint, Option) { self.inner.size_hint() } } #[stable(feature = "rust1", since = "1.0.0")] diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs new file mode 100644 index 0000000000000..13b0d1df45ec7 --- /dev/null +++ b/src/libcore/str/pattern.rs @@ -0,0 +1,113 @@ +// Copyright 2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. 
+// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![allow(missing_docs)] + +use prelude::*; +use super::CharEq; + +// Pattern + +pub trait Pattern<'a>: Sized { + type Matcher: Matcher<'a>; + fn into_matcher(self, haystack: &'a str) -> Self::Matcher; + + #[inline] + fn is_contained_in(self, haystack: &'a str) -> bool { + Matcher::next(&mut self.into_matcher(haystack)).is_some() + } +} + +// Matcher + +pub unsafe trait Matcher<'a> { + fn haystack(&self) -> &'a str; + fn next(&mut self) -> Option<(usize, usize)>; +} + +pub unsafe trait ReverseMatcher<'a>: Matcher<'a> { + fn next_back(&mut self) -> Option<(usize, usize)>; +} + +pub trait DoubleEndedMatcher<'a>: ReverseMatcher<'a> {} + +// Impl for CharEq + +struct CharEqMatcher<'a, C>(C, &'a str, super::CharIndices<'a>); + +impl<'a, C: CharEq> Pattern<'a> for C { + type Matcher = CharEqMatcher<'a, C>; + + #[inline] + fn into_matcher(self, haystack: &'a str) -> CharEqMatcher<'a, C> { + CharEqMatcher(self, haystack, haystack.char_indices()) + } +} + +unsafe impl<'a, C: CharEq> Matcher<'a> for CharEqMatcher<'a, C> { + #[inline] + fn haystack(&self) -> &'a str { + self.1 + } + + #[inline] + fn next(&mut self) -> Option<(usize, usize)> { + while let Some((i, c)) = self.2.next() { + if self.0.matches(c) { + return Some((i, i + c.len_utf8())); + } + } + None + } +} + +unsafe impl<'a, C: CharEq> ReverseMatcher<'a> for CharEqMatcher<'a, C> { + #[inline] + fn next_back(&mut self) -> Option<(usize, usize)> { + while let Some((i, c)) = self.2.next_back() { + if self.0.matches(c) { + return Some((i, i + c.len_utf8())); + } + } + None + } +} + +impl<'a, C: CharEq> DoubleEndedMatcher<'a> for CharEqMatcher<'a, C> {} + +// Impl for &str + +struct StrMatcher<'a>(super::MatchIndices<'a>); + +impl<'a> Pattern<'a> for &'a str { + type Matcher = StrMatcher<'a>; + + #[inline] + fn into_matcher(self, 
haystack: &'a str) -> StrMatcher<'a> { + let mi = super::MatchIndices { + haystack: haystack, + needle: self, + searcher: super::Searcher::new(haystack.as_bytes(), self.as_bytes()) + }; + StrMatcher(mi) + } +} + +unsafe impl<'a> Matcher<'a> for StrMatcher<'a> { + #[inline] + fn haystack(&self) -> &'a str { + self.0.haystack + } + + #[inline] + fn next(&mut self) -> Option<(usize, usize)> { + self.0.next() + } +} diff --git a/src/libcoretest/str.rs b/src/libcoretest/str.rs index 308c5282a92a3..ddbec47eeff49 100644 --- a/src/libcoretest/str.rs +++ b/src/libcoretest/str.rs @@ -8,6 +8,13 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. +#[test] +fn test_empty_match_indices() { + let data = "aä中!"; + let vec: Vec<_> = data.match_indices("").collect(); + assert_eq!(vec, vec![(0, 0), (1, 1), (3, 3), (6, 6), (7, 7)]); +} + #[test] fn test_bool_from_str() { assert_eq!("true".parse().ok(), Some(true)); From bc09c1ddc5604642926428d69f2ebd7557b3230b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marvin=20L=C3=B6bel?= Date: Wed, 14 Jan 2015 19:34:39 +0100 Subject: [PATCH 3/9] Made str::MatchIndices a private implementantion detail --- src/libcore/str/mod.rs | 30 ++++++++++++++++++++++-------- src/libcore/str/pattern.rs | 4 ++-- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs index fb0c4c4f34f8a..cbd103ee765a4 100644 --- a/src/libcore/str/mod.rs +++ b/src/libcore/str/mod.rs @@ -931,29 +931,43 @@ impl Searcher { } } -/// An iterator over the start and end indices of the matches of a -/// substring within a larger string #[derive(Clone)] #[unstable(feature = "core", reason = "type may be removed")] -pub struct MatchIndices<'a> { +struct OldMatchIndices<'a> { // constants haystack: &'a str, needle: &'a str, searcher: Searcher } +/// An iterator over the start and end indices of the matches of a +/// substring within a larger string +#[derive(Clone)] +#[unstable(feature = 
"core", reason = "type may be removed")] +pub struct MatchIndices<'a>(OldMatchIndices<'a>); + +#[stable] +impl<'a> Iterator for MatchIndices<'a> { + type Item = (uint, uint); + + #[inline] + fn next(&mut self) -> Option<(uint, uint)> { + self.0.next() + } +} + /// An iterator over the substrings of a string separated by a given /// search string #[derive(Clone)] #[unstable(feature = "core", reason = "type may be removed")] pub struct SplitStr<'a> { - it: MatchIndices<'a>, + it: OldMatchIndices<'a>, last_end: uint, finished: bool } #[stable(feature = "rust1", since = "1.0.0")] -impl<'a> Iterator for MatchIndices<'a> { +impl<'a> Iterator for OldMatchIndices<'a> { type Item = (uint, uint); #[inline] @@ -1465,17 +1479,17 @@ impl StrExt for str { #[inline] fn match_indices<'a>(&'a self, sep: &'a str) -> MatchIndices<'a> { - MatchIndices { + MatchIndices(OldMatchIndices { haystack: self, needle: sep, searcher: Searcher::new(self.as_bytes(), sep.as_bytes()) - } + }) } #[inline] fn split_str<'a>(&'a self, sep: &'a str) -> SplitStr<'a> { SplitStr { - it: self.match_indices(sep), + it: self.match_indices(sep).0, last_end: 0, finished: false } diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index 13b0d1df45ec7..4eff47f382098 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -84,14 +84,14 @@ impl<'a, C: CharEq> DoubleEndedMatcher<'a> for CharEqMatcher<'a, C> {} // Impl for &str -struct StrMatcher<'a>(super::MatchIndices<'a>); +struct StrMatcher<'a>(super::OldMatchIndices<'a>); impl<'a> Pattern<'a> for &'a str { type Matcher = StrMatcher<'a>; #[inline] fn into_matcher(self, haystack: &'a str) -> StrMatcher<'a> { - let mi = super::MatchIndices { + let mi = super::OldMatchIndices { haystack: haystack, needle: self, searcher: super::Searcher::new(haystack.as_bytes(), self.as_bytes()) From 13ea9062a918e3e60d82186135610a575bf92394 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marvin=20L=C3=B6bel?= Date: Wed, 14 Jan 2015 20:45:51 +0100 
Subject: [PATCH 4/9] Made match_indices use the generic pattern API --- src/libcore/str/mod.rs | 52 ++++++++++++++++++++------------------ src/libcore/str/pattern.rs | 11 ++++---- 2 files changed, 34 insertions(+), 29 deletions(-) diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs index cbd103ee765a4..ada9b71211ce0 100644 --- a/src/libcore/str/mod.rs +++ b/src/libcore/str/mod.rs @@ -35,6 +35,7 @@ use raw::{Repr, Slice}; use result::Result::{self, Ok, Err}; use slice::{self, SliceExt}; use usize; +use clone::Clone; pub use self::pattern::{Pattern, Matcher, ReverseMatcher, DoubleEndedMatcher}; @@ -933,41 +934,48 @@ impl Searcher { #[derive(Clone)] #[unstable(feature = "core", reason = "type may be removed")] -struct OldMatchIndices<'a> { +struct OldMatchIndices<'a, 'b> { // constants haystack: &'a str, - needle: &'a str, + needle: &'b str, searcher: Searcher } /// An iterator over the start and end indices of the matches of a /// substring within a larger string -#[derive(Clone)] #[unstable(feature = "core", reason = "type may be removed")] -pub struct MatchIndices<'a>(OldMatchIndices<'a>); +pub struct MatchIndices<'a, P: Pattern<'a>>(P::Matcher); #[stable] -impl<'a> Iterator for MatchIndices<'a> { +impl<'a, P: Pattern<'a>> Iterator for MatchIndices<'a, P> { type Item = (uint, uint); #[inline] fn next(&mut self) -> Option<(uint, uint)> { - self.0.next() + Matcher::next(&mut self.0) } } /// An iterator over the substrings of a string separated by a given /// search string -#[derive(Clone)] #[unstable(feature = "core", reason = "type may be removed")] -pub struct SplitStr<'a> { - it: OldMatchIndices<'a>, +pub struct SplitStr<'a, 'b> { + it: pattern::StrMatcher<'a, 'b>, last_end: uint, finished: bool } +impl<'a, 'b> Clone for SplitStr<'a, 'b> { + fn clone(&self) -> Self { + SplitStr { + it: Clone::clone(&self.it), + ..*self + } + } +} + #[stable(feature = "rust1", since = "1.0.0")] -impl<'a> Iterator for OldMatchIndices<'a> { +impl<'a, 'b> Iterator for 
OldMatchIndices<'a, 'b> { type Item = (uint, uint); #[inline] @@ -998,22 +1006,22 @@ impl<'a> Iterator for OldMatchIndices<'a> { } #[stable(feature = "rust1", since = "1.0.0")] -impl<'a> Iterator for SplitStr<'a> { +impl<'a, 'b> Iterator for SplitStr<'a, 'b> { type Item = &'a str; #[inline] fn next(&mut self) -> Option<&'a str> { if self.finished { return None; } - - match self.it.next() { + let haystack = Matcher::haystack(&self.it); + match Matcher::next(&mut self.it) { Some((from, to)) => { - let ret = Some(&self.it.haystack[self.last_end .. from]); + let ret = Some(&haystack[self.last_end..from]); self.last_end = to; ret } None => { self.finished = true; - Some(&self.it.haystack[self.last_end .. self.it.haystack.len()]) + Some(&haystack[self.last_end..]) } } } @@ -1375,8 +1383,8 @@ pub trait StrExt { fn splitn<'a, P: CharEq>(&'a self, count: uint, pat: P) -> SplitN<'a, P>; fn split_terminator<'a, P: CharEq>(&'a self, pat: P) -> SplitTerminator<'a, P>; fn rsplitn<'a, P: CharEq>(&'a self, count: uint, pat: P) -> RSplitN<'a, P>; - fn match_indices<'a>(&'a self, sep: &'a str) -> MatchIndices<'a>; - fn split_str<'a>(&'a self, pat: &'a str) -> SplitStr<'a>; + fn match_indices<'a, P: Pattern<'a>>(&'a self, pat: P) -> MatchIndices<'a, P>; + fn split_str<'a, 'b>(&'a self, pat: &'b str) -> SplitStr<'a, 'b>; fn lines<'a>(&'a self) -> Lines<'a>; fn lines_any<'a>(&'a self) -> LinesAny<'a>; fn char_len(&self) -> uint; @@ -1478,16 +1486,12 @@ impl StrExt for str { } #[inline] - fn match_indices<'a>(&'a self, sep: &'a str) -> MatchIndices<'a> { - MatchIndices(OldMatchIndices { - haystack: self, - needle: sep, - searcher: Searcher::new(self.as_bytes(), sep.as_bytes()) - }) + fn match_indices<'a, P: Pattern<'a>>(&'a self, pat: P) -> MatchIndices<'a, P> { + MatchIndices(pat.into_matcher(self)) } #[inline] - fn split_str<'a>(&'a self, sep: &'a str) -> SplitStr<'a> { + fn split_str<'a, 'b>(&'a self, sep: &'b str) -> SplitStr<'a, 'b> { SplitStr { it: self.match_indices(sep).0, 
last_end: 0, diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index 4eff47f382098..077c4c8f7b46c 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -84,13 +84,14 @@ impl<'a, C: CharEq> DoubleEndedMatcher<'a> for CharEqMatcher<'a, C> {} // Impl for &str -struct StrMatcher<'a>(super::OldMatchIndices<'a>); +#[derive(Clone)] +pub struct StrMatcher<'a, 'b>(super::OldMatchIndices<'a, 'b>); -impl<'a> Pattern<'a> for &'a str { - type Matcher = StrMatcher<'a>; +impl<'a, 'b> Pattern<'a> for &'b str { + type Matcher = StrMatcher<'a, 'b>; #[inline] - fn into_matcher(self, haystack: &'a str) -> StrMatcher<'a> { + fn into_matcher(self, haystack: &'a str) -> StrMatcher<'a, 'b> { let mi = super::OldMatchIndices { haystack: haystack, needle: self, @@ -100,7 +101,7 @@ impl<'a> Pattern<'a> for &'a str { } } -unsafe impl<'a> Matcher<'a> for StrMatcher<'a> { +unsafe impl<'a, 'b> Matcher<'a> for StrMatcher<'a, 'b> { #[inline] fn haystack(&self) -> &'a str { self.0.haystack From f9ef8cd55512842f2481aac6332dbfb92df58c52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marvin=20L=C3=B6bel?= Date: Tue, 27 Jan 2015 14:09:18 +0100 Subject: [PATCH 5/9] Refactored code into Searcher traits with naive implementations Made the family of Split iterators use the Pattern API Renamed the Matcher traits into Searcher --- src/compiletest/compiletest.rs | 2 +- src/libcollections/str.rs | 4 +- src/libcore/char.rs | 27 +- src/libcore/slice.rs | 4 + src/libcore/str/mod.rs | 566 ++++++++++++++------------------- src/libcore/str/pattern.rs | 257 ++++++++++++--- src/libcoretest/str.rs | 6 +- 7 files changed, 476 insertions(+), 390 deletions(-) diff --git a/src/compiletest/compiletest.rs b/src/compiletest/compiletest.rs index 278ce5565d9fc..30de253fbad42 100644 --- a/src/compiletest/compiletest.rs +++ b/src/compiletest/compiletest.rs @@ -23,7 +23,7 @@ #![feature(env)] #![feature(core)] -#![deny(warnings)] +// #![deny(warnings)] extern crate test; extern crate getopts; diff 
--git a/src/libcollections/str.rs b/src/libcollections/str.rs index ec0a487acdc77..dff331ac62007 100644 --- a/src/libcollections/str.rs +++ b/src/libcollections/str.rs @@ -706,7 +706,7 @@ pub trait StrExt: Index { /// ``` #[unstable(feature = "collections", reason = "might have its iterator type changed")] - fn match_indices<'a>(&'a self, pat: &'a str) -> MatchIndices<'a> { + fn match_indices<'a, 'b>(&'a self, pat: &'b str) -> MatchIndices<'a, &'b str> { core_str::StrExt::match_indices(&self[..], pat) } @@ -723,7 +723,7 @@ pub trait StrExt: Index { /// ``` #[unstable(feature = "collections", reason = "might get removed in the future in favor of a more generic split()")] - fn split_str<'a>(&'a self, pat: &'a str) -> SplitStr<'a> { + fn split_str<'a, 'b>(&'a self, pat: &'b str) -> SplitStr<'a, &'b str> { core_str::StrExt::split_str(&self[..], pat) } diff --git a/src/libcore/char.rs b/src/libcore/char.rs index c45fac1bc9490..8e27ae1cea970 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -22,13 +22,13 @@ use option::Option; use slice::SliceExt; // UTF-8 ranges and tags for encoding characters -static TAG_CONT: u8 = 0b1000_0000u8; -static TAG_TWO_B: u8 = 0b1100_0000u8; -static TAG_THREE_B: u8 = 0b1110_0000u8; -static TAG_FOUR_B: u8 = 0b1111_0000u8; -static MAX_ONE_B: u32 = 0x80u32; -static MAX_TWO_B: u32 = 0x800u32; -static MAX_THREE_B: u32 = 0x10000u32; +const TAG_CONT: u8 = 0b1000_0000u8; +const TAG_TWO_B: u8 = 0b1100_0000u8; +const TAG_THREE_B: u8 = 0b1110_0000u8; +const TAG_FOUR_B: u8 = 0b1111_0000u8; +const MAX_ONE_B: u32 = 0x80u32; +const MAX_TWO_B: u32 = 0x800u32; +const MAX_THREE_B: u32 = 0x10000u32; /* Lu Uppercase_Letter an uppercase letter @@ -398,11 +398,14 @@ impl CharExt for char { #[stable(feature = "rust1", since = "1.0.0")] fn len_utf8(self) -> usize { let code = self as u32; - match () { - _ if code < MAX_ONE_B => 1, - _ if code < MAX_TWO_B => 2, - _ if code < MAX_THREE_B => 3, - _ => 4, + if code < MAX_ONE_B { + 1 + } else if code < 
MAX_TWO_B { + 2 + } else if code < MAX_THREE_B { + 3 + } else { + 4 } } diff --git a/src/libcore/slice.rs b/src/libcore/slice.rs index a86da53b372a9..2debcaa581342 100644 --- a/src/libcore/slice.rs +++ b/src/libcore/slice.rs @@ -657,6 +657,8 @@ macro_rules! iterator { fn next(&mut self) -> Option<$elem> { // could be implemented with slices, but this avoids bounds checks unsafe { + ::intrinsics::assume(!self.ptr.is_null()); + ::intrinsics::assume(!self.end.is_null()); if self.ptr == self.end { None } else { @@ -693,6 +695,8 @@ macro_rules! iterator { fn next_back(&mut self) -> Option<$elem> { // could be implemented with slices, but this avoids bounds checks unsafe { + ::intrinsics::assume(!self.ptr.is_null()); + ::intrinsics::assume(!self.end.is_null()); if self.end == self.ptr { None } else { diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs index ada9b71211ce0..bdb3b854fe2c0 100644 --- a/src/libcore/str/mod.rs +++ b/src/libcore/str/mod.rs @@ -16,7 +16,7 @@ #![doc(primitive = "str")] -use self::Searcher::{Naive, TwoWay, TwoWayLong}; +use self::OldSearcher::{TwoWay, TwoWayLong}; use clone::Clone; use cmp::{self, Eq}; @@ -35,9 +35,9 @@ use raw::{Repr, Slice}; use result::Result::{self, Ok, Err}; use slice::{self, SliceExt}; use usize; -use clone::Clone; -pub use self::pattern::{Pattern, Matcher, ReverseMatcher, DoubleEndedMatcher}; +pub use self::pattern::Pattern; +pub use self::pattern::{Searcher, ReverseSearcher, DoubleEndedSearcher, SearchStep}; mod pattern; @@ -46,7 +46,7 @@ macro_rules! delegate_iter { delegate_iter!{$te : $ti} impl<'a> ExactSizeIterator for $ti { #[inline] - fn len(&self) -> uint { + fn len(&self) -> usize { self.0.len() } } @@ -61,7 +61,7 @@ macro_rules! delegate_iter { self.0.next() } #[inline] - fn size_hint(&self) -> (uint, Option) { + fn size_hint(&self) -> (usize, Option) { self.0.size_hint() } } @@ -83,7 +83,7 @@ macro_rules! 
delegate_iter { self.0.next() } #[inline] - fn size_hint(&self) -> (uint, Option) { + fn size_hint(&self) -> (usize, Option) { self.0.size_hint() } } @@ -105,7 +105,7 @@ macro_rules! delegate_iter { self.0.next() } #[inline] - fn size_hint(&self) -> (uint, Option) { + fn size_hint(&self) -> (usize, Option) { self.0.size_hint() } } @@ -184,7 +184,7 @@ pub enum Utf8Error { /// The offset is guaranteed to be in bounds of the slice in question, and /// the byte at the specified offset was the first invalid byte in the /// sequence detected. - InvalidByte(uint), + InvalidByte(usize), /// The byte slice was invalid because more bytes were needed but no more /// bytes were available. @@ -256,7 +256,7 @@ impl CharEq for char { fn matches(&mut self, c: char) -> bool { *self == c } #[inline] - fn only_ascii(&self) -> bool { (*self as uint) < 128 } + fn only_ascii(&self) -> bool { (*self as usize) < 128 } } impl CharEq for F where F: FnMut(char) -> bool { @@ -343,6 +343,7 @@ fn unwrap_or_0(opt: Option<&u8>) -> u8 { /// Reads the next code point out of a byte iterator (assuming a /// UTF-8-like encoding). #[unstable(feature = "core")] +#[inline] pub fn next_code_point(bytes: &mut slice::Iter) -> Option { // Decode UTF-8 let x = match bytes.next() { @@ -374,6 +375,38 @@ pub fn next_code_point(bytes: &mut slice::Iter) -> Option { Some(ch) } +/// Reads the last code point out of a byte iterator (assuming a +/// UTF-8-like encoding). 
+#[unstable(feature = "core")] +#[inline] +pub fn next_code_point_reverse(bytes: &mut slice::Iter) -> Option { + // Decode UTF-8 + let w = match bytes.next_back() { + None => return None, + Some(&next_byte) if next_byte < 128 => return Some(next_byte as u32), + Some(&back_byte) => back_byte, + }; + + // Multibyte case follows + // Decode from a byte combination out of: [x [y [z w]]] + let mut ch; + let z = unwrap_or_0(bytes.next_back()); + ch = utf8_first_byte!(z, 2); + if utf8_is_cont_byte!(z) { + let y = unwrap_or_0(bytes.next_back()); + ch = utf8_first_byte!(y, 3); + if utf8_is_cont_byte!(y) { + let x = unwrap_or_0(bytes.next_back()); + ch = utf8_first_byte!(x, 4); + ch = utf8_acc_cont_byte!(ch, y); + } + ch = utf8_acc_cont_byte!(ch, z); + } + ch = utf8_acc_cont_byte!(ch, w); + + Some(ch) +} + #[stable(feature = "rust1", since = "1.0.0")] impl<'a> Iterator for Chars<'a> { type Item = char; @@ -389,7 +422,7 @@ impl<'a> Iterator for Chars<'a> { } #[inline] - fn size_hint(&self) -> (uint, Option) { + fn size_hint(&self) -> (usize, Option) { let (len, _) = self.iter.size_hint(); (len.saturating_add(3) / 4, Some(len)) } @@ -399,33 +432,12 @@ impl<'a> Iterator for Chars<'a> { impl<'a> DoubleEndedIterator for Chars<'a> { #[inline] fn next_back(&mut self) -> Option { - let w = match self.iter.next_back() { - None => return None, - Some(&back_byte) if back_byte < 128 => return Some(back_byte as char), - Some(&back_byte) => back_byte, - }; - - // Multibyte case follows - // Decode from a byte combination out of: [x [y [z w]]] - let mut ch; - let z = unwrap_or_0(self.iter.next_back()); - ch = utf8_first_byte!(z, 2); - if utf8_is_cont_byte!(z) { - let y = unwrap_or_0(self.iter.next_back()); - ch = utf8_first_byte!(y, 3); - if utf8_is_cont_byte!(y) { - let x = unwrap_or_0(self.iter.next_back()); - ch = utf8_first_byte!(x, 4); - ch = utf8_acc_cont_byte!(ch, y); + next_code_point_reverse(&mut self.iter).map(|ch| { + // str invariant says `ch` is a valid Unicode Scalar Value + 
unsafe { + mem::transmute(ch) } - ch = utf8_acc_cont_byte!(ch, z); - } - ch = utf8_acc_cont_byte!(ch, w); - - // str invariant says `ch` is a valid Unicode Scalar Value - unsafe { - Some(mem::transmute(ch)) - } + }) } } @@ -434,16 +446,16 @@ impl<'a> DoubleEndedIterator for Chars<'a> { #[derive(Clone)] #[stable(feature = "rust1", since = "1.0.0")] pub struct CharIndices<'a> { - front_offset: uint, + front_offset: usize, iter: Chars<'a>, } #[stable(feature = "rust1", since = "1.0.0")] impl<'a> Iterator for CharIndices<'a> { - type Item = (uint, char); + type Item = (usize, char); #[inline] - fn next(&mut self) -> Option<(uint, char)> { + fn next(&mut self) -> Option<(usize, char)> { let (pre_len, _) = self.iter.iter.size_hint(); match self.iter.next() { None => None, @@ -457,7 +469,7 @@ impl<'a> Iterator for CharIndices<'a> { } #[inline] - fn size_hint(&self) -> (uint, Option) { + fn size_hint(&self) -> (usize, Option) { self.iter.size_hint() } } @@ -465,7 +477,7 @@ impl<'a> Iterator for CharIndices<'a> { #[stable(feature = "rust1", since = "1.0.0")] impl<'a> DoubleEndedIterator for CharIndices<'a> { #[inline] - fn next_back(&mut self) -> Option<(uint, char)> { + fn next_back(&mut self) -> Option<(usize, char)> { match self.iter.next_back() { None => None, Some(ch) => { @@ -501,24 +513,22 @@ impl<'a> Fn<(&'a u8,)> for BytesDeref { } /// An iterator over the substrings of a string, separated by `sep`. -#[derive(Clone)] -struct CharSplits<'a, Sep> { +struct CharSplits<'a, P: Pattern<'a>> { /// The slice remaining to be iterated - string: &'a str, - sep: Sep, + start: usize, + end: usize, + matcher: P::Searcher, /// Whether an empty string at the end is allowed allow_trailing_empty: bool, - only_ascii: bool, finished: bool, } /// An iterator over the substrings of a string, separated by `sep`, /// splitting at most `count` times. 
-#[derive(Clone)] -struct CharSplitsN<'a, Sep> { - iter: CharSplits<'a, Sep>, +struct CharSplitsN<'a, P: Pattern<'a>> { + iter: CharSplits<'a, P>, /// The number of splits remaining - count: uint, + count: usize, invert: bool, } @@ -534,12 +544,15 @@ pub struct LinesAny<'a> { inner: Map, fn(&str) -> &str>, } -impl<'a, Sep> CharSplits<'a, Sep> { +impl<'a, P: Pattern<'a>> CharSplits<'a, P> { #[inline] fn get_end(&mut self) -> Option<&'a str> { - if !self.finished && (self.allow_trailing_empty || self.string.len() > 0) { + if !self.finished && (self.allow_trailing_empty || self.end - self.start > 0) { self.finished = true; - Some(self.string) + unsafe { + let string = self.matcher.haystack().slice_unchecked(self.start, self.end); + Some(string) + } } else { None } @@ -547,33 +560,18 @@ impl<'a, Sep> CharSplits<'a, Sep> { } #[stable(feature = "rust1", since = "1.0.0")] -impl<'a, Sep: CharEq> Iterator for CharSplits<'a, Sep> { +impl<'a, P: Pattern<'a>> Iterator for CharSplits<'a, P> { type Item = &'a str; #[inline] fn next(&mut self) -> Option<&'a str> { if self.finished { return None } - let mut next_split = None; - if self.only_ascii { - for (idx, byte) in self.string.bytes().enumerate() { - if self.sep.matches(byte as char) && byte < 128u8 { - next_split = Some((idx, idx + 1)); - break; - } - } - } else { - for (idx, ch) in self.string.char_indices() { - if self.sep.matches(ch) { - next_split = Some((idx, self.string.char_range_at(idx).next)); - break; - } - } - } - match next_split { + let haystack = self.matcher.haystack(); + match self.matcher.next_match() { Some((a, b)) => unsafe { - let elt = self.string.slice_unchecked(0, a); - self.string = self.string.slice_unchecked(b, self.string.len()); + let elt = haystack.slice_unchecked(self.start, a); + self.start = b; Some(elt) }, None => self.get_end(), @@ -582,7 +580,8 @@ impl<'a, Sep: CharEq> Iterator for CharSplits<'a, Sep> { } #[stable(feature = "rust1", since = "1.0.0")] -impl<'a, Sep: CharEq> 
DoubleEndedIterator for CharSplits<'a, Sep> { +impl<'a, P: Pattern<'a>> DoubleEndedIterator for CharSplits<'a, P> +where P::Searcher: DoubleEndedSearcher<'a> { #[inline] fn next_back(&mut self) -> Option<&'a str> { if self.finished { return None } @@ -594,31 +593,18 @@ impl<'a, Sep: CharEq> DoubleEndedIterator for CharSplits<'a, Sep> { _ => if self.finished { return None } } } - let len = self.string.len(); - let mut next_split = None; - - if self.only_ascii { - for (idx, byte) in self.string.bytes().enumerate().rev() { - if self.sep.matches(byte as char) && byte < 128u8 { - next_split = Some((idx, idx + 1)); - break; - } - } - } else { - for (idx, ch) in self.string.char_indices().rev() { - if self.sep.matches(ch) { - next_split = Some((idx, self.string.char_range_at(idx).next)); - break; - } - } - } - match next_split { + + let haystack = self.matcher.haystack(); + match self.matcher.next_match_back() { Some((a, b)) => unsafe { - let elt = self.string.slice_unchecked(b, len); - self.string = self.string.slice_unchecked(0, a); + let elt = haystack.slice_unchecked(b, self.end); + self.end = a; Some(elt) }, - None => { self.finished = true; Some(self.string) } + None => unsafe { + self.finished = true; + Some(haystack.slice_unchecked(self.start, self.end)) + }, } } } @@ -638,44 +624,18 @@ impl<'a, Sep: CharEq> Iterator for CharSplitsN<'a, Sep> { } } -/// The internal state of an iterator that searches for matches of a substring -/// within a larger string using naive search -#[derive(Clone)] -struct NaiveSearcher { - position: uint -} - -impl NaiveSearcher { - fn new() -> NaiveSearcher { - NaiveSearcher { position: 0 } - } - - fn next(&mut self, haystack: &[u8], needle: &[u8]) -> Option<(uint, uint)> { - while self.position + needle.len() <= haystack.len() { - if &haystack[self.position .. 
self.position + needle.len()] == needle { - let match_pos = self.position; - self.position += needle.len(); // add 1 for all matches - return Some((match_pos, match_pos + needle.len())); - } else { - self.position += 1; - } - } - None - } -} - /// The internal state of an iterator that searches for matches of a substring /// within a larger string using two-way search #[derive(Clone)] struct TwoWaySearcher { // constants - crit_pos: uint, - period: uint, + crit_pos: usize, + period: usize, byteset: u64, // variables - position: uint, - memory: uint + position: usize, + memory: usize } /* @@ -749,6 +709,7 @@ struct TwoWaySearcher { */ impl TwoWaySearcher { + #[allow(dead_code)] fn new(needle: &[u8]) -> TwoWaySearcher { let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false); let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true); @@ -762,7 +723,7 @@ impl TwoWaySearcher { // This isn't in the original algorithm, as far as I'm aware. let byteset = needle.iter() - .fold(0, |a, &b| (1 << ((b & 0x3f) as uint)) | a); + .fold(0, |a, &b| (1 << ((b & 0x3f) as usize)) | a); // A particularly readable explanation of what's going on here can be found // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically @@ -800,7 +761,7 @@ impl TwoWaySearcher { // How far we can jump when we encounter a mismatch is all based on the fact // that (u, v) is a critical factorization for the needle. 
#[inline] - fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> Option<(uint, uint)> { + fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> Option<(usize, usize)> { 'search: loop { // Check that we have room to search in if self.position + needle.len() > haystack.len() { @@ -810,7 +771,7 @@ impl TwoWaySearcher { // Quickly skip by large portions unrelated to our substring if (self.byteset >> ((haystack[self.position + needle.len() - 1] & 0x3f) - as uint)) & 1 == 0 { + as usize)) & 1 == 0 { self.position += needle.len(); if !long_period { self.memory = 0; @@ -857,7 +818,8 @@ impl TwoWaySearcher { // Specifically, returns (i, p), where i is the starting index of v in some // critical factorization (u, v) and p = period(v) #[inline] - fn maximal_suffix(arr: &[u8], reversed: bool) -> (uint, uint) { + #[allow(dead_code)] + fn maximal_suffix(arr: &[u8], reversed: bool) -> (usize, usize) { let mut left = -1; // Corresponds to i in the paper let mut right = 0; // Corresponds to j in the paper let mut offset = 1; // Corresponds to k in the paper @@ -901,26 +863,24 @@ impl TwoWaySearcher { /// The internal state of an iterator that searches for matches of a substring /// within a larger string using a dynamically chosen search algorithm #[derive(Clone)] -enum Searcher { - EmptyNeedle { pos: usize, done: bool }, - Naive(NaiveSearcher), +enum OldSearcher { TwoWay(TwoWaySearcher), - TwoWayLong(TwoWaySearcher) + TwoWayLong(TwoWaySearcher), } -impl Searcher { - fn new(haystack: &[u8], needle: &[u8]) -> Searcher { +impl OldSearcher { + #[allow(dead_code)] + fn new(haystack: &[u8], needle: &[u8]) -> OldSearcher { if needle.len() == 0 { - Searcher::EmptyNeedle { - pos: 0, - done: false - } + // Handle specially + unimplemented!() // FIXME: Tune this. // FIXME(#16715): This unsigned integer addition will probably not // overflow because that would mean that the memory almost solely // consists of the needle. 
Needs #16715 to be formally fixed. } else if needle.len() + 20 > haystack.len() { - Naive(NaiveSearcher::new()) + // Use naive searcher + unimplemented!() } else { let searcher = TwoWaySearcher::new(needle); if searcher.memory == usize::MAX { // If the period is long @@ -933,101 +893,55 @@ impl Searcher { } #[derive(Clone)] -#[unstable(feature = "core", reason = "type may be removed")] struct OldMatchIndices<'a, 'b> { // constants haystack: &'a str, needle: &'b str, - searcher: Searcher + searcher: OldSearcher } +// FIXME: #21637 Prevents a Clone impl /// An iterator over the start and end indices of the matches of a /// substring within a larger string #[unstable(feature = "core", reason = "type may be removed")] -pub struct MatchIndices<'a, P: Pattern<'a>>(P::Matcher); +pub struct MatchIndices<'a, P: Pattern<'a>>(P::Searcher); -#[stable] +#[stable(feature = "rust1", since = "1.0.0")] impl<'a, P: Pattern<'a>> Iterator for MatchIndices<'a, P> { - type Item = (uint, uint); + type Item = (usize, usize); #[inline] - fn next(&mut self) -> Option<(uint, uint)> { - Matcher::next(&mut self.0) + fn next(&mut self) -> Option<(usize, usize)> { + self.0.next_match() } } /// An iterator over the substrings of a string separated by a given /// search string #[unstable(feature = "core", reason = "type may be removed")] -pub struct SplitStr<'a, 'b> { - it: pattern::StrMatcher<'a, 'b>, - last_end: uint, - finished: bool -} +pub struct SplitStr<'a, P: Pattern<'a>>(Split<'a, P>); +impl<'a, P: Pattern<'a>> Iterator for SplitStr<'a, P> { + type Item = &'a str; -impl<'a, 'b> Clone for SplitStr<'a, 'b> { - fn clone(&self) -> Self { - SplitStr { - it: Clone::clone(&self.it), - ..*self - } + #[inline] + fn next(&mut self) -> Option<&'a str> { + Iterator::next(&mut self.0) } } -#[stable(feature = "rust1", since = "1.0.0")] -impl<'a, 'b> Iterator for OldMatchIndices<'a, 'b> { - type Item = (uint, uint); - +impl<'a, 'b> OldMatchIndices<'a, 'b> { #[inline] - fn next(&mut self) -> 
Option<(uint, uint)> { + #[allow(dead_code)] + fn next(&mut self) -> Option<(usize, usize)> { match self.searcher { - Naive(ref mut searcher) - => searcher.next(self.haystack.as_bytes(), self.needle.as_bytes()), TwoWay(ref mut searcher) => searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), false), TwoWayLong(ref mut searcher) => searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), true), - Searcher::EmptyNeedle { ref mut pos, ref mut done } => { - if !*done { - let r = Some((*pos, *pos)); - if *pos == self.haystack.len() { - *done = true; - } else { - use char::CharExt; - *pos += self.haystack.char_at(*pos).len_utf8(); - } - r - } else { - None - } - } } } } -#[stable(feature = "rust1", since = "1.0.0")] -impl<'a, 'b> Iterator for SplitStr<'a, 'b> { - type Item = &'a str; - - #[inline] - fn next(&mut self) -> Option<&'a str> { - if self.finished { return None; } - let haystack = Matcher::haystack(&self.it); - match Matcher::next(&mut self.it) { - Some((from, to)) => { - let ret = Some(&haystack[self.last_end..from]); - self.last_end = to; - ret - } - None => { - self.finished = true; - Some(&haystack[self.last_end..]) - } - } - } -} - - /* Section: Comparing strings */ @@ -1038,9 +952,8 @@ Section: Comparing strings /// to compare &[u8] byte slices that are not necessarily valid UTF-8. #[inline] fn eq_slice_(a: &str, b: &str) -> bool { - // NOTE: In theory n should be libc::size_t and not usize, but libc is not available here #[allow(improper_ctypes)] - extern { fn memcmp(s1: *const i8, s2: *const i8, n: uint) -> i32; } + extern { fn memcmp(s1: *const i8, s2: *const i8, n: usize) -> i32; } a.len() == b.len() && unsafe { memcmp(a.as_ptr() as *const i8, b.as_ptr() as *const i8, @@ -1097,7 +1010,7 @@ fn run_utf8_validation_iterator(iter: &mut slice::Iter) // ASCII characters are always valid, so only large // bytes need more examination. 
if first >= 128 { - let w = UTF8_CHAR_WIDTH[first as uint] as uint; + let w = UTF8_CHAR_WIDTH[first as usize] as usize; let second = next!(); // 2-byte encoding is for codepoints \u{0080} to \u{07ff} // first C2 80 last DF BF @@ -1172,7 +1085,7 @@ pub struct CharRange { /// Current `char` pub ch: char, /// Index of the first byte of the next `char` - pub next: uint, + pub next: usize, } /// Mask of the value bits of a continuation byte @@ -1257,10 +1170,10 @@ mod traits { /// // &s[3 .. 100]; /// ``` #[stable(feature = "rust1", since = "1.0.0")] - impl ops::Index> for str { + impl ops::Index> for str { type Output = str; #[inline] - fn index(&self, index: &ops::Range) -> &str { + fn index(&self, index: &ops::Range) -> &str { // is_char_boundary checks that the index is in [0, .len()] if index.start <= index.end && self.is_char_boundary(index.start) && @@ -1280,10 +1193,10 @@ mod traits { /// Panics when `end` does not point to a valid character, or is /// out of bounds. #[stable(feature = "rust1", since = "1.0.0")] - impl ops::Index> for str { + impl ops::Index> for str { type Output = str; #[inline] - fn index(&self, index: &ops::RangeTo) -> &str { + fn index(&self, index: &ops::RangeTo) -> &str { // is_char_boundary checks that the index is in [0, .len()] if self.is_char_boundary(index.end) { unsafe { self.slice_unchecked(0, index.end) } @@ -1300,10 +1213,10 @@ mod traits { /// Panics when `begin` does not point to a valid character, or is /// out of bounds. 
#[stable(feature = "rust1", since = "1.0.0")] - impl ops::Index> for str { + impl ops::Index> for str { type Output = str; #[inline] - fn index(&self, index: &ops::RangeFrom) -> &str { + fn index(&self, index: &ops::RangeFrom) -> &str { // is_char_boundary checks that the index is in [0, .len()] if self.is_char_boundary(index.start) { unsafe { self.slice_unchecked(index.start, self.len()) } @@ -1344,28 +1257,40 @@ impl<'a, S: ?Sized> Str for &'a S where S: Str { } /// Return type of `StrExt::split` -#[derive(Clone)] #[stable(feature = "rust1", since = "1.0.0")] -pub struct Split<'a, P>(CharSplits<'a, P>); -delegate_iter!{pattern &'a str : Split<'a, P>} +pub struct Split<'a, P: Pattern<'a>>(CharSplits<'a, P>); +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a, P: Pattern<'a>> Iterator for Split<'a, P> { + type Item = &'a str; + + #[inline] + fn next(&mut self) -> Option<&'a str> { + self.0.next() + } +} +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a, P: Pattern<'a>> DoubleEndedIterator for Split<'a, P> +where P::Searcher: DoubleEndedSearcher<'a> { + #[inline] + fn next_back(&mut self) -> Option<&'a str> { + self.0.next_back() + } +} /// Return type of `StrExt::split_terminator` -#[derive(Clone)] #[unstable(feature = "core", reason = "might get removed in favour of a constructor method on Split")] -pub struct SplitTerminator<'a, P>(CharSplits<'a, P>); +pub struct SplitTerminator<'a, P: Pattern<'a>>(CharSplits<'a, P>); delegate_iter!{pattern &'a str : SplitTerminator<'a, P>} /// Return type of `StrExt::splitn` -#[derive(Clone)] #[stable(feature = "rust1", since = "1.0.0")] -pub struct SplitN<'a, P>(CharSplitsN<'a, P>); +pub struct SplitN<'a, P: Pattern<'a>>(CharSplitsN<'a, P>); delegate_iter!{pattern forward &'a str : SplitN<'a, P>} /// Return type of `StrExt::rsplitn` -#[derive(Clone)] #[stable(feature = "rust1", since = "1.0.0")] -pub struct RSplitN<'a, P>(CharSplitsN<'a, P>); +pub struct RSplitN<'a, P: Pattern<'a>>(CharSplitsN<'a, P>); 
delegate_iter!{pattern forward &'a str : RSplitN<'a, P>} /// Methods for string slices @@ -1379,44 +1304,45 @@ pub trait StrExt { fn chars<'a>(&'a self) -> Chars<'a>; fn bytes<'a>(&'a self) -> Bytes<'a>; fn char_indices<'a>(&'a self) -> CharIndices<'a>; - fn split<'a, P: CharEq>(&'a self, pat: P) -> Split<'a, P>; - fn splitn<'a, P: CharEq>(&'a self, count: uint, pat: P) -> SplitN<'a, P>; - fn split_terminator<'a, P: CharEq>(&'a self, pat: P) -> SplitTerminator<'a, P>; - fn rsplitn<'a, P: CharEq>(&'a self, count: uint, pat: P) -> RSplitN<'a, P>; + fn split<'a, P: Pattern<'a>>(&'a self, pat: P) -> Split<'a, P>; + fn splitn<'a, P: Pattern<'a>>(&'a self, count: usize, pat: P) -> SplitN<'a, P>; + fn split_terminator<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitTerminator<'a, P>; + fn rsplitn<'a, P: Pattern<'a>>(&'a self, count: usize, pat: P) -> RSplitN<'a, P>; fn match_indices<'a, P: Pattern<'a>>(&'a self, pat: P) -> MatchIndices<'a, P>; - fn split_str<'a, 'b>(&'a self, pat: &'b str) -> SplitStr<'a, 'b>; + fn split_str<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitStr<'a, P>; fn lines<'a>(&'a self) -> Lines<'a>; fn lines_any<'a>(&'a self) -> LinesAny<'a>; - fn char_len(&self) -> uint; - fn slice_chars<'a>(&'a self, begin: uint, end: uint) -> &'a str; - unsafe fn slice_unchecked<'a>(&'a self, begin: uint, end: uint) -> &'a str; - fn starts_with(&self, pat: &str) -> bool; - fn ends_with(&self, pat: &str) -> bool; + fn char_len(&self) -> usize; + fn slice_chars<'a>(&'a self, begin: usize, end: usize) -> &'a str; + unsafe fn slice_unchecked<'a>(&'a self, begin: usize, end: usize) -> &'a str; + fn starts_with<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool; + fn ends_with<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool + where P::Searcher: ReverseSearcher<'a>; fn trim_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str - where P::Matcher: DoubleEndedMatcher<'a>; + where P::Searcher: DoubleEndedSearcher<'a>; fn trim_left_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a 
str; fn trim_right_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str - where P::Matcher: ReverseMatcher<'a>; - fn is_char_boundary(&self, index: uint) -> bool; - fn char_range_at(&self, start: uint) -> CharRange; - fn char_range_at_reverse(&self, start: uint) -> CharRange; - fn char_at(&self, i: uint) -> char; - fn char_at_reverse(&self, i: uint) -> char; + where P::Searcher: ReverseSearcher<'a>; + fn is_char_boundary(&self, index: usize) -> bool; + fn char_range_at(&self, start: usize) -> CharRange; + fn char_range_at_reverse(&self, start: usize) -> CharRange; + fn char_at(&self, i: usize) -> char; + fn char_at_reverse(&self, i: usize) -> char; fn as_bytes<'a>(&'a self) -> &'a [u8]; - fn find<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option; - fn rfind<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option - where P::Matcher: ReverseMatcher<'a>; - fn find_str(&self, pat: &str) -> Option; + fn find<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option; + fn rfind<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option + where P::Searcher: ReverseSearcher<'a>; + fn find_str<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option; fn slice_shift_char<'a>(&'a self) -> Option<(char, &'a str)>; - fn subslice_offset(&self, inner: &str) -> uint; + fn subslice_offset(&self, inner: &str) -> usize; fn as_ptr(&self) -> *const u8; - fn len(&self) -> uint; + fn len(&self) -> usize; fn is_empty(&self) -> bool; fn parse(&self) -> Result; } #[inline(never)] -fn slice_error_fail(s: &str, begin: uint, end: uint) -> ! { +fn slice_error_fail(s: &str, begin: usize, end: usize) -> ! { assert!(begin <= end); panic!("index {} and/or {} in `{}` do not lie on character boundary", begin, end, s); @@ -1449,18 +1375,18 @@ impl StrExt for str { } #[inline] - fn split(&self, pat: P) -> Split

{ + fn split<'a, P: Pattern<'a>>(&'a self, pat: P) -> Split<'a, P> { Split(CharSplits { - string: self, - only_ascii: pat.only_ascii(), - sep: pat, + start: 0, + end: self.len(), + matcher: pat.into_matcher(self), allow_trailing_empty: true, finished: false, }) } #[inline] - fn splitn<P: CharEq>(&self, count: uint, pat: P) -> SplitN<P> { + fn splitn<'a, P: Pattern<'a>>(&'a self, count: usize, pat: P) -> SplitN<'a, P> { SplitN(CharSplitsN { iter: self.split(pat).0, count: count, @@ -1469,7 +1395,7 @@ impl StrExt for str { } #[inline] - fn split_terminator<P: CharEq>(&self, pat: P) -> SplitTerminator<P> { + fn split_terminator<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitTerminator<'a, P> { SplitTerminator(CharSplits { allow_trailing_empty: false, ..self.split(pat).0 @@ -1477,7 +1403,7 @@ impl StrExt for str { } #[inline] - fn rsplitn<P: CharEq>(&self, count: uint, pat: P) -> RSplitN<P>

{ + fn rsplitn<'a, P: Pattern<'a>>(&'a self, count: usize, pat: P) -> RSplitN<'a, P> { RSplitN(CharSplitsN { iter: self.split(pat).0, count: count, @@ -1491,12 +1417,8 @@ impl StrExt for str { } #[inline] - fn split_str<'a, 'b>(&'a self, sep: &'b str) -> SplitStr<'a, 'b> { - SplitStr { - it: self.match_indices(sep).0, - last_end: 0, - finished: false - } + fn split_str<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitStr<'a, P> { + SplitStr(self.split(pat)) } #[inline] @@ -1516,9 +1438,9 @@ impl StrExt for str { } #[inline] - fn char_len(&self) -> uint { self.chars().count() } + fn char_len(&self) -> usize { self.chars().count() } - fn slice_chars(&self, begin: uint, end: uint) -> &str { + fn slice_chars(&self, begin: usize, end: usize) -> &str { assert!(begin <= end); let mut count = 0; let mut begin_byte = None; @@ -1542,7 +1464,7 @@ impl StrExt for str { } #[inline] - unsafe fn slice_unchecked(&self, begin: uint, end: uint) -> &str { + unsafe fn slice_unchecked(&self, begin: usize, end: usize) -> &str { mem::transmute(Slice { data: self.as_ptr().offset(begin as int), len: end - begin, @@ -1550,83 +1472,65 @@ impl StrExt for str { } #[inline] - fn starts_with(&self, needle: &str) -> bool { - let n = needle.len(); - self.len() >= n && needle.as_bytes() == &self.as_bytes()[..n] + fn starts_with<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool { + pat.match_starts_at(self, 0) } #[inline] - fn ends_with(&self, needle: &str) -> bool { - let (m, n) = (self.len(), needle.len()); - m >= n && needle.as_bytes() == &self.as_bytes()[m-n..] 
+ fn ends_with<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool + where P::Searcher: ReverseSearcher<'a> { + pat.match_ends_at(self, self.len()) } #[inline] fn trim_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str - where P::Matcher: DoubleEndedMatcher<'a> { + where P::Searcher: DoubleEndedSearcher<'a> { let mut i = 0; + let mut j = self.len(); let mut matcher = pat.into_matcher(self); - let mut possible_end_match = None; - while let Some((a, b)) = Matcher::next(&mut matcher) { - if a == i { - i = b; - } else { - possible_end_match = Some((a, b)); - break; - } + if let Some((a, b)) = matcher.next_reject() { + i = a; + j = b; // Rember earliest known match, correct it below if + // last match is different } - let mut j = self.len(); - while let Some((a, b)) = ReverseMatcher::next_back(&mut matcher) - .or_else(|| possible_end_match.take()) { - if b == j { - j = a; - } else { - break; - } + if let Some((_, b)) = matcher.next_reject_back() { + j = b; } unsafe { - // Matcher is known to return valid indices + // Searcher is known to return valid indices self.slice_unchecked(i, j) } } #[inline] - fn trim_left_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &str { + fn trim_left_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str { let mut i = 0; let mut matcher = pat.into_matcher(self); - while let Some((a, b)) = Matcher::next(&mut matcher) { - if a == i { - i = b; - } else { - break; - } + if let Some((a, _)) = matcher.next_reject() { + i = a; } unsafe { - // Matcher is known to return valid indices + // Searcher is known to return valid indices self.slice_unchecked(i, self.len()) } } #[inline] - fn trim_right_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &str - where P::Matcher: ReverseMatcher<'a> { - let mut i = self.len(); + fn trim_right_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str + where P::Searcher: ReverseSearcher<'a> { + let mut j = self.len(); let mut matcher = pat.into_matcher(self); - while let Some((a, b)) = 
ReverseMatcher::next_back(&mut matcher) { - if b == i { - i = a; - } else { - break; - } + if let Some((_, b)) = matcher.next_reject_back() { + j = b; } unsafe { - // Matcher is known to return valid indices - self.slice_unchecked(0, i) + // Searcher is known to return valid indices + self.slice_unchecked(0, j) } } #[inline] - fn is_char_boundary(&self, index: uint) -> bool { + fn is_char_boundary(&self, index: usize) -> bool { if index == self.len() { return true; } match self.as_bytes().get(index) { None => false, @@ -1635,13 +1539,13 @@ impl StrExt for str { } #[inline] - fn char_range_at(&self, i: uint) -> CharRange { + fn char_range_at(&self, i: usize) -> CharRange { let (c, n) = char_range_at_raw(self.as_bytes(), i); CharRange { ch: unsafe { mem::transmute(c) }, next: n } } #[inline] - fn char_range_at_reverse(&self, start: uint) -> CharRange { + fn char_range_at_reverse(&self, start: usize) -> CharRange { let mut prev = start; prev = prev.saturating_sub(1); @@ -1650,14 +1554,14 @@ impl StrExt for str { } // Multibyte case is a fn to allow char_range_at_reverse to inline cleanly - fn multibyte_char_range_at_reverse(s: &str, mut i: uint) -> CharRange { + fn multibyte_char_range_at_reverse(s: &str, mut i: usize) -> CharRange { // while there is a previous byte == 10...... 
while i > 0 && s.as_bytes()[i] & !CONT_MASK == TAG_CONT_U8 { i -= 1; } let mut val = s.as_bytes()[i] as u32; - let w = UTF8_CHAR_WIDTH[val as uint] as uint; + let w = UTF8_CHAR_WIDTH[val as usize] as usize; assert!((w != 0)); val = utf8_first_byte!(val, w); @@ -1672,12 +1576,12 @@ impl StrExt for str { } #[inline] - fn char_at(&self, i: uint) -> char { + fn char_at(&self, i: usize) -> char { self.char_range_at(i).ch } #[inline] - fn char_at_reverse(&self, i: uint) -> char { + fn char_at_reverse(&self, i: usize) -> char { self.char_range_at_reverse(i).ch } @@ -1686,23 +1590,17 @@ impl StrExt for str { unsafe { mem::transmute(self) } } - fn find<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option { - Matcher::next(&mut pat.into_matcher(self)).map(|(i, _)| i) + fn find<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option { + pat.into_matcher(self).next_match().map(|(i, _)| i) } - fn rfind<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option - where P::Matcher: ReverseMatcher<'a> { - ReverseMatcher::next_back(&mut pat.into_matcher(self)).map(|(i, _)| i) + fn rfind<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option + where P::Searcher: ReverseSearcher<'a> { + pat.into_matcher(self).next_match_back().map(|(i, _)| i) } - fn find_str(&self, needle: &str) -> Option { - if needle.is_empty() { - Some(0) - } else { - self.match_indices(needle) - .next() - .map(|(start, _end)| start) - } + fn find_str<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option { + self.find(pat) } #[inline] @@ -1716,10 +1614,10 @@ impl StrExt for str { } } - fn subslice_offset(&self, inner: &str) -> uint { - let a_start = self.as_ptr() as uint; + fn subslice_offset(&self, inner: &str) -> usize { + let a_start = self.as_ptr() as usize; let a_end = a_start + self.len(); - let b_start = inner.as_ptr() as uint; + let b_start = inner.as_ptr() as usize; let b_end = b_start + inner.len(); assert!(a_start <= b_start); @@ -1733,7 +1631,7 @@ impl StrExt for str { } #[inline] - fn len(&self) -> uint { self.repr().len } + fn 
len(&self) -> usize { self.repr().len } #[inline] fn is_empty(&self) -> bool { self.len() == 0 } @@ -1746,15 +1644,15 @@ impl StrExt for str { /// index of the next code point. #[inline] #[unstable(feature = "core")] -pub fn char_range_at_raw(bytes: &[u8], i: uint) -> (u32, usize) { +pub fn char_range_at_raw(bytes: &[u8], i: usize) -> (u32, usize) { if bytes[i] < 128u8 { return (bytes[i] as u32, i + 1); } // Multibyte case is a fn to allow char_range_at to inline cleanly - fn multibyte_char_range_at(bytes: &[u8], i: uint) -> (u32, usize) { + fn multibyte_char_range_at(bytes: &[u8], i: usize) -> (u32, usize) { let mut val = bytes[i] as u32; - let w = UTF8_CHAR_WIDTH[val as uint] as uint; + let w = UTF8_CHAR_WIDTH[val as usize] as usize; assert!((w != 0)); val = utf8_first_byte!(val, w); @@ -1781,7 +1679,7 @@ impl<'a> Iterator for Lines<'a> { #[inline] fn next(&mut self) -> Option<&'a str> { self.inner.next() } #[inline] - fn size_hint(&self) -> (uint, Option) { self.inner.size_hint() } + fn size_hint(&self) -> (usize, Option) { self.inner.size_hint() } } #[stable(feature = "rust1", since = "1.0.0")] @@ -1797,7 +1695,7 @@ impl<'a> Iterator for LinesAny<'a> { #[inline] fn next(&mut self) -> Option<&'a str> { self.inner.next() } #[inline] - fn size_hint(&self) -> (uint, Option) { self.inner.size_hint() } + fn size_hint(&self) -> (usize, Option) { self.inner.size_hint() } } #[stable(feature = "rust1", since = "1.0.0")] diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index 077c4c8f7b46c..2b77d877cf4fd 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -16,99 +16,280 @@ use super::CharEq; // Pattern pub trait Pattern<'a>: Sized { - type Matcher: Matcher<'a>; - fn into_matcher(self, haystack: &'a str) -> Self::Matcher; + type Searcher: Searcher<'a>; + fn into_matcher(self, haystack: &'a str) -> Self::Searcher; #[inline] fn is_contained_in(self, haystack: &'a str) -> bool { - Matcher::next(&mut 
self.into_matcher(haystack)).is_some() + self.into_matcher(haystack).next_match().is_some() + } + + #[inline] + fn match_starts_at(self, haystack: &'a str, idx: usize) -> bool { + let mut matcher = self.into_matcher(haystack); + loop { + match matcher.next() { + SearchStep::Match(i, _) if i == idx => return true, + SearchStep::Match(i, _) + | SearchStep::Reject(i, _) if i >= idx => break, + SearchStep::Done => break, + _ => continue, + } + } + false + } + + #[inline] + fn match_ends_at(self, haystack: &'a str, idx: usize) -> bool + where Self::Searcher: ReverseSearcher<'a> { + let mut matcher = self.into_matcher(haystack); + loop { + match matcher.next_back() { + SearchStep::Match(_, j) if idx == j => return true, + SearchStep::Match(_, j) + | SearchStep::Reject(_, j) if idx >= j => break, + SearchStep::Done => break, + _ => continue, + } + } + false } } -// Matcher +// Searcher + +pub enum SearchStep { + Match(usize, usize), + Reject(usize, usize), + Done +} -pub unsafe trait Matcher<'a> { +pub unsafe trait Searcher<'a> { fn haystack(&self) -> &'a str; - fn next(&mut self) -> Option<(usize, usize)>; + fn next(&mut self) -> SearchStep; + #[inline] + fn next_match(&mut self) -> Option<(usize, usize)> { + loop { + match self.next() { + SearchStep::Match(a, b) => return Some((a, b)), + SearchStep::Done => return None, + _ => continue, + } + } + } + #[inline] + fn next_reject(&mut self) -> Option<(usize, usize)>{ + loop { + match self.next() { + SearchStep::Reject(a, b) => return Some((a, b)), + SearchStep::Done => return None, + _ => continue, + } + } + } } -pub unsafe trait ReverseMatcher<'a>: Matcher<'a> { - fn next_back(&mut self) -> Option<(usize, usize)>; +pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { + fn next_back(&mut self) -> SearchStep; + #[inline] + fn next_match_back(&mut self) -> Option<(usize, usize)>{ + loop { + match self.next_back() { + SearchStep::Match(a, b) => return Some((a, b)), + SearchStep::Done => return None, + _ => continue, + } + } + 
} + #[inline] + fn next_reject_back(&mut self) -> Option<(usize, usize)>{ + loop { + match self.next_back() { + SearchStep::Reject(a, b) => return Some((a, b)), + SearchStep::Done => return None, + _ => continue, + } + } + } } -pub trait DoubleEndedMatcher<'a>: ReverseMatcher<'a> {} +pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} // Impl for CharEq -struct CharEqMatcher<'a, C>(C, &'a str, super::CharIndices<'a>); +pub struct CharEqSearcher<'a, C> { + char_eq: C, + haystack: &'a str, + char_indices: super::CharIndices<'a>, + #[allow(dead_code)] + ascii_only: bool, +} impl<'a, C: CharEq> Pattern<'a> for C { - type Matcher = CharEqMatcher<'a, C>; + type Searcher = CharEqSearcher<'a, C>; #[inline] - fn into_matcher(self, haystack: &'a str) -> CharEqMatcher<'a, C> { - CharEqMatcher(self, haystack, haystack.char_indices()) + fn into_matcher(self, haystack: &'a str) -> CharEqSearcher<'a, C> { + CharEqSearcher { + ascii_only: self.only_ascii(), + haystack: haystack, + char_eq: self, + char_indices: haystack.char_indices(), + } } } -unsafe impl<'a, C: CharEq> Matcher<'a> for CharEqMatcher<'a, C> { +unsafe impl<'a, C: CharEq> Searcher<'a> for CharEqSearcher<'a, C> { #[inline] fn haystack(&self) -> &'a str { - self.1 + self.haystack } #[inline] - fn next(&mut self) -> Option<(usize, usize)> { - while let Some((i, c)) = self.2.next() { - if self.0.matches(c) { - return Some((i, i + c.len_utf8())); + fn next(&mut self) -> SearchStep { + let s = &mut self.char_indices; + // Compare lengths of the internal byte slice iterator + // to find length of current char + let (pre_len, _) = s.iter.iter.size_hint(); + if let Some((i, c)) = s.next() { + let (len, _) = s.iter.iter.size_hint(); + let char_len = pre_len - len; + if self.char_eq.matches(c) { + return SearchStep::Match(i, i + char_len); + } else { + return SearchStep::Reject(i, i + char_len); } } - None + SearchStep::Done } } -unsafe impl<'a, C: CharEq> ReverseMatcher<'a> for CharEqMatcher<'a, C> { +unsafe impl<'a, C: 
CharEq> ReverseSearcher<'a> for CharEqSearcher<'a, C> { #[inline] - fn next_back(&mut self) -> Option<(usize, usize)> { - while let Some((i, c)) = self.2.next_back() { - if self.0.matches(c) { - return Some((i, i + c.len_utf8())); + fn next_back(&mut self) -> SearchStep { + let s = &mut self.char_indices; + // Compare lengths of the internal byte slice iterator + // to find length of current char + let (pre_len, _) = s.iter.iter.size_hint(); + if let Some((i, c)) = s.next_back() { + let (len, _) = s.iter.iter.size_hint(); + let char_len = pre_len - len; + if self.char_eq.matches(c) { + return SearchStep::Match(i, i + char_len); + } else { + return SearchStep::Reject(i, i + char_len); } } - None + SearchStep::Done } } -impl<'a, C: CharEq> DoubleEndedMatcher<'a> for CharEqMatcher<'a, C> {} +impl<'a, C: CharEq> DoubleEndedSearcher<'a> for CharEqSearcher<'a, C> {} // Impl for &str +// TODO: Optimize the naive implementation here + #[derive(Clone)] -pub struct StrMatcher<'a, 'b>(super::OldMatchIndices<'a, 'b>); +pub struct StrSearcher<'a, 'b> { + haystack: &'a str, + needle: &'b str, + start: usize, + end: usize, + done: bool, +} impl<'a, 'b> Pattern<'a> for &'b str { - type Matcher = StrMatcher<'a, 'b>; + type Searcher = StrSearcher<'a, 'b>; #[inline] - fn into_matcher(self, haystack: &'a str) -> StrMatcher<'a, 'b> { - let mi = super::OldMatchIndices { + fn into_matcher(self, haystack: &'a str) -> StrSearcher<'a, 'b> { + StrSearcher { haystack: haystack, needle: self, - searcher: super::Searcher::new(haystack.as_bytes(), self.as_bytes()) - }; - StrMatcher(mi) + start: 0, + end: haystack.len(), + done: false, + } } } -unsafe impl<'a, 'b> Matcher<'a> for StrMatcher<'a, 'b> { +unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> { #[inline] fn haystack(&self) -> &'a str { - self.0.haystack + self.haystack } #[inline] - fn next(&mut self) -> Option<(usize, usize)> { - self.0.next() + fn next(&mut self) -> SearchStep { + str_search_step(self, + |m: &mut StrSearcher| { + 
// Forward step for empty needle + let current_start = m.start; + if !m.done { + m.start = m.haystack.char_range_at(current_start).next; + } + SearchStep::Match(current_start, current_start) + }, + |m: &mut StrSearcher| { + // Forward step for nonempty needle + let possible_match = &m.haystack[m.start .. m.start + m.needle.len()]; + let current_start = m.start; + if possible_match == m.needle { + m.start += m.needle.len(); + SearchStep::Match(current_start, m.start) + } else { + m.start += possible_match.chars().next().unwrap().len_utf8(); + SearchStep::Reject(current_start, m.start) + } + }) + } +} + +unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> { + #[inline] + fn next_back(&mut self) -> SearchStep { + str_search_step(self, + |m: &mut StrSearcher| { + // Backward step for empty needle + let current_end = m.end; + if !m.done { + m.end = m.haystack.char_range_at_reverse(current_end).next; + } + SearchStep::Match(current_end, current_end) + }, + |m: &mut StrSearcher| { + // Backward step for nonempty needle + let possible_match = &m.haystack[m.end - m.needle.len() .. 
m.end]; + let current_end = m.end; + if possible_match == m.needle { + m.end -= m.needle.len(); + SearchStep::Match(m.end, current_end) + } else { + m.end -= possible_match.chars().rev().next().unwrap().len_utf8(); + SearchStep::Reject(m.end, current_end) + } + }) + } +} + +fn str_search_step(mut m: &mut StrSearcher, f: F, g: G) -> SearchStep +where F: FnOnce(&mut StrSearcher) -> SearchStep, + G: FnOnce(&mut StrSearcher) -> SearchStep +{ + if m.done { + SearchStep::Done + } else if m.needle.len() == 0 && m.start <= m.end { + // Case for needle == "" + if m.start == m.end { + m.done = true; + } + f(&mut m) + } else if m.start + m.needle.len() <= m.end { + // Case for needle != "" + g(&mut m) + } else { + m.done = true; + SearchStep::Done } } diff --git a/src/libcoretest/str.rs b/src/libcoretest/str.rs index ddbec47eeff49..9bd7cf9833e90 100644 --- a/src/libcoretest/str.rs +++ b/src/libcoretest/str.rs @@ -207,15 +207,15 @@ malesuada sollicitudin quam eu fermentum!"); make_test!(trim_ascii_char, s, { use std::ascii::AsciiExt; - s.trim_matches(|&mut: c: char| c.is_ascii()) + s.trim_matches(|c: char| c.is_ascii()) }); make_test!(trim_left_ascii_char, s, { use std::ascii::AsciiExt; - s.trim_left_matches(|&mut: c: char| c.is_ascii()) + s.trim_left_matches(|c: char| c.is_ascii()) }); make_test!(trim_right_ascii_char, s, { use std::ascii::AsciiExt; - s.trim_right_matches(|&mut: c: char| c.is_ascii()) + s.trim_right_matches(|c: char| c.is_ascii()) }); make_test!(find_underscore_char, s, s.find('_')); From ee930b0262520e25c5b66b037c65dab538ce5e64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marvin=20L=C3=B6bel?= Date: Tue, 17 Feb 2015 17:50:44 +0100 Subject: [PATCH 6/9] Enabled new pattern API in the libstd facade --- src/libcollections/str.rs | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/src/libcollections/str.rs b/src/libcollections/str.rs index dff331ac62007..d355421039e0c 100644 --- a/src/libcollections/str.rs +++ 
b/src/libcollections/str.rs @@ -82,6 +82,8 @@ pub use core::str::{SplitN, RSplitN}; pub use core::str::{from_utf8, CharEq, Chars, CharIndices, Bytes}; pub use core::str::{from_utf8_unchecked, from_c_str, ParseBoolError}; pub use unicode::str::{Words, Graphemes, GraphemeIndices}; +pub use core::str::Pattern; +pub use core::str::{Searcher, ReverseSearcher, DoubleEndedSearcher, SearchStep}; /* Section: Creating a string @@ -530,7 +532,7 @@ pub trait StrExt: Index { /// assert!("bananas".contains("nana")); /// ``` #[stable(feature = "rust1", since = "1.0.0")] - fn contains(&self, pat: &str) -> bool { + fn contains<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool { core_str::StrExt::contains(&self[..], pat) } @@ -547,7 +549,7 @@ pub trait StrExt: Index { /// ``` #[unstable(feature = "collections", reason = "might get removed in favour of a more generic contains()")] - fn contains_char(&self, pat: P) -> bool { + fn contains_char<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool { core_str::StrExt::contains_char(&self[..], pat) } @@ -603,7 +605,7 @@ pub trait StrExt: Index { /// assert_eq!(v, vec![""]); /// ``` #[stable(feature = "rust1", since = "1.0.0")] - fn split(&self, pat: P) -> Split

{ + fn split<'a, P: Pattern<'a>>(&'a self, pat: P) -> Split<'a, P> { core_str::StrExt::split(&self[..], pat) } @@ -630,7 +632,7 @@ pub trait StrExt: Index { /// assert_eq!(v, vec![""]); /// ``` #[stable(feature = "rust1", since = "1.0.0")] - fn splitn<P: CharEq>(&self, count: usize, pat: P) -> SplitN<P> { + fn splitn<'a, P: Pattern<'a>>(&'a self, count: usize, pat: P) -> SplitN<'a, P> { core_str::StrExt::splitn(&self[..], count, pat) } @@ -659,7 +661,7 @@ pub trait StrExt: Index { /// assert_eq!(v, vec!["leopard", "tiger", "", "lion"]); /// ``` #[unstable(feature = "collections", reason = "might get removed")] - fn split_terminator<P: CharEq>(&self, pat: P) -> SplitTerminator<P> { + fn split_terminator<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitTerminator<'a, P> { core_str::StrExt::split_terminator(&self[..], pat) } @@ -680,7 +682,7 @@ pub trait StrExt: Index { /// assert_eq!(v, vec!["leopard", "tiger", "lionX"]); /// ``` #[stable(feature = "rust1", since = "1.0.0")] - fn rsplitn<P: CharEq>(&self, count: usize, pat: P) -> RSplitN<P>

{ + fn rsplitn<'a, P: Pattern<'a>>(&'a self, count: usize, pat: P) -> RSplitN<'a, P> { core_str::StrExt::rsplitn(&self[..], count, pat) } @@ -706,7 +708,7 @@ pub trait StrExt: Index { /// ``` #[unstable(feature = "collections", reason = "might have its iterator type changed")] - fn match_indices<'a, 'b>(&'a self, pat: &'b str) -> MatchIndices<'a, &'b str> { + fn match_indices<'a, P: Pattern<'a>>(&'a self, pat: P) -> MatchIndices<'a, P> { core_str::StrExt::match_indices(&self[..], pat) } @@ -723,7 +725,7 @@ pub trait StrExt: Index { /// ``` #[unstable(feature = "collections", reason = "might get removed in the future in favor of a more generic split()")] - fn split_str<'a, 'b>(&'a self, pat: &'b str) -> SplitStr<'a, &'b str> { + fn split_str<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitStr<'a, P> { core_str::StrExt::split_str(&self[..], pat) } @@ -825,7 +827,7 @@ pub trait StrExt: Index { /// assert!("banana".starts_with("ba")); /// ``` #[stable(feature = "rust1", since = "1.0.0")] - fn starts_with(&self, pat: &str) -> bool { + fn starts_with<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool { core_str::StrExt::starts_with(&self[..], pat) } @@ -837,7 +839,8 @@ pub trait StrExt: Index { /// assert!("banana".ends_with("nana")); /// ``` #[stable(feature = "rust1", since = "1.0.0")] - fn ends_with(&self, pat: &str) -> bool { + fn ends_with<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool + where P::Searcher: ReverseSearcher<'a> { core_str::StrExt::ends_with(&self[..], pat) } @@ -857,7 +860,8 @@ pub trait StrExt: Index { /// assert_eq!("123foo1bar123".trim_matches(|c: char| c.is_numeric()), "foo1bar"); /// ``` #[stable(feature = "rust1", since = "1.0.0")] - fn trim_matches(&self, pat: P) -> &str { + fn trim_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str + where P::Searcher: DoubleEndedSearcher<'a> { core_str::StrExt::trim_matches(&self[..], pat) } @@ -877,7 +881,7 @@ pub trait StrExt: Index { /// assert_eq!("123foo1bar123".trim_left_matches(|c: char| c.is_numeric()), 
"foo1bar123"); /// ``` #[stable(feature = "rust1", since = "1.0.0")] - fn trim_left_matches(&self, pat: P) -> &str { + fn trim_left_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str { core_str::StrExt::trim_left_matches(&self[..], pat) } @@ -897,7 +901,8 @@ pub trait StrExt: Index { /// assert_eq!("123foo1bar123".trim_right_matches(|c: char| c.is_numeric()), "123foo1bar"); /// ``` #[stable(feature = "rust1", since = "1.0.0")] - fn trim_right_matches(&self, pat: P) -> &str { + fn trim_right_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str + where P::Searcher: ReverseSearcher<'a> { core_str::StrExt::trim_right_matches(&self[..], pat) } @@ -1074,7 +1079,7 @@ pub trait StrExt: Index { /// assert_eq!(s.find(x), None); /// ``` #[stable(feature = "rust1", since = "1.0.0")] - fn find(&self, pat: P) -> Option { + fn find<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option { core_str::StrExt::find(&self[..], pat) } @@ -1102,7 +1107,8 @@ pub trait StrExt: Index { /// assert_eq!(s.rfind(x), None); /// ``` #[stable(feature = "rust1", since = "1.0.0")] - fn rfind(&self, pat: P) -> Option { + fn rfind<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option + where P::Searcher: ReverseSearcher<'a> { core_str::StrExt::rfind(&self[..], pat) } @@ -1127,7 +1133,7 @@ pub trait StrExt: Index { /// ``` #[unstable(feature = "collections", reason = "might get removed in favor of a more generic find in the future")] - fn find_str(&self, needle: &str) -> Option { + fn find_str<'a, P: Pattern<'a>>(&'a self, needle: P) -> Option { core_str::StrExt::find_str(&self[..], needle) } From c1de0a0f9ea9863407363ce31bb698e9988215ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marvin=20L=C3=B6bel?= Date: Tue, 17 Feb 2015 22:57:14 +0100 Subject: [PATCH 7/9] Added a Pattern impl that delegates to the dereference of a type. This allows to match with a `&String` or `&&str`, for example. 
--- src/libcore/str/mod.rs | 25 ++++++----- src/libcore/str/pattern.rs | 86 +++++++++++++++++++++++++++++++++----- src/libcoretest/str.rs | 10 +++++ 3 files changed, 99 insertions(+), 22 deletions(-) diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs index bdb3b854fe2c0..a93083020334e 100644 --- a/src/libcore/str/mod.rs +++ b/src/libcore/str/mod.rs @@ -75,7 +75,7 @@ macro_rules! delegate_iter { }; (pattern $te:ty : $ti:ty) => { #[stable(feature = "rust1", since = "1.0.0")] - impl<'a, P: CharEq> Iterator for $ti { + impl<'a, P: Pattern<'a>> Iterator for $ti { type Item = $te; #[inline] @@ -88,7 +88,8 @@ macro_rules! delegate_iter { } } #[stable(feature = "rust1", since = "1.0.0")] - impl<'a, P: CharEq> DoubleEndedIterator for $ti { + impl<'a, P: Pattern<'a>> DoubleEndedIterator for $ti + where P::Searcher: DoubleEndedSearcher<'a> { #[inline] fn next_back(&mut self) -> Option<$te> { self.0.next_back() @@ -97,7 +98,8 @@ macro_rules! delegate_iter { }; (pattern forward $te:ty : $ti:ty) => { #[stable(feature = "rust1", since = "1.0.0")] - impl<'a, P: CharEq> Iterator for $ti { + impl<'a, P: Pattern<'a>> Iterator for $ti + where P::Searcher: DoubleEndedSearcher<'a> { type Item = $te; #[inline] @@ -610,7 +612,8 @@ where P::Searcher: DoubleEndedSearcher<'a> { } #[stable(feature = "rust1", since = "1.0.0")] -impl<'a, Sep: CharEq> Iterator for CharSplitsN<'a, Sep> { +impl<'a, P: Pattern<'a>> Iterator for CharSplitsN<'a, P> +where P::Searcher: DoubleEndedSearcher<'a> { type Item = &'a str; #[inline] @@ -1379,7 +1382,7 @@ impl StrExt for str { Split(CharSplits { start: 0, end: self.len(), - matcher: pat.into_matcher(self), + matcher: pat.into_searcher(self), allow_trailing_empty: true, finished: false, }) @@ -1413,7 +1416,7 @@ impl StrExt for str { #[inline] fn match_indices<'a, P: Pattern<'a>>(&'a self, pat: P) -> MatchIndices<'a, P> { - MatchIndices(pat.into_matcher(self)) + MatchIndices(pat.into_searcher(self)) } #[inline] @@ -1487,7 +1490,7 @@ impl StrExt for str 
{ where P::Searcher: DoubleEndedSearcher<'a> { let mut i = 0; let mut j = self.len(); - let mut matcher = pat.into_matcher(self); + let mut matcher = pat.into_searcher(self); if let Some((a, b)) = matcher.next_reject() { i = a; j = b; // Rember earliest known match, correct it below if @@ -1505,7 +1508,7 @@ impl StrExt for str { #[inline] fn trim_left_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str { let mut i = 0; - let mut matcher = pat.into_matcher(self); + let mut matcher = pat.into_searcher(self); if let Some((a, _)) = matcher.next_reject() { i = a; } @@ -1519,7 +1522,7 @@ impl StrExt for str { fn trim_right_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str where P::Searcher: ReverseSearcher<'a> { let mut j = self.len(); - let mut matcher = pat.into_matcher(self); + let mut matcher = pat.into_searcher(self); if let Some((_, b)) = matcher.next_reject_back() { j = b; } @@ -1591,12 +1594,12 @@ impl StrExt for str { } fn find<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option { - pat.into_matcher(self).next_match().map(|(i, _)| i) + pat.into_searcher(self).next_match().map(|(i, _)| i) } fn rfind<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option where P::Searcher: ReverseSearcher<'a> { - pat.into_matcher(self).next_match_back().map(|(i, _)| i) + pat.into_searcher(self).next_match_back().map(|(i, _)| i) } fn find_str<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option { diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index 2b77d877cf4fd..501fc27b37626 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -17,16 +17,16 @@ use super::CharEq; pub trait Pattern<'a>: Sized { type Searcher: Searcher<'a>; - fn into_matcher(self, haystack: &'a str) -> Self::Searcher; + fn into_searcher(self, haystack: &'a str) -> Self::Searcher; #[inline] fn is_contained_in(self, haystack: &'a str) -> bool { - self.into_matcher(haystack).next_match().is_some() + self.into_searcher(haystack).next_match().is_some() } #[inline] fn 
match_starts_at(self, haystack: &'a str, idx: usize) -> bool { - let mut matcher = self.into_matcher(haystack); + let mut matcher = self.into_searcher(haystack); loop { match matcher.next() { SearchStep::Match(i, _) if i == idx => return true, @@ -42,7 +42,7 @@ pub trait Pattern<'a>: Sized { #[inline] fn match_ends_at(self, haystack: &'a str, idx: usize) -> bool where Self::Searcher: ReverseSearcher<'a> { - let mut matcher = self.into_matcher(haystack); + let mut matcher = self.into_searcher(haystack); loop { match matcher.next_back() { SearchStep::Match(_, j) if idx == j => return true, @@ -115,9 +115,11 @@ pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} -// Impl for CharEq +// Impl for a CharEq wrapper -pub struct CharEqSearcher<'a, C> { +struct CharEqPattern(C); + +pub struct CharEqSearcher<'a, C: CharEq> { char_eq: C, haystack: &'a str, char_indices: super::CharIndices<'a>, @@ -125,15 +127,15 @@ pub struct CharEqSearcher<'a, C> { ascii_only: bool, } -impl<'a, C: CharEq> Pattern<'a> for C { +impl<'a, C: CharEq> Pattern<'a> for CharEqPattern { type Searcher = CharEqSearcher<'a, C>; #[inline] - fn into_matcher(self, haystack: &'a str) -> CharEqSearcher<'a, C> { + fn into_searcher(self, haystack: &'a str) -> CharEqSearcher<'a, C> { CharEqSearcher { - ascii_only: self.only_ascii(), + ascii_only: self.0.only_ascii(), haystack: haystack, - char_eq: self, + char_eq: self.0, char_indices: haystack.char_indices(), } } @@ -203,7 +205,7 @@ impl<'a, 'b> Pattern<'a> for &'b str { type Searcher = StrSearcher<'a, 'b>; #[inline] - fn into_matcher(self, haystack: &'a str) -> StrSearcher<'a, 'b> { + fn into_searcher(self, haystack: &'a str) -> StrSearcher<'a, 'b> { StrSearcher { haystack: haystack, needle: self, @@ -293,3 +295,65 @@ where F: FnOnce(&mut StrSearcher) -> SearchStep, SearchStep::Done } } + +macro_rules! 
associated_items { + ($t:ty, $s:ident, $e:expr) => { + // FIXME: #22463 + //type Searcher = $t; + + fn into_searcher(self, haystack: &'a str) -> $t { + let $s = self; + $e.into_searcher(haystack) + } + + #[inline] + fn is_contained_in(self, haystack: &'a str) -> bool { + let $s = self; + $e.is_contained_in(haystack) + } + + #[inline] + fn match_starts_at(self, haystack: &'a str, idx: usize) -> bool { + let $s = self; + $e.match_starts_at(haystack, idx) + } + + // FIXME: #21750 + /*#[inline] + fn match_ends_at(self, haystack: &'a str, idx: usize) -> bool + where $t: ReverseSearcher<'a> { + let $s = self; + $e.match_ends_at(haystack, idx) + }*/ + } +} + +// CharEq delegation impls + +impl<'a, 'b> Pattern<'a> for &'b [char] { + type Searcher = as Pattern<'a>>::Searcher; + associated_items!( as Pattern<'a>>::Searcher, + s, CharEqPattern(s)); +} + +impl<'a> Pattern<'a> for char { + type Searcher = as Pattern<'a>>::Searcher; + associated_items!( as Pattern<'a>>::Searcher, + s, CharEqPattern(s)); +} + +impl<'a, F> Pattern<'a> for F where F: FnMut(char) -> bool { + type Searcher = as Pattern<'a>>::Searcher; + associated_items!( as Pattern<'a>>::Searcher, + s, CharEqPattern(s)); +} + +// Deref-forward impl + +use ops::Deref; + +impl<'a, 'b, P: 'b + ?Sized, T: Deref + ?Sized> Pattern<'a> for &'b T where &'b P: Pattern<'a> { + type Searcher = <&'b P as Pattern<'a>>::Searcher; + associated_items!(<&'b P as Pattern<'a>>::Searcher, + s, (&**s)); +} diff --git a/src/libcoretest/str.rs b/src/libcoretest/str.rs index 9bd7cf9833e90..acd8cc42c7298 100644 --- a/src/libcoretest/str.rs +++ b/src/libcoretest/str.rs @@ -8,6 +8,16 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. 
+#[test] +fn test_pattern_deref_forward() { + let data = "aabcdaa"; + assert!(data.contains("bcd")); + assert!(data.contains(&"bcd")); + assert!(data.contains(&&"bcd")); + assert!(data.contains(&"bcd".to_string())); + assert!(data.contains(&&"bcd".to_string())); +} + #[test] fn test_empty_match_indices() { let data = "aä中!"; From a641996796f0ab11021671c0ce70a3c975bb4e37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marvin=20L=C3=B6bel?= Date: Tue, 17 Feb 2015 23:47:08 +0100 Subject: [PATCH 8/9] Fix tidy and rebase fallout Added a few bugfixes and additional testcases --- src/libcollections/str.rs | 33 --------- src/libcore/str/mod.rs | 15 ++-- src/libcore/str/pattern.rs | 27 +++++-- src/libcoretest/str.rs | 146 ++++++++++++++++++++++++++++++++++++- 4 files changed, 171 insertions(+), 50 deletions(-) diff --git a/src/libcollections/str.rs b/src/libcollections/str.rs index d355421039e0c..e86cf462cab9c 100644 --- a/src/libcollections/str.rs +++ b/src/libcollections/str.rs @@ -2893,22 +2893,6 @@ mod bench { b.iter(|| assert_eq!(s.split('V').count(), 3)); } - #[bench] - fn split_unicode_not_ascii(b: &mut Bencher) { - struct NotAscii(char); - impl CharEq for NotAscii { - fn matches(&mut self, c: char) -> bool { - let NotAscii(cc) = *self; - cc == c - } - fn only_ascii(&self) -> bool { false } - } - let s = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam"; - - b.iter(|| assert_eq!(s.split(NotAscii('V')).count(), 3)); - } - - #[bench] fn split_ascii(b: &mut Bencher) { let s = "Mary had a little lamb, Little lamb, little-lamb."; @@ -2917,23 +2901,6 @@ mod bench { b.iter(|| assert_eq!(s.split(' ').count(), len)); } - #[bench] - fn split_not_ascii(b: &mut Bencher) { - struct NotAscii(char); - impl CharEq for NotAscii { - #[inline] - fn matches(&mut self, c: char) -> bool { - let NotAscii(cc) = *self; - cc == c - } - fn only_ascii(&self) -> bool { false } - } - let s = "Mary had a little lamb, Little lamb, little-lamb."; - let len = s.split(' ').count(); - - b.iter(|| 
assert_eq!(s.split(NotAscii(' ')).count(), len)); - } - #[bench] fn split_extern_fn(b: &mut Bencher) { let s = "Mary had a little lamb, Little lamb, little-lamb."; diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs index a93083020334e..820ad4d8586ad 100644 --- a/src/libcore/str/mod.rs +++ b/src/libcore/str/mod.rs @@ -156,7 +156,6 @@ impl FromStr for bool { /// An error returned when parsing a `bool` from a string fails. #[derive(Debug, Clone, PartialEq)] -#[allow(missing_copy_implementations)] #[stable(feature = "rust1", since = "1.0.0")] pub struct ParseBoolError { _priv: () } @@ -235,7 +234,7 @@ pub unsafe fn from_utf8_unchecked<'a>(v: &'a [u8]) -> &'a str { pub unsafe fn from_c_str(s: *const i8) -> &'static str { let s = s as *const u8; let mut len = 0; - while *s.offset(len as int) != 0 { + while *s.offset(len as isize) != 0 { len += 1; } let v: &'static [u8] = ::mem::transmute(Slice { data: s, len: len }); @@ -258,7 +257,7 @@ impl CharEq for char { fn matches(&mut self, c: char) -> bool { *self == c } #[inline] - fn only_ascii(&self) -> bool { (*self as usize) < 128 } + fn only_ascii(&self) -> bool { (*self as u32) < 128 } } impl CharEq for F where F: FnMut(char) -> bool { @@ -764,7 +763,8 @@ impl TwoWaySearcher { // How far we can jump when we encounter a mismatch is all based on the fact // that (u, v) is a critical factorization for the needle. #[inline] - fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> Option<(usize, usize)> { + fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) + -> Option<(usize, usize)> { 'search: loop { // Check that we have room to search in if self.position + needle.len() > haystack.len() { @@ -955,6 +955,7 @@ Section: Comparing strings /// to compare &[u8] byte slices that are not necessarily valid UTF-8. 
#[inline] fn eq_slice_(a: &str, b: &str) -> bool { + // NOTE: In theory n should be libc::size_t and not usize, but libc is not available here #[allow(improper_ctypes)] extern { fn memcmp(s1: *const i8, s2: *const i8, n: usize) -> i32; } a.len() == b.len() && unsafe { @@ -1489,7 +1490,7 @@ impl StrExt for str { fn trim_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str where P::Searcher: DoubleEndedSearcher<'a> { let mut i = 0; - let mut j = self.len(); + let mut j = 0; let mut matcher = pat.into_searcher(self); if let Some((a, b)) = matcher.next_reject() { i = a; @@ -1507,7 +1508,7 @@ impl StrExt for str { #[inline] fn trim_left_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str { - let mut i = 0; + let mut i = self.len(); let mut matcher = pat.into_searcher(self); if let Some((a, _)) = matcher.next_reject() { i = a; @@ -1521,7 +1522,7 @@ impl StrExt for str { #[inline] fn trim_right_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str where P::Searcher: ReverseSearcher<'a> { - let mut j = self.len(); + let mut j = 0; let mut matcher = pat.into_searcher(self); if let Some((_, b)) = matcher.next_reject_back() { j = b; diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index 501fc27b37626..9cd5510db3702 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -58,6 +58,7 @@ pub trait Pattern<'a>: Sized { // Searcher +#[derive(Copy, Clone, Eq, PartialEq, Debug)] pub enum SearchStep { Match(usize, usize), Reject(usize, usize), @@ -190,7 +191,7 @@ impl<'a, C: CharEq> DoubleEndedSearcher<'a> for CharEqSearcher<'a, C> {} // Impl for &str -// TODO: Optimize the naive implementation here +// Todo: Optimize the naive implementation here #[derive(Clone)] pub struct StrSearcher<'a, 'b> { @@ -235,13 +236,16 @@ unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> { }, |m: &mut StrSearcher| { // Forward step for nonempty needle - let possible_match = &m.haystack[m.start .. 
m.start + m.needle.len()]; + // Compare if bytes are equal + let possible_match = &m.haystack.as_bytes()[m.start .. m.start + m.needle.len()]; let current_start = m.start; - if possible_match == m.needle { + if possible_match == m.needle.as_bytes() { m.start += m.needle.len(); SearchStep::Match(current_start, m.start) } else { - m.start += possible_match.chars().next().unwrap().len_utf8(); + // Skip a char + let haystack_suffix = &m.haystack[m.start..]; + m.start += haystack_suffix.chars().next().unwrap().len_utf8(); SearchStep::Reject(current_start, m.start) } }) @@ -262,13 +266,16 @@ unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> { }, |m: &mut StrSearcher| { // Backward step for nonempty needle - let possible_match = &m.haystack[m.end - m.needle.len() .. m.end]; + // Compare if bytes are equal + let possible_match = &m.haystack.as_bytes()[m.end - m.needle.len() .. m.end]; let current_end = m.end; - if possible_match == m.needle { + if possible_match == m.needle.as_bytes() { m.end -= m.needle.len(); SearchStep::Match(m.end, current_end) } else { - m.end -= possible_match.chars().rev().next().unwrap().len_utf8(); + // Skip a char + let haystack_prefix = &m.haystack[..m.end]; + m.end -= haystack_prefix.chars().rev().next().unwrap().len_utf8(); SearchStep::Reject(m.end, current_end) } }) @@ -290,6 +297,9 @@ where F: FnOnce(&mut StrSearcher) -> SearchStep, } else if m.start + m.needle.len() <= m.end { // Case for needle != "" g(&mut m) + } else if m.start < m.end { + m.done = true; + SearchStep::Reject(m.start, m.end) } else { m.done = true; SearchStep::Done @@ -352,7 +362,8 @@ impl<'a, F> Pattern<'a> for F where F: FnMut(char) -> bool { use ops::Deref; -impl<'a, 'b, P: 'b + ?Sized, T: Deref + ?Sized> Pattern<'a> for &'b T where &'b P: Pattern<'a> { +impl<'a, 'b, P: 'b + ?Sized, T: Deref + ?Sized> Pattern<'a> for &'b T +where &'b P: Pattern<'a> { type Searcher = <&'b P as Pattern<'a>>::Searcher; associated_items!(<&'b P as Pattern<'a>>::Searcher, s, 
(&**s)); diff --git a/src/libcoretest/str.rs b/src/libcoretest/str.rs index acd8cc42c7298..beb746d25b61c 100644 --- a/src/libcoretest/str.rs +++ b/src/libcoretest/str.rs @@ -1,4 +1,4 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. // @@ -139,8 +139,150 @@ fn test_utf16_code_units() { vec![0xE9, 0xD83D, 0xDCA9]) } +#[test] +fn starts_with_in_unicode() { + assert!(!"├── Cargo.toml".starts_with("# ")); +} -// rm x86_64-unknown-linux-gnu/stage1/test/coretesttest-x86_64-unknown-linux-gnu; env PLEASE_BENCH=1 make check-stage1-coretest TESTNAME=str::bench +#[test] +fn starts_short_long() { + assert!(!"".starts_with("##")); + assert!(!"##".starts_with("####")); + assert!("####".starts_with("##")); + assert!(!"##ä".starts_with("####")); + assert!("####ä".starts_with("##")); + assert!(!"##".starts_with("####ä")); + assert!("##ä##".starts_with("##ä")); + + assert!("".starts_with("")); + assert!("ä".starts_with("")); + assert!("#ä".starts_with("")); + assert!("##ä".starts_with("")); + assert!("ä###".starts_with("")); + assert!("#ä##".starts_with("")); + assert!("##ä#".starts_with("")); +} + +#[test] +fn contains_weird_cases() { + assert!("* \t".contains_char(' ')); + assert!(!"* \t".contains_char('?')); + assert!(!"* \t".contains_char('\u{1F4A9}')); +} + +#[test] +fn trim_ws() { + assert_eq!(" \t a \t ".trim_left_matches(|c: char| c.is_whitespace()), + "a \t "); + assert_eq!(" \t a \t ".trim_right_matches(|c: char| c.is_whitespace()), + " \t a"); + assert_eq!(" \t a \t ".trim_matches(|c: char| c.is_whitespace()), + "a"); + assert_eq!(" \t \t ".trim_left_matches(|c: char| c.is_whitespace()), + ""); + assert_eq!(" \t \t ".trim_right_matches(|c: char| c.is_whitespace()), + ""); + assert_eq!(" \t \t ".trim_matches(|c: char| c.is_whitespace()), + ""); +} + +mod pattern { + use std::str::Pattern; 
+ use std::str::{Searcher, ReverseSearcher, DoubleEndedSearcher}; + use std::str::SearchStep::{self, Match, Reject, Done}; + + macro_rules! make_test { + ($name:ident, $p:expr, $h:expr, [$($e:expr,)*]) => { + mod $name { + use std::str::Pattern; + use std::str::{Searcher, ReverseSearcher, DoubleEndedSearcher}; + use std::str::SearchStep::{self, Match, Reject, Done}; + use super::{cmp_search_to_vec}; + #[test] + fn fwd() { + cmp_search_to_vec(false, $p, $h, vec![$($e),*]); + } + #[test] + fn bwd() { + cmp_search_to_vec(true, $p, $h, vec![$($e),*]); + } + } + } + } + + fn cmp_search_to_vec<'a, P: Pattern<'a>>(rev: bool, pat: P, haystack: &'a str, + right: Vec) + where P::Searcher: ReverseSearcher<'a> + { + let mut searcher = pat.into_searcher(haystack); + let mut v = vec![]; + loop { + match if !rev {searcher.next()} else {searcher.next_back()} { + Match(a, b) => v.push(Match(a, b)), + Reject(a, b) => v.push(Reject(a, b)), + Done => break, + } + } + if rev { + v.reverse(); + } + assert_eq!(v, right); + } + + make_test!(str_searcher_ascii_haystack, "bb", "abbcbbd", [ + Reject(0, 1), + Match (1, 3), + Reject(3, 4), + Match (4, 6), + Reject(6, 7), + ]); + make_test!(str_searcher_empty_needle_ascii_haystack, "", "abbcbbd", [ + Match(0, 0), + Match(1, 1), + Match(2, 2), + Match(3, 3), + Match(4, 4), + Match(5, 5), + Match(6, 6), + Match(7, 7), + ]); + make_test!(str_searcher_mulibyte_haystack, " ", "├──", [ + Reject(0, 3), + Reject(3, 6), + Reject(6, 9), + ]); + make_test!(str_searcher_empty_needle_mulibyte_haystack, "", "├──", [ + Match(0, 0), + Match(3, 3), + Match(6, 6), + Match(9, 9), + ]); + make_test!(str_searcher_empty_needle_empty_haystack, "", "", [ + Match(0, 0), + ]); + make_test!(str_searcher_nonempty_needle_empty_haystack, "├", "", [ + ]); + make_test!(char_searcher_ascii_haystack, 'b', "abbcbbd", [ + Reject(0, 1), + Match (1, 2), + Match (2, 3), + Reject(3, 4), + Match (4, 5), + Match (5, 6), + Reject(6, 7), + ]); + 
make_test!(char_searcher_mulibyte_haystack, ' ', "├──", [ + Reject(0, 3), + Reject(3, 6), + Reject(6, 9), + ]); + make_test!(char_searcher_short_haystack, '\u{1F4A9}', "* \t", [ + Reject(0, 1), + Reject(1, 2), + Reject(2, 3), + ]); + +} mod bench { macro_rules! make_test_inner { From c8dd2d066d7b25246d2b940b7c161b8b67608b74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marvin=20L=C3=B6bel?= Date: Thu, 19 Feb 2015 14:36:58 +0100 Subject: [PATCH 9/9] Addressed PR comments --- src/compiletest/compiletest.rs | 2 +- src/compiletest/errors.rs | 2 +- src/compiletest/header.rs | 2 +- src/compiletest/runtest.rs | 4 +- src/libcollections/str.rs | 28 ++- src/libcore/str/mod.rs | 40 +++- src/libcore/str/pattern.rs | 213 ++++++++++++++---- src/libgraphviz/lib.rs | 2 +- src/librustc/lint/builtin.rs | 2 +- src/librustc/metadata/filesearch.rs | 2 +- .../middle/infer/region_inference/graphviz.rs | 2 +- src/librustc/session/mod.rs | 6 +- src/librustc_driver/pretty.rs | 2 +- src/librustdoc/html/render.rs | 2 +- src/libstd/old_path/windows.rs | 2 +- src/libsyntax/parse/lexer/comments.rs | 4 +- src/libsyntax/parse/parser.rs | 2 +- src/libunicode/u_str.rs | 2 +- 18 files changed, 233 insertions(+), 86 deletions(-) diff --git a/src/compiletest/compiletest.rs b/src/compiletest/compiletest.rs index 30de253fbad42..278ce5565d9fc 100644 --- a/src/compiletest/compiletest.rs +++ b/src/compiletest/compiletest.rs @@ -23,7 +23,7 @@ #![feature(env)] #![feature(core)] -// #![deny(warnings)] +#![deny(warnings)] extern crate test; extern crate getopts; diff --git a/src/compiletest/errors.rs b/src/compiletest/errors.rs index d8faa53a2de64..7411a9b48d417 100644 --- a/src/compiletest/errors.rs +++ b/src/compiletest/errors.rs @@ -58,7 +58,7 @@ pub fn load_errors(testfile: &Path) -> Vec { fn parse_expected(last_nonfollow_error: Option, line_num: uint, line: &str) -> Option<(WhichLine, ExpectedError)> { - let start = match line.find_str("//~") { Some(i) => i, None => return None }; + let start = match 
line.find("//~") { Some(i) => i, None => return None }; let (follow, adjusts) = if line.char_at(start + 3) == '|' { (true, 0) } else { diff --git a/src/compiletest/header.rs b/src/compiletest/header.rs index c253967964322..9c217651c3edc 100644 --- a/src/compiletest/header.rs +++ b/src/compiletest/header.rs @@ -330,7 +330,7 @@ fn parse_name_directive(line: &str, directive: &str) -> bool { pub fn parse_name_value_directive(line: &str, directive: &str) -> Option { let keycolon = format!("{}:", directive); - match line.find_str(&keycolon) { + match line.find(&keycolon) { Some(colon) => { let value = line[(colon + keycolon.len()) .. line.len()].to_string(); debug!("{}: {}", directive, value); diff --git a/src/compiletest/runtest.rs b/src/compiletest/runtest.rs index 1cbb8742bbc5a..39ecc323125b7 100644 --- a/src/compiletest/runtest.rs +++ b/src/compiletest/runtest.rs @@ -847,7 +847,7 @@ fn check_debugger_output(debugger_run_result: &ProcRes, check_lines: &[String]) check_lines.iter().map(|s| { s .trim() - .split_str("[...]") + .split("[...]") .map(|x| x.to_string()) .collect() }).collect(); @@ -866,7 +866,7 @@ fn check_debugger_output(debugger_run_result: &ProcRes, check_lines: &[String]) None } } else { - rest.find_str(frag) + rest.find(frag) }; match found { None => { diff --git a/src/libcollections/str.rs b/src/libcollections/str.rs index e86cf462cab9c..92dc01dc3e4e4 100644 --- a/src/libcollections/str.rs +++ b/src/libcollections/str.rs @@ -547,8 +547,8 @@ pub trait StrExt: Index { /// ```rust /// assert!("hello".contains_char('e')); /// ``` - #[unstable(feature = "collections", - reason = "might get removed in favour of a more generic contains()")] + #[unstable(feature = "collections")] + #[deprecated(since = "1.0.0", reason = "use `contains()` with a char")] fn contains_char<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool { core_str::StrExt::contains_char(&self[..], pat) } @@ -660,7 +660,7 @@ pub trait StrExt: Index { /// let v: Vec<&str> = 
"lionXXtigerXleopard".split('X').rev().collect(); /// assert_eq!(v, vec!["leopard", "tiger", "", "lion"]); /// ``` - #[unstable(feature = "collections", reason = "might get removed")] + #[stable(feature = "rust1", since = "1.0.0")] fn split_terminator<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitTerminator<'a, P> { core_str::StrExt::split_terminator(&self[..], pat) } @@ -708,6 +708,8 @@ pub trait StrExt: Index { /// ``` #[unstable(feature = "collections", reason = "might have its iterator type changed")] + // NB: Right now MatchIndices yields `(usize, usize)`, + // but it would be more consistent and useful to return `(usize, &str)` fn match_indices<'a, P: Pattern<'a>>(&'a self, pat: P) -> MatchIndices<'a, P> { core_str::StrExt::match_indices(&self[..], pat) } @@ -723,8 +725,8 @@ pub trait StrExt: Index { /// let v: Vec<&str> = "1abcabc2".split_str("abc").collect(); /// assert_eq!(v, vec!["1", "", "2"]); /// ``` - #[unstable(feature = "collections", - reason = "might get removed in the future in favor of a more generic split()")] + #[unstable(feature = "collections")] + #[deprecated(since = "1.0.0", reason = "use `split()` with a `&str`")] fn split_str<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitStr<'a, P> { core_str::StrExt::split_str(&self[..], pat) } @@ -840,7 +842,8 @@ pub trait StrExt: Index { /// ``` #[stable(feature = "rust1", since = "1.0.0")] fn ends_with<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool - where P::Searcher: ReverseSearcher<'a> { + where P::Searcher: ReverseSearcher<'a> + { core_str::StrExt::ends_with(&self[..], pat) } @@ -861,7 +864,8 @@ pub trait StrExt: Index { /// ``` #[stable(feature = "rust1", since = "1.0.0")] fn trim_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str - where P::Searcher: DoubleEndedSearcher<'a> { + where P::Searcher: DoubleEndedSearcher<'a> + { core_str::StrExt::trim_matches(&self[..], pat) } @@ -902,7 +906,8 @@ pub trait StrExt: Index { /// ``` #[stable(feature = "rust1", since = "1.0.0")] fn 
trim_right_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str - where P::Searcher: ReverseSearcher<'a> { + where P::Searcher: ReverseSearcher<'a> + { core_str::StrExt::trim_right_matches(&self[..], pat) } @@ -1108,7 +1113,8 @@ pub trait StrExt: Index { /// ``` #[stable(feature = "rust1", since = "1.0.0")] fn rfind<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option - where P::Searcher: ReverseSearcher<'a> { + where P::Searcher: ReverseSearcher<'a> + { core_str::StrExt::rfind(&self[..], pat) } @@ -1131,8 +1137,8 @@ pub trait StrExt: Index { /// assert_eq!(s.find_str("老虎 L"), Some(6)); /// assert_eq!(s.find_str("muffin man"), None); /// ``` - #[unstable(feature = "collections", - reason = "might get removed in favor of a more generic find in the future")] + #[unstable(feature = "collections")] + #[deprecated(since = "1.0.0", reason = "use `find()` with a `&str`")] fn find_str<'a, P: Pattern<'a>>(&'a self, needle: P) -> Option { core_str::StrExt::find_str(&self[..], needle) } diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs index 820ad4d8586ad..7e51f8e8503b4 100644 --- a/src/libcore/str/mod.rs +++ b/src/libcore/str/mod.rs @@ -242,8 +242,10 @@ pub unsafe fn from_c_str(s: *const i8) -> &'static str { } /// Something that can be used to compare against a character -#[unstable(feature = "core", - reason = "definition may change as pattern-related methods are stabilized")] +#[unstable(feature = "core")] +#[deprecated(since = "1.0.0", + reason = "use `Pattern` instead")] +// NB: Rather than removing it, make it private and move it into self::pattern pub trait CharEq { /// Determine if the splitter should split at the given character fn matches(&mut self, char) -> bool; @@ -252,6 +254,7 @@ pub trait CharEq { fn only_ascii(&self) -> bool; } +#[allow(deprecated) /* for CharEq */ ] impl CharEq for char { #[inline] fn matches(&mut self, c: char) -> bool { *self == c } @@ -260,6 +263,7 @@ impl CharEq for char { fn only_ascii(&self) -> bool { (*self as u32) < 128 } } 
+#[allow(deprecated) /* for CharEq */ ] impl CharEq for F where F: FnMut(char) -> bool { #[inline] fn matches(&mut self, c: char) -> bool { (*self)(c) } @@ -268,13 +272,16 @@ impl CharEq for F where F: FnMut(char) -> bool { fn only_ascii(&self) -> bool { false } } +#[allow(deprecated) /* for CharEq */ ] impl<'a> CharEq for &'a [char] { #[inline] + #[allow(deprecated) /* for CharEq */ ] fn matches(&mut self, c: char) -> bool { self.iter().any(|&m| { let mut m = m; m.matches(c) }) } #[inline] + #[allow(deprecated) /* for CharEq */ ] fn only_ascii(&self) -> bool { self.iter().all(|m| m.only_ascii()) } @@ -764,7 +771,7 @@ impl TwoWaySearcher { // that (u, v) is a critical factorization for the needle. #[inline] fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) - -> Option<(usize, usize)> { + -> Option<(usize, usize)> { 'search: loop { // Check that we have room to search in if self.position + needle.len() > haystack.len() { @@ -866,6 +873,8 @@ impl TwoWaySearcher { /// The internal state of an iterator that searches for matches of a substring /// within a larger string using a dynamically chosen search algorithm #[derive(Clone)] +// NB: This is kept around for convenience because +// it is planned to be used again in the future enum OldSearcher { TwoWay(TwoWaySearcher), TwoWayLong(TwoWaySearcher), @@ -896,6 +905,8 @@ impl OldSearcher { } #[derive(Clone)] +// NB: This is kept around for convenience because +// it is planned to be used again in the future struct OldMatchIndices<'a, 'b> { // constants haystack: &'a str, @@ -921,7 +932,8 @@ impl<'a, P: Pattern<'a>> Iterator for MatchIndices<'a, P> { /// An iterator over the substrings of a string separated by a given /// search string -#[unstable(feature = "core", reason = "type may be removed")] +#[unstable(feature = "core")] +#[deprecated(since = "1.0.0", reason = "use `Split` with a `&str`")] pub struct SplitStr<'a, P: Pattern<'a>>(Split<'a, P>); impl<'a, P: Pattern<'a>> Iterator for SplitStr<'a, P> 
{ type Item = &'a str; @@ -1282,8 +1294,7 @@ where P::Searcher: DoubleEndedSearcher<'a> { } /// Return type of `StrExt::split_terminator` -#[unstable(feature = "core", - reason = "might get removed in favour of a constructor method on Split")] +#[stable(feature = "rust1", since = "1.0.0")] pub struct SplitTerminator<'a, P: Pattern<'a>>(CharSplits<'a, P>); delegate_iter!{pattern &'a str : SplitTerminator<'a, P>} @@ -1421,6 +1432,7 @@ impl StrExt for str { } #[inline] + #[allow(deprecated) /* for SplitStr */ ] fn split_str<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitStr<'a, P> { SplitStr(self.split(pat)) } @@ -1477,18 +1489,20 @@ impl StrExt for str { #[inline] fn starts_with<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool { - pat.match_starts_at(self, 0) + pat.is_prefix_of(self) } #[inline] fn ends_with<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool - where P::Searcher: ReverseSearcher<'a> { - pat.match_ends_at(self, self.len()) + where P::Searcher: ReverseSearcher<'a> + { + pat.is_suffix_of(self) } #[inline] fn trim_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str - where P::Searcher: DoubleEndedSearcher<'a> { + where P::Searcher: DoubleEndedSearcher<'a> + { let mut i = 0; let mut j = 0; let mut matcher = pat.into_searcher(self); @@ -1521,7 +1535,8 @@ impl StrExt for str { #[inline] fn trim_right_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str - where P::Searcher: ReverseSearcher<'a> { + where P::Searcher: ReverseSearcher<'a> + { let mut j = 0; let mut matcher = pat.into_searcher(self); if let Some((_, b)) = matcher.next_reject_back() { @@ -1599,7 +1614,8 @@ impl StrExt for str { } fn rfind<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option - where P::Searcher: ReverseSearcher<'a> { + where P::Searcher: ReverseSearcher<'a> + { pat.into_searcher(self).next_match_back().map(|(i, _)| i) } diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index 9cd5510db3702..1f669c66eb117 100644 --- a/src/libcore/str/pattern.rs +++ 
b/src/libcore/str/pattern.rs @@ -8,66 +8,117 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -#![allow(missing_docs)] +#![allow(deprecated) /* for CharEq */ ] use prelude::*; use super::CharEq; // Pattern +/// A string pattern. +/// +/// A `Pattern<'a>` expresses that the implementing type +/// can be used as a string pattern for searching in a `&'a str`. +/// +/// For example, both `'a'` and `"aa"` are patterns that +/// would match at index `1` in the string `"baaaab"`. +/// +/// The trait itself acts as a builder for an associated +/// `Searcher` type, which does the actual work of finding +/// occurrences of the pattern in a string. pub trait Pattern<'a>: Sized { + /// Associated searcher for this pattern type Searcher: Searcher<'a>; + + /// Construct the associated searcher from + /// `self` and the `haystack` to search in. fn into_searcher(self, haystack: &'a str) -> Self::Searcher; + /// Check whether the pattern matches anywhere in the haystack #[inline] fn is_contained_in(self, haystack: &'a str) -> bool { self.into_searcher(haystack).next_match().is_some() } + /// Check whether the pattern matches at the front of the haystack #[inline] - fn match_starts_at(self, haystack: &'a str, idx: usize) -> bool { - let mut matcher = self.into_searcher(haystack); - loop { - match matcher.next() { - SearchStep::Match(i, _) if i == idx => return true, - SearchStep::Match(i, _) - | SearchStep::Reject(i, _) if i >= idx => break, - SearchStep::Done => break, - _ => continue, - } + fn is_prefix_of(self, haystack: &'a str) -> bool { + match self.into_searcher(haystack).next() { + SearchStep::Match(0, _) => true, + _ => false, } - false } + /// Check whether the pattern matches at the back of the haystack #[inline] - fn match_ends_at(self, haystack: &'a str, idx: usize) -> bool - where Self::Searcher: ReverseSearcher<'a> { - let mut matcher = self.into_searcher(haystack); - loop { - match matcher.next_back() { - 
SearchStep::Match(_, j) if idx == j => return true, - SearchStep::Match(_, j) - | SearchStep::Reject(_, j) if idx >= j => break, - SearchStep::Done => break, - _ => continue, - } + fn is_suffix_of(self, haystack: &'a str) -> bool + where Self::Searcher: ReverseSearcher<'a> + { + match self.into_searcher(haystack).next_back() { + SearchStep::Match(_, j) if haystack.len() == j => true, + _ => false, } - false } } // Searcher +/// Result of calling `Searcher::next()` or `ReverseSearcher::next_back()`. #[derive(Copy, Clone, Eq, PartialEq, Debug)] pub enum SearchStep { + /// Expresses that a match of the pattern has been found at + /// `haystack[a..b]`. Match(usize, usize), + /// Expresses that `haystack[a..b]` has been rejected as a possible match + /// of the pattern. + /// + /// Note that there might be more than one `Reject` between two `Match`es, + /// there is no requirement for them to be combined into one. Reject(usize, usize), + /// Expresses that every byte of the haystack has been visited, ending + /// the iteration. Done } +/// A searcher for a string pattern. +/// +/// This trait provides methods for searching for non-overlapping +/// matches of a pattern starting from the front (left) of a string. +/// +/// It will be implemented by associated `Searcher` +/// types of the `Pattern` trait. +/// +/// The trait is marked unsafe because the indices returned by the +/// `next()` methods are required to lie on valid utf8 boundaries in +/// the haystack. This enables consumers of this trait to +/// slice the haystack without additional runtime checks. pub unsafe trait Searcher<'a> { + /// Getter for the underlying string to be searched in + /// + /// Will always return the same `&str` fn haystack(&self) -> &'a str; + + /// Performs the next search step starting from the front. + /// + /// - Returns `Match(a, b)` if `haystack[a..b]` matches the pattern. + /// - Returns `Reject(a, b)` if `haystack[a..b]` can not match the + /// pattern, even partially. 
+ /// - Returns `Done` if every byte of the haystack has been visited + /// + /// The stream of `Match` and `Reject` values up to a `Done` + /// will contain index ranges that are adjacent, non-overlapping, + /// covering the whole haystack, and lying on utf8 boundaries. + /// + /// A `Match` result needs to contain the whole matched pattern, + /// however `Reject` results may be split up into arbitrarily + /// many adjacent fragments. Both ranges may have zero length. + /// + /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` + /// might produce the stream + /// `[Reject(0, 1), Reject(1, 2), Match(2, 5), Reject(5, 8)]` fn next(&mut self) -> SearchStep; + + /// Find the next `Match` result. See `next()` #[inline] fn next_match(&mut self) -> Option<(usize, usize)> { loop { @@ -78,8 +129,10 @@ pub unsafe trait Searcher<'a> { } } } + + /// Find the next `Reject` result. See `next()` #[inline] - fn next_reject(&mut self) -> Option<(usize, usize)>{ + fn next_reject(&mut self) -> Option<(usize, usize)> { loop { match self.next() { SearchStep::Reject(a, b) => return Some((a, b)), @@ -90,8 +143,42 @@ pub unsafe trait Searcher<'a> { } } +/// A reverse searcher for a string pattern. +/// +/// This trait provides methods for searching for non-overlapping +/// matches of a pattern starting from the back (right) of a string. +/// +/// It will be implemented by associated `Searcher` +/// types of the `Pattern` trait if the pattern supports searching +/// for it from the back. +/// +/// The index ranges returned by this trait are not required +/// to exactly match those of the forward search in reverse. +/// +/// For the reason why this trait is marked unsafe, see the +/// parent trait `Searcher`. pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { + /// Performs the next search step starting from the back. + /// + /// - Returns `Match(a, b)` if `haystack[a..b]` matches the pattern. 
+ /// - Returns `Reject(a, b)` if `haystack[a..b]` can not match the + /// pattern, even partially. + /// - Returns `Done` if every byte of the haystack has been visited + /// + /// The stream of `Match` and `Reject` values up to a `Done` + /// will contain index ranges that are adjacent, non-overlapping, + /// covering the whole haystack, and lying on utf8 boundaries. + /// + /// A `Match` result needs to contain the whole matched pattern, + /// however `Reject` results may be split up into arbitrarily + /// many adjacent fragments. Both ranges may have zero length. + /// + /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` + /// might produce the stream + /// `[Reject(7, 8), Match(4, 7), Reject(1, 4), Reject(0, 1)]` fn next_back(&mut self) -> SearchStep; + + /// Find the next `Match` result. See `next_back()` #[inline] fn next_match_back(&mut self) -> Option<(usize, usize)>{ loop { @@ -102,6 +189,8 @@ pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { } } } + + /// Find the next `Reject` result. See `next_back()` #[inline] fn next_reject_back(&mut self) -> Option<(usize, usize)>{ loop { @@ -114,13 +203,34 @@ pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { } } +/// A marker trait to express that a `ReverseSearcher` +/// can be used for a `DoubleEndedIterator` implementation. +/// +/// For this, the impl of `Searcher` and `ReverseSearcher` need +/// to follow these conditions: +/// +/// - All results of `next()` need to be identical +/// to the results of `next_back()` in reverse order. +/// - `next()` and `next_back()` need to behave as +/// the two ends of a range of values, that is they +/// can not "walk past each other". +/// +/// # Example +/// +/// `char::Searcher` is a `DoubleEndedSearcher` because searching for a +/// `char` only requires looking at one at a time, which behaves the same +/// from both ends. 
+/// +/// `(&str)::Searcher` is not a `DoubleEndedSearcher` because +/// the pattern `"aa"` in the haystack `"aaa"` matches as either +/// `"[aa]a"` or `"a[aa]"`, depending on which side it is searched from. pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} // Impl for a CharEq wrapper struct CharEqPattern(C); -pub struct CharEqSearcher<'a, C: CharEq> { +struct CharEqSearcher<'a, C: CharEq> { char_eq: C, haystack: &'a str, char_indices: super::CharIndices<'a>, @@ -194,7 +304,7 @@ impl<'a, C: CharEq> DoubleEndedSearcher<'a> for CharEqSearcher<'a, C> {} // Todo: Optimize the naive implementation here #[derive(Clone)] -pub struct StrSearcher<'a, 'b> { +struct StrSearcher<'a, 'b> { haystack: &'a str, needle: &'b str, start: usize, @@ -202,6 +312,10 @@ pub struct StrSearcher<'a, 'b> { done: bool, } +/// Non-allocating substring search. +/// +/// Will handle the pattern `""` as returning empty matches at each utf8 +/// boundary. impl<'a, 'b> Pattern<'a> for &'b str { type Searcher = StrSearcher<'a, 'b>; @@ -236,9 +350,9 @@ unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> { }, |m: &mut StrSearcher| { // Forward step for nonempty needle - // Compare if bytes are equal - let possible_match = &m.haystack.as_bytes()[m.start .. m.start + m.needle.len()]; let current_start = m.start; + // Compare byte window because this might break utf8 boundaries + let possible_match = &m.haystack.as_bytes()[m.start .. m.start + m.needle.len()]; if possible_match == m.needle.as_bytes() { m.start += m.needle.len(); SearchStep::Match(current_start, m.start) @@ -266,9 +380,9 @@ unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> { }, |m: &mut StrSearcher| { // Backward step for nonempty needle - // Compare if bytes are equal - let possible_match = &m.haystack.as_bytes()[m.end - m.needle.len() .. m.end]; let current_end = m.end; + // Compare byte window because this might break utf8 boundaries + let possible_match = &m.haystack.as_bytes()[m.end - m.needle.len() ..
m.end]; if possible_match == m.needle.as_bytes() { m.end -= m.needle.len(); SearchStep::Match(m.end, current_end) @@ -282,9 +396,13 @@ unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> { } } -fn str_search_step(mut m: &mut StrSearcher, f: F, g: G) -> SearchStep -where F: FnOnce(&mut StrSearcher) -> SearchStep, - G: FnOnce(&mut StrSearcher) -> SearchStep +// Helper function for encapsulating the common control flow +// of doing a search step from the front or doing a search step from the back +fn str_search_step(mut m: &mut StrSearcher, + empty_needle_step: F, + nonempty_needle_step: G) -> SearchStep + where F: FnOnce(&mut StrSearcher) -> SearchStep, + G: FnOnce(&mut StrSearcher) -> SearchStep { if m.done { SearchStep::Done @@ -293,11 +411,12 @@ where F: FnOnce(&mut StrSearcher) -> SearchStep, if m.start == m.end { m.done = true; } - f(&mut m) + empty_needle_step(&mut m) } else if m.start + m.needle.len() <= m.end { // Case for needle != "" - g(&mut m) + nonempty_needle_step(&mut m) } else if m.start < m.end { + // Remaining slice shorter than needle, reject it m.done = true; SearchStep::Reject(m.start, m.end) } else { @@ -323,35 +442,39 @@ macro_rules! 
associated_items { } #[inline] - fn match_starts_at(self, haystack: &'a str, idx: usize) -> bool { + fn is_prefix_of(self, haystack: &'a str) -> bool { let $s = self; - $e.match_starts_at(haystack, idx) + $e.is_prefix_of(haystack) } // FIXME: #21750 /*#[inline] - fn match_ends_at(self, haystack: &'a str, idx: usize) -> bool - where $t: ReverseSearcher<'a> { + fn is_suffix_of(self, haystack: &'a str) -> bool + where $t: ReverseSearcher<'a> + { let $s = self; - $e.match_ends_at(haystack, idx) + $e.is_suffix_of(haystack) }*/ } } // CharEq delegation impls -impl<'a, 'b> Pattern<'a> for &'b [char] { +/// Searches for chars that are equal to a given char +impl<'a> Pattern<'a> for char { type Searcher = as Pattern<'a>>::Searcher; associated_items!( as Pattern<'a>>::Searcher, s, CharEqPattern(s)); } -impl<'a> Pattern<'a> for char { +/// Searches for chars that are equal to any of the chars in the array +impl<'a, 'b> Pattern<'a> for &'b [char] { type Searcher = as Pattern<'a>>::Searcher; associated_items!( as Pattern<'a>>::Searcher, s, CharEqPattern(s)); } +/// Searches for chars that match the given predicate impl<'a, F> Pattern<'a> for F where F: FnMut(char) -> bool { type Searcher = as Pattern<'a>>::Searcher; associated_items!( as Pattern<'a>>::Searcher, @@ -362,8 +485,10 @@ impl<'a, F> Pattern<'a> for F where F: FnMut(char) -> bool { use ops::Deref; +/// Delegates to the next deref coercion of `Self` that implements `Pattern` impl<'a, 'b, P: 'b + ?Sized, T: Deref + ?Sized> Pattern<'a> for &'b T -where &'b P: Pattern<'a> { + where &'b P: Pattern<'a> +{ type Searcher = <&'b P as Pattern<'a>>::Searcher; associated_items!(<&'b P as Pattern<'a>>::Searcher, s, (&**s)); diff --git a/src/libgraphviz/lib.rs b/src/libgraphviz/lib.rs index acd52c752e8aa..09fbf4935e4c1 100644 --- a/src/libgraphviz/lib.rs +++ b/src/libgraphviz/lib.rs @@ -463,7 +463,7 @@ impl<'a> LabelText<'a> { fn pre_escaped_content(self) -> Cow<'a, str> { match self { EscStr(s) => s, - LabelStr(s) => if 
s.contains_char('\\') { + LabelStr(s) => if s.contains('\\') { (&*s).escape_default().into_cow() } else { s diff --git a/src/librustc/lint/builtin.rs b/src/librustc/lint/builtin.rs index dc81e89902bb4..74ae9692abd61 100644 --- a/src/librustc/lint/builtin.rs +++ b/src/librustc/lint/builtin.rs @@ -784,7 +784,7 @@ impl NonCamelCaseTypes { // start with a non-lowercase letter rather than non-uppercase // ones (some scripts don't have a concept of upper/lowercase) - ident.len() > 0 && !ident.char_at(0).is_lowercase() && !ident.contains_char('_') + ident.len() > 0 && !ident.char_at(0).is_lowercase() && !ident.contains('_') } fn to_camel_case(s: &str) -> String { diff --git a/src/librustc/metadata/filesearch.rs b/src/librustc/metadata/filesearch.rs index 3caa0f5b4db4c..d63b3dd60d01d 100644 --- a/src/librustc/metadata/filesearch.rs +++ b/src/librustc/metadata/filesearch.rs @@ -219,7 +219,7 @@ pub fn rust_path() -> Vec { let mut env_rust_path: Vec = match get_rust_path() { Some(env_path) => { let env_path_components = - env_path.split_str(PATH_ENTRY_SEPARATOR); + env_path.split(PATH_ENTRY_SEPARATOR); env_path_components.map(|s| Path::new(s)).collect() } None => Vec::new() diff --git a/src/librustc/middle/infer/region_inference/graphviz.rs b/src/librustc/middle/infer/region_inference/graphviz.rs index 67875ae225224..43cd1fc8edbac 100644 --- a/src/librustc/middle/infer/region_inference/graphviz.rs +++ b/src/librustc/middle/infer/region_inference/graphviz.rs @@ -90,7 +90,7 @@ pub fn maybe_print_constraints_for<'a, 'tcx>(region_vars: &RegionVarBindings<'a, tcx.sess.bug("empty string provided as RUST_REGION_GRAPH"); } - if output_template.contains_char('%') { + if output_template.contains('%') { let mut new_str = String::new(); for c in output_template.chars() { if c == '%' { diff --git a/src/librustc/session/mod.rs b/src/librustc/session/mod.rs index c1c5518887577..4a3efc861d1a0 100644 --- a/src/librustc/session/mod.rs +++ b/src/librustc/session/mod.rs @@ -280,9 +280,9 @@ fn 
split_msg_into_multilines(msg: &str) -> Option { } let mut tail = &msg[head..]; - let third = tail.find_str("(values differ") - .or(tail.find_str("(lifetime")) - .or(tail.find_str("(cyclic type of infinite size")); + let third = tail.find("(values differ") + .or(tail.find("(lifetime")) + .or(tail.find("(cyclic type of infinite size")); // Insert `\n` before any remaining messages which match. if let Some(pos) = third { // The end of the message may just be wrapped in `()` without diff --git a/src/librustc_driver/pretty.rs b/src/librustc_driver/pretty.rs index 0fbfa5fd89dd7..e349e8d7bb5b4 100644 --- a/src/librustc_driver/pretty.rs +++ b/src/librustc_driver/pretty.rs @@ -348,7 +348,7 @@ impl FromStr for UserIdentifiedItem { type Err = (); fn from_str(s: &str) -> Result { Ok(s.parse().map(ItemViaNode).unwrap_or_else(|_| { - ItemViaPath(s.split_str("::").map(|s| s.to_string()).collect()) + ItemViaPath(s.split("::").map(|s| s.to_string()).collect()) })) } } diff --git a/src/librustdoc/html/render.rs b/src/librustdoc/html/render.rs index fc3c87389917a..735487611dc50 100644 --- a/src/librustdoc/html/render.rs +++ b/src/librustdoc/html/render.rs @@ -1469,7 +1469,7 @@ fn full_path(cx: &Context, item: &clean::Item) -> String { fn shorter<'a>(s: Option<&'a str>) -> &'a str { match s { - Some(s) => match s.find_str("\n\n") { + Some(s) => match s.find("\n\n") { Some(pos) => &s[..pos], None => s, }, diff --git a/src/libstd/old_path/windows.rs b/src/libstd/old_path/windows.rs index 887dc804c7af3..80c19816bd506 100644 --- a/src/libstd/old_path/windows.rs +++ b/src/libstd/old_path/windows.rs @@ -522,7 +522,7 @@ impl GenericPath for Path { fn path_relative_from(&self, base: &Path) -> Option { fn comp_requires_verbatim(s: &str) -> bool { - s == "." || s == ".." || s.contains_char(SEP2) + s == "." || s == ".." 
|| s.contains(SEP2) } if !self.equiv_prefix(base) { diff --git a/src/libsyntax/parse/lexer/comments.rs b/src/libsyntax/parse/lexer/comments.rs index 1f06db600278b..7a5d75581a511 100644 --- a/src/libsyntax/parse/lexer/comments.rs +++ b/src/libsyntax/parse/lexer/comments.rs @@ -92,7 +92,7 @@ pub fn strip_doc_comment_decoration(comment: &str) -> String { let mut first = true; for line in &lines { for (j, c) in line.chars().enumerate() { - if j > i || !"* \t".contains_char(c) { + if j > i || !"* \t".contains(c) { can_trim = false; break; } @@ -264,7 +264,7 @@ fn read_block_comment(rdr: &mut StringReader, if is_block_doc_comment(&curr_line[..]) { return } - assert!(!curr_line.contains_char('\n')); + assert!(!curr_line.contains('\n')); lines.push(curr_line); } else { let mut level: isize = 1; diff --git a/src/libsyntax/parse/parser.rs b/src/libsyntax/parse/parser.rs index 370201e53825e..eb5688b3ed89a 100644 --- a/src/libsyntax/parse/parser.rs +++ b/src/libsyntax/parse/parser.rs @@ -2496,7 +2496,7 @@ impl<'a> Parser<'a> { let fstr = n.as_str(); self.span_err(last_span, &format!("unexpected token: `{}`", n.as_str())); - if fstr.chars().all(|x| "0123456789.".contains_char(x)) { + if fstr.chars().all(|x| "0123456789.".contains(x)) { let float = match fstr.parse::().ok() { Some(f) => f, None => continue, diff --git a/src/libunicode/u_str.rs b/src/libunicode/u_str.rs index 9bd8c5525a056..38cbe5c7dea16 100644 --- a/src/libunicode/u_str.rs +++ b/src/libunicode/u_str.rs @@ -84,7 +84,7 @@ impl UnicodeStr for str { #[inline] fn trim(&self) -> &str { - self.trim_left().trim_right() + self.trim_matches(|c: char| c.is_whitespace()) } #[inline]