diff --git a/regex-debug/Cargo.toml b/regex-debug/Cargo.toml index 54b2bb511d..1291448bbb 100644 --- a/regex-debug/Cargo.toml +++ b/regex-debug/Cargo.toml @@ -14,5 +14,6 @@ workspace = ".." docopt = "0.8" regex = { version = "0.2", path = ".." } regex-syntax = { version = "0.4.0", path = "../regex-syntax" } +regex-syntax2 = { version = "0.5.0", path = "../regex-syntax-2" } serde = "1" serde_derive = "1" diff --git a/regex-debug/src/main.rs b/regex-debug/src/main.rs index f31dc22a9c..97f628d542 100644 --- a/regex-debug/src/main.rs +++ b/regex-debug/src/main.rs @@ -1,6 +1,7 @@ extern crate docopt; extern crate regex; extern crate regex_syntax as syntax; +extern crate regex_syntax2; extern crate serde; #[macro_use] extern crate serde_derive; @@ -17,6 +18,8 @@ use syntax::{ExprBuilder, Expr, Literals}; const USAGE: &'static str = " Usage: regex-debug [options] ast + regex-debug [options] ast2 + regex-debug [options] hir2 regex-debug [options] prefixes ... regex-debug [options] suffixes ... regex-debug [options] anchors @@ -51,6 +54,8 @@ Options: #[derive(Deserialize)] struct Args { cmd_ast: bool, + cmd_ast2: bool, + cmd_hir2: bool, cmd_prefixes: bool, cmd_suffixes: bool, cmd_anchors: bool, @@ -93,6 +98,10 @@ fn main() { fn run(args: &Args) -> Result<()> { if args.cmd_ast { cmd_ast(args) + } else if args.cmd_ast2 { + cmd_ast2(args) + } else if args.cmd_hir2 { + cmd_hir2(args) } else if args.cmd_prefixes { cmd_literals(args) } else if args.cmd_suffixes { @@ -109,7 +118,28 @@ fn run(args: &Args) -> Result<()> { } fn cmd_ast(args: &Args) -> Result<()> { - println!("{:#?}", try!(args.parse_one())); + let ast = try!(args.parse_one()); + println!("{:#?}", ast); + Ok(()) +} + +fn cmd_ast2(args: &Args) -> Result<()> { + use regex_syntax2::ast::parse::Parser; + + let mut parser = Parser::new(); + let ast = try!(parser.parse(&args.arg_pattern)); + println!("{:#?}", ast); + Ok(()) +} + +fn cmd_hir2(args: &Args) -> Result<()> { + use regex_syntax2::ParserBuilder; + + let mut parser 
= ParserBuilder::new() + .allow_invalid_utf8(false) + .build(); + let hir = try!(parser.parse(&args.arg_pattern)); + println!("{:#?}", hir); Ok(()) } diff --git a/regex-syntax-2/Cargo.toml b/regex-syntax-2/Cargo.toml new file mode 100644 index 0000000000..44172f0524 --- /dev/null +++ b/regex-syntax-2/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "regex-syntax2" +version = "0.5.0" #:version +authors = ["The Rust Project Developers"] +license = "MIT/Apache-2.0" +repository = "https://github.com/rust-lang/regex" +documentation = "https://docs.rs/regex-syntax" +homepage = "https://github.com/rust-lang/regex" +description = "A regular expression parser." +workspace = ".." + +[dependencies] +ucd-util = { version = "*", path = "/home/andrew/rust/rucd/ucd-util" } diff --git a/regex-syntax-2/benches/bench.rs b/regex-syntax-2/benches/bench.rs new file mode 100644 index 0000000000..d1eeedd91d --- /dev/null +++ b/regex-syntax-2/benches/bench.rs @@ -0,0 +1,73 @@ +// Copyright 2018 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +#![feature(test)] + +extern crate regex_syntax2; +extern crate test; + +use regex_syntax2::Parser; +use test::Bencher; + +#[bench] +fn parse_simple1(b: &mut Bencher) { + b.iter(|| { + let re = r"^bc(d|e)*$"; + Parser::new().parse(re).unwrap() + }); +} + +#[bench] +fn parse_simple2(b: &mut Bencher) { + b.iter(|| { + let re = r"'[a-zA-Z_][a-zA-Z0-9_]*(')\b"; + Parser::new().parse(re).unwrap() + }); +} + +#[bench] +fn parse_small1(b: &mut Bencher) { + b.iter(|| { + let re = r"\p{L}|\p{N}|\s|.|\d"; + Parser::new().parse(re).unwrap() + }); +} + +#[bench] +fn parse_medium1(b: &mut Bencher) { + b.iter(|| { + let re = r"\pL\p{Greek}\p{Hiragana}\p{Alphabetic}\p{Hebrew}\p{Arabic}"; + Parser::new().parse(re).unwrap() + }); +} + +#[bench] +fn parse_medium2(b: &mut Bencher) { + b.iter(|| { + let re = r"\s\S\w\W\d\D"; + Parser::new().parse(re).unwrap() + }); +} + +#[bench] +fn parse_medium3(b: &mut Bencher) { + b.iter(|| { + let re = r"\p{age:3.2}\p{hira}\p{scx:hira}\p{alphabetic}\p{sc:Greek}\pL"; + Parser::new().parse(re).unwrap() + }); +} + +#[bench] +fn parse_huge(b: &mut Bencher) { + b.iter(|| { + let re = r"\p{L}{100}"; + Parser::new().parse(re).unwrap() + }); +} diff --git a/regex-syntax-2/src/ast/mod.rs b/regex-syntax-2/src/ast/mod.rs new file mode 100644 index 0000000000..ad63a1b491 --- /dev/null +++ b/regex-syntax-2/src/ast/mod.rs @@ -0,0 +1,1507 @@ +// Copyright 2017 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/*! +Defines an abstract syntax for regular expressions. 
+*/ + +use std::cmp::Ordering; +use std::error; +use std::fmt; + +pub use ast::visitor::{Visitor, visit}; + +pub mod parse; +pub mod print; +mod visitor; + +/// An error that occurred while parsing a regular expression into an abstract +/// syntax tree. +/// +/// Note that not all ASTs represent a valid regular expression. For example, +/// an AST is constructed without error for `\p{Quux}`, but `Quux` is not a +/// valid Unicode property name. That particular error is reported when +/// translating an AST to the high-level intermediate representation (`HIR`). +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Error { + /// The kind of error. + kind: ErrorKind, + /// The original pattern that the parser generated the error from. Every + /// span in an error is a valid range into this string. + pattern: String, + /// The span of this error. + span: Span, +} + +impl Error { + /// Return the type of this error. + pub fn kind(&self) -> &ErrorKind { + &self.kind + } + + /// The original pattern string in which this error occurred. + /// + /// Every span reported by this error is reported in terms of this string. + pub fn pattern(&self) -> &str { + &self.pattern + } + + /// Return the span at which this error occurred. + pub fn span(&self) -> &Span { + &self.span + } + + /// Return an auxiliary span. This span exists only for some errors that + /// benefit from being able to point to two locations in the original + /// regular expression. For example, "duplicate" errors will have the + /// main error position set to the duplicate occurrence while its + /// auxiliary span will be set to the initial occurrence. + pub fn auxiliary_span(&self) -> Option<&Span> { + use self::ErrorKind::*; + match self.kind { + FlagDuplicate { ref original } => Some(original), + FlagRepeatedNegation { ref original, .. } => Some(original), + GroupNameDuplicate { ref original, .. } => Some(original), + _ => None, + } + } +} + +/// The type of an error that occurred while building an AST. 
+#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ErrorKind { + /// The capturing group limit was exceeded. + /// + /// Note that this represents a limit on the total number of capturing + /// groups in a regex and not necessarily the number of nested capturing + /// groups. That is, the nest limit can be low and it is still possible for + /// this error to occur. + CaptureLimitExceeded, + /// An invalid escape sequence was found in a character class set. + ClassEscapeInvalid, + /// An invalid character class range was found. An invalid range is any + /// range where the start is greater than the end. + ClassRangeInvalid, + /// An opening `[` was found with no corresponding closing `]`. + ClassUnclosed, + /// An empty decimal number was given where one was expected. + DecimalEmpty, + /// An invalid decimal number was given where one was expected. + DecimalInvalid, + /// A bracketed hex literal was empty. + EscapeHexEmpty, + /// A bracketed hex literal did not correspond to a Unicode scalar value. + EscapeHexInvalid, + /// An invalid hexadecimal digit was found. + EscapeHexInvalidDigit, + /// EOF was found before an escape sequence was completed. + EscapeUnexpectedEof, + /// An unrecognized escape sequence. + EscapeUnrecognized, + /// A dangling negation was used when setting flags, e.g., `i-`. + FlagDanglingNegation, + /// A flag was used twice, e.g., `i-i`. + FlagDuplicate { + /// The position of the original flag. The error position + /// points to the duplicate flag. + original: Span, + }, + /// The negation operator was used twice, e.g., `-i-s`. + FlagRepeatedNegation { + /// The position of the original negation operator. The error position + /// points to the duplicate negation operator. + original: Span, + }, + /// Expected a flag but got EOF, e.g., `(?`. + FlagUnexpectedEof, + /// Unrecognized flag, e.g., `a`. + FlagUnrecognized, + /// A duplicate capture name was found. 
+ GroupNameDuplicate { + /// The position of the initial occurrence of the capture name. The + /// error position itself points to the duplicate occurrence. + original: Span, + }, + /// A capture group name is empty, e.g., `(?P<>abc)`. + GroupNameEmpty, + /// An invalid character was seen for a capture group name. This includes + /// errors where the first character is a digit (even though subsequent + /// characters are allowed to be digits). + GroupNameInvalid, + /// A closing `>` could not be found for a capture group name. + GroupNameUnexpectedEof, + /// An unclosed group, e.g., `(ab`. + /// + /// The span of this error corresponds to the unclosed parenthesis. + GroupUnclosed, + /// An unopened group, e.g., `ab)`. + GroupUnopened, + /// The nest limit was exceeded. The limit stored here is the limit + /// configured in the parser. + NestLimitExceeded(u32), + /// The range provided in a counted repetition operator is invalid. The + /// range is invalid if the start is greater than the end. + RepetitionCountInvalid, + /// An opening `{` was found with no corresponding closing `}`. + RepetitionCountUnclosed, + /// A repetition operator was applied to a missing sub-expression. This + /// occurs, for example, in the regex consisting of just a `*`. It is, + /// however, possible to create a repetition operating on an empty + /// sub-expression. For example, `()*` is still considered valid. + RepetitionMissing, + /// When octal support is disabled, this error is produced when an octal + /// escape is used. The octal escape is assumed to be an invocation of + /// a backreference, which is the common case. + UnsupportedBackreference, + /// When syntax similar to PCRE's look-around is used, this error is + /// returned. Some example syntaxes that are rejected include, but are + /// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and + /// `(? 
&str { + use self::ErrorKind::*; + match self.kind { + CaptureLimitExceeded => "capture group limit exceeded", + ClassEscapeInvalid => "invalid escape sequence in character class", + ClassRangeInvalid => "invalid character class range", + ClassUnclosed => "unclosed character class", + DecimalEmpty => "empty decimal literal", + DecimalInvalid => "invalid decimal literal", + EscapeHexEmpty => "empty hexadecimal literal", + EscapeHexInvalid => "invalid hexadecimal literal", + EscapeHexInvalidDigit => "invalid hexadecimal digit", + EscapeUnexpectedEof => "unexpected eof (escape sequence)", + EscapeUnrecognized => "unrecognized escape sequence", + FlagDanglingNegation => "dangling flag negation operator", + FlagDuplicate{..} => "duplicate flag", + FlagRepeatedNegation{..} => "repeated negation", + FlagUnexpectedEof => "unexpected eof (flag)", + FlagUnrecognized => "unrecognized flag", + GroupNameDuplicate{..} => "duplicate capture group name", + GroupNameEmpty => "empty capture group name", + GroupNameInvalid => "invalid capture group name", + GroupNameUnexpectedEof => "unclosed capture group name", + GroupUnclosed => "unclosed group", + GroupUnopened => "unopened group", + NestLimitExceeded(_) => "nest limit exceeded", + RepetitionCountInvalid => "invalid repetition count range", + RepetitionCountUnclosed => "unclosed counted repetition", + RepetitionMissing => "repetition operator missing expression", + UnsupportedBackreference => "backreferences are not supported", + UnsupportedLookAround => "look-around is not supported", + _ => unreachable!(), + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + ::error::Formatter::from(self).fmt(f) + } +} + +impl fmt::Display for ErrorKind { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use self::ErrorKind::*; + match *self { + CaptureLimitExceeded => { + write!(f, "exceeded the maximum number of \ + capturing groups ({})", ::std::u32::MAX) + } + ClassEscapeInvalid => 
{ + write!(f, "invalid escape sequence found in character class") + } + ClassRangeInvalid => { + write!(f, "invalid character class range, \ + the start must be <= the end") + } + ClassUnclosed => { + write!(f, "unclosed character class") + } + DecimalEmpty => { + write!(f, "decimal literal empty") + } + DecimalInvalid => { + write!(f, "decimal literal invalid") + } + EscapeHexEmpty => { + write!(f, "hexadecimal literal empty") + } + EscapeHexInvalid => { + write!(f, "hexadecimal literal is not a Unicode scalar value") + } + EscapeHexInvalidDigit => { + write!(f, "invalid hexadecimal digit") + } + EscapeUnexpectedEof => { + write!(f, "incomplete escape sequence, \ + reached end of pattern prematurely") + } + EscapeUnrecognized => { + write!(f, "unrecognized escape sequence") + } + FlagDanglingNegation => { + write!(f, "dangling flag negation operator") + } + FlagDuplicate{..} => { + write!(f, "duplicate flag") + } + FlagRepeatedNegation{..} => { + write!(f, "flag negation operator repeated") + } + FlagUnexpectedEof => { + write!(f, "expected flag but got end of regex") + } + FlagUnrecognized => { + write!(f, "unrecognized flag") + } + GroupNameDuplicate{..} => { + write!(f, "duplicate capture group name") + } + GroupNameEmpty => { + write!(f, "empty capture group name") + } + GroupNameInvalid => { + write!(f, "invalid capture group character") + } + GroupNameUnexpectedEof => { + write!(f, "unclosed capture group name") + } + GroupUnclosed => { + write!(f, "unclosed group") + } + GroupUnopened => { + write!(f, "unopened group") + } + NestLimitExceeded(limit) => { + write!(f, "exceed the maximum number of \ + nested parentheses/brackets ({})", limit) + } + RepetitionCountInvalid => { + write!(f, "invalid repetition count range, \ + the start must be <= the end") + } + RepetitionCountUnclosed => { + write!(f, "unclosed counted repetition") + } + RepetitionMissing => { + write!(f, "repetition operator missing expression") + } + UnsupportedBackreference => { + write!(f, 
"backreferences are not supported") + } + UnsupportedLookAround => { + write!(f, "look-around, including look-ahead and look-behind, \ + is not supported") + } + _ => unreachable!(), + } + } +} + +/// Span represents the position information of a single AST item. +/// +/// All span positions are absolute byte offsets that can be used on the +/// original regular expression that was parsed. +#[derive(Clone, Copy, Eq, PartialEq)] +pub struct Span { + /// The start byte offset. + pub start: Position, + /// The end byte offset. + pub end: Position, +} + +impl fmt::Debug for Span { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Span({:?}, {:?})", self.start, self.end) + } +} + +impl Ord for Span { + fn cmp(&self, other: &Span) -> Ordering { + (&self.start, &self.end).cmp(&(&other.start, &other.end)) + } +} + +impl PartialOrd for Span { + fn partial_cmp(&self, other: &Span) -> Option { + Some(self.cmp(other)) + } +} + +/// A single position in a regular expression. +/// +/// A position encodes one half of a span, and include the byte offset, line +/// number and column number. +#[derive(Clone, Copy, Eq, PartialEq)] +pub struct Position { + /// The absolute offset of this position, starting at `0` from the + /// beginning of the regular expression pattern string. + pub offset: usize, + /// The line number, starting at `1`. + pub line: usize, + /// The approximate column number, starting at `1`. + pub column: usize, +} + +impl fmt::Debug for Position { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "Position(o: {:?}, l: {:?}, c: {:?})", + self.offset, self.line, self.column) + } +} + +impl Ord for Position { + fn cmp(&self, other: &Position) -> Ordering { + self.offset.cmp(&other.offset) + } +} + +impl PartialOrd for Position { + fn partial_cmp(&self, other: &Position) -> Option { + Some(self.cmp(other)) + } +} + +impl Span { + /// Create a new span with the given positions. 
+ pub fn new(start: Position, end: Position) -> Span { + Span { start: start, end: end } + } + + /// Create a new span using the given position as the start and end. + pub fn splat(pos: Position) -> Span { + Span::new(pos, pos) + } + + /// Create a new span by replacing the starting position with the one + /// given. + pub fn with_start(self, pos: Position) -> Span { + Span { start: pos, ..self } + } + + /// Create a new span by replacing the ending position with the one + /// given. + pub fn with_end(self, pos: Position) -> Span { + Span { end: pos, ..self } + } + + /// Returns true if and only if this span occurs on a single line. + pub fn is_one_line(&self) -> bool { + self.start.line == self.end.line + } + + /// Returns true if and only if this span is empty. That is, it points to + /// a single position in the concrete syntax of a regular expression. + pub fn is_empty(&self) -> bool { + self.start.offset == self.end.offset + } +} + +impl Position { + /// Create a new position with the given information. + /// + /// `offset` is the absolute offset of the position, starting at `0` from + /// the beginning of the regular expression pattern string. + /// + /// `line` is the line number, starting at `1`. + /// + /// `column` is the approximate column number, starting at `1`. + pub fn new(offset: usize, line: usize, column: usize) -> Position { + Position { offset: offset, line: line, column: column } + } +} + +/// An abstract syntax tree for a singular expression along with comments +/// found. +/// +/// Comments are not stored in the tree itself to avoid complexity. Each +/// comment contains a span of precisely where it occurred in the original +/// regular expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct WithComments { + /// The actual ast. + pub ast: Ast, + /// All comments found in the original regular expression. + pub comments: Vec, +} + +/// A comment from a regular expression with an associated span. 
+/// +/// A regular expression can only contain comments when the `x` flag is +/// enabled. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Comment { + /// The span of this comment, including the beginning `#` and ending `\n`. + pub span: Span, + /// The comment text, starting with the first character following the `#` + /// and ending with the last character preceding the `\n`. + pub comment: String, +} + +/// An abstract syntax tree for a single regular expression. +/// +/// An `Ast`'s `fmt::Display` implementation uses constant stack space and heap +/// space proportional to the size of the `Ast`. +/// +/// This type defines its own destructor that uses constant stack space and +/// heap space proportional to the size of the `Ast`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Ast { + /// An empty regex that matches everything. + Empty(Span), + /// A set of flags, e.g., `(?is)`. + Flags(SetFlags), + /// A single character literal, which includes escape sequences. + Literal(Literal), + /// The "any character" class. + Dot(Span), + /// A single zero-width assertion. + Assertion(Assertion), + /// A single character class. This includes all forms of character classes + /// except for `.`. e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`. + Class(Class), + /// A repetition operator applied to an arbitrary regular expression. + Repetition(Repetition), + /// A grouped regular expression. + Group(Group), + /// An alternation of regular expressions. + Alternation(Alternation), + /// A concatenation of regular expressions. + Concat(Concat), +} + +impl Ast { + /// Return the span of this abstract syntax tree. 
+ pub fn span(&self) -> &Span { + match *self { + Ast::Empty(ref span) => span, + Ast::Flags(ref x) => &x.span, + Ast::Literal(ref x) => &x.span, + Ast::Dot(ref span) => span, + Ast::Assertion(ref x) => &x.span, + Ast::Class(ref x) => x.span(), + Ast::Repetition(ref x) => &x.span, + Ast::Group(ref x) => &x.span, + Ast::Alternation(ref x) => &x.span, + Ast::Concat(ref x) => &x.span, + } + } + + /// Return true if and only if this Ast is empty. + pub fn is_empty(&self) -> bool { + match *self { + Ast::Empty(_) => true, + _ => false, + } + } + + /// Returns true if and only if this AST has any (including possibly empty) + /// subexpressions. + fn has_subexprs(&self) -> bool { + match *self { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) => false, + Ast::Class(_) + | Ast::Repetition(_) + | Ast::Group(_) + | Ast::Alternation(_) + | Ast::Concat(_) => true, + } + } +} + +/// Print a display representation of this Ast. +/// +/// This does not preserve any of the original whitespace formatting that may +/// have originally been present in the concrete syntax from which this Ast +/// was generated. +/// +/// This implementation uses constant stack space and heap space proportional +/// to the size of the `Ast`. +impl fmt::Display for Ast { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use ast::print::Printer; + Printer::new().print(self, f) + } +} + +/// An alternation of regular expressions. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Alternation { + /// The span of this alternation. + pub span: Span, + /// The alternate regular expressions. + pub asts: Vec, +} + +impl Alternation { + /// Return this alternation as an AST. + /// + /// If this alternation contains zero ASTs, then Ast::Empty is + /// returned. If this alternation contains exactly 1 AST, then the + /// corresponding AST is returned. Otherwise, Ast::Alternation is returned. 
+ pub fn into_ast(mut self) -> Ast { + match self.asts.len() { + 0 => Ast::Empty(self.span), + 1 => self.asts.pop().unwrap(), + _ => Ast::Alternation(self), + } + } +} + +/// A concatenation of regular expressions. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Concat { + /// The span of this concatenation. + pub span: Span, + /// The concatenation regular expressions. + pub asts: Vec, +} + +impl Concat { + /// Return this concatenation as an AST. + /// + /// If this concatenation contains zero ASTs, then Ast::Empty is + /// returned. If this concatenation contains exactly 1 AST, then the + /// corresponding AST is returned. Otherwise, Ast::Concat is returned. + pub fn into_ast(mut self) -> Ast { + match self.asts.len() { + 0 => Ast::Empty(self.span), + 1 => self.asts.pop().unwrap(), + _ => Ast::Concat(self), + } + } +} + +/// A single literal expression. +/// +/// A literal corresponds to a single Unicode scalar value. Literals may be +/// represented in their literal form, e.g., `a` or in their escaped form, +/// e.g., `\x61`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Literal { + /// The span of this literal. + pub span: Span, + /// The kind of this literal. + pub kind: LiteralKind, + /// The Unicode scalar value corresponding to this literal. + pub c: char, +} + +impl Literal { + /// If this literal was written as a `\x` hex escape, then this returns + /// the corresponding byte value. Otherwise, this returns `None`. + pub fn byte(&self) -> Option { + let short_hex = LiteralKind::HexFixed(HexLiteralKind::X); + if self.c as u32 <= 255 && self.kind == short_hex { + Some(self.c as u8) + } else { + None + } + } +} + +/// The kind of a single literal expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum LiteralKind { + /// The literal is written verbatim, e.g., `a` or `☃`. + Verbatim, + /// The literal is written as an escape because it is punctuation, e.g., + /// `\*` or `\[`. 
+ Punctuation, + /// The literal is written as an octal escape, e.g., `\141`. + Octal, + /// The literal is written as a hex code with a fixed number of digits + /// depending on the type of the escape, e.g., `\x61` or `\u0061` or + /// `\U00000061`. + HexFixed(HexLiteralKind), + /// The literal is written as a hex code with a bracketed number of + /// digits. The only restriction is that the bracketed hex code must refer + /// to a valid Unicode scalar value. + HexBrace(HexLiteralKind), + /// The literal is written as a specially recognized escape, e.g., `\f` + /// or `\n`. + Special(SpecialLiteralKind), +} + +/// The type of a special literal. +/// +/// A special literal is a special escape sequence recognized by the regex +/// parser, e.g., `\f` or `\n`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum SpecialLiteralKind { + /// Bell, spelled `\a` (`\x07`). + Bell, + /// Form feed, spelled `\f` (`\x0C`). + FormFeed, + /// Tab, spelled `\t` (`\x09`). + Tab, + /// Line feed, spelled `\n` (`\x0A`). + LineFeed, + /// Carriage return, spelled `\r` (`\x0D`). + CarriageReturn, + /// Vertical tab, spelled `\v` (`\x0B`). + VerticalTab, + /// Space, spelled `\ ` (`\x20`). Note that this can only appear when + /// parsing in verbose mode. + Space, +} + +/// The type of a Unicode hex literal. +/// +/// Note that all variants behave the same when used with brackets. They only +/// differ when used without brackets in the number of hex digits that must +/// follow. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum HexLiteralKind { + /// A `\x` prefix. When used without brackets, this form is limited to + /// two digits. + X, + /// A `\u` prefix. When used without brackets, this form is limited to + /// four digits. + UnicodeShort, + /// A `\U` prefix. When used without brackets, this form is limited to + /// eight digits. + UnicodeLong, +} + +impl HexLiteralKind { + /// The number of digits that must be used with this literal form when + /// used without brackets. 
When used with brackets, there is no + /// restriction on the number of digits. + pub fn digits(&self) -> u32 { + match *self { + HexLiteralKind::X => 2, + HexLiteralKind::UnicodeShort => 4, + HexLiteralKind::UnicodeLong => 8, + } + } +} + +/// A single character class expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Class { + /// A Unicode character class, e.g., `\pL` or `\p{Greek}`. + Unicode(ClassUnicode), + /// A perl character class, e.g., `\d` or `\W`. + Perl(ClassPerl), + /// A bracketed character class set, which may contain zero or more + /// character ranges and/or zero or more nested classes. e.g., + /// `[a-zA-Z\pL]`. + Bracketed(ClassBracketed), +} + +impl Class { + /// Return the span of this character class. + pub fn span(&self) -> &Span { + match *self { + Class::Perl(ref x) => &x.span, + Class::Unicode(ref x) => &x.span, + Class::Bracketed(ref x) => &x.span, + } + } +} + +/// A Perl character class. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassPerl { + /// The span of this class. + pub span: Span, + /// The kind of Perl class. + pub kind: ClassPerlKind, + /// Whether the class is negated or not. e.g., `\d` is not negated but + /// `\D` is. + pub negated: bool, +} + +/// The available Perl character classes. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassPerlKind { + /// Decimal numbers. + Digit, + /// Whitespace. + Space, + /// Word characters. + Word, +} + +/// An ASCII character class. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassAscii { + /// The span of this class. + pub span: Span, + /// The kind of ASCII class. + pub kind: ClassAsciiKind, + /// Whether the class is negated or not. e.g., `[[:alpha:]]` is not negated + /// but `[[:^alpha:]]` is. + pub negated: bool, +} + +/// The available ASCII character classes. 
+#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassAsciiKind { + /// `[0-9A-Za-z]` + Alnum, + /// `[A-Za-z]` + Alpha, + /// `[\x00-\x7F]` + Ascii, + /// `[ \t]` + Blank, + /// `[\x00-\x1F\x7F]` + Cntrl, + /// `[0-9]` + Digit, + /// `[!-~]` + Graph, + /// `[a-z]` + Lower, + /// `[ -~]` + Print, + /// `[!-/:-@\[-`{-~]` + Punct, + /// `[\t\n\v\f\r ]` + Space, + /// `[A-Z]` + Upper, + /// `[0-9A-Za-z_]` + Word, + /// `[0-9A-Fa-f]` + Xdigit, +} + +impl ClassAsciiKind { + /// Return the corresponding ClassAsciiKind variant for the given name. + /// + /// The name given should correspond to the lowercase version of the + /// variant name. e.g., `cntrl` is the name for `ClassAsciiKind::Cntrl`. + /// + /// If no variant with the corresponding name exists, then `None` is + /// returned. + pub fn from_name(name: &str) -> Option { + use self::ClassAsciiKind::*; + match name { + "alnum" => Some(Alnum), + "alpha" => Some(Alpha), + "ascii" => Some(Ascii), + "blank" => Some(Blank), + "cntrl" => Some(Cntrl), + "digit" => Some(Digit), + "graph" => Some(Graph), + "lower" => Some(Lower), + "print" => Some(Print), + "punct" => Some(Punct), + "space" => Some(Space), + "upper" => Some(Upper), + "word" => Some(Word), + "xdigit" => Some(Xdigit), + _ => None, + } + } +} + +/// A Unicode character class. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassUnicode { + /// The span of this class. + pub span: Span, + /// Whether this class is negated or not. + /// + /// Note: be careful when using this attribute. This specifically refers + /// to whether the class is written as `\p` or `\P`, where the latter + /// is `negated = true`. However, it is also possible to write something like + /// `\P{scx!=Katakana}` which is actually equivalent to + /// `\p{scx=Katakana}` and is therefore not actually negated even though + /// `negated = true` here. To test whether this class is truly negated + /// or not, use the `is_negated` method. + pub negated: bool, + /// The kind of Unicode class. 
+ pub kind: ClassUnicodeKind, +} + +impl ClassUnicode { + /// Returns true if this class has been negated. + /// + /// Note that this takes the Unicode op into account, if it's present. + /// e.g., `is_negated` for `\P{scx!=Katakana}` will return `false`. + pub fn is_negated(&self) -> bool { + match self.kind { + ClassUnicodeKind::NamedValue { + op: ClassUnicodeOpKind::NotEqual, .. + } => !self.negated, + _ => self.negated, + } + } +} + +/// The available forms of Unicode character classes. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassUnicodeKind { + /// A one letter abbreviated class, e.g., `\pN`. + OneLetter(char), + /// A binary property, general category or script. The string may be + /// empty. + Named(String), + /// A property name and an associated value. + NamedValue { + /// The type of Unicode op used to associate `name` with `value`. + op: ClassUnicodeOpKind, + /// The property name (which may be empty). + name: String, + /// The property value (which may be empty). + value: String, + }, +} + +/// The type of op used in a Unicode character class. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassUnicodeOpKind { + /// A property set to a specific value, e.g., `\p{scx=Katakana}`. + Equal, + /// A property set to a specific value using a colon, e.g., + /// `\p{scx:Katakana}`. + Colon, + /// A property that isn't a particular value, e.g., `\p{scx!=Katakana}`. + NotEqual, +} + +impl ClassUnicodeOpKind { + /// Whether the op is an equality op or not. + pub fn is_equal(&self) -> bool { + match *self { + ClassUnicodeOpKind::Equal|ClassUnicodeOpKind::Colon => true, + _ => false, + } + } +} + +/// A bracketed character class, e.g., `[a-z0-9]`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassBracketed { + /// The span of this class. + pub span: Span, + /// Whether this class is negated or not. e.g., `[a]` is not negated but + /// `[^a]` is. + pub negated: bool, + /// The type of this set. 
A set is either a normal union of things, e.g., + /// `[abc]` or a result of applying set operations, e.g., `[\pL--c]`. + pub kind: ClassSet, +} + +/// A character class set. +/// +/// This type corresponds to the internal structure of a bracketed character +/// class. That is, every bracketed character is one of two types: a union of +/// items (literals, ranges, other bracketed classes) or a tree of binary set +/// operations. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassSet { + /// An item, which can be a single literal, range, nested character class + /// or a union of items. + Item(ClassSetItem), + /// A single binary operation (i.e., &&, -- or ~~). + BinaryOp(ClassSetBinaryOp), +} + +impl ClassSet { + /// Build a set from a union. + pub fn union(ast: ClassSetUnion) -> ClassSet { + ClassSet::Item(ClassSetItem::Union(ast)) + } + + /// Return the span of this character class set. + pub fn span(&self) -> &Span { + match *self { + ClassSet::Item(ref x) => x.span(), + ClassSet::BinaryOp(ref x) => &x.span, + } + } + + /// Return true if and only if this class set is empty. + fn is_empty(&self) -> bool { + match *self { + ClassSet::Item(ClassSetItem::Empty(_)) => true, + _ => false, + } + } +} + +/// A single component of a character class set. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassSetItem { + /// An empty item. + /// + /// Note that a bracketed character class cannot contain a single empty + /// item. Empty items can appear when using one of the binary operators. + /// For example, `[&&]` is the intersection of two empty classes. + Empty(Span), + /// A single literal. + Literal(Literal), + /// A range between two literals. + Range(ClassSetRange), + /// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`. + Ascii(ClassAscii), + /// A Unicode character class, e.g., `\pL` or `\p{Greek}`. + Unicode(ClassUnicode), + /// A perl character class, e.g., `\d` or `\W`. 
+ Perl(ClassPerl), + /// A bracketed character class set, which may contain zero or more + /// character ranges and/or zero or more nested classes. e.g., + /// `[a-zA-Z\pL]`. + Bracketed(Box), + /// A union of items. + Union(ClassSetUnion), +} + +impl ClassSetItem { + /// Return the span of this character class set item. + pub fn span(&self) -> &Span { + match *self { + ClassSetItem::Empty(ref span) => span, + ClassSetItem::Literal(ref x) => &x.span, + ClassSetItem::Range(ref x) => &x.span, + ClassSetItem::Ascii(ref x) => &x.span, + ClassSetItem::Perl(ref x) => &x.span, + ClassSetItem::Unicode(ref x) => &x.span, + ClassSetItem::Bracketed(ref x) => &x.span, + ClassSetItem::Union(ref x) => &x.span, + } + } +} + +/// A single character class range in a set. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassSetRange { + /// The span of this range. + pub span: Span, + /// The start of this range. + pub start: Literal, + /// The end of this range. + pub end: Literal, +} + +impl ClassSetRange { + /// Returns true if and only if this character class range is valid. + /// + /// The only case where a range is invalid is if its start is greater than + /// its end. + pub fn is_valid(&self) -> bool { + self.start.c <= self.end.c + } +} + +/// A union of items inside a character class set. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassSetUnion { + /// The span of the items in this operation. e.g., the `a-z0-9` in + /// `[^a-z0-9]` + pub span: Span, + /// The sequence of items that make up this union. + pub items: Vec, +} + +impl ClassSetUnion { + /// Push a new item in this union. + /// + /// The ending position of this union's span is updated to the ending + /// position of the span of the item given. If the union is empty, then + /// the starting position of this union is set to the starting position + /// of this item. 
+ /// + /// In other words, if you only use this method to add items to a union + /// and you set the spans on each item correctly, then you should never + /// need to adjust the span of the union directly. + pub fn push(&mut self, item: ClassSetItem) { + if self.items.is_empty() { + self.span.start = item.span().start; + } + self.span.end = item.span().end; + self.items.push(item); + } + + /// Return this union as a character class set item. + /// + /// If this union contains zero items, then an empty item + /// (`ClassSetItem::Empty`, carrying this union's span) is returned. If + /// this union contains exactly 1 item, then that item is returned. + /// Otherwise, `ClassSetItem::Union` is returned. + pub fn into_item(mut self) -> ClassSetItem { + match self.items.len() { + 0 => ClassSetItem::Empty(self.span), + 1 => self.items.pop().unwrap(), + _ => ClassSetItem::Union(self), + } + } +} + +/// A Unicode character class set operation. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassSetBinaryOp { + /// The span of this operation. e.g., the `a-z--h-p` in `[a-z--h-p]`. + pub span: Span, + /// The type of this set operation. + pub kind: ClassSetBinaryOpKind, + /// The left hand side of the operation. + pub lhs: Box, + /// The right hand side of the operation. + pub rhs: Box, +} + +/// The type of a Unicode character class set operation. +/// +/// Note that this doesn't explicitly represent union since there is no +/// explicit union operator. Concatenation inside a character class corresponds +/// to the union operation. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum ClassSetBinaryOpKind { + /// The intersection of two sets, e.g., `\pN&&[a-z]`. + Intersection, + /// The difference of two sets, e.g., `\pN--[0-9]`. + Difference, + /// The symmetric difference of two sets. The symmetric difference is the + /// set of elements belonging to one but not both sets. + /// e.g., `[\pL~~[:ascii:]]`. + SymmetricDifference, +} + +/// A single zero-width assertion.
+#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Assertion { + /// The span of this assertion. + pub span: Span, + /// The assertion kind, e.g., `\b` or `^`. + pub kind: AssertionKind, +} + +/// An assertion kind. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum AssertionKind { + /// `^` + StartLine, + /// `$` + EndLine, + /// `\A` + StartText, + /// `\z` + EndText, + /// `\b` + WordBoundary, + /// `\B` + NotWordBoundary, +} + +/// A repetition operation applied to a regular expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Repetition { + /// The span of this operation. + pub span: Span, + /// The actual operation. + pub op: RepetitionOp, + /// Whether this operation was applied greedily or not. + pub greedy: bool, + /// The regular expression under repetition. + pub ast: Box, +} + +/// The repetition operator itself. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct RepetitionOp { + /// The span of this operator. This includes things like `+`, `*?` and + /// `{m,n}`. + pub span: Span, + /// The type of operation. + pub kind: RepetitionKind, +} + +/// The kind of a repetition operator. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum RepetitionKind { + /// `?` + ZeroOrOne, + /// `*` + ZeroOrMore, + /// `+` + OneOrMore, + /// `{m,n}` + Range(RepetitionRange), +} + +/// A range repetition operator. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum RepetitionRange { + /// `{m}` + Exactly(u32), + /// `{m,}` + AtLeast(u32), + /// `{m,n}` + Bounded(u32, u32), +} + +impl RepetitionRange { + /// Returns true if and only if this repetition range is valid. + /// + /// The only case where a repetition range is invalid is if it is bounded + /// and its start is greater than its end. + pub fn is_valid(&self) -> bool { + match *self { + RepetitionRange::Bounded(s, e) if s > e => false, + _ => true, + } + } +} + +/// A grouped regular expression. +/// +/// This includes both capturing and non-capturing groups. 
This does **not** +/// include flag-only groups like `(?is)`, but does contain any group that +/// contains a sub-expression, e.g., `(a)`, `(?Pa)`, `(?:a)` and +/// `(?is:a)`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Group { + /// The span of this group. + pub span: Span, + /// The kind of this group. + pub kind: GroupKind, + /// The regular expression in this group. + pub ast: Box, +} + +impl Group { + /// If this group is non-capturing, then this returns the (possibly empty) + /// set of flags. Otherwise, `None` is returned. + pub fn flags(&self) -> Option<&Flags> { + match self.kind { + GroupKind::NonCapturing(ref flags) => Some(flags), + _ => None, + } + } + + /// Returns true if and only if this group is capturing. + pub fn is_capturing(&self) -> bool { + match self.kind { + GroupKind::CaptureIndex(_) | GroupKind::CaptureName(_) => true, + GroupKind::NonCapturing(_) => false, + } + } + + /// Returns the capture index of this group, if this is a capturing group. + /// + /// This returns a capture index precisely when `is_capturing` is `true`. + pub fn capture_index(&self) -> Option { + match self.kind { + GroupKind::CaptureIndex(i) => Some(i), + GroupKind::CaptureName(ref x) => Some(x.index), + GroupKind::NonCapturing(_) => None, + } + } +} + +/// The kind of a group. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum GroupKind { + /// `(a)` + CaptureIndex(u32), + /// `(?Pa)` + CaptureName(CaptureName), + /// `(?:a)` and `(?i:a)` + NonCapturing(Flags), +} + +/// A capture name. +/// +/// This corresponds to the name itself between the angle brackets in, e.g., +/// `(?Pexpr)`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct CaptureName { + /// The span of this capture name. + pub span: Span, + /// The capture name. + pub name: String, + /// The capture index. + pub index: u32, +} + +/// A group of flags that is not applied to a particular regular expression. 
+#[derive(Clone, Debug, Eq, PartialEq)] +pub struct SetFlags { + /// The span of these flags, including the grouping parentheses. + pub span: Span, + /// The actual sequence of flags. + pub flags: Flags, +} + +/// A group of flags. +/// +/// This corresponds only to the sequence of flags themselves, e.g., `is-u`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Flags { + /// The span of this group of flags. + pub span: Span, + /// A sequence of flag items. Each item is either a flag or a negation + /// operator. + pub items: Vec, +} + +impl Flags { + /// Add the given item to this sequence of flags. + /// + /// If the item was added successfully, then `None` is returned. If the + /// given item is a duplicate, then `Some(i)` is returned, where + /// `items[i].kind == item.kind`. + pub fn add_item(&mut self, item: FlagsItem) -> Option { + for (i, x) in self.items.iter().enumerate() { + if x.kind == item.kind { + return Some(i); + } + } + self.items.push(item); + None + } + + /// Returns the state of the given flag in this set. + /// + /// If the given flag is in the set but is negated, then `Some(false)` is + /// returned. + /// + /// If the given flag is in the set and is not negated, then `Some(true)` + /// is returned. + /// + /// Otherwise, `None` is returned. + pub fn flag_state(&self, flag: Flag) -> Option { + let mut negated = false; + for x in &self.items { + match x.kind { + FlagsItemKind::Negation => { + negated = true; + } + FlagsItemKind::Flag(ref xflag) if xflag == &flag => { + return Some(!negated); + } + _ => {} + } + } + None + } +} + +/// A single item in a group of flags. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct FlagsItem { + /// The span of this item. + pub span: Span, + /// The kind of this item. + pub kind: FlagsItemKind, +} + +/// The kind of an item in a group of flags. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum FlagsItemKind { + /// A negation operator applied to all subsequent flags in the enclosing + /// group. 
+ Negation, + /// A single flag in a group. + Flag(Flag), +} + +impl FlagsItemKind { + /// Returns true if and only if this item is a negation operator. + pub fn is_negation(&self) -> bool { + match *self { + FlagsItemKind::Negation => true, + _ => false, + } + } +} + +/// A single flag. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Flag { + /// `i` + CaseInsensitive, + /// `m` + MultiLine, + /// `s` + DotMatchesNewLine, + /// `U` + SwapGreed, + /// `u` + Unicode, + /// `x` + IgnoreWhitespace, +} + +/// A custom `Drop` impl is used for `Ast` such that it uses constant stack +/// space but heap space proportional to the depth of the `Ast`. +impl Drop for Ast { + fn drop(&mut self) { + use std::mem; + + match *self { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + // Classes are recursive, so they get their own Drop impl. + | Ast::Class(_) => return, + Ast::Repetition(ref x) if !x.ast.has_subexprs() => return, + Ast::Group(ref x) if !x.ast.has_subexprs() => return, + Ast::Alternation(ref x) if x.asts.is_empty() => return, + Ast::Concat(ref x) if x.asts.is_empty() => return, + _ => {} + } + + let empty_span = || Span::splat(Position::new(0, 0, 0)); + let empty_ast = || Ast::Empty(empty_span()); + let mut stack = vec![mem::replace(self, empty_ast())]; + while let Some(mut ast) = stack.pop() { + match ast { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + // Classes are recursive, so they get their own Drop impl. 
+ | Ast::Class(_) => {} + Ast::Repetition(ref mut x) => { + stack.push(mem::replace(&mut x.ast, empty_ast())); + } + Ast::Group(ref mut x) => { + stack.push(mem::replace(&mut x.ast, empty_ast())); + } + Ast::Alternation(ref mut x) => { + stack.extend(x.asts.drain(..)); + } + Ast::Concat(ref mut x) => { + stack.extend(x.asts.drain(..)); + } + } + } + } +} + +/// A custom `Drop` impl is used for `ClassSet` such that it uses constant +/// stack space but heap space proportional to the depth of the `ClassSet`. +impl Drop for ClassSet { + fn drop(&mut self) { + use std::mem; + + match *self { + ClassSet::Item(ref item) => { + match *item { + ClassSetItem::Empty(_) + | ClassSetItem::Literal(_) + | ClassSetItem::Range(_) + | ClassSetItem::Ascii(_) + | ClassSetItem::Unicode(_) + | ClassSetItem::Perl(_) => return, + ClassSetItem::Bracketed(ref x) => { + if x.kind.is_empty() { + return; + } + } + ClassSetItem::Union(ref x) => { + if x.items.is_empty() { + return; + } + } + } + } + ClassSet::BinaryOp(ref op) => { + if op.lhs.is_empty() && op.rhs.is_empty() { + return; + } + } + } + + let empty_span = || Span::splat(Position::new(0, 0, 0)); + let empty_set = || ClassSet::Item(ClassSetItem::Empty(empty_span())); + let mut stack = vec![mem::replace(self, empty_set())]; + while let Some(mut set) = stack.pop() { + match set { + ClassSet::Item(ref mut item) => { + match *item { + ClassSetItem::Empty(_) + | ClassSetItem::Literal(_) + | ClassSetItem::Range(_) + | ClassSetItem::Ascii(_) + | ClassSetItem::Unicode(_) + | ClassSetItem::Perl(_) => {} + ClassSetItem::Bracketed(ref mut x) => { + stack.push(mem::replace(&mut x.kind, empty_set())); + } + ClassSetItem::Union(ref mut x) => { + stack.extend( + x.items.drain(..).map(ClassSet::Item)); + } + } + } + ClassSet::BinaryOp(ref mut op) => { + stack.push(mem::replace(&mut op.lhs, empty_set())); + stack.push(mem::replace(&mut op.rhs, empty_set())); + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // We use a thread with 
an explicit stack size to test that our destructor + // for Ast can handle arbitrarily sized expressions in constant stack + // space. In case we run on a platform without threads (WASM?), we limit + // this test to Windows/Unix. + #[test] + #[cfg(any(unix, windows))] + fn no_stack_overflow_on_drop() { + use std::thread; + + let run = || { + let span = || Span::splat(Position::new(0, 0, 0)); + let mut ast = Ast::Empty(span()); + for i in 0..200 { + ast = Ast::Group(Group { + span: span(), + kind: GroupKind::CaptureIndex(i), + ast: Box::new(ast), + }); + } + assert!(!ast.is_empty()); + }; + + // We run our test on a thread with a small stack size so we can + // force the issue more easily. + thread::Builder::new() + .stack_size(1<<10) + .spawn(run) + .unwrap() + .join() + .unwrap(); + } +} diff --git a/regex-syntax-2/src/ast/parse.rs b/regex-syntax-2/src/ast/parse.rs new file mode 100644 index 0000000000..b1a44fc822 --- /dev/null +++ b/regex-syntax-2/src/ast/parse.rs @@ -0,0 +1,5257 @@ +// Copyright 2018 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/*! +This module provides a regular expression parser. +*/ + +use std::borrow::Borrow; +use std::cell::{Cell, RefCell}; +use std::mem; +use std::result; + +use ast::{self, Ast, Position, Span}; +use either::Either; + +use is_meta_character; + +type Result = result::Result; + +/// A primitive is an expression with no sub-expressions. This includes +/// literals, assertions and non-set character classes. This representation +/// is used as intermediate state in the parser. +/// +/// This does not include ASCII character classes, since they can only appear +/// within a set character class. 
+#[derive(Clone, Debug, Eq, PartialEq)] +enum Primitive { + Literal(ast::Literal), + Assertion(ast::Assertion), + Dot(Span), + Perl(ast::ClassPerl), + Unicode(ast::ClassUnicode), +} + +impl Primitive { + /// Return the span of this primitive. + fn span(&self) -> &Span { + match *self { + Primitive::Literal(ref x) => &x.span, + Primitive::Assertion(ref x) => &x.span, + Primitive::Dot(ref span) => span, + Primitive::Perl(ref x) => &x.span, + Primitive::Unicode(ref x) => &x.span, + } + } + + /// Convert this primitive into a proper AST. + fn into_ast(self) -> Ast { + match self { + Primitive::Literal(lit) => Ast::Literal(lit), + Primitive::Assertion(assert) => Ast::Assertion(assert), + Primitive::Dot(span) => Ast::Dot(span), + Primitive::Perl(cls) => Ast::Class(ast::Class::Perl(cls)), + Primitive::Unicode(cls) => Ast::Class(ast::Class::Unicode(cls)), + } + } + + /// Convert this primitive into an item in a character class. + /// + /// If this primitive is not a legal item (i.e., an assertion or a dot), + /// then return an error. + fn into_class_set_item>( + self, + p: &ParserI

, + ) -> Result { + use ast::ClassSetItem; + use self::Primitive::*; + + match self { + Literal(lit) => Ok(ClassSetItem::Literal(lit)), + Perl(cls) => Ok(ClassSetItem::Perl(cls)), + Unicode(cls) => Ok(ClassSetItem::Unicode(cls)), + x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)), + } + } + + /// Convert this primitive into a literal in a character class. In + /// particular, literals are the only valid items that can appear in + /// ranges. + /// + /// If this primitive is not a legal item (i.e., a class, assertion or a + /// dot), then return an error. + fn into_class_literal>( + self, + p: &ParserI

, + ) -> Result { + use self::Primitive::*; + + match self { + Literal(lit) => Ok(lit), + x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)), + } + } +} + +/// Returns true if the given character is a hexadecimal digit. +fn is_hex(c: char) -> bool { + ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') +} + +/// Returns true if the given character is valid in a capture group name. +/// +/// If `first` is true, then `c` is treated as the first character in the +/// group name (which is not allowed to be a digit). +fn is_capture_char(c: char, first: bool) -> bool { + c == '_' || (!first && c >= '0' && c <= '9') + || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') +} + +/// A builder for a regular expression parser. +/// +/// This builder permits modifying configuration options for the parser. +#[derive(Clone, Debug)] +pub struct ParserBuilder { + ignore_whitespace: bool, + nest_limit: u32, + octal: bool, +} + +impl Default for ParserBuilder { + fn default() -> ParserBuilder { + ParserBuilder::new() + } +} + +impl ParserBuilder { + /// Create a new parser builder with a default configuration. + pub fn new() -> ParserBuilder { + ParserBuilder { + ignore_whitespace: false, + nest_limit: 100, + octal: false, + } + } + + /// Build a parser from this configuration. + /// + /// The returned parser is not tied to any pattern; the pattern is + /// supplied later when one of the parser's parse methods is called. + pub fn build(&self) -> Parser { + Parser { + pos: Cell::new(Position { offset: 0, line: 1, column: 1 }), + capture_index: Cell::new(0), + nest_limit: self.nest_limit, + octal: self.octal, + initial_ignore_whitespace: self.ignore_whitespace, + ignore_whitespace: Cell::new(self.ignore_whitespace), + comments: RefCell::new(vec![]), + stack_group: RefCell::new(vec![]), + stack_class: RefCell::new(vec![]), + capture_names: RefCell::new(vec![]), + scratch: RefCell::new(String::new()), + } + } + + /// Set the nesting limit for this parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be.
If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow for consumers that do structural induction on an `Ast` using + /// explicit recursion. While this crate never does this (instead using + /// constant stack space and moving the call stack to the heap), other + /// crates may. + /// + /// This limit is not checked until the entire Ast is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since this parser + /// implementation will limit itself to heap space proportional to the + /// length of the pattern string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation, which results in a nest + /// depth of `1`. In general, a nest limit is not something that manifests + /// in an obvious way in the concrete syntax, therefore, it should not be + /// used in a granular way. + pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { + self.nest_limit = limit; + self + } + + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. For example, `a`, `\x61`, `\u0061` and + /// `\141` are all equivalent regular expressions, where the last example + /// shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, it does + /// make good error messages harder. That is, in PCRE based regex engines, + /// syntax like `\1` invokes a backreference, which is explicitly + /// unsupported in Rust's regex engine. However, many users expect it to + /// be supported.
Therefore, when octal support is disabled, the error + /// message will explicitly mention that backreferences aren't supported. + /// + /// Octal syntax is disabled by default. + pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder { + self.octal = yes; + self + } + + /// Enable verbose mode in the regular expression. + /// + /// When enabled, verbose mode permits insignificant whitespace in many + /// places in the regular expression, as well as comments. Comments are + /// started using `#` and continue until the end of the line. + /// + /// By default, this is disabled. It may be selectively enabled in the + /// regular expression by using the `x` flag regardless of this setting. + pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder { + self.ignore_whitespace = yes; + self + } +} + +/// A regular expression parser. +/// +/// This parses a string representation of a regular expression into an +/// abstract syntax tree. The size of the tree is proportional to the length +/// of the regular expression pattern. +/// +/// A `Parser` can be configured in more detail via a +/// [`ParserBuilder`](struct.ParserBuilder.html). +#[derive(Clone, Debug)] +pub struct Parser { + /// The current position of the parser. + pos: Cell, + /// The current capture index. + capture_index: Cell, + /// The maximum number of open parens/brackets allowed. If the parser + /// exceeds this number, then an error is returned. + nest_limit: u32, + /// Whether to support octal syntax or not. When `false`, the parser will + /// return an error helpfully pointing out that backreferences are not + /// supported. + octal: bool, + /// The initial setting for `ignore_whitespace` as provided by + /// `ParserBuilder`. This is used when resetting the parser's state. + initial_ignore_whitespace: bool, + /// Whether whitespace should be ignored. When enabled, comments are + /// also permitted. + ignore_whitespace: Cell, + /// A list of comments, in order of appearance.
+ comments: RefCell>, + /// A stack of grouped sub-expressions, including alternations. + stack_group: RefCell>, + /// A stack of nested character classes. This is only non-empty when + /// parsing a class. + stack_class: RefCell>, + /// A sorted sequence of capture names. This is used to detect duplicate + /// capture names and report an error if one is detected. + capture_names: RefCell>, + /// A scratch buffer used in various places. Mostly this is used to + /// accumulate relevant characters from parts of a pattern. + scratch: RefCell, +} + +/// ParserI is the internal parser implementation. +/// +/// We use this separate type so that we can carry the provided pattern string +/// along with us. In particular, a `Parser`'s internal state is not tied to +/// any one pattern, but a `ParserI` is. +/// +/// This type also lets us use `ParserI<&Parser>` in production code while +/// retaining the convenience of `ParserI<Parser>` for tests, which sometimes +/// work against the internal interface of the parser. +#[derive(Clone, Debug)] +struct ParserI<'s, P> { + /// The parser state/configuration. + parser: P, + /// The full regular expression provided by the user. + pattern: &'s str, +} + +/// GroupState represents a single stack frame while parsing nested groups +/// and alternations. Each frame records the state up to an opening parenthesis +/// or an alternation operator `|`. +#[derive(Clone, Debug)] +enum GroupState { + /// This state is pushed whenever an opening group is found. + Group { + /// The concatenation immediately preceding the opening group. + concat: ast::Concat, + /// The group that has been opened. Its sub-AST is always empty. + group: ast::Group, + /// Whether this group has the `x` flag enabled or not. + ignore_whitespace: bool, + }, + /// This state is pushed whenever a new alternation branch is found.
If + /// an alternation branch is found and this state is at the top of the + /// stack, then this state should be modified to include the new + /// alternation. + Alternation(ast::Alternation), +} + +/// ClassState represents a single stack frame while parsing character classes. +/// Each frame records the state up to an intersection, difference, symmetric +/// difference or nested class. +/// +/// Note that a parser's character class stack is only non-empty when parsing +/// a character class. In all other cases, it is empty. +#[derive(Clone, Debug)] +enum ClassState { + /// This state is pushed whenever an opening bracket is found. + Open { + /// The union of class items immediately preceding this class. + union: ast::ClassSetUnion, + /// The class that has been opened. Typically this just corresponds + /// to the `[`, but it can also include `[^` since `^` indicates + /// negation of the class. + set: ast::ClassBracketed, + }, + /// This state is pushed when an operator is seen. When popped, the stored + /// set becomes the left hand side of the operator. + Op { + /// The type of the operation, i.e., `&&`, `--` or `~~`. + kind: ast::ClassSetBinaryOpKind, + /// The left-hand side of the operator. + lhs: ast::ClassSet, + }, +} + +impl Parser { + /// Create a new parser with a default configuration. + /// + /// The parser can be run with either the `parse` or `parse_with_comments` + /// methods. The parse methods return an abstract syntax tree. + /// + /// To set configuration options on the parser, use + /// [`ParserBuilder`](struct.ParserBuilder.html). + pub fn new() -> Parser { + ParserBuilder::new().build() + } + + /// Parse the regular expression into an abstract syntax tree. + pub fn parse(&mut self, pattern: &str) -> Result { + ParserI::new(self, pattern).parse() + } + + /// Parse the regular expression and return an abstract syntax tree with + /// all of the comments found in the pattern.
+ pub fn parse_with_comments( + &mut self, + pattern: &str, + ) -> Result { + ParserI::new(self, pattern).parse_with_comments() + } + + /// Reset the internal state of a parser. + /// + /// This is called at the beginning of every parse. This prevents the + /// parser from running with inconsistent state (say, if a previous + /// invocation returned an error and the parser is reused). + /// + /// In addition to the position, the whitespace flag, the comments and + /// the group/class stacks, this restores the capture index to `0` and + /// clears the recorded capture names. Otherwise a reused parser would + /// keep numbering capture groups from where the previous parse stopped + /// and would report a duplicate group name for a name that was only + /// used by an earlier pattern. + fn reset(&self) { + // These settings should be in line with the construction + // in `ParserBuilder::build`. + self.pos.set(Position { offset: 0, line: 1, column: 1}); + self.capture_index.set(0); + self.ignore_whitespace.set(self.initial_ignore_whitespace); + self.comments.borrow_mut().clear(); + self.stack_group.borrow_mut().clear(); + self.stack_class.borrow_mut().clear(); + self.capture_names.borrow_mut().clear(); + } +} + +impl<'s, P: Borrow> ParserI<'s, P> { + /// Build an internal parser from a parser configuration and a pattern. + fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> { + ParserI { parser: parser, pattern: pattern } + } + + /// Return a reference to the parser state. + fn parser(&self) -> &Parser { + self.parser.borrow() + } + + /// Return a reference to the pattern being parsed. + fn pattern(&self) -> &str { + self.pattern.borrow() + } + + /// Create a new error with the given span and error type. + fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error { + ast::Error { + kind: kind, + pattern: self.pattern().to_string(), + span: span, + } + } + + /// Return the current offset of the parser. + /// + /// The offset starts at `0` from the beginning of the regular expression + /// pattern string. + fn offset(&self) -> usize { + self.parser().pos.get().offset + } + + /// Return the current line number of the parser. + /// + /// The line number starts at `1`. + fn line(&self) -> usize { + self.parser().pos.get().line + } + + /// Return the current column of the parser. + /// + /// The column number starts at `1` and is reset whenever a `\n` is seen.
+ fn column(&self) -> usize { + self.parser().pos.get().column + } + + /// Return the next capturing index. Each subsequent call increments the + /// internal index. + /// + /// The span given should correspond to the location of the opening + /// parenthesis. + /// + /// If the capture limit is exceeded, then an error is returned. + fn next_capture_index(&self, span: Span) -> Result { + let current = self.parser().capture_index.get(); + let i = try!(current.checked_add(1).ok_or_else(|| { + self.error(span, ast::ErrorKind::CaptureLimitExceeded) + })); + self.parser().capture_index.set(i); + Ok(i) + } + + /// Adds the given capture name to this parser. If this capture name has + /// already been used, then an error is returned. + fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> { + let mut names = self.parser().capture_names.borrow_mut(); + match names.binary_search_by_key( + &cap.name.as_str(), + |c| c.name.as_str(), + ) { + Err(i) => { + names.insert(i, cap.clone()); + Ok(()) + } + Ok(i) => { + Err(self.error(cap.span, ast::ErrorKind::GroupNameDuplicate { + original: names[i].span, + })) + } + } + } + + /// Return whether the parser should ignore whitespace or not. + fn ignore_whitespace(&self) -> bool { + self.parser().ignore_whitespace.get() + } + + /// Return the character at the current position of the parser. + /// + /// This panics if the current position does not point to a valid char. + fn char(&self) -> char { + self.char_at(self.offset()) + } + + /// Return the character at the given position. + /// + /// This panics if the given position does not point to a valid char. + fn char_at(&self, i: usize) -> char { + self.pattern()[i..].chars().next() + .unwrap_or_else(|| { + panic!("expected char at offset {}", i) + }) + } + + /// Bump the parser to the next Unicode scalar value. + /// + /// If the end of the input has been reached, then `false` is returned. 
+ fn bump(&self) -> bool { + if self.is_eof() { + return false; + } + let Position { mut offset, mut line, mut column } = self.pos(); + if self.char() == '\n' { + line = line.checked_add(1).unwrap(); + column = 1; + } else { + column = column.checked_add(1).unwrap(); + } + offset += self.char().len_utf8(); + self.parser().pos.set(Position { + offset: offset, + line: line, + column: column, + }); + self.pattern()[self.offset()..].chars().next().is_some() + } + + /// If the substring starting at the current position of the parser has + /// the given prefix, then bump the parser to the character immediately + /// following the prefix and return true. Otherwise, don't bump the parser + /// and return false. + fn bump_if(&self, prefix: &str) -> bool { + if self.pattern()[self.offset()..].starts_with(prefix) { + for _ in 0..prefix.chars().count() { + self.bump(); + } + true + } else { + false + } + } + + /// Returns true if and only if the parser is positioned at a look-around + /// prefix. The conditions under which this returns true must always + /// correspond to a regular expression that would otherwise be consider + /// invalid. + /// + /// This should only be called immediately after parsing the opening of + /// a group or a set of flags. + fn is_lookaround_prefix(&self) -> bool { + self.bump_if("?=") + || self.bump_if("?!") + || self.bump_if("?<=") + || self.bump_if("? bool { + if !self.bump() { + return false; + } + self.bump_space(); + !self.is_eof() + } + + /// If the `x` flag is enabled (i.e., whitespace insensitivity with + /// comments), then this will advance the parser through all whitespace + /// and comments to the next non-whitespace non-comment byte. + /// + /// If the `x` flag is disabled, then this is a no-op. + /// + /// This should be used selectively throughout the parser where + /// arbitrary whitespace is permitted when the `x` flag is enabled. For + /// example, `{ 5 , 6}` is equivalent to `{5,6}`. 
+ fn bump_space(&self) { + if !self.ignore_whitespace() { + return; + } + while !self.is_eof() { + if self.char().is_whitespace() { + self.bump(); + } else if self.char() == '#' { + let start = self.pos(); + let mut comment_text = String::new(); + self.bump(); + while !self.is_eof() { + let c = self.char(); + self.bump(); + if c == '\n' { + break; + } + comment_text.push(c); + } + let comment = ast::Comment { + span: Span::new(start, self.pos()), + comment: comment_text, + }; + self.parser().comments.borrow_mut().push(comment); + } else { + break; + } + } + } + + /// Peek at the next character in the input without advancing the parser. + /// + /// If the input has been exhausted, then this returns `None`. + fn peek(&self) -> Option { + if self.is_eof() { + return None; + } + self.pattern()[self.offset() + self.char().len_utf8()..].chars().next() + } + + /// Returns true if the next call to `bump` would return false. + fn is_eof(&self) -> bool { + self.offset() == self.pattern().len() + } + + /// Return the current position of the parser, which includes the offset, + /// line and column. + fn pos(&self) -> Position { + self.parser().pos.get() + } + + /// Create a span at the current position of the parser. Both the start + /// and end of the span are set. + fn span(&self) -> Span { + Span::splat(self.pos()) + } + + /// Create a span that covers the current character. + fn span_char(&self) -> Span { + let mut next = Position { + offset: self.offset().checked_add(self.char().len_utf8()).unwrap(), + line: self.line(), + column: self.column().checked_add(1).unwrap(), + }; + if self.char() == '\n' { + next.line += 1; + next.column = 1; + } + Span::new(self.pos(), next) + } + + /// Parse and push a single alternation on to the parser's internal stack. + /// If the top of the stack already has an alternation, then add to that + /// instead of pushing a new one. + /// + /// The concatenation given corresponds to a single alternation branch. 
+ /// The concatenation returned starts the next branch and is empty. + /// + /// This assumes the parser is currently positioned at `|` and will advance + /// the parser to the character following `|`. + fn push_alternate(&self, mut concat: ast::Concat) -> Result { + assert_eq!(self.char(), '|'); + concat.span.end = self.pos(); + self.push_or_add_alternation(concat); + self.bump(); + Ok(ast::Concat { + span: self.span(), + asts: vec![], + }) + } + + /// Pushes or adds the given branch of an alternation to the parser's + /// internal stack of state. + fn push_or_add_alternation(&self, concat: ast::Concat) { + use self::GroupState::*; + + let mut stack = self.parser().stack_group.borrow_mut(); + if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() { + alts.asts.push(concat.into_ast()); + return; + } + stack.push(Alternation(ast::Alternation { + span: Span::new(concat.span.start, self.pos()), + asts: vec![concat.into_ast()], + })); + } + + /// Parse and push a group AST (and its parent concatenation) on to the + /// parser's internal stack. Return a fresh concatenation corresponding + /// to the group's sub-AST. + /// + /// If a set of flags was found (with no group), then the concatenation + /// is returned with that set of flags added. + /// + /// This assumes that the parser is currently positioned on the opening + /// parenthesis. It advances the parser to the character at the start + /// of the sub-expression (or adjoining expression). + /// + /// If there was a problem parsing the start of the group, then an error + /// is returned. 
+ fn push_group(&self, mut concat: ast::Concat) -> Result { + assert_eq!(self.char(), '('); + match try!(self.parse_group()) { + Either::Left(set) => { + let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace); + if let Some(v) = ignore { + self.parser().ignore_whitespace.set(v); + } + + concat.asts.push(Ast::Flags(set)); + Ok(concat) + } + Either::Right(group) => { + let old_ignore_whitespace = self.ignore_whitespace(); + let new_ignore_whitespace = group + .flags() + .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace)) + .unwrap_or(old_ignore_whitespace); + self.parser().stack_group.borrow_mut().push(GroupState::Group { + concat: concat, + group: group, + ignore_whitespace: old_ignore_whitespace, + }); + self.parser().ignore_whitespace.set(new_ignore_whitespace); + Ok(ast::Concat { + span: self.span(), + asts: vec![], + }) + } + } + } + + /// Pop a group AST from the parser's internal stack and set the group's + /// AST to the given concatenation. Return the concatenation containing + /// the group. + /// + /// This assumes that the parser is currently positioned on the closing + /// parenthesis and advances the parser to the character following the `)`. + /// + /// If no such group could be popped, then an unopened group error is + /// returned. 
+ fn pop_group(&self, mut group_concat: ast::Concat) -> Result { + use self::GroupState::*; + + assert_eq!(self.char(), ')'); + let mut stack = self.parser().stack_group.borrow_mut(); + let (mut prior_concat, mut group, ignore_whitespace, alt) = + match stack.pop() { + Some(Group { concat, group, ignore_whitespace }) => { + (concat, group, ignore_whitespace, None) + } + Some(Alternation(alt)) => { + match stack.pop() { + Some(Group { concat, group, ignore_whitespace }) => { + (concat, group, ignore_whitespace, Some(alt)) + } + None | Some(Alternation(_)) => { + return Err(self.error( + self.span_char(), + ast::ErrorKind::GroupUnopened, + )); + } + } + } + None => { + return Err(self.error( + self.span_char(), + ast::ErrorKind::GroupUnopened, + )); + } + }; + self.parser().ignore_whitespace.set(ignore_whitespace); + group_concat.span.end = self.pos(); + self.bump(); + group.span.end = self.pos(); + match alt { + Some(mut alt) => { + alt.span.end = group_concat.span.end; + alt.asts.push(group_concat.into_ast()); + group.ast = Box::new(alt.into_ast()); + } + None => { + group.ast = Box::new(group_concat.into_ast()); + } + } + prior_concat.asts.push(Ast::Group(group)); + Ok(prior_concat) + } + + /// Pop the last state from the parser's internal stack, if it exists, and + /// add the given concatenation to it. There either must be no state or a + /// single alternation item on the stack. Any other scenario produces an + /// error. + /// + /// This assumes that the parser has advanced to the end. + fn pop_group_end(&self, mut concat: ast::Concat) -> Result { + concat.span.end = self.pos(); + let mut stack = self.parser().stack_group.borrow_mut(); + let ast = match stack.pop() { + None => Ok(concat.into_ast()), + Some(GroupState::Alternation(mut alt)) => { + alt.span.end = self.pos(); + alt.asts.push(concat.into_ast()); + Ok(Ast::Alternation(alt)) + } + Some(GroupState::Group { group, .. 
}) => { + return Err(self.error( + group.span, + ast::ErrorKind::GroupUnclosed, + )); + } + }; + // If we try to pop again, there should be nothing. + match stack.pop() { + None => ast, + Some(GroupState::Alternation(_)) => { + // This unreachable is unfortunate. This case can't happen + // because the only way we can be here is if there were two + // `GroupState::Alternation`s adjacent in the parser's stack, + // which we guarantee to never happen because we never push a + // `GroupState::Alternation` if one is already at the top of + // the stack. + unreachable!() + } + Some(GroupState::Group { group, .. }) => { + Err(self.error(group.span, ast::ErrorKind::GroupUnclosed)) + } + } + } + + /// Parse the opening of a character class and push the current class + /// parsing context onto the parser's stack. This assumes that the parser + /// is positioned at an opening `[`. The given union should correspond to + /// the union of set items built up before seeing the `[`. + /// + /// If there was a problem parsing the opening of the class, then an error + /// is returned. Otherwise, a new union of set items for the class is + /// returned (which may be populated with either a `]` or a `-`). + fn push_class_open( + &self, + parent_union: ast::ClassSetUnion, + ) -> Result { + assert_eq!(self.char(), '['); + + let (nested_set, nested_union) = try!(self.parse_set_class_open()); + self.parser().stack_class.borrow_mut().push(ClassState::Open { + union: parent_union, + set: nested_set, + }); + Ok(nested_union) + } + + /// Parse the end of a character class set and pop the character class + /// parser stack. The union given corresponds to the last union built + /// before seeing the closing `]`. The union returned corresponds to the + /// parent character class set with the nested class added to it. + /// + /// This assumes that the parser is positioned at a `]` and will advance + /// the parser to the byte immediately following the `]`. 
+ /// + /// If the stack is empty after popping, then this returns the final + /// "top-level" character class AST (where a "top-level" character class + /// is one that is not nested inside any other character class). + /// + /// If there is no corresponding opening bracket on the parser's stack, + /// then an error is returned. + fn pop_class( + &self, + nested_union: ast::ClassSetUnion, + ) -> Result> { + assert_eq!(self.char(), ']'); + + let item = ast::ClassSet::Item(nested_union.into_item()); + let prevset = self.pop_class_op(item); + let mut stack = self.parser().stack_class.borrow_mut(); + match stack.pop() { + None => { + // We can never observe an empty stack: + // + // 1) We are guaranteed to start with a non-empty stack since + // the character class parser is only initiated when it sees + // a `[`. + // 2) If we ever observe an empty stack while popping after + // seeing a `]`, then we signal the character class parser + // to terminate. + panic!("unexpected empty character class stack") + }, + Some(ClassState::Op { .. }) => { + // This panic is unfortunate, but this case is impossible + // since we already popped the Op state if one exists above. + // Namely, every push to the class parser stack is guarded by + // whether an existing Op is already on the top of the stack. + // If it is, the existing Op is modified. That is, the stack + // can never have consecutive Op states. + panic!("unexpected ClassState::Op") + } + Some(ClassState::Open { mut union, mut set }) => { + self.bump(); + set.span.end = self.pos(); + set.kind = prevset; + if stack.is_empty() { + Ok(Either::Right(ast::Class::Bracketed(set))) + } else { + union.push(ast::ClassSetItem::Bracketed(Box::new(set))); + Ok(Either::Left(union)) + } + } + } + } + + /// Return an "unclosed class" error whose span points to the most + /// recently opened class. + /// + /// This should only be called while parsing a character class. 
+ fn unclosed_class_error(&self) -> ast::Error { + for state in self.parser().stack_class.borrow().iter().rev() { + match *state { + ClassState::Open { ref set, .. } => { + return self.error(set.span, ast::ErrorKind::ClassUnclosed); + } + _ => {} + } + } + // We are guaranteed to have a non-empty stack with at least + // one open bracket, so we should never get here. + panic!("no open character class found") + } + + /// Push the current set of class items on to the class parser's stack as + /// the left hand side of the given operator. + /// + /// A fresh set union is returned, which should be used to build the right + /// hand side of this operator. + fn push_class_op( + &self, + next_kind: ast::ClassSetBinaryOpKind, + next_union: ast::ClassSetUnion, + ) -> ast::ClassSetUnion { + + let item = ast::ClassSet::Item(next_union.into_item()); + let new_lhs = self.pop_class_op(item); + self.parser().stack_class.borrow_mut().push(ClassState::Op { + kind: next_kind, + lhs: new_lhs, + }); + ast::ClassSetUnion { span: self.span(), items: vec![] } + } + + /// Pop a character class set from the character class parser stack. If the + /// top of the stack is just an item (not an operation), then return the + /// given set unchanged. If the top of the stack is an operation, then the + /// given set will be used as the rhs of the operation on the top of the + /// stack. In that case, the binary operation is returned as a set. + fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet { + let mut stack = self.parser().stack_class.borrow_mut(); + let (kind, lhs) = match stack.pop() { + Some(ClassState::Op { kind, lhs }) => (kind, lhs), + Some(state @ ClassState::Open { .. 
}) => { + stack.push(state); + return rhs; + } + None => unreachable!(), + }; + let span = Span::new(lhs.span().start, rhs.span().end); + ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { + span: span, + kind: kind, + lhs: Box::new(lhs), + rhs: Box::new(rhs), + }) + } +} + +impl<'s, P: Borrow> ParserI<'s, P> { + /// Parse the regular expression into an abstract syntax tree. + fn parse(&self) -> Result { + self.parse_with_comments().map(|astc| astc.ast) + } + + /// Parse the regular expression and return an abstract syntax tree with + /// all of the comments found in the pattern. + fn parse_with_comments(&self) -> Result { + assert_eq!(self.offset(), 0, "parser can only be used once"); + self.parser().reset(); + let mut concat = ast::Concat { + span: self.span(), + asts: vec![], + }; + loop { + self.bump_space(); + if self.is_eof() { + break; + } + match self.char() { + '(' => concat = try!(self.push_group(concat)), + ')' => concat = try!(self.pop_group(concat)), + '|' => concat = try!(self.push_alternate(concat)), + '[' => { + let class = try!(self.parse_set_class()); + concat.asts.push(Ast::Class(class)); + } + '?' => { + concat = try!(self.parse_uncounted_repetition( + concat, ast::RepetitionKind::ZeroOrOne)); + } + '*' => { + concat = try!(self.parse_uncounted_repetition( + concat, ast::RepetitionKind::ZeroOrMore)); + } + '+' => { + concat = try!(self.parse_uncounted_repetition( + concat, ast::RepetitionKind::OneOrMore)); + } + '{' => { + concat = try!(self.parse_counted_repetition(concat)); + } + _ => concat.asts.push(try!(self.parse_primitive()).into_ast()), + } + } + let ast = try!(self.pop_group_end(concat)); + try!(NestLimiter::new(self).check(&ast)); + Ok(ast::WithComments { + ast: ast, + comments: mem::replace( + &mut *self.parser().comments.borrow_mut(), + vec![], + ), + }) + } + + /// Parses an uncounted repetition operation. An uncounted repetition + /// operator includes ?, * and +, but does not include the {m,n} syntax. 
+ /// The given `kind` should correspond to the operator observed by the + /// caller. + /// + /// This assumes that the paser is currently positioned at the repetition + /// operator and advances the parser to the first character after the + /// operator. (Note that the operator may include a single additional `?`, + /// which makes the operator ungreedy.) + /// + /// The caller should include the concatenation that is being built. The + /// concatenation returned includes the repetition operator applied to the + /// last expression in the given concatenation. + fn parse_uncounted_repetition( + &self, + mut concat: ast::Concat, + kind: ast::RepetitionKind, + ) -> Result { + assert!( + self.char() == '?' || self.char() == '*' || self.char() == '+'); + let op_start = self.pos(); + let ast = match concat.asts.pop() { + Some(ast) => ast, + None => return Err(self.error( + self.span(), + ast::ErrorKind::RepetitionMissing, + )), + }; + let mut greedy = true; + if self.bump() && self.char() == '?' { + greedy = false; + self.bump(); + } + concat.asts.push(Ast::Repetition(ast::Repetition { + span: ast.span().with_end(self.pos()), + op: ast::RepetitionOp { + span: Span::new(op_start, self.pos()), + kind: kind, + }, + greedy: greedy, + ast: Box::new(ast), + })); + Ok(concat) + } + + /// Parses a counted repetition operation. A counted repetition operator + /// corresponds to the {m,n} syntax, and does not include the ?, * or + + /// operators. + /// + /// This assumes that the paser is currently positioned at the opening `{` + /// and advances the parser to the first character after the operator. + /// (Note that the operator may include a single additional `?`, which + /// makes the operator ungreedy.) + /// + /// The caller should include the concatenation that is being built. The + /// concatenation returned includes the repetition operator applied to the + /// last expression in the given concatenation. 
+ fn parse_counted_repetition( + &self, + mut concat: ast::Concat, + ) -> Result { + assert!(self.char() == '{'); + let start = self.pos(); + let ast = match concat.asts.pop() { + Some(ast) => ast, + None => return Err(self.error( + self.span(), + ast::ErrorKind::RepetitionMissing, + )), + }; + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::RepetitionCountUnclosed, + )); + } + let count_start = try!(self.parse_decimal()); + let mut range = ast::RepetitionRange::Exactly(count_start); + if self.is_eof() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::RepetitionCountUnclosed, + )); + } + if self.char() == ',' { + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::RepetitionCountUnclosed, + )); + } + if self.char() != '}' { + let count_end = try!(self.parse_decimal()); + range = ast::RepetitionRange::Bounded(count_start, count_end); + } else { + range = ast::RepetitionRange::AtLeast(count_start); + } + } + if self.is_eof() || self.char() != '}' { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::RepetitionCountUnclosed, + )); + } + + let mut greedy = true; + if self.bump_and_bump_space() && self.char() == '?' { + greedy = false; + self.bump(); + } + + let op_span = Span::new(start, self.pos()); + if !range.is_valid() { + return Err(self.error( + op_span, + ast::ErrorKind::RepetitionCountInvalid, + )); + } + concat.asts.push(Ast::Repetition(ast::Repetition { + span: ast.span().with_end(self.pos()), + op: ast::RepetitionOp { + span: op_span, + kind: ast::RepetitionKind::Range(range), + }, + greedy: greedy, + ast: Box::new(ast), + })); + Ok(concat) + } + + /// Parse a group (which contains a sub-expression) or a set of flags. + /// + /// If a group was found, then it is returned with an empty AST. If a set + /// of flags is found, then that set is returned. 
+ /// + /// The parser should be positioned at the opening parenthesis. + /// + /// This advances the parser to the character before the start of the + /// sub-expression (in the case of a group) or to the closing parenthesis + /// immediately following the set of flags. + /// + /// # Errors + /// + /// If flags are given and incorrectly specified, then a corresponding + /// error is returned. + /// + /// If a capture name is given and it is incorrectly specified, then a + /// corresponding error is returned. + fn parse_group(&self) -> Result> { + assert_eq!(self.char(), '('); + let open_span = self.span_char(); + self.bump(); + self.bump_space(); + if self.is_lookaround_prefix() { + return Err(self.error( + Span::new(open_span.start, self.span().end), + ast::ErrorKind::UnsupportedLookAround, + )); + } + let inner_span = self.span(); + if self.bump_if("?P<") { + let capture_index = try!(self.next_capture_index(open_span)); + let cap = try!(self.parse_capture_name(capture_index)); + Ok(Either::Right(ast::Group { + span: open_span, + kind: ast::GroupKind::CaptureName(cap), + ast: Box::new(Ast::Empty(self.span())), + })) + } else if self.bump_if("?") { + if self.is_eof() { + return Err(self.error( + open_span, + ast::ErrorKind::GroupUnclosed, + )); + } + let flags = try!(self.parse_flags()); + let char_end = self.char(); + self.bump(); + if char_end == ')' { + // We don't allow empty flags, e.g., `(?)`. We instead + // interpret it as a repetition operator missing its argument. 
+ if flags.items.is_empty() { + return Err(self.error( + inner_span, + ast::ErrorKind::RepetitionMissing, + )); + } + Ok(Either::Left(ast::SetFlags { + span: Span { end: self.pos(), ..open_span }, + flags: flags, + })) + } else { + assert_eq!(char_end, ':'); + Ok(Either::Right(ast::Group { + span: open_span, + kind: ast::GroupKind::NonCapturing(flags), + ast: Box::new(Ast::Empty(self.span())), + })) + } + } else { + let capture_index = try!(self.next_capture_index(open_span)); + Ok(Either::Right(ast::Group { + span: open_span, + kind: ast::GroupKind::CaptureIndex(capture_index), + ast: Box::new(Ast::Empty(self.span())), + })) + } + } + + /// Parses a capture group name. Assumes that the parser is positioned at + /// the first character in the name following the opening `<` (and may + /// possibly be EOF). This advances the parser to the first character + /// following the closing `>`. + /// + /// The caller must provide the capture index of the group for this name. + fn parse_capture_name( + &self, + capture_index: u32, + ) -> Result { + if self.is_eof() { + return Err(self.error( + self.span(), + ast::ErrorKind::GroupNameUnexpectedEof, + )); + } + let start = self.pos(); + loop { + if self.char() == '>' { + break; + } + if !is_capture_char(self.char(), self.pos() == start) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::GroupNameInvalid, + )); + } + if !self.bump() { + break; + } + } + let end = self.pos(); + if self.is_eof() { + return Err(self.error( + self.span(), + ast::ErrorKind::GroupNameUnexpectedEof, + )); + } + assert_eq!(self.char(), '>'); + self.bump(); + let name = &self.pattern()[start.offset..end.offset]; + if name.is_empty() { + return Err(self.error( + Span::new(start, start), + ast::ErrorKind::GroupNameEmpty, + )); + } + let capname = ast::CaptureName { + span: Span::new(start, end), + name: name.to_string(), + index: capture_index, + }; + try!(self.add_capture_name(&capname)); + Ok(capname) + } + + /// Parse a sequence of flags 
starting at the current character. + /// + /// This advances the parser to the character immediately following the + /// flags, which is guaranteed to be either `:` or `)`. + /// + /// # Errors + /// + /// If any flags are duplicated, then an error is returned. + /// + /// If the negation operator is used more than once, then an error is + /// returned. + /// + /// If no flags could be found or if the negation operation is not followed + /// by any flags, then an error is returned. + fn parse_flags(&self) -> Result { + let mut flags = ast::Flags { + span: self.span(), + items: vec![], + }; + let mut last_was_negation = None; + while self.char() != ':' && self.char() != ')' { + if self.char() == '-' { + last_was_negation = Some(self.span_char()); + let item = ast::FlagsItem { + span: self.span_char(), + kind: ast::FlagsItemKind::Negation, + }; + if let Some(i) = flags.add_item(item) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::FlagRepeatedNegation { + original: flags.items[i].span, + }, + )); + } + } else { + last_was_negation = None; + let item = ast::FlagsItem { + span: self.span_char(), + kind: ast::FlagsItemKind::Flag(try!(self.parse_flag())), + }; + if let Some(i) = flags.add_item(item) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::FlagDuplicate { + original: flags.items[i].span, + }, + )); + } + } + if !self.bump() { + return Err(self.error( + self.span(), + ast::ErrorKind::FlagUnexpectedEof, + )); + } + } + if let Some(span) = last_was_negation { + return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation)); + } + flags.span.end = self.pos(); + Ok(flags) + } + + /// Parse the current character as a flag. Do not advance the parser. + /// + /// # Errors + /// + /// If the flag is not recognized, then an error is returned. 
+ fn parse_flag(&self) -> Result { + match self.char() { + 'i' => Ok(ast::Flag::CaseInsensitive), + 'm' => Ok(ast::Flag::MultiLine), + 's' => Ok(ast::Flag::DotMatchesNewLine), + 'U' => Ok(ast::Flag::SwapGreed), + 'u' => Ok(ast::Flag::Unicode), + 'x' => Ok(ast::Flag::IgnoreWhitespace), + _ => Err(self.error( + self.span_char(), + ast::ErrorKind::FlagUnrecognized, + )), + } + } + + /// Parse a primitive AST. e.g., A literal, non-set character class or + /// assertion. + /// + /// This assumes that the parser expects a primitive at the current + /// location. i.e., All other non-primitive cases have been handled. + /// For example, if the parser's position is at `|`, then `|` will be + /// treated as a literal (e.g., inside a character class). + /// + /// This advances the parser to the first character immediately following + /// the primitive. + fn parse_primitive(&self) -> Result { + match self.char() { + '\\' => self.parse_escape(), + '.' => { + let ast = Primitive::Dot(self.span_char()); + self.bump(); + Ok(ast) + } + '^' => { + let ast = Primitive::Assertion(ast::Assertion { + span: self.span_char(), + kind: ast::AssertionKind::StartLine, + }); + self.bump(); + Ok(ast) + } + '$' => { + let ast = Primitive::Assertion(ast::Assertion { + span: self.span_char(), + kind: ast::AssertionKind::EndLine, + }); + self.bump(); + Ok(ast) + } + c => { + let ast = Primitive::Literal(ast::Literal { + span: self.span_char(), + kind: ast::LiteralKind::Verbatim, + c: c, + }); + self.bump(); + Ok(ast) + } + } + } + + /// Parse an escape sequence as a primitive AST. + /// + /// This assumes the parser is positioned at the start of the escape + /// sequence, i.e., `\`. It advances the parser to the first position + /// immediately following the escape sequence. 
+ fn parse_escape(&self) -> Result { + assert_eq!(self.char(), '\\'); + let start = self.pos(); + if !self.bump() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::EscapeUnexpectedEof, + )); + } + let c = self.char(); + // Put some of the more complicated routines into helpers. + match c { + '0'...'7' => { + if !self.parser().octal { + return Err(self.error( + Span::new(start, self.span_char().end), + ast::ErrorKind::UnsupportedBackreference, + )); + } + let mut lit = self.parse_octal(); + lit.span.start = start; + return Ok(Primitive::Literal(lit)); + } + '8'...'9' if !self.parser().octal => { + return Err(self.error( + Span::new(start, self.span_char().end), + ast::ErrorKind::UnsupportedBackreference, + )); + } + 'x' | 'u' | 'U' => { + let mut lit = try!(self.parse_hex()); + lit.span.start = start; + return Ok(Primitive::Literal(lit)); + } + 'p' | 'P' => { + let mut cls = try!(self.parse_unicode_class()); + cls.span.start = start; + return Ok(Primitive::Unicode(cls)); + } + 'd' | 's' | 'w' | 'D' | 'S' | 'W' => { + let mut cls = self.parse_perl_class(); + cls.span.start = start; + return Ok(Primitive::Perl(cls)); + } + _ => {} + } + + // Handle all of the one letter sequences inline. 
+ self.bump(); + let span = Span::new(start, self.pos()); + if is_meta_character(c) { + return Ok(Primitive::Literal(ast::Literal { + span: span, + kind: ast::LiteralKind::Punctuation, + c: c, + })); + } + let special = |kind, c| Ok(Primitive::Literal(ast::Literal { + span: span, + kind: ast::LiteralKind::Special(kind), + c: c, + })); + match c { + 'a' => special(ast::SpecialLiteralKind::Bell, '\x07'), + 'f' => special(ast::SpecialLiteralKind::FormFeed, '\x0C'), + 't' => special(ast::SpecialLiteralKind::Tab, '\t'), + 'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'), + 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'), + 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'), + ' ' if self.ignore_whitespace() => { + special(ast::SpecialLiteralKind::Space, ' ') + } + 'A' => Ok(Primitive::Assertion(ast::Assertion { + span: span, + kind: ast::AssertionKind::StartText, + })), + 'z' => Ok(Primitive::Assertion(ast::Assertion { + span: span, + kind: ast::AssertionKind::EndText, + })), + 'b' => Ok(Primitive::Assertion(ast::Assertion { + span: span, + kind: ast::AssertionKind::WordBoundary, + })), + 'B' => Ok(Primitive::Assertion(ast::Assertion { + span: span, + kind: ast::AssertionKind::NotWordBoundary, + })), + _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)), + } + } + + /// Parse an octal representation of a Unicode codepoint up to 3 digits + /// long. This expects the parser to be positioned at the first octal + /// digit and advances the parser to the first character immediately + /// following the octal number. This also assumes that parsing octal + /// escapes is enabled. + /// + /// Assuming the preconditions are met, this routine can never fail. + fn parse_octal(&self) -> ast::Literal { + use std::char; + use std::u32; + + assert!(self.parser().octal); + assert!('0' <= self.char() && self.char() <= '7'); + let start = self.pos(); + // Parse up to two more digits. 
+ while + self.bump() && + '0' <= self.char() && self.char() <= '7' && + self.pos().offset - start.offset <= 2 + {} + let end = self.pos(); + let octal = &self.pattern()[start.offset..end.offset]; + // Parsing the octal should never fail since the above guarantees a + // valid number. + let codepoint = + u32::from_str_radix(octal, 8).expect("valid octal number"); + // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no + // invalid Unicode scalar values. + let c = char::from_u32(codepoint).expect("Unicode scalar value"); + ast::Literal { + span: Span::new(start, end), + kind: ast::LiteralKind::Octal, + c: c, + } + } + + /// Parse a hex representation of a Unicode codepoint. This handles both + /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to + /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to + /// the first character immediately following the hexadecimal literal. + fn parse_hex(&self) -> Result { + assert!(self.char() == 'x' + || self.char() == 'u' + || self.char() == 'U'); + + let hex_kind = match self.char() { + 'x' => ast::HexLiteralKind::X, + 'u' => ast::HexLiteralKind::UnicodeShort, + _ => ast::HexLiteralKind::UnicodeLong, + }; + if !self.bump_and_bump_space() { + return Err(self.error( + self.span(), + ast::ErrorKind::EscapeUnexpectedEof, + )); + } + if self.char() == '{' { + self.parse_hex_brace(hex_kind) + } else { + self.parse_hex_digits(hex_kind) + } + } + + /// Parse an N-digit hex representation of a Unicode codepoint. This + /// expects the parser to be positioned at the first digit and will advance + /// the parser to the first character immediately following the escape + /// sequence. + /// + /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`) + /// or 8 (for `\UNNNNNNNN`). 
+ fn parse_hex_digits( + &self, + kind: ast::HexLiteralKind, + ) -> Result { + use std::char; + use std::u32; + + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + + let start = self.pos(); + for i in 0..kind.digits() { + if i > 0 && !self.bump_and_bump_space() { + return Err(self.error( + self.span(), + ast::ErrorKind::EscapeUnexpectedEof, + )); + } + if !is_hex(self.char()) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::EscapeHexInvalidDigit, + )); + } + scratch.push(self.char()); + } + // The final bump just moves the parser past the literal, which may + // be EOF. + self.bump_and_bump_space(); + let end = self.pos(); + let hex = scratch.as_str(); + match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) { + None => Err(self.error( + Span::new(start, end), + ast::ErrorKind::EscapeHexInvalid, + )), + Some(c) => Ok(ast::Literal { + span: Span::new(start, end), + kind: ast::LiteralKind::HexFixed(kind), + c: c, + }), + } + } + + /// Parse a hex representation of any Unicode scalar value. This expects + /// the parser to be positioned at the opening brace `{` and will advance + /// the parser to the first character following the closing brace `}`. 
+ fn parse_hex_brace( + &self, + kind: ast::HexLiteralKind, + ) -> Result { + use std::char; + use std::u32; + + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + + let brace_pos = self.pos(); + let start = self.span_char().end; + while self.bump_and_bump_space() && self.char() != '}' { + if !is_hex(self.char()) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::EscapeHexInvalidDigit, + )); + } + scratch.push(self.char()); + } + if self.is_eof() { + return Err(self.error( + Span::new(brace_pos, self.pos()), + ast::ErrorKind::EscapeUnexpectedEof, + )); + } + let end = self.pos(); + let hex = scratch.as_str(); + assert_eq!(self.char(), '}'); + self.bump_and_bump_space(); + + if hex.is_empty() { + return Err(self.error( + Span::new(brace_pos, self.pos()), + ast::ErrorKind::EscapeHexEmpty, + )); + } + match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) { + None => Err(self.error( + Span::new(start, end), + ast::ErrorKind::EscapeHexInvalid, + )), + Some(c) => Ok(ast::Literal { + span: Span::new(start, self.pos()), + kind: ast::LiteralKind::HexBrace(kind), + c: c, + }), + } + } + + /// Parse a decimal number into a u32 while trimming leading and trailing + /// whitespace. + /// + /// This expects the parser to be positioned at the first position where + /// a decimal digit could occur. This will advance the parser to the byte + /// immediately following the last contiguous decimal digit. + /// + /// If no decimal digit could be found or if there was a problem parsing + /// the complete set of digits into a u32, then an error is returned. 
+ fn parse_decimal(&self) -> Result { + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + + while !self.is_eof() && self.char().is_whitespace() { + self.bump(); + } + let start = self.pos(); + while !self.is_eof() && '0' <= self.char() && self.char() <= '9' { + scratch.push(self.char()); + self.bump_and_bump_space(); + } + let span = Span::new(start, self.pos()); + while !self.is_eof() && self.char().is_whitespace() { + self.bump_and_bump_space(); + } + let digits = scratch.as_str(); + if digits.is_empty() { + return Err(self.error(span, ast::ErrorKind::DecimalEmpty)); + } + match u32::from_str_radix(digits, 10).ok() { + Some(n) => Ok(n), + None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)), + } + } + + /// Parse a standard character class consisting primarily of characters or + /// character ranges, but can also contain nested character classes of + /// any type (sans `.`). + /// + /// This assumes the parser is positioned at the opening `[`. If parsing + /// is successful, then the parser is advanced to the position immediately + /// following the closing `]`. + fn parse_set_class(&self) -> Result { + assert_eq!(self.char(), '['); + + let mut union = ast::ClassSetUnion { + span: self.span(), + items: vec![], + }; + loop { + self.bump_space(); + if self.is_eof() { + return Err(self.unclosed_class_error()); + } + match self.char() { + '[' => { + // If we've already parsed the opening bracket, then + // attempt to treat this as the beginning of an ASCII + // class. If ASCII class parsing fails, then the parser + // backs up to `[`. 
+ if !self.parser().stack_class.borrow().is_empty() { + if let Some(cls) = self.maybe_parse_ascii_class() { + union.push(ast::ClassSetItem::Ascii(cls)); + continue; + } + } + union = try!(self.push_class_open(union)); + } + ']' => { + match try!(self.pop_class(union)) { + Either::Left(nested_union) => { union = nested_union; } + Either::Right(class) => return Ok(class), + } + } + '&' if self.peek() == Some('&') => { + assert!(self.bump_if("&&")); + union = self.push_class_op( + ast::ClassSetBinaryOpKind::Intersection, union); + } + '-' if self.peek() == Some('-') => { + assert!(self.bump_if("--")); + union = self.push_class_op( + ast::ClassSetBinaryOpKind::Difference, union); + } + '~' if self.peek() == Some('~') => { + assert!(self.bump_if("~~")); + union = self.push_class_op( + ast::ClassSetBinaryOpKind::SymmetricDifference, union); + } + _ => { + union.push(try!(self.parse_set_class_range())); + } + } + } + } + + /// Parse a single primitive item in a character class set. The item to + /// be parsed can either be one of a simple literal character, a range + /// between two simple literal characters or a "primitive" character + /// class like \w or \p{Greek}. + /// + /// If an invalid escape is found, or if a character class is found where + /// a simple literal is expected (e.g., in a range), then an error is + /// returned. + fn parse_set_class_range(&self) -> Result { + let prim1 = try!(self.parse_set_class_item()); + self.bump_space(); + if self.is_eof() { + return Err(self.unclosed_class_error()); + } + // If the next char isn't a `-`, then we don't have a range. + // There are two exceptions. If the char after a `-` is a `]`, then + // `-` is interpreted as a literal `-`. Alternatively, if the char + // after a `-` is a `-`, then `--` corresponds to a "difference" + // operation. 
+ if self.char() != '-' + || self.peek() == Some(']') + || self.peek() == Some('-') + { + return prim1.into_class_set_item(self); + } + // OK, now we're parsing a range, so bump past the `-` and parse the + // second half of the range. + if !self.bump_and_bump_space() { + return Err(self.unclosed_class_error()); + } + let prim2 = try!(self.parse_set_class_item()); + let range = ast::ClassSetRange { + span: Span::new(prim1.span().start, prim2.span().end), + start: try!(prim1.into_class_literal(self)), + end: try!(prim2.into_class_literal(self)), + }; + if !range.is_valid() { + return Err(self.error( + range.span, + ast::ErrorKind::ClassRangeInvalid, + )); + } + Ok(ast::ClassSetItem::Range(range)) + } + + /// Parse a single item in a character class as a primitive, where the + /// primitive either consists of a verbatim literal or a single escape + /// sequence. + /// + /// This assumes the parser is positioned at the beginning of a primitive, + /// and advances the parser to the first position after the primitive if + /// successful. + /// + /// Note that it is the caller's responsibility to report an error if an + /// illegal primitive was parsed. + fn parse_set_class_item(&self) -> Result { + if self.char() == '\\' { + self.parse_escape() + } else { + let x = Primitive::Literal(ast::Literal { + span: self.span_char(), + kind: ast::LiteralKind::Verbatim, + c: self.char(), + }); + self.bump(); + Ok(x) + } + } + + /// Parses the opening of a character class set. This includes the opening + /// bracket along with `^` if present to indicate negation. This also + /// starts parsing the opening set of unioned items if applicable, since + /// there are special rules applied to certain characters in the opening + /// of a character class. For example, `[^]]` is the class of all + /// characters not equal to `]`. (`]` would need to be escaped in any other + /// position.) Similarly for `-`. 
+ /// + /// In all cases, the op inside the returned `ast::ClassBracketed` is an + /// empty union. This empty union should be replaced with the actual item + /// when it is popped from the parser's stack. + /// + /// This assumes the parser is positioned at the opening `[` and advances + /// the parser to the first non-special byte of the character class. + /// + /// An error is returned if EOF is found. + fn parse_set_class_open( + &self, + ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> { + assert_eq!(self.char(), '['); + let start = self.pos(); + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::ClassUnclosed, + )); + } + + let negated = + if self.char() != '^' { + false + } else { + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::ClassUnclosed, + )); + } + true + }; + // Accept any number of `-` as literal `-`. + let mut union = ast::ClassSetUnion { + span: self.span(), + items: vec![], + }; + while self.char() == '-' { + union.push(ast::ClassSetItem::Literal(ast::Literal { + span: self.span_char(), + kind: ast::LiteralKind::Verbatim, + c: '-', + })); + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::ClassUnclosed, + )); + } + } + // If `]` is the *first* char in a set, then interpret it as a literal + // `]`. That is, an empty class is impossible to write. 
+ if union.items.is_empty() && self.char() == ']' { + union.push(ast::ClassSetItem::Literal(ast::Literal { + span: self.span_char(), + kind: ast::LiteralKind::Verbatim, + c: ']', + })); + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::ClassUnclosed, + )); + } + } + let set = ast::ClassBracketed { + span: Span::new(start, self.pos()), + negated: negated, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: Span::new(union.span.start, union.span.start), + items: vec![], + }), + }; + Ok((set, union)) + } + + /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`. + /// + /// This assumes the parser is positioned at the opening `[`. + /// + /// If no valid ASCII character class could be found, then this does not + /// advance the parser and `None` is returned. Otherwise, the parser is + /// advanced to the first byte following the closing `]` and the + /// corresponding ASCII class is returned. + fn maybe_parse_ascii_class(&self) -> Option { + // ASCII character classes are interesting from a parsing perspective + // because parsing cannot fail with any interesting error. For example, + // in order to use an ASCII character class, it must be enclosed in + // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think + // of it as "ASCII character characters have the syntax `[:NAME:]` + // which can only appear within character brackets." This means that + // things like `[[:lower:]A]` are legal constructs. + // + // However, if one types an incorrect ASCII character class, e.g., + // `[[:loower:]]`, then we treat that as a normal nested character + // class containing the characters `:elorw`. One might argue that we + // should return an error instead since the repeated colons give away + // the intent to write an ASCII class. But what if the user typed + // `[[:lower]]` instead? How can we tell that was intended to be an + // ASCII class and not just a normal nested class? 
+ // + // Reasonable people can probably disagree over this, but for better + // or worse, we implement semantics that never fails at the expense + // of better failure modes. + assert_eq!(self.char(), '['); + // If parsing fails, then we back up the parser to this starting point. + let start = self.pos(); + let mut negated = false; + if !self.bump() || self.char() != ':' { + self.parser().pos.set(start); + return None; + } + if !self.bump() { + self.parser().pos.set(start); + return None; + } + if self.char() == '^' { + negated = true; + if !self.bump() { + self.parser().pos.set(start); + return None; + } + } + let name_start = self.offset(); + while self.char() != ':' && self.bump() {} + if self.is_eof() { + self.parser().pos.set(start); + return None; + } + let name = &self.pattern()[name_start..self.offset()]; + if !self.bump_if(":]") { + self.parser().pos.set(start); + return None; + } + let kind = match ast::ClassAsciiKind::from_name(name) { + Some(kind) => kind, + None => { + self.parser().pos.set(start); + return None; + } + }; + Some(ast::ClassAscii { + span: Span::new(start, self.pos()), + kind: kind, + negated: negated, + }) + } + + /// Parse a Unicode class in either the single character notation, `\pN` + /// or the multi-character bracketed notation, `\p{Greek}`. This assumes + /// the parser is positioned at the `p` (or `P` for negation) and will + /// advance the parser to the character immediately following the class. + /// + /// Note that this does not check whether the class name is valid or not. 
+ fn parse_unicode_class(&self) -> Result { + assert!(self.char() == 'p' || self.char() == 'P'); + + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + + let negated = self.char() == 'P'; + if !self.bump_and_bump_space() { + return Err(self.error( + self.span(), + ast::ErrorKind::EscapeUnexpectedEof, + )); + } + let (start, kind) = + if self.char() == '{' { + let start = self.span_char().end; + while self.bump_and_bump_space() && self.char() != '}' { + scratch.push(self.char()); + } + if self.is_eof() { + return Err(self.error( + self.span(), + ast::ErrorKind::EscapeUnexpectedEof, + )); + } + assert_eq!(self.char(), '}'); + self.bump(); + + let name = scratch.as_str(); + if let Some(i) = name.find("!=") { + (start, ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::NotEqual, + name: name[..i].to_string(), + value: name[i+2..].to_string(), + }) + } else if let Some(i) = name.find(':') { + (start, ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Colon, + name: name[..i].to_string(), + value: name[i+1..].to_string(), + }) + } else if let Some(i) = name.find('=') { + (start, ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Equal, + name: name[..i].to_string(), + value: name[i+1..].to_string(), + }) + } else { + (start, ast::ClassUnicodeKind::Named(name.to_string())) + } + } else { + let start = self.pos(); + let c = self.char(); + self.bump_and_bump_space(); + let kind = ast::ClassUnicodeKind::OneLetter(c); + (start, kind) + }; + Ok(ast::ClassUnicode { + span: Span::new(start, self.pos()), + negated: negated, + kind: kind, + }) + } + + /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the + /// parser is currently at a valid character class name and will be + /// advanced to the character immediately following the class. 
fn parse_perl_class(&self) -> ast::ClassPerl {
    let c = self.char();
    let span = self.span_char();
    self.bump();
    // Lowercase letters name the class itself, uppercase its negation.
    let (negated, kind) = match c {
        'd' => (false, ast::ClassPerlKind::Digit),
        'D' => (true, ast::ClassPerlKind::Digit),
        's' => (false, ast::ClassPerlKind::Space),
        'S' => (true, ast::ClassPerlKind::Space),
        'w' => (false, ast::ClassPerlKind::Word),
        'W' => (true, ast::ClassPerlKind::Word),
        c => panic!("expected valid Perl class but got '{}'", c),
    };
    ast::ClassPerl { span: span, kind: kind, negated: negated }
}
}

/// A type that traverses a fully parsed Ast and checks whether its depth
/// exceeds the specified nesting limit. If it does, then an error is returned.
#[derive(Debug)]
struct NestLimiter<'p, 's: 'p, P: 'p + 's> {
    /// The parser that is checking the nest limit.
    p: &'p ParserI<'s, P>,
    /// The current depth while walking an Ast.
    depth: u32,
}

// NOTE(review): the type argument of `Borrow` was missing from the text this
// was recovered from. `Parser` is assumed because the tests build a `ParserI`
// from `Parser::new()` — confirm against the full file.
impl<'p, 's, P: Borrow<Parser>> NestLimiter<'p, 's, P> {
    /// Create a limiter for the given parser, starting at depth 0.
    fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> {
        NestLimiter { p: p, depth: 0 }
    }

    /// Walk `ast` and return an error if it nests deeper than the parser's
    /// configured limit.
    fn check(self, ast: &Ast) -> Result<()> {
        ast::visit(ast, self)
    }

    /// Enter one more level of nesting, failing if that exceeds the limit.
    ///
    /// `span` is the span reported in the error. On failure the stored
    /// depth is left unchanged.
    fn increment_depth(&mut self, span: &Span) -> Result<()> {
        // Overflowing the counter is reported as exceeding the largest
        // representable limit.
        let new = try!(self.depth.checked_add(1).ok_or_else(|| self.p.error(
            span.clone(),
            ast::ErrorKind::NestLimitExceeded(::std::u32::MAX),
        )));
        let limit = self.p.parser().nest_limit;
        if new > limit {
            return Err(self.p.error(
                span.clone(),
                ast::ErrorKind::NestLimitExceeded(limit),
            ));
        }
        self.depth = new;
        Ok(())
    }

    /// Leave one level of nesting.
    ///
    /// # Panics
    ///
    /// Panics if the depth is already 0.
    fn decrement_depth(&mut self) {
        // Assuming the correctness of the visitor, this should never drop
        // below 0.
        //
        // The result has to be stored back. Computing it without assigning
        // leaves `depth` untouched, so the limit would count every nested
        // node visited so far instead of the current depth, and sibling
        // constructs would be rejected as if nested inside each other.
        self.depth = self.depth.checked_sub(1).unwrap();
    }
}

impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
    type Output = ();
    type Err = ast::Error;

    fn finish(self) -> Result<()> {
        Ok(())
    }

    fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
        let span = match *ast {
            Ast::Empty(_)
            | Ast::Flags(_)
            | Ast::Literal(_)
            | Ast::Dot(_)
            | Ast::Assertion(_)
            | Ast::Class(ast::Class::Unicode(_))
            | Ast::Class(ast::Class::Perl(_)) => {
                // These are all base cases, so we don't increment depth.
                return Ok(());
            }
            Ast::Class(ast::Class::Bracketed(ref x)) => &x.span,
            Ast::Repetition(ref x) => &x.span,
            Ast::Group(ref x) => &x.span,
            Ast::Alternation(ref x) => &x.span,
            Ast::Concat(ref x) => &x.span,
        };
        self.increment_depth(span)
    }

    fn visit_post(&mut self, ast: &Ast) -> Result<()> {
        // Must mirror `visit_pre`: exactly the variants that incremented
        // the depth on the way in decrement it on the way out.
        match *ast {
            Ast::Empty(_)
            | Ast::Flags(_)
            | Ast::Literal(_)
            | Ast::Dot(_)
            | Ast::Assertion(_)
            | Ast::Class(ast::Class::Unicode(_))
            | Ast::Class(ast::Class::Perl(_)) => {
                // These are all base cases, so we don't decrement depth.
                Ok(())
            }
            Ast::Class(ast::Class::Bracketed(_))
            | Ast::Repetition(_)
            | Ast::Group(_)
            | Ast::Alternation(_)
            | Ast::Concat(_) => {
                self.decrement_depth();
                Ok(())
            }
        }
    }

    fn visit_class_set_item_pre(
        &mut self,
        ast: &ast::ClassSetItem,
    ) -> Result<()> {
        let span = match *ast {
            ast::ClassSetItem::Empty(_)
            | ast::ClassSetItem::Literal(_)
            | ast::ClassSetItem::Range(_)
            | ast::ClassSetItem::Ascii(_)
            | ast::ClassSetItem::Unicode(_)
            | ast::ClassSetItem::Perl(_) => {
                // These are all base cases, so we don't increment depth.
                return Ok(());
            }
            ast::ClassSetItem::Bracketed(ref x) => &x.span,
            ast::ClassSetItem::Union(ref x) => &x.span,
        };
        self.increment_depth(span)
    }

    fn visit_class_set_item_post(
        &mut self,
        ast: &ast::ClassSetItem,
    ) -> Result<()> {
        // Must mirror `visit_class_set_item_pre`.
        match *ast {
            ast::ClassSetItem::Empty(_)
            | ast::ClassSetItem::Literal(_)
            | ast::ClassSetItem::Range(_)
            | ast::ClassSetItem::Ascii(_)
            | ast::ClassSetItem::Unicode(_)
            | ast::ClassSetItem::Perl(_) => {
                // These are all base cases, so we don't decrement depth.
                Ok(())
            }
            ast::ClassSetItem::Bracketed(_)
            | ast::ClassSetItem::Union(_) => {
                self.decrement_depth();
                Ok(())
            }
        }
    }

    fn visit_class_set_binary_op_pre(
        &mut self,
        ast: &ast::ClassSetBinaryOp,
    ) -> Result<()> {
        // Every binary set operation counts as one level of nesting.
        self.increment_depth(&ast.span)
    }

    fn visit_class_set_binary_op_post(
        &mut self,
        _ast: &ast::ClassSetBinaryOp,
    ) -> Result<()> {
        self.decrement_depth();
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use std::ops::Range;

    use ast::{self, Ast, Position, Span};
    use super::{Parser, ParserI, ParserBuilder, Primitive};

    // Our own assert_eq, which has slightly better formatting (but honestly
    // still kind of crappy).
    macro_rules! assert_eq {
        ($left:expr, $right:expr) => ({
            match (&$left, &$right) {
                (left_val, right_val) => {
                    if !(*left_val == *right_val) {
                        panic!("assertion failed: `(left == right)`\n\n\
                               left: `{:?}`\nright: `{:?}`\n\n",
                               left_val, right_val)
                    }
                }
            }
        });
    }

    // We create these errors to compare with real ast::Errors in the tests.
    // We define equality between TestError and ast::Error to disregard the
    // pattern string in ast::Error, which is annoying to provide in tests.
+ #[derive(Clone, Debug)] + struct TestError { + span: Span, + kind: ast::ErrorKind, + } + + impl PartialEq for TestError { + fn eq(&self, other: &ast::Error) -> bool { + self.span == other.span && self.kind == other.kind + } + } + + impl PartialEq for ast::Error { + fn eq(&self, other: &TestError) -> bool { + self.span == other.span && self.kind == other.kind + } + } + + fn s(str: &str) -> String { + str.to_string() + } + + fn parser(pattern: &str) -> ParserI { + ParserI::new(Parser::new(), pattern) + } + + fn parser_octal(pattern: &str) -> ParserI { + let parser = ParserBuilder::new().octal(true).build(); + ParserI::new(parser, pattern) + } + + fn parser_nest_limit(pattern: &str, nest_limit: u32) -> ParserI { + let p = ParserBuilder::new().nest_limit(nest_limit).build(); + ParserI::new(p, pattern) + } + + fn parser_ignore_whitespace(pattern: &str) -> ParserI { + let p = ParserBuilder::new().ignore_whitespace(true).build(); + ParserI::new(p, pattern) + } + + /// Short alias for creating a new span. + fn nspan(start: Position, end: Position) -> Span { + Span::new(start, end) + } + + /// Short alias for creating a new position. + fn npos(offset: usize, line: usize, column: usize) -> Position { + Position::new(offset, line, column) + } + + /// Create a new span from the given offset range. This assumes a single + /// line and sets the columns based on the offsets. i.e., This only works + /// out of the box for ASCII, which is fine for most tests. + fn span(range: Range) -> Span { + let start = Position::new(range.start, 1, range.start + 1); + let end = Position::new(range.end, 1, range.end + 1); + Span::new(start, end) + } + + /// Create a new span for the corresponding byte range in the given string. 
+ fn span_range(subject: &str, range: Range) -> Span { + let start = Position { + offset: range.start, + line: 1 + subject[..range.start].matches('\n').count(), + column: 1 + subject[..range.start] + .chars() + .rev() + .position(|c| c == '\n') + .unwrap_or(subject[..range.start].chars().count()), + }; + let end = Position { + offset: range.end, + line: 1 + subject[..range.end].matches('\n').count(), + column: 1 + subject[..range.end] + .chars() + .rev() + .position(|c| c == '\n') + .unwrap_or(subject[..range.end].chars().count()), + }; + Span::new(start, end) + } + + /// Create a verbatim literal starting at the given position. + fn lit(c: char, start: usize) -> Ast { + lit_with(c, span(start..start + c.len_utf8())) + } + + /// Create a punctuation literal starting at the given position. + fn punct_lit(c: char, span: Span) -> Ast { + Ast::Literal(ast::Literal { + span: span, + kind: ast::LiteralKind::Punctuation, + c: c, + }) + } + + /// Create a verbatim literal with the given span. + fn lit_with(c: char, span: Span) -> Ast { + Ast::Literal(ast::Literal { + span: span, + kind: ast::LiteralKind::Verbatim, + c: c, + }) + } + + /// Create a concatenation with the given range. + fn concat(range: Range, asts: Vec) -> Ast { + concat_with(span(range), asts) + } + + /// Create a concatenation with the given span. + fn concat_with(span: Span, asts: Vec) -> Ast { + Ast::Concat(ast::Concat { span: span, asts: asts }) + } + + /// Create an alternation with the given span. + fn alt(range: Range, asts: Vec) -> Ast { + Ast::Alternation(ast::Alternation { span: span(range), asts: asts }) + } + + /// Create a capturing group with the given span. + fn group(range: Range, index: u32, ast: Ast) -> Ast { + Ast::Group(ast::Group { + span: span(range), + kind: ast::GroupKind::CaptureIndex(index), + ast: Box::new(ast), + }) + } + + /// Create an ast::SetFlags. + /// + /// The given pattern should be the full pattern string. 
The range given + /// should correspond to the byte offsets where the flag set occurs. + /// + /// If negated is true, then the set is interpreted as beginning with a + /// negation. + fn flag_set( + pat: &str, + range: Range, + flag: ast::Flag, + negated: bool, + ) -> Ast { + let mut items = vec![ + ast::FlagsItem { + span: span_range(pat, (range.end - 2)..(range.end - 1)), + kind: ast::FlagsItemKind::Flag(flag), + }, + ]; + if negated { + items.insert(0, ast::FlagsItem { + span: span_range(pat, (range.start + 2)..(range.end - 2)), + kind: ast::FlagsItemKind::Negation, + }); + } + Ast::Flags(ast::SetFlags { + span: span_range(pat, range.clone()), + flags: ast::Flags { + span: span_range(pat, (range.start + 2)..(range.end - 1)), + items: items, + }, + }) + } + + #[test] + fn parse_nest_limit() { + // A nest limit of 0 still allows some types of regexes. + assert_eq!( + parser_nest_limit("", 0).parse(), + Ok(Ast::Empty(span(0..0)))); + assert_eq!( + parser_nest_limit("a", 0).parse(), + Ok(lit('a', 0))); + + // Test repetition operations, which require one level of nesting. 
+ assert_eq!( + parser_nest_limit("a+", 0).parse().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::NestLimitExceeded(0), + }); + assert_eq!( + parser_nest_limit("a+", 1).parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::OneOrMore, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }))); + assert_eq!( + parser_nest_limit("(a)+", 1).parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::NestLimitExceeded(1), + }); + assert_eq!( + parser_nest_limit("a+*", 1).parse().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::NestLimitExceeded(1), + }); + assert_eq!( + parser_nest_limit("a+*", 2).parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..3), + op: ast::RepetitionOp { + span: span(2..3), + kind: ast::RepetitionKind::ZeroOrMore, + }, + greedy: true, + ast: Box::new(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::OneOrMore, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })), + }))); + + // Test concatenations. A concatenation requires one level of nesting. + assert_eq!( + parser_nest_limit("ab", 0).parse().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::NestLimitExceeded(0), + }); + assert_eq!( + parser_nest_limit("ab", 1).parse(), + Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)]))); + assert_eq!( + parser_nest_limit("abc", 1).parse(), + Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)]))); + + // Test alternations. An alternation requires one level of nesting. 
+ assert_eq!( + parser_nest_limit("a|b", 0).parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::NestLimitExceeded(0), + }); + assert_eq!( + parser_nest_limit("a|b", 1).parse(), + Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)]))); + assert_eq!( + parser_nest_limit("a|b|c", 1).parse(), + Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)]))); + + // Test character classes. Classes form their own mini-recursive + // syntax! + assert_eq!( + parser_nest_limit("[a]", 0).parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::NestLimitExceeded(0), + }); + assert_eq!( + parser_nest_limit("[a]", 1).parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..3), + negated: false, + kind: ast::ClassSet::Item( + ast::ClassSetItem::Literal(ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: 'a', + }) + ), + })))); + assert_eq!( + parser_nest_limit("[ab]", 1).parse().unwrap_err(), + TestError { + span: span(1..3), + kind: ast::ErrorKind::NestLimitExceeded(1), + }); + assert_eq!( + parser_nest_limit("[ab[cd]]", 2).parse().unwrap_err(), + TestError { + span: span(3..7), + kind: ast::ErrorKind::NestLimitExceeded(2), + }); + assert_eq!( + parser_nest_limit("[ab[cd]]", 3).parse().unwrap_err(), + TestError { + span: span(4..6), + kind: ast::ErrorKind::NestLimitExceeded(3), + }); + assert_eq!( + parser_nest_limit("[a--b]", 1).parse().unwrap_err(), + TestError { + span: span(1..5), + kind: ast::ErrorKind::NestLimitExceeded(1), + }); + assert_eq!( + parser_nest_limit("[a--bc]", 2).parse().unwrap_err(), + TestError { + span: span(4..6), + kind: ast::ErrorKind::NestLimitExceeded(2), + }); + } + + #[test] + fn parse_comments() { + let pat = "(?x) +# This is comment 1. +foo # This is comment 2. + # This is comment 3. 
+bar +# This is comment 4."; + let astc = parser(pat).parse_with_comments().unwrap(); + assert_eq!( + astc.ast, + concat_with(span_range(pat, 0..pat.len()), vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + lit_with('f', span_range(pat, 26..27)), + lit_with('o', span_range(pat, 27..28)), + lit_with('o', span_range(pat, 28..29)), + lit_with('b', span_range(pat, 74..75)), + lit_with('a', span_range(pat, 75..76)), + lit_with('r', span_range(pat, 76..77)), + ])); + assert_eq!(astc.comments, vec![ + ast::Comment { + span: span_range(pat, 5..26), + comment: s(" This is comment 1."), + }, + ast::Comment { + span: span_range(pat, 30..51), + comment: s(" This is comment 2."), + }, + ast::Comment { + span: span_range(pat, 53..74), + comment: s(" This is comment 3."), + }, + ast::Comment { + span: span_range(pat, 78..98), + comment: s(" This is comment 4."), + }, + ]); + } + + #[test] + fn parse_holistic() { + assert_eq!( + parser("]").parse(), + Ok(lit(']', 0))); + assert_eq!( + parser(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~").parse(), + Ok(concat(0..36, vec![ + punct_lit('\\', span(0..2)), + punct_lit('.', span(2..4)), + punct_lit('+', span(4..6)), + punct_lit('*', span(6..8)), + punct_lit('?', span(8..10)), + punct_lit('(', span(10..12)), + punct_lit(')', span(12..14)), + punct_lit('|', span(14..16)), + punct_lit('[', span(16..18)), + punct_lit(']', span(18..20)), + punct_lit('{', span(20..22)), + punct_lit('}', span(22..24)), + punct_lit('^', span(24..26)), + punct_lit('$', span(26..28)), + punct_lit('#', span(28..30)), + punct_lit('&', span(30..32)), + punct_lit('-', span(32..34)), + punct_lit('~', span(34..36)), + ]))); + } + + #[test] + fn parse_ignore_whitespace() { + // Test that basic whitespace insensitivity works. 
+ let pat = "(?x)a b"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with(nspan(npos(0, 1, 1), npos(7, 1, 8)), vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))), + lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))), + ]))); + + // Test that we can toggle whitespace insensitivity. + let pat = "(?x)a b(?-x)a b"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with(nspan(npos(0, 1, 1), npos(15, 1, 16)), vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))), + lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))), + flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true), + lit_with('a', nspan(npos(12, 1, 13), npos(13, 1, 14))), + lit_with(' ', nspan(npos(13, 1, 14), npos(14, 1, 15))), + lit_with('b', nspan(npos(14, 1, 15), npos(15, 1, 16))), + ]))); + + // Test that nesting whitespace insensitive flags works. + let pat = "a (?x:a )a "; + assert_eq!( + parser(pat).parse(), + Ok(concat_with(span_range(pat, 0..11), vec![ + lit_with('a', span_range(pat, 0..1)), + lit_with(' ', span_range(pat, 1..2)), + Ast::Group(ast::Group { + span: span_range(pat, 2..9), + kind: ast::GroupKind::NonCapturing(ast::Flags { + span: span_range(pat, 4..5), + items: vec![ + ast::FlagsItem { + span: span_range(pat, 4..5), + kind: ast::FlagsItemKind::Flag( + ast::Flag::IgnoreWhitespace), + }, + ], + }), + ast: Box::new(lit_with('a', span_range(pat, 6..7))), + }), + lit_with('a', span_range(pat, 9..10)), + lit_with(' ', span_range(pat, 10..11)), + ]))); + + // Test that whitespace after an opening paren is insignificant. 
+ let pat = "(?x)( ?P a )"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with(span_range(pat, 0..pat.len()), vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::Group(ast::Group { + span: span_range(pat, 4..pat.len()), + kind: ast::GroupKind::CaptureName(ast::CaptureName { + span: span_range(pat, 9..12), + name: s("foo"), + index: 1, + }), + ast: Box::new(lit_with('a', span_range(pat, 14..15))), + }), + ]))); + let pat = "(?x)( a )"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with(span_range(pat, 0..pat.len()), vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::Group(ast::Group { + span: span_range(pat, 4..pat.len()), + kind: ast::GroupKind::CaptureIndex(1), + ast: Box::new(lit_with('a', span_range(pat, 7..8))), + }), + ]))); + let pat = "(?x)( ?: a )"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with(span_range(pat, 0..pat.len()), vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::Group(ast::Group { + span: span_range(pat, 4..pat.len()), + kind: ast::GroupKind::NonCapturing(ast::Flags { + span: span_range(pat, 8..8), + items: vec![], + }), + ast: Box::new(lit_with('a', span_range(pat, 11..12))), + }), + ]))); + let pat = r"(?x)\x { 53 }"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with(span_range(pat, 0..pat.len()), vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::Literal(ast::Literal { + span: span(4..13), + kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), + c: 'S', + }), + ]))); + + // Test that whitespace after an escape is OK. + let pat = r"(?x)\ "; + assert_eq!( + parser(pat).parse(), + Ok(concat_with(span_range(pat, 0..pat.len()), vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::Literal(ast::Literal { + span: span_range(pat, 4..6), + kind: ast::LiteralKind::Special( + ast::SpecialLiteralKind::Space), + c: ' ', + }), + ]))); + // ... but only when `x` mode is enabled. 
+ let pat = r"\ "; + assert_eq!( + parser(pat).parse().unwrap_err(), + TestError { + span: span_range(pat, 0..2), + kind: ast::ErrorKind::EscapeUnrecognized, + }); + } + + #[test] + fn parse_newlines() { + let pat = ".\n."; + assert_eq!( + parser(pat).parse(), + Ok(concat_with(span_range(pat, 0..3), vec![ + Ast::Dot(span_range(pat, 0..1)), + lit_with('\n', span_range(pat, 1..2)), + Ast::Dot(span_range(pat, 2..3)), + ]))); + + let pat = "foobar\nbaz\nquux\n"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with(span_range(pat, 0..pat.len()), vec![ + lit_with('f', nspan(npos(0, 1, 1), npos(1, 1, 2))), + lit_with('o', nspan(npos(1, 1, 2), npos(2, 1, 3))), + lit_with('o', nspan(npos(2, 1, 3), npos(3, 1, 4))), + lit_with('b', nspan(npos(3, 1, 4), npos(4, 1, 5))), + lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))), + lit_with('r', nspan(npos(5, 1, 6), npos(6, 1, 7))), + lit_with('\n', nspan(npos(6, 1, 7), npos(7, 2, 1))), + lit_with('b', nspan(npos(7, 2, 1), npos(8, 2, 2))), + lit_with('a', nspan(npos(8, 2, 2), npos(9, 2, 3))), + lit_with('z', nspan(npos(9, 2, 3), npos(10, 2, 4))), + lit_with('\n', nspan(npos(10, 2, 4), npos(11, 3, 1))), + lit_with('q', nspan(npos(11, 3, 1), npos(12, 3, 2))), + lit_with('u', nspan(npos(12, 3, 2), npos(13, 3, 3))), + lit_with('u', nspan(npos(13, 3, 3), npos(14, 3, 4))), + lit_with('x', nspan(npos(14, 3, 4), npos(15, 3, 5))), + lit_with('\n', nspan(npos(15, 3, 5), npos(16, 4, 1))), + ]))); + } + + #[test] + fn parse_uncounted_repetition() { + assert_eq!( + parser(r"a*").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::ZeroOrMore, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }))); + assert_eq!( + parser(r"a+").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::OneOrMore, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }))); + + assert_eq!( + 
parser(r"a?").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }))); + assert_eq!( + parser(r"a??").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..3), + op: ast::RepetitionOp { + span: span(1..3), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: false, + ast: Box::new(lit('a', 0)), + }))); + assert_eq!( + parser(r"a?").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }))); + assert_eq!( + parser(r"a?b").parse(), + Ok(concat(0..3, vec![ + Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }), + lit('b', 2), + ]))); + assert_eq!( + parser(r"a??b").parse(), + Ok(concat(0..4, vec![ + Ast::Repetition(ast::Repetition { + span: span(0..3), + op: ast::RepetitionOp { + span: span(1..3), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: false, + ast: Box::new(lit('a', 0)), + }), + lit('b', 3), + ]))); + assert_eq!( + parser(r"ab?").parse(), + Ok(concat(0..3, vec![ + lit('a', 0), + Ast::Repetition(ast::Repetition { + span: span(1..3), + op: ast::RepetitionOp { + span: span(2..3), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('b', 1)), + }), + ]))); + assert_eq!( + parser(r"(ab)?").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..5), + op: ast::RepetitionOp { + span: span(4..5), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(group(0..4, 1, concat(1..3, vec![ + lit('a', 1), + lit('b', 2), + ]))), + }))); + assert_eq!( + parser(r"|a?").parse(), + Ok(alt(0..3, vec![ + Ast::Empty(span(0..0)), + Ast::Repetition(ast::Repetition 
{ + span: span(1..3), + op: ast::RepetitionOp { + span: span(2..3), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('a', 1)), + }), + ]))); + + assert_eq!( + parser(r"*").parse().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::RepetitionMissing, + }); + assert_eq!( + parser(r"(*)").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + }); + assert_eq!( + parser(r"(?:?)").parse().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::RepetitionMissing, + }); + assert_eq!( + parser(r"+").parse().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::RepetitionMissing, + }); + assert_eq!( + parser(r"?").parse().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::RepetitionMissing, + }); + assert_eq!( + parser(r"(?)").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + }); + assert_eq!( + parser(r"|*").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + }); + assert_eq!( + parser(r"|+").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + }); + assert_eq!( + parser(r"|?").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + }); + } + + #[test] + fn parse_counted_repetition() { + assert_eq!( + parser(r"a{5}").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..4), + op: ast::RepetitionOp { + span: span(1..4), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5)), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }))); + assert_eq!( + parser(r"a{5,}").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..5), + op: ast::RepetitionOp { + span: span(1..5), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::AtLeast(5)), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }))); 
+ assert_eq!( + parser(r"a{5,9}").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..6), + op: ast::RepetitionOp { + span: span(1..6), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(5, 9)), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }))); + assert_eq!( + parser(r"a{5}?").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..5), + op: ast::RepetitionOp { + span: span(1..5), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5)), + }, + greedy: false, + ast: Box::new(lit('a', 0)), + }))); + assert_eq!( + parser(r"ab{5}").parse(), + Ok(concat(0..5, vec![ + lit('a', 0), + Ast::Repetition(ast::Repetition { + span: span(1..5), + op: ast::RepetitionOp { + span: span(2..5), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5)), + }, + greedy: true, + ast: Box::new(lit('b', 1)), + }), + ]))); + assert_eq!( + parser(r"ab{5}c").parse(), + Ok(concat(0..6, vec![ + lit('a', 0), + Ast::Repetition(ast::Repetition { + span: span(1..5), + op: ast::RepetitionOp { + span: span(2..5), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5)), + }, + greedy: true, + ast: Box::new(lit('b', 1)), + }), + lit('c', 5), + ]))); + + assert_eq!( + parser(r"a{ 5 }").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..6), + op: ast::RepetitionOp { + span: span(1..6), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5)), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }))); + assert_eq!( + parser(r"a{ 5 , 9 }").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..10), + op: ast::RepetitionOp { + span: span(1..10), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(5, 9)), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }))); + assert_eq!( + parser_ignore_whitespace(r"a{5,9} ?").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..8), + op: ast::RepetitionOp { + span: span(1..8), + kind: ast::RepetitionKind::Range( + 
ast::RepetitionRange::Bounded(5, 9)), + }, + greedy: false, + ast: Box::new(lit('a', 0)), + }))); + + assert_eq!( + parser(r"a{").parse().unwrap_err(), + TestError { + span: span(1..2), + kind: ast::ErrorKind::RepetitionCountUnclosed, + }); + assert_eq!( + parser(r"a{}").parse().unwrap_err(), + TestError { + span: span(2..2), + kind: ast::ErrorKind::DecimalEmpty, + }); + assert_eq!( + parser(r"a{a").parse().unwrap_err(), + TestError { + span: span(2..2), + kind: ast::ErrorKind::DecimalEmpty, + }); + assert_eq!( + parser(r"a{9999999999}").parse().unwrap_err(), + TestError { + span: span(2..12), + kind: ast::ErrorKind::DecimalInvalid, + }); + assert_eq!( + parser(r"a{9").parse().unwrap_err(), + TestError { + span: span(1..3), + kind: ast::ErrorKind::RepetitionCountUnclosed, + }); + assert_eq!( + parser(r"a{9,a").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::DecimalEmpty, + }); + assert_eq!( + parser(r"a{9,9999999999}").parse().unwrap_err(), + TestError { + span: span(4..14), + kind: ast::ErrorKind::DecimalInvalid, + }); + assert_eq!( + parser(r"a{9,").parse().unwrap_err(), + TestError { + span: span(1..4), + kind: ast::ErrorKind::RepetitionCountUnclosed, + }); + assert_eq!( + parser(r"a{9,11").parse().unwrap_err(), + TestError { + span: span(1..6), + kind: ast::ErrorKind::RepetitionCountUnclosed, + }); + assert_eq!( + parser(r"a{2,1}").parse().unwrap_err(), + TestError { + span: span(1..6), + kind: ast::ErrorKind::RepetitionCountInvalid, + }); + assert_eq!( + parser(r"{5}").parse().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::RepetitionMissing, + }); + assert_eq!( + parser(r"|{5}").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + }); + } + + #[test] + fn parse_alternate() { + assert_eq!( + parser(r"a|b").parse(), + Ok(Ast::Alternation(ast::Alternation { + span: span(0..3), + asts: vec![lit('a', 0), lit('b', 2)], + }))); + assert_eq!( + 
parser(r"(a|b)").parse(), + Ok(group(0..5, 1, Ast::Alternation(ast::Alternation { + span: span(1..4), + asts: vec![lit('a', 1), lit('b', 3)], + })))); + + assert_eq!( + parser(r"a|b|c").parse(), + Ok(Ast::Alternation(ast::Alternation { + span: span(0..5), + asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)], + }))); + assert_eq!( + parser(r"ax|by|cz").parse(), + Ok(Ast::Alternation(ast::Alternation { + span: span(0..8), + asts: vec![ + concat(0..2, vec![lit('a', 0), lit('x', 1)]), + concat(3..5, vec![lit('b', 3), lit('y', 4)]), + concat(6..8, vec![lit('c', 6), lit('z', 7)]), + ], + }))); + assert_eq!( + parser(r"(ax|by|cz)").parse(), + Ok(group(0..10, 1, Ast::Alternation(ast::Alternation { + span: span(1..9), + asts: vec![ + concat(1..3, vec![lit('a', 1), lit('x', 2)]), + concat(4..6, vec![lit('b', 4), lit('y', 5)]), + concat(7..9, vec![lit('c', 7), lit('z', 8)]), + ], + })))); + assert_eq!( + parser(r"(ax|(by|(cz)))").parse(), + Ok(group(0..14, 1, alt(1..13, vec![ + concat(1..3, vec![lit('a', 1), lit('x', 2)]), + group(4..13, 2, alt(5..12, vec![ + concat(5..7, vec![lit('b', 5), lit('y', 6)]), + group(8..12, 3, concat(9..11, vec![ + lit('c', 9), + lit('z', 10), + ])), + ])), + ])))); + + assert_eq!( + parser(r"|").parse(), Ok(alt(0..1, vec![ + Ast::Empty(span(0..0)), Ast::Empty(span(1..1)), + ]))); + assert_eq!( + parser(r"||").parse(), Ok(alt(0..2, vec![ + Ast::Empty(span(0..0)), + Ast::Empty(span(1..1)), + Ast::Empty(span(2..2)), + ]))); + assert_eq!( + parser(r"a|").parse(), Ok(alt(0..2, vec![ + lit('a', 0), Ast::Empty(span(2..2)), + ]))); + assert_eq!( + parser(r"|a").parse(), Ok(alt(0..2, vec![ + Ast::Empty(span(0..0)), lit('a', 1), + ]))); + + assert_eq!( + parser(r"(|)").parse(), Ok(group(0..3, 1, alt(1..2, vec![ + Ast::Empty(span(1..1)), Ast::Empty(span(2..2)), + ])))); + assert_eq!( + parser(r"(a|)").parse(), Ok(group(0..4, 1, alt(1..3, vec![ + lit('a', 1), Ast::Empty(span(3..3)), + ])))); + assert_eq!( + parser(r"(|a)").parse(), Ok(group(0..4, 1, alt(1..3, 
vec![ + Ast::Empty(span(1..1)), lit('a', 2), + ])))); + + assert_eq!( + parser(r"a|b)").parse().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::GroupUnopened, + }); + assert_eq!( + parser(r"(a|b").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::GroupUnclosed, + }); + } + + #[test] + fn parse_unsupported_lookaround() { + assert_eq!( + parser(r"(?=a)").parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::UnsupportedLookAround, + }); + assert_eq!( + parser(r"(?!a)").parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::UnsupportedLookAround, + }); + assert_eq!( + parser(r"(?<=a)").parse().unwrap_err(), + TestError { + span: span(0..4), + kind: ast::ErrorKind::UnsupportedLookAround, + }); + assert_eq!( + parser(r"(?z)").parse(), Ok(Ast::Group(ast::Group { + span: span(0..8), + kind: ast::GroupKind::CaptureName(ast::CaptureName { + span: span(4..5), + name: s("a"), + index: 1, + }), + ast: Box::new(lit('z', 6)), + }))); + assert_eq!(parser("(?Pz)").parse(), Ok(Ast::Group(ast::Group { + span: span(0..10), + kind: ast::GroupKind::CaptureName(ast::CaptureName { + span: span(4..7), + name: s("abc"), + index: 1, + }), + ast: Box::new(lit('z', 8)), + }))); + + assert_eq!( + parser("(?P<").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::GroupNameUnexpectedEof, + }); + assert_eq!( + parser("(?P<>z)").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::GroupNameEmpty, + }); + assert_eq!( + parser("(?Py)(?Pz)").parse().unwrap_err(), + TestError { + span: span(12..13), + kind: ast::ErrorKind::GroupNameDuplicate { + original: span(4..5), + }, + }); + } + + #[test] + fn parse_flags() { + assert_eq!(parser("i:").parse_flags(), Ok(ast::Flags { + span: span(0..1), + items: vec![ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), + }], + })); + 
assert_eq!(parser("i)").parse_flags(), Ok(ast::Flags { + span: span(0..1), + items: vec![ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), + }], + })); + + assert_eq!(parser("isU:").parse_flags(), Ok(ast::Flags { + span: span(0..3), + items: vec![ + ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), + }, + ast::FlagsItem { + span: span(1..2), + kind: ast::FlagsItemKind::Flag( + ast::Flag::DotMatchesNewLine), + }, + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), + }, + ], + })); + + assert_eq!(parser("-isU:").parse_flags(), Ok(ast::Flags { + span: span(0..4), + items: vec![ + ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Negation, + }, + ast::FlagsItem { + span: span(1..2), + kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), + }, + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag( + ast::Flag::DotMatchesNewLine), + }, + ast::FlagsItem { + span: span(3..4), + kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), + }, + ], + })); + assert_eq!(parser("i-sU:").parse_flags(), Ok(ast::Flags { + span: span(0..4), + items: vec![ + ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), + }, + ast::FlagsItem { + span: span(1..2), + kind: ast::FlagsItemKind::Negation, + }, + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag( + ast::Flag::DotMatchesNewLine), + }, + ast::FlagsItem { + span: span(3..4), + kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), + }, + ], + })); + + assert_eq!( + parser("isU").parse_flags().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::FlagUnexpectedEof, + }); + assert_eq!( + parser("isUa:").parse_flags().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::FlagUnrecognized, + }); + assert_eq!( + parser("isUi:").parse_flags().unwrap_err(), + TestError { 
+ span: span(3..4), + kind: ast::ErrorKind::FlagDuplicate { + original: span(0..1), + }, + }); + assert_eq!( + parser("i-sU-i:").parse_flags().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::FlagRepeatedNegation { + original: span(1..2), + }, + }); + assert_eq!( + parser("-)").parse_flags().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::FlagDanglingNegation, + }); + assert_eq!( + parser("i-)").parse_flags().unwrap_err(), + TestError { + span: span(1..2), + kind: ast::ErrorKind::FlagDanglingNegation, + }); + assert_eq!( + parser("iU-)").parse_flags().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::FlagDanglingNegation, + }); + } + + #[test] + fn parse_flag() { + assert_eq!(parser("i").parse_flag(), Ok(ast::Flag::CaseInsensitive)); + assert_eq!(parser("m").parse_flag(), Ok(ast::Flag::MultiLine)); + assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine)); + assert_eq!(parser("U").parse_flag(), Ok(ast::Flag::SwapGreed)); + assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode)); + assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace)); + + assert_eq!( + parser("a").parse_flag().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::FlagUnrecognized, + }); + assert_eq!( + parser("☃").parse_flag().unwrap_err(), + TestError { + span: span_range("☃", 0..3), + kind: ast::ErrorKind::FlagUnrecognized, + }); + } + + #[test] + fn parse_primitive_non_escape() { + assert_eq!( + parser(r".").parse_primitive(), + Ok(Primitive::Dot(span(0..1)))); + assert_eq!( + parser(r"^").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..1), + kind: ast::AssertionKind::StartLine, + }))); + assert_eq!( + parser(r"$").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..1), + kind: ast::AssertionKind::EndLine, + }))); + + assert_eq!( + parser(r"a").parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: 
span(0..1), + kind: ast::LiteralKind::Verbatim, + c: 'a', + }))); + assert_eq!( + parser(r"|").parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..1), + kind: ast::LiteralKind::Verbatim, + c: '|', + }))); + assert_eq!( + parser(r"☃").parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span_range("☃", 0..3), + kind: ast::LiteralKind::Verbatim, + c: '☃', + }))); + } + + #[test] + fn parse_escape() { + assert_eq!( + parser(r"\|").parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..2), + kind: ast::LiteralKind::Punctuation, + c: '|', + }))); + let specials = &[ + (r"\a", '\x07', ast::SpecialLiteralKind::Bell), + (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed), + (r"\t", '\t', ast::SpecialLiteralKind::Tab), + (r"\n", '\n', ast::SpecialLiteralKind::LineFeed), + (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn), + (r"\v", '\x0B', ast::SpecialLiteralKind::VerticalTab), + ]; + for &(pat, c, ref kind) in specials { + assert_eq!( + parser(pat).parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..2), + kind: ast::LiteralKind::Special(kind.clone()), + c: c, + }))); + } + assert_eq!( + parser(r"\A").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::StartText, + }))); + assert_eq!( + parser(r"\z").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::EndText, + }))); + assert_eq!( + parser(r"\b").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundary, + }))); + assert_eq!( + parser(r"\B").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::NotWordBoundary, + }))); + + assert_eq!( + parser(r"\").parse_escape().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::EscapeUnexpectedEof, + }); + assert_eq!( + 
parser(r"\y").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::EscapeUnrecognized, + }); + } + + #[test] + fn parse_unsupported_backreference() { + assert_eq!( + parser(r"\0").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::UnsupportedBackreference, + }); + assert_eq!( + parser(r"\9").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::UnsupportedBackreference, + }); + } + + #[test] + fn parse_octal() { + for i in 0..511 { + let pat = format!(r"\{:o}", i); + assert_eq!( + parser_octal(&pat).parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..pat.len()), + kind: ast::LiteralKind::Octal, + c: ::std::char::from_u32(i).unwrap(), + }))); + } + assert_eq!( + parser_octal(r"\778").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..3), + kind: ast::LiteralKind::Octal, + c: '?', + }))); + assert_eq!( + parser_octal(r"\7777").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..4), + kind: ast::LiteralKind::Octal, + c: '\u{01FF}', + }))); + assert_eq!( + parser_octal(r"\778").parse(), + Ok(Ast::Concat(ast::Concat { + span: span(0..4), + asts: vec![ + Ast::Literal(ast::Literal { + span: span(0..3), + kind: ast::LiteralKind::Octal, + c: '?', + }), + Ast::Literal(ast::Literal { + span: span(3..4), + kind: ast::LiteralKind::Verbatim, + c: '8', + }), + ], + }))); + assert_eq!( + parser_octal(r"\7777").parse(), + Ok(Ast::Concat(ast::Concat { + span: span(0..5), + asts: vec![ + Ast::Literal(ast::Literal { + span: span(0..4), + kind: ast::LiteralKind::Octal, + c: '\u{01FF}', + }), + Ast::Literal(ast::Literal { + span: span(4..5), + kind: ast::LiteralKind::Verbatim, + c: '7', + }), + ], + }))); + + assert_eq!( + parser_octal(r"\8").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::EscapeUnrecognized, + }); + } + + #[test] + fn parse_hex_two() { + for i in 0..256 { + let pat = 
format!(r"\x{:02x}", i); + assert_eq!( + parser(&pat).parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..pat.len()), + kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X), + c: ::std::char::from_u32(i).unwrap(), + }))); + } + + assert_eq!( + parser(r"\xF").parse_escape().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + }); + assert_eq!( + parser(r"\xG").parse_escape().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\xFG").parse_escape().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + } + + #[test] + fn parse_hex_four() { + for i in 0..65536 { + let c = match ::std::char::from_u32(i) { + None => continue, + Some(c) => c, + }; + let pat = format!(r"\u{:04x}", i); + assert_eq!( + parser(&pat).parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..pat.len()), + kind: ast::LiteralKind::HexFixed( + ast::HexLiteralKind::UnicodeShort), + c: c, + }))); + } + + assert_eq!( + parser(r"\uF").parse_escape().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + }); + assert_eq!( + parser(r"\uG").parse_escape().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\uFG").parse_escape().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\uFFG").parse_escape().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\uFFFG").parse_escape().unwrap_err(), + TestError { + span: span(5..6), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\uD800").parse_escape().unwrap_err(), + TestError { + span: span(2..6), + kind: ast::ErrorKind::EscapeHexInvalid, + }); + } + + #[test] + fn 
parse_hex_eight() { + for i in 0..65536 { + let c = match ::std::char::from_u32(i) { + None => continue, + Some(c) => c, + }; + let pat = format!(r"\U{:08x}", i); + assert_eq!( + parser(&pat).parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..pat.len()), + kind: ast::LiteralKind::HexFixed( + ast::HexLiteralKind::UnicodeLong), + c: c, + }))); + } + + assert_eq!( + parser(r"\UF").parse_escape().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + }); + assert_eq!( + parser(r"\UG").parse_escape().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\UFG").parse_escape().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\UFFG").parse_escape().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\UFFFG").parse_escape().unwrap_err(), + TestError { + span: span(5..6), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\UFFFFG").parse_escape().unwrap_err(), + TestError { + span: span(6..7), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\UFFFFFG").parse_escape().unwrap_err(), + TestError { + span: span(7..8), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\UFFFFFFG").parse_escape().unwrap_err(), + TestError { + span: span(8..9), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\UFFFFFFFG").parse_escape().unwrap_err(), + TestError { + span: span(9..10), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + } + + #[test] + fn parse_hex_brace() { + assert_eq!( + parser(r"\u{26c4}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..8), + kind: ast::LiteralKind::HexBrace( + ast::HexLiteralKind::UnicodeShort), + c: '⛄', + }))); + assert_eq!( + 
parser(r"\U{26c4}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..8), + kind: ast::LiteralKind::HexBrace( + ast::HexLiteralKind::UnicodeLong), + c: '⛄', + }))); + assert_eq!( + parser(r"\x{26c4}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..8), + kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), + c: '⛄', + }))); + assert_eq!( + parser(r"\x{26C4}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..8), + kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), + c: '⛄', + }))); + assert_eq!( + parser(r"\x{10fFfF}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..10), + kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), + c: '\u{10FFFF}', + }))); + + assert_eq!( + parser(r"\x").parse_escape().unwrap_err(), + TestError { + span: span(2..2), + kind: ast::ErrorKind::EscapeUnexpectedEof, + }); + assert_eq!( + parser(r"\x{").parse_escape().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + }); + assert_eq!( + parser(r"\x{FF").parse_escape().unwrap_err(), + TestError { + span: span(2..5), + kind: ast::ErrorKind::EscapeUnexpectedEof, + }); + assert_eq!( + parser(r"\x{}").parse_escape().unwrap_err(), + TestError { + span: span(2..4), + kind: ast::ErrorKind::EscapeHexEmpty, + }); + assert_eq!( + parser(r"\x{FGF}").parse_escape().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + }); + assert_eq!( + parser(r"\x{FFFFFF}").parse_escape().unwrap_err(), + TestError { + span: span(3..9), + kind: ast::ErrorKind::EscapeHexInvalid, + }); + assert_eq!( + parser(r"\x{D800}").parse_escape().unwrap_err(), + TestError { + span: span(3..7), + kind: ast::ErrorKind::EscapeHexInvalid, + }); + assert_eq!( + parser(r"\x{FFFFFFFFF}").parse_escape().unwrap_err(), + TestError { + span: span(3..12), + kind: ast::ErrorKind::EscapeHexInvalid, + }); + } + + #[test] + fn parse_decimal() { + 
assert_eq!(parser("123").parse_decimal(), Ok(123)); + assert_eq!(parser("0").parse_decimal(), Ok(0)); + assert_eq!(parser("01").parse_decimal(), Ok(1)); + + assert_eq!( + parser("-1").parse_decimal().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::DecimalEmpty, + }); + assert_eq!( + parser("").parse_decimal().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::DecimalEmpty, + }); + assert_eq!( + parser("9999999999").parse_decimal().unwrap_err(), + TestError { + span: span(0..10), + kind: ast::ErrorKind::DecimalInvalid, + }); + } + + #[test] + fn parse_set_class() { + fn union(span: Span, items: Vec) -> ast::ClassSet { + ast::ClassSet::union(ast::ClassSetUnion { + span: span, + items: items, + }) + } + + fn intersection( + span: Span, + lhs: ast::ClassSet, + rhs: ast::ClassSet, + ) -> ast::ClassSet { + ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { + span: span, + kind: ast::ClassSetBinaryOpKind::Intersection, + lhs: Box::new(lhs), + rhs: Box::new(rhs), + }) + } + + fn difference( + span: Span, + lhs: ast::ClassSet, + rhs: ast::ClassSet, + ) -> ast::ClassSet { + ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { + span: span, + kind: ast::ClassSetBinaryOpKind::Difference, + lhs: Box::new(lhs), + rhs: Box::new(rhs), + }) + } + + fn symdifference( + span: Span, + lhs: ast::ClassSet, + rhs: ast::ClassSet, + ) -> ast::ClassSet { + ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { + span: span, + kind: ast::ClassSetBinaryOpKind::SymmetricDifference, + lhs: Box::new(lhs), + rhs: Box::new(rhs), + }) + } + + fn itemset(item: ast::ClassSetItem) -> ast::ClassSet { + ast::ClassSet::Item(item) + } + + fn item_ascii(cls: ast::ClassAscii) -> ast::ClassSetItem { + ast::ClassSetItem::Ascii(cls) + } + + fn item_unicode(cls: ast::ClassUnicode) -> ast::ClassSetItem { + ast::ClassSetItem::Unicode(cls) + } + + fn item_perl(cls: ast::ClassPerl) -> ast::ClassSetItem { + ast::ClassSetItem::Perl(cls) + } + + fn item_bracket(cls: ast::ClassBracketed) 
-> ast::ClassSetItem { + ast::ClassSetItem::Bracketed(Box::new(cls)) + } + + fn lit(span: Span, c: char) -> ast::ClassSetItem { + ast::ClassSetItem::Literal(ast::Literal { + span: span, + kind: ast::LiteralKind::Verbatim, + c: c, + }) + } + + fn empty(span: Span) -> ast::ClassSetItem { + ast::ClassSetItem::Empty(span) + } + + fn range(span: Span, start: char, end: char) -> ast::ClassSetItem { + let pos1 = Position { + offset: span.start.offset + start.len_utf8(), + column: span.start.column + 1, + ..span.start + }; + let pos2 = Position { + offset: span.end.offset - end.len_utf8(), + column: span.end.column - 1, + ..span.end + }; + ast::ClassSetItem::Range(ast::ClassSetRange { + span: span, + start: ast::Literal { + span: Span { end: pos1, ..span }, + kind: ast::LiteralKind::Verbatim, + c: start, + }, + end: ast::Literal { + span: Span { start: pos2, ..span }, + kind: ast::LiteralKind::Verbatim, + c: end, + }, + }) + } + + fn alnum(span: Span, negated: bool) -> ast::ClassAscii { + ast::ClassAscii { + span: span, + kind: ast::ClassAsciiKind::Alnum, + negated: negated, + } + } + + fn lower(span: Span, negated: bool) -> ast::ClassAscii { + ast::ClassAscii { + span: span, + kind: ast::ClassAsciiKind::Lower, + negated: negated, + } + } + + assert_eq!( + parser("[[:alnum:]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..11), + negated: false, + kind: itemset(item_ascii(alnum(span(1..10), false))), + })))); + assert_eq!( + parser("[[[:alnum:]]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..13), + negated: false, + kind: itemset(item_bracket(ast::ClassBracketed { + span: span(1..12), + negated: false, + kind: itemset(item_ascii(alnum(span(2..11), false))), + })), + })))); + assert_eq!( + parser("[[:alnum:]&&[:lower:]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..22), + negated: false, + kind: intersection( + span(1..21), + 
itemset(item_ascii(alnum(span(1..10), false))), + itemset(item_ascii(lower(span(12..21), false))), + ), + })))); + assert_eq!( + parser("[[:alnum:]--[:lower:]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..22), + negated: false, + kind: difference( + span(1..21), + itemset(item_ascii(alnum(span(1..10), false))), + itemset(item_ascii(lower(span(12..21), false))), + ), + })))); + assert_eq!( + parser("[[:alnum:]~~[:lower:]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..22), + negated: false, + kind: symdifference( + span(1..21), + itemset(item_ascii(alnum(span(1..10), false))), + itemset(item_ascii(lower(span(12..21), false))), + ), + })))); + + assert_eq!( + parser("[a]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..3), + negated: false, + kind: itemset(lit(span(1..2), 'a')), + })))); + assert_eq!( + parser(r"[a\]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..5), + negated: false, + kind: union(span(1..4), vec![ + lit(span(1..2), 'a'), + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..4), + kind: ast::LiteralKind::Punctuation, + c: ']', + }), + ]), + })))); + assert_eq!( + parser(r"[a\-z]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..6), + negated: false, + kind: union(span(1..5), vec![ + lit(span(1..2), 'a'), + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..4), + kind: ast::LiteralKind::Punctuation, + c: '-', + }), + lit(span(4..5), 'z'), + ]), + })))); + assert_eq!( + parser("[ab]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: union(span(1..3), vec![ + lit(span(1..2), 'a'), + lit(span(2..3), 'b'), + ]), + })))); + assert_eq!( + parser("[a-]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: union(span(1..3), 
vec![ + lit(span(1..2), 'a'), + lit(span(2..3), '-'), + ]), + })))); + assert_eq!( + parser("[-a]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: union(span(1..3), vec![ + lit(span(1..2), '-'), + lit(span(2..3), 'a'), + ]), + })))); + assert_eq!( + parser(r"[\pL]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..5), + negated: false, + kind: itemset(item_unicode(ast::ClassUnicode { + span: span(1..4), + negated: false, + kind: ast::ClassUnicodeKind::OneLetter('L'), + })), + })))); + assert_eq!( + parser(r"[\w]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: itemset(item_perl(ast::ClassPerl { + span: span(1..3), + kind: ast::ClassPerlKind::Word, + negated: false, + })), + })))); + assert_eq!( + parser(r"[a\wz]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..6), + negated: false, + kind: union(span(1..5), vec![ + lit(span(1..2), 'a'), + item_perl(ast::ClassPerl { + span: span(2..4), + kind: ast::ClassPerlKind::Word, + negated: false, + }), + lit(span(4..5), 'z'), + ]), + })))); + + assert_eq!( + parser("[a-z]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..5), + negated: false, + kind: itemset(range(span(1..4), 'a', 'z')), + })))); + assert_eq!( + parser("[a-cx-z]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..8), + negated: false, + kind: union(span(1..7), vec![ + range(span(1..4), 'a', 'c'), + range(span(4..7), 'x', 'z'), + ]), + })))); + assert_eq!( + parser(r"[\w&&a-cx-z]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..12), + negated: false, + kind: intersection( + span(1..11), + itemset(item_perl(ast::ClassPerl { + span: span(1..3), + kind: ast::ClassPerlKind::Word, + negated: false, + })), + union(span(5..11), vec![ + range(span(5..8), 
'a', 'c'), + range(span(8..11), 'x', 'z'), + ]), + ), + })))); + assert_eq!( + parser(r"[a-cx-z&&\w]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..12), + negated: false, + kind: intersection( + span(1..11), + union(span(1..7), vec![ + range(span(1..4), 'a', 'c'), + range(span(4..7), 'x', 'z'), + ]), + itemset(item_perl(ast::ClassPerl { + span: span(9..11), + kind: ast::ClassPerlKind::Word, + negated: false, + })), + ), + })))); + assert_eq!( + parser(r"[a--b--c]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..9), + negated: false, + kind: difference( + span(1..8), + difference( + span(1..5), + itemset(lit(span(1..2), 'a')), + itemset(lit(span(4..5), 'b')), + ), + itemset(lit(span(7..8), 'c')), + ), + })))); + assert_eq!( + parser(r"[a~~b~~c]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..9), + negated: false, + kind: symdifference( + span(1..8), + symdifference( + span(1..5), + itemset(lit(span(1..2), 'a')), + itemset(lit(span(4..5), 'b')), + ), + itemset(lit(span(7..8), 'c')), + ), + })))); + assert_eq!( + parser(r"[\^&&^]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..7), + negated: false, + kind: intersection( + span(1..6), + itemset(ast::ClassSetItem::Literal(ast::Literal { + span: span(1..3), + kind: ast::LiteralKind::Punctuation, + c: '^', + })), + itemset(lit(span(5..6), '^')), + ), + })))); + assert_eq!( + parser(r"[\&&&&]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..7), + negated: false, + kind: intersection( + span(1..6), + itemset(ast::ClassSetItem::Literal(ast::Literal { + span: span(1..3), + kind: ast::LiteralKind::Punctuation, + c: '&', + })), + itemset(lit(span(5..6), '&')), + ), + })))); + assert_eq!( + parser(r"[&&&&]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..6), + negated: false, + kind: intersection( + 
span(1..5), + intersection( + span(1..3), + itemset(empty(span(1..1))), + itemset(empty(span(3..3))), + ), + itemset(empty(span(5..5))), + ), + })))); + + let pat = "[☃-⛄]"; + assert_eq!( + parser(pat).parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span_range(pat, 0..9), + negated: false, + kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange { + span: span_range(pat, 1..8), + start: ast::Literal { + span: span_range(pat, 1..4), + kind: ast::LiteralKind::Verbatim, + c: '☃', + }, + end: ast::Literal { + span: span_range(pat, 5..8), + kind: ast::LiteralKind::Verbatim, + c: '⛄', + }, + })), + })))); + + assert_eq!( + parser(r"[]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..3), + negated: false, + kind: itemset(lit(span(1..2), ']')), + })))); + assert_eq!( + parser(r"[]\[]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..5), + negated: false, + kind: union(span(1..4), vec![ + lit(span(1..2), ']'), + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..4), + kind: ast::LiteralKind::Punctuation, + c: '[', + }), + ]), + })))); + assert_eq!( + parser(r"[\[]]").parse(), + Ok(concat(0..5, vec![ + Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: itemset(ast::ClassSetItem::Literal(ast::Literal { + span: span(1..3), + kind: ast::LiteralKind::Punctuation, + c: '[', + })), + })), + Ast::Literal(ast::Literal { + span: span(4..5), + kind: ast::LiteralKind::Verbatim, + c: ']', + }), + ]))); + + assert_eq!( + parser("[").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + }); + assert_eq!( + parser("[[").parse().unwrap_err(), + TestError { + span: span(1..2), + kind: ast::ErrorKind::ClassUnclosed, + }); + assert_eq!( + parser("[[-]").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + }); + assert_eq!( + 
parser("[[[:alnum:]").parse().unwrap_err(), + TestError { + span: span(1..2), + kind: ast::ErrorKind::ClassUnclosed, + }); + assert_eq!( + parser(r"[\b]").parse().unwrap_err(), + TestError { + span: span(1..3), + kind: ast::ErrorKind::ClassEscapeInvalid, + }); + assert_eq!( + parser(r"[\w-a]").parse().unwrap_err(), + TestError { + span: span(1..3), + kind: ast::ErrorKind::ClassEscapeInvalid, + }); + assert_eq!( + parser(r"[a-\w]").parse().unwrap_err(), + TestError { + span: span(3..5), + kind: ast::ErrorKind::ClassEscapeInvalid, + }); + assert_eq!( + parser(r"[z-a]").parse().unwrap_err(), + TestError { + span: span(1..4), + kind: ast::ErrorKind::ClassRangeInvalid, + }); + + assert_eq!( + parser_ignore_whitespace("[a ").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + }); + assert_eq!( + parser_ignore_whitespace("[a- ").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + }); + } + + #[test] + fn parse_set_class_open() { + assert_eq!( + parser("[a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..1), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }; + Ok((set, union)) + }); + assert_eq!( + parser_ignore_whitespace("[ a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(4..4), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(4..4), + items: vec![], + }; + Ok((set, union)) + }); + assert_eq!( + parser("[^a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..2), + negated: true, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(2..2), + items: vec![], 
+ }; + Ok((set, union)) + }); + assert_eq!( + parser_ignore_whitespace("[ ^ a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..4), + negated: true, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(4..4), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(4..4), + items: vec![], + }; + Ok((set, union)) + }); + assert_eq!( + parser("[-a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..2), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(1..2), + items: vec![ + ast::ClassSetItem::Literal(ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: '-', + }), + ], + }; + Ok((set, union)) + }); + assert_eq!( + parser_ignore_whitespace("[ - a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(2..3), + items: vec![ + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: '-', + }), + ], + }; + Ok((set, union)) + }); + assert_eq!( + parser("[^-a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..3), + negated: true, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(2..3), + items: vec![ + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: '-', + }), + ], + }; + Ok((set, union)) + }); + assert_eq!( + parser("[--a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..3), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = 
ast::ClassSetUnion { + span: span(1..3), + items: vec![ + ast::ClassSetItem::Literal(ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: '-', + }), + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: '-', + }), + ], + }; + Ok((set, union)) + }); + assert_eq!( + parser("[]a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..2), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(1..2), + items: vec![ + ast::ClassSetItem::Literal(ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: ']', + }), + ], + }; + Ok((set, union)) + }); + assert_eq!( + parser_ignore_whitespace("[ ] a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(2..3), + items: vec![ + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: ']', + }), + ], + }; + Ok((set, union)) + }); + assert_eq!( + parser("[^]a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..3), + negated: true, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(2..3), + items: vec![ + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: ']', + }), + ], + }; + Ok((set, union)) + }); + assert_eq!( + parser("[-]a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..2), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(1..2), + items: vec![ + 
ast::ClassSetItem::Literal(ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: '-', + }), + ], + }; + Ok((set, union)) + }); + + assert_eq!( + parser("[").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + }); + assert_eq!( + parser_ignore_whitespace("[ ") + .parse_set_class_open() + .unwrap_err(), + TestError { + span: span(0..5), + kind: ast::ErrorKind::ClassUnclosed, + }); + assert_eq!( + parser("[^").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::ClassUnclosed, + }); + assert_eq!( + parser("[]").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::ClassUnclosed, + }); + assert_eq!( + parser("[-").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::ClassUnclosed, + }); + assert_eq!( + parser("[--").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::ClassUnclosed, + }); + } + + #[test] + fn maybe_parse_ascii_class() { + assert_eq!( + parser(r"[:alnum:]").maybe_parse_ascii_class(), + Some(ast::ClassAscii { + span: span(0..9), + kind: ast::ClassAsciiKind::Alnum, + negated: false, + })); + assert_eq!( + parser(r"[:alnum:]A").maybe_parse_ascii_class(), + Some(ast::ClassAscii { + span: span(0..9), + kind: ast::ClassAsciiKind::Alnum, + negated: false, + })); + assert_eq!( + parser(r"[:^alnum:]").maybe_parse_ascii_class(), + Some(ast::ClassAscii { + span: span(0..10), + kind: ast::ClassAsciiKind::Alnum, + negated: true, + })); + + let p = parser(r"[:"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[:^"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[^:alnum:]"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[:alnnum:]"); + 
assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[:alnum]"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[:alnum:"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + } + + #[test] + fn parse_unicode_class() { + assert_eq!( + parser(r"\pN").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..3), + negated: false, + kind: ast::ClassUnicodeKind::OneLetter('N'), + }))); + assert_eq!( + parser(r"\PN").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..3), + negated: true, + kind: ast::ClassUnicodeKind::OneLetter('N'), + }))); + assert_eq!( + parser(r"\p{N}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..5), + negated: false, + kind: ast::ClassUnicodeKind::Named(s("N")), + }))); + assert_eq!( + parser(r"\P{N}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..5), + negated: true, + kind: ast::ClassUnicodeKind::Named(s("N")), + }))); + assert_eq!( + parser(r"\p{Greek}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..9), + negated: false, + kind: ast::ClassUnicodeKind::Named(s("Greek")), + }))); + + assert_eq!( + parser(r"\p{scx:Katakana}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..16), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Colon, + name: s("scx"), + value: s("Katakana"), + }, + }))); + assert_eq!( + parser(r"\p{scx=Katakana}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..16), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Equal, + name: s("scx"), + value: s("Katakana"), + }, + }))); + assert_eq!( + parser(r"\p{scx!=Katakana}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..17), + negated: false, + kind: 
ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::NotEqual, + name: s("scx"), + value: s("Katakana"), + }, + }))); + + assert_eq!( + parser(r"\p{:}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..5), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Colon, + name: s(""), + value: s(""), + }, + }))); + assert_eq!( + parser(r"\p{=}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..5), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Equal, + name: s(""), + value: s(""), + }, + }))); + assert_eq!( + parser(r"\p{!=}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..6), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::NotEqual, + name: s(""), + value: s(""), + }, + }))); + + assert_eq!( + parser(r"\p").parse_escape().unwrap_err(), + TestError { + span: span(2..2), + kind: ast::ErrorKind::EscapeUnexpectedEof, + }); + assert_eq!( + parser(r"\p{").parse_escape().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + }); + assert_eq!( + parser(r"\p{N").parse_escape().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::EscapeUnexpectedEof, + }); + assert_eq!( + parser(r"\p{Greek").parse_escape().unwrap_err(), + TestError { + span: span(8..8), + kind: ast::ErrorKind::EscapeUnexpectedEof, + }); + + assert_eq!( + parser(r"\pNz").parse(), + Ok(Ast::Concat(ast::Concat { + span: span(0..4), + asts: vec![ + Ast::Class(ast::Class::Unicode(ast::ClassUnicode { + span: span(0..3), + negated: false, + kind: ast::ClassUnicodeKind::OneLetter('N'), + })), + Ast::Literal(ast::Literal { + span: span(3..4), + kind: ast::LiteralKind::Verbatim, + c: 'z', + }), + ], + }))); + assert_eq!( + parser(r"\p{Greek}z").parse(), + Ok(Ast::Concat(ast::Concat { + span: span(0..10), + asts: vec![ + 
Ast::Class(ast::Class::Unicode(ast::ClassUnicode { + span: span(0..9), + negated: false, + kind: ast::ClassUnicodeKind::Named(s("Greek")), + })), + Ast::Literal(ast::Literal { + span: span(9..10), + kind: ast::LiteralKind::Verbatim, + c: 'z', + }), + ], + }))); + } + + #[test] + fn parse_perl_class() { + assert_eq!( + parser(r"\d").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Digit, + negated: false, + }))); + assert_eq!( + parser(r"\D").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Digit, + negated: true, + }))); + assert_eq!( + parser(r"\s").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Space, + negated: false, + }))); + assert_eq!( + parser(r"\S").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Space, + negated: true, + }))); + assert_eq!( + parser(r"\w").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Word, + negated: false, + }))); + assert_eq!( + parser(r"\W").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Word, + negated: true, + }))); + + assert_eq!( + parser(r"\d").parse(), + Ok(Ast::Class(ast::Class::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Digit, + negated: false, + })))); + assert_eq!( + parser(r"\dz").parse(), + Ok(Ast::Concat(ast::Concat { + span: span(0..3), + asts: vec![ + Ast::Class(ast::Class::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Digit, + negated: false, + })), + Ast::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: 'z', + }), + ], + }))); + } +} diff --git a/regex-syntax-2/src/ast/print.rs b/regex-syntax-2/src/ast/print.rs new file mode 100644 index 0000000000..0d6dfb0a20 --- /dev/null +++ b/regex-syntax-2/src/ast/print.rs @@ 
-0,0 +1,591 @@ +// Copyright 2018 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/*! +This module provides a regular expression printer. +*/ + +use std::fmt; + +use ast::{self, Ast}; +use ast::visitor::{self, Visitor}; + +/// A builder for constructing a printer. +/// +/// Note that since a printer doesn't have any configuration knobs, this type +/// remains unexported. +#[derive(Clone, Debug)] +struct PrinterBuilder { + _priv: (), +} + +impl Default for PrinterBuilder { + fn default() -> PrinterBuilder { + PrinterBuilder::new() + } +} + +impl PrinterBuilder { + fn new() -> PrinterBuilder { + PrinterBuilder { + _priv: (), + } + } + + fn build(&self) -> Printer { + Printer { + _priv: (), + } + } +} + +/// A printer for a regular expression abstract syntax tree. +/// +/// A printer converts an abstract syntax tree (AST) to a regular expression +/// pattern string. This particular printer uses constant stack space and heap +/// space proportional to the size of the AST. +/// +/// This printer will not necessarily preserve the original formatting of the +/// regular expression pattern string. For example, all whitespace and comments +/// are ignored. +#[derive(Debug)] +pub struct Printer { + _priv: (), +} + +impl Printer { + /// Create a new printer. + pub fn new() -> Printer { + PrinterBuilder::new().build() + } + + /// Print the given `Ast` to the given writer. The writer must implement + /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used + /// here are a `fmt::Formatter` (which is available in `fmt::Display` + /// implementations) or a `&mut String`. 
+ pub fn print(&mut self, ast: &Ast, wtr: W) -> fmt::Result { + visitor::visit(ast, Writer { printer: self, wtr: wtr }) + } +} + +#[derive(Debug)] +struct Writer<'p, W> { + printer: &'p mut Printer, + wtr: W, +} + +impl<'p, W: fmt::Write> Visitor for Writer<'p, W> { + type Output = (); + type Err = fmt::Error; + + fn finish(self) -> fmt::Result { + Ok(()) + } + + fn visit_pre(&mut self, ast: &Ast) -> fmt::Result { + match *ast { + Ast::Group(ref x) => self.fmt_group_pre(x), + Ast::Class(ast::Class::Bracketed(ref x)) => { + self.fmt_class_bracketed_pre(x) + } + _ => Ok(()) + } + } + + fn visit_post(&mut self, ast: &Ast) -> fmt::Result { + use ast::Class; + + match *ast { + Ast::Empty(_) => Ok(()), + Ast::Flags(ref x) => self.fmt_set_flags(x), + Ast::Literal(ref x) => self.fmt_literal(x), + Ast::Dot(_) => self.wtr.write_str("."), + Ast::Assertion(ref x) => self.fmt_assertion(x), + Ast::Class(Class::Perl(ref x)) => self.fmt_class_perl(x), + Ast::Class(Class::Unicode(ref x)) => self.fmt_class_unicode(x), + Ast::Class(Class::Bracketed(ref x)) => { + self.fmt_class_bracketed_post(x) + } + Ast::Repetition(ref x) => self.fmt_repetition(x), + Ast::Group(ref x) => self.fmt_group_post(x), + Ast::Alternation(_) => Ok(()), + Ast::Concat(_) => Ok(()), + } + } + + fn visit_alternation_in(&mut self) -> fmt::Result { + self.wtr.write_str("|") + } + + fn visit_class_set_item_pre( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<(), Self::Err> { + match *ast { + ast::ClassSetItem::Bracketed(ref x) => { + self.fmt_class_bracketed_pre(x) + } + _ => Ok(()), + } + } + + fn visit_class_set_item_post( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<(), Self::Err> { + use ast::ClassSetItem::*; + + match *ast { + Empty(_) => Ok(()), + Literal(ref x) => self.fmt_literal(x), + Range(ref x) => { + try!(self.fmt_literal(&x.start)); + try!(self.wtr.write_str("-")); + try!(self.fmt_literal(&x.end)); + Ok(()) + } + Ascii(ref x) => self.fmt_class_ascii(x), + Unicode(ref x) => 
self.fmt_class_unicode(x), + Perl(ref x) => self.fmt_class_perl(x), + Bracketed(ref x) => self.fmt_class_bracketed_post(x), + Union(_) => Ok(()), + } + } + + fn visit_class_set_binary_op_in( + &mut self, + ast: &ast::ClassSetBinaryOp, + ) -> Result<(), Self::Err> { + self.fmt_class_set_binary_op_kind(&ast.kind) + } +} + +impl<'p, W: fmt::Write> Writer<'p, W> { + fn fmt_group_pre(&mut self, ast: &ast::Group) -> fmt::Result { + use ast::GroupKind::*; + match ast.kind { + CaptureIndex(_) => self.wtr.write_str("("), + CaptureName(ref x) => { + try!(self.wtr.write_str("(?P<")); + try!(self.wtr.write_str(&x.name)); + try!(self.wtr.write_str(">")); + Ok(()) + } + NonCapturing(ref flags) => { + try!(self.wtr.write_str("(?")); + try!(self.fmt_flags(flags)); + try!(self.wtr.write_str(":")); + Ok(()) + } + } + } + + fn fmt_group_post(&mut self, _ast: &ast::Group) -> fmt::Result { + self.wtr.write_str(")") + } + + fn fmt_repetition(&mut self, ast: &ast::Repetition) -> fmt::Result { + use ast::RepetitionKind::*; + match ast.op.kind { + ZeroOrOne if ast.greedy => self.wtr.write_str("?"), + ZeroOrOne => self.wtr.write_str("??"), + ZeroOrMore if ast.greedy => self.wtr.write_str("*"), + ZeroOrMore => self.wtr.write_str("*?"), + OneOrMore if ast.greedy => self.wtr.write_str("+"), + OneOrMore => self.wtr.write_str("+?"), + Range(ref x) => { + try!(self.fmt_repetition_range(x)); + if !ast.greedy { + try!(self.wtr.write_str("?")); + } + Ok(()) + } + } + } + + fn fmt_repetition_range( + &mut self, + ast: &ast::RepetitionRange, + ) -> fmt::Result { + use ast::RepetitionRange::*; + match *ast { + Exactly(x) => write!(self.wtr, "{{{}}}", x), + AtLeast(x) => write!(self.wtr, "{{{},}}", x), + Bounded(x, y) => write!(self.wtr, "{{{},{}}}", x, y), + } + } + + fn fmt_literal(&mut self, ast: &ast::Literal) -> fmt::Result { + use ast::LiteralKind::*; + + match ast.kind { + Verbatim => self.wtr.write_char(ast.c), + Punctuation => write!(self.wtr, r"\{}", ast.c), + Octal => write!(self.wtr, 
r"\{:o}", ast.c as u32), + HexFixed(ast::HexLiteralKind::X) => { + write!(self.wtr, r"\x{:02X}", ast.c as u32) + } + HexFixed(ast::HexLiteralKind::UnicodeShort) => { + write!(self.wtr, r"\u{:04X}", ast.c as u32) + } + HexFixed(ast::HexLiteralKind::UnicodeLong) => { + write!(self.wtr, r"\U{:08X}", ast.c as u32) + } + HexBrace(ast::HexLiteralKind::X) => { + write!(self.wtr, r"\x{{{:X}}}", ast.c as u32) + } + HexBrace(ast::HexLiteralKind::UnicodeShort) => { + write!(self.wtr, r"\u{{{:X}}}", ast.c as u32) + } + HexBrace(ast::HexLiteralKind::UnicodeLong) => { + write!(self.wtr, r"\U{{{:X}}}", ast.c as u32) + } + Special(ast::SpecialLiteralKind::Bell) => { + self.wtr.write_str(r"\a") + } + Special(ast::SpecialLiteralKind::FormFeed) => { + self.wtr.write_str(r"\f") + } + Special(ast::SpecialLiteralKind::Tab) => { + self.wtr.write_str(r"\t") + } + Special(ast::SpecialLiteralKind::LineFeed) => { + self.wtr.write_str(r"\n") + } + Special(ast::SpecialLiteralKind::CarriageReturn) => { + self.wtr.write_str(r"\r") + } + Special(ast::SpecialLiteralKind::VerticalTab) => { + self.wtr.write_str(r"\v") + } + Special(ast::SpecialLiteralKind::Space) => { + self.wtr.write_str(r"\ ") + } + } + } + + fn fmt_assertion(&mut self, ast: &ast::Assertion) -> fmt::Result { + use ast::AssertionKind::*; + match ast.kind { + StartLine => self.wtr.write_str("^"), + EndLine => self.wtr.write_str("$"), + StartText => self.wtr.write_str(r"\A"), + EndText => self.wtr.write_str(r"\z"), + WordBoundary => self.wtr.write_str(r"\b"), + NotWordBoundary => self.wtr.write_str(r"\B"), + } + } + + fn fmt_set_flags(&mut self, ast: &ast::SetFlags) -> fmt::Result { + try!(self.wtr.write_str("(?")); + try!(self.fmt_flags(&ast.flags)); + try!(self.wtr.write_str(")")); + Ok(()) + } + + fn fmt_flags(&mut self, ast: &ast::Flags) -> fmt::Result { + use ast::{Flag, FlagsItemKind}; + + for item in &ast.items { + try!(match item.kind { + FlagsItemKind::Negation => self.wtr.write_str("-"), + FlagsItemKind::Flag(ref flag) => { 
+ match *flag { + Flag::CaseInsensitive => self.wtr.write_str("i"), + Flag::MultiLine => self.wtr.write_str("m"), + Flag::DotMatchesNewLine => self.wtr.write_str("s"), + Flag::SwapGreed => self.wtr.write_str("U"), + Flag::Unicode => self.wtr.write_str("u"), + Flag::IgnoreWhitespace => self.wtr.write_str("x"), + } + } + }); + } + Ok(()) + } + + fn fmt_class_bracketed_pre( + &mut self, + ast: &ast::ClassBracketed, + ) -> fmt::Result { + if ast.negated { + self.wtr.write_str("[^") + } else { + self.wtr.write_str("[") + } + } + + fn fmt_class_bracketed_post( + &mut self, + _ast: &ast::ClassBracketed, + ) -> fmt::Result { + self.wtr.write_str("]") + } + + fn fmt_class_set_binary_op_kind( + &mut self, + ast: &ast::ClassSetBinaryOpKind, + ) -> fmt::Result { + use ast::ClassSetBinaryOpKind::*; + match *ast { + Intersection => self.wtr.write_str("&&"), + Difference => self.wtr.write_str("--"), + SymmetricDifference => self.wtr.write_str("~~"), + } + } + + fn fmt_class_perl(&mut self, ast: &ast::ClassPerl) -> fmt::Result { + use ast::ClassPerlKind::*; + match ast.kind { + Digit if ast.negated => self.wtr.write_str(r"\D"), + Digit => self.wtr.write_str(r"\d"), + Space if ast.negated => self.wtr.write_str(r"\S"), + Space => self.wtr.write_str(r"\s"), + Word if ast.negated => self.wtr.write_str(r"\W"), + Word => self.wtr.write_str(r"\w"), + } + } + + fn fmt_class_ascii(&mut self, ast: &ast::ClassAscii) -> fmt::Result { + use ast::ClassAsciiKind::*; + match ast.kind { + Alnum if ast.negated => self.wtr.write_str("[:^alnum:]"), + Alnum => self.wtr.write_str("[:alnum:]"), + Alpha if ast.negated => self.wtr.write_str("[:^alpha:]"), + Alpha => self.wtr.write_str("[:alpha:]"), + Ascii if ast.negated => self.wtr.write_str("[:^ascii:]"), + Ascii => self.wtr.write_str("[:ascii:]"), + Blank if ast.negated => self.wtr.write_str("[:^blank:]"), + Blank => self.wtr.write_str("[:blank:]"), + Cntrl if ast.negated => self.wtr.write_str("[:^cntrl:]"), + Cntrl => self.wtr.write_str("[:cntrl:]"), 
+ Digit if ast.negated => self.wtr.write_str("[:^digit:]"), + Digit => self.wtr.write_str("[:digit:]"), + Graph if ast.negated => self.wtr.write_str("[:^graph:]"), + Graph => self.wtr.write_str("[:graph:]"), + Lower if ast.negated => self.wtr.write_str("[:^lower:]"), + Lower => self.wtr.write_str("[:lower:]"), + Print if ast.negated => self.wtr.write_str("[:^print:]"), + Print => self.wtr.write_str("[:print:]"), + Punct if ast.negated => self.wtr.write_str("[:^punct:]"), + Punct => self.wtr.write_str("[:punct:]"), + Space if ast.negated => self.wtr.write_str("[:^space:]"), + Space => self.wtr.write_str("[:space:]"), + Upper if ast.negated => self.wtr.write_str("[:^upper:]"), + Upper => self.wtr.write_str("[:upper:]"), + Word if ast.negated => self.wtr.write_str("[:^word:]"), + Word => self.wtr.write_str("[:word:]"), + Xdigit if ast.negated => self.wtr.write_str("[:^xdigit:]"), + Xdigit => self.wtr.write_str("[:xdigit:]"), + } + } + + fn fmt_class_unicode(&mut self, ast: &ast::ClassUnicode) -> fmt::Result { + use ast::ClassUnicodeKind::*; + use ast::ClassUnicodeOpKind::*; + + if ast.negated { + try!(self.wtr.write_str(r"\P")); + } else { + try!(self.wtr.write_str(r"\p")); + } + match ast.kind { + OneLetter(c) => self.wtr.write_char(c), + Named(ref x) => write!(self.wtr, "{{{}}}", x), + NamedValue { op: Equal, ref name, ref value } => { + write!(self.wtr, "{{{}={}}}", name, value) + } + NamedValue { op: Colon, ref name, ref value } => { + write!(self.wtr, "{{{}:{}}}", name, value) + } + NamedValue { op: NotEqual, ref name, ref value } => { + write!(self.wtr, "{{{}!={}}}", name, value) + } + } + } +} + +#[cfg(test)] +mod tests { + use ast::parse::ParserBuilder; + use super::Printer; + + fn roundtrip(given: &str) { + roundtrip_with(|b| b, given); + } + + fn roundtrip_with(mut f: F, given: &str) + where F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder + { + let mut builder = ParserBuilder::new(); + f(&mut builder); + let ast = builder.build().parse(given).unwrap(); + 
#[cfg(test)]
mod tests {
    use ast::parse::ParserBuilder;
    use super::Printer;

    /// Parses `given` with a default parser, prints the resulting AST and
    /// asserts that the printed pattern equals the input.
    fn roundtrip(given: &str) {
        roundtrip_with(|b| b, given);
    }

    /// Like `roundtrip`, but lets `f` configure the `ParserBuilder` before
    /// the pattern is parsed.
    fn roundtrip_with<F>(mut f: F, given: &str)
    where F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder
    {
        let mut builder = ParserBuilder::new();
        f(&mut builder);
        let ast = builder.build().parse(given).unwrap();

        let mut printer = Printer::new();
        let mut dst = String::new();
        printer.print(&ast, &mut dst).unwrap();
        assert_eq!(given, dst);
    }

    #[test]
    fn scratch() {
        roundtrip(".");
    }

    #[test]
    fn print_literal() {
        roundtrip("a");
        roundtrip(r"\[");
        // Octal escapes are only parsed when explicitly enabled.
        roundtrip_with(|b| b.octal(true), r"\141");
        roundtrip(r"\x61");
        roundtrip(r"\x7F");
        roundtrip(r"\u0061");
        roundtrip(r"\U00000061");
        roundtrip(r"\x{61}");
        roundtrip(r"\x{7F}");
        roundtrip(r"\u{61}");
        roundtrip(r"\U{61}");

        roundtrip(r"\a");
        roundtrip(r"\f");
        roundtrip(r"\t");
        roundtrip(r"\n");
        roundtrip(r"\r");
        roundtrip(r"\v");
        roundtrip(r"(?x)\ ");
    }

    #[test]
    fn print_dot() {
        roundtrip(".");
    }

    #[test]
    fn print_concat() {
        roundtrip("ab");
        roundtrip("abcde");
        roundtrip("a(bcd)ef");
    }

    #[test]
    fn print_alternation() {
        roundtrip("a|b");
        roundtrip("a|b|c|d|e");
        roundtrip("|a|b|c|d|e");
        roundtrip("|a|b|c|d|e|");
        roundtrip("a(b|c|d)|e|f");
    }

    #[test]
    fn print_assertion() {
        roundtrip(r"^");
        roundtrip(r"$");
        roundtrip(r"\A");
        roundtrip(r"\z");
        roundtrip(r"\b");
        roundtrip(r"\B");
    }

    #[test]
    fn print_repetition() {
        roundtrip("a?");
        roundtrip("a??");
        roundtrip("a*");
        roundtrip("a*?");
        roundtrip("a+");
        roundtrip("a+?");
        roundtrip("a{5}");
        roundtrip("a{5}?");
        roundtrip("a{5,}");
        roundtrip("a{5,}?");
        roundtrip("a{5,10}");
        roundtrip("a{5,10}?");
    }

    #[test]
    fn print_flags() {
        roundtrip("(?i)");
        roundtrip("(?-i)");
        roundtrip("(?s-i)");
        roundtrip("(?-si)");
        roundtrip("(?siUmux)");
    }

    #[test]
    fn print_group() {
        roundtrip("(?i:a)");
        // A named group needs its `<name>`; the printer writes `(?P<name>`.
        roundtrip("(?P<foo>a)");
        roundtrip("(a)");
    }

    #[test]
    fn print_class() {
        roundtrip(r"[abc]");
        roundtrip(r"[a-z]");
        roundtrip(r"[^a-z]");
        roundtrip(r"[a-z0-9]");
        roundtrip(r"[-a-z0-9]");
        roundtrip(r"[a-z0-9---]");
        roundtrip(r"[a-z&&m-n]");
        roundtrip(r"[[a-z&&m-n]]");
        roundtrip(r"[a-z--m-n]");
        roundtrip(r"[a-z~~m-n]");
        roundtrip(r"[a-z[0-9]]");
        roundtrip(r"[a-z[^0-9]]");

        roundtrip(r"\d");
        roundtrip(r"\D");
        roundtrip(r"\s");
        roundtrip(r"\S");
        roundtrip(r"\w");
        roundtrip(r"\W");

        // Every ASCII class, in both plain and negated form.
        let ascii_classes = [
            "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
            "lower", "print", "punct", "space", "upper", "word", "xdigit",
        ];
        for name in ascii_classes.iter() {
            roundtrip(&format!("[[:{}:]]", name));
            roundtrip(&format!("[[:^{}:]]", name));
        }

        roundtrip(r"\pL");
        roundtrip(r"\PL");
        roundtrip(r"\p{L}");
        roundtrip(r"\P{L}");
        roundtrip(r"\p{X=Y}");
        roundtrip(r"\P{X=Y}");
        roundtrip(r"\p{X:Y}");
        roundtrip(r"\P{X:Y}");
        roundtrip(r"\p{X!=Y}");
        roundtrip(r"\P{X!=Y}");
    }
}
+/// +/// The principle aim of this trait is to enable callers to perform case +/// analysis on an abstract syntax tree without necessarily using recursion. +/// In particular, this permits callers to do case analysis with constant stack +/// usage, which can be important since the size of an abstract syntax tree +/// may be proportional to end user input. +/// +/// Typical usage of this trait involves providing an implementation and then +/// running it using the [`visit`](fn.visit.html) function. +/// +/// Note that the abstract syntax tree for a regular expression is quite +/// complex. Unless you specifically need it, you might be able to use the +/// much simpler +/// [high-level intermediate representation](../hir/struct.Hir.html) +/// and its +/// [corresponding `Visitor` trait](../hir/trait.Visitor.html) +/// instead. +pub trait Visitor { + /// The result of visiting an AST. + type Output; + /// An error that visiting an AST might return. + type Err; + + /// All implementors of `Visitor` must provide a `finish` method, which + /// yields the result of visiting the AST or an error. + fn finish(self) -> Result; + + /// This method is called before beginning traversal of the AST. + fn start(&mut self) {} + + /// This method is called on an `Ast` before descending into child `Ast` + /// nodes. + fn visit_pre(&mut self, _ast: &Ast) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on an `Ast` after descending all of its child + /// `Ast` nodes. + fn visit_post(&mut self, _ast: &Ast) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called between child nodes of an + /// [`Alternation`](struct.Alternation.html). + fn visit_alternation_in(&mut self) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on every + /// [`ClassSetItem`](enum.ClassSetItem.html) + /// before descending into child nodes. 
+ fn visit_class_set_item_pre( + &mut self, + _ast: &ast::ClassSetItem, + ) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on every + /// [`ClassSetItem`](enum.ClassSetItem.html) + /// after descending into child nodes. + fn visit_class_set_item_post( + &mut self, + _ast: &ast::ClassSetItem, + ) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on every + /// [`ClassSetBinaryOp`](struct.ClassSetBinaryOp.html) + /// before descending into child nodes. + fn visit_class_set_binary_op_pre( + &mut self, + _ast: &ast::ClassSetBinaryOp, + ) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on every + /// [`ClassSetBinaryOp`](struct.ClassSetBinaryOp.html) + /// after descending into child nodes. + fn visit_class_set_binary_op_post( + &mut self, + _ast: &ast::ClassSetBinaryOp, + ) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called between the left hand and right hand child nodes + /// of a [`ClassSetBinaryOp`](struct.ClassSetBinaryOp.html). + fn visit_class_set_binary_op_in( + &mut self, + _ast: &ast::ClassSetBinaryOp, + ) -> Result<(), Self::Err> { + Ok(()) + } +} + +/// Executes an implementation of `Visitor` in constant stack space. +/// +/// This function will visit every node in the given `Ast` while calling the +/// appropriate methods provided by the +/// [`Visitor`](trait.Visitor.html) trait. +/// +/// The primary use case for this method is when one wants to perform case +/// analysis over an `Ast` without using a stack size proportional to the depth +/// of the `Ast`. Namely, this method will instead use constant stack size, but +/// will use heap space proportional to the size of the `Ast`. This may be +/// desirable in cases where the size of `Ast` is proportional to end user +/// input. +/// +/// If the visitor returns an error at any point, then visiting is stopped and +/// the error is returned. 
+pub fn visit(ast: &Ast, visitor: V) -> Result { + HeapVisitor::new().visit(ast, visitor) +} + +/// HeapVisitor visits every item in an `Ast` recursively using constant stack +/// size and a heap size proportional to the size of the `Ast`. +struct HeapVisitor<'a> { + /// A stack of `Ast` nodes. This is roughly analogous to the call stack + /// used in a typical recursive visitor. + stack: Vec<(&'a Ast, Frame<'a>)>, + /// Similar to the `Ast` stack above, but is used only for character + /// classes. In particular, character classes embed their own mini + /// recursive syntax. + stack_class: Vec<(ClassInduct<'a>, ClassFrame<'a>)>, +} + +/// Represents a single stack frame while performing structural induction over +/// an `Ast`. +enum Frame<'a> { + /// A stack frame allocated just before descending into a repetition + /// operator's child node. + Repetition(&'a ast::Repetition), + /// A stack frame allocated just before descending into a group's child + /// node. + Group(&'a ast::Group), + /// The stack frame used while visiting every child node of a concatenation + /// of expressions. + Concat { + /// The child node we are currently visiting. + head: &'a Ast, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [Ast], + }, + /// The stack frame used while visiting every child node of an alternation + /// of expressions. + Alternation { + /// The child node we are currently visiting. + head: &'a Ast, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [Ast], + }, +} + +/// Represents a single stack frame while performing structural induction over +/// a character class. +enum ClassFrame<'a> { + /// The stack frame used while visiting every child node of a union of + /// character class items. + Union { + /// The child node we are currently visiting. + head: &'a ast::ClassSetItem, + /// The remaining child nodes to visit (which may be empty). 
/// A representation of the inductive step when performing structural induction
/// over a character class.
///
/// Note that there is no analogous explicit type for the inductive step for
/// `Ast` nodes because the inductive step is just an `Ast`. For character
/// classes, the inductive step can produce one of two possible child nodes:
/// an item or a binary operation. (An item cannot be a binary operation
/// because that would imply binary operations can be unioned in the concrete
/// syntax, which is not possible.)
enum ClassInduct<'a> {
    /// A class set item, e.g. a literal, a range, a nested bracketed class or
    /// a union of items.
    Item(&'a ast::ClassSetItem),
    /// A binary set operation (intersection, difference or symmetric
    /// difference) with a left and a right hand side.
    BinaryOp(&'a ast::ClassSetBinaryOp),
}
+ loop { + let (post_ast, frame) = match self.stack.pop() { + None => return visitor.finish(), + Some((post_ast, frame)) => (post_ast, frame), + }; + // If this is a concat/alternate, then we might have additional + // inductive steps to process. + if let Some(x) = self.pop(frame) { + if let Frame::Alternation {..} = x { + try!(visitor.visit_alternation_in()); + } + ast = x.child(); + self.stack.push((post_ast, x)); + break; + } + // Otherwise, we've finished visiting all the child nodes for + // this AST, so we can post visit it now. + try!(visitor.visit_post(post_ast)); + } + } + } + + /// Build a stack frame for the given AST if one is needed (which occurs if + /// and only if there are child nodes in the AST). Otherwise, return None. + /// + /// If this visits a class, then the underlying visitor implementation may + /// return an error which will be passed on here. + fn induct( + &mut self, + ast: &'a Ast, + visitor: &mut V, + ) -> Result>, V::Err> { + Ok(match *ast { + Ast::Class(ast::Class::Bracketed(ref x)) => { + try!(self.visit_class(x, visitor)); + None + } + Ast::Repetition(ref x) => Some(Frame::Repetition(x)), + Ast::Group(ref x) => Some(Frame::Group(x)), + Ast::Concat(ref x) if x.asts.is_empty() => None, + Ast::Concat(ref x) => { + Some(Frame::Concat { + head: &x.asts[0], + tail: &x.asts[1..], + }) + } + Ast::Alternation(ref x) if x.asts.is_empty() => None, + Ast::Alternation(ref x) => { + Some(Frame::Alternation { + head: &x.asts[0], + tail: &x.asts[1..], + }) + } + _ => None, + }) + } + + /// Pops the given frame. If the frame has an additional inductive step, + /// then return it, otherwise return `None`. + fn pop(&self, induct: Frame<'a>) -> Option> { + match induct { + Frame::Repetition(_) => None, + Frame::Group(_) => None, + Frame::Concat { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(Frame::Concat { + head: &tail[0], + tail: &tail[1..], + }) + } + } + Frame::Alternation { tail, .. 
} => { + if tail.is_empty() { + None + } else { + Some(Frame::Alternation { + head: &tail[0], + tail: &tail[1..], + }) + } + } + } + } + + fn visit_class( + &mut self, + ast: &'a ast::ClassBracketed, + visitor: &mut V, + ) -> Result<(), V::Err> { + let mut ast = ClassInduct::from_bracketed(ast); + loop { + try!(self.visit_class_pre(&ast, visitor)); + if let Some(x) = self.induct_class(&ast) { + let child = x.child(); + self.stack_class.push((ast, x)); + ast = child; + continue; + } + try!(self.visit_class_post(&ast, visitor)); + + // At this point, we now try to pop our call stack until it is + // either empty or we hit another inductive case. + loop { + let (post_ast, frame) = match self.stack_class.pop() { + None => return Ok(()), + Some((post_ast, frame)) => (post_ast, frame), + }; + // If this is a union or a binary op, then we might have + // additional inductive steps to process. + if let Some(x) = self.pop_class(frame) { + if let ClassFrame::BinaryRHS { ref op, .. } = x { + try!(visitor.visit_class_set_binary_op_in(op)); + } + ast = x.child(); + self.stack_class.push((post_ast, x)); + break; + } + // Otherwise, we've finished visiting all the child nodes for + // this class node, so we can post visit it now. + try!(self.visit_class_post(&post_ast, visitor)); + } + } + } + + /// Call the appropriate `Visitor` methods given an inductive step. + fn visit_class_pre( + &self, + ast: &ClassInduct<'a>, + visitor: &mut V, + ) -> Result<(), V::Err> { + match *ast { + ClassInduct::Item(item) => { + try!(visitor.visit_class_set_item_pre(item)); + } + ClassInduct::BinaryOp(op) => { + try!(visitor.visit_class_set_binary_op_pre(op)); + } + } + Ok(()) + } + + /// Call the appropriate `Visitor` methods given an inductive step. 
+ fn visit_class_post( + &self, + ast: &ClassInduct<'a>, + visitor: &mut V, + ) -> Result<(), V::Err> { + match *ast { + ClassInduct::Item(item) => { + try!(visitor.visit_class_set_item_post(item)); + } + ClassInduct::BinaryOp(op) => { + try!(visitor.visit_class_set_binary_op_post(op)); + } + } + Ok(()) + } + + /// Build a stack frame for the given class node if one is needed (which + /// occurs if and only if there are child nodes). Otherwise, return None. + fn induct_class( + &self, + ast: &ClassInduct<'a>, + ) -> Option> { + match *ast { + ClassInduct::Item(&ast::ClassSetItem::Bracketed(ref x)) => { + match x.kind { + ast::ClassSet::Item(ref item) => { + Some(ClassFrame::Union { + head: item, + tail: &[], + }) + } + ast::ClassSet::BinaryOp(ref op) => { + Some(ClassFrame::Binary { op: op }) + } + } + } + ClassInduct::Item(&ast::ClassSetItem::Union(ref x)) => { + if x.items.is_empty() { + None + } else { + Some(ClassFrame::Union { + head: &x.items[0], + tail: &x.items[1..], + }) + } + } + ClassInduct::BinaryOp(op) => { + Some(ClassFrame::BinaryLHS { + op: op, + lhs: &op.lhs, + rhs: &op.rhs, + }) + } + _ => None, + } + } + + /// Pops the given frame. If the frame has an additional inductive step, + /// then return it, otherwise return `None`. + fn pop_class(&self, induct: ClassFrame<'a>) -> Option> { + match induct { + ClassFrame::Union { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(ClassFrame::Union { + head: &tail[0], + tail: &tail[1..], + }) + } + } + ClassFrame::Binary {..} => None, + ClassFrame::BinaryLHS { op, rhs, .. } => { + Some(ClassFrame::BinaryRHS { + op: op, + rhs: rhs, + }) + } + ClassFrame::BinaryRHS {..} => None, + } + } +} + +impl<'a> Frame<'a> { + /// Perform the next inductive step on this frame and return the next + /// child AST node to visit. + fn child(&self) -> &'a Ast { + match *self { + Frame::Repetition(rep) => &rep.ast, + Frame::Group(group) => &group.ast, + Frame::Concat { head, .. 
} => head, + Frame::Alternation { head, .. } => head, + } + } +} + +impl<'a> ClassFrame<'a> { + /// Perform the next inductive step on this frame and return the next + /// child class node to visit. + fn child(&self) -> ClassInduct<'a> { + match *self { + ClassFrame::Union { head, .. } => ClassInduct::Item(head), + ClassFrame::Binary { op, .. } => ClassInduct::BinaryOp(op), + ClassFrame::BinaryLHS { ref lhs, .. } => { + ClassInduct::from_set(lhs) + } + ClassFrame::BinaryRHS { ref rhs, .. } => { + ClassInduct::from_set(rhs) + } + } + } +} + +impl<'a> ClassInduct<'a> { + fn from_bracketed(ast: &'a ast::ClassBracketed) -> ClassInduct<'a> { + ClassInduct::from_set(&ast.kind) + } + + fn from_set(ast: &'a ast::ClassSet) -> ClassInduct<'a> { + match *ast { + ast::ClassSet::Item(ref item) => ClassInduct::Item(item), + ast::ClassSet::BinaryOp(ref op) => ClassInduct::BinaryOp(op), + } + } +} + +impl<'a> fmt::Debug for ClassFrame<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let x = match *self { + ClassFrame::Union{..} => "Union", + ClassFrame::Binary{..} => "Binary", + ClassFrame::BinaryLHS{..} => "BinaryLHS", + ClassFrame::BinaryRHS{..} => "BinaryRHS", + }; + write!(f, "{}", x) + } +} + +impl<'a> fmt::Debug for ClassInduct<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let x = match *self { + ClassInduct::Item(it) => { + match *it { + ast::ClassSetItem::Empty(_) => "Item(Empty)", + ast::ClassSetItem::Literal(_) => "Item(Literal)", + ast::ClassSetItem::Range(_) => "Item(Range)", + ast::ClassSetItem::Ascii(_) => "Item(Ascii)", + ast::ClassSetItem::Perl(_) => "Item(Perl)", + ast::ClassSetItem::Unicode(_) => "Item(Unicode)", + ast::ClassSetItem::Bracketed(_) => "Item(Bracketed)", + ast::ClassSetItem::Union(_) => "Item(Union)", + } + } + ClassInduct::BinaryOp(it) => { + match it.kind { + ast::ClassSetBinaryOpKind::Intersection => { + "BinaryOp(Intersection)" + } + ast::ClassSetBinaryOpKind::Difference => { + "BinaryOp(Difference)" + } + 
/// A simple binary sum type.
///
/// This is occasionally useful in an ad hoc fashion.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Either<Left, Right> {
    /// Holds a value of the first type parameter.
    Left(Left),
    /// Holds a value of the second type parameter.
    Right(Right),
}
impl fmt::Display for Error {
    /// Writes the message of whichever underlying error this value wraps.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            // Both wrapped error types do their own formatting, so this
            // simply forwards to them.
            Error::Parse(ref x) => x.fmt(f),
            Error::Translate(ref x) => x.fmt(f),
            // The only variant left is the hidden `__Nonexhaustive` marker.
            // NOTE(review): presumably it is never constructed; if it ever
            // is, this arm panics.
            _ => unreachable!(),
        }
    }
}
+ aux_span: Option<&'e ast::Span>, +} + +impl<'e> From<&'e ast::Error> for Formatter<'e, ast::ErrorKind> { + fn from(err: &'e ast::Error) -> Self { + Formatter { + pattern: err.pattern(), + err: err.kind(), + span: err.span(), + aux_span: err.auxiliary_span(), + } + } +} + +impl<'e> From<&'e hir::Error> for Formatter<'e, hir::ErrorKind> { + fn from(err: &'e hir::Error) -> Self { + Formatter { + pattern: err.pattern(), + err: err.kind(), + span: err.span(), + aux_span: None, + } + } +} + +impl<'e, E: fmt::Display> fmt::Display for Formatter<'e, E> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let spans = Spans::from_formatter(self); + if self.pattern.contains('\n') { + let divider = repeat_char('~', 79); + + try!(writeln!(f, "regex parse error:")); + try!(writeln!(f, "{}", divider)); + let notated = spans.notate(); + try!(write!(f, "{}", notated)); + try!(writeln!(f, "{}", divider)); + // If we have error spans that cover multiple lines, then we just + // note the line numbers. + if !spans.multi_line.is_empty() { + let mut notes = vec![]; + for span in &spans.multi_line { + notes.push(format!( + "on line {} (column {}) through line {} (column {})", + span.start.line, span.start.column, + span.end.line, span.end.column - 1)); + } + try!(writeln!(f, "{}", notes.join("\n"))); + } + try!(write!(f, "error: {}", self.err)); + } else { + try!(writeln!(f, "regex parse error:")); + let notated = Spans::from_formatter(self).notate(); + try!(write!(f, "{}", notated)); + try!(write!(f, "error: {}", self.err)); + } + Ok(()) + } +} + +/// This type represents an arbitrary number of error spans in a way that makes +/// it convenient to notate the regex pattern. ("Notate" means "point out +/// exactly where the error occurred in the regex pattern.") +/// +/// Technically, we can only ever have two spans given our current error +/// structure. 
However, after toiling with a specific algorithm for handling +/// two spans, it became obvious that an algorithm to handle an arbitrary +/// number of spans was actually much simpler. +struct Spans<'p> { + /// The original regex pattern string. + pattern: &'p str, + /// The total width that should be used for line numbers. The width is + /// used for left padding the line numbers for alignment. + /// + /// A value of `0` means line numbers should not be displayed. That is, + /// the pattern is itself only one line. + line_number_width: usize, + /// All error spans that occur on a single line. This sequence always has + /// length equivalent to the number of lines in `pattern`, where the index + /// of the sequence represents a line number, starting at `0`. The spans + /// in each line are sorted in ascending order. + by_line: Vec>, + /// All error spans that occur over one or more lines. That is, the start + /// and end position of the span have different line numbers. The spans are + /// sorted in ascending order. + multi_line: Vec, +} + +impl<'p> Spans<'p> { + /// Build a sequence of spans from a formatter. + fn from_formatter<'e, E: fmt::Display>( + fmter: &'p Formatter<'e, E>, + ) -> Spans<'p> { + let line_count = fmter.pattern.lines().count(); + let line_number_width = + if line_count <= 1 { + 0 + } else { + line_count.to_string().len() + }; + let mut spans = Spans { + pattern: &fmter.pattern, + line_number_width: line_number_width, + by_line: vec![vec![]; line_count], + multi_line: vec![], + }; + spans.add(fmter.span.clone()); + if let Some(span) = fmter.aux_span { + spans.add(span.clone()); + } + spans + } + + /// Add the given span to this sequence, putting it in the right place. + fn add(&mut self, span: ast::Span) { + // This is grossly inefficient since we sort after each add, but right + // now, we only ever add two spans at most. 
+ if span.is_one_line() { + let i = span.start.line - 1; // because lines are 1-indexed + self.by_line[i].push(span); + self.by_line[i].sort(); + } else { + self.multi_line.push(span); + self.multi_line.sort(); + } + } + + /// Notate the pattern string with carents (`^`) pointing at each span + /// location. This only applies to spans that occur within a single line. + fn notate(&self) -> String { + let mut notated = String::new(); + for (i, line) in self.pattern.lines().enumerate() { + if self.line_number_width > 0 { + notated.push_str(&self.left_pad_line_number(i + 1)); + notated.push_str(": "); + } else { + notated.push_str(" "); + } + notated.push_str(line); + notated.push('\n'); + if let Some(notes) = self.notate_line(i) { + notated.push_str(¬es); + notated.push('\n'); + } + } + notated + } + + /// Return notes for the line indexed at `i` (zero-based). If there are no + /// spans for the given line, then `None` is returned. Otherwise, an + /// appropriately space padded string with correctly positioned `^` is + /// returned, accounting for line numbers. + fn notate_line(&self, i: usize) -> Option { + let spans = &self.by_line[i]; + if spans.is_empty() { + return None; + } + let mut notes = String::new(); + for _ in 0..self.line_number_padding() { + notes.push(' '); + } + let mut pos = 0; + for span in spans { + for _ in pos..(span.start.column - 1) { + notes.push(' '); + pos += 1; + } + let note_len = span.end.column.saturating_sub(span.start.column); + for _ in 0..cmp::max(1, note_len) { + notes.push('^'); + pos += 1; + } + } + Some(notes) + } + + /// Left pad the given line number with spaces such that it is aligned with + /// other line numbers. 
/// Returns a string made of `count` copies of `c`.
///
/// A `count` of zero gives the empty string.
fn repeat_char(c: char, count: usize) -> String {
    (0..count).map(|_| c).collect()
}
The extra memory is honestly probably OK, but +// character classes (especially of the Unicode variety) can become quite +// large, and it would be nice to keep regex compilation snappy even in debug +// builds. (In the past, I have been careless with this area of code and it has +// caused slow regex compilations in debug mode, so this isn't entirely +// unwarranted.) +// +// Tests on this are relegated to the public API of HIR in src/hir.rs. + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct IntervalSet { + ranges: Vec, +} + +impl IntervalSet { + /// Create a new set from a sequence of intervals. Each interval is + /// specified as a pair of bounds, where both bounds are inclusive. + /// + /// The given ranges do not need to be in any specific order, and ranges + /// may overlap. + pub fn new>(intervals: T) -> IntervalSet { + let mut set = IntervalSet { ranges: intervals.into_iter().collect() }; + set.canonicalize(); + set + } + + /// Add a new interval to this set. + pub fn push(&mut self, interval: I) { + // TODO: This could be faster. e.g., Push the interval such that + // it preserves canonicalization. + self.ranges.push(interval); + self.canonicalize(); + } + + /// Return an iterator over all intervals in this set. + /// + /// The iterator yields intervals in ascending order. + pub fn iter(&self) -> IntervalSetIter { + IntervalSetIter(self.ranges.iter()) + } + + /// Return an immutable slice of intervals in this set. + /// + /// The sequence returned is in canonical ordering. + pub fn intervals(&self) -> &[I] { + &self.ranges + } + + /// Expand this interval set such that it contains all case folded + /// characters. For example, if this class consists of the range `a-z`, + /// then applying case folding will result in the class containing both the + /// ranges `a-z` and `A-Z`. 
+ pub fn case_fold_simple(&mut self) { + let len = self.ranges.len(); + for i in 0..len { + let range = self.ranges[i]; + range.case_fold_simple(&mut self.ranges); + } + self.canonicalize(); + } + + /// Union this set with the given set, in place. + pub fn union(&mut self, other: &IntervalSet) { + // This could almost certainly be done more efficiently. + self.ranges.extend(&other.ranges); + self.canonicalize(); + } + + /// Intersect this set with the given set, in place. + pub fn intersect(&mut self, other: &IntervalSet) { + if self.ranges.is_empty() { + return; + } + if other.ranges.is_empty() { + self.ranges.clear(); + return; + } + + // There should be a way to do this in-place with constant memory, + // but I couldn't figure out a simple way to do it. So just append + // the intersection to the end of this range, and then drain it before + // we're done. + let drain_end = self.ranges.len(); + + let mut ita = (0..drain_end).into_iter(); + let mut itb = (0..other.ranges.len()).into_iter(); + let mut a = ita.next().unwrap(); + let mut b = itb.next().unwrap(); + loop { + if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) { + self.ranges.push(ab); + } + let (it, aorb) = + if self.ranges[a].upper() < other.ranges[b].upper() { + (&mut ita, &mut a) + } else { + (&mut itb, &mut b) + }; + match it.next() { + Some(v) => *aorb = v, + None => break, + } + } + self.ranges.drain(..drain_end); + } + + /// Subtract the given set from this set, in place. + pub fn difference(&mut self, other: &IntervalSet) { + if self.ranges.is_empty() || other.ranges.is_empty() { + return; + } + + // This algorithm is (to me) surprisingly complex. A search of the + // interwebs indicate that this is a potentially interesting problem. + // Folks seem to suggest interval or segment trees, but I'd like to + // avoid the overhead (both runtime and conceptual) of that. + // + // The following is basically my Shitty First Draft. 
Therefore, in + // order to grok it, you probably need to read each line carefully. + // Simplifications are most welcome! + // + // Remember, we can assume the canonical format invariant here, which + // says that all ranges are sorted, not overlapping and not adjacent in + // each class. + let drain_end = self.ranges.len(); + let (mut a, mut b) = (0, 0); + 'LOOP: + while a < drain_end && b < other.ranges.len() { + // Basically, the easy cases are when neither range overlaps with + // each other. If the `b` range is less than our current `a` + // range, then we can skip it and move on. + if other.ranges[b].upper() < self.ranges[a].lower() { + b += 1; + continue; + } + // ... similarly for the `a` range. If it's less than the smallest + // `b` range, then we can add it as-is. + if self.ranges[a].upper() < other.ranges[b].lower() { + let range = self.ranges[a]; + self.ranges.push(range); + a += 1; + continue; + } + // Otherwise, we have overlapping ranges. + assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b])); + + // This part is tricky and was non-obvious to me without looking + // at explicit examples (see the tests). The trickiness stems from + // two things: 1) subtracting a range from another range could + // yield two ranges and 2) after subtracting a range, it's possible + // that future ranges can have an impact. The loop below advances + // the `b` ranges until they can't possible impact the current + // range. + // + // For example, if our `a` range is `a-t` and our next three `b` + // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply + // subtraction three times before moving on to the next `a` range. + let mut range = self.ranges[a]; + while b < other.ranges.len() + && !range.is_intersection_empty(&other.ranges[b]) + { + let old_range = range; + range = match range.difference(&other.ranges[b]) { + (None, None) => { + // We lost the entire range, so move on to the next + // without adding this one. 
+ a += 1; + continue 'LOOP; + } + (Some(range1), None) | (None, Some(range1)) => range1, + (Some(range1), Some(range2)) => { + self.ranges.push(range1); + range2 + } + }; + // It's possible that the `b` range has more to contribute + // here. In particular, if it is greater than the original + // range, then it might impact the next `a` range *and* it + // has impacted the current `a` range as much as possible, + // so we can quit. We don't bump `b` so that the next `a` + // range can apply it. + if other.ranges[b].upper() > old_range.upper() { + break; + } + // Otherwise, the next `b` range might apply to the current + // `a` range. + b += 1; + } + self.ranges.push(range); + a += 1; + } + while a < drain_end { + let range = self.ranges[a]; + self.ranges.push(range); + a += 1; + } + self.ranges.drain(..drain_end); + } + + /// Compute the symmetric difference of the two sets, in place. + /// + /// This computes the symmetric difference of two interval sets. This + /// removes all elements in this set that are also in the given set, + /// but also adds all elements from the given set that aren't in this + /// set. That is, the set will contain all elements in either set, + /// but will not contain any elements that are in both sets. + pub fn symmetric_difference(&mut self, other: &IntervalSet) { + // TODO(burntsushi): Fix this so that it amortizes allocation. + let mut intersection = self.clone(); + intersection.intersect(other); + self.union(other); + self.difference(&intersection); + } + + /// Negate this interval set. + /// + /// For all `x` where `x` is any element, if `x` was in this set, then it + /// will not be in this set after negation. + pub fn negate(&mut self) { + if self.ranges.is_empty() { + let (min, max) = (I::Bound::min_value(), I::Bound::max_value()); + self.ranges.push(I::create(min, max)); + return; + } + + // There should be a way to do this in-place with constant memory, + // but I couldn't figure out a simple way to do it. 
So just append + // the negation to the end of this range, and then drain it before + // we're done. + let drain_end = self.ranges.len(); + + // We do checked arithmetic below because of the canonical ordering + // invariant. + if self.ranges[0].lower() > I::Bound::min_value() { + let upper = self.ranges[0].lower().decrement(); + self.ranges.push(I::create(I::Bound::min_value(), upper)); + } + for i in 1..drain_end { + let lower = self.ranges[i - 1].upper().increment(); + let upper = self.ranges[i].lower().decrement(); + self.ranges.push(I::create(lower, upper)); + } + if self.ranges[drain_end - 1].upper() < I::Bound::max_value() { + let lower = self.ranges[drain_end - 1].upper().increment(); + self.ranges.push(I::create(lower, I::Bound::max_value())); + } + self.ranges.drain(..drain_end); + } + + /// Converts this set into a canonical ordering. + fn canonicalize(&mut self) { + if self.is_canonical() { + return; + } + self.ranges.sort(); + assert!(!self.ranges.is_empty()); + + // Is there a way to do this in-place with constant memory? I couldn't + // figure out a way to do it. So just append the canonicalization to + // the end of this range, and then drain it before we're done. + let drain_end = self.ranges.len(); + for oldi in 0..drain_end { + // If we've added at least one new range, then check if we can + // merge this range in the previously added range. + if self.ranges.len() > drain_end { + let (last, rest) = self.ranges.split_last_mut().unwrap(); + if let Some(union) = last.union(&rest[oldi]) { + *last = union; + continue; + } + } + let range = self.ranges[oldi]; + self.ranges.push(range); + } + self.ranges.drain(..drain_end); + } + + /// Returns true if and only if this class is in a canonical ordering. + fn is_canonical(&self) -> bool { + for pair in self.ranges.windows(2) { + if pair[0] >= pair[1] { + return false; + } + if pair[0].is_contiguous(&pair[1]) { + return false; + } + } + true + } +} + +/// An iterator over intervals. 
+#[derive(Debug)] +pub struct IntervalSetIter<'a, I: 'a>(slice::Iter<'a, I>); + +impl<'a, I> Iterator for IntervalSetIter<'a, I> { + type Item = &'a I; + + fn next(&mut self) -> Option<&'a I> { + self.0.next() + } +} + +pub trait Interval: + Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord +{ + type Bound: Bound; + + fn lower(&self) -> Self::Bound; + fn upper(&self) -> Self::Bound; + fn set_lower(&mut self, bound: Self::Bound); + fn set_upper(&mut self, bound: Self::Bound); + fn case_fold_simple(&self, intervals: &mut Vec); + + /// Create a new interval. + fn create(lower: Self::Bound, upper: Self::Bound) -> Self { + let mut int = Self::default(); + if lower <= upper { + int.set_lower(lower); + int.set_upper(upper); + } else { + int.set_lower(upper); + int.set_upper(lower); + } + int + } + + /// Union the given overlapping range into this range. + /// + /// If the two ranges aren't contiguous, then this returns `None`. + fn union(&self, other: &Self) -> Option { + if !self.is_contiguous(other) { + return None; + } + let lower = cmp::min(self.lower(), other.lower()); + let upper = cmp::max(self.upper(), other.upper()); + Some(Self::create(lower, upper)) + } + + /// Intersect this range with the given range and return the result. + /// + /// If the intersection is empty, then this returns `None`. + fn intersect(&self, other: &Self) -> Option { + let lower = cmp::max(self.lower(), other.lower()); + let upper = cmp::min(self.upper(), other.upper()); + if lower <= upper { + Some(Self::create(lower, upper)) + } else { + None + } + } + + /// Subtract the given range from this range and return the resulting + /// ranges. + /// + /// If subtraction would result in an empty range, then no ranges are + /// returned. 
+ fn difference(&self, other: &Self) -> (Option, Option) { + if self.is_subset(other) { + return (None, None); + } + if self.is_intersection_empty(other) { + return (Some(self.clone()), None); + } + let add_lower = other.lower() > self.lower(); + let add_upper = other.upper() < self.upper(); + // We know this because !self.is_subset(other) and the ranges have + // a non-empty intersection. + assert!(add_lower || add_upper); + let mut ret = (None, None); + if add_lower { + let upper = other.lower().decrement(); + ret.0 = Some(Self::create(self.lower(), upper)); + } + if add_upper { + let lower = other.upper().increment(); + let range = Self::create(lower, self.upper()); + if ret.0.is_none() { + ret.0 = Some(range); + } else { + ret.1 = Some(range); + } + } + ret + } + + /// Compute the symmetric difference the given range from this range. This + /// returns the union of the two ranges minus its intersection. + fn symmetric_difference( + &self, + other: &Self, + ) -> (Option, Option) { + let union = match self.union(other) { + None => return (Some(self.clone()), Some(other.clone())), + Some(union) => union, + }; + let intersection = match self.intersect(other) { + None => return (Some(self.clone()), Some(other.clone())), + Some(intersection) => intersection, + }; + union.difference(&intersection) + } + + /// Returns true if and only if the two ranges are contiguous. Two ranges + /// are contiguous if and only if the ranges are either overlapping or + /// adjacent. + fn is_contiguous(&self, other: &Self) -> bool { + let lower1 = self.lower().as_u32(); + let upper1 = self.upper().as_u32(); + let lower2 = other.lower().as_u32(); + let upper2 = other.upper().as_u32(); + cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1) + } + + /// Returns true if and only if the intersection of this range and the + /// other range is empty. 
+ fn is_intersection_empty(&self, other: &Self) -> bool { + let (lower1, upper1) = (self.lower(), self.upper()); + let (lower2, upper2) = (other.lower(), other.upper()); + cmp::max(lower1, lower2) > cmp::min(upper1, upper2) + } + + /// Returns true if and only if this range is a subset of the other range. + fn is_subset(&self, other: &Self) -> bool { + let (lower1, upper1) = (self.lower(), self.upper()); + let (lower2, upper2) = (other.lower(), other.upper()); + (lower2 <= lower1 && lower1 <= upper2) + && (lower2 <= upper1 && upper1 <= upper2) + } +} + +pub trait Bound: Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord { + fn min_value() -> Self; + fn max_value() -> Self; + fn as_u32(self) -> u32; + fn increment(self) -> Self; + fn decrement(self) -> Self; +} + +impl Bound for u8 { + fn min_value() -> Self { u8::MIN } + fn max_value() -> Self { u8::MAX } + fn as_u32(self) -> u32 { self as u32 } + fn increment(self) -> Self { self.checked_add(1).unwrap() } + fn decrement(self) -> Self { self.checked_sub(1).unwrap() } +} + +impl Bound for char { + fn min_value() -> Self { '\x00' } + fn max_value() -> Self { '\u{10FFFF}' } + fn as_u32(self) -> u32 { self as u32 } + + fn increment(self) -> Self { + match self { + '\u{D7FF}' => '\u{E000}', + c => char::from_u32((c as u32).checked_add(1).unwrap()).unwrap(), + } + } + + fn decrement(self) -> Self { + match self { + '\u{E000}' => '\u{D7FF}', + c => char::from_u32((c as u32).checked_sub(1).unwrap()).unwrap(), + } + } +} + +// Tests for interval sets are written in src/hir.rs against the public API. diff --git a/regex-syntax-2/src/hir/literal/mod.rs b/regex-syntax-2/src/hir/literal/mod.rs new file mode 100644 index 0000000000..3113ec970f --- /dev/null +++ b/regex-syntax-2/src/hir/literal/mod.rs @@ -0,0 +1,1553 @@ +// Copyright 2018 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. 
+// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/*! +Provides routines for extracting literal prefixes and suffixes from an `Hir`. +*/ + +use std::cmp; +use std::fmt; +use std::iter; +use std::mem; +use std::ops; + +use hir::{self, Hir, HirKind}; +use unicode; + +/// A set of literal byte strings extracted from a regular expression. +/// +/// Every member of the set is a `Literal`, which is represented by a +/// `Vec`. (Notably, it may contain invalid UTF-8.) Every member is +/// said to be either *complete* or *cut*. A complete literal means that +/// it extends until the beginning (or end) of the regular expression. In +/// some circumstances, this can be used to indicate a match in the regular +/// expression. +/// +/// A key aspect of literal extraction is knowing when to stop. It is not +/// feasible to blindly extract all literals from a regular expression, even if +/// there are finitely many. For example, the regular expression `[0-9]{10}` +/// has `10^10` distinct literals. For this reason, literal extraction is +/// bounded to some low number by default using heuristics, but the limits can +/// be tweaked. +/// +/// **WARNING**: Literal extraction uses stack space proportional to the size +/// of the `Hir` expression. At some point, this drawback will be eliminated. +/// To protect yourself, set a reasonable +/// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit). +/// This is done for you by default. +#[derive(Clone, Eq, PartialEq)] +pub struct Literals { + lits: Vec, + limit_size: usize, + limit_class: usize, +} + +/// A single member of a set of literals extracted from a regular expression. +/// +/// This type has `Deref` and `DerefMut` impls to `Vec` so that all slice +/// and `Vec` operations are available. 
+#[derive(Clone, Eq, Ord)] +pub struct Literal { + v: Vec, + cut: bool, +} + +impl Literals { + /// Returns a new empty set of literals using default limits. + pub fn empty() -> Literals { + Literals { + lits: vec![], + limit_size: 250, + limit_class: 10, + } + } + + /// Returns a set of literal prefixes extracted from the given `Hir`. + pub fn prefixes(expr: &Hir) -> Literals { + let mut lits = Literals::empty(); + lits.union_prefixes(expr); + lits + } + + /// Returns a set of literal suffixes extracted from the given `Hir`. + pub fn suffixes(expr: &Hir) -> Literals { + let mut lits = Literals::empty(); + lits.union_suffixes(expr); + lits + } + + /// Get the approximate size limit (in bytes) of this set. + pub fn limit_size(&self) -> usize { + self.limit_size + } + + /// Set the approximate size limit (in bytes) of this set. + /// + /// If extracting a literal would put the set over this limit, then + /// extraction stops. + /// + /// The new limits will only apply to additions to this set. Existing + /// members remain unchanged, even if the set exceeds the new limit. + pub fn set_limit_size(&mut self, size: usize) -> &mut Literals { + self.limit_size = size; + self + } + + /// Get the character class size limit for this set. + pub fn limit_class(&self) -> usize { + self.limit_class + } + + /// Limits the size of character(or byte) classes considered. + /// + /// A value of `0` prevents all character classes from being considered. + /// + /// This limit also applies to case insensitive literals, since each + /// character in the case insensitive literal is converted to a class, and + /// then case folded. + /// + /// The new limits will only apply to additions to this set. Existing + /// members remain unchanged, even if the set exceeds the new limit. + pub fn set_limit_class(&mut self, size: usize) -> &mut Literals { + self.limit_class = size; + self + } + + /// Returns the set of literals as a slice. Its order is unspecified. 
+ pub fn literals(&self) -> &[Literal] { + &self.lits + } + + /// Returns the length of the smallest literal. + /// + /// Returns None is there are no literals in the set. + pub fn min_len(&self) -> Option { + let mut min = None; + for lit in &self.lits { + match min { + None => min = Some(lit.len()), + Some(m) if lit.len() < m => min = Some(lit.len()), + _ => {} + } + } + min + } + + /// Returns true if all members in this set are complete. + pub fn all_complete(&self) -> bool { + !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut()) + } + + /// Returns true if any member in this set is complete. + pub fn any_complete(&self) -> bool { + self.lits.iter().any(|lit| !lit.is_cut()) + } + + /// Returns true if this set contains an empty literal. + pub fn contains_empty(&self) -> bool { + self.lits.iter().any(|lit| lit.is_empty()) + } + + /// Returns true if this set is empty or if all of its members is empty. + pub fn is_empty(&self) -> bool { + self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty()) + } + + /// Returns a new empty set of literals using this set's limits. + pub fn to_empty(&self) -> Literals { + let mut lits = Literals::empty(); + lits.set_limit_size(self.limit_size) + .set_limit_class(self.limit_class); + lits + } + + /// Returns the longest common prefix of all members in this set. + pub fn longest_common_prefix(&self) -> &[u8] { + if self.is_empty() { + return &[]; + } + let lit0 = &*self.lits[0]; + let mut len = lit0.len(); + for lit in &self.lits[1..] { + len = cmp::min( + len, + lit.iter() + .zip(lit0) + .take_while(|&(a, b)| a == b) + .count()); + } + &self.lits[0][..len] + } + + /// Returns the longest common suffix of all members in this set. + pub fn longest_common_suffix(&self) -> &[u8] { + if self.is_empty() { + return &[]; + } + let lit0 = &*self.lits[0]; + let mut len = lit0.len(); + for lit in &self.lits[1..] 
{ + len = cmp::min( + len, + lit.iter() + .rev() + .zip(lit0.iter().rev()) + .take_while(|&(a, b)| a == b) + .count()); + } + &self.lits[0][self.lits[0].len() - len..] + } + + /// Returns a new set of literals with the given number of bytes trimmed + /// from the suffix of each literal. + /// + /// If any literal would be cut out completely by trimming, then None is + /// returned. + /// + /// Any duplicates that are created as a result of this transformation are + /// removed. + pub fn trim_suffix(&self, num_bytes: usize) -> Option { + if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) { + return None; + } + let mut new = self.to_empty(); + for mut lit in self.lits.iter().cloned() { + let new_len = lit.len() - num_bytes; + lit.truncate(new_len); + lit.cut(); + new.lits.push(lit); + } + new.lits.sort(); + new.lits.dedup(); + Some(new) + } + + /// Returns a new set of prefixes of this set of literals that are + /// guaranteed to be unambiguous. + /// + /// Any substring match with a member of the set is returned is guaranteed + /// to never overlap with a substring match of another member of the set + /// at the same starting position. + /// + /// Given any two members of the returned set, neither is a substring of + /// the other. + pub fn unambiguous_prefixes(&self) -> Literals { + if self.lits.is_empty() { + return self.to_empty(); + } + let mut old: Vec = self.lits.iter().cloned().collect(); + let mut new = self.to_empty(); + 'OUTER: + while let Some(mut candidate) = old.pop() { + if candidate.is_empty() { + continue; + } + if new.lits.is_empty() { + new.lits.push(candidate); + continue; + } + for lit2 in &mut new.lits { + if lit2.is_empty() { + continue; + } + if &candidate == lit2 { + // If the literal is already in the set, then we can + // just drop it. But make sure that cut literals are + // infectious! 
+ candidate.cut = candidate.cut || lit2.cut; + lit2.cut = candidate.cut; + continue 'OUTER; + } + if candidate.len() < lit2.len() { + if let Some(i) = position(&candidate, &lit2) { + candidate.cut(); + let mut lit3 = lit2.clone(); + lit3.truncate(i); + lit3.cut(); + old.push(lit3); + lit2.clear(); + } + } else { + if let Some(i) = position(&lit2, &candidate) { + lit2.cut(); + let mut new_candidate = candidate.clone(); + new_candidate.truncate(i); + new_candidate.cut(); + old.push(new_candidate); + candidate.clear(); + } + } + // Oops, the candidate is already represented in the set. + if candidate.is_empty() { + continue 'OUTER; + } + } + new.lits.push(candidate); + } + new.lits.retain(|lit| !lit.is_empty()); + new.lits.sort(); + new.lits.dedup(); + new + } + + /// Returns a new set of suffixes of this set of literals that are + /// guaranteed to be unambiguous. + /// + /// Any substring match with a member of the set is returned is guaranteed + /// to never overlap with a substring match of another member of the set + /// at the same ending position. + /// + /// Given any two members of the returned set, neither is a substring of + /// the other. + pub fn unambiguous_suffixes(&self) -> Literals { + // This is a touch wasteful... + let mut lits = self.clone(); + lits.reverse(); + let mut unamb = lits.unambiguous_prefixes(); + unamb.reverse(); + unamb + } + + /// Unions the prefixes from the given expression to this set. + /// + /// If prefixes could not be added (for example, this set would exceed its + /// size limits or the set of prefixes from `expr` includes the empty + /// string), then false is returned. + /// + /// Note that prefix literals extracted from `expr` are said to be complete + /// if and only if the literal extends from the beginning of `expr` to the + /// end of `expr`. 
+ pub fn union_prefixes(&mut self, expr: &Hir) -> bool { + let mut lits = self.to_empty(); + prefixes(expr, &mut lits); + !lits.is_empty() && !lits.contains_empty() && self.union(lits) + } + + /// Unions the suffixes from the given expression to this set. + /// + /// If suffixes could not be added (for example, this set would exceed its + /// size limits or the set of suffixes from `expr` includes the empty + /// string), then false is returned. + /// + /// Note that prefix literals extracted from `expr` are said to be complete + /// if and only if the literal extends from the end of `expr` to the + /// beginning of `expr`. + pub fn union_suffixes(&mut self, expr: &Hir) -> bool { + let mut lits = self.to_empty(); + suffixes(expr, &mut lits); + lits.reverse(); + !lits.is_empty() && !lits.contains_empty() && self.union(lits) + } + + /// Unions this set with another set. + /// + /// If the union would cause the set to exceed its limits, then the union + /// is skipped and it returns false. Otherwise, if the union succeeds, it + /// returns true. + pub fn union(&mut self, lits: Literals) -> bool { + if self.num_bytes() + lits.num_bytes() > self.limit_size { + return false; + } + if lits.is_empty() { + self.lits.push(Literal::empty()); + } else { + self.lits.extend(lits.lits); + } + true + } + + /// Extends this set with another set. + /// + /// The set of literals is extended via a cross product. + /// + /// If a cross product would cause this set to exceed its limits, then the + /// cross product is skipped and it returns false. Otherwise, if the cross + /// product succeeds, it returns true. + pub fn cross_product(&mut self, lits: &Literals) -> bool { + if lits.is_empty() { + return true; + } + // Check that we make sure we stay in our limits. 
+ let mut size_after; + if self.is_empty() || !self.any_complete() { + size_after = self.num_bytes(); + for lits_lit in lits.literals() { + size_after += lits_lit.len(); + } + } else { + size_after = self.lits.iter().fold(0, |accum, lit| { + accum + if lit.is_cut() { lit.len() } else { 0 } + }); + for lits_lit in lits.literals() { + for self_lit in self.literals() { + if !self_lit.is_cut() { + size_after += self_lit.len() + lits_lit.len(); + } + } + } + } + if size_after > self.limit_size { + return false; + } + + let mut base = self.remove_complete(); + if base.is_empty() { + base = vec![Literal::empty()]; + } + for lits_lit in lits.literals() { + for mut self_lit in base.clone() { + self_lit.extend(&**lits_lit); + self_lit.cut = lits_lit.cut; + self.lits.push(self_lit); + } + } + true + } + + /// Extends each literal in this set with the bytes given. + /// + /// If the set is empty, then the given literal is added to the set. + /// + /// If adding any number of bytes to all members of this set causes a limit + /// to be exceeded, then no bytes are added and false is returned. If a + /// prefix of `bytes` can be fit into this set, then it is used and all + /// resulting literals are cut. + pub fn cross_add(&mut self, bytes: &[u8]) -> bool { + // N.B. This could be implemented by simply calling cross_product with + // a literal set containing just `bytes`, but we can be smarter about + // taking shorter prefixes of `bytes` if they'll fit. 
+ if bytes.is_empty() { + return true; + } + if self.lits.is_empty() { + let i = cmp::min(self.limit_size, bytes.len()); + self.lits.push(Literal::new(bytes[..i].to_owned())); + self.lits[0].cut = i < bytes.len(); + return !self.lits[0].is_cut(); + } + let size = self.num_bytes(); + if size + self.lits.len() >= self.limit_size { + return false; + } + let mut i = 1; + while size + (i * self.lits.len()) <= self.limit_size + && i < bytes.len() { + i += 1; + } + for lit in &mut self.lits { + if !lit.is_cut() { + lit.extend(&bytes[..i]); + if i < bytes.len() { + lit.cut(); + } + } + } + true + } + + /// Adds the given literal to this set. + /// + /// Returns false if adding this literal would cause the class to be too + /// big. + pub fn add(&mut self, lit: Literal) -> bool { + if self.num_bytes() + lit.len() > self.limit_size { + return false; + } + self.lits.push(lit); + true + } + + /// Extends each literal in this set with the character class given. + /// + /// Returns false if the character class was too big to add. + pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool { + self._add_char_class(cls, false) + } + + /// Extends each literal in this set with the character class given, + /// writing the bytes of each character in reverse. + /// + /// Returns false if the character class was too big to add. 
+ fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool { + self._add_char_class(cls, true) + } + + fn _add_char_class( + &mut self, + cls: &hir::ClassUnicode, + reverse: bool, + ) -> bool { + use std::char; + + if self.class_exceeds_limits(cls_char_count(cls)) { + return false; + } + let mut base = self.remove_complete(); + if base.is_empty() { + base = vec![Literal::empty()]; + } + for r in cls.iter() { + let (s, e) = (r.start as u32, r.end as u32 + 1); + for c in (s..e).filter_map(char::from_u32) { + for mut lit in base.clone() { + let mut bytes = c.to_string().into_bytes(); + if reverse { + bytes.reverse(); + } + lit.extend(&bytes); + self.lits.push(lit); + } + } + } + true + } + + /// Extends each literal in this set with the byte class given. + /// + /// Returns false if the byte class was too big to add. + pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool { + if self.class_exceeds_limits(cls_byte_count(cls)) { + return false; + } + let mut base = self.remove_complete(); + if base.is_empty() { + base = vec![Literal::empty()]; + } + for r in cls.iter() { + let (s, e) = (r.start as u32, r.end as u32 + 1); + for b in (s..e).map(|b| b as u8) { + for mut lit in base.clone() { + lit.push(b); + self.lits.push(lit); + } + } + } + true + } + + /// Cuts every member of this set. When a member is cut, it can never + /// be extended. + pub fn cut(&mut self) { + for lit in &mut self.lits { + lit.cut(); + } + } + + /// Reverses all members in place. + pub fn reverse(&mut self) { + for lit in &mut self.lits { + lit.reverse(); + } + } + + /// Clears this set of all members. + pub fn clear(&mut self) { + self.lits.clear(); + } + + /// Pops all complete literals out of this set. + fn remove_complete(&mut self) -> Vec { + let mut base = vec![]; + for lit in mem::replace(&mut self.lits, vec![]) { + if lit.is_cut() { + self.lits.push(lit); + } else { + base.push(lit); + } + } + base + } + + /// Returns the total number of bytes in this set. 
+ fn num_bytes(&self) -> usize { + self.lits.iter().fold(0, |accum, lit| accum + lit.len()) + } + + /// Returns true if a character class with the given size would cause this + /// set to exceed its limits. + /// + /// The size given should correspond to the number of items in the class. + fn class_exceeds_limits(&self, size: usize) -> bool { + if size > self.limit_class { + return true; + } + // This is an approximation since codepoints in a char class can encode + // to 1-4 bytes. + let new_byte_count = + if self.lits.is_empty() { + size + } else { + self.lits + .iter() + .fold(0, |accum, lit| { + accum + if lit.is_cut() { + // If the literal is cut, then we'll never add + // anything to it, so don't count it. + 0 + } else { + (lit.len() + 1) * size + } + }) + }; + new_byte_count > self.limit_size + } +} + +fn prefixes(expr: &Hir, lits: &mut Literals) { + match *expr.kind() { + HirKind::Literal(hir::Literal::Unicode(c)) => { + let mut buf = [0u8; 4]; + let i = unicode::encode_utf8(c, &mut buf).unwrap(); + lits.cross_add(&buf[..i]); + } + HirKind::Literal(hir::Literal::Byte(b)) => { + lits.cross_add(&[b]); + } + HirKind::Class(hir::Class::Unicode(ref cls)) => { + if !lits.add_char_class(cls) { + lits.cut(); + } + } + HirKind::Class(hir::Class::Bytes(ref cls)) => { + if !lits.add_byte_class(cls) { + lits.cut(); + } + } + HirKind::Group(hir::Group { ref hir, .. 
}) => { + prefixes(&**hir, lits); + } + HirKind::Repetition(ref x) => { + match x.kind { + hir::RepetitionKind::ZeroOrOne => { + repeat_zero_or_one_literals(&x.hir, lits, prefixes); + } + hir::RepetitionKind::ZeroOrMore => { + repeat_zero_or_more_literals(&x.hir, lits, prefixes); + } + hir::RepetitionKind::OneOrMore => { + repeat_one_or_more_literals(&x.hir, lits, prefixes); + } + hir::RepetitionKind::Range(ref rng) => { + let (min, max) = match *rng { + hir::RepetitionRange::Exactly(m) => { + (m, Some(m)) + } + hir::RepetitionRange::AtLeast(m) => { + (m, None) + } + hir::RepetitionRange::Bounded(m, n) => { + (m, Some(n)) + } + }; + repeat_range_literals( + &x.hir, min, max, x.greedy, lits, prefixes) + } + } + } + HirKind::Concat(ref es) if es.is_empty() => {} + HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits), + HirKind::Concat(ref es) => { + for e in es { + if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() { + if !lits.is_empty() { + lits.cut(); + break; + } + lits.add(Literal::empty()); + continue; + } + let mut lits2 = lits.to_empty(); + prefixes(e, &mut lits2); + if !lits.cross_product(&lits2) || !lits2.any_complete() { + // If this expression couldn't yield any literal that + // could be extended, then we need to quit. Since we're + // short-circuiting, we also need to freeze every member. 
+ lits.cut(); + break; + } + } + } + HirKind::Alternation(ref es) => { + alternate_literals(es, lits, prefixes); + } + _ => lits.cut(), + } +} + +fn suffixes(expr: &Hir, lits: &mut Literals) { + match *expr.kind() { + HirKind::Literal(hir::Literal::Unicode(c)) => { + let mut buf = [0u8; 4]; + let i = unicode::encode_utf8(c, &mut buf).unwrap(); + let mut buf = &mut buf[..i]; + buf.reverse(); + lits.cross_add(buf); + } + HirKind::Literal(hir::Literal::Byte(b)) => { + lits.cross_add(&[b]); + } + HirKind::Class(hir::Class::Unicode(ref cls)) => { + if !lits.add_char_class_reverse(cls) { + lits.cut(); + } + } + HirKind::Class(hir::Class::Bytes(ref cls)) => { + if !lits.add_byte_class(cls) { + lits.cut(); + } + } + HirKind::Group(hir::Group { ref hir, .. }) => { + suffixes(&**hir, lits); + } + HirKind::Repetition(ref x) => { + match x.kind { + hir::RepetitionKind::ZeroOrOne => { + repeat_zero_or_one_literals(&x.hir, lits, suffixes); + } + hir::RepetitionKind::ZeroOrMore => { + repeat_zero_or_more_literals(&x.hir, lits, suffixes); + } + hir::RepetitionKind::OneOrMore => { + repeat_one_or_more_literals(&x.hir, lits, suffixes); + } + hir::RepetitionKind::Range(ref rng) => { + let (min, max) = match *rng { + hir::RepetitionRange::Exactly(m) => { + (m, Some(m)) + } + hir::RepetitionRange::AtLeast(m) => { + (m, None) + } + hir::RepetitionRange::Bounded(m, n) => { + (m, Some(n)) + } + }; + repeat_range_literals( + &x.hir, min, max, x.greedy, lits, suffixes) + } + } + } + HirKind::Concat(ref es) if es.is_empty() => {} + HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits), + HirKind::Concat(ref es) => { + for e in es.iter().rev() { + if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() { + if !lits.is_empty() { + lits.cut(); + break; + } + lits.add(Literal::empty()); + continue; + } + let mut lits2 = lits.to_empty(); + suffixes(e, &mut lits2); + if !lits.cross_product(&lits2) || !lits2.any_complete() { + // If this expression couldn't yield any literal that + // 
could be extended, then we need to quit. Since we're + // short-circuiting, we also need to freeze every member. + lits.cut(); + break; + } + } + } + HirKind::Alternation(ref es) => { + alternate_literals(es, lits, suffixes); + } + _ => lits.cut(), + } +} + +fn repeat_zero_or_one_literals( + e: &Hir, + lits: &mut Literals, + mut f: F, +) { + let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty()); + lits3.set_limit_size(lits.limit_size() / 2); + f(e, &mut lits3); + + if lits3.is_empty() || !lits2.cross_product(&lits3) { + lits.cut(); + return; + } + lits2.add(Literal::empty()); + if !lits.union(lits2) { + lits.cut(); + } +} + +fn repeat_zero_or_more_literals( + e: &Hir, + lits: &mut Literals, + mut f: F, +) { + let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty()); + lits3.set_limit_size(lits.limit_size() / 2); + f(e, &mut lits3); + + if lits3.is_empty() || !lits2.cross_product(&lits3) { + lits.cut(); + return; + } + lits2.cut(); + lits2.add(Literal::empty()); + if !lits.union(lits2) { + lits.cut(); + } +} + +fn repeat_one_or_more_literals( + e: &Hir, + lits: &mut Literals, + mut f: F, +) { + f(e, lits); + lits.cut(); +} + +fn repeat_range_literals( + e: &Hir, + min: u32, + max: Option, + greedy: bool, + lits: &mut Literals, + mut f: F, +) { + if min == 0 { + // This is a bit conservative. If `max` is set, then we could + // treat this as a finite set of alternations. For now, we + // just treat it as `e*`. 
+ f(&Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrMore, + greedy: greedy, + hir: Box::new(e.clone()), + }), lits); + } else { + if min > 0 { + let n = cmp::min(lits.limit_size, min as usize); + let es = iter::repeat(e.clone()).take(n).collect(); + f(&Hir::concat(es), lits); + if n < min as usize || lits.contains_empty() { + lits.cut(); + } + } + if max.map_or(true, |max| min < max) { + lits.cut(); + } + } +} + +fn alternate_literals( + es: &[Hir], + lits: &mut Literals, + mut f: F, +) { + let mut lits2 = lits.to_empty(); + for e in es { + let mut lits3 = lits.to_empty(); + lits3.set_limit_size(lits.limit_size() / 5); + f(e, &mut lits3); + if lits3.is_empty() || !lits2.union(lits3) { + // If we couldn't find suffixes for *any* of the + // alternates, then the entire alternation has to be thrown + // away and any existing members must be frozen. Similarly, + // if the union couldn't complete, stop and freeze. + lits.cut(); + return; + } + } + if !lits.cross_product(&lits2) { + lits.cut(); + } +} + +impl fmt::Debug for Literals { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("Literals") + .field("lits", &self.lits) + .field("limit_size", &self.limit_size) + .field("limit_class", &self.limit_class) + .finish() + } +} + +impl Literal { + /// Returns a new complete literal with the bytes given. + pub fn new(bytes: Vec) -> Literal { + Literal { v: bytes, cut: false } + } + + /// Returns a new complete empty literal. + pub fn empty() -> Literal { + Literal { v: vec![], cut: false } + } + + /// Returns true if this literal was "cut." + pub fn is_cut(&self) -> bool { + self.cut + } + + /// Cuts this literal. 
+ pub fn cut(&mut self) { + self.cut = true; + } +} + +impl PartialEq for Literal { + fn eq(&self, other: &Literal) -> bool { + self.v == other.v + } +} + +impl PartialOrd for Literal { + fn partial_cmp(&self, other: &Literal) -> Option { + self.v.partial_cmp(&other.v) + } +} + +impl fmt::Debug for Literal { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if self.is_cut() { + write!(f, "Cut({})", escape_unicode(&self.v)) + } else { + write!(f, "Complete({})", escape_unicode(&self.v)) + } + } +} + +impl AsRef<[u8]> for Literal { + fn as_ref(&self) -> &[u8] { &self.v } +} + +impl ops::Deref for Literal { + type Target = Vec; + fn deref(&self) -> &Vec { &self.v } +} + +impl ops::DerefMut for Literal { + fn deref_mut(&mut self) -> &mut Vec { &mut self.v } +} + +fn position(needle: &[u8], mut haystack: &[u8]) -> Option { + let mut i = 0; + while haystack.len() >= needle.len() { + if needle == &haystack[..needle.len()] { + return Some(i); + } + i += 1; + haystack = &haystack[1..]; + } + None +} + +fn escape_unicode(bytes: &[u8]) -> String { + let show = match ::std::str::from_utf8(bytes) { + Ok(v) => v.to_string(), + Err(_) => escape_bytes(bytes), + }; + let mut space_escaped = String::new(); + for c in show.chars() { + if c.is_whitespace() { + let escaped = if c as u32 <= 0x7F { + escape_byte(c as u8) + } else { + if c as u32 <= 0xFFFF { + format!(r"\u{{{:04x}}}", c as u32) + } else { + format!(r"\U{{{:08x}}}", c as u32) + } + }; + space_escaped.push_str(&escaped); + } else { + space_escaped.push(c); + } + } + space_escaped +} + +fn escape_bytes(bytes: &[u8]) -> String { + let mut s = String::new(); + for &b in bytes { + s.push_str(&escape_byte(b)); + } + s +} + +fn escape_byte(byte: u8) -> String { + use std::ascii::escape_default; + + let escaped: Vec = escape_default(byte).collect(); + String::from_utf8_lossy(&escaped).into_owned() +} + +fn cls_char_count(cls: &hir::ClassUnicode) -> usize { + cls.iter() + .map(|&r| 1 + (r.end as u32) - (r.start as u32)) + 
.sum::() as usize +} + +fn cls_byte_count(cls: &hir::ClassBytes) -> usize { + cls.iter() + .map(|&r| 1 + (r.end as u32) - (r.start as u32)) + .sum::() as usize +} + +#[cfg(test)] +mod tests { + use std::fmt; + + use ParserBuilder; + use hir::Hir; + use super::{Literals, Literal, escape_bytes}; + + // To make test failures easier to read. + #[derive(Debug, Eq, PartialEq)] + struct Bytes(Vec); + #[derive(Debug, Eq, PartialEq)] + struct Unicode(Vec); + + fn escape_lits(blits: &[Literal]) -> Vec { + let mut ulits = vec![]; + for blit in blits { + ulits.push(ULiteral { + v: escape_bytes(&blit), + cut: blit.is_cut(), + }); + } + ulits + } + + fn create_lits>(it: I) -> Literals { + Literals { + lits: it.into_iter().collect(), + limit_size: 0, + limit_class: 0, + } + } + + // Needs to be pub for 1.3? + #[derive(Clone, Eq, PartialEq)] + pub struct ULiteral { + v: String, + cut: bool, + } + + impl ULiteral { + fn is_cut(&self) -> bool { self.cut } + } + + impl fmt::Debug for ULiteral { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if self.is_cut() { + write!(f, "Cut({})", self.v) + } else { + write!(f, "Complete({})", self.v) + } + } + } + + impl PartialEq for ULiteral { + fn eq(&self, other: &Literal) -> bool { + self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut() + } + } + + impl PartialEq for Literal { + fn eq(&self, other: &ULiteral) -> bool { + &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut() + } + } + + #[allow(non_snake_case)] + fn C(s: &'static str) -> ULiteral { + ULiteral { v: s.to_owned(), cut: true } + } + #[allow(non_snake_case)] + fn M(s: &'static str) -> ULiteral { + ULiteral { v: s.to_owned(), cut: false } + } + + fn prefixes(lits: &mut Literals, expr: &Hir) { + lits.union_prefixes(expr); + } + + fn suffixes(lits: &mut Literals, expr: &Hir) { + lits.union_suffixes(expr); + } + + macro_rules! 
assert_lit_eq { + ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{ + let expected: Vec = vec![$($expected_lit),*]; + let lits = $got_lits; + assert_eq!( + $which(expected.clone()), + $which(escape_lits(lits.literals()))); + assert_eq!( + !expected.is_empty() && expected.iter().all(|l| !l.is_cut()), + lits.all_complete()); + assert_eq!( + expected.iter().any(|l| !l.is_cut()), + lits.any_complete()); + }}; + } + + macro_rules! test_lit { + ($name:ident, $which:ident, $re:expr) => { + test_lit!($name, $which, $re,); + }; + ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { + #[test] + fn $name() { + let expr = ParserBuilder::new() + .build() + .parse($re) + .unwrap(); + let lits = Literals::$which(&expr); + assert_lit_eq!(Unicode, lits, $($lit),*); + + let expr = ParserBuilder::new() + .allow_invalid_utf8(true) + .unicode(false) + .build() + .parse($re) + .unwrap(); + let lits = Literals::$which(&expr); + assert_lit_eq!(Bytes, lits, $($lit),*); + } + }; + } + + // ************************************************************************ + // Tests for prefix literal extraction. + // ************************************************************************ + + // Elementary tests. 
+ test_lit!(pfx_one_lit1, prefixes, "a", M("a")); + test_lit!(pfx_one_lit2, prefixes, "abc", M("abc")); + test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83")); + test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83")); + test_lit!(pfx_class1, prefixes, "[1-4]", + M("1"), M("2"), M("3"), M("4")); + test_lit!(pfx_class2, prefixes, "(?u)[☃Ⅰ]", + M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83")); + test_lit!(pfx_class3, prefixes, "(?ui)[☃Ⅰ]", + M("\\xe2\\x85\\xa0"), M("\\xe2\\x85\\xb0"), + M("\\xe2\\x98\\x83")); + test_lit!(pfx_one_lit_casei1, prefixes, "(?i)a", + M("A"), M("a")); + test_lit!(pfx_one_lit_casei2, prefixes, "(?i)abc", + M("ABC"), M("aBC"), M("AbC"), M("abC"), + M("ABc"), M("aBc"), M("Abc"), M("abc")); + test_lit!(pfx_group1, prefixes, "(a)", M("a")); + test_lit!(pfx_rep_zero_or_one1, prefixes, "a?"); + test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?"); + test_lit!(pfx_rep_zero_or_more1, prefixes, "a*"); + test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*"); + test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a")); + test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc")); + test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a")); + test_lit!(pfx_rep_range1, prefixes, "a{0}"); + test_lit!(pfx_rep_range2, prefixes, "a{0,}"); + test_lit!(pfx_rep_range3, prefixes, "a{0,1}"); + test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a")); + test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa")); + test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a")); + test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa")); + + // Test regexes with concatenations. 
+ test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab")); + test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz")); + test_lit!(pfx_cat3, prefixes, "(?i)[ab]z", + M("AZ"), M("BZ"), M("aZ"), M("bZ"), + M("Az"), M("Bz"), M("az"), M("bz")); + test_lit!(pfx_cat4, prefixes, "[ab][yz]", + M("ay"), M("by"), M("az"), M("bz")); + test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b")); + test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c")); + test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c")); + test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b")); + test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b")); + test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a")); + test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac")); + test_lit!(pfx_cat12, prefixes, "ab+", C("ab")); + test_lit!(pfx_cat13, prefixes, "ab+c", C("ab")); + test_lit!(pfx_cat14, prefixes, "a^", C("a")); + test_lit!(pfx_cat15, prefixes, "$a"); + test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac")); + test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab")); + test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb")); + test_lit!(pfx_cat19, prefixes, "a.z", C("a")); + + // Test regexes with alternations. + test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b")); + test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); + test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz")); + test_lit!(pfx_alt4, prefixes, "a|b*"); + test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b")); + test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)"); + test_lit!(pfx_alt7, prefixes, "(a|b)*c|(a|ab)*c", + C("a"), C("b"), M("c"), C("a"), C("ab"), M("c")); + test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c")); + + // Test regexes with empty assertions. 
+ test_lit!(pfx_empty1, prefixes, "^a", M("a")); + test_lit!(pfx_empty2, prefixes, "a${2}", C("a")); + test_lit!(pfx_empty3, prefixes, "^abc", M("abc")); + test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z")); + + // Make sure some curious regexes have no prefixes. + test_lit!(pfx_nothing1, prefixes, "."); + test_lit!(pfx_nothing2, prefixes, "(?s)."); + test_lit!(pfx_nothing3, prefixes, "^"); + test_lit!(pfx_nothing4, prefixes, "$"); + test_lit!(pfx_nothing6, prefixes, "(?m)$"); + test_lit!(pfx_nothing7, prefixes, r"\b"); + test_lit!(pfx_nothing8, prefixes, r"\B"); + + // Test a few regexes that defeat any prefix literal detection. + test_lit!(pfx_defeated1, prefixes, ".a"); + test_lit!(pfx_defeated2, prefixes, "(?s).a"); + test_lit!(pfx_defeated3, prefixes, "a*b*c*"); + test_lit!(pfx_defeated4, prefixes, "a|."); + test_lit!(pfx_defeated5, prefixes, ".|a"); + test_lit!(pfx_defeated6, prefixes, "a|^"); + test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))"); + test_lit!(pfx_defeated8, prefixes, "$a"); + test_lit!(pfx_defeated9, prefixes, "(?m)$a"); + test_lit!(pfx_defeated10, prefixes, r"\ba"); + test_lit!(pfx_defeated11, prefixes, r"\Ba"); + test_lit!(pfx_defeated12, prefixes, "^*a"); + test_lit!(pfx_defeated13, prefixes, "^+a"); + + test_lit!( + pfx_crazy1, + prefixes, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + C("Mo\\'am"), C("Mu\\'am"), C("Moam"), C("Muam")); + + // ************************************************************************ + // Tests for quitting prefix literal search. + // ************************************************************************ + + macro_rules!
test_exhausted { + ($name:ident, $which:ident, $re:expr) => { + test_exhausted!($name, $which, $re,); + }; + ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { + #[test] + fn $name() { + let expr = ParserBuilder::new() + .build() + .parse($re) + .unwrap(); + let mut lits = Literals::empty(); + lits.set_limit_size(20).set_limit_class(10); + $which(&mut lits, &expr); + assert_lit_eq!(Unicode, lits, $($lit),*); + + let expr = ParserBuilder::new() + .allow_invalid_utf8(true) + .unicode(false) + .build() + .parse($re) + .unwrap(); + let mut lits = Literals::empty(); + lits.set_limit_size(20).set_limit_class(10); + $which(&mut lits, &expr); + assert_lit_eq!(Bytes, lits, $($lit),*); + } + }; + } + + // These tests use a much lower limit than the default so that we can + // write test cases of reasonable size. + test_exhausted!(pfx_exhausted1, prefixes, "[a-z]"); + test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A"); + test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A")); + test_exhausted!(pfx_exhausted4, prefixes, "(?i)foobar", + C("FO"), C("fO"), C("Fo"), C("fo")); + test_exhausted!(pfx_exhausted5, prefixes, "(?:ab){100}", + C("abababababababababab")); + test_exhausted!(pfx_exhausted6, prefixes, "(?:(?:ab){100})*cd", + C("ababababab"), M("cd")); + test_exhausted!(pfx_exhausted7, prefixes, "z(?:(?:ab){100})*cd", + C("zababababab"), M("zcd")); + test_exhausted!(pfx_exhausted8, prefixes, "aaaaaaaaaaaaaaaaaaaaz", + C("aaaaaaaaaaaaaaaaaaaa")); + + // ************************************************************************ + // Tests for suffix literal extraction. + // ************************************************************************ + + // Elementary tests.
+ test_lit!(sfx_one_lit1, suffixes, "a", M("a")); + test_lit!(sfx_one_lit2, suffixes, "abc", M("abc")); + test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83")); + test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83")); + test_lit!(sfx_class1, suffixes, "[1-4]", + M("1"), M("2"), M("3"), M("4")); + test_lit!(sfx_class2, suffixes, "(?u)[☃Ⅰ]", + M("\\xe2\\x85\\xa0"), M("\\xe2\\x98\\x83")); + test_lit!(sfx_class3, suffixes, "(?ui)[☃Ⅰ]", + M("\\xe2\\x85\\xa0"), M("\\xe2\\x85\\xb0"), + M("\\xe2\\x98\\x83")); + test_lit!(sfx_one_lit_casei1, suffixes, "(?i)a", + M("A"), M("a")); + test_lit!(sfx_one_lit_casei2, suffixes, "(?i)abc", + M("ABC"), M("ABc"), M("AbC"), M("Abc"), + M("aBC"), M("aBc"), M("abC"), M("abc")); + test_lit!(sfx_group1, suffixes, "(a)", M("a")); + test_lit!(sfx_rep_zero_or_one1, suffixes, "a?"); + test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?"); + test_lit!(sfx_rep_zero_or_more1, suffixes, "a*"); + test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*"); + test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a")); + test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc")); + test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a")); + test_lit!(sfx_rep_range1, suffixes, "a{0}"); + test_lit!(sfx_rep_range2, suffixes, "a{0,}"); + test_lit!(sfx_rep_range3, suffixes, "a{0,1}"); + test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a")); + test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa")); + test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a")); + test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa")); + + // Test regexes with concatenations. 
+ test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab")); + test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz")); + test_lit!(sfx_cat3, suffixes, "(?i)[ab]z", + M("AZ"), M("Az"), M("BZ"), M("Bz"), + M("aZ"), M("az"), M("bZ"), M("bz")); + test_lit!(sfx_cat4, suffixes, "[ab][yz]", + M("ay"), M("az"), M("by"), M("bz")); + test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b")); + test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c")); + test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c")); + test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc")); + test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b")); + test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a")); + test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac")); + test_lit!(sfx_cat12, suffixes, "ab+", C("b")); + test_lit!(sfx_cat13, suffixes, "ab+c", C("bc")); + test_lit!(sfx_cat14, suffixes, "a^"); + test_lit!(sfx_cat15, suffixes, "$a", C("a")); + test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac")); + test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc")); + test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb")); + test_lit!(sfx_cat19, suffixes, "a.z", C("z")); + + // Test regexes with alternations. + test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b")); + test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); + test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz")); + test_lit!(sfx_alt4, suffixes, "a|b*"); + test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b")); + test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)"); + test_lit!(sfx_alt7, suffixes, "(a|b)*c|(a|ab)*c", + C("ac"), C("bc"), M("c"), C("ac"), C("abc"), M("c")); + test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c")); + + // Test regexes with empty assertions. + test_lit!(sfx_empty1, suffixes, "a$", M("a")); + test_lit!(sfx_empty2, suffixes, "${2}a", C("a")); + + // Make sure some curious regexes have no suffixes. 
+ test_lit!(sfx_nothing1, suffixes, "."); + test_lit!(sfx_nothing2, suffixes, "(?s)."); + test_lit!(sfx_nothing3, suffixes, "^"); + test_lit!(sfx_nothing4, suffixes, "$"); + test_lit!(sfx_nothing6, suffixes, "(?m)$"); + test_lit!(sfx_nothing7, suffixes, r"\b"); + test_lit!(sfx_nothing8, suffixes, r"\B"); + + // Test a few regexes that defeat any suffix literal detection. + test_lit!(sfx_defeated1, suffixes, "a."); + test_lit!(sfx_defeated2, suffixes, "(?s)a."); + test_lit!(sfx_defeated3, suffixes, "a*b*c*"); + test_lit!(sfx_defeated4, suffixes, "a|."); + test_lit!(sfx_defeated5, suffixes, ".|a"); + test_lit!(sfx_defeated6, suffixes, "a|^"); + test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c))."); + test_lit!(sfx_defeated8, suffixes, "a^"); + test_lit!(sfx_defeated9, suffixes, "(?m)a$"); + test_lit!(sfx_defeated10, suffixes, r"a\b"); + test_lit!(sfx_defeated11, suffixes, r"a\B"); + test_lit!(sfx_defeated12, suffixes, "a^*"); + test_lit!(sfx_defeated13, suffixes, "a^+"); + + // These tests use a much lower limit than the default so that we can + // write test cases of reasonable size. + test_exhausted!(sfx_exhausted1, suffixes, "[a-z]"); + test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*"); + test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z")); + test_exhausted!(sfx_exhausted4, suffixes, "(?i)foobar", + C("AR"), C("Ar"), C("aR"), C("ar")); + test_exhausted!(sfx_exhausted5, suffixes, "(?:ab){100}", + C("abababababababababab")); + test_exhausted!(sfx_exhausted6, suffixes, "cd(?:(?:ab){100})*", + C("ababababab"), M("cd")); + test_exhausted!(sfx_exhausted7, suffixes, "cd(?:(?:ab){100})*z", + C("abababababz"), M("cdz")); + test_exhausted!(sfx_exhausted8, suffixes, "zaaaaaaaaaaaaaaaaaaaa", + C("aaaaaaaaaaaaaaaaaaaa")); + + // ************************************************************************ + // Tests for generating unambiguous literal sets. + // ************************************************************************ + + macro_rules!
test_unamb { + ($name:ident, $given:expr, $expected:expr) => { + #[test] + fn $name() { + let given: Vec = + $given + .into_iter() + .map(|ul| { + let cut = ul.is_cut(); + Literal { v: ul.v.into_bytes(), cut: cut } + }) + .collect(); + let lits = create_lits(given); + let got = lits.unambiguous_prefixes(); + assert_eq!($expected, escape_lits(got.literals())); + } + }; + } + + test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]); + test_unamb!(unambiguous2, + vec![M("zaaaaaa"), M("aa")], vec![C("aa"), C("z")]); + test_unamb!(unambiguous3, + vec![M("Sherlock"), M("Watson")], + vec![M("Sherlock"), M("Watson")]); + test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]); + test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]); + test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]); + test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]); + test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]); + test_unamb!(unambiguous9, + vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")], + vec![C("a"), C("b"), C("c")]); + test_unamb!(unambiguous10, + vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")], + vec![C("Mo"), C("Mu")]); + test_unamb!(unambiguous11, + vec![M("zazb"), M("azb")], vec![C("a"), C("z")]); + test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]); + test_unamb!(unambiguous13, + vec![M("ABCX"), M("CDAX"), M("BCX")], + vec![C("A"), C("BCX"), C("CD")]); + test_unamb!(unambiguous14, + vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")], + vec![M("DSX"), C("I"), C("MGX"), C("MV")]); + test_unamb!(unambiguous15, + vec![M("IMG_"), M("MG_"), M("CIMG")], + vec![C("C"), C("I"), C("MG_")]); + + + // ************************************************************************ + // Tests for suffix trimming. + // ************************************************************************ + macro_rules! 
test_trim { + ($name:ident, $trim:expr, $given:expr, $expected:expr) => { + #[test] + fn $name() { + let given: Vec = + $given + .into_iter() + .map(|ul| { + let cut = ul.is_cut(); + Literal { v: ul.v.into_bytes(), cut: cut } + }) + .collect(); + let lits = create_lits(given); + let got = lits.trim_suffix($trim).unwrap(); + assert_eq!($expected, escape_lits(got.literals())); + } + } + } + + test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]); + test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]); + test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]); + test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]); + + // ************************************************************************ + // Tests for longest common prefix. + // ************************************************************************ + + macro_rules! test_lcp { + ($name:ident, $given:expr, $expected:expr) => { + #[test] + fn $name() { + let given: Vec = + $given + .into_iter() + .map(|s: &str| Literal { + v: s.to_owned().into_bytes(), + cut: false, + }) + .collect(); + let lits = create_lits(given); + let got = lits.longest_common_prefix(); + assert_eq!($expected, escape_bytes(got)); + } + }; + } + + test_lcp!(lcp1, vec!["a"], "a"); + test_lcp!(lcp2, vec![], ""); + test_lcp!(lcp3, vec!["a", "b"], ""); + test_lcp!(lcp4, vec!["ab", "ab"], "ab"); + test_lcp!(lcp5, vec!["ab", "a"], "a"); + test_lcp!(lcp6, vec!["a", "ab"], "a"); + test_lcp!(lcp7, vec!["ab", "b"], ""); + test_lcp!(lcp8, vec!["b", "ab"], ""); + test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba"); + test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], ""); + test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], ""); + test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f"); + + // ************************************************************************ + // Tests for longest common suffix. + // ************************************************************************ + + macro_rules! 
test_lcs { + ($name:ident, $given:expr, $expected:expr) => { + #[test] + fn $name() { + let given: Vec = + $given + .into_iter() + .map(|s: &str| Literal { + v: s.to_owned().into_bytes(), + cut: false, + }) + .collect(); + let lits = create_lits(given); + let got = lits.longest_common_suffix(); + assert_eq!($expected, escape_bytes(got)); + } + }; + } + + test_lcs!(lcs1, vec!["a"], "a"); + test_lcs!(lcs2, vec![], ""); + test_lcs!(lcs3, vec!["a", "b"], ""); + test_lcs!(lcs4, vec!["ab", "ab"], "ab"); + test_lcs!(lcs5, vec!["ab", "a"], ""); + test_lcs!(lcs6, vec!["a", "ab"], ""); + test_lcs!(lcs7, vec!["ab", "b"], "b"); + test_lcs!(lcs8, vec!["b", "ab"], "b"); + test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo"); + test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], ""); + test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], ""); + test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b"); +} diff --git a/regex-syntax-2/src/hir/mod.rs b/regex-syntax-2/src/hir/mod.rs new file mode 100644 index 0000000000..8856a95c2c --- /dev/null +++ b/regex-syntax-2/src/hir/mod.rs @@ -0,0 +1,2040 @@ +// Copyright 2018 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/*! +Defines a high-level intermediate representation for regular expressions. +*/ +use std::char; +use std::cmp; +use std::error; +use std::fmt; +use std::u8; + +use ast::Span; +use hir::interval::{Interval, IntervalSet, IntervalSetIter}; +use unicode; + +pub use hir::visitor::{Visitor, visit}; + +mod interval; +pub mod literal; +pub mod translate; +mod visitor; + +/// An error that can occur while translating an `Ast` to a `Hir`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Error { + /// The kind of error. 
+ kind: ErrorKind, + /// The original pattern that the translator's Ast was parsed from. Every + /// span in an error is a valid range into this string. + pattern: String, + /// The span of this error, derived from the Ast given to the translator. + span: Span, +} + +impl Error { + /// Return the type of this error. + pub fn kind(&self) -> &ErrorKind { + &self.kind + } + + /// The original pattern string in which this error occurred. + /// + /// Every span reported by this error is reported in terms of this string. + pub fn pattern(&self) -> &str { + &self.pattern + } + + /// Return the span at which this error occurred. + pub fn span(&self) -> &Span { + &self.span + } +} + +/// The type of an error that occurred while building an `Hir`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ErrorKind { + /// This error occurs when a Unicode feature is used when Unicode + /// support is disabled. For example `(?-u:\pL)` would trigger this error. + UnicodeNotAllowed, + /// This error occurs when translating a pattern that could match a byte + /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled. + InvalidUtf8, + /// This occurs when an unrecognized Unicode property name could not + /// be found. + UnicodePropertyNotFound, + /// This occurs when an unrecognized Unicode property value could not + /// be found. + UnicodePropertyValueNotFound, + /// This occurs when the translator attempts to construct a character class + /// that is empty. + /// + /// Note that this restriction in the translator may be removed in the + /// future. + EmptyClassNotAllowed, + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) 
+ #[doc(hidden)] + __Nonexhaustive, +} + +impl ErrorKind { + fn description(&self) -> &str { + use self::ErrorKind::*; + match *self { + UnicodeNotAllowed => "Unicode not allowed here", + InvalidUtf8 => "pattern can match invalid UTF-8", + UnicodePropertyNotFound => "Unicode property not found", + UnicodePropertyValueNotFound => "Unicode property value not found", + EmptyClassNotAllowed => "empty character classes are not allowed", + _ => unreachable!(), + } + } +} + +impl error::Error for Error { + fn description(&self) -> &str { + self.kind.description() + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + ::error::Formatter::from(self).fmt(f) + } +} + +impl fmt::Display for ErrorKind { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(self.description()) + } +} + +/// A high-level intermediate representation (HIR) for a regular expression. +/// +/// The HIR of a regular expression represents an intermediate step between its +/// abstract syntax (a structured description of the concrete syntax) and +/// compiled byte codes. The purpose of HIR is to make regular expressions +/// easier to analyze. In particular, the AST is much more complex than the +/// HIR. For example, while an AST supports arbitrarily nested character +/// classes, the HIR will flatten all nested classes into a single set. The HIR +/// will also "compile away" every flag present in the concrete syntax. For +/// example, users of HIR expressions never need to worry about case folding; +/// it is handled automatically by the translator (e.g., by translating `(?i)A` +/// to `[aA]`). +/// +/// If the HIR was produced by a translator that disallows invalid UTF-8, then +/// the HIR is guaranteed to match UTF-8 exclusively. +/// +/// This type defines its own destructor that uses constant stack space and +/// heap space proportional to the size of the HIR. 
+/// +/// The specific type of an HIR expression can be accessed via its `kind` +/// or `into_kind` methods. This extra level of indirection exists for two +/// reasons: +/// +/// 1. Construction of an HIR expression *must* use the constructor methods +/// on this `Hir` type instead of building the `HirKind` values directly. +/// This permits construction to enforce invariants like "concatenations +/// always consist of two or more sub-expressions." +/// 2. Every HIR expression contains attributes that are defined inductively, +/// and can be computed cheaply during the construction process. For +/// example, one such attribute is whether the expression must match at the +/// beginning of the text. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Hir { + /// The underlying HIR kind. + kind: HirKind, + /// Analysis info about this HIR, computed during construction. + info: HirInfo, +} + +/// The kind of an arbitrary `Hir` expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum HirKind { + /// The empty regular expression, which matches everything, including the + /// empty string. + Empty, + /// A single literal character that matches exactly this character. + Literal(Literal), + /// A single character class that matches any of the characters in the + /// class. A class can either consist of Unicode scalar values as + /// characters, or it can use bytes. + Class(Class), + /// An anchor assertion. An anchor assertion match always has zero length. + Anchor(Anchor), + /// A word boundary assertion, which may or may not be Unicode aware. A + /// word boundary assertion match always has zero length. + WordBoundary(WordBoundary), + /// A repetition operation applied to a child expression. + Repetition(Repetition), + /// A possibly capturing group, which contains a child expression. + Group(Group), + /// A concatenation of expressions. A concatenation always has at least two + /// child expressions. 
+ /// + /// A concatenation matches only if each of its child expressions matches + /// one after the other. + Concat(Vec), + /// An alternation of expressions. An alternation always has at least two + /// child expressions. + /// + /// An alternation matches only if at least one of its child expressions + /// matches. If multiple expressions match, then the leftmost is preferred. + Alternation(Vec), +} + +impl Hir { + /// Returns a reference to the underlying HIR kind. + pub fn kind(&self) -> &HirKind { + &self.kind + } + + /// Consumes ownership of this HIR expression and returns its underlying + /// `HirKind`. + pub fn into_kind(mut self) -> HirKind { + use std::mem; + mem::replace(&mut self.kind, HirKind::Empty) + } + + /// Returns an empty HIR expression. + /// + /// An empty HIR expression always matches, including the empty string. + pub fn empty() -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(true); + info.set_all_assertions(true); + info.set_anchored_start(false); + info.set_anchored_end(false); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(true); + Hir { + kind: HirKind::Empty, + info: info, + } + } + + /// Creates a literal HIR expression. + /// + /// If the given literal has a `Byte` variant with an ASCII byte, then this + /// method panics. This enforces the invariant that `Byte` variants are + /// only used to express matching of invalid UTF-8. + pub fn literal(lit: Literal) -> Hir { + if let Literal::Byte(b) = lit { + assert!(b > 0x7F); + } + + let mut info = HirInfo::new(); + info.set_always_utf8(lit.is_unicode()); + info.set_all_assertions(false); + info.set_anchored_start(false); + info.set_anchored_end(false); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(false); + Hir { + kind: HirKind::Literal(lit), + info: info, + } + } + + /// Creates a class HIR expression.
+ pub fn class(class: Class) -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(class.is_always_utf8()); + info.set_all_assertions(false); + info.set_anchored_start(false); + info.set_anchored_end(false); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(false); + Hir { + kind: HirKind::Class(class), + info: info, + } + } + + /// Creates an anchor assertion HIR expression. + pub fn anchor(anchor: Anchor) -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(true); + info.set_all_assertions(true); + info.set_anchored_start(false); + info.set_anchored_end(false); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(true); + if let Anchor::StartText = anchor { + info.set_anchored_start(true); + info.set_any_anchored_start(true); + } + if let Anchor::EndText = anchor { + info.set_anchored_end(true); + info.set_any_anchored_end(true); + } + Hir { + kind: HirKind::Anchor(anchor), + info: info, + } + } + + /// Creates a word boundary assertion HIR expression. + pub fn word_boundary(word_boundary: WordBoundary) -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(true); + info.set_all_assertions(true); + info.set_anchored_start(false); + info.set_anchored_end(false); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + // A negated word boundary matches the empty string, but a normal + // word boundary does not! + info.set_match_empty(word_boundary.is_negated()); + // ASCII word boundaries can match invalid UTF-8. + if let WordBoundary::Ascii = word_boundary { + info.set_always_utf8(false); + } + if let WordBoundary::AsciiNegate = word_boundary { + info.set_always_utf8(false); + } + Hir { + kind: HirKind::WordBoundary(word_boundary), + info: info, + } + } + + /// Creates a repetition HIR expression. 
+ pub fn repetition(rep: Repetition) -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(rep.hir.is_always_utf8()); + info.set_all_assertions(rep.hir.is_all_assertions()); + // If this operator can match the empty string, then it can never + // be anchored. + info.set_anchored_start( + !rep.is_match_empty() && rep.hir.is_anchored_start() + ); + info.set_anchored_end( + !rep.is_match_empty() && rep.hir.is_anchored_end() + ); + info.set_any_anchored_start(rep.hir.is_any_anchored_start()); + info.set_any_anchored_end(rep.hir.is_any_anchored_end()); + info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty()); + Hir { + kind: HirKind::Repetition(rep), + info: info, + } + } + + /// Creates a group HIR expression. + pub fn group(group: Group) -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(group.hir.is_always_utf8()); + info.set_all_assertions(group.hir.is_all_assertions()); + info.set_anchored_start(group.hir.is_anchored_start()); + info.set_anchored_end(group.hir.is_anchored_end()); + info.set_any_anchored_start(group.hir.is_any_anchored_start()); + info.set_any_anchored_end(group.hir.is_any_anchored_end()); + info.set_match_empty(group.hir.is_match_empty()); + Hir { + kind: HirKind::Group(group), + info: info, + } + } + + /// Returns the concatenation of the given expressions. + /// + /// This flattens the concatenation as appropriate. + pub fn concat(mut exprs: Vec) -> Hir { + match exprs.len() { + 0 => Hir::empty(), + 1 => exprs.pop().unwrap(), + _ => { + let mut info = HirInfo::new(); + info.set_always_utf8(true); + info.set_all_assertions(true); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(true); + + // Some attributes require analyzing all sub-expressions. 
+ for e in &exprs { + let x = info.is_always_utf8() && e.is_always_utf8(); + info.set_always_utf8(x); + + let x = info.is_all_assertions() && e.is_all_assertions(); + info.set_all_assertions(x); + + let x = + info.is_any_anchored_start() + || e.is_any_anchored_start(); + info.set_any_anchored_start(x); + + let x = + info.is_any_anchored_end() + || e.is_any_anchored_end(); + info.set_any_anchored_end(x); + + let x = info.is_match_empty() && e.is_match_empty(); + info.set_match_empty(x); + } + // Anchored attributes require something slightly more + // sophisticated. Normally, WLOG, to determine whether an + // expression is anchored to the start, we'd only need to check + // the first expression of a concatenation. However, + // expressions like `$\b^` are still anchored to the start, + // but the first expression in the concatenation *isn't* + // anchored to the start. So the "first" expression to look at + // is actually one that is either not an assertion or is + // specifically the StartText assertion. + info.set_anchored_start( + exprs.iter() + .take_while(|e| { + e.is_anchored_start() || e.is_all_assertions() + }) + .any(|e| { + e.is_anchored_start() + })); + // Similarly for the end anchor, but in reverse. + info.set_anchored_end( + exprs.iter() + .rev() + .take_while(|e| { + e.is_anchored_end() || e.is_all_assertions() + }) + .any(|e| { + e.is_anchored_end() + })); + Hir { + kind: HirKind::Concat(exprs), + info: info, + } + } + } + } + + /// Returns the alternation of the given expressions. + /// + /// This flattens the alternation as appropriate. 
+ pub fn alternation(mut exprs: Vec) -> Hir { + match exprs.len() { + 0 => Hir::empty(), + 1 => exprs.pop().unwrap(), + _ => { + let mut info = HirInfo::new(); + info.set_always_utf8(true); + info.set_all_assertions(true); + info.set_anchored_start(true); + info.set_anchored_end(true); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(false); + + // Some attributes require analyzing all sub-expressions. + for e in &exprs { + let x = info.is_always_utf8() && e.is_always_utf8(); + info.set_always_utf8(x); + + let x = info.is_all_assertions() && e.is_all_assertions(); + info.set_all_assertions(x); + + let x = info.is_anchored_start() && e.is_anchored_start(); + info.set_anchored_start(x); + + let x = info.is_anchored_end() && e.is_anchored_end(); + info.set_anchored_end(x); + + let x = + info.is_any_anchored_start() + || e.is_any_anchored_start(); + info.set_any_anchored_start(x); + + let x = + info.is_any_anchored_end() + || e.is_any_anchored_end(); + info.set_any_anchored_end(x); + + let x = info.is_match_empty() || e.is_match_empty(); + info.set_match_empty(x); + } + Hir { + kind: HirKind::Alternation(exprs), + info: info, + } + } + } + } + + /// Build an HIR expression for `.`. + /// + /// A `.` expression matches any character except for `\n`. To build an + /// expression that matches any character, including `\n`, use the `any` + /// method. + /// + /// If `bytes` is `true`, then this assumes characters are limited to a + /// single byte. + pub fn dot(bytes: bool) -> Hir { + if bytes { + let mut cls = ClassBytes::empty(); + cls.push(ClassRangeBytes::new(b'\0', b'\x09')); + cls.push(ClassRangeBytes::new(b'\x0B', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } else { + let mut cls = ClassUnicode::empty(); + cls.push(ClassRangeUnicode::new('\0', '\x09')); + cls.push(ClassRangeUnicode::new('\x0B', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + } + + /// Build an HIR expression for `(?s).`. 
+ /// + /// A `(?s).` expression matches any character, including `\n`. To build an + /// expression that matches any character except for `\n`, then use the + /// `dot` method. + /// + /// If `bytes` is `true`, then this assumes characters are limited to a + /// single byte. + pub fn any(bytes: bool) -> Hir { + if bytes { + let mut cls = ClassBytes::empty(); + cls.push(ClassRangeBytes::new(b'\0', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } else { + let mut cls = ClassUnicode::empty(); + cls.push(ClassRangeUnicode::new('\0', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + } + + /// Return true if and only if this HIR will always match valid UTF-8. + /// + /// When this returns false, then it is possible for this HIR expression + /// to match invalid UTF-8. + pub fn is_always_utf8(&self) -> bool { + self.info.is_always_utf8() + } + + /// Returns true if and only if this entire HIR expression is made up of + /// zero-width assertions. + /// + /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but + /// not `^a`. + pub fn is_all_assertions(&self) -> bool { + self.info.is_all_assertions() + } + + /// Return true if and only if this HIR is required to match from the + /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`, + /// `^foo|^bar` but not `^foo|bar`. + pub fn is_anchored_start(&self) -> bool { + self.info.is_anchored_start() + } + + /// Return true if and only if this HIR is required to match at the end + /// of text. This includes expressions like `foo$`, `(foo|bar)$`, + /// `foo$|bar$` but not `foo$|bar`. + pub fn is_anchored_end(&self) -> bool { + self.info.is_anchored_end() + } + + /// Return true if and only if this HIR contains any sub-expression that + /// is required to match at the beginning of text. Specifically, this + /// returns true if the `^` symbol (when multiline mode is disabled) or the + /// `\A` escape appear anywhere in the regex. 
+ pub fn is_any_anchored_start(&self) -> bool { + self.info.is_any_anchored_start() + } + + /// Return true if and only if this HIR contains any sub-expression that is + /// required to match at the end of text. Specifically, this returns true + /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape + /// appear anywhere in the regex. + pub fn is_any_anchored_end(&self) -> bool { + self.info.is_any_anchored_end() + } + + /// Return true if and only if the empty string is part of the language + /// matched by this regular expression. + /// + /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\B`, + /// but not `a`, `a+` or `\b`. + pub fn is_match_empty(&self) -> bool { + self.info.is_match_empty() + } +} + +impl HirKind { + /// Return true if and only if this HIR is the empty regular expression. + /// + /// Note that this is not defined inductively. That is, it only tests if + /// this kind is the `Empty` variant. To get the inductive definition, + /// use the `is_match_empty` method on [`Hir`](struct.Hir.html). + pub fn is_empty(&self) -> bool { + match *self { + HirKind::Empty => true, + _ => false, + } + } + + /// Returns true if and only if this kind has any (including possibly + /// empty) subexpressions. + pub fn has_subexprs(&self) -> bool { + match *self { + HirKind::Empty + | HirKind::Literal(_) + | HirKind::Class(_) + | HirKind::Anchor(_) + | HirKind::WordBoundary(_) => false, + HirKind::Group(_) + | HirKind::Repetition(_) + | HirKind::Concat(_) + | HirKind::Alternation(_) => true, + } + } +} + +/// The high-level intermediate representation of a literal. +/// +/// A literal corresponds to a single character, where a character is either +/// defined by a Unicode scalar value or an arbitrary byte. Unicode characters +/// are preferred whenever possible. In particular, a `Byte` variant is only +/// ever produced when it could match invalid UTF-8. 
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Literal {
    /// A single character represented by a Unicode scalar value.
    Unicode(char),
    /// A single character represented by an arbitrary byte.
    Byte(u8),
}

impl Literal {
    /// Reports whether this literal corresponds to a Unicode scalar value.
    ///
    /// Every `Unicode` literal does; a `Byte` literal does only when the
    /// byte is ASCII (`0x7F` or below).
    pub fn is_unicode(&self) -> bool {
        match *self {
            Literal::Unicode(_) => true,
            Literal::Byte(b) => b <= 0x7F,
        }
    }
}

/// The high-level intermediate representation of a character class.
///
/// A character class is a set of characters, where a character is either a
/// Unicode scalar value or a byte. Unicode characters are the default; bytes
/// are used when Unicode mode (via the `u` flag) is disabled.
///
/// Regardless of the character type, a class is represented by a sequence of
/// non-overlapping, non-adjacent ranges of characters.
///
/// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may
/// be produced even when it exclusively matches valid UTF-8. This is because
/// a `Bytes` variant represents an intention by the author of the regular
/// expression to disable Unicode mode, which in turn impacts the semantics of
/// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not
/// match the same set of strings.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Class {
    /// A set of characters represented by Unicode scalar values.
    Unicode(ClassUnicode),
    /// A set of characters represented by arbitrary bytes (one byte per
    /// character).
    Bytes(ClassBytes),
}

impl Class {
    /// Apply Unicode simple case folding to this character class, in place.
    /// The character class will be expanded to include all simple case folded
    /// character variants.
    ///
    /// If this is a byte oriented character class, then this will be limited
    /// to the ASCII ranges `A-Z` and `a-z`.
+ pub fn case_fold_simple(&mut self) { + match *self { + Class::Unicode(ref mut x) => x.case_fold_simple(), + Class::Bytes(ref mut x) => x.case_fold_simple(), + } + } + + /// Negate this character class in place. + /// + /// After completion, this character class will contain precisely the + /// characters that weren't previously in the class. + pub fn negate(&mut self) { + match *self { + Class::Unicode(ref mut x) => x.negate(), + Class::Bytes(ref mut x) => x.negate(), + } + } + + /// Returns true if and only if this character class will only ever match + /// valid UTF-8. + /// + /// A character class can match invalid UTF-8 only when the following + /// conditions are met: + /// + /// 1. The translator was configured to permit generating an expression + /// that can match invalid UTF-8. (By default, this is disabled.) + /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete + /// syntax or in the parser builder. By default, Unicode mode is + /// enabled. + pub fn is_always_utf8(&self) -> bool { + match *self { + Class::Unicode(_) => true, + Class::Bytes(ref x) => x.is_all_ascii(), + } + } +} + +/// A set of characters represented by Unicode scalar values. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassUnicode { + set: IntervalSet, +} + +impl ClassUnicode { + /// Create a new class from a sequence of ranges. + /// + /// The given ranges do not need to be in any specific order, and ranges + /// may overlap. + pub fn new(ranges: I) -> ClassUnicode + where I: IntoIterator + { + ClassUnicode { set: IntervalSet::new(ranges) } + } + + /// Create a new class with no ranges. + pub fn empty() -> ClassUnicode { + ClassUnicode::new(vec![]) + } + + /// Add a new range to this set. + pub fn push(&mut self, range: ClassRangeUnicode) { + self.set.push(range); + } + + /// Return an iterator over all ranges in this class. + /// + /// The iterator yields ranges in ascending order. 
+ pub fn iter(&self) -> ClassUnicodeIter { + ClassUnicodeIter(self.set.iter()) + } + + /// Return the underlying ranges as a slice. + pub fn ranges(&self) -> &[ClassRangeUnicode] { + self.set.intervals() + } + + /// Expand this character class such that it contains all case folded + /// characters, according to Unicode's "simple" mapping. For example, if + /// this class consists of the range `a-z`, then applying case folding will + /// result in the class containing both the ranges `a-z` and `A-Z`. + pub fn case_fold_simple(&mut self) { + self.set.case_fold_simple(); + } + + /// Negate this character class. + /// + /// For all `c` where `c` is a Unicode scalar value, if `c` was in this + /// set, then it will not be in this set after negation. + pub fn negate(&mut self) { + self.set.negate(); + } + + /// Union this character class with the given character class, in place. + pub fn union(&mut self, other: &ClassUnicode) { + self.set.union(&other.set); + } + + /// Intersect this character class with the given character class, in + /// place. + pub fn intersect(&mut self, other: &ClassUnicode) { + self.set.intersect(&other.set); + } + + /// Subtract the given character class from this character class, in place. + pub fn difference(&mut self, other: &ClassUnicode) { + self.set.difference(&other.set); + } + + /// Compute the symmetric difference of the given character classes, in + /// place. + /// + /// This computes the symmetric difference of two character classes. This + /// removes all elements in this class that are also in the given class, + /// but all adds all elements from the given class that aren't in this + /// class. That is, the class will contain all elements in either class, + /// but will not contain any elements that are in both classes. + pub fn symmetric_difference(&mut self, other: &ClassUnicode) { + self.set.symmetric_difference(&other.set); + } +} + +/// An iterator over all ranges in a Unicode character class. 
+/// +/// The lifetime `'a` refers to the lifetime of the underlying class. +#[derive(Debug)] +pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassRangeUnicode>); + +impl<'a> Iterator for ClassUnicodeIter<'a> { + type Item = &'a ClassRangeUnicode; + + fn next(&mut self) -> Option<&'a ClassRangeUnicode> { + self.0.next() + } +} + +/// A single range of characters represented by Unicode scalar values. +/// +/// The range is closed. That is, the start and end of the range are included +/// in the range. +#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] +pub struct ClassRangeUnicode { + start: char, + end: char, +} + +impl fmt::Debug for ClassRangeUnicode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let start = + if !self.start.is_whitespace() && !self.start.is_control() { + self.start.to_string() + } else { + format!("0x{:X}", self.start as u32) + }; + let end = + if !self.end.is_whitespace() && !self.end.is_control() { + self.end.to_string() + } else { + format!("0x{:X}", self.end as u32) + }; + f.debug_struct("ClassRangeUnicode") + .field("start", &start) + .field("end", &end) + .finish() + } +} + +impl Interval for ClassRangeUnicode { + type Bound = char; + + #[inline] fn lower(&self) -> char { self.start } + #[inline] fn upper(&self) -> char { self.end } + #[inline] fn set_lower(&mut self, bound: char) { self.start = bound; } + #[inline] fn set_upper(&mut self, bound: char) { self.end = bound; } + + /// Apply simple case folding to this Unicode scalar value range. + /// + /// Additional ranges are appended to the given vector. Canonical ordering + /// is *not* maintained in the given vector. 
fn case_fold_simple(&self, ranges: &mut Vec<ClassRangeUnicode>) {
    // Cheap rejection: nothing in this range has a simple case mapping.
    if !unicode::contains_simple_case_mapping(self.start, self.end) {
        return;
    }

    let first = self.start as u32;
    // Exclusive upper bound of the code point scan.
    let past_last = (self.end as u32).saturating_add(1);
    // When a lookup fails, `simple_fold` reports through its `Err` value
    // where to resume; code points before that point are skipped without
    // another lookup.
    let mut resume_at = None;
    // `char::from_u32` filters out values that are not scalar values.
    for cp in (first..past_last).filter_map(char::from_u32) {
        if resume_at.map_or(false, |next| cp < next) {
            continue;
        }
        match unicode::simple_fold(cp) {
            Err(next) => {
                resume_at = next;
            }
            Ok(folded) => {
                // Each folded character is recorded as a one-element range.
                for c in folded {
                    ranges.push(ClassRangeUnicode::new(c, c));
                }
            }
        }
    }
}
}

impl ClassRangeUnicode {
    /// Create a new Unicode scalar value range for a character class.
    ///
    /// The returned range is always in a canonical form. That is, the range
    /// returned always satisfies the invariant that `start <= end`.
    pub fn new(start: char, end: char) -> ClassRangeUnicode {
        ClassRangeUnicode::create(start, end)
    }

    /// Return the start of this range.
    ///
    /// The start of a range is always less than or equal to the end of the
    /// range.
    pub fn start(&self) -> char {
        self.start
    }

    /// Return the end of this range.
    ///
    /// The end of a range is always greater than or equal to the start of the
    /// range.
    pub fn end(&self) -> char {
        self.end
    }
}

/// A set of characters represented by arbitrary bytes (where one byte
/// corresponds to one character).
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ClassBytes {
    /// The ranges making up this class.
    set: IntervalSet<ClassRangeBytes>,
}

impl ClassBytes {
    /// Create a new class from a sequence of ranges.
    ///
    /// The given ranges do not need to be in any specific order, and ranges
    /// may overlap.
    pub fn new<I>(ranges: I) -> ClassBytes
    where
        I: IntoIterator<Item = ClassRangeBytes>,
    {
        ClassBytes { set: IntervalSet::new(ranges) }
    }

    /// Create a new class with no ranges.
    pub fn empty() -> ClassBytes {
        ClassBytes::new(Vec::new())
    }

    /// Add a new range to this set.
+ pub fn push(&mut self, range: ClassRangeBytes) { + self.set.push(range); + } + + /// Return an iterator over all ranges in this class. + /// + /// The iterator yields ranges in ascending order. + pub fn iter(&self) -> ClassBytesIter { + ClassBytesIter(self.set.iter()) + } + + /// Return the underlying ranges as a slice. + pub fn ranges(&self) -> &[ClassRangeBytes] { + self.set.intervals() + } + + /// Expand this character class such that it contains all case folded + /// characters. For example, if this class consists of the range `a-z`, + /// then applying case folding will result in the class containing both the + /// ranges `a-z` and `A-Z`. + /// + /// Note that this only applies ASCII case folding, which is limited to the + /// characters `a-z` and `A-Z`. + pub fn case_fold_simple(&mut self) { + self.set.case_fold_simple(); + } + + /// Negate this byte class. + /// + /// For all `b` where `b` is a any byte, if `b` was in this set, then it + /// will not be in this set after negation. + pub fn negate(&mut self) { + self.set.negate(); + } + + /// Union this byte class with the given byte class, in place. + pub fn union(&mut self, other: &ClassBytes) { + self.set.union(&other.set); + } + + /// Intersect this byte class with the given byte class, in place. + pub fn intersect(&mut self, other: &ClassBytes) { + self.set.intersect(&other.set); + } + + /// Subtract the given byte class from this byte class, in place. + pub fn difference(&mut self, other: &ClassBytes) { + self.set.difference(&other.set); + } + + /// Compute the symmetric difference of the given byte classes, in place. + /// + /// This computes the symmetric difference of two byte classes. This + /// removes all elements in this class that are also in the given class, + /// but all adds all elements from the given class that aren't in this + /// class. That is, the class will contain all elements in either class, + /// but will not contain any elements that are in both classes. 
+ pub fn symmetric_difference(&mut self, other: &ClassBytes) { + self.set.symmetric_difference(&other.set); + } + + /// Returns true if and only if this character class will either match + /// nothing or only ASCII bytes. Stated differently, this returns false + /// if and only if this class contains a non-ASCII byte. + pub fn is_all_ascii(&self) -> bool { + self.set.intervals().last().map_or(true, |r| r.end <= 0x7F) + } +} + +/// An iterator over all ranges in a byte character class. +/// +/// The lifetime `'a` refers to the lifetime of the underlying class. +#[derive(Debug)] +pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassRangeBytes>); + +impl<'a> Iterator for ClassBytesIter<'a> { + type Item = &'a ClassRangeBytes; + + fn next(&mut self) -> Option<&'a ClassRangeBytes> { + self.0.next() + } +} + +/// A single range of characters represented by arbitrary bytes. +/// +/// The range is closed. That is, the start and end of the range are included +/// in the range. +#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] +pub struct ClassRangeBytes { + start: u8, + end: u8, +} + +impl Interval for ClassRangeBytes { + type Bound = u8; + + #[inline] fn lower(&self) -> u8 { self.start } + #[inline] fn upper(&self) -> u8 { self.end } + #[inline] fn set_lower(&mut self, bound: u8) { self.start = bound; } + #[inline] fn set_upper(&mut self, bound: u8) { self.end = bound; } + + /// Apply simple case folding to this byte range. Only ASCII case mappings + /// (for a-z) are applied. + /// + /// Additional ranges are appended to the given vector. Canonical ordering + /// is *not* maintained in the given vector. 
fn case_fold_simple(&self, ranges: &mut Vec<ClassRangeBytes>) {
    let lowercase = ClassRangeBytes::new(b'a', b'z');
    let uppercase = ClassRangeBytes::new(b'A', b'Z');

    // The part of this range overlapping `a-z` is mirrored into `A-Z`
    // (ASCII upper and lower case letters are exactly 32 apart).
    if !lowercase.is_intersection_empty(self) {
        let lower = cmp::max(self.start, b'a');
        let upper = cmp::min(self.end, b'z');
        ranges.push(ClassRangeBytes::new(lower - 32, upper - 32));
    }
    // Likewise, the part overlapping `A-Z` is mirrored into `a-z`.
    if !uppercase.is_intersection_empty(self) {
        let lower = cmp::max(self.start, b'A');
        let upper = cmp::min(self.end, b'Z');
        ranges.push(ClassRangeBytes::new(lower + 32, upper + 32));
    }
}
}

impl ClassRangeBytes {
    /// Create a new byte range for a character class.
    ///
    /// The returned range is always in a canonical form. That is, the range
    /// returned always satisfies the invariant that `start <= end`.
    pub fn new(start: u8, end: u8) -> ClassRangeBytes {
        ClassRangeBytes::create(start, end)
    }

    /// Return the start of this range.
    ///
    /// The start of a range is always less than or equal to the end of the
    /// range.
    pub fn start(&self) -> u8 {
        self.start
    }

    /// Return the end of this range.
    ///
    /// The end of a range is always greater than or equal to the start of the
    /// range.
    pub fn end(&self) -> u8 {
        self.end
    }
}

impl fmt::Debug for ClassRangeBytes {
    /// Formats the range, showing ASCII bounds as characters and all other
    /// bounds as plain byte values.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let mut out = f.debug_struct("ClassRangeBytes");
        if self.start > 0x7F {
            out.field("start", &self.start);
        } else {
            out.field("start", &(self.start as char));
        }
        if self.end > 0x7F {
            out.field("end", &self.end);
        } else {
            out.field("end", &(self.end as char));
        }
        out.finish()
    }
}

/// The high-level intermediate representation for an anchor assertion.
///
/// A matching anchor assertion is always zero-length.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Anchor {
    /// Match the beginning of a line or the beginning of text. Specifically,
    /// this matches at the starting position of the input, or at the position
    /// immediately following a `\n` character.
+ StartLine, + /// Match the end of a line or the end of text. Specifically, + /// this matches at the end position of the input, or at the position + /// immediately preceding a `\n` character. + EndLine, + /// Match the beginning of text. Specifically, this matches at the starting + /// position of the input. + StartText, + /// Match the end of text. Specifically, this matches at the ending + /// position of the input. + EndText, +} + +/// The high-level intermediate representation for a word-boundary assertion. +/// +/// A matching word boundary assertion is always zero-length. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum WordBoundary { + /// Match a Unicode-aware word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + Unicode, + /// Match a Unicode-aware negation of a word boundary. + UnicodeNegate, + /// Match an ASCII-only word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + Ascii, + /// Match an ASCII-only negation of a word boundary. + AsciiNegate, +} + +impl WordBoundary { + /// Returns true if and only if this word boundary assertion is negated. + pub fn is_negated(&self) -> bool { + match *self { + WordBoundary::Unicode | WordBoundary::Ascii => false, + WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true, + } + } +} + +/// The high-level intermediate representation for a group. +/// +/// This represents one of three possible group types: +/// +/// 1. A non-capturing group (e.g., `(?:expr)`). +/// 2. A capturing group (e.g., `(expr)`). +/// 3. A named capturing group (e.g., `(?Pexpr)`). +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Group { + /// The kind of this group. 
If it is a capturing group, then the kind + /// contains the capture group index (and the name, if it is a named + /// group). + pub kind: GroupKind, + /// The expression inside the capturing group, which may be empty. + pub hir: Box, +} + +/// The kind of group. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum GroupKind { + /// A normal unnamed capturing group. + /// + /// The value is the capture index of the group. + CaptureIndex(u32), + /// A named capturing group. + CaptureName { + /// The name of the group. + name: String, + /// The capture index of the group. + index: u32, + }, + /// A non-capturing group. + NonCapturing, +} + +/// The high-level intermediate representation of a repetition operator. +/// +/// A repetition operator permits the repetition of an arbitrary +/// sub-expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Repetition { + /// The kind of this repetition operator. + pub kind: RepetitionKind, + /// Whether this repetition operator is greedy or not. A greedy operator + /// will match as much as it can. A non-greedy operator will match as + /// little as it can. + /// + /// Typically, operators are greedy by default and are only non-greedy when + /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is + /// not. However, this can be inverted via the `U` "ungreedy" flag. + pub greedy: bool, + /// The expression being repeated. + pub hir: Box, +} + +impl Repetition { + /// Returns true if and only if this repetition operator makes it possible + /// to match the empty string. + /// + /// Note that this is not defined inductively. For example, while `a*` + /// will report `true`, `()+` will not, even though `()` matches the empty + /// string and one or more occurrences of something that matches the empty + /// string will always match the empty string. In order to get the + /// inductive definition, see the corresponding method on + /// [`Hir`](struct.Hir.html). 
pub fn is_match_empty(&self) -> bool {
    match self.kind {
        // `?` and `*` both allow zero iterations; `+` never does.
        RepetitionKind::ZeroOrOne | RepetitionKind::ZeroOrMore => true,
        RepetitionKind::OneOrMore => false,
        // A counted repetition allows zero iterations exactly when its
        // minimum count is zero.
        RepetitionKind::Range(RepetitionRange::Exactly(m))
        | RepetitionKind::Range(RepetitionRange::AtLeast(m))
        | RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0,
    }
}
}

/// The kind of a repetition operator.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum RepetitionKind {
    /// Matches a sub-expression zero or one times.
    ZeroOrOne,
    /// Matches a sub-expression zero or more times.
    ZeroOrMore,
    /// Matches a sub-expression one or more times.
    OneOrMore,
    /// Matches a sub-expression within a bounded range of times.
    Range(RepetitionRange),
}

/// The kind of a counted repetition operator.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum RepetitionRange {
    /// Matches a sub-expression exactly this many times.
    Exactly(u32),
    /// Matches a sub-expression at least this many times.
    AtLeast(u32),
    /// Matches a sub-expression at least `m` times and at most `n` times.
    Bounded(u32, u32),
}

/// A custom `Drop` impl is used for `Hir` such that it uses constant stack
/// space but heap space proportional to the depth of the total `Hir`.
impl Drop for Hir {
    fn drop(&mut self) {
        use std::mem;

        // Fast path: leaves, and compound expressions whose children hold no
        // sub-expressions of their own, are left to the default field-wise
        // drop. This exit is also what ends the work below: every expression
        // popped off the stack is dropped (re-entering this function) only
        // after its children have been detached.
        let shallow = match *self.kind() {
            HirKind::Empty
            | HirKind::Literal(_)
            | HirKind::Class(_)
            | HirKind::Anchor(_)
            | HirKind::WordBoundary(_) => true,
            HirKind::Group(ref x) => !x.hir.kind.has_subexprs(),
            HirKind::Repetition(ref x) => !x.hir.kind.has_subexprs(),
            HirKind::Concat(ref x) => x.is_empty(),
            HirKind::Alternation(ref x) => x.is_empty(),
        };
        if shallow {
            return;
        }

        // Tear the tree down iteratively: move each child onto an explicit
        // stack (leaving an empty placeholder behind) instead of letting the
        // compiler-generated drop recurse through the whole tree.
        let mut stack = vec![mem::replace(self, Hir::empty())];
        while let Some(mut expr) = stack.pop() {
            match expr.kind {
                HirKind::Empty
                | HirKind::Literal(_)
                | HirKind::Class(_)
                | HirKind::Anchor(_)
                | HirKind::WordBoundary(_) => {}
                HirKind::Group(ref mut x) => {
                    stack.push(mem::replace(&mut x.hir, Hir::empty()));
                }
                HirKind::Repetition(ref mut x) => {
                    stack.push(mem::replace(&mut x.hir, Hir::empty()));
                }
                HirKind::Concat(ref mut x) | HirKind::Alternation(ref mut x) => {
                    stack.extend(x.drain(..));
                }
            }
        }
    }
}

/// A type that documents various attributes of an HIR expression.
///
/// These attributes are typically defined inductively on the HIR.
#[derive(Clone, Debug, Eq, PartialEq)]
struct HirInfo {
    /// Represent yes/no questions by a bitfield to conserve space, since
    /// this is included in every HIR expression.
    ///
    /// If more attributes need to be added, it is OK to increase the size of
    /// this as appropriate.
    bools: u8,
}

// Defines a getter/setter pair for the flag stored at bit position `$bit` of
// `HirInfo::bools`.
macro_rules! define_bool {
    ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => {
        fn $is_fn_name(&self) -> bool {
            self.bools & (1 << $bit) != 0
        }

        fn $set_fn_name(&mut self, yes: bool) {
            let mask = 1 << $bit;
            if yes {
                self.bools |= mask;
            } else {
                self.bools &= !mask;
            }
        }
    }
}

impl HirInfo {
    /// Creates a new set of attributes with every flag cleared.
    fn new() -> HirInfo {
        HirInfo { bools: 0 }
    }

    define_bool!(0, is_always_utf8, set_always_utf8);
    define_bool!(1, is_all_assertions, set_all_assertions);
    define_bool!(2, is_anchored_start, set_anchored_start);
    define_bool!(3, is_anchored_end, set_anchored_end);
    define_bool!(4, is_any_anchored_start, set_any_anchored_start);
    define_bool!(5, is_any_anchored_end, set_any_anchored_end);
    define_bool!(6, is_match_empty, set_match_empty);
}

#[cfg(test)]
mod tests {
    use super::*;

    // Builds a Unicode class from `(start, end)` pairs.
    fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
        let ranges: Vec<ClassRangeUnicode> = ranges
            .iter()
            .map(|&(s, e)| ClassRangeUnicode::new(s, e))
            .collect();
        ClassUnicode::new(ranges)
    }

    // Builds a byte class from `(start, end)` pairs.
    fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
        let ranges: Vec<ClassRangeBytes> = ranges
            .iter()
            .map(|&(s, e)| ClassRangeBytes::new(s, e))
            .collect();
        ClassBytes::new(ranges)
    }

    // Flattens a Unicode class back into `(start, end)` pairs.
    fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
        cls.iter().map(|x| (x.start(), x.end())).collect()
    }

    // The helpers below apply one in-place operation to a copy of their
    // first argument and return that copy.

    fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
        let mut out = cls.clone();
        out.case_fold_simple();
        out
    }

    fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
        let mut out = cls1.clone();
        out.union(cls2);
        out
    }

    fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
        let mut out = cls1.clone();
        out.intersect(cls2);
        out
    }

    fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
        let mut out = cls1.clone();
        out.difference(cls2);
        out
    }

    fn usymdifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
        let mut out = cls1.clone();
        out.symmetric_difference(cls2);
        out
    }

fn unegate(cls: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls.clone(); + cls_.negate(); + cls_ + } + + fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> { + cls.iter().map(|x| (x.start(), x.end())).collect() + } + + fn bcasefold(cls: &ClassBytes) -> ClassBytes { + let mut cls_ = cls.clone(); + cls_.case_fold_simple(); + cls_ + } + + fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.union(cls2); + cls_ + } + + fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.intersect(cls2); + cls_ + } + + fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.difference(cls2); + cls_ + } + + fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.symmetric_difference(cls2); + cls_ + } + + fn bnegate(cls: &ClassBytes) -> ClassBytes { + let mut cls_ = cls.clone(); + cls_.negate(); + cls_ + } + + #[test] + fn class_range_canonical_unicode() { + let range = ClassRangeUnicode::new('\u{00FF}', '\0'); + assert_eq!('\0', range.start()); + assert_eq!('\u{00FF}', range.end()); + } + + #[test] + fn class_range_canonical_bytes() { + let range = ClassRangeBytes::new(b'\xFF', b'\0'); + assert_eq!(b'\0', range.start()); + assert_eq!(b'\xFF', range.end()); + } + + #[test] + fn class_canonicalize_unicode() { + let cls = uclass(&[('a', 'c'), ('x', 'z')]); + let expected = vec![('a', 'c'), ('x', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('x', 'z'), ('a', 'c')]); + let expected = vec![('a', 'c'), ('x', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('x', 'z'), ('w', 'y')]); + let expected = vec![('w', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[ + ('c', 'f'), ('a', 'g'), ('d', 'j'), ('a', 'c'), + ('m', 'p'), ('l', 's'), + ]); + let expected = vec![('a', 'j'), ('l', 's')]; + assert_eq!(expected, uranges(&cls)); + + 
let cls = uclass(&[('x', 'z'), ('u', 'w')]); + let expected = vec![('u', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]); + let expected = vec![('\x00', '\u{10FFFF}')]; + assert_eq!(expected, uranges(&cls)); + + + let cls = uclass(&[('a', 'a'), ('b', 'b')]); + let expected = vec![('a', 'b')]; + assert_eq!(expected, uranges(&cls)); + } + + #[test] + fn class_canonicalize_bytes() { + let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]); + let expected = vec![(b'a', b'c'), (b'x', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]); + let expected = vec![(b'a', b'c'), (b'x', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]); + let expected = vec![(b'w', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[ + (b'c', b'f'), (b'a', b'g'), (b'd', b'j'), (b'a', b'c'), + (b'm', b'p'), (b'l', b's'), + ]); + let expected = vec![(b'a', b'j'), (b'l', b's')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]); + let expected = vec![(b'u', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]); + let expected = vec![(b'\x00', b'\xFF')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]); + let expected = vec![(b'a', b'b')]; + assert_eq!(expected, branges(&cls)); + } + + #[test] + fn class_case_fold_unicode() { + let cls = uclass(&[ + ('C', 'F'), ('A', 'G'), ('D', 'J'), ('A', 'C'), + ('M', 'P'), ('L', 'S'), ('c', 'f'), + ]); + let expected = uclass(&[ + ('A', 'J'), ('L', 'S'), + ('a', 'j'), ('l', 's'), + ('\u{17F}', '\u{17F}'), + ]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('A', 'Z')]); + let expected = uclass(&[ + ('A', 'Z'), ('a', 'z'), + ('\u{17F}', '\u{17F}'), + ('\u{212A}', '\u{212A}'), + ]); + assert_eq!(expected, ucasefold(&cls)); + + let cls 
= uclass(&[('a', 'z')]); + let expected = uclass(&[ + ('A', 'Z'), ('a', 'z'), + ('\u{17F}', '\u{17F}'), + ('\u{212A}', '\u{212A}'), + ]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('A', 'A'), ('_', '_')]); + let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('A', 'A'), ('=', '=')]); + let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('\x00', '\x10')]); + assert_eq!(cls, ucasefold(&cls)); + + let cls = uclass(&[('k', 'k')]); + let expected = uclass(&[ + ('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'), + ]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('@', '@')]); + assert_eq!(cls, ucasefold(&cls)); + } + + #[test] + fn class_case_fold_bytes() { + let cls = bclass(&[ + (b'C', b'F'), (b'A', b'G'), (b'D', b'J'), (b'A', b'C'), + (b'M', b'P'), (b'L', b'S'), (b'c', b'f'), + ]); + let expected = bclass(&[ + (b'A', b'J'), (b'L', b'S'), + (b'a', b'j'), (b'l', b's'), + ]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'A', b'Z')]); + let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'a', b'z')]); + let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]); + let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]); + let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'\x00', b'\x10')]); + assert_eq!(cls, bcasefold(&cls)); + + let cls = bclass(&[(b'k', b'k')]); + let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'@', b'@')]); + assert_eq!(cls, 
bcasefold(&cls)); + } + + #[test] + fn class_negate_unicode() { + let cls = uclass(&[('a', 'a')]); + let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('a', 'a'), ('b', 'b')]); + let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('a', 'c'), ('x', 'z')]); + let expected = uclass(&[ + ('\x00', '\x60'), ('\x64', '\x77'), ('\x7B', '\u{10FFFF}'), + ]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', 'a')]); + let expected = uclass(&[('\x62', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('a', '\u{10FFFF}')]); + let expected = uclass(&[('\x00', '\x60')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', '\u{10FFFF}')]); + let expected = uclass(&[]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[]); + let expected = uclass(&[('\x00', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[ + ('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}'), + ]); + let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', '\u{D7FF}')]); + let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', '\u{D7FE}')]); + let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]); + let expected = uclass(&[('\x00', '\u{D7FF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]); + let expected = uclass(&[('\x00', '\u{E000}')]); + assert_eq!(expected, unegate(&cls)); + } + + #[test] + fn class_negate_bytes() { + let cls = bclass(&[(b'a', b'a')]); + let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = 
bclass(&[(b'a', b'a'), (b'b', b'b')]); + let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]); + let expected = bclass(&[ + (b'\x00', b'\x60'), (b'\x64', b'\x77'), (b'\x7B', b'\xFF'), + ]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'\x00', b'a')]); + let expected = bclass(&[(b'\x62', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'a', b'\xFF')]); + let expected = bclass(&[(b'\x00', b'\x60')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'\x00', b'\xFF')]); + let expected = bclass(&[]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[]); + let expected = bclass(&[(b'\x00', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]); + let expected = bclass(&[(b'\xFE', b'\xFE')]); + assert_eq!(expected, bnegate(&cls)); + } + + #[test] + fn class_union_unicode() { + let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]); + let cls2 = uclass(&[('a', 'z')]); + let expected = uclass(&[('a', 'z'), ('A', 'C')]); + assert_eq!(expected, uunion(&cls1, &cls2)); + } + + #[test] + fn class_union_bytes() { + let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]); + let cls2 = bclass(&[(b'a', b'z')]); + let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]); + assert_eq!(expected, bunion(&cls1, &cls2)); + } + + #[test] + fn class_intersect_unicode() { + let cls1 = uclass(&[]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[('a', 'a')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('b', 'b')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 
'a')]); + let cls2 = uclass(&[('a', 'c')]); + let expected = uclass(&[('a', 'a')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b')]); + let cls2 = uclass(&[('a', 'c')]); + let expected = uclass(&[('a', 'b')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b')]); + let cls2 = uclass(&[('b', 'c')]); + let expected = uclass(&[('b', 'b')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b')]); + let cls2 = uclass(&[('c', 'd')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('b', 'c')]); + let cls2 = uclass(&[('a', 'd')]); + let expected = uclass(&[('b', 'c')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = uclass(&[('a', 'h')]); + let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('g', 'h')]); + let cls2 = uclass(&[('d', 'e'), ('k', 'l')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = uclass(&[('h', 'h')]); + let expected = uclass(&[('h', 'h')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]); + let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]); + let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]); + let expected = uclass(&[('b', 'f')]); + assert_eq!(expected, uintersect(&cls1, 
&cls2)); + } + + #[test] + fn class_intersect_bytes() { + let cls1 = bclass(&[]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[(b'a', b'a')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'b', b'b')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'a', b'c')]); + let expected = bclass(&[(b'a', b'a')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b')]); + let cls2 = bclass(&[(b'a', b'c')]); + let expected = bclass(&[(b'a', b'b')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b')]); + let cls2 = bclass(&[(b'b', b'c')]); + let expected = bclass(&[(b'b', b'b')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b')]); + let cls2 = bclass(&[(b'c', b'd')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'b', b'c')]); + let cls2 = bclass(&[(b'a', b'd')]); + let expected = bclass(&[(b'b', b'c')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let cls2 = bclass(&[(b'a', b'h')]); + let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]); + let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]); + let expected = bclass(&[]); + 
assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let cls2 = bclass(&[(b'h', b'h')]); + let expected = bclass(&[(b'h', b'h')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]); + let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]); + let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]); + let expected = bclass(&[(b'b', b'f')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + } + + #[test] + fn class_difference_unicode() { + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[]); + let expected = uclass(&[('a', 'a')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[('b', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('z', 'z')]); + let expected = uclass(&[('a', 'y')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('m', 'm')]); + let expected = uclass(&[('a', 'l'), ('n', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('a', 'z')]); + let expected = uclass(&[]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('d', 'v')]); + let expected = 
uclass(&[('a', 'c')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('b', 'g'), ('s', 'u')]); + let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]); + let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('x', 'z')]); + let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]); + let expected = uclass(&[('x', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]); + let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + } + + #[test] + fn class_difference_bytes() { + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[]); + let expected = bclass(&[(b'a', b'a')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[(b'b', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'z', b'z')]); + let expected = bclass(&[(b'a', b'y')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'm', b'm')]); + let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = 
bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'a', b'z')]); + let expected = bclass(&[]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'd', b'v')]); + let expected = bclass(&[(b'a', b'c')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'x', b'z')]); + let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'x', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + } + + #[test] + fn class_symmetric_difference_unicode() { + let cls1 = uclass(&[('a', 'm')]); + let cls2 = uclass(&[('g', 't')]); + let expected = uclass(&[('a', 'f'), ('n', 't')]); + assert_eq!(expected, usymdifference(&cls1, &cls2)); + } + + #[test] + fn class_symmetric_difference_bytes() { + let cls1 = bclass(&[(b'a', b'm')]); + let cls2 = bclass(&[(b'g', b't')]); + let expected = bclass(&[(b'a', b'f'), (b'n', b't')]); + assert_eq!(expected, bsymdifference(&cls1, &cls2)); + } + + #[test] + #[should_panic] + fn hir_byte_literal_non_ascii() { + Hir::literal(Literal::Byte(b'a')); + } + + // We use a thread with an explicit stack size to test that our destructor + // for Hir can handle 
arbitrarily sized expressions in constant stack + // space. In case we run on a platform without threads (WASM?), we limit + // this test to Windows/Unix. + #[test] + #[cfg(any(unix, windows))] + fn no_stack_overflow_on_drop() { + use std::thread; + + let run = || { + let mut expr = Hir::empty(); + for _ in 0..100 { + expr = Hir::group(Group { + kind: GroupKind::NonCapturing, + hir: Box::new(expr), + }); + expr = Hir::repetition(Repetition { + kind: RepetitionKind::ZeroOrOne, + greedy: true, + hir: Box::new(expr), + }); + + expr = Hir { + kind: HirKind::Concat(vec![expr]), + info: HirInfo::new(), + }; + expr = Hir { + kind: HirKind::Alternation(vec![expr]), + info: HirInfo::new(), + }; + } + assert!(!expr.kind.is_empty()); + }; + + // We run our test on a thread with a small stack size so we can + // force the issue more easily. + thread::Builder::new() + .stack_size(1<<10) + .spawn(run) + .unwrap() + .join() + .unwrap(); + } +} diff --git a/regex-syntax-2/src/hir/translate.rs b/regex-syntax-2/src/hir/translate.rs new file mode 100644 index 0000000000..367d1648ec --- /dev/null +++ b/regex-syntax-2/src/hir/translate.rs @@ -0,0 +1,2472 @@ +// Copyright 2018 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/*! +Defines a translator that converts an `Ast` to an `Hir`. +*/ + +use std::cell::{Cell, RefCell}; +use std::result; + +use ast::{self, Ast, Span, Visitor}; +use hir::{self, Error, ErrorKind, Hir}; +use unicode::{self, ClassQuery}; + +type Result = result::Result; + +/// A builder for constructing an AST->HIR translator. 
+#[derive(Clone, Debug)] +pub struct TranslatorBuilder { + allow_invalid_utf8: bool, + flags: Flags, +} + +impl Default for TranslatorBuilder { + fn default() -> TranslatorBuilder { + TranslatorBuilder::new() + } +} + +impl TranslatorBuilder { + /// Create a new translator builder with a default c onfiguration. + pub fn new() -> TranslatorBuilder { + TranslatorBuilder { + allow_invalid_utf8: false, + flags: Flags::default(), + } + } + + /// Build a translator using the current configuration. + pub fn build(&self) -> Translator { + Translator { + stack: RefCell::new(vec![]), + flags: Cell::new(self.flags), + allow_invalid_utf8: self.allow_invalid_utf8, + } + } + + /// When enabled, translation will permit the construction of a regular + /// expression that may match invalid UTF-8. + /// + /// When disabled (the default), the translator is guaranteed to produce + /// an expression that will only ever match valid UTF-8 (otherwise, the + /// translator will return an error). + pub fn allow_invalid_utf8( + &mut self, + yes: bool, + ) -> &mut TranslatorBuilder { + self.allow_invalid_utf8 = yes; + self + } + + /// Enable or disable the case insensitive flag (`i`) by default. + pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.case_insensitive = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the multi-line matching flag (`m`) by default. + pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.multi_line = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the "dot matches any character" flag (`s`) by + /// default. + pub fn dot_matches_new_line( + &mut self, + yes: bool, + ) -> &mut TranslatorBuilder { + self.flags.dot_matches_new_line = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the "swap greed" flag (`U`) by default. 
+ pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.swap_greed = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the Unicode flag (`u`) by default. + pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.unicode = if yes { None } else { Some(false) }; + self + } +} + +/// A translator maps abstract syntax to a high level intermediate +/// representation. +/// +/// A translator may be benefit from reuse. That is, a translator can translate +/// many abstract syntax trees. +/// +/// A `Translator` can be configured in more detail via a +/// [`TranslatorBuilder`](struct.TranslatorBuilder.html). +#[derive(Clone, Debug)] +pub struct Translator { + /// Our call stack, but on the heap. + stack: RefCell>, + /// The current flag settings. + flags: Cell, + /// Whether we're allowed to produce HIR that can match arbitrary bytes. + allow_invalid_utf8: bool, +} + +impl Translator { + /// Create a new translator using the default configuration. + pub fn new() -> Translator { + TranslatorBuilder::new().build() + } + + /// Translate the given abstract syntax tree (AST) into a high level + /// intermediate representation (HIR). + /// + /// If there was a problem doing the translation, then an HIR-specific + /// error is returned. + /// + /// The original pattern string used to produce the `Ast` *must* also be + /// provided. The translator does not use the pattern string during any + /// correct translation, but is used for error reporting. + pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result { + ast::visit(ast, TranslatorI::new(self, pattern)) + } +} + +/// An HirFrame is a single stack frame, represented explicitly, which is +/// created for each item in the Ast that we traverse. +/// +/// Note that technically, this type doesn't represent our entire stack +/// frame. In particular, the Ast visitor represents any state associated with +/// traversing the Ast itself. 
#[derive(Clone, Debug)]
enum HirFrame {
    /// An arbitrary HIR expression. These get pushed whenever we hit a base
    /// case in the Ast. They get popped after an inductive (i.e., recursive)
    /// step is complete.
    Expr(Hir),
    /// A Unicode character class. This frame is mutated as we descend into
    /// the Ast of a character class (which is itself its own mini recursive
    /// structure).
    ClassUnicode(hir::ClassUnicode),
    /// A byte-oriented character class. This frame is mutated as we descend
    /// into the Ast of a character class (which is itself its own mini
    /// recursive structure).
    ///
    /// Byte character classes are created when Unicode mode (`u`) is disabled.
    /// If `allow_invalid_utf8` is disabled (the default), then a byte
    /// character is only permitted to match ASCII text.
    ClassBytes(hir::ClassBytes),
    /// This is pushed on to the stack upon first seeing any kind of group,
    /// indicated by parentheses (including non-capturing groups). It is popped
    /// upon leaving a group.
    Group {
        /// The old active flags, if any, when this group was opened.
        ///
        /// If this group sets flags, then the new active flags are set to the
        /// result of merging the old flags with the flags introduced by this
        /// group.
        ///
        /// When this group is popped, the active flags should be restored to
        /// the flags set here.
        ///
        /// The "active" flags correspond to whatever flags are set in the
        /// Translator.
        old_flags: Option<Flags>,
    },
    /// This is pushed whenever a concatenation is observed. After visiting
    /// every sub-expression in the concatenation, the translator's stack is
    /// popped until it sees a Concat frame.
    Concat,
    /// This is pushed whenever an alternation is observed. After visiting
    /// every sub-expression in the alternation, the translator's stack is
    /// popped until it sees an Alternation frame.
    Alternation,
}

impl HirFrame {
    /// Assert that the current stack frame is an Hir expression and return it.
    ///
    /// # Panics
    ///
    /// Panics if this frame is any other variant.
    fn unwrap_expr(self) -> Hir {
        match self {
            HirFrame::Expr(expr) => expr,
            _ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self)
        }
    }

    /// Assert that the current stack frame is a Unicode class expression and
    /// return it.
    ///
    /// # Panics
    ///
    /// Panics if this frame is any other variant.
    fn unwrap_class_unicode(self) -> hir::ClassUnicode {
        match self {
            HirFrame::ClassUnicode(cls) => cls,
            _ => panic!("tried to unwrap Unicode class \
                         from HirFrame, got: {:?}", self)
        }
    }

    /// Assert that the current stack frame is a byte class expression and
    /// return it.
    ///
    /// # Panics
    ///
    /// Panics if this frame is any other variant.
    fn unwrap_class_bytes(self) -> hir::ClassBytes {
        match self {
            HirFrame::ClassBytes(cls) => cls,
            _ => panic!("tried to unwrap byte class \
                         from HirFrame, got: {:?}", self)
        }
    }

    /// Assert that the current stack frame is a group indicator and return
    /// its corresponding flags (the flags that were active at the time the
    /// group was entered) if they exist.
    ///
    /// # Panics
    ///
    /// Panics if this frame is any other variant.
    fn unwrap_group(self) -> Option<Flags> {
        match self {
            HirFrame::Group { old_flags } => old_flags,
            _ => panic!("tried to unwrap group from HirFrame, got: {:?}", self)
        }
    }
}

impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
    type Output = Hir;
    type Err = Error;

    /// Called once traversal is complete; yields the finished `Hir`.
    fn finish(self) -> Result<Hir> {
        if self.trans().stack.borrow().is_empty() {
            // This can happen if the Ast given consists of a single set of
            // flags. e.g., `(?i)`. /shrug
            return Ok(Hir::empty());
        }
        // ... otherwise, we should have exactly one HIR on the stack.
        assert_eq!(self.trans().stack.borrow().len(), 1);
        Ok(self.pop().unwrap().unwrap_expr())
    }

    /// Runs before the children of `ast` are visited.
    ///
    /// Pushes the marker/accumulator frame that the matching `visit_post`
    /// arm will later pop: an empty class for bracketed classes, a `Group`
    /// frame carrying the flags to restore, or a `Concat`/`Alternation`
    /// marker. Empty concatenations and alternations push nothing.
    fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
        match *ast {
            Ast::Class(ast::Class::Bracketed(_)) => {
                // The class kind is chosen by the Unicode flag in effect at
                // the point the class is opened.
                if self.flags().unicode() {
                    let cls = hir::ClassUnicode::empty();
                    self.push(HirFrame::ClassUnicode(cls));
                } else {
                    let cls = hir::ClassBytes::empty();
                    self.push(HirFrame::ClassBytes(cls));
                }
            }
            Ast::Group(ref x) => {
                // If the group carries flags, apply them now and keep what
                // `set_flags` returns so it can be restored on exit.
                let old_flags = x.flags().map(|ast| self.set_flags(ast));
                self.push(HirFrame::Group {
                    old_flags: old_flags,
                });
            }
            Ast::Concat(ref x) if x.asts.is_empty() => {}
            Ast::Concat(_) => {
                self.push(HirFrame::Concat);
            }
            Ast::Alternation(ref x) if x.asts.is_empty() => {}
            Ast::Alternation(_) => {
                self.push(HirFrame::Alternation);
            }
            _ => {}
        }
        Ok(())
    }

    /// Runs after the children of `ast` have been visited.
    ///
    /// Leaf nodes push their translated expression. Inductive nodes pop the
    /// frames produced by their children (and by `visit_pre`) and push the
    /// combined expression in their place.
    fn visit_post(&mut self, ast: &Ast) -> Result<()> {
        match *ast {
            Ast::Empty(_) => {
                self.push(HirFrame::Expr(Hir::empty()));
            }
            Ast::Flags(ref x) => {
                // A bare flag item only changes the active flags; no frame
                // is pushed for it.
                self.set_flags(&x.flags);
            }
            Ast::Literal(ref x) => {
                self.push(HirFrame::Expr(try!(self.hir_literal(x))));
            }
            Ast::Dot(span) => {
                self.push(HirFrame::Expr(try!(self.hir_dot(span))));
            }
            Ast::Assertion(ref x) => {
                self.push(HirFrame::Expr(self.hir_assertion(x)));
            }
            Ast::Class(ast::Class::Perl(ref x)) => {
                if self.flags().unicode() {
                    let cls = self.hir_perl_unicode_class(x);
                    let hcls = hir::Class::Unicode(cls);
                    self.push(HirFrame::Expr(Hir::class(hcls)));
                } else {
                    let cls = self.hir_perl_byte_class(x);
                    let hcls = hir::Class::Bytes(cls);
                    self.push(HirFrame::Expr(Hir::class(hcls)));
                }
            }
            Ast::Class(ast::Class::Unicode(ref x)) => {
                let cls = hir::Class::Unicode(try!(self.hir_unicode_class(x)));
                self.push(HirFrame::Expr(Hir::class(cls)));
            }
            Ast::Class(ast::Class::Bracketed(ref ast)) => {
                // Pop the class frame pushed by `visit_pre`, apply folding
                // and negation, and reject a class left with no ranges.
                if self.flags().unicode() {
                    let mut cls = self.pop().unwrap().unwrap_class_unicode();
                    self.unicode_fold_and_negate(ast.negated, &mut cls);
                    if cls.iter().next().is_none() {
                        return Err(self.error(
                            ast.span, ErrorKind::EmptyClassNotAllowed));
                    }
                    let expr = Hir::class(hir::Class::Unicode(cls));
                    self.push(HirFrame::Expr(expr));
                } else {
                    let mut cls = self.pop().unwrap().unwrap_class_bytes();
                    try!(self.bytes_fold_and_negate(
                        &ast.span, ast.negated, &mut cls));
                    if cls.iter().next().is_none() {
                        return Err(self.error(
                            ast.span, ErrorKind::EmptyClassNotAllowed));
                    }

                    let expr = Hir::class(hir::Class::Bytes(cls));
                    self.push(HirFrame::Expr(expr));
                }
            }
            Ast::Repetition(ref x) => {
                let expr = self.pop().unwrap().unwrap_expr();
                self.push(HirFrame::Expr(self.hir_repetition(x, expr)));
            }
            Ast::Group(ref x) => {
                // Order matters: the group's body sits on top of the `Group`
                // frame that `visit_pre` pushed.
                let expr = self.pop().unwrap().unwrap_expr();
                if let Some(flags) = self.pop().unwrap().unwrap_group() {
                    self.trans().flags.set(flags);
                }
                self.push(HirFrame::Expr(self.hir_group(x, expr)));
            }
            Ast::Concat(_) => {
                // Pop `Expr` frames until a non-`Expr` frame is popped (and
                // discarded) or the stack runs out. Frames come off in
                // reverse order, hence the `reverse`.
                let mut exprs = vec![];
                while let Some(HirFrame::Expr(expr)) = self.pop() {
                    exprs.push(expr);
                }
                exprs.reverse();
                self.push(HirFrame::Expr(Hir::concat(exprs)));
            }
            Ast::Alternation(_) => {
                // Same unwinding as for `Concat` above.
                let mut exprs = vec![];
                while let Some(HirFrame::Expr(expr)) = self.pop() {
                    exprs.push(expr);
                }
                exprs.reverse();
                self.push(HirFrame::Expr(Hir::alternation(exprs)));
            }
        }
        Ok(())
    }

    /// Runs before the children of a class set item are visited.
    ///
    /// A nested bracketed class gets its own empty accumulator frame, chosen
    /// by the Unicode flag currently in effect.
    fn visit_class_set_item_pre(
        &mut self,
        ast: &ast::ClassSetItem,
    ) -> Result<()> {
        match *ast {
            ast::ClassSetItem::Bracketed(_) => {
                if self.flags().unicode() {
                    let cls = hir::ClassUnicode::empty();
                    self.push(HirFrame::ClassUnicode(cls));
                } else {
                    let cls = hir::ClassBytes::empty();
                    self.push(HirFrame::ClassBytes(cls));
                }
            }
            // We needn't handle the Union case here since the visitor will
            // do it for us.
+ _ => {} + } + Ok(()) + } + + fn visit_class_set_item_post( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<()> { + match *ast { + ast::ClassSetItem::Empty(_) => {} + ast::ClassSetItem::Literal(ref x) => { + if self.flags().unicode() { + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.push(hir::ClassRangeUnicode::new(x.c, x.c)); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + let byte = try!(self.class_literal_byte(x)); + cls.push(hir::ClassRangeBytes::new(byte, byte)); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Range(ref x) => { + if self.flags().unicode() { + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.push(hir::ClassRangeUnicode::new(x.start.c, x.end.c)); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + let start = try!(self.class_literal_byte(&x.start)); + let end = try!(self.class_literal_byte(&x.end)); + cls.push(hir::ClassRangeBytes::new(start, end)); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Ascii(ref x) => { + if self.flags().unicode() { + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + for &(s, e) in ascii_class(&x.kind) { + cls.push(hir::ClassRangeUnicode::new(s, e)); + } + self.unicode_fold_and_negate(x.negated, &mut cls); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + for &(s, e) in ascii_class(&x.kind) { + cls.push(hir::ClassRangeBytes::new(s as u8, e as u8)); + } + try!(self.bytes_fold_and_negate( + &x.span, x.negated, &mut cls)); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Unicode(ref x) => { + let xcls = try!(self.hir_unicode_class(x)); + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.union(&xcls); + self.push(HirFrame::ClassUnicode(cls)); + } + ast::ClassSetItem::Perl(ref x) => { + if self.flags().unicode() { 
+ let xcls = self.hir_perl_unicode_class(x); + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.union(&xcls); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let xcls = self.hir_perl_byte_class(x); + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + cls.union(&xcls); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Bracketed(ref ast) => { + if self.flags().unicode() { + let mut cls1 = self.pop().unwrap().unwrap_class_unicode(); + self.unicode_fold_and_negate(ast.negated, &mut cls1); + + let mut cls2 = self.pop().unwrap().unwrap_class_unicode(); + cls2.union(&cls1); + self.push(HirFrame::ClassUnicode(cls2)); + } else { + let mut cls1 = self.pop().unwrap().unwrap_class_bytes(); + try!(self.bytes_fold_and_negate( + &ast.span, ast.negated, &mut cls1)); + + let mut cls2 = self.pop().unwrap().unwrap_class_bytes(); + cls2.union(&cls1); + self.push(HirFrame::ClassBytes(cls2)); + } + } + // This is handled automatically by the visitor. + ast::ClassSetItem::Union(_) => {} + } + Ok(()) + } + + fn visit_class_set_binary_op_pre( + &mut self, + _op: &ast::ClassSetBinaryOp, + ) -> Result<()> { + if self.flags().unicode() { + let cls = hir::ClassUnicode::empty(); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let cls = hir::ClassBytes::empty(); + self.push(HirFrame::ClassBytes(cls)); + } + Ok(()) + } + + fn visit_class_set_binary_op_in( + &mut self, + _op: &ast::ClassSetBinaryOp, + ) -> Result<()> { + if self.flags().unicode() { + let cls = hir::ClassUnicode::empty(); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let cls = hir::ClassBytes::empty(); + self.push(HirFrame::ClassBytes(cls)); + } + Ok(()) + } + + fn visit_class_set_binary_op_post( + &mut self, + op: &ast::ClassSetBinaryOp, + ) -> Result<()> { + use ast::ClassSetBinaryOpKind::*; + + if self.flags().unicode() { + let mut rhs = self.pop().unwrap().unwrap_class_unicode(); + let mut lhs = self.pop().unwrap().unwrap_class_unicode(); + let mut cls = 
self.pop().unwrap().unwrap_class_unicode(); + if self.flags().case_insensitive() { + rhs.case_fold_simple(); + lhs.case_fold_simple(); + } + match op.kind { + Intersection => lhs.intersect(&rhs), + Difference => lhs.difference(&rhs), + SymmetricDifference => lhs.symmetric_difference(&rhs), + } + cls.union(&lhs); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let mut rhs = self.pop().unwrap().unwrap_class_bytes(); + let mut lhs = self.pop().unwrap().unwrap_class_bytes(); + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + if self.flags().case_insensitive() { + rhs.case_fold_simple(); + lhs.case_fold_simple(); + } + match op.kind { + Intersection => lhs.intersect(&rhs), + Difference => lhs.difference(&rhs), + SymmetricDifference => lhs.symmetric_difference(&rhs), + } + cls.union(&lhs); + self.push(HirFrame::ClassBytes(cls)); + } + Ok(()) + } +} + +/// The internal implementation of a translator. +/// +/// This type is responsible for carrying around the original pattern string, +/// which is not tied to the internal state of a translator. +/// +/// A TranslatorI exists for the time it takes to translate a single Ast. +#[derive(Clone, Debug)] +struct TranslatorI<'t, 'p> { + trans: &'t Translator, + pattern: &'p str, +} + +impl<'t, 'p> TranslatorI<'t, 'p> { + /// Build a new internal translator. + fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> { + TranslatorI { trans: trans, pattern: pattern } + } + + /// Return a reference to the underlying translator. + fn trans(&self) -> &Translator { + &self.trans + } + + /// Push the given frame on to the call stack. + fn push(&self, frame: HirFrame) { + self.trans().stack.borrow_mut().push(frame); + } + + /// Pop the top of the call stack. If the call stack is empty, return None. + fn pop(&self) -> Option { + self.trans().stack.borrow_mut().pop() + } + + /// Create a new error with the given span and error type. 
+ fn error(&self, span: Span, kind: ErrorKind) -> Error { + Error { kind: kind, pattern: self.pattern.to_string(), span: span } + } + + /// Return a copy of the active flags. + fn flags(&self) -> Flags { + self.trans().flags.get() + } + + /// Set the flags of this translator from the flags set in the given AST. + /// Then, return the old flags. + fn set_flags(&self, ast_flags: &ast::Flags) -> Flags { + let old_flags = self.flags(); + let mut new_flags = Flags::from_ast(ast_flags); + new_flags.merge(&old_flags); + self.trans().flags.set(new_flags); + old_flags + } + + fn hir_literal(&self, lit: &ast::Literal) -> Result { + let ch = match try!(self.literal_to_char(lit)) { + byte @ hir::Literal::Byte(_) => return Ok(Hir::literal(byte)), + hir::Literal::Unicode(ch) => ch, + }; + if self.flags().case_insensitive() { + self.hir_from_char_case_insensitive(lit.span, ch) + } else { + self.hir_from_char(lit.span, ch) + } + } + + /// Convert an Ast literal to its scalar representation. + /// + /// When Unicode mode is enabled, then this always succeeds and returns a + /// `char` (Unicode scalar value). + /// + /// When Unicode mode is disabled, then a raw byte is returned. If that + /// byte is not ASCII and invalid UTF-8 is not allowed, then this returns + /// an error. 
+ fn literal_to_char(&self, lit: &ast::Literal) -> Result { + if self.flags().unicode() { + return Ok(hir::Literal::Unicode(lit.c)); + } + let byte = match lit.byte() { + None => return Ok(hir::Literal::Unicode(lit.c)), + Some(byte) => byte, + }; + if byte <= 0x7F { + return Ok(hir::Literal::Unicode(byte as char)); + } + if !self.trans().allow_invalid_utf8 { + return Err(self.error(lit.span, ErrorKind::InvalidUtf8)); + } + Ok(hir::Literal::Byte(byte)) + } + + fn hir_from_char(&self, span: Span, c: char) -> Result { + if !self.flags().unicode() && c.len_utf8() > 1 { + return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); + } + Ok(Hir::literal(hir::Literal::Unicode(c))) + } + + fn hir_from_char_case_insensitive( + &self, + span: Span, + c: char, + ) -> Result { + // If case folding won't do anything, then don't bother trying. + if !unicode::contains_simple_case_mapping(c, c) { + return self.hir_from_char(span, c); + } + if self.flags().unicode() { + let mut cls = hir::ClassUnicode::new(vec![ + hir::ClassRangeUnicode::new(c, c), + ]); + cls.case_fold_simple(); + Ok(Hir::class(hir::Class::Unicode(cls))) + } else { + if c.len_utf8() > 1 { + return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); + } + let mut cls = hir::ClassBytes::new(vec![ + hir::ClassRangeBytes::new(c as u8, c as u8), + ]); + cls.case_fold_simple(); + Ok(Hir::class(hir::Class::Bytes(cls))) + } + } + + fn hir_dot(&self, span: Span) -> Result { + let unicode = self.flags().unicode(); + if !unicode && !self.trans().allow_invalid_utf8 { + return Err(self.error(span, ErrorKind::InvalidUtf8)); + } + Ok(if self.flags().dot_matches_new_line() { + Hir::any(!unicode) + } else { + Hir::dot(!unicode) + }) + } + + fn hir_assertion(&self, asst: &ast::Assertion) -> Hir { + let unicode = self.flags().unicode(); + let multi_line = self.flags().multi_line(); + match asst.kind { + ast::AssertionKind::StartLine => { + Hir::anchor(if multi_line { + hir::Anchor::StartLine + } else { + hir::Anchor::StartText + }) 
+ } + ast::AssertionKind::EndLine => { + Hir::anchor(if multi_line { + hir::Anchor::EndLine + } else { + hir::Anchor::EndText + }) + } + ast::AssertionKind::StartText => { + Hir::anchor(hir::Anchor::StartText) + } + ast::AssertionKind::EndText => { + Hir::anchor(hir::Anchor::EndText) + } + ast::AssertionKind::WordBoundary => { + Hir::word_boundary(if unicode { + hir::WordBoundary::Unicode + } else { + hir::WordBoundary::Ascii + }) + } + ast::AssertionKind::NotWordBoundary => { + Hir::word_boundary(if unicode { + hir::WordBoundary::UnicodeNegate + } else { + hir::WordBoundary::AsciiNegate + }) + } + } + } + + fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir { + let kind = match group.kind { + ast::GroupKind::CaptureIndex(idx) => { + hir::GroupKind::CaptureIndex(idx) + } + ast::GroupKind::CaptureName(ref capname) => { + hir::GroupKind::CaptureName { + name: capname.name.clone(), + index: capname.index, + } + } + ast::GroupKind::NonCapturing(_) => hir::GroupKind::NonCapturing, + }; + Hir::group(hir::Group { + kind: kind, + hir: Box::new(expr), + }) + } + + fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir { + let kind = match rep.op.kind { + ast::RepetitionKind::ZeroOrOne => hir::RepetitionKind::ZeroOrOne, + ast::RepetitionKind::ZeroOrMore => hir::RepetitionKind::ZeroOrMore, + ast::RepetitionKind::OneOrMore => hir::RepetitionKind::OneOrMore, + ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => { + hir::RepetitionKind::Range(hir::RepetitionRange::Exactly(m)) + } + ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => { + hir::RepetitionKind::Range(hir::RepetitionRange::AtLeast(m)) + } + ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(m,n)) => { + hir::RepetitionKind::Range(hir::RepetitionRange::Bounded(m, n)) + } + }; + let greedy = + if self.flags().swap_greed() { + !rep.greedy + } else { + rep.greedy + }; + Hir::repetition(hir::Repetition { + kind: kind, + greedy: greedy, + hir: Box::new(expr), + }) + } + + fn 
hir_unicode_class( + &self, + ast_class: &ast::ClassUnicode, + ) -> Result { + use ast::ClassUnicodeKind::*; + + if !self.flags().unicode() { + return Err(self.error( + ast_class.span, + ErrorKind::UnicodeNotAllowed, + )); + } + let query = match ast_class.kind { + OneLetter(name) => ClassQuery::OneLetter(name), + Named(ref name) => ClassQuery::Binary(name), + NamedValue { ref name, ref value, .. } => { + ClassQuery::ByValue { + property_name: name, + property_value: value, + } + } + }; + match unicode::class(query) { + Ok(mut class) => { + self.unicode_fold_and_negate(ast_class.negated, &mut class); + Ok(class) + } + Err(unicode::Error::PropertyNotFound) => { + Err(self.error( + ast_class.span, + ErrorKind::UnicodePropertyNotFound, + )) + } + Err(unicode::Error::PropertyValueNotFound) => { + Err(self.error( + ast_class.span, + ErrorKind::UnicodePropertyValueNotFound, + )) + } + } + } + + fn hir_perl_unicode_class( + &self, + ast_class: &ast::ClassPerl, + ) -> hir::ClassUnicode { + use ast::ClassPerlKind::*; + use unicode_tables::perl_word::PERL_WORD; + + assert!(self.flags().unicode()); + let mut class = match ast_class.kind { + Digit => { + let query = ClassQuery::Binary("Decimal_Number"); + unicode::class(query).unwrap() + } + Space => { + let query = ClassQuery::Binary("Whitespace"); + unicode::class(query).unwrap() + } + Word => unicode::hir_class(PERL_WORD), + }; + // We needn't apply case folding here because the Perl Unicode classes + // are already closed under Unicode simple case folding. 
+ if ast_class.negated { + class.negate(); + } + class + } + + fn hir_perl_byte_class( + &self, + ast_class: &ast::ClassPerl, + ) -> hir::ClassBytes { + use ast::ClassPerlKind::*; + + assert!(!self.flags().unicode()); + let mut class = match ast_class.kind { + Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit), + Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space), + Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word), + }; + // We needn't apply case folding here because the Perl ASCII classes + // are already closed (under ASCII case folding). + if ast_class.negated { + class.negate(); + } + class + } + + fn unicode_fold_and_negate( + &self, + negated: bool, + class: &mut hir::ClassUnicode, + ) { + // Note that we must apply case folding before negation! + // Consider `(?i)[^x]`. If we applied negation field, then + // the result would be the character class that matched any + // Unicode scalar value. + if self.flags().case_insensitive() { + class.case_fold_simple(); + } + if negated { + class.negate(); + } + } + + fn bytes_fold_and_negate( + &self, + span: &Span, + negated: bool, + class: &mut hir::ClassBytes, + ) -> Result<()> { + // Note that we must apply case folding before negation! + // Consider `(?i)[^x]`. If we applied negation field, then + // the result would be the character class that matched any + // Unicode scalar value. + if self.flags().case_insensitive() { + class.case_fold_simple(); + } + if negated { + class.negate(); + } + if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() { + return Err(self.error(span.clone(), ErrorKind::InvalidUtf8)); + } + Ok(()) + } + + /// Return a scalar byte value suitable for use as a literal in a byte + /// character class. 
+ fn class_literal_byte(&self, ast: &ast::Literal) -> Result { + match try!(self.literal_to_char(ast)) { + hir::Literal::Byte(byte) => Ok(byte), + hir::Literal::Unicode(ch) => { + if ch <= 0x7F as char { + Ok(ch as u8) + } else { + // We can't feasibly support Unicode in + // byte oriented classes. Byte classes don't + // do Unicode case folding. + Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed)) + } + } + } + } +} + +/// A translator's representation of a regular expression's flags at any given +/// moment in time. +/// +/// Each flag can be in one of three states: absent, present but disabled or +/// present but enabled. +#[derive(Clone, Copy, Debug, Default)] +struct Flags { + case_insensitive: Option, + multi_line: Option, + dot_matches_new_line: Option, + swap_greed: Option, + unicode: Option, + // Note that `ignore_whitespace` is omitted here because it is handled + // entirely in the parser. +} + +impl Flags { + fn from_ast(ast: &ast::Flags) -> Flags { + let mut flags = Flags::default(); + let mut enable = true; + for item in &ast.items { + match item.kind { + ast::FlagsItemKind::Negation => { + enable = false; + } + ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => { + flags.case_insensitive = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => { + flags.multi_line = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => { + flags.dot_matches_new_line = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => { + flags.swap_greed = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::Unicode) => { + flags.unicode = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {} + } + } + flags + } + + fn merge(&mut self, previous: &Flags) { + if self.case_insensitive.is_none() { + self.case_insensitive = previous.case_insensitive; + } + if self.multi_line.is_none() { + self.multi_line = previous.multi_line; + } + if self.dot_matches_new_line.is_none() { + 
self.dot_matches_new_line = previous.dot_matches_new_line; + } + if self.swap_greed.is_none() { + self.swap_greed = previous.swap_greed; + } + if self.unicode.is_none() { + self.unicode = previous.unicode; + } + } + + fn case_insensitive(&self) -> bool { + self.case_insensitive.unwrap_or(false) + } + + fn multi_line(&self) -> bool { + self.multi_line.unwrap_or(false) + } + + fn dot_matches_new_line(&self) -> bool { + self.dot_matches_new_line.unwrap_or(false) + } + + fn swap_greed(&self) -> bool { + self.swap_greed.unwrap_or(false) + } + + fn unicode(&self) -> bool { + self.unicode.unwrap_or(true) + } +} + +fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes { + let ranges: Vec<_> = ascii_class(kind).iter().cloned().map(|(s, e)| { + hir::ClassRangeBytes::new(s as u8, e as u8) + }).collect(); + hir::ClassBytes::new(ranges) +} + +fn ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)] { + use ast::ClassAsciiKind::*; + + // TODO: Get rid of these consts, which appear necessary for older + // versions of Rust. 
+ type T = &'static [(char, char)]; + match *kind { + Alnum => { + const X: T = &[('0', '9'), ('A', 'Z'), ('a', 'z')]; + X + } + Alpha => { + const X: T = &[('A', 'Z'), ('a', 'z')]; + X + } + Ascii => { + const X: T = &[('\x00', '\x7F')]; + X + } + Blank => { + const X: T = &[(' ', '\t')]; + X + } + Cntrl => { + const X: T = &[('\x00', '\x1F'), ('\x7F', '\x7F')]; + X + } + Digit => { + const X: T = &[('0', '9')]; + X + } + Graph => { + const X: T = &[('!', '~')]; + X + } + Lower => { + const X: T = &[('a', 'z')]; + X + } + Print => { + const X: T = &[(' ', '~')]; + X + } + Punct => { + const X: T = &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')]; + X + } + Space => { + const X: T = &[ + ('\t', '\t'), ('\n', '\n'), ('\x0B', '\x0B'), ('\x0C', '\x0C'), + ('\r', '\r'), (' ', ' '), + ]; + X + } + Upper => { + const X: T = &[('A', 'Z')]; + X + } + Word => { + const X: T = &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')]; + X + } + Xdigit => { + const X: T = &[('0', '9'), ('A', 'F'), ('a', 'f')]; + X + } + } +} + +#[cfg(test)] +mod tests { + use ast::{self, Ast, Position, Span}; + use ast::parse::ParserBuilder; + use hir::{self, Hir, HirKind}; + use unicode::{self, ClassQuery}; + + use super::{TranslatorBuilder, ascii_class}; + + // We create these errors to compare with real hir::Errors in the tests. + // We define equality between TestError and hir::Error to disregard the + // pattern string in hir::Error, which is annoying to provide in tests. 
+ #[derive(Clone, Debug)] + struct TestError { + span: Span, + kind: hir::ErrorKind, + } + + impl PartialEq for TestError { + fn eq(&self, other: &hir::Error) -> bool { + self.span == other.span && self.kind == other.kind + } + } + + impl PartialEq for hir::Error { + fn eq(&self, other: &TestError) -> bool { + self.span == other.span && self.kind == other.kind + } + } + + fn parse(pattern: &str) -> Ast { + ParserBuilder::new().octal(true).build().parse(pattern).unwrap() + } + + fn t(pattern: &str) -> Hir { + TranslatorBuilder::new() + .allow_invalid_utf8(false) + .build() + .translate(pattern, &parse(pattern)) + .unwrap() + } + + fn t_err(pattern: &str) -> hir::Error { + TranslatorBuilder::new() + .allow_invalid_utf8(false) + .build() + .translate(pattern, &parse(pattern)) + .unwrap_err() + } + + fn t_bytes(pattern: &str) -> Hir { + TranslatorBuilder::new() + .allow_invalid_utf8(true) + .build() + .translate(pattern, &parse(pattern)) + .unwrap() + } + + fn hir_lit(s: &str) -> Hir { + match s.len() { + 0 => Hir::empty(), + _ => { + let lits = s + .chars() + .map(hir::Literal::Unicode) + .map(Hir::literal) + .collect(); + Hir::concat(lits) + } + } + } + + fn hir_blit(s: &[u8]) -> Hir { + match s.len() { + 0 => Hir::empty(), + 1 => Hir::literal(hir::Literal::Byte(s[0])), + _ => { + let lits = s + .iter() + .cloned() + .map(hir::Literal::Byte) + .map(Hir::literal) + .collect(); + Hir::concat(lits) + } + } + } + + fn hir_group(i: u32, expr: Hir) -> Hir { + Hir::group(hir::Group { + kind: hir::GroupKind::CaptureIndex(i), + hir: Box::new(expr), + }) + } + + fn hir_group_name(i: u32, name: &str, expr: Hir) -> Hir { + Hir::group(hir::Group { + kind: hir::GroupKind::CaptureName { + name: name.to_string(), + index: i, + }, + hir: Box::new(expr), + }) + } + + fn hir_group_nocap(expr: Hir) -> Hir { + Hir::group(hir::Group { + kind: hir::GroupKind::NonCapturing, + hir: Box::new(expr), + }) + } + + fn hir_quest(greedy: bool, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { 
+ kind: hir::RepetitionKind::ZeroOrOne, + greedy: greedy, + hir: Box::new(expr), + }) + } + + fn hir_star(greedy: bool, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrMore, + greedy: greedy, + hir: Box::new(expr), + }) + } + + fn hir_plus(greedy: bool, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::OneOrMore, + greedy: greedy, + hir: Box::new(expr), + }) + } + + fn hir_range(greedy: bool, range: hir::RepetitionRange, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::Range(range), + greedy: greedy, + hir: Box::new(expr), + }) + } + + fn hir_alt(alts: Vec) -> Hir { + Hir::alternation(alts) + } + + fn hir_cat(exprs: Vec) -> Hir { + Hir::concat(exprs) + } + + fn hir_uclass_query(query: ClassQuery) -> Hir { + Hir::class(hir::Class::Unicode(unicode::class(query).unwrap())) + } + + fn hir_uclass_perl_word() -> Hir { + use unicode_tables::perl_word::PERL_WORD; + Hir::class(hir::Class::Unicode(unicode::hir_class(PERL_WORD))) + } + + fn hir_uclass(ranges: &[(char, char)]) -> Hir { + let ranges: Vec = ranges + .iter() + .map(|&(s, e)| hir::ClassRangeUnicode::new(s, e)) + .collect(); + Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(ranges))) + } + + fn hir_bclass(ranges: &[(u8, u8)]) -> Hir { + let ranges: Vec = ranges + .iter() + .map(|&(s, e)| hir::ClassRangeBytes::new(s, e)) + .collect(); + Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges))) + } + + fn hir_bclass_from_char(ranges: &[(char, char)]) -> Hir { + let ranges: Vec = ranges + .iter() + .map(|&(s, e)| { + assert!(s as u32 <= 0x7F); + assert!(e as u32 <= 0x7F); + hir::ClassRangeBytes::new(s as u8, e as u8) + }) + .collect(); + Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges))) + } + + fn hir_case_fold(expr: Hir) -> Hir { + match expr.into_kind() { + HirKind::Class(mut cls) => { + cls.case_fold_simple(); + Hir::class(cls) + } + _ => panic!("cannot case fold non-class Hir expr"), 
+ } + } + + fn hir_negate(expr: Hir) -> Hir { + match expr.into_kind() { + HirKind::Class(mut cls) => { + cls.negate(); + Hir::class(cls) + } + _ => panic!("cannot negate non-class Hir expr"), + } + } + + fn hir_union(expr1: Hir, expr2: Hir) -> Hir { + use hir::Class::{Bytes, Unicode}; + + match (expr1.into_kind(), expr2.into_kind()) { + ( + HirKind::Class(Unicode(mut c1)), + HirKind::Class(Unicode(c2)), + ) => { + c1.union(&c2); + Hir::class(hir::Class::Unicode(c1)) + } + ( + HirKind::Class(Bytes(mut c1)), + HirKind::Class(Bytes(c2)), + ) => { + c1.union(&c2); + Hir::class(hir::Class::Bytes(c1)) + } + _ => panic!("cannot union non-class Hir exprs"), + } + } + + fn hir_difference(expr1: Hir, expr2: Hir) -> Hir { + use hir::Class::{Bytes, Unicode}; + + match (expr1.into_kind(), expr2.into_kind()) { + ( + HirKind::Class(Unicode(mut c1)), + HirKind::Class(Unicode(c2)), + ) => { + c1.difference(&c2); + Hir::class(hir::Class::Unicode(c1)) + } + ( + HirKind::Class(Bytes(mut c1)), + HirKind::Class(Bytes(c2)), + ) => { + c1.difference(&c2); + Hir::class(hir::Class::Bytes(c1)) + } + _ => panic!("cannot difference non-class Hir exprs"), + } + } + + fn hir_anchor(anchor: hir::Anchor) -> Hir { + Hir::anchor(anchor) + } + + fn hir_word(wb: hir::WordBoundary) -> Hir { + Hir::word_boundary(wb) + } + + #[test] + fn empty() { + assert_eq!(t(""), Hir::empty()); + assert_eq!(t("()"), hir_group(1, Hir::empty())); + assert_eq!(t("|"), hir_alt(vec![Hir::empty(), Hir::empty()])); + assert_eq!(t("()|()"), hir_alt(vec![ + hir_group(1, Hir::empty()), + hir_group(2, Hir::empty()), + ])); + assert_eq!(t("(|b)"), hir_group(1, hir_alt(vec![ + Hir::empty(), + hir_lit("b"), + ]))); + assert_eq!(t("(a|)"), hir_group(1, hir_alt(vec![ + hir_lit("a"), + Hir::empty(), + ]))); + assert_eq!(t("(a||c)"), hir_group(1, hir_alt(vec![ + hir_lit("a"), + Hir::empty(), + hir_lit("c"), + ]))); + assert_eq!(t("(||)"), hir_group(1, hir_alt(vec![ + Hir::empty(), + Hir::empty(), + Hir::empty(), + ]))); + } + + 
#[test] + fn literal() { + assert_eq!(t("a"), hir_lit("a")); + assert_eq!(t("(?-u)a"), hir_lit("a")); + assert_eq!(t("☃"), hir_lit("☃")); + assert_eq!(t("abcd"), hir_lit("abcd")); + + assert_eq!(t_bytes("(?-u)a"), hir_lit("a")); + assert_eq!(t_bytes("(?-u)\x61"), hir_lit("a")); + assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a")); + assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF")); + + assert_eq!(t_err("(?-u)☃"), TestError { + kind: hir::ErrorKind::UnicodeNotAllowed, + span: Span::new(Position::new(5, 1, 6), Position::new(8, 1, 7)), + }); + assert_eq!(t_err(r"(?-u)\xFF"), TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new(Position::new(5, 1, 6), Position::new(9, 1, 10)), + }); + } + + #[test] + fn literal_case_insensitive() { + assert_eq!(t("(?i)a"), hir_uclass(&[ + ('A', 'A'), ('a', 'a'), + ])); + assert_eq!(t("(?i:a)"), hir_group_nocap(hir_uclass(&[ + ('A', 'A'), ('a', 'a')], + ))); + assert_eq!(t("a(?i)a(?-i)a"), hir_cat(vec![ + hir_lit("a"), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_lit("a"), + ])); + assert_eq!(t("(?i)ab@c"), hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_uclass(&[('B', 'B'), ('b', 'b')]), + hir_lit("@"), + hir_uclass(&[('C', 'C'), ('c', 'c')]), + ])); + assert_eq!(t("(?i)β"), hir_uclass(&[ + ('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'), + ])); + + assert_eq!(t("(?i-u)a"), hir_bclass(&[ + (b'A', b'A'), (b'a', b'a'), + ])); + assert_eq!(t("(?-u)a(?i)a(?-i)a"), hir_cat(vec![ + hir_lit("a"), + hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), + hir_lit("a"), + ])); + assert_eq!(t("(?i-u)ab@c"), hir_cat(vec![ + hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), + hir_bclass(&[(b'B', b'B'), (b'b', b'b')]), + hir_lit("@"), + hir_bclass(&[(b'C', b'C'), (b'c', b'c')]), + ])); + + assert_eq!(t_bytes("(?i-u)a"), hir_bclass(&[ + (b'A', b'A'), (b'a', b'a'), + ])); + assert_eq!(t_bytes("(?i-u)\x61"), hir_bclass(&[ + (b'A', b'A'), (b'a', b'a'), + ])); + assert_eq!(t_bytes(r"(?i-u)\x61"), hir_bclass(&[ + (b'A', b'A'), (b'a', b'a'), + ])); + 
assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF")); + + assert_eq!(t_err("(?i-u)β"), TestError { + kind: hir::ErrorKind::UnicodeNotAllowed, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 8), + ), + }); + } + + #[test] + fn dot() { + assert_eq!(t("."), hir_uclass(&[ + ('\0', '\t'), + ('\x0B', '\u{10FFFF}'), + ])); + assert_eq!(t("(?s)."), hir_uclass(&[ + ('\0', '\u{10FFFF}'), + ])); + assert_eq!(t_bytes("(?-u)."), hir_bclass(&[ + (b'\0', b'\t'), + (b'\x0B', b'\xFF'), + ])); + assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[ + (b'\0', b'\xFF'), + ])); + + // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed. + assert_eq!(t_err("(?-u)."), TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new(Position::new(5, 1, 6), Position::new(6, 1, 7)), + }); + assert_eq!(t_err("(?s-u)."), TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new(Position::new(6, 1, 7), Position::new(7, 1, 8)), + }); + } + + #[test] + fn assertions() { + assert_eq!(t("^"), hir_anchor(hir::Anchor::StartText)); + assert_eq!(t("$"), hir_anchor(hir::Anchor::EndText)); + assert_eq!(t(r"\A"), hir_anchor(hir::Anchor::StartText)); + assert_eq!(t(r"\z"), hir_anchor(hir::Anchor::EndText)); + assert_eq!(t("(?m)^"), hir_anchor(hir::Anchor::StartLine)); + assert_eq!(t("(?m)$"), hir_anchor(hir::Anchor::EndLine)); + assert_eq!(t(r"(?m)\A"), hir_anchor(hir::Anchor::StartText)); + assert_eq!(t(r"(?m)\z"), hir_anchor(hir::Anchor::EndText)); + + assert_eq!(t(r"\b"), hir_word(hir::WordBoundary::Unicode)); + assert_eq!(t(r"\B"), hir_word(hir::WordBoundary::UnicodeNegate)); + assert_eq!(t(r"(?-u)\b"), hir_word(hir::WordBoundary::Ascii)); + assert_eq!(t(r"(?-u)\B"), hir_word(hir::WordBoundary::AsciiNegate)); + } + + #[test] + fn group() { + assert_eq!(t("(a)"), hir_group(1, hir_lit("a"))); + assert_eq!(t("(a)(b)"), hir_cat(vec![ + hir_group(1, hir_lit("a")), + hir_group(2, hir_lit("b")), + ])); + assert_eq!(t("(a)|(b)"), hir_alt(vec![ + hir_group(1, hir_lit("a")), 
+ hir_group(2, hir_lit("b")), + ])); + assert_eq!(t("(?P)"), hir_group_name(1, "foo", Hir::empty())); + assert_eq!(t("(?Pa)"), hir_group_name(1, "foo", hir_lit("a"))); + assert_eq!(t("(?Pa)(?Pb)"), hir_cat(vec![ + hir_group_name(1, "foo", hir_lit("a")), + hir_group_name(2, "bar", hir_lit("b")), + ])); + assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty())); + assert_eq!(t("(?:a)"), hir_group_nocap(hir_lit("a"))); + assert_eq!(t("(?:a)(b)"), hir_cat(vec![ + hir_group_nocap(hir_lit("a")), + hir_group(1, hir_lit("b")), + ])); + assert_eq!(t("(a)(?:b)(c)"), hir_cat(vec![ + hir_group(1, hir_lit("a")), + hir_group_nocap(hir_lit("b")), + hir_group(2, hir_lit("c")), + ])); + assert_eq!(t("(a)(?Pb)(c)"), hir_cat(vec![ + hir_group(1, hir_lit("a")), + hir_group_name(2, "foo", hir_lit("b")), + hir_group(3, hir_lit("c")), + ])); + } + + #[test] + fn flags() { + assert_eq!(t("(?i:a)a"), hir_cat(vec![ + hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')])), + hir_lit("a"), + ])); + assert_eq!(t("(?i-u:a)β"), hir_cat(vec![ + hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), + hir_lit("β"), + ])); + assert_eq!(t("(?i)(?-i:a)a"), hir_cat(vec![ + hir_group_nocap(hir_lit("a")), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + ])); + assert_eq!(t("(?im)a^"), hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_anchor(hir::Anchor::StartLine), + ])); + assert_eq!(t("(?im)a^(?i-m)a^"), hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_anchor(hir::Anchor::StartLine), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_anchor(hir::Anchor::StartText), + ])); + assert_eq!(t("(?U)a*a*?(?-U)a*a*?"), hir_cat(vec![ + hir_star(false, hir_lit("a")), + hir_star(true, hir_lit("a")), + hir_star(true, hir_lit("a")), + hir_star(false, hir_lit("a")), + ])); + assert_eq!(t("(?:a(?i)a)a"), hir_cat(vec![ + hir_group_nocap(hir_cat(vec![ + hir_lit("a"), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + ])), + hir_lit("a"), + ])); + assert_eq!(t("(?i)(?:a(?-i)a)a"), hir_cat(vec![ + 
hir_group_nocap(hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_lit("a"), + ])), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + ])); + } + + #[test] + fn escape() { + assert_eq!( + t(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"), + hir_lit(r"\.+*?()|[]{}^$#"), + ); + } + + #[test] + fn repetition() { + assert_eq!(t("a?"), hir_quest(true, hir_lit("a"))); + assert_eq!(t("a*"), hir_star(true, hir_lit("a"))); + assert_eq!(t("a+"), hir_plus(true, hir_lit("a"))); + assert_eq!(t("a??"), hir_quest(false, hir_lit("a"))); + assert_eq!(t("a*?"), hir_star(false, hir_lit("a"))); + assert_eq!(t("a+?"), hir_plus(false, hir_lit("a"))); + + assert_eq!( + t("a{1}"), + hir_range( + true, + hir::RepetitionRange::Exactly(1), + hir_lit("a"), + )); + assert_eq!( + t("a{1,}"), + hir_range( + true, + hir::RepetitionRange::AtLeast(1), + hir_lit("a"), + )); + assert_eq!( + t("a{1,2}"), + hir_range( + true, + hir::RepetitionRange::Bounded(1, 2), + hir_lit("a"), + )); + assert_eq!( + t("a{1}?"), + hir_range( + false, + hir::RepetitionRange::Exactly(1), + hir_lit("a"), + )); + assert_eq!( + t("a{1,}?"), + hir_range( + false, + hir::RepetitionRange::AtLeast(1), + hir_lit("a"), + )); + assert_eq!( + t("a{1,2}?"), + hir_range( + false, + hir::RepetitionRange::Bounded(1, 2), + hir_lit("a"), + )); + + assert_eq!(t("ab?"), hir_cat(vec![ + hir_lit("a"), + hir_quest(true, hir_lit("b")), + ])); + assert_eq!(t("(ab)?"), hir_quest(true, hir_group(1, hir_cat(vec![ + hir_lit("a"), + hir_lit("b"), + ])))); + assert_eq!(t("a|b?"), hir_alt(vec![ + hir_lit("a"), + hir_quest(true, hir_lit("b")), + ])); + } + + #[test] + fn cat_alt() { + assert_eq!(t("(ab)"), hir_group(1, hir_cat(vec![ + hir_lit("a"), + hir_lit("b"), + ]))); + assert_eq!(t("a|b"), hir_alt(vec![ + hir_lit("a"), + hir_lit("b"), + ])); + assert_eq!(t("a|b|c"), hir_alt(vec![ + hir_lit("a"), + hir_lit("b"), + hir_lit("c"), + ])); + assert_eq!(t("ab|bc|cd"), hir_alt(vec![ + hir_lit("ab"), + hir_lit("bc"), + hir_lit("cd"), + ])); + assert_eq!(t("(a|b)"), 
hir_group(1, hir_alt(vec![ + hir_lit("a"), + hir_lit("b"), + ]))); + assert_eq!(t("(a|b|c)"), hir_group(1, hir_alt(vec![ + hir_lit("a"), + hir_lit("b"), + hir_lit("c"), + ]))); + assert_eq!(t("(ab|bc|cd)"), hir_group(1, hir_alt(vec![ + hir_lit("ab"), + hir_lit("bc"), + hir_lit("cd"), + ]))); + assert_eq!(t("(ab|(bc|(cd)))"), hir_group(1, hir_alt(vec![ + hir_lit("ab"), + hir_group(2, hir_alt(vec![ + hir_lit("bc"), + hir_group(3, hir_lit("cd")), + ])), + ]))); + } + + #[test] + fn class_ascii() { + assert_eq!( + t("[[:alnum:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum))); + assert_eq!( + t("[[:alpha:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Alpha))); + assert_eq!( + t("[[:ascii:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Ascii))); + assert_eq!( + t("[[:blank:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Blank))); + assert_eq!( + t("[[:cntrl:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Cntrl))); + assert_eq!( + t("[[:digit:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Digit))); + assert_eq!( + t("[[:graph:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Graph))); + assert_eq!( + t("[[:lower:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower))); + assert_eq!( + t("[[:print:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Print))); + assert_eq!( + t("[[:punct:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Punct))); + assert_eq!( + t("[[:space:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Space))); + assert_eq!( + t("[[:upper:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Upper))); + assert_eq!( + t("[[:word:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Word))); + assert_eq!( + t("[[:xdigit:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Xdigit))); + + assert_eq!( + t("[[:^lower:]]"), + hir_negate(hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower)))); + assert_eq!( + t("(?i)[[:lower:]]"), + hir_uclass(&[ + ('A', 'Z'), ('a', 'z'), + ('\u{17F}', '\u{17F}'), + 
('\u{212A}', '\u{212A}'), + ])); + + assert_eq!( + t("(?-u)[[:lower:]]"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Lower))); + assert_eq!( + t("(?i-u)[[:lower:]]"), + hir_case_fold(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Lower)))); + + assert_eq!(t_err("(?-u)[[:^lower:]]"), TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new(Position::new(6, 1, 7), Position::new(16, 1, 17)), + }); + assert_eq!(t_err("(?i-u)[[:^lower:]]"), TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new(Position::new(7, 1, 8), Position::new(17, 1, 18)), + }); + } + + #[test] + fn class_perl() { + // Unicode + assert_eq!( + t(r"\d"), + hir_uclass_query(ClassQuery::Binary("digit"))); + assert_eq!( + t(r"\s"), + hir_uclass_query(ClassQuery::Binary("space"))); + assert_eq!( + t(r"\w"), + hir_uclass_perl_word()); + assert_eq!( + t(r"(?i)\d"), + hir_uclass_query(ClassQuery::Binary("digit"))); + assert_eq!( + t(r"(?i)\s"), + hir_uclass_query(ClassQuery::Binary("space"))); + assert_eq!( + t(r"(?i)\w"), + hir_uclass_perl_word()); + + // Unicode, negated + assert_eq!( + t(r"\D"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))); + assert_eq!( + t(r"\S"), + hir_negate(hir_uclass_query(ClassQuery::Binary("space")))); + assert_eq!( + t(r"\W"), + hir_negate(hir_uclass_perl_word())); + assert_eq!( + t(r"(?i)\D"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))); + assert_eq!( + t(r"(?i)\S"), + hir_negate(hir_uclass_query(ClassQuery::Binary("space")))); + assert_eq!( + t(r"(?i)\W"), + hir_negate(hir_uclass_perl_word())); + + // ASCII only + assert_eq!( + t(r"(?-u)\d"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit))); + assert_eq!( + t(r"(?-u)\s"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space))); + assert_eq!( + t(r"(?-u)\w"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word))); + assert_eq!( + t(r"(?i-u)\d"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit))); + 
assert_eq!( + t(r"(?i-u)\s"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space))); + assert_eq!( + t(r"(?i-u)\w"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word))); + + // ASCII only, negated + assert_eq!( + t(r"(?-u)\D"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Digit)))); + assert_eq!( + t(r"(?-u)\S"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Space)))); + assert_eq!( + t(r"(?-u)\W"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Word)))); + assert_eq!( + t(r"(?i-u)\D"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Digit)))); + assert_eq!( + t(r"(?i-u)\S"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Space)))); + assert_eq!( + t(r"(?i-u)\W"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Word)))); + } + + #[test] + fn class_unicode() { + assert_eq!( + t(r"\pZ"), + hir_uclass_query(ClassQuery::Binary("Z"))); + assert_eq!( + t(r"\pz"), + hir_uclass_query(ClassQuery::Binary("Z"))); + assert_eq!( + t(r"\p{Separator}"), + hir_uclass_query(ClassQuery::Binary("Z"))); + assert_eq!( + t(r"\p{se PaRa ToR}"), + hir_uclass_query(ClassQuery::Binary("Z"))); + assert_eq!( + t(r"\p{gc:Separator}"), + hir_uclass_query(ClassQuery::Binary("Z"))); + assert_eq!( + t(r"\p{gc=Separator}"), + hir_uclass_query(ClassQuery::Binary("Z"))); + + assert_eq!( + t(r"\PZ"), + hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))); + assert_eq!( + t(r"\P{separator}"), + hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))); + assert_eq!( + t(r"\P{gc!=separator}"), + hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))); + + assert_eq!( + t(r"\p{Greek}"), + hir_uclass_query(ClassQuery::Binary("Greek"))); + assert_eq!( + t(r"(?i)\p{Greek}"), + hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek")))); + assert_eq!( + t(r"(?i)\P{Greek}"), + hir_negate(hir_case_fold(hir_uclass_query( + 
ClassQuery::Binary("Greek"))))); + + assert_eq!(t_err(r"(?-u)\pZ"), TestError { + kind: hir::ErrorKind::UnicodeNotAllowed, + span: Span::new(Position::new(5, 1, 6), Position::new(8, 1, 9)), + }); + assert_eq!(t_err(r"(?-u)\p{Separator}"), TestError { + kind: hir::ErrorKind::UnicodeNotAllowed, + span: Span::new(Position::new(5, 1, 6), Position::new(18, 1, 19)), + }); + assert_eq!(t_err(r"\pE"), TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new(Position::new(0, 1, 1), Position::new(3, 1, 4)), + }); + assert_eq!(t_err(r"\p{Foo}"), TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new(Position::new(0, 1, 1), Position::new(7, 1, 8)), + }); + assert_eq!(t_err(r"\p{gc:Foo}"), TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new(Position::new(0, 1, 1), Position::new(10, 1, 11)), + }); + assert_eq!(t_err(r"\p{sc:Foo}"), TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new(Position::new(0, 1, 1), Position::new(10, 1, 11)), + }); + assert_eq!(t_err(r"\p{scx:Foo}"), TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new(Position::new(0, 1, 1), Position::new(11, 1, 12)), + }); + assert_eq!(t_err(r"\p{age:Foo}"), TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new(Position::new(0, 1, 1), Position::new(11, 1, 12)), + }); + } + + #[test] + fn class_bracketed() { + assert_eq!(t("[a]"), hir_uclass(&[('a', 'a')])); + assert_eq!(t("[^[a]]"), hir_negate(hir_uclass(&[('a', 'a')]))); + assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')])); + assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')])); + assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')])); + assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')])); + assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')])); + assert_eq!(t("[\n]"), hir_uclass(&[('\n', '\n')])); + assert_eq!( + t(r"[\d]"), + hir_uclass_query(ClassQuery::Binary("digit"))); + assert_eq!( + t(r"[\pZ]"), 
+ hir_uclass_query(ClassQuery::Binary("separator"))); + assert_eq!( + t(r"[\p{separator}]"), + hir_uclass_query(ClassQuery::Binary("separator"))); + assert_eq!( + t(r"[^\D]"), + hir_uclass_query(ClassQuery::Binary("digit"))); + assert_eq!( + t(r"[^\PZ]"), + hir_uclass_query(ClassQuery::Binary("separator"))); + assert_eq!( + t(r"[^\P{separator}]"), + hir_uclass_query(ClassQuery::Binary("separator"))); + assert_eq!( + t(r"(?i)[^\D]"), + hir_uclass_query(ClassQuery::Binary("digit"))); + assert_eq!( + t(r"(?i)[^\P{greek}]"), + hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek")))); + + assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')])); + assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')])); + assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')])); + + assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')])); + assert_eq!(t("(?i)[k]"), hir_uclass(&[ + ('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'), + ])); + assert_eq!(t("(?i)[β]"), hir_uclass(&[ + ('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'), + ])); + assert_eq!(t("(?i-u)[k]"), hir_bclass(&[ + (b'K', b'K'), (b'k', b'k'), + ])); + + assert_eq!(t("[^a]"), hir_negate(hir_uclass(&[('a', 'a')]))); + assert_eq!(t(r"[^\x00]"), hir_negate(hir_uclass(&[('\0', '\0')]))); + assert_eq!( + t_bytes("(?-u)[^a]"), + hir_negate(hir_bclass(&[(b'a', b'a')]))); + assert_eq!( + t(r"[^\d]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))); + assert_eq!( + t(r"[^\pZ]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))); + assert_eq!( + t(r"[^\p{separator}]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))); + assert_eq!( + t(r"(?i)[^\p{greek}]"), + hir_negate(hir_case_fold(hir_uclass_query( + ClassQuery::Binary("greek"))))); + assert_eq!( + t(r"(?i)[\P{greek}]"), + hir_negate(hir_case_fold(hir_uclass_query( + ClassQuery::Binary("greek"))))); + + // Test some weird cases. 
+ assert_eq!(t(r"[\[]"), hir_uclass(&[('[', '[')])); + + assert_eq!(t(r"[&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\&\&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\x00-&]"), hir_uclass(&[('\0', '&')])); + assert_eq!(t(r"[&-\xFF]"), hir_uclass(&[('&', '\u{FF}')])); + + assert_eq!(t(r"[~]"), hir_uclass(&[('~', '~')])); + assert_eq!(t(r"[\~]"), hir_uclass(&[('~', '~')])); + assert_eq!(t(r"[\~\~]"), hir_uclass(&[('~', '~')])); + assert_eq!(t(r"[\x00-~]"), hir_uclass(&[('\0', '~')])); + assert_eq!(t(r"[~-\xFF]"), hir_uclass(&[('~', '\u{FF}')])); + + assert_eq!(t(r"[-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\-\-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\x00-\-]"), hir_uclass(&[('\0', '-')])); + assert_eq!(t(r"[\--\xFF]"), hir_uclass(&[('-', '\u{FF}')])); + + assert_eq!(t_err("(?-u)[^a]"), TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new(Position::new(5, 1, 6), Position::new(9, 1, 10)), + }); + assert_eq!(t_err(r"[^\s\S]"), TestError { + kind: hir::ErrorKind::EmptyClassNotAllowed, + span: Span::new(Position::new(0, 1, 1), Position::new(7, 1, 8)), + }); + } + + #[test] + fn class_bracketed_union() { + assert_eq!( + t("[a-zA-Z]"), + hir_uclass(&[('A', 'Z'), ('a', 'z')])); + assert_eq!( + t(r"[a\pZb]"), + hir_union( + hir_uclass(&[('a', 'b')]), + hir_uclass_query(ClassQuery::Binary("separator")))); + assert_eq!( + t(r"[\pZ\p{Greek}]"), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")))); + assert_eq!( + t(r"[\p{age:3.0}\pZ\p{Greek}]"), + hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator"))))); + assert_eq!( + t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"), + hir_union( + 
hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("cyrillic")), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")))))); + + assert_eq!( + t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"), + hir_case_fold(hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")))))); + assert_eq!( + t(r"[^\p{age:3.0}\pZ\p{Greek}]"), + hir_negate(hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")))))); + assert_eq!( + t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"), + hir_negate(hir_case_fold(hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator"))))))); + } + + #[test] + fn class_bracketed_nested() { + assert_eq!( + t(r"[a[^c]]"), + hir_negate(hir_uclass(&[('c', 'c')]))); + assert_eq!( + t(r"[a-b[^c]]"), + hir_negate(hir_uclass(&[('c', 'c')]))); + assert_eq!( + t(r"[a-c[^c]]"), + hir_negate(hir_uclass(&[]))); + + assert_eq!( + t(r"[^a[^c]]"), + hir_uclass(&[('c', 'c')])); + assert_eq!( + t(r"[^a-b[^c]]"), + hir_uclass(&[('c', 'c')])); + + assert_eq!( + t(r"(?i)[a[^c]]"), + hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')])))); + assert_eq!( + t(r"(?i)[a-b[^c]]"), + hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')])))); + + assert_eq!( + t(r"(?i)[^a[^c]]"), + hir_uclass(&[('C', 'C'), ('c', 'c')])); + assert_eq!( + t(r"(?i)[^a-b[^c]]"), + hir_uclass(&[('C', 'C'), ('c', 'c')])); + + assert_eq!(t_err(r"[^a-c[^c]]"), TestError { + kind: 
hir::ErrorKind::EmptyClassNotAllowed, + span: Span::new(Position::new(0, 1, 1), Position::new(10, 1, 11)), + }); + assert_eq!(t_err(r"(?i)[^a-c[^c]]"), TestError { + kind: hir::ErrorKind::EmptyClassNotAllowed, + span: Span::new(Position::new(4, 1, 5), Position::new(14, 1, 15)), + }); + } + + #[test] + fn class_bracketed_intersect() { + assert_eq!(t("[abc&&b-c]"), hir_uclass(&[('b', 'c')])); + assert_eq!(t("[abc&&[b-c]]"), hir_uclass(&[('b', 'c')])); + assert_eq!(t("[[abc]&&[b-c]]"), hir_uclass(&[('b', 'c')])); + assert_eq!(t("[a-z&&b-y&&c-x]"), hir_uclass(&[('c', 'x')])); + assert_eq!(t("[c-da-b&&a-d]"), hir_uclass(&[('a', 'd')])); + assert_eq!(t("[a-d&&c-da-b]"), hir_uclass(&[('a', 'd')])); + assert_eq!(t(r"[a-z&&a-c]"), hir_uclass(&[('a', 'c')])); + assert_eq!(t(r"[[a-z&&a-c]]"), hir_uclass(&[('a', 'c')])); + assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')]))); + + assert_eq!(t("(?-u)[abc&&b-c]"), hir_bclass(&[(b'b', b'c')])); + assert_eq!(t("(?-u)[abc&&[b-c]]"), hir_bclass(&[(b'b', b'c')])); + assert_eq!(t("(?-u)[[abc]&&[b-c]]"), hir_bclass(&[(b'b', b'c')])); + assert_eq!(t("(?-u)[a-z&&b-y&&c-x]"), hir_bclass(&[(b'c', b'x')])); + assert_eq!(t("(?-u)[c-da-b&&a-d]"), hir_bclass(&[(b'a', b'd')])); + assert_eq!(t("(?-u)[a-d&&c-da-b]"), hir_bclass(&[(b'a', b'd')])); + + assert_eq!( + t("(?i)[abc&&b-c]"), + hir_case_fold(hir_uclass(&[('b', 'c')]))); + assert_eq!( + t("(?i)[abc&&[b-c]]"), + hir_case_fold(hir_uclass(&[('b', 'c')]))); + assert_eq!( + t("(?i)[[abc]&&[b-c]]"), + hir_case_fold(hir_uclass(&[('b', 'c')]))); + assert_eq!( + t("(?i)[a-z&&b-y&&c-x]"), + hir_case_fold(hir_uclass(&[('c', 'x')]))); + assert_eq!( + t("(?i)[c-da-b&&a-d]"), + hir_case_fold(hir_uclass(&[('a', 'd')]))); + assert_eq!( + t("(?i)[a-d&&c-da-b]"), + hir_case_fold(hir_uclass(&[('a', 'd')]))); + + assert_eq!( + t("(?i-u)[abc&&b-c]"), + hir_case_fold(hir_bclass(&[(b'b', b'c')]))); + assert_eq!( + t("(?i-u)[abc&&[b-c]]"), + hir_case_fold(hir_bclass(&[(b'b', b'c')]))); + 
assert_eq!( + t("(?i-u)[[abc]&&[b-c]]"), + hir_case_fold(hir_bclass(&[(b'b', b'c')]))); + assert_eq!( + t("(?i-u)[a-z&&b-y&&c-x]"), + hir_case_fold(hir_bclass(&[(b'c', b'x')]))); + assert_eq!( + t("(?i-u)[c-da-b&&a-d]"), + hir_case_fold(hir_bclass(&[(b'a', b'd')]))); + assert_eq!( + t("(?i-u)[a-d&&c-da-b]"), + hir_case_fold(hir_bclass(&[(b'a', b'd')]))); + + // In `[a^]`, `^` does not need to be escaped, so it makes sense that + // `^` is also allowed to be unescaped after `&&`. + assert_eq!(t(r"[\^&&^]"), hir_uclass(&[('^', '^')])); + // `]` needs to be escaped after `&&` since it's not at start of class. + assert_eq!(t(r"[]&&\]]"), hir_uclass(&[(']', ']')])); + assert_eq!(t(r"[-&&-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\&&&&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\&&&\&]"), hir_uclass(&[('&', '&')])); + // Test precedence. + assert_eq!( + t(r"[a-w&&[^c-g]z]"), + hir_uclass(&[('a', 'b'), ('h', 'w')])); + } + + #[test] + fn class_bracketed_intersect_negate() { + assert_eq!( + t(r"[^\w&&\d]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))); + assert_eq!( + t(r"[^[a-z&&a-c]]"), + hir_negate(hir_uclass(&[('a', 'c')]))); + assert_eq!( + t(r"[^[\w&&\d]]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))); + assert_eq!( + t(r"[^[^\w&&\d]]"), + hir_uclass_query(ClassQuery::Binary("digit"))); + assert_eq!( + t(r"[[[^\w]&&[^\d]]]"), + hir_negate(hir_uclass_perl_word())); + + assert_eq!( + t_bytes(r"(?-u)[^\w&&\d]"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Digit)))); + assert_eq!( + t_bytes(r"(?-u)[^[a-z&&a-c]]"), + hir_negate(hir_bclass(&[(b'a', b'c')]))); + assert_eq!( + t_bytes(r"(?-u)[^[\w&&\d]]"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Digit)))); + assert_eq!( + t_bytes(r"(?-u)[^[^\w&&\d]]"), + hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Digit))); + assert_eq!( + t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"), + hir_negate(hir_bclass_from_char(ascii_class( + 
&ast::ClassAsciiKind::Word)))); + } + + #[test] + fn class_bracketed_difference() { + assert_eq!( + t(r"[\pL--[:ascii:]]"), + hir_difference( + hir_uclass_query(ClassQuery::Binary("letter")), + hir_uclass(&[('\0', '\x7F')]))); + } + + #[test] + fn class_bracketed_symmetric_difference() { + assert_eq!( + t(r"[\p{sc:Greek}~~\p{scx:Greek}]"), + hir_uclass(&[ + ('\u{0342}', '\u{0342}'), + ('\u{0345}', '\u{0345}'), + ('\u{1DC0}', '\u{1DC1}'), + ])); + assert_eq!( + t(r"[a-g~~c-j]"), + hir_uclass(&[('a', 'b'), ('h', 'j')])); + } + + #[test] + fn ignore_whitespace() { + assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3")); + assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S")); + assert_eq!(t(r"(?x)\x # comment +{ # comment + 53 # comment +} #comment"), hir_lit("S")); + + assert_eq!(t(r"(?x)\x 53"), hir_lit("S")); + assert_eq!(t(r"(?x)\x # comment + 53 # comment"), hir_lit("S")); + assert_eq!(t(r"(?x)\x5 3"), hir_lit("S")); + + assert_eq!(t(r"(?x)\p # comment +{ # comment + Separator # comment +} # comment"), hir_uclass_query(ClassQuery::Binary("separator"))); + + assert_eq!(t(r"(?x)a # comment +{ # comment + 5 # comment + , # comment + 10 # comment +} # comment"), + hir_range( + true, hir::RepetitionRange::Bounded(5, 10), hir_lit("a"))); + + assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a ")); + } + + #[test] + fn analysis_is_always_utf8() { + // Positive examples. + assert!(t_bytes(r"a").is_always_utf8()); + assert!(t_bytes(r"ab").is_always_utf8()); + assert!(t_bytes(r"(?-u)a").is_always_utf8()); + assert!(t_bytes(r"(?-u)ab").is_always_utf8()); + assert!(t_bytes(r"\xFF").is_always_utf8()); + assert!(t_bytes(r"\xFF\xFF").is_always_utf8()); + assert!(t_bytes(r"[^a]").is_always_utf8()); + assert!(t_bytes(r"[^a][^a]").is_always_utf8()); + assert!(t_bytes(r"\b").is_always_utf8()); + assert!(t_bytes(r"\B").is_always_utf8()); + + // Negative examples. 
+ assert!(!t_bytes(r"(?-u)\xFF").is_always_utf8()); + assert!(!t_bytes(r"(?-u)\xFF\xFF").is_always_utf8()); + assert!(!t_bytes(r"(?-u)[^a]").is_always_utf8()); + assert!(!t_bytes(r"(?-u)[^a][^a]").is_always_utf8()); + assert!(!t_bytes(r"(?-u)\b").is_always_utf8()); + assert!(!t_bytes(r"(?-u)\B").is_always_utf8()); + } + + #[test] + fn analysis_is_all_assertions() { + // Positive examples. + assert!(t(r"\b").is_all_assertions()); + assert!(t(r"\B").is_all_assertions()); + assert!(t(r"^").is_all_assertions()); + assert!(t(r"$").is_all_assertions()); + assert!(t(r"\A").is_all_assertions()); + assert!(t(r"\z").is_all_assertions()); + assert!(t(r"$^\z\A\b\B").is_all_assertions()); + assert!(t(r"$|^|\z|\A|\b|\B").is_all_assertions()); + assert!(t(r"^$|$^").is_all_assertions()); + assert!(t(r"((\b)+())*^").is_all_assertions()); + + // Negative examples. + assert!(!t(r"^a").is_all_assertions()); + } + + #[test] + fn analysis_is_anchored() { + // Positive examples. + assert!(t(r"^").is_anchored_start()); + assert!(t(r"$").is_anchored_end()); + + assert!(t(r"^^").is_anchored_start()); + assert!(t(r"$$").is_anchored_end()); + + assert!(t(r"^$").is_anchored_start()); + assert!(t(r"^$").is_anchored_end()); + + assert!(t(r"^foo").is_anchored_start()); + assert!(t(r"foo$").is_anchored_end()); + + assert!(t(r"^foo|^bar").is_anchored_start()); + assert!(t(r"foo$|bar$").is_anchored_end()); + + assert!(t(r"^(foo|bar)").is_anchored_start()); + assert!(t(r"(foo|bar)$").is_anchored_end()); + + assert!(t(r"^+").is_anchored_start()); + assert!(t(r"$+").is_anchored_end()); + assert!(t(r"^++").is_anchored_start()); + assert!(t(r"$++").is_anchored_end()); + assert!(t(r"(^)+").is_anchored_start()); + assert!(t(r"($)+").is_anchored_end()); + + assert!(t(r"$^").is_anchored_start()); + assert!(t(r"$^").is_anchored_end()); + assert!(t(r"$^|^$").is_anchored_start()); + assert!(t(r"$^|^$").is_anchored_end()); + + assert!(t(r"\b^").is_anchored_start()); + assert!(t(r"$\b").is_anchored_end()); + 
assert!(t(r"^(?m:^)").is_anchored_start()); + assert!(t(r"(?m:$)$").is_anchored_end()); + assert!(t(r"(?m:^)^").is_anchored_start()); + assert!(t(r"$(?m:$)").is_anchored_end()); + + // Negative examples. + assert!(!t(r"(?m)^").is_anchored_start()); + assert!(!t(r"(?m)$").is_anchored_end()); + assert!(!t(r"(?m:^$)|$^").is_anchored_start()); + assert!(!t(r"(?m:^$)|$^").is_anchored_end()); + assert!(!t(r"$^|(?m:^$)").is_anchored_start()); + assert!(!t(r"$^|(?m:^$)").is_anchored_end()); + + assert!(!t(r"a^").is_anchored_start()); + assert!(!t(r"$a").is_anchored_start()); + + assert!(!t(r"a^").is_anchored_start()); + assert!(!t(r"$a").is_anchored_start()); + + assert!(!t(r"^foo|bar").is_anchored_start()); + assert!(!t(r"foo|bar$").is_anchored_end()); + + assert!(!t(r"^*").is_anchored_start()); + assert!(!t(r"$*").is_anchored_end()); + assert!(!t(r"^*+").is_anchored_start()); + assert!(!t(r"$*+").is_anchored_end()); + assert!(!t(r"^+*").is_anchored_start()); + assert!(!t(r"$+*").is_anchored_end()); + assert!(!t(r"(^)*").is_anchored_start()); + assert!(!t(r"($)*").is_anchored_end()); + } + + #[test] + fn analysis_is_any_anchored() { + // Positive examples. + assert!(t(r"^").is_any_anchored_start()); + assert!(t(r"$").is_any_anchored_end()); + assert!(t(r"\A").is_any_anchored_start()); + assert!(t(r"\z").is_any_anchored_end()); + + // Negative examples. + assert!(!t(r"(?m)^").is_any_anchored_start()); + assert!(!t(r"(?m)$").is_any_anchored_end()); + assert!(!t(r"$").is_any_anchored_start()); + assert!(!t(r"^").is_any_anchored_end()); + } + + #[test] + fn analysis_is_match_empty() { + // Positive examples. 
+ assert!(t(r"").is_match_empty()); + assert!(t(r"()").is_match_empty()); + assert!(t(r"()*").is_match_empty()); + assert!(t(r"()+").is_match_empty()); + assert!(t(r"()?").is_match_empty()); + assert!(t(r"a*").is_match_empty()); + assert!(t(r"a?").is_match_empty()); + assert!(t(r"a{0}").is_match_empty()); + assert!(t(r"a{0,}").is_match_empty()); + assert!(t(r"a{0,1}").is_match_empty()); + assert!(t(r"a{0,10}").is_match_empty()); + assert!(t(r"\pL*").is_match_empty()); + assert!(t(r"a*|b").is_match_empty()); + assert!(t(r"b|a*").is_match_empty()); + assert!(t(r"a*a?(abcd)*").is_match_empty()); + assert!(t(r"^").is_match_empty()); + assert!(t(r"$").is_match_empty()); + assert!(t(r"(?m)^").is_match_empty()); + assert!(t(r"(?m)$").is_match_empty()); + assert!(t(r"\A").is_match_empty()); + assert!(t(r"\z").is_match_empty()); + assert!(t(r"\B").is_match_empty()); + assert!(t(r"(?-u)\B").is_match_empty()); + + // Negative examples. + assert!(!t(r"a+").is_match_empty()); + assert!(!t(r"a{1}").is_match_empty()); + assert!(!t(r"a{1,}").is_match_empty()); + assert!(!t(r"a{1,2}").is_match_empty()); + assert!(!t(r"a{1,10}").is_match_empty()); + assert!(!t(r"b|a").is_match_empty()); + assert!(!t(r"a*a+(abcd)*").is_match_empty()); + assert!(!t(r"\b").is_match_empty()); + assert!(!t(r"(?-u)\b").is_match_empty()); + } +} diff --git a/regex-syntax-2/src/hir/visitor.rs b/regex-syntax-2/src/hir/visitor.rs new file mode 100644 index 0000000000..716a96d9b4 --- /dev/null +++ b/regex-syntax-2/src/hir/visitor.rs @@ -0,0 +1,222 @@ +// Copyright 2018 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use hir::{self, Hir, HirKind}; + +/// A trait for visiting the high-level IR (HIR) in depth first order. 
+/// +/// The principal aim of this trait is to enable callers to perform case +/// analysis on a high-level intermediate representation of a regular +/// expression without necessarily using recursion. In particular, this permits +/// callers to do case analysis with constant stack usage, which can be +/// important since the size of an HIR may be proportional to end user input. +/// +/// Typical usage of this trait involves providing an implementation and then +/// running it using the [`visit`](fn.visit.html) function. +pub trait Visitor { + /// The result of visiting an HIR. + type Output; + /// An error that visiting an HIR might return. + type Err; + + /// All implementors of `Visitor` must provide a `finish` method, which + /// yields the result of visiting the HIR or an error. + fn finish(self) -> Result; + + /// This method is called before beginning traversal of the HIR. + fn start(&mut self) {} + + /// This method is called on an `Hir` before descending into child `Hir` + /// nodes. + fn visit_pre(&mut self, _hir: &Hir) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on an `Hir` after descending all of its child + /// `Hir` nodes. + fn visit_post(&mut self, _hir: &Hir) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called between child nodes of an alternation. + fn visit_alternation_in(&mut self) -> Result<(), Self::Err> { + Ok(()) + } +} + +/// Executes an implementation of `Visitor` in constant stack space. +/// +/// This function will visit every node in the given `Hir` while calling +/// appropriate methods provided by the +/// [`Visitor`](trait.Visitor.html) trait. +/// +/// The primary use case for this method is when one wants to perform case +/// analysis over an `Hir` without using a stack size proportional to the depth +/// of the `Hir`. Namely, this method will instead use constant stack space, +/// but will use heap space proportional to the size of the `Hir`.
This may be +/// desirable in cases where the size of `Hir` is proportional to end user +/// input. +/// +/// If the visitor returns an error at any point, then visiting is stopped and +/// the error is returned. +pub fn visit(hir: &Hir, visitor: V) -> Result { + HeapVisitor::new().visit(hir, visitor) +} + +/// HeapVisitor visits every item in an `Hir` recursively using constant stack +/// size and a heap size proportional to the size of the `Hir`. +struct HeapVisitor<'a> { + /// A stack of `Hir` nodes. This is roughly analogous to the call stack + /// used in a typical recursive visitor. + stack: Vec<(&'a Hir, Frame<'a>)>, +} + +/// Represents a single stack frame while performing structural induction over +/// an `Hir`. +enum Frame<'a> { + /// A stack frame allocated just before descending into a repetition + /// operator's child node. + Repetition(&'a hir::Repetition), + /// A stack frame allocated just before descending into a group's child + /// node. + Group(&'a hir::Group), + /// The stack frame used while visiting every child node of a concatenation + /// of expressions. + Concat { + /// The child node we are currently visiting. + head: &'a Hir, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [Hir], + }, + /// The stack frame used while visiting every child node of an alternation + /// of expressions. + Alternation { + /// The child node we are currently visiting. + head: &'a Hir, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [Hir], + }, +} + +impl<'a> HeapVisitor<'a> { + fn new() -> HeapVisitor<'a> { + HeapVisitor { stack: vec![] } + } + + fn visit( + &mut self, + mut hir: &'a Hir, + mut visitor: V, + ) -> Result { + self.stack.clear(); + + visitor.start(); + loop { + try!(visitor.visit_pre(hir)); + if let Some(x) = self.induct(hir) { + let child = x.child(); + self.stack.push((hir, x)); + hir = child; + continue; + } + // No induction means we have a base case, so we can post visit + // it now. 
+ try!(visitor.visit_post(hir)); + + // At this point, we now try to pop our call stack until it is + // either empty or we hit another inductive case. + loop { + let (post_hir, frame) = match self.stack.pop() { + None => return visitor.finish(), + Some((post_hir, frame)) => (post_hir, frame), + }; + // If this is a concat/alternate, then we might have additional + // inductive steps to process. + if let Some(x) = self.pop(frame) { + if let Frame::Alternation {..} = x { + try!(visitor.visit_alternation_in()); + } + hir = x.child(); + self.stack.push((post_hir, x)); + break; + } + // Otherwise, we've finished visiting all the child nodes for + // this HIR, so we can post visit it now. + try!(visitor.visit_post(post_hir)); + } + } + } + + /// Build a stack frame for the given HIR if one is needed (which occurs if + /// and only if there are child nodes in the HIR). Otherwise, return None. + fn induct(&mut self, hir: &'a Hir) -> Option> { + match *hir.kind() { + HirKind::Repetition(ref x) => Some(Frame::Repetition(x)), + HirKind::Group(ref x) => Some(Frame::Group(x)), + HirKind::Concat(ref x) if x.is_empty() => None, + HirKind::Concat(ref x) => { + Some(Frame::Concat { + head: &x[0], + tail: &x[1..], + }) + } + HirKind::Alternation(ref x) if x.is_empty() => None, + HirKind::Alternation(ref x) => { + Some(Frame::Alternation { + head: &x[0], + tail: &x[1..], + }) + } + _ => None, + } + } + + /// Pops the given frame. If the frame has an additional inductive step, + /// then return it, otherwise return `None`. + fn pop(&self, induct: Frame<'a>) -> Option> { + match induct { + Frame::Repetition(_) => None, + Frame::Group(_) => None, + Frame::Concat { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(Frame::Concat { + head: &tail[0], + tail: &tail[1..], + }) + } + } + Frame::Alternation { tail, .. 
} => { + if tail.is_empty() { + None + } else { + Some(Frame::Alternation { + head: &tail[0], + tail: &tail[1..], + }) + } + } + } + } +} + +impl<'a> Frame<'a> { + /// Perform the next inductive step on this frame and return the next + /// child HIR node to visit. + fn child(&self) -> &'a Hir { + match *self { + Frame::Repetition(rep) => &rep.hir, + Frame::Group(group) => &group.hir, + Frame::Concat { head, .. } => head, + Frame::Alternation { head, .. } => head, + } + } +} diff --git a/regex-syntax-2/src/lib.rs b/regex-syntax-2/src/lib.rs new file mode 100644 index 0000000000..9469fe23b8 --- /dev/null +++ b/regex-syntax-2/src/lib.rs @@ -0,0 +1,222 @@ +// Copyright 2018 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +/*! +This crate provides a robust regular expression parser. + +This crate defines two primary types: + +* [`Ast`](ast/enum.Ast.html) is the abstract syntax of a regular expression. + An abstract syntax corresponds to a *structured representation* of the + concrete syntax of a regular expression, where the concrete syntax is the + pattern string itself (e.g., `foo(bar)+`). Given some abstract syntax, it + can be converted back to the original concrete syntax (modulo some details, + like whitespace). To a first approximation, the abstract syntax is complex + and difficult to analyze. +* [`Hir`](hir/struct.Hir.html) is the high-level intermediate representation + ("HIR" or "high-level IR" for short) of a regular expression. It corresponds to + an intermediate state of a regular expression that sits between the abstract + syntax and the low level compiled opcodes that are eventually responsible for + executing a regular expression search.
Given some high-level IR, it is not + possible to produce the original concrete syntax (although it is possible to + produce an equivalent concrete syntax, but it will likely scarcely resemble + the original pattern). To a first approximation, the high-level IR is simple + and easy to analyze. + +These two types come with conversion routines: + +* An [`ast::parse::Parser`](ast/parse/struct.Parser.html) converts concrete + syntax (a `&str`) to an [`Ast`](ast/enum.Ast.html). +* A [`hir::translate::Translator`](hir/translate/struct.Translator.html) + converts an [`Ast`](ast/enum.Ast.html) to a [`Hir`](hir/struct.Hir.html). + +As a convenience, the above two conversion routines are combined into one via +the top-level [`Parser`](struct.Parser.html) type. This `Parser` will first +convert your pattern to an `Ast` and then convert the `Ast` to an `Hir`. + + +# Example + +This example shows how to parse a pattern string into its HIR: + +``` +use regex_syntax2::Parser; +use regex_syntax2::hir::{self, Hir}; + +let hir = Parser::new().parse("a|b").unwrap(); +assert_eq!(hir, Hir::alternation(vec![ + Hir::literal(hir::Literal::Unicode('a')), + Hir::literal(hir::Literal::Unicode('b')), +])); +``` + + +# Concrete syntax supported + +The concrete syntax is documented as part of the public API of the +[`regex` crate](https://docs.rs/regex/%2A/regex/#syntax). + + +# Input safety + +A key feature of this library is that it is safe to use with end user facing +input. This plays a significant role in the internal implementation. In +particular: + +1. Parsers provide a `nest_limit` option that permits callers to control how + deeply nested a regular expression is allowed to be. This makes it possible + to do case analysis over an `Ast` or an `Hir` using recursion without + worrying about stack overflow. +2. Since relying on a particular stack size is brittle, this crate goes to + great lengths to ensure that all interactions with both the `Ast` and the + `Hir` do not use recursion. 
Namely, they use constant stack space and heap + space proportional to the size of the original pattern string (in bytes). + This includes the type's corresponding destructors. (One exception to this + is literal extraction, but this will eventually get fixed.) + + +# Error reporting + +The `Display` implementations on all `Error` types exposed in this library +provide nice human readable errors that are suitable for showing to end users +in a monospace font. + + +# Literal extraction + +This crate provides limited support for +[literal extraction from `Hir` values](hir/literal/struct.Literals.html). +Be warned that literal extraction currently uses recursion, and therefore, +stack size proportional to the size of the `Hir`. + +The purpose of literal extraction is to speed up searches. That is, if you +know a regular expression must match a prefix or suffix literal, then it is +often quicker to search for instances of that literal, and then confirm or deny +the match using the full regular expression engine. These optimizations are +done automatically in the `regex` crate. +*/ + +#![deny(missing_docs)] + +extern crate ucd_util; + +pub use error::{Error, Result}; +pub use parser::{Parser, ParserBuilder}; + +pub mod ast; +mod either; +mod error; +pub mod hir; +mod parser; +mod unicode; +mod unicode_tables; + +/// Escapes all regular expression meta characters in `text`. +/// +/// The string returned may be safely used as a literal in a regular +/// expression. +pub fn escape(text: &str) -> String { + let mut quoted = String::with_capacity(text.len()); + escape_into(text, &mut quoted); + quoted +} + +/// Escapes all meta characters in `text` and writes the result into `buf`. +/// +/// This will append escape characters into the given buffer. The characters +/// that are appended are safe to use as a literal in a regular expression. 
+pub fn escape_into(text: &str, buf: &mut String) { + for c in text.chars() { + if is_meta_character(c) { + buf.push('\\'); + } + buf.push(c); + } +} + +/// Returns true if the given character has significance in a regex. +/// +/// These are the only characters that are allowed to be escaped, with one +/// exception: an ASCII space character may be escaped when extended mode (with +/// the `x` flag) is enabled. In particular, `is_meta_character(' ')` returns +/// `false`. +/// +/// Note that the set of characters for which this function returns `true` or +/// `false` is fixed and won't change in a semver compatible release. +pub fn is_meta_character(c: char) -> bool { + match c { + '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | + '[' | ']' | '{' | '}' | '^' | '$' | '#' | '&' | '-' | '~' => true, + _ => false, + } +} + +/// Returns true if and only if the given character is a Unicode word +/// character. +/// +/// A Unicode word character is defined by +/// [UTS#18 Annex C](http://unicode.org/reports/tr18/#Compatibility_Properties). +/// In particular, a character +/// is considered a word character if it is in either of the `Alphabetic` or +/// `Join_Control` properties, or is in one of the `Decimal_Number`, `Mark` +/// or `Connector_Punctuation` general categories. +pub fn is_word_character(c: char) -> bool { + use std::cmp::Ordering; + use unicode_tables::perl_word::PERL_WORD; + + if c <= 0x7F as char && is_word_byte(c as u8) { + return true; + } + PERL_WORD + .binary_search_by(|&(start, end)| { + if start <= c && c <= end { + Ordering::Equal + } else if start > c { + Ordering::Greater + } else { + Ordering::Less + } + }).is_ok() +} + +/// Returns true if and only if the given character is an ASCII word character. +/// +/// An ASCII word character is defined by the following character class: +/// `[_0-9a-zA-Z]`. +pub fn is_word_byte(c: u8) -> bool { + match c { + b'_' | b'0' ... b'9' | b'a' ... b'z' | b'A' ... 
b'Z' => true, + _ => false, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn escape_meta() { + assert_eq!( + escape(r"\.+*?()|[]{}^$#&-~"), + r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~".to_string(), + ); + } + + #[test] + fn word() { + assert!(is_word_byte(b'a')); + assert!(!is_word_byte(b'-')); + + assert!(is_word_character('a')); + assert!(is_word_character('β')); + assert!(!is_word_character('-')); + assert!(!is_word_character('☃')); + } +} diff --git a/regex-syntax-2/src/parser.rs b/regex-syntax-2/src/parser.rs new file mode 100644 index 0000000000..2afd2fe234 --- /dev/null +++ b/regex-syntax-2/src/parser.rs @@ -0,0 +1,201 @@ +use ast; +use hir; + +use Result; + +/// A builder for a regular expression parser. +/// +/// This builder permits modifying configuration options for the parser. +/// +/// This type combines the builder options for both the +/// [AST `ParserBuilder`](ast/parse/struct.ParserBuilder.html) +/// and the +/// [HIR `TranslatorBuilder`](hir/translate/struct.TranslatorBuilder.html). +#[derive(Clone, Debug, Default)] +pub struct ParserBuilder { + ast: ast::parse::ParserBuilder, + hir: hir::translate::TranslatorBuilder, +} + +impl ParserBuilder { + /// Create a new parser builder with a default configuration. + pub fn new() -> ParserBuilder { + ParserBuilder::default() + } + + /// Build a parser from this configuration. + pub fn build(&self) -> Parser { + Parser { + ast: self.ast.build(), + hir: self.hir.build(), + } + } + + /// Set the nesting limit for this parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow for consumers that do structural induction on an `Ast` using + /// explicit recursion. 
While this crate never does this (instead using + /// constant stack space and moving the call stack to the heap), other + /// crates may. + /// + /// This limit is not checked until the entire Ast is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since this parser + /// implementation will limit itself to heap space proportional to the + /// length of the pattern string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation, which results in a nest + /// depth of `1`. In general, a nest limit is not something that manifests + /// in an obvious way in the concrete syntax, therefore, it should not be + /// used in a granular way. + pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { + self.ast.nest_limit(limit); + self + } + + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. For example, `a`, `\x61`, `\u0061` and + /// `\141` are all equivalent regular expressions, where the last example + /// shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, it does + /// make good error messages harder. That is, in PCRE based regex engines, + /// syntax like `\0` invokes a backreference, which is explicitly + /// unsupported in Rust's regex engine. However, many users expect it to + /// be supported. Therefore, when octal support is disabled, the error + /// message will explicitly mention that backreferences aren't supported. + /// + /// Octal syntax is disabled by default. 
+ pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder { + self.ast.octal(yes); + self + } + + /// When enabled, the parser will permit the construction of a regular + /// expression that may match invalid UTF-8. + /// + /// When disabled (the default), the parser is guaranteed to produce + /// an expression that will only ever match valid UTF-8 (otherwise, the + /// parser will return an error). + pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.allow_invalid_utf8(yes); + self + } + + /// Enable verbose mode in the regular expression. + /// + /// When enabled, verbose mode permits insignificant whitespace in many + /// places in the regular expression, as well as comments. Comments are + /// started using `#` and continue until the end of the line. + /// + /// By default, this is disabled. It may be selectively enabled in the + /// regular expression by using the `x` flag regardless of this setting. + pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder { + self.ast.ignore_whitespace(yes); + self + } + + /// Enable or disable the case insensitive flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `i` flag. + pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.case_insensitive(yes); + self + } + + /// Enable or disable the multi-line matching flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `m` flag. + pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.multi_line(yes); + self + } + + /// Enable or disable the "dot matches any character" flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `s` flag. 
+ pub fn dot_matches_new_line( + &mut self, + yes: bool, + ) -> &mut ParserBuilder { + self.hir.dot_matches_new_line(yes); + self + } + + /// Enable or disable the "swap greed" flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `U` flag. + pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.swap_greed(yes); + self + } + + /// Enable or disable the Unicode flag (`u`) by default. + /// + /// By default this is **enabled**. It may alternatively be selectively + /// disabled in the regular expression itself via the `u` flag. + /// + /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by + /// default), a regular expression will fail to parse if Unicode mode is + /// disabled and a sub-expression could possibly match invalid UTF-8. + pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.unicode(yes); + self + } +} + +/// A convenience parser for regular expressions. +/// +/// This parser takes as input a regular expression pattern string (the +/// "concrete syntax") and returns a high-level intermediate representation +/// (the HIR) suitable for most types of analysis. In particular, this parser +/// hides the intermediate state of producing an AST (the "abstract syntax"). +/// The AST is itself far more complex than the HIR, so this parser serves as a +/// convenience for never having to deal with it at all. +/// +/// If callers have more fine grained use cases that need an AST, then please +/// see the [`ast::parse`](ast/parse/index.html) module. +/// +/// A `Parser` can be configured in more detail via a +/// [`ParserBuilder`](struct.ParserBuilder.html). +#[derive(Clone, Debug)] +pub struct Parser { + ast: ast::parse::Parser, + hir: hir::translate::Translator, +} + +impl Parser { + /// Create a new parser with a default configuration. + /// + /// The parser can be run with the `parse` method. 
The parse method returns + /// a high level intermediate representation of the given regular + /// expression. + /// + /// To set configuration options on the parser, use + /// [`ParserBuilder`](struct.ParserBuilder.html). + pub fn new() -> Parser { + ParserBuilder::new().build() + } + + /// Parse the regular expression into a high level intermediate + /// representation. + pub fn parse(&mut self, pattern: &str) -> Result { + let ast = try!(self.ast.parse(pattern)); + let hir = try!(self.hir.translate(pattern, &ast)); + Ok(hir) + } +} diff --git a/regex-syntax-2/src/unicode.rs b/regex-syntax-2/src/unicode.rs new file mode 100644 index 0000000000..23b56245e6 --- /dev/null +++ b/regex-syntax-2/src/unicode.rs @@ -0,0 +1,412 @@ +use std::cmp::Ordering; +use std::result; + +use ucd_util::{self, PropertyValues}; + +use hir; +use unicode_tables::age; +use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; +use unicode_tables::general_category; +use unicode_tables::property_bool; +use unicode_tables::property_names::PROPERTY_NAMES; +use unicode_tables::property_values::PROPERTY_VALUES; +use unicode_tables::script; +use unicode_tables::script_extension; + +type Result = result::Result; + +/// An error that occurs when dealing with Unicode. +/// +/// We don't impl the Error trait here because these always get converted +/// into other public errors. (This error type isn't exported.) +#[derive(Debug)] +pub enum Error { + PropertyNotFound, + PropertyValueNotFound, +} + +/// Encode the given Unicode character to `dst` as a single UTF-8 sequence. +/// +/// If `dst` is not long enough, then `None` is returned. Otherwise, the number +/// of bytes written is returned. +pub fn encode_utf8(character: char, dst: &mut [u8]) -> Option { + // TODO: Remove this function once we move to at least Rust 1.15, which + // provides char::encode_utf8 for us. 
+ const TAG_CONT: u8 = 0b1000_0000; + const TAG_TWO: u8 = 0b1100_0000; + const TAG_THREE: u8 = 0b1110_0000; + const TAG_FOUR: u8 = 0b1111_0000; + + let code = character as u32; + if code <= 0x7F && !dst.is_empty() { + dst[0] = code as u8; + Some(1) + } else if code <= 0x7FF && dst.len() >= 2 { + dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO; + dst[1] = (code & 0x3F) as u8 | TAG_CONT; + Some(2) + } else if code <= 0xFFFF && dst.len() >= 3 { + dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE; + dst[1] = (code >> 6 & 0x3F) as u8 | TAG_CONT; + dst[2] = (code & 0x3F) as u8 | TAG_CONT; + Some(3) + } else if dst.len() >= 4 { + dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR; + dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT; + dst[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT; + dst[3] = (code & 0x3F) as u8 | TAG_CONT; + Some(4) + } else { + None + } +} + +/// An iterator over a codepoint's simple case equivalence class. +#[derive(Debug)] +pub struct SimpleFoldIter(::std::slice::Iter<'static, char>); + +impl Iterator for SimpleFoldIter { + type Item = char; + + fn next(&mut self) -> Option { + self.0.next().map(|c| *c) + } +} + +/// Return an iterator over the equivalence class of simple case mappings +/// for the given codepoint. The equivalence class does not include the +/// given codepoint. +/// +/// If the equivalence class is empty, then this returns the next scalar +/// value that has a non-empty equivalence class, if it exists. If no such +/// scalar value exists, then `None` is returned. The point of this behavior +/// is to permit callers to avoid calling `simple_fold` more than they need +/// to, since there is some cost to fetching the equivalence class. 
+pub fn simple_fold(c: char) -> result::Result> { + CASE_FOLDING_SIMPLE + .binary_search_by_key(&c, |&(c1, _)| c1) + .map(|i| SimpleFoldIter(CASE_FOLDING_SIMPLE[i].1.iter())) + .map_err(|i| { + if i >= CASE_FOLDING_SIMPLE.len() { + None + } else { + Some(CASE_FOLDING_SIMPLE[i].0) + } + }) +} + +/// Returns true if and only if the given (inclusive) range contains at least +/// one Unicode scalar value that has a non-empty non-trivial simple case +/// mapping. +/// +/// This function panics if `end < start`. +pub fn contains_simple_case_mapping(start: char, end: char) -> bool { + assert!(start <= end); + CASE_FOLDING_SIMPLE + .binary_search_by(|&(c, _)| { + if start <= c && c <= end { + Ordering::Equal + } else if c > end { + Ordering::Greater + } else { + Ordering::Less + } + }).is_ok() +} + +/// A query for finding a character class defined by Unicode. This supports +/// either use of a property name directly, or lookup by property value. The +/// former generally refers to Binary properties (see UTS#44, Table 8), but +/// as a special exception (see UTS#18, Section 1.2) both general categories +/// (an enumeration) and scripts (a catalog) are supported as if each of their +/// possible values were a binary property. +/// +/// In all circumstances, property names and values are normalized and +/// canonicalized. That is, `GC == gc == GeneralCategory == general_category`. +/// +/// The lifetime `'a` refers to the shorter of the lifetimes of property name +/// and property value. +#[derive(Debug)] +pub enum ClassQuery<'a> { + /// Return a class corresponding to a Unicode binary property, named by + /// a single letter. + OneLetter(char), + /// Return a class corresponding to a Unicode binary property. + /// + /// Note that, by special exception (see UTS#18, Section 1.2), both + /// general category values and script values are permitted here as if + /// they were a binary property. 
+ Binary(&'a str), + /// Return a class corresponding to all codepoints whose property + /// (identified by `property_name`) corresponds to the given value + /// (identified by `property_value`). + ByValue { + /// A property name. + property_name: &'a str, + /// A property value. + property_value: &'a str, + }, +} + +impl<'a> ClassQuery<'a> { + fn canonicalize(&self) -> Result { + match *self { + ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()), + ClassQuery::Binary(name) => self.canonical_binary(name), + ClassQuery::ByValue { property_name, property_value } => { + let canon_name = match canonical_prop(property_name) { + None => return Err(Error::PropertyNotFound), + Some(canon_name) => canon_name, + }; + let vals = match property_values(canon_name) { + None => return Err(Error::PropertyValueNotFound), + Some(vals) => vals, + }; + let canon_val = match canonical_value(vals, property_value) { + None => return Err(Error::PropertyValueNotFound), + Some(canon_val) => canon_val, + }; + Ok(match canon_name { + "General_Category" => { + CanonicalClassQuery::GeneralCategory(canon_val) + } + "Script" => { + CanonicalClassQuery::Script(canon_val) + } + _ => { + CanonicalClassQuery::ByValue { + property_name: canon_name, + property_value: canon_val, + } + } + }) + } + } + } + + fn canonical_binary(&self, name: &str) -> Result { + if let Some(canon) = canonical_prop(name) { + return Ok(CanonicalClassQuery::Binary(canon)); + } + let gencats = property_values("General_Category").unwrap(); + if let Some(canon) = canonical_value(gencats, name) { + return Ok(CanonicalClassQuery::GeneralCategory(canon)); + } + let scripts = property_values("Script").unwrap(); + if let Some(canon) = canonical_value(scripts, name) { + return Ok(CanonicalClassQuery::Script(canon)); + } + Err(Error::PropertyNotFound) + } +} + +/// Like ClassQuery, but its parameters have been canonicalized. 
This also +/// differentiates binary properties from flattened general categories and +/// scripts. +#[derive(Debug)] +enum CanonicalClassQuery { + /// The canonical binary property name. + Binary(&'static str), + /// The canonical general category name. + GeneralCategory(&'static str), + /// The canonical script name. + Script(&'static str), + /// An arbitrary association between property and value, both of which + /// have been canonicalized. + /// + /// Note that by construction, the property name of ByValue will never + /// be General_Category or Script. Those two cases are subsumed by the + /// eponymous variants. + ByValue { + /// The canonical property name. + property_name: &'static str, + /// The canonical property value. + property_value: &'static str, + }, +} + +/// Looks up a Unicode class given a query. If one doesn't exist, then +/// an error is returned. +pub fn class<'a>(query: ClassQuery<'a>) -> Result { + use self::CanonicalClassQuery::*; + + match try!(query.canonicalize()) { + Binary(name) => { + property_set(property_bool::BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyNotFound) + } + GeneralCategory(name) => { + property_set(general_category::BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + Script(name) => { + property_set(script::BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + ByValue { property_name: "Age", property_value } => { + let mut class = hir::ClassUnicode::empty(); + for set in try!(ages(property_value)) { + class.union(&hir_class(set)); + } + Ok(class) + } + ByValue { property_name: "Script_Extensions", property_value } => { + property_set(script_extension::BY_NAME, property_value) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + _ => { + // What else should we support? + Err(Error::PropertyNotFound) + } + } +} + +/// Build a Unicode HIR class from a sequence of Unicode scalar value ranges. 
+pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { + let hir_ranges: Vec = ranges + .iter() + .map(|&(s, e)| hir::ClassRangeUnicode::new(s, e)) + .collect(); + hir::ClassUnicode::new(hir_ranges) +} + +fn canonical_prop(name: &str) -> Option<&'static str> { + let mut name = name.to_string(); + ucd_util::symbolic_name_normalize(&mut name); + ucd_util::canonical_property_name(PROPERTY_NAMES, &name) +} + +fn canonical_value(vals: PropertyValues, value: &str) -> Option<&'static str> { + let mut value = value.to_string(); + ucd_util::symbolic_name_normalize(&mut value); + ucd_util::canonical_property_value(vals, &value) +} + +fn property_values( + canonical_property_name: &'static str, +) -> Option +{ + ucd_util::property_values(PROPERTY_VALUES, canonical_property_name) +} + +fn property_set( + name_map: &'static [(&'static str, &'static [(char, char)])], + canonical: &'static str, +) -> Option<&'static [(char, char)]> { + name_map + .binary_search_by_key(&canonical, |x| x.0) + .ok() + .map(|i| name_map[i].1) +} + +/// An iterator over Unicode Age sets. Each item corresponds to a set of +/// codepoints that were added in a particular revision of Unicode. The +/// iterator yields items in chronological order. 
+#[derive(Debug)] +struct AgeIter { + ages: &'static [(&'static str, &'static [(char, char)])], +} + +fn ages(canonical_age: &str) -> Result { + const AGES: &'static [(&'static str, &'static [(char, char)])] = &[ + ("V1_1", age::V1_1), + ("V2_0", age::V2_0), + ("V2_1", age::V2_1), + ("V3_0", age::V3_0), + ("V3_1", age::V3_1), + ("V3_2", age::V3_2), + ("V4_0", age::V4_0), + ("V4_1", age::V4_1), + ("V5_0", age::V5_0), + ("V5_1", age::V5_1), + ("V5_2", age::V5_2), + ("V6_0", age::V6_0), + ("V6_1", age::V6_1), + ("V6_2", age::V6_2), + ("V6_3", age::V6_3), + ("V7_0", age::V7_0), + ("V8_0", age::V8_0), + ("V9_0", age::V9_0), + ("V10_0", age::V10_0), + ]; + assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync"); + + let pos = AGES.iter().position(|&(age, _)| canonical_age == age); + match pos { + None => Err(Error::PropertyValueNotFound), + Some(i) => Ok(AgeIter { ages: &AGES[..i+1] }), + } +} + +impl Iterator for AgeIter { + type Item = &'static [(char, char)]; + + fn next(&mut self) -> Option<&'static [(char, char)]> { + if self.ages.is_empty() { + None + } else { + let set = self.ages[0]; + self.ages = &self.ages[1..]; + Some(set.1) + } + } +} + +#[cfg(test)] +mod tests { + use super::{contains_simple_case_mapping, simple_fold}; + + #[test] + fn simple_fold_k() { + let xs: Vec = simple_fold('k').unwrap().collect(); + assert_eq!(xs, vec!['K', 'K']); + + let xs: Vec = simple_fold('K').unwrap().collect(); + assert_eq!(xs, vec!['k', 'K']); + + let xs: Vec = simple_fold('K').unwrap().collect(); + assert_eq!(xs, vec!['K', 'k']); + } + + #[test] + fn simple_fold_a() { + let xs: Vec = simple_fold('a').unwrap().collect(); + assert_eq!(xs, vec!['A']); + + let xs: Vec = simple_fold('A').unwrap().collect(); + assert_eq!(xs, vec!['a']); + } + + #[test] + fn simple_fold_empty() { + assert_eq!(Some('A'), simple_fold('?').unwrap_err()); + assert_eq!(Some('A'), simple_fold('@').unwrap_err()); + assert_eq!(Some('a'), simple_fold('[').unwrap_err()); + assert_eq!(Some('Ⰰ'), 
simple_fold('☃').unwrap_err()); + } + + #[test] + fn simple_fold_max() { + assert_eq!(None, simple_fold('\u{10FFFE}').unwrap_err()); + assert_eq!(None, simple_fold('\u{10FFFF}').unwrap_err()); + } + + #[test] + fn range_contains() { + assert!(contains_simple_case_mapping('A', 'A')); + assert!(contains_simple_case_mapping('Z', 'Z')); + assert!(contains_simple_case_mapping('A', 'Z')); + assert!(contains_simple_case_mapping('@', 'A')); + assert!(contains_simple_case_mapping('Z', '[')); + assert!(contains_simple_case_mapping('☃', 'Ⰰ')); + + assert!(!contains_simple_case_mapping('[', '[')); + assert!(!contains_simple_case_mapping('[', '`')); + + assert!(!contains_simple_case_mapping('☃', '☃')); + } +} diff --git a/regex-syntax-2/src/unicode_tables/age.rs b/regex-syntax-2/src/unicode_tables/age.rs new file mode 100644 index 0000000000..afba3d3ff4 --- /dev/null +++ b/regex-syntax-2/src/unicode_tables/age.rs @@ -0,0 +1,424 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate age tmp/ucd-10.0.0/ --chars +// +// ucd-generate is available on crates.io. 
+ +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("V10_0", V10_0), ("V1_1", V1_1), ("V2_0", V2_0), ("V2_1", V2_1), + ("V3_0", V3_0), ("V3_1", V3_1), ("V3_2", V3_2), ("V4_0", V4_0), + ("V4_1", V4_1), ("V5_0", V5_0), ("V5_1", V5_1), ("V5_2", V5_2), + ("V6_0", V6_0), ("V6_1", V6_1), ("V6_2", V6_2), ("V6_3", V6_3), + ("V7_0", V7_0), ("V8_0", V8_0), ("V9_0", V9_0), +]; + +pub const V10_0: &'static [(char, char)] = &[ + ('ࡠ', 'ࡪ'), ('ৼ', '৽'), ('ૺ', '૿'), ('ഀ', 'ഀ'), + ('഻', '഼'), ('᳷', '᳷'), ('᷶', '᷹'), ('₿', '₿'), + ('⏿', '⏿'), ('⯒', '⯒'), ('⹅', '⹉'), ('ㄮ', 'ㄮ'), + ('鿖', '鿪'), ('𐌭', '𐌯'), ('𑨀', '𑩇'), ('𑩐', '𑪃'), + ('𑪆', '𑪜'), ('𑪞', '𑪢'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), + ('𑴋', '𑴶'), ('𑴺', '𑴺'), ('𑴼', '𑴽'), ('𑴿', '𑵇'), + ('𑵐', '𑵙'), ('𖿡', '𖿡'), ('𛀂', '𛄞'), ('𛅰', '𛋻'), + ('🉠', '🉥'), ('🛓', '🛔'), ('🛷', '🛸'), ('🤀', '🤋'), + ('🤟', '🤟'), ('🤨', '🤯'), ('🤱', '🤲'), ('🥌', '🥌'), + ('🥟', '🥫'), ('🦒', '🦗'), ('🧐', '🧦'), ('𬺰', '𮯠'), +]; + +pub const V1_1: &'static [(char, char)] = &[ + ('\u{0}', 'ǵ'), ('Ǻ', 'ȗ'), ('ɐ', 'ʨ'), ('ʰ', '˞'), ('ˠ', '˩'), + ('̀', 'ͅ'), ('͠', '͡'), ('ʹ', '͵'), ('ͺ', 'ͺ'), (';', ';'), + ('΄', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ώ'), ('ϐ', 'ϖ'), + ('Ϛ', 'Ϛ'), ('Ϝ', 'Ϝ'), ('Ϟ', 'Ϟ'), ('Ϡ', 'Ϡ'), ('Ϣ', 'ϳ'), + ('Ё', 'Ќ'), ('Ў', 'я'), ('ё', 'ќ'), ('ў', '҆'), ('Ґ', 'ӄ'), + ('Ӈ', 'ӈ'), ('Ӌ', 'ӌ'), ('Ӑ', 'ӫ'), ('Ӯ', 'ӵ'), ('Ӹ', 'ӹ'), + ('Ա', 'Ֆ'), ('ՙ', '՟'), ('ա', 'և'), ('։', '։'), ('ְ', 'ֹ'), + ('ֻ', '׃'), ('א', 'ת'), ('װ', '״'), ('،', '،'), ('؛', '؛'), + ('؟', '؟'), ('ء', 'غ'), ('ـ', 'ْ'), ('٠', '٭'), ('ٰ', 'ڷ'), + ('ں', 'ھ'), ('ۀ', 'ێ'), ('ې', 'ۭ'), ('۰', '۹'), ('ँ', 'ः'), + ('अ', 'ह'), ('़', '्'), ('ॐ', '॔'), ('क़', '॰'), + ('ঁ', 'ঃ'), ('অ', 'ঌ'), ('এ', 'ঐ'), ('ও', 'ন'), + ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), ('়', '়'), + ('া', 'ৄ'), ('ে', 'ৈ'), ('ো', '্'), ('ৗ', 'ৗ'), + ('ড়', 'ঢ়'), ('য়', 'ৣ'), ('০', '৺'), ('ਂ', 'ਂ'), + ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('਼', '਼'), + ('ਾ', 'ੂ'), ('ੇ', 
'ੈ'), ('ੋ', '੍'), ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), ('੦', 'ੴ'), ('ઁ', 'ઃ'), ('અ', 'ઋ'), + ('ઍ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), + ('લ', 'ળ'), ('વ', 'હ'), ('઼', 'ૅ'), ('ે', 'ૉ'), + ('ો', '્'), ('ૐ', 'ૐ'), ('ૠ', 'ૠ'), ('૦', '૯'), + ('ଁ', 'ଃ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), ('ଓ', 'ନ'), + ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଶ', 'ହ'), ('଼', 'ୃ'), + ('େ', 'ୈ'), ('ୋ', '୍'), ('ୖ', 'ୗ'), ('ଡ଼', 'ଢ଼'), + ('ୟ', 'ୡ'), ('୦', '୰'), ('ஂ', 'ஃ'), ('அ', 'ஊ'), + ('எ', 'ஐ'), ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), + ('ஞ', 'ட'), ('ண', 'த'), ('ந', 'ப'), ('ம', 'வ'), + ('ஷ', 'ஹ'), ('ா', 'ூ'), ('ெ', 'ை'), ('ொ', '்'), + ('ௗ', 'ௗ'), ('௧', '௲'), ('ఁ', 'ః'), ('అ', 'ఌ'), + ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'ళ'), ('వ', 'హ'), + ('ా', 'ౄ'), ('ె', 'ై'), ('ొ', '్'), ('ౕ', 'ౖ'), + ('ౠ', 'ౡ'), ('౦', '౯'), ('ಂ', 'ಃ'), ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), + ('ಾ', 'ೄ'), ('ೆ', 'ೈ'), ('ೊ', '್'), ('ೕ', 'ೖ'), + ('ೞ', 'ೞ'), ('ೠ', 'ೡ'), ('೦', '೯'), ('ം', 'ഃ'), + ('അ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ന'), ('പ', 'ഹ'), + ('ാ', 'ൃ'), ('െ', 'ൈ'), ('ൊ', '്'), ('ൗ', 'ൗ'), + ('ൠ', 'ൡ'), ('൦', '൯'), ('ก', 'ฺ'), ('฿', '๛'), + ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ງ', 'ຈ'), ('ຊ', 'ຊ'), + ('ຍ', 'ຍ'), ('ດ', 'ທ'), ('ນ', 'ຟ'), ('ມ', 'ຣ'), + ('ລ', 'ລ'), ('ວ', 'ວ'), ('ສ', 'ຫ'), ('ອ', 'ູ'), + ('ົ', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('່', 'ໍ'), + ('໐', '໙'), ('ໜ', 'ໝ'), ('Ⴀ', 'Ⴥ'), ('ა', 'ჶ'), + ('჻', '჻'), ('ᄀ', 'ᅙ'), ('ᅟ', 'ᆢ'), ('ᆨ', 'ᇹ'), + ('Ḁ', 'ẚ'), ('Ạ', 'ỹ'), ('ἀ', 'ἕ'), ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), + ('ᾶ', 'ῄ'), ('ῆ', 'ΐ'), ('ῖ', 'Ί'), ('῝', '`'), + ('ῲ', 'ῴ'), ('ῶ', '῾'), ('\u{2000}', '\u{202e}'), ('‰', '⁆'), + ('\u{206a}', '⁰'), ('⁴', '₎'), ('₠', '₪'), ('⃐', '⃡'), + ('℀', 'ℸ'), ('⅓', 'ↂ'), ('←', '⇪'), ('∀', '⋱'), + ('⌀', '⌀'), ('⌂', '⍺'), ('␀', '␤'), ('⑀', '⑊'), + ('①', '⓪'), ('─', '▕'), ('■', '◯'), ('☀', '☓'), + ('☚', '♯'), ('✁', '✄'), ('✆', '✉'), ('✌', '✧'), + ('✩', '❋'), ('❍', '❍'), ('❏', '❒'), ('❖', '❖'), + ('❘', '❞'), ('❡', '❧'), ('❶', '➔'), ('➘', 
'➯'), + ('➱', '➾'), ('\u{3000}', '〷'), ('〿', '〿'), ('ぁ', 'ゔ'), + ('゙', 'ゞ'), ('ァ', 'ヾ'), ('ㄅ', 'ㄬ'), ('ㄱ', 'ㆎ'), + ('㆐', '㆟'), ('㈀', '㈜'), ('㈠', '㉃'), ('㉠', '㉻'), + ('㉿', '㊰'), ('㋀', '㋋'), ('㋐', '㋾'), ('㌀', '㍶'), + ('㍻', '㏝'), ('㏠', '㏾'), ('一', '龥'), ('\u{e000}', '鶴'), + ('ff', 'st'), ('ﬓ', 'ﬗ'), ('ﬞ', 'זּ'), ('טּ', 'לּ'), + ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﮱ'), + ('ﯓ', '﴿'), ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷻ'), + ('︠', '︣'), ('︰', '﹄'), ('﹉', '﹒'), ('﹔', '﹦'), + ('﹨', '﹫'), ('ﹰ', 'ﹲ'), ('ﹴ', 'ﹴ'), ('ﹶ', 'ﻼ'), + ('\u{feff}', '\u{feff}'), ('!', '~'), ('。', 'ᄒ'), ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('¢', '₩'), + ('│', '○'), ('�', '\u{ffff}'), +]; + +pub const V2_0: &'static [(char, char)] = &[ + ('֑', '֡'), ('֣', '֯'), ('ׄ', 'ׄ'), ('ༀ', 'ཇ'), ('ཉ', 'ཀྵ'), + ('ཱ', 'ྋ'), ('ྐ', 'ྕ'), ('ྗ', 'ྗ'), ('ྙ', 'ྭ'), + ('ྱ', 'ྷ'), ('ྐྵ', 'ྐྵ'), ('ẛ', 'ẛ'), ('₫', '₫'), + ('가', '힣'), ('\u{1fffe}', '\u{1ffff}'), ('\u{2fffe}', '\u{2ffff}'), + ('\u{3fffe}', '\u{3ffff}'), ('\u{4fffe}', '\u{4ffff}'), + ('\u{5fffe}', '\u{5ffff}'), ('\u{6fffe}', '\u{6ffff}'), + ('\u{7fffe}', '\u{7ffff}'), ('\u{8fffe}', '\u{8ffff}'), + ('\u{9fffe}', '\u{9ffff}'), ('\u{afffe}', '\u{affff}'), + ('\u{bfffe}', '\u{bffff}'), ('\u{cfffe}', '\u{cffff}'), + ('\u{dfffe}', '\u{dffff}'), ('\u{efffe}', '\u{10ffff}'), +]; + +pub const V2_1: &'static [(char, char)] = &[ + ('€', '€'), ('', ''), +]; + +pub const V3_0: &'static [(char, char)] = &[ + ('Ƕ', 'ǹ'), ('Ș', 'ȟ'), ('Ȣ', 'ȳ'), ('ʩ', 'ʭ'), ('˟', '˟'), + ('˪', 'ˮ'), ('͆', '͎'), ('͢', '͢'), ('ϗ', 'ϗ'), ('ϛ', 'ϛ'), + ('ϝ', 'ϝ'), ('ϟ', 'ϟ'), ('ϡ', 'ϡ'), ('Ѐ', 'Ѐ'), ('Ѝ', 'Ѝ'), + ('ѐ', 'ѐ'), ('ѝ', 'ѝ'), ('҈', '҉'), ('Ҍ', 'ҏ'), ('Ӭ', 'ӭ'), + ('֊', '֊'), ('ٓ', 'ٕ'), ('ڸ', 'ڹ'), ('ڿ', 'ڿ'), ('ۏ', 'ۏ'), + ('ۺ', '۾'), ('܀', '܍'), ('\u{70f}', 'ܬ'), ('ܰ', '݊'), ('ހ', 'ް'), + ('ං', 'ඃ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), + ('ල', 'ල'), ('ව', 'ෆ'), ('්', '්'), ('ා', 'ු'), + ('ූ', 'ූ'), ('ෘ', 'ෟ'), ('ෲ', '෴'), ('ཪ', 'ཪ'), + ('ྖ', 'ྖ'), ('ྮ', 'ྰ'), 
('ྸ', 'ྸ'), ('ྺ', 'ྼ'), + ('྾', '࿌'), ('࿏', '࿏'), ('က', 'အ'), ('ဣ', 'ဧ'), + ('ဩ', 'ဪ'), ('ာ', 'ဲ'), ('ံ', '္'), ('၀', 'ၙ'), + ('ሀ', 'ሆ'), ('ለ', 'ቆ'), ('ቈ', 'ቈ'), ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኆ'), + ('ኈ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኮ'), ('ኰ', 'ኰ'), + ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), + ('ወ', 'ዎ'), ('ዐ', 'ዖ'), ('ዘ', 'ዮ'), ('ደ', 'ጎ'), + ('ጐ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ጞ'), ('ጠ', 'ፆ'), + ('ፈ', 'ፚ'), ('፡', '፼'), ('Ꭰ', 'Ᏼ'), ('ᐁ', 'ᙶ'), + ('\u{1680}', '᚜'), ('ᚠ', 'ᛰ'), ('ក', 'ៜ'), ('០', '៩'), + ('᠀', '\u{180e}'), ('᠐', '᠙'), ('ᠠ', 'ᡷ'), ('ᢀ', 'ᢩ'), + ('\u{202f}', '\u{202f}'), ('⁈', '⁍'), ('₭', '₯'), ('⃢', '⃣'), + ('ℹ', '℺'), ('Ↄ', 'Ↄ'), ('⇫', '⇳'), ('⌁', '⌁'), + ('⍻', '⍻'), ('⍽', '⎚'), ('␥', '␦'), ('◰', '◷'), + ('☙', '☙'), ('♰', '♱'), ('⠀', '⣿'), ('⺀', '⺙'), + ('⺛', '⻳'), ('⼀', '⿕'), ('⿰', '⿻'), ('〸', '〺'), + ('〾', '〾'), ('ㆠ', 'ㆷ'), ('㐀', '䶵'), ('ꀀ', 'ꒌ'), + ('꒐', '꒡'), ('꒤', '꒳'), ('꒵', '꓀'), ('꓂', '꓄'), + ('꓆', '꓆'), ('יִ', 'יִ'), ('\u{fff9}', '\u{fffb}'), +]; + +pub const V3_1: &'static [(char, char)] = &[ + ('ϴ', 'ϵ'), ('\u{fdd0}', '\u{fdef}'), ('𐌀', '𐌞'), ('𐌠', '𐌣'), + ('𐌰', '𐍊'), ('𐐀', '𐐥'), ('𐐨', '𐑍'), ('𝀀', '𝃵'), + ('𝄀', '𝄦'), ('𝄪', '𝇝'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), + ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), + ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓀'), ('𝓂', '𝓃'), + ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), + ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), + ('𝕊', '𝕐'), ('𝕒', '𝚣'), ('𝚨', '𝟉'), ('𝟎', '𝟿'), + ('𠀀', '𪛖'), ('丽', '𪘀'), ('\u{e0001}', '\u{e0001}'), + ('\u{e0020}', '\u{e007f}'), +]; + +pub const V3_2: &'static [(char, char)] = &[ + ('Ƞ', 'Ƞ'), ('͏', '͏'), ('ͣ', 'ͯ'), ('Ϙ', 'ϙ'), ('϶', '϶'), + ('Ҋ', 'ҋ'), ('Ӆ', 'ӆ'), ('Ӊ', 'ӊ'), ('Ӎ', 'ӎ'), ('Ԁ', 'ԏ'), + ('ٮ', 'ٯ'), ('ޱ', 'ޱ'), ('ჷ', 'ჸ'), ('ᜀ', 'ᜌ'), ('ᜎ', '᜔'), + ('ᜠ', '᜶'), ('ᝀ', 'ᝓ'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), + ('ᝲ', 'ᝳ'), ('⁇', '⁇'), ('⁎', '⁒'), ('⁗', '⁗'), + ('\u{205f}', '\u{2063}'), ('ⁱ', 'ⁱ'), ('₰', '₱'), ('⃤', '⃪'), + ('ℽ', '⅋'), ('⇴', '⇿'), ('⋲', '⋿'), 
('⍼', '⍼'), + ('⎛', '⏎'), ('⓫', '⓾'), ('▖', '▟'), ('◸', '◿'), + ('☖', '☗'), ('♲', '♽'), ('⚀', '⚉'), ('❨', '❵'), + ('⟐', '⟫'), ('⟰', '⟿'), ('⤀', '⫿'), ('〻', '〽'), + ('ゕ', 'ゖ'), ('ゟ', '゠'), ('ヿ', 'ヿ'), ('ㇰ', 'ㇿ'), + ('㉑', '㉟'), ('㊱', '㊿'), ('꒢', '꒣'), ('꒴', '꒴'), + ('꓁', '꓁'), ('꓅', '꓅'), ('侮', '頻'), ('﷼', '﷼'), + ('︀', '️'), ('﹅', '﹆'), ('ﹳ', 'ﹳ'), ('⦅', '⦆'), +]; + +pub const V4_0: &'static [(char, char)] = &[ + ('ȡ', 'ȡ'), ('ȴ', 'ȶ'), ('ʮ', 'ʯ'), ('˯', '˿'), ('͐', '͗'), + ('͝', '͟'), ('Ϸ', 'ϻ'), ('\u{600}', '\u{603}'), ('؍', 'ؕ'), + ('ٖ', '٘'), ('ۮ', 'ۯ'), ('ۿ', 'ۿ'), ('ܭ', 'ܯ'), ('ݍ', 'ݏ'), + ('ऄ', 'ऄ'), ('ঽ', 'ঽ'), ('ਁ', 'ਁ'), ('ਃ', 'ਃ'), + ('ઌ', 'ઌ'), ('ૡ', 'ૣ'), ('૱', '૱'), ('ଵ', 'ଵ'), + ('ୱ', 'ୱ'), ('௳', '௺'), ('಼', 'ಽ'), ('៝', '៝'), + ('៰', '៹'), ('ᤀ', 'ᤜ'), ('ᤠ', 'ᤫ'), ('ᤰ', '᤻'), + ('᥀', '᥀'), ('᥄', 'ᥭ'), ('ᥰ', 'ᥴ'), ('᧠', '᧿'), + ('ᴀ', 'ᵫ'), ('⁓', '⁔'), ('℻', '℻'), ('⏏', '⏐'), + ('⓿', '⓿'), ('☔', '☕'), ('⚊', '⚑'), ('⚠', '⚡'), + ('⬀', '⬍'), ('㈝', '㈞'), ('㉐', '㉐'), ('㉼', '㉽'), + ('㋌', '㋏'), ('㍷', '㍺'), ('㏞', '㏟'), ('㏿', '㏿'), + ('䷀', '䷿'), ('﷽', '﷽'), ('﹇', '﹈'), ('𐀀', '𐀋'), + ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), + ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐄀', '𐄂'), ('𐄇', '𐄳'), + ('𐄷', '𐄿'), ('𐎀', '𐎝'), ('𐎟', '𐎟'), ('𐐦', '𐐧'), + ('𐑎', '𐒝'), ('𐒠', '𐒩'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), + ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐠿'), + ('𝌀', '𝍖'), ('𝓁', '𝓁'), ('󠄀', '󠇯'), +]; + +pub const V4_1: &'static [(char, char)] = &[ + ('ȷ', 'Ɂ'), ('͘', '͜'), ('ϼ', 'Ͽ'), ('Ӷ', 'ӷ'), ('֢', '֢'), + ('ׅ', 'ׇ'), ('؋', '؋'), ('؞', '؞'), ('ٙ', 'ٞ'), ('ݐ', 'ݭ'), + ('ॽ', 'ॽ'), ('ৎ', 'ৎ'), ('ஶ', 'ஶ'), ('௦', '௦'), + ('࿐', '࿑'), ('ჹ', 'ჺ'), ('ჼ', 'ჼ'), ('ሇ', 'ሇ'), + ('ቇ', 'ቇ'), ('ኇ', 'ኇ'), ('ኯ', 'ኯ'), ('ዏ', 'ዏ'), + ('ዯ', 'ዯ'), ('ጏ', 'ጏ'), ('ጟ', 'ጟ'), ('ፇ', 'ፇ'), + ('፟', '፠'), ('ᎀ', '᎙'), ('ᦀ', 'ᦩ'), ('ᦰ', 'ᧉ'), + ('᧐', '᧙'), ('᧞', '᧟'), ('ᨀ', 'ᨛ'), ('᨞', '᨟'), + ('ᵬ', '᷃'), ('⁕', '⁖'), ('⁘', '⁞'), ('ₐ', 'ₔ'), + ('₲', '₵'), ('⃫', '⃫'), ('ℼ', 'ℼ'), ('⅌', '⅌'), + ('⏑', '⏛'), ('☘', '☘'), ('♾', 
'♿'), ('⚒', '⚜'), + ('⚢', '⚱'), ('⟀', '⟆'), ('⬎', '⬓'), ('Ⰰ', 'Ⱞ'), + ('ⰰ', 'ⱞ'), ('Ⲁ', '⳪'), ('⳹', 'ⴥ'), ('ⴰ', 'ⵥ'), + ('ⵯ', 'ⵯ'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('⸀', '⸗'), ('⸜', '⸝'), + ('㇀', '㇏'), ('㉾', '㉾'), ('龦', '龻'), ('꜀', '꜖'), + ('ꠀ', '꠫'), ('並', '龎'), ('︐', '︙'), ('𐅀', '𐆊'), + ('𐎠', '𐏃'), ('𐏈', '𐏕'), ('𐨀', '𐨃'), ('𐨅', '𐨆'), + ('𐨌', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨳'), ('𐨸', '𐨺'), + ('𐨿', '𐩇'), ('𐩐', '𐩘'), ('𝈀', '𝉅'), ('𝚤', '𝚥'), +]; + +pub const V5_0: &'static [(char, char)] = &[ + ('ɂ', 'ɏ'), ('ͻ', 'ͽ'), ('ӏ', 'ӏ'), ('Ӻ', 'ӿ'), ('Ԑ', 'ԓ'), + ('ֺ', 'ֺ'), ('߀', 'ߺ'), ('ॻ', 'ॼ'), ('ॾ', 'ॿ'), ('ೢ', 'ೣ'), + ('ೱ', 'ೲ'), ('ᬀ', 'ᭋ'), ('᭐', '᭼'), ('᷄', '᷊'), + ('᷾', '᷿'), ('⃬', '⃯'), ('⅍', 'ⅎ'), ('ↄ', 'ↄ'), + ('⏜', '⏧'), ('⚲', '⚲'), ('⟇', '⟊'), ('⬔', '⬚'), + ('⬠', '⬣'), ('Ⱡ', 'ⱬ'), ('ⱴ', 'ⱷ'), ('ꜗ', 'ꜚ'), + ('꜠', '꜡'), ('ꡀ', '꡷'), ('𐤀', '𐤙'), ('𐤟', '𐤟'), + ('𒀀', '𒍮'), ('𒐀', '𒑢'), ('𒑰', '𒑳'), ('𝍠', '𝍱'), + ('𝟊', '𝟋'), +]; + +pub const V5_1: &'static [(char, char)] = &[ + ('Ͱ', 'ͳ'), ('Ͷ', 'ͷ'), ('Ϗ', 'Ϗ'), ('҇', '҇'), ('Ԕ', 'ԣ'), + ('؆', '؊'), ('ؖ', 'ؚ'), ('ػ', 'ؿ'), ('ݮ', 'ݿ'), ('ॱ', 'ॲ'), + ('ੑ', 'ੑ'), ('ੵ', 'ੵ'), ('ୄ', 'ୄ'), ('ୢ', 'ୣ'), + ('ௐ', 'ௐ'), ('ఽ', 'ఽ'), ('ౘ', 'ౙ'), ('ౢ', 'ౣ'), + ('౸', '౿'), ('ഽ', 'ഽ'), ('ൄ', 'ൄ'), ('ൢ', 'ൣ'), + ('൰', '൵'), ('൹', 'ൿ'), ('ཫ', 'ཬ'), ('࿎', '࿎'), + ('࿒', '࿔'), ('ဢ', 'ဢ'), ('ဨ', 'ဨ'), ('ါ', 'ါ'), + ('ဳ', 'ဵ'), ('်', 'ဿ'), ('ၚ', '႙'), ('႞', '႟'), + ('ᢪ', 'ᢪ'), ('ᮀ', '᮪'), ('ᮮ', '᮹'), ('ᰀ', '᰷'), + ('᰻', '᱉'), ('ᱍ', '᱿'), ('᷋', 'ᷦ'), ('ẜ', 'ẟ'), + ('Ỻ', 'ỿ'), ('\u{2064}', '\u{2064}'), ('⃰', '⃰'), ('⅏', '⅏'), + ('ↅ', 'ↈ'), ('⚝', '⚝'), ('⚳', '⚼'), ('⛀', '⛃'), + ('⟌', '⟌'), ('⟬', '⟯'), ('⬛', '⬟'), ('⬤', '⭌'), + ('⭐', '⭔'), ('Ɑ', 'Ɐ'), ('ⱱ', 'ⱳ'), ('ⱸ', 'ⱽ'), + ('ⷠ', 'ⷿ'), ('⸘', '⸛'), ('⸞', '⸰'), ('ㄭ', 'ㄭ'), + ('㇐', '㇣'), ('龼', '鿃'), ('ꔀ', 'ꘫ'), ('Ꙁ', 'ꙟ'), + ('Ꙣ', '꙳'), ('꙼', 'ꚗ'), ('ꜛ', 'ꜟ'), ('Ꜣ', 'ꞌ'), + ('ꟻ', 'ꟿ'), ('ꢀ', '꣄'), ('꣎', '꣙'), ('꤀', '꥓'), + 
('꥟', '꥟'), ('ꨀ', 'ꨶ'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), + ('꩜', '꩟'), ('︤', '︦'), ('𐆐', '𐆛'), ('𐇐', '𐇽'), + ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('𐤠', '𐤹'), ('𐤿', '𐤿'), + ('𝄩', '𝄩'), ('🀀', '🀫'), ('🀰', '🂓'), +]; + +pub const V5_2: &'static [(char, char)] = &[ + ('Ԥ', 'ԥ'), ('ࠀ', '࠭'), ('࠰', '࠾'), ('ऀ', 'ऀ'), + ('ॎ', 'ॎ'), ('ॕ', 'ॕ'), ('ॹ', 'ॺ'), ('৻', '৻'), + ('࿕', '࿘'), ('ႚ', 'ႝ'), ('ᅚ', 'ᅞ'), ('ᆣ', 'ᆧ'), + ('ᇺ', 'ᇿ'), ('᐀', '᐀'), ('ᙷ', 'ᙿ'), ('ᢰ', 'ᣵ'), + ('ᦪ', 'ᦫ'), ('᧚', '᧚'), ('ᨠ', 'ᩞ'), ('᩠', '᩼'), + ('᩿', '᪉'), ('᪐', '᪙'), ('᪠', '᪭'), ('᳐', 'ᳲ'), + ('᷽', '᷽'), ('₶', '₸'), ('⅐', '⅒'), ('↉', '↉'), + ('⏨', '⏨'), ('⚞', '⚟'), ('⚽', '⚿'), ('⛄', '⛍'), + ('⛏', '⛡'), ('⛣', '⛣'), ('⛨', '⛿'), ('❗', '❗'), + ('⭕', '⭙'), ('Ɒ', 'Ɒ'), ('Ȿ', 'Ɀ'), ('Ⳬ', '⳱'), + ('⸱', '⸱'), ('㉄', '㉏'), ('鿄', '鿋'), ('ꓐ', '꓿'), + ('ꚠ', '꛷'), ('꠰', '꠹'), ('꣠', 'ꣻ'), ('ꥠ', 'ꥼ'), + ('ꦀ', '꧍'), ('ꧏ', '꧙'), ('꧞', '꧟'), ('ꩠ', 'ꩻ'), + ('ꪀ', 'ꫂ'), ('ꫛ', '꫟'), ('ꯀ', '꯭'), ('꯰', '꯹'), + ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('恵', '舘'), ('𐡀', '𐡕'), + ('𐡗', '𐡟'), ('𐤚', '𐤛'), ('𐩠', '𐩿'), ('𐬀', '𐬵'), + ('𐬹', '𐭕'), ('𐭘', '𐭲'), ('𐭸', '𐭿'), ('𐰀', '𐱈'), + ('𐹠', '𐹾'), ('𑂀', '𑃁'), ('𓀀', '𓐮'), ('🄀', '🄊'), + ('🄐', '🄮'), ('🄱', '🄱'), ('🄽', '🄽'), ('🄿', '🄿'), + ('🅂', '🅂'), ('🅆', '🅆'), ('🅊', '🅎'), ('🅗', '🅗'), + ('🅟', '🅟'), ('🅹', '🅹'), ('🅻', '🅼'), ('🅿', '🅿'), + ('🆊', '🆍'), ('🆐', '🆐'), ('🈀', '🈀'), ('🈐', '🈱'), + ('🉀', '🉈'), ('𪜀', '𫜴'), +]; + +pub const V6_0: &'static [(char, char)] = &[ + ('Ԧ', 'ԧ'), ('ؠ', 'ؠ'), ('ٟ', 'ٟ'), ('ࡀ', '࡛'), ('࡞', '࡞'), + ('ऺ', 'ऻ'), ('ॏ', 'ॏ'), ('ॖ', 'ॗ'), ('ॳ', 'ॷ'), + ('୲', '୷'), ('ഩ', 'ഩ'), ('ഺ', 'ഺ'), ('ൎ', 'ൎ'), + ('ྌ', 'ྏ'), ('࿙', '࿚'), ('፝', '፞'), ('ᯀ', '᯳'), + ('᯼', '᯿'), ('᷼', '᷼'), ('ₕ', 'ₜ'), ('₹', '₹'), + ('⏩', '⏳'), ('⛎', '⛎'), ('⛢', '⛢'), ('⛤', '⛧'), + ('✅', '✅'), ('✊', '✋'), ('✨', '✨'), ('❌', '❌'), + ('❎', '❎'), ('❓', '❕'), ('❟', '❠'), ('➕', '➗'), + ('➰', '➰'), ('➿', '➿'), ('⟎', '⟏'), ('⵰', '⵰'), + ('⵿', '⵿'), ('ㆸ', 'ㆺ'), ('Ꙡ', 'ꙡ'), ('Ɥ', 'ꞎ'), + ('Ꞑ', 'ꞑ'), ('Ꞡ', 'ꞩ'), ('ꟺ', 'ꟺ'), ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), ('ꬑ', 
'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), + ('﮲', '﯁'), ('𑀀', '𑁍'), ('𑁒', '𑁯'), ('𖠀', '𖨸'), + ('𛀀', '𛀁'), ('🂠', '🂮'), ('🂱', '🂾'), ('🃁', '🃏'), + ('🃑', '🃟'), ('🄰', '🄰'), ('🄲', '🄼'), ('🄾', '🄾'), + ('🅀', '🅁'), ('🅃', '🅅'), ('🅇', '🅉'), ('🅏', '🅖'), + ('🅘', '🅞'), ('🅠', '🅩'), ('🅰', '🅸'), ('🅺', '🅺'), + ('🅽', '🅾'), ('🆀', '🆉'), ('🆎', '🆏'), ('🆑', '🆚'), + ('🇦', '🇿'), ('🈁', '🈂'), ('🈲', '🈺'), ('🉐', '🉑'), + ('🌀', '🌠'), ('🌰', '🌵'), ('🌷', '🍼'), ('🎀', '🎓'), + ('🎠', '🏄'), ('🏆', '🏊'), ('🏠', '🏰'), ('🐀', '🐾'), + ('👀', '👀'), ('👂', '📷'), ('📹', '📼'), ('🔀', '🔽'), + ('🕐', '🕧'), ('🗻', '🗿'), ('😁', '😐'), ('😒', '😔'), + ('😖', '😖'), ('😘', '😘'), ('😚', '😚'), ('😜', '😞'), + ('😠', '😥'), ('😨', '😫'), ('😭', '😭'), ('😰', '😳'), + ('😵', '🙀'), ('🙅', '🙏'), ('🚀', '🛅'), ('🜀', '🝳'), + ('𫝀', '𫠝'), +]; + +pub const V6_1: &'static [(char, char)] = &[ + ('֏', '֏'), ('\u{604}', '\u{604}'), ('ࢠ', 'ࢠ'), ('ࢢ', 'ࢬ'), + ('ࣤ', 'ࣾ'), ('૰', '૰'), ('ໞ', 'ໟ'), ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), ('ჽ', 'ჿ'), ('᮫', 'ᮭ'), ('ᮺ', 'ᮿ'), + ('᳀', '᳇'), ('ᳳ', 'ᳶ'), ('⟋', '⟋'), ('⟍', '⟍'), + ('Ⳳ', 'ⳳ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ⵦ', 'ⵧ'), + ('⸲', '⸻'), ('鿌', '鿌'), ('ꙴ', 'ꙻ'), ('ꚟ', 'ꚟ'), + ('Ꞓ', 'ꞓ'), ('Ɦ', 'Ɦ'), ('ꟸ', 'ꟹ'), ('ꫠ', '꫶'), + ('郞', '隷'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𑃐', '𑃨'), + ('𑃰', '𑃹'), ('𑄀', '𑄴'), ('𑄶', '𑅃'), ('𑆀', '𑇈'), + ('𑇐', '𑇙'), ('𑚀', '𑚷'), ('𑛀', '𑛉'), ('𖼀', '𖽄'), + ('𖽐', '𖽾'), ('𖾏', '𖾟'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), + ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), + ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), + ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), + ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), + ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), + ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), + ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), + ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𞻰', '𞻱'), + ('🅪', '🅫'), ('🕀', '🕃'), ('😀', '😀'), ('😑', '😑'), + ('😕', '😕'), ('😗', '😗'), ('😙', '😙'), ('😛', '😛'), + ('😟', '😟'), ('😦', '😧'), ('😬', '😬'), ('😮', '😯'), + ('😴', '😴'), +]; + +pub const V6_2: &'static [(char, char)] = &[ + ('₺', '₺'), +]; + +pub 
const V6_3: &'static [(char, char)] = &[ + ('\u{61c}', '\u{61c}'), ('\u{2066}', '\u{2069}'), +]; + +pub const V7_0: &'static [(char, char)] = &[ + ('Ϳ', 'Ϳ'), ('Ԩ', 'ԯ'), ('֍', '֎'), ('\u{605}', '\u{605}'), + ('ࢡ', 'ࢡ'), ('ࢭ', 'ࢲ'), ('ࣿ', 'ࣿ'), ('ॸ', 'ॸ'), + ('ঀ', 'ঀ'), ('ఀ', 'ఀ'), ('ఴ', 'ఴ'), ('ಁ', 'ಁ'), + ('ഁ', 'ഁ'), ('෦', '෯'), ('ᛱ', 'ᛸ'), ('ᤝ', 'ᤞ'), + ('᪰', '᪾'), ('᳸', '᳹'), ('ᷧ', '᷵'), ('₻', '₽'), + ('⏴', '⏺'), ('✀', '✀'), ('⭍', '⭏'), ('⭚', '⭳'), + ('⭶', '⮕'), ('⮘', '⮹'), ('⮽', '⯈'), ('⯊', '⯑'), + ('⸼', '⹂'), ('Ꚙ', 'ꚝ'), ('ꞔ', 'ꞟ'), ('Ɜ', 'Ɬ'), + ('Ʞ', 'Ʇ'), ('ꟷ', 'ꟷ'), ('ꧠ', 'ꧾ'), ('ꩼ', 'ꩿ'), + ('ꬰ', 'ꭟ'), ('ꭤ', 'ꭥ'), ('︧', '︭'), ('𐆋', '𐆌'), + ('𐆠', '𐆠'), ('𐋠', '𐋻'), ('𐌟', '𐌟'), ('𐍐', '𐍺'), + ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐕯', '𐕯'), ('𐘀', '𐜶'), + ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐡠', '𐢞'), ('𐢧', '𐢯'), + ('𐪀', '𐪟'), ('𐫀', '𐫦'), ('𐫫', '𐫶'), ('𐮀', '𐮑'), + ('𐮙', '𐮜'), ('𐮩', '𐮯'), ('𑁿', '𑁿'), ('𑅐', '𑅶'), + ('𑇍', '𑇍'), ('𑇚', '𑇚'), ('𑇡', '𑇴'), ('𑈀', '𑈑'), + ('𑈓', '𑈽'), ('𑊰', '𑋪'), ('𑋰', '𑋹'), ('𑌁', '𑌃'), + ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), + ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('𑌼', '𑍄'), ('𑍇', '𑍈'), + ('𑍋', '𑍍'), ('𑍗', '𑍗'), ('𑍝', '𑍣'), ('𑍦', '𑍬'), + ('𑍰', '𑍴'), ('𑒀', '𑓇'), ('𑓐', '𑓙'), ('𑖀', '𑖵'), + ('𑖸', '𑗉'), ('𑘀', '𑙄'), ('𑙐', '𑙙'), ('𑢠', '𑣲'), + ('𑣿', '𑣿'), ('𑫀', '𑫸'), ('𒍯', '𒎘'), ('𒑣', '𒑮'), + ('𒑴', '𒑴'), ('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯'), + ('𖫐', '𖫭'), ('𖫰', '𖫵'), ('𖬀', '𖭅'), ('𖭐', '𖭙'), + ('𖭛', '𖭡'), ('𖭣', '𖭷'), ('𖭽', '𖮏'), ('𛰀', '𛱪'), + ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𛲜', '\u{1bca3}'), + ('𞠀', '𞣄'), ('𞣇', '𞣖'), ('🂿', '🂿'), ('🃠', '🃵'), + ('🄋', '🄌'), ('🌡', '🌬'), ('🌶', '🌶'), ('🍽', '🍽'), + ('🎔', '🎟'), ('🏅', '🏅'), ('🏋', '🏎'), ('🏔', '🏟'), + ('🏱', '🏷'), ('🐿', '🐿'), ('👁', '👁'), ('📸', '📸'), + ('📽', '📾'), ('🔾', '🔿'), ('🕄', '🕊'), ('🕨', '🕹'), + ('🕻', '🖣'), ('🖥', '🗺'), ('🙁', '🙂'), ('🙐', '🙿'), + ('🛆', '🛏'), ('🛠', '🛬'), ('🛰', '🛳'), ('🞀', '🟔'), + ('🠀', '🠋'), ('🠐', '🡇'), ('🡐', '🡙'), ('🡠', '🢇'), + ('🢐', '🢭'), +]; + +pub const V8_0: &'static [(char, char)] = &[ + ('ࢳ', 'ࢴ'), 
('ࣣ', 'ࣣ'), ('ૹ', 'ૹ'), ('ౚ', 'ౚ'), + ('ൟ', 'ൟ'), ('Ᏽ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('₾', '₾'), + ('↊', '↋'), ('⯬', '⯯'), ('鿍', '鿕'), ('ꚞ', 'ꚞ'), + ('ꞏ', 'ꞏ'), ('Ʝ', 'ꞷ'), ('꣼', 'ꣽ'), ('ꭠ', 'ꭣ'), + ('ꭰ', 'ꮿ'), ('︮', '︯'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), + ('𐣻', '𐣿'), ('𐦼', '𐦽'), ('𐧀', '𐧏'), ('𐧒', '𐧿'), + ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐳺', '𐳿'), ('𑇉', '𑇌'), + ('𑇛', '𑇟'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), + ('𑊏', '𑊝'), ('𑊟', '𑊩'), ('𑌀', '𑌀'), ('𑍐', '𑍐'), + ('𑗊', '𑗝'), ('𑜀', '𑜙'), ('𑜝', '𑜫'), ('𑜰', '𑜿'), + ('𒎙', '𒎙'), ('𒒀', '𒕃'), ('𔐀', '𔙆'), ('𝇞', '𝇨'), + ('𝠀', '𝪋'), ('𝪛', '𝪟'), ('𝪡', '𝪯'), ('🌭', '🌯'), + ('🍾', '🍿'), ('🏏', '🏓'), ('🏸', '🏿'), ('📿', '📿'), + ('🕋', '🕏'), ('🙃', '🙄'), ('🛐', '🛐'), ('🤐', '🤘'), + ('🦀', '🦄'), ('🧀', '🧀'), ('𫠠', '𬺡'), +]; + +pub const V9_0: &'static [(char, char)] = &[ + ('ࢶ', 'ࢽ'), ('ࣔ', '\u{8e2}'), ('ಀ', 'ಀ'), ('൏', '൏'), + ('ൔ', 'ൖ'), ('൘', '൞'), ('൶', '൸'), ('ᲀ', 'ᲈ'), + ('᷻', '᷻'), ('⏻', '⏾'), ('⹃', '⹄'), ('Ɪ', 'Ɪ'), + ('ꣅ', 'ꣅ'), ('𐆍', '𐆎'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), + ('𑈾', '𑈾'), ('𑐀', '𑑙'), ('𑑛', '𑑛'), ('𑑝', '𑑝'), + ('𑙠', '𑙬'), ('𑰀', '𑰈'), ('𑰊', '𑰶'), ('𑰸', '𑱅'), + ('𑱐', '𑱬'), ('𑱰', '𑲏'), ('𑲒', '𑲧'), ('𑲩', '𑲶'), + ('𖿠', '𖿠'), ('𗀀', '𘟬'), ('𘠀', '𘫲'), ('𞀀', '𞀆'), + ('𞀈', '𞀘'), ('𞀛', '𞀡'), ('𞀣', '𞀤'), ('𞀦', '𞀪'), + ('𞤀', '𞥊'), ('𞥐', '𞥙'), ('𞥞', '𞥟'), ('🆛', '🆬'), + ('🈻', '🈻'), ('🕺', '🕺'), ('🖤', '🖤'), ('🛑', '🛒'), + ('🛴', '🛶'), ('🤙', '🤞'), ('🤠', '🤧'), ('🤰', '🤰'), + ('🤳', '🤾'), ('🥀', '🥋'), ('🥐', '🥞'), ('🦅', '🦑'), +]; diff --git a/regex-syntax-2/src/unicode_tables/case_folding_simple.rs b/regex-syntax-2/src/unicode_tables/case_folding_simple.rs new file mode 100644 index 0000000000..72ec79f0dc --- /dev/null +++ b/regex-syntax-2/src/unicode_tables/case_folding_simple.rs @@ -0,0 +1,662 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate case-folding-simple /home/andrew/tmp/ucd-10.0.0/ --chars --all-pairs +// +// ucd-generate is available on crates.io. 
+ +pub const CASE_FOLDING_SIMPLE: &'static [(char, &'static [char])] = &[ + ('A', &['a']), ('B', &['b']), ('C', &['c']), ('D', &['d']), ('E', &['e']), + ('F', &['f']), ('G', &['g']), ('H', &['h']), ('I', &['i']), ('J', &['j']), + ('K', &['k', 'K', ]), ('L', &['l']), ('M', &['m']), ('N', &['n']), ('O', &[ + 'o']), ('P', &['p']), ('Q', &['q']), ('R', &['r']), ('S', &['s', 'ſ', ]), + ('T', &['t']), ('U', &['u']), ('V', &['v']), ('W', &['w']), ('X', &['x']), + ('Y', &['y']), ('Z', &['z']), ('a', &['A']), ('b', &['B']), ('c', &['C']), + ('d', &['D']), ('e', &['E']), ('f', &['F']), ('g', &['G']), ('h', &['H']), + ('i', &['I']), ('j', &['J']), ('k', &['K', 'K', ]), ('l', &['L']), ('m', &[ + 'M']), ('n', &['N']), ('o', &['O']), ('p', &['P']), ('q', &['Q']), ('r', &[ + 'R']), ('s', &['S', 'ſ', ]), ('t', &['T']), ('u', &['U']), ('v', &['V']), + ('w', &['W']), ('x', &['X']), ('y', &['Y']), ('z', &['Z']), ('µ', &['Μ', + 'μ', ]), ('À', &['à']), ('Á', &['á']), ('Â', &['â']), ('Ã', &['ã' + ]), ('Ä', &['ä']), ('Å', &['å', 'Å', ]), ('Æ', &['æ']), ('Ç', &['ç' + ]), ('È', &['è']), ('É', &['é']), ('Ê', &['ê']), ('Ë', &['ë']), + ('Ì', &['ì']), ('Í', &['í']), ('Î', &['î']), ('Ï', &['ï']), ('Ð', &[ + 'ð']), ('Ñ', &['ñ']), ('Ò', &['ò']), ('Ó', &['ó']), ('Ô', &['ô']), + ('Õ', &['õ']), ('Ö', &['ö']), ('Ø', &['ø']), ('Ù', &['ù']), ('Ú', &[ + 'ú']), ('Û', &['û']), ('Ü', &['ü']), ('Ý', &['ý']), ('Þ', &['þ']), + ('ß', &['ẞ']), ('à', &['À']), ('á', &['Á']), ('â', &['Â']), + ('ã', &['Ã']), ('ä', &['Ä']), ('å', &['Å', 'Å', ]), ('æ', &['Æ']), + ('ç', &['Ç']), ('è', &['È']), ('é', &['É']), ('ê', &['Ê']), ('ë', &[ + 'Ë']), ('ì', &['Ì']), ('í', &['Í']), ('î', &['Î']), ('ï', &['Ï']), + ('ð', &['Ð']), ('ñ', &['Ñ']), ('ò', &['Ò']), ('ó', &['Ó']), ('ô', &[ + 'Ô']), ('õ', &['Õ']), ('ö', &['Ö']), ('ø', &['Ø']), ('ù', &['Ù']), + ('ú', &['Ú']), ('û', &['Û']), ('ü', &['Ü']), ('ý', &['Ý']), ('þ', &[ + 'Þ']), ('ÿ', &['Ÿ']), ('Ā', &['ā']), ('ā', &['Ā']), ('Ă', &['ă']), + ('ă', &['Ă']), ('Ą', &['ą']), ('ą', 
&['Ą']), ('Ć', &['ć']), ('ć', &[ + 'Ć']), ('Ĉ', &['ĉ']), ('ĉ', &['Ĉ']), ('Ċ', &['ċ']), ('ċ', &['Ċ']), + ('Č', &['č']), ('č', &['Č']), ('Ď', &['ď']), ('ď', &['Ď']), ('Đ', &[ + 'đ']), ('đ', &['Đ']), ('Ē', &['ē']), ('ē', &['Ē']), ('Ĕ', &['ĕ']), + ('ĕ', &['Ĕ']), ('Ė', &['ė']), ('ė', &['Ė']), ('Ę', &['ę']), ('ę', &[ + 'Ę']), ('Ě', &['ě']), ('ě', &['Ě']), ('Ĝ', &['ĝ']), ('ĝ', &['Ĝ']), + ('Ğ', &['ğ']), ('ğ', &['Ğ']), ('Ġ', &['ġ']), ('ġ', &['Ġ']), ('Ģ', &[ + 'ģ']), ('ģ', &['Ģ']), ('Ĥ', &['ĥ']), ('ĥ', &['Ĥ']), ('Ħ', &['ħ']), + ('ħ', &['Ħ']), ('Ĩ', &['ĩ']), ('ĩ', &['Ĩ']), ('Ī', &['ī']), ('ī', &[ + 'Ī']), ('Ĭ', &['ĭ']), ('ĭ', &['Ĭ']), ('Į', &['į']), ('į', &['Į']), + ('IJ', &['ij']), ('ij', &['IJ']), ('Ĵ', &['ĵ']), ('ĵ', &['Ĵ']), ('Ķ', &[ + 'ķ']), ('ķ', &['Ķ']), ('Ĺ', &['ĺ']), ('ĺ', &['Ĺ']), ('Ļ', &['ļ']), + ('ļ', &['Ļ']), ('Ľ', &['ľ']), ('ľ', &['Ľ']), ('Ŀ', &['ŀ']), ('ŀ', &[ + 'Ŀ']), ('Ł', &['ł']), ('ł', &['Ł']), ('Ń', &['ń']), ('ń', &['Ń']), + ('Ņ', &['ņ']), ('ņ', &['Ņ']), ('Ň', &['ň']), ('ň', &['Ň']), ('Ŋ', &[ + 'ŋ']), ('ŋ', &['Ŋ']), ('Ō', &['ō']), ('ō', &['Ō']), ('Ŏ', &['ŏ']), + ('ŏ', &['Ŏ']), ('Ő', &['ő']), ('ő', &['Ő']), ('Œ', &['œ']), ('œ', &[ + 'Œ']), ('Ŕ', &['ŕ']), ('ŕ', &['Ŕ']), ('Ŗ', &['ŗ']), ('ŗ', &['Ŗ']), + ('Ř', &['ř']), ('ř', &['Ř']), ('Ś', &['ś']), ('ś', &['Ś']), ('Ŝ', &[ + 'ŝ']), ('ŝ', &['Ŝ']), ('Ş', &['ş']), ('ş', &['Ş']), ('Š', &['š']), + ('š', &['Š']), ('Ţ', &['ţ']), ('ţ', &['Ţ']), ('Ť', &['ť']), ('ť', &[ + 'Ť']), ('Ŧ', &['ŧ']), ('ŧ', &['Ŧ']), ('Ũ', &['ũ']), ('ũ', &['Ũ']), + ('Ū', &['ū']), ('ū', &['Ū']), ('Ŭ', &['ŭ']), ('ŭ', &['Ŭ']), ('Ů', &[ + 'ů']), ('ů', &['Ů']), ('Ű', &['ű']), ('ű', &['Ű']), ('Ų', &['ų']), + ('ų', &['Ų']), ('Ŵ', &['ŵ']), ('ŵ', &['Ŵ']), ('Ŷ', &['ŷ']), ('ŷ', &[ + 'Ŷ']), ('Ÿ', &['ÿ']), ('Ź', &['ź']), ('ź', &['Ź']), ('Ż', &['ż']), + ('ż', &['Ż']), ('Ž', &['ž']), ('ž', &['Ž']), ('ſ', &['S', 's', ]), + ('ƀ', &['Ƀ']), ('Ɓ', &['ɓ']), ('Ƃ', &['ƃ']), ('ƃ', &['Ƃ']), ('Ƅ', &[ + 'ƅ']), ('ƅ', &['Ƅ']), ('Ɔ', &['ɔ']), ('Ƈ', &['ƈ']), ('ƈ', &['Ƈ']), + 
('Ɖ', &['ɖ']), ('Ɗ', &['ɗ']), ('Ƌ', &['ƌ']), ('ƌ', &['Ƌ']), ('Ǝ', &[ + 'ǝ']), ('Ə', &['ə']), ('Ɛ', &['ɛ']), ('Ƒ', &['ƒ']), ('ƒ', &['Ƒ']), + ('Ɠ', &['ɠ']), ('Ɣ', &['ɣ']), ('ƕ', &['Ƕ']), ('Ɩ', &['ɩ']), ('Ɨ', &[ + 'ɨ']), ('Ƙ', &['ƙ']), ('ƙ', &['Ƙ']), ('ƚ', &['Ƚ']), ('Ɯ', &['ɯ']), + ('Ɲ', &['ɲ']), ('ƞ', &['Ƞ']), ('Ɵ', &['ɵ']), ('Ơ', &['ơ']), ('ơ', &[ + 'Ơ']), ('Ƣ', &['ƣ']), ('ƣ', &['Ƣ']), ('Ƥ', &['ƥ']), ('ƥ', &['Ƥ']), + ('Ʀ', &['ʀ']), ('Ƨ', &['ƨ']), ('ƨ', &['Ƨ']), ('Ʃ', &['ʃ']), ('Ƭ', &[ + 'ƭ']), ('ƭ', &['Ƭ']), ('Ʈ', &['ʈ']), ('Ư', &['ư']), ('ư', &['Ư']), + ('Ʊ', &['ʊ']), ('Ʋ', &['ʋ']), ('Ƴ', &['ƴ']), ('ƴ', &['Ƴ']), ('Ƶ', &[ + 'ƶ']), ('ƶ', &['Ƶ']), ('Ʒ', &['ʒ']), ('Ƹ', &['ƹ']), ('ƹ', &['Ƹ']), + ('Ƽ', &['ƽ']), ('ƽ', &['Ƽ']), ('ƿ', &['Ƿ']), ('DŽ', &['Dž', 'dž', ]), + ('Dž', &['DŽ', 'dž', ]), ('dž', &['DŽ', 'Dž', ]), ('LJ', &['Lj', 'lj', ]), + ('Lj', &['LJ', 'lj', ]), ('lj', &['LJ', 'Lj', ]), ('NJ', &['Nj', 'nj', ]), + ('Nj', &['NJ', 'nj', ]), ('nj', &['NJ', 'Nj', ]), ('Ǎ', &['ǎ']), ('ǎ', &[ + 'Ǎ']), ('Ǐ', &['ǐ']), ('ǐ', &['Ǐ']), ('Ǒ', &['ǒ']), ('ǒ', &['Ǒ']), + ('Ǔ', &['ǔ']), ('ǔ', &['Ǔ']), ('Ǖ', &['ǖ']), ('ǖ', &['Ǖ']), ('Ǘ', &[ + 'ǘ']), ('ǘ', &['Ǘ']), ('Ǚ', &['ǚ']), ('ǚ', &['Ǚ']), ('Ǜ', &['ǜ']), + ('ǜ', &['Ǜ']), ('ǝ', &['Ǝ']), ('Ǟ', &['ǟ']), ('ǟ', &['Ǟ']), ('Ǡ', &[ + 'ǡ']), ('ǡ', &['Ǡ']), ('Ǣ', &['ǣ']), ('ǣ', &['Ǣ']), ('Ǥ', &['ǥ']), + ('ǥ', &['Ǥ']), ('Ǧ', &['ǧ']), ('ǧ', &['Ǧ']), ('Ǩ', &['ǩ']), ('ǩ', &[ + 'Ǩ']), ('Ǫ', &['ǫ']), ('ǫ', &['Ǫ']), ('Ǭ', &['ǭ']), ('ǭ', &['Ǭ']), + ('Ǯ', &['ǯ']), ('ǯ', &['Ǯ']), ('DZ', &['Dz', 'dz', ]), ('Dz', &['DZ', + 'dz', ]), ('dz', &['DZ', 'Dz', ]), ('Ǵ', &['ǵ']), ('ǵ', &['Ǵ']), + ('Ƕ', &['ƕ']), ('Ƿ', &['ƿ']), ('Ǹ', &['ǹ']), ('ǹ', &['Ǹ']), ('Ǻ', &[ + 'ǻ']), ('ǻ', &['Ǻ']), ('Ǽ', &['ǽ']), ('ǽ', &['Ǽ']), ('Ǿ', &['ǿ']), + ('ǿ', &['Ǿ']), ('Ȁ', &['ȁ']), ('ȁ', &['Ȁ']), ('Ȃ', &['ȃ']), ('ȃ', &[ + 'Ȃ']), ('Ȅ', &['ȅ']), ('ȅ', &['Ȅ']), ('Ȇ', &['ȇ']), ('ȇ', &['Ȇ']), + ('Ȉ', &['ȉ']), ('ȉ', &['Ȉ']), ('Ȋ', &['ȋ']), ('ȋ', &['Ȋ']), ('Ȍ', &[ + 'ȍ']), ('ȍ', 
&['Ȍ']), ('Ȏ', &['ȏ']), ('ȏ', &['Ȏ']), ('Ȑ', &['ȑ']), + ('ȑ', &['Ȑ']), ('Ȓ', &['ȓ']), ('ȓ', &['Ȓ']), ('Ȕ', &['ȕ']), ('ȕ', &[ + 'Ȕ']), ('Ȗ', &['ȗ']), ('ȗ', &['Ȗ']), ('Ș', &['ș']), ('ș', &['Ș']), + ('Ț', &['ț']), ('ț', &['Ț']), ('Ȝ', &['ȝ']), ('ȝ', &['Ȝ']), ('Ȟ', &[ + 'ȟ']), ('ȟ', &['Ȟ']), ('Ƞ', &['ƞ']), ('Ȣ', &['ȣ']), ('ȣ', &['Ȣ']), + ('Ȥ', &['ȥ']), ('ȥ', &['Ȥ']), ('Ȧ', &['ȧ']), ('ȧ', &['Ȧ']), ('Ȩ', &[ + 'ȩ']), ('ȩ', &['Ȩ']), ('Ȫ', &['ȫ']), ('ȫ', &['Ȫ']), ('Ȭ', &['ȭ']), + ('ȭ', &['Ȭ']), ('Ȯ', &['ȯ']), ('ȯ', &['Ȯ']), ('Ȱ', &['ȱ']), ('ȱ', &[ + 'Ȱ']), ('Ȳ', &['ȳ']), ('ȳ', &['Ȳ']), ('Ⱥ', &['ⱥ']), ('Ȼ', &['ȼ']), + ('ȼ', &['Ȼ']), ('Ƚ', &['ƚ']), ('Ⱦ', &['ⱦ']), ('ȿ', &['Ȿ']), + ('ɀ', &['Ɀ']), ('Ɂ', &['ɂ']), ('ɂ', &['Ɂ']), ('Ƀ', &['ƀ']), + ('Ʉ', &['ʉ']), ('Ʌ', &['ʌ']), ('Ɇ', &['ɇ']), ('ɇ', &['Ɇ']), ('Ɉ', &[ + 'ɉ']), ('ɉ', &['Ɉ']), ('Ɋ', &['ɋ']), ('ɋ', &['Ɋ']), ('Ɍ', &['ɍ']), + ('ɍ', &['Ɍ']), ('Ɏ', &['ɏ']), ('ɏ', &['Ɏ']), ('ɐ', &['Ɐ']), + ('ɑ', &['Ɑ']), ('ɒ', &['Ɒ']), ('ɓ', &['Ɓ']), ('ɔ', &['Ɔ']), + ('ɖ', &['Ɖ']), ('ɗ', &['Ɗ']), ('ə', &['Ə']), ('ɛ', &['Ɛ']), ('ɜ', &[ + 'Ɜ']), ('ɠ', &['Ɠ']), ('ɡ', &['Ɡ']), ('ɣ', &['Ɣ']), ('ɥ', &['Ɥ' + ]), ('ɦ', &['Ɦ']), ('ɨ', &['Ɨ']), ('ɩ', &['Ɩ']), ('ɪ', &['Ɪ']), + ('ɫ', &['Ɫ']), ('ɬ', &['Ɬ']), ('ɯ', &['Ɯ']), ('ɱ', &['Ɱ']), + ('ɲ', &['Ɲ']), ('ɵ', &['Ɵ']), ('ɽ', &['Ɽ']), ('ʀ', &['Ʀ']), + ('ʃ', &['Ʃ']), ('ʇ', &['Ʇ']), ('ʈ', &['Ʈ']), ('ʉ', &['Ʉ']), + ('ʊ', &['Ʊ']), ('ʋ', &['Ʋ']), ('ʌ', &['Ʌ']), ('ʒ', &['Ʒ']), ('ʝ', &[ + 'Ʝ']), ('ʞ', &['Ʞ']), ('ͅ', &['Ι', 'ι', 'ι', ]), ('Ͱ', &['ͱ']), + ('ͱ', &['Ͱ']), ('Ͳ', &['ͳ']), ('ͳ', &['Ͳ']), ('Ͷ', &['ͷ']), ('ͷ', &[ + 'Ͷ']), ('ͻ', &['Ͻ']), ('ͼ', &['Ͼ']), ('ͽ', &['Ͽ']), ('Ϳ', &['ϳ']), + ('Ά', &['ά']), ('Έ', &['έ']), ('Ή', &['ή']), ('Ί', &['ί']), ('Ό', &[ + 'ό']), ('Ύ', &['ύ']), ('Ώ', &['ώ']), ('Α', &['α']), ('Β', &['β', + 'ϐ', ]), ('Γ', &['γ']), ('Δ', &['δ']), ('Ε', &['ε', 'ϵ', ]), + ('Ζ', &['ζ']), ('Η', &['η']), ('Θ', &['θ', 'ϑ', 'ϴ', ]), ('Ι', &[ + 'ͅ', 'ι', 'ι', ]), ('Κ', &['κ', 'ϰ', ]), ('Λ', &['λ']), 
('Μ', &[ + 'µ', 'μ', ]), ('Ν', &['ν']), ('Ξ', &['ξ']), ('Ο', &['ο']), ('Π', &[ + 'π', 'ϖ', ]), ('Ρ', &['ρ', 'ϱ', ]), ('Σ', &['ς', 'σ', ]), ('Τ', &[ + 'τ']), ('Υ', &['υ']), ('Φ', &['φ', 'ϕ', ]), ('Χ', &['χ']), ('Ψ', &[ + 'ψ']), ('Ω', &['ω', 'Ω', ]), ('Ϊ', &['ϊ']), ('Ϋ', &['ϋ']), ('ά', &[ + 'Ά']), ('έ', &['Έ']), ('ή', &['Ή']), ('ί', &['Ί']), ('α', &['Α']), + ('β', &['Β', 'ϐ', ]), ('γ', &['Γ']), ('δ', &['Δ']), ('ε', &['Ε', + 'ϵ', ]), ('ζ', &['Ζ']), ('η', &['Η']), ('θ', &['Θ', 'ϑ', 'ϴ', ]), + ('ι', &['ͅ', 'Ι', 'ι', ]), ('κ', &['Κ', 'ϰ', ]), ('λ', &['Λ']), + ('μ', &['µ', 'Μ', ]), ('ν', &['Ν']), ('ξ', &['Ξ']), ('ο', &['Ο']), + ('π', &['Π', 'ϖ', ]), ('ρ', &['Ρ', 'ϱ', ]), ('ς', &['Σ', 'σ', ]), + ('σ', &['Σ', 'ς', ]), ('τ', &['Τ']), ('υ', &['Υ']), ('φ', &['Φ', + 'ϕ', ]), ('χ', &['Χ']), ('ψ', &['Ψ']), ('ω', &['Ω', 'Ω', ]), + ('ϊ', &['Ϊ']), ('ϋ', &['Ϋ']), ('ό', &['Ό']), ('ύ', &['Ύ']), ('ώ', &[ + 'Ώ']), ('Ϗ', &['ϗ']), ('ϐ', &['Β', 'β', ]), ('ϑ', &['Θ', 'θ', 'ϴ', + ]), ('ϕ', &['Φ', 'φ', ]), ('ϖ', &['Π', 'π', ]), ('ϗ', &['Ϗ']), + ('Ϙ', &['ϙ']), ('ϙ', &['Ϙ']), ('Ϛ', &['ϛ']), ('ϛ', &['Ϛ']), ('Ϝ', &[ + 'ϝ']), ('ϝ', &['Ϝ']), ('Ϟ', &['ϟ']), ('ϟ', &['Ϟ']), ('Ϡ', &['ϡ']), + ('ϡ', &['Ϡ']), ('Ϣ', &['ϣ']), ('ϣ', &['Ϣ']), ('Ϥ', &['ϥ']), ('ϥ', &[ + 'Ϥ']), ('Ϧ', &['ϧ']), ('ϧ', &['Ϧ']), ('Ϩ', &['ϩ']), ('ϩ', &['Ϩ']), + ('Ϫ', &['ϫ']), ('ϫ', &['Ϫ']), ('Ϭ', &['ϭ']), ('ϭ', &['Ϭ']), ('Ϯ', &[ + 'ϯ']), ('ϯ', &['Ϯ']), ('ϰ', &['Κ', 'κ', ]), ('ϱ', &['Ρ', 'ρ', ]), + ('ϲ', &['Ϲ']), ('ϳ', &['Ϳ']), ('ϴ', &['Θ', 'θ', 'ϑ', ]), ('ϵ', &[ + 'Ε', 'ε', ]), ('Ϸ', &['ϸ']), ('ϸ', &['Ϸ']), ('Ϲ', &['ϲ']), ('Ϻ', &[ + 'ϻ']), ('ϻ', &['Ϻ']), ('Ͻ', &['ͻ']), ('Ͼ', &['ͼ']), ('Ͽ', &['ͽ']), + ('Ѐ', &['ѐ']), ('Ё', &['ё']), ('Ђ', &['ђ']), ('Ѓ', &['ѓ']), ('Є', &[ + 'є']), ('Ѕ', &['ѕ']), ('І', &['і']), ('Ї', &['ї']), ('Ј', &['ј']), + ('Љ', &['љ']), ('Њ', &['њ']), ('Ћ', &['ћ']), ('Ќ', &['ќ']), ('Ѝ', &[ + 'ѝ']), ('Ў', &['ў']), ('Џ', &['џ']), ('А', &['а']), ('Б', &['б']), + ('В', &['в', 'ᲀ', ]), ('Г', &['г']), ('Д', &['д', 'ᲁ', ]), + 
('Е', &['е']), ('Ж', &['ж']), ('З', &['з']), ('И', &['и']), ('Й', &[ + 'й']), ('К', &['к']), ('Л', &['л']), ('М', &['м']), ('Н', &['н']), + ('О', &['о', 'ᲂ', ]), ('П', &['п']), ('Р', &['р']), ('С', &['с', + 'ᲃ', ]), ('Т', &['т', 'ᲄ', 'ᲅ', ]), ('У', &['у']), ('Ф', &['ф' + ]), ('Х', &['х']), ('Ц', &['ц']), ('Ч', &['ч']), ('Ш', &['ш']), + ('Щ', &['щ']), ('Ъ', &['ъ', 'ᲆ', ]), ('Ы', &['ы']), ('Ь', &['ь']), + ('Э', &['э']), ('Ю', &['ю']), ('Я', &['я']), ('а', &['А']), ('б', &[ + 'Б']), ('в', &['В', 'ᲀ', ]), ('г', &['Г']), ('д', &['Д', 'ᲁ', ]), + ('е', &['Е']), ('ж', &['Ж']), ('з', &['З']), ('и', &['И']), ('й', &[ + 'Й']), ('к', &['К']), ('л', &['Л']), ('м', &['М']), ('н', &['Н']), + ('о', &['О', 'ᲂ', ]), ('п', &['П']), ('р', &['Р']), ('с', &['С', + 'ᲃ', ]), ('т', &['Т', 'ᲄ', 'ᲅ', ]), ('у', &['У']), ('ф', &['Ф' + ]), ('х', &['Х']), ('ц', &['Ц']), ('ч', &['Ч']), ('ш', &['Ш']), + ('щ', &['Щ']), ('ъ', &['Ъ', 'ᲆ', ]), ('ы', &['Ы']), ('ь', &['Ь']), + ('э', &['Э']), ('ю', &['Ю']), ('я', &['Я']), ('ѐ', &['Ѐ']), ('ё', &[ + 'Ё']), ('ђ', &['Ђ']), ('ѓ', &['Ѓ']), ('є', &['Є']), ('ѕ', &['Ѕ']), + ('і', &['І']), ('ї', &['Ї']), ('ј', &['Ј']), ('љ', &['Љ']), ('њ', &[ + 'Њ']), ('ћ', &['Ћ']), ('ќ', &['Ќ']), ('ѝ', &['Ѝ']), ('ў', &['Ў']), + ('џ', &['Џ']), ('Ѡ', &['ѡ']), ('ѡ', &['Ѡ']), ('Ѣ', &['ѣ', 'ᲇ', ]), + ('ѣ', &['Ѣ', 'ᲇ', ]), ('Ѥ', &['ѥ']), ('ѥ', &['Ѥ']), ('Ѧ', &['ѧ']), + ('ѧ', &['Ѧ']), ('Ѩ', &['ѩ']), ('ѩ', &['Ѩ']), ('Ѫ', &['ѫ']), ('ѫ', &[ + 'Ѫ']), ('Ѭ', &['ѭ']), ('ѭ', &['Ѭ']), ('Ѯ', &['ѯ']), ('ѯ', &['Ѯ']), + ('Ѱ', &['ѱ']), ('ѱ', &['Ѱ']), ('Ѳ', &['ѳ']), ('ѳ', &['Ѳ']), ('Ѵ', &[ + 'ѵ']), ('ѵ', &['Ѵ']), ('Ѷ', &['ѷ']), ('ѷ', &['Ѷ']), ('Ѹ', &['ѹ']), + ('ѹ', &['Ѹ']), ('Ѻ', &['ѻ']), ('ѻ', &['Ѻ']), ('Ѽ', &['ѽ']), ('ѽ', &[ + 'Ѽ']), ('Ѿ', &['ѿ']), ('ѿ', &['Ѿ']), ('Ҁ', &['ҁ']), ('ҁ', &['Ҁ']), + ('Ҋ', &['ҋ']), ('ҋ', &['Ҋ']), ('Ҍ', &['ҍ']), ('ҍ', &['Ҍ']), ('Ҏ', &[ + 'ҏ']), ('ҏ', &['Ҏ']), ('Ґ', &['ґ']), ('ґ', &['Ґ']), ('Ғ', &['ғ']), + ('ғ', &['Ғ']), ('Ҕ', &['ҕ']), ('ҕ', &['Ҕ']), ('Җ', &['җ']), ('җ', &[ 
+ 'Җ']), ('Ҙ', &['ҙ']), ('ҙ', &['Ҙ']), ('Қ', &['қ']), ('қ', &['Қ']), + ('Ҝ', &['ҝ']), ('ҝ', &['Ҝ']), ('Ҟ', &['ҟ']), ('ҟ', &['Ҟ']), ('Ҡ', &[ + 'ҡ']), ('ҡ', &['Ҡ']), ('Ң', &['ң']), ('ң', &['Ң']), ('Ҥ', &['ҥ']), + ('ҥ', &['Ҥ']), ('Ҧ', &['ҧ']), ('ҧ', &['Ҧ']), ('Ҩ', &['ҩ']), ('ҩ', &[ + 'Ҩ']), ('Ҫ', &['ҫ']), ('ҫ', &['Ҫ']), ('Ҭ', &['ҭ']), ('ҭ', &['Ҭ']), + ('Ү', &['ү']), ('ү', &['Ү']), ('Ұ', &['ұ']), ('ұ', &['Ұ']), ('Ҳ', &[ + 'ҳ']), ('ҳ', &['Ҳ']), ('Ҵ', &['ҵ']), ('ҵ', &['Ҵ']), ('Ҷ', &['ҷ']), + ('ҷ', &['Ҷ']), ('Ҹ', &['ҹ']), ('ҹ', &['Ҹ']), ('Һ', &['һ']), ('һ', &[ + 'Һ']), ('Ҽ', &['ҽ']), ('ҽ', &['Ҽ']), ('Ҿ', &['ҿ']), ('ҿ', &['Ҿ']), + ('Ӏ', &['ӏ']), ('Ӂ', &['ӂ']), ('ӂ', &['Ӂ']), ('Ӄ', &['ӄ']), ('ӄ', &[ + 'Ӄ']), ('Ӆ', &['ӆ']), ('ӆ', &['Ӆ']), ('Ӈ', &['ӈ']), ('ӈ', &['Ӈ']), + ('Ӊ', &['ӊ']), ('ӊ', &['Ӊ']), ('Ӌ', &['ӌ']), ('ӌ', &['Ӌ']), ('Ӎ', &[ + 'ӎ']), ('ӎ', &['Ӎ']), ('ӏ', &['Ӏ']), ('Ӑ', &['ӑ']), ('ӑ', &['Ӑ']), + ('Ӓ', &['ӓ']), ('ӓ', &['Ӓ']), ('Ӕ', &['ӕ']), ('ӕ', &['Ӕ']), ('Ӗ', &[ + 'ӗ']), ('ӗ', &['Ӗ']), ('Ә', &['ә']), ('ә', &['Ә']), ('Ӛ', &['ӛ']), + ('ӛ', &['Ӛ']), ('Ӝ', &['ӝ']), ('ӝ', &['Ӝ']), ('Ӟ', &['ӟ']), ('ӟ', &[ + 'Ӟ']), ('Ӡ', &['ӡ']), ('ӡ', &['Ӡ']), ('Ӣ', &['ӣ']), ('ӣ', &['Ӣ']), + ('Ӥ', &['ӥ']), ('ӥ', &['Ӥ']), ('Ӧ', &['ӧ']), ('ӧ', &['Ӧ']), ('Ө', &[ + 'ө']), ('ө', &['Ө']), ('Ӫ', &['ӫ']), ('ӫ', &['Ӫ']), ('Ӭ', &['ӭ']), + ('ӭ', &['Ӭ']), ('Ӯ', &['ӯ']), ('ӯ', &['Ӯ']), ('Ӱ', &['ӱ']), ('ӱ', &[ + 'Ӱ']), ('Ӳ', &['ӳ']), ('ӳ', &['Ӳ']), ('Ӵ', &['ӵ']), ('ӵ', &['Ӵ']), + ('Ӷ', &['ӷ']), ('ӷ', &['Ӷ']), ('Ӹ', &['ӹ']), ('ӹ', &['Ӹ']), ('Ӻ', &[ + 'ӻ']), ('ӻ', &['Ӻ']), ('Ӽ', &['ӽ']), ('ӽ', &['Ӽ']), ('Ӿ', &['ӿ']), + ('ӿ', &['Ӿ']), ('Ԁ', &['ԁ']), ('ԁ', &['Ԁ']), ('Ԃ', &['ԃ']), ('ԃ', &[ + 'Ԃ']), ('Ԅ', &['ԅ']), ('ԅ', &['Ԅ']), ('Ԇ', &['ԇ']), ('ԇ', &['Ԇ']), + ('Ԉ', &['ԉ']), ('ԉ', &['Ԉ']), ('Ԋ', &['ԋ']), ('ԋ', &['Ԋ']), ('Ԍ', &[ + 'ԍ']), ('ԍ', &['Ԍ']), ('Ԏ', &['ԏ']), ('ԏ', &['Ԏ']), ('Ԑ', &['ԑ']), + ('ԑ', &['Ԑ']), ('Ԓ', &['ԓ']), ('ԓ', &['Ԓ']), ('Ԕ', &['ԕ']), ('ԕ', &[ + 'Ԕ']), ('Ԗ', &['ԗ']), ('ԗ', &['Ԗ']), 
('Ԙ', &['ԙ']), ('ԙ', &['Ԙ']), + ('Ԛ', &['ԛ']), ('ԛ', &['Ԛ']), ('Ԝ', &['ԝ']), ('ԝ', &['Ԝ']), ('Ԟ', &[ + 'ԟ']), ('ԟ', &['Ԟ']), ('Ԡ', &['ԡ']), ('ԡ', &['Ԡ']), ('Ԣ', &['ԣ']), + ('ԣ', &['Ԣ']), ('Ԥ', &['ԥ']), ('ԥ', &['Ԥ']), ('Ԧ', &['ԧ']), ('ԧ', &[ + 'Ԧ']), ('Ԩ', &['ԩ']), ('ԩ', &['Ԩ']), ('Ԫ', &['ԫ']), ('ԫ', &['Ԫ']), + ('Ԭ', &['ԭ']), ('ԭ', &['Ԭ']), ('Ԯ', &['ԯ']), ('ԯ', &['Ԯ']), ('Ա', &[ + 'ա']), ('Բ', &['բ']), ('Գ', &['գ']), ('Դ', &['դ']), ('Ե', &['ե']), + ('Զ', &['զ']), ('Է', &['է']), ('Ը', &['ը']), ('Թ', &['թ']), ('Ժ', &[ + 'ժ']), ('Ի', &['ի']), ('Լ', &['լ']), ('Խ', &['խ']), ('Ծ', &['ծ']), + ('Կ', &['կ']), ('Հ', &['հ']), ('Ձ', &['ձ']), ('Ղ', &['ղ']), ('Ճ', &[ + 'ճ']), ('Մ', &['մ']), ('Յ', &['յ']), ('Ն', &['ն']), ('Շ', &['շ']), + ('Ո', &['ո']), ('Չ', &['չ']), ('Պ', &['պ']), ('Ջ', &['ջ']), ('Ռ', &[ + 'ռ']), ('Ս', &['ս']), ('Վ', &['վ']), ('Տ', &['տ']), ('Ր', &['ր']), + ('Ց', &['ց']), ('Ւ', &['ւ']), ('Փ', &['փ']), ('Ք', &['ք']), ('Օ', &[ + 'օ']), ('Ֆ', &['ֆ']), ('ա', &['Ա']), ('բ', &['Բ']), ('գ', &['Գ']), + ('դ', &['Դ']), ('ե', &['Ե']), ('զ', &['Զ']), ('է', &['Է']), ('ը', &[ + 'Ը']), ('թ', &['Թ']), ('ժ', &['Ժ']), ('ի', &['Ի']), ('լ', &['Լ']), + ('խ', &['Խ']), ('ծ', &['Ծ']), ('կ', &['Կ']), ('հ', &['Հ']), ('ձ', &[ + 'Ձ']), ('ղ', &['Ղ']), ('ճ', &['Ճ']), ('մ', &['Մ']), ('յ', &['Յ']), + ('ն', &['Ն']), ('շ', &['Շ']), ('ո', &['Ո']), ('չ', &['Չ']), ('պ', &[ + 'Պ']), ('ջ', &['Ջ']), ('ռ', &['Ռ']), ('ս', &['Ս']), ('վ', &['Վ']), + ('տ', &['Տ']), ('ր', &['Ր']), ('ց', &['Ց']), ('ւ', &['Ւ']), ('փ', &[ + 'Փ']), ('ք', &['Ք']), ('օ', &['Օ']), ('ֆ', &['Ֆ']), ('Ⴀ', &['ⴀ' + ]), ('Ⴁ', &['ⴁ']), ('Ⴂ', &['ⴂ']), ('Ⴃ', &['ⴃ']), ('Ⴄ', &['ⴄ' + ]), ('Ⴅ', &['ⴅ']), ('Ⴆ', &['ⴆ']), ('Ⴇ', &['ⴇ']), ('Ⴈ', &['ⴈ' + ]), ('Ⴉ', &['ⴉ']), ('Ⴊ', &['ⴊ']), ('Ⴋ', &['ⴋ']), ('Ⴌ', &['ⴌ' + ]), ('Ⴍ', &['ⴍ']), ('Ⴎ', &['ⴎ']), ('Ⴏ', &['ⴏ']), ('Ⴐ', &['ⴐ' + ]), ('Ⴑ', &['ⴑ']), ('Ⴒ', &['ⴒ']), ('Ⴓ', &['ⴓ']), ('Ⴔ', &['ⴔ' + ]), ('Ⴕ', &['ⴕ']), ('Ⴖ', &['ⴖ']), ('Ⴗ', &['ⴗ']), ('Ⴘ', &['ⴘ' + ]), ('Ⴙ', &['ⴙ']), ('Ⴚ', &['ⴚ']), ('Ⴛ', &['ⴛ']), 
('Ⴜ', &['ⴜ' + ]), ('Ⴝ', &['ⴝ']), ('Ⴞ', &['ⴞ']), ('Ⴟ', &['ⴟ']), ('Ⴠ', &['ⴠ' + ]), ('Ⴡ', &['ⴡ']), ('Ⴢ', &['ⴢ']), ('Ⴣ', &['ⴣ']), ('Ⴤ', &['ⴤ' + ]), ('Ⴥ', &['ⴥ']), ('Ⴧ', &['ⴧ']), ('Ⴭ', &['ⴭ']), ('Ꭰ', &['ꭰ' + ]), ('Ꭱ', &['ꭱ']), ('Ꭲ', &['ꭲ']), ('Ꭳ', &['ꭳ']), ('Ꭴ', &['ꭴ' + ]), ('Ꭵ', &['ꭵ']), ('Ꭶ', &['ꭶ']), ('Ꭷ', &['ꭷ']), ('Ꭸ', &['ꭸ' + ]), ('Ꭹ', &['ꭹ']), ('Ꭺ', &['ꭺ']), ('Ꭻ', &['ꭻ']), ('Ꭼ', &['ꭼ' + ]), ('Ꭽ', &['ꭽ']), ('Ꭾ', &['ꭾ']), ('Ꭿ', &['ꭿ']), ('Ꮀ', &['ꮀ' + ]), ('Ꮁ', &['ꮁ']), ('Ꮂ', &['ꮂ']), ('Ꮃ', &['ꮃ']), ('Ꮄ', &['ꮄ' + ]), ('Ꮅ', &['ꮅ']), ('Ꮆ', &['ꮆ']), ('Ꮇ', &['ꮇ']), ('Ꮈ', &['ꮈ' + ]), ('Ꮉ', &['ꮉ']), ('Ꮊ', &['ꮊ']), ('Ꮋ', &['ꮋ']), ('Ꮌ', &['ꮌ' + ]), ('Ꮍ', &['ꮍ']), ('Ꮎ', &['ꮎ']), ('Ꮏ', &['ꮏ']), ('Ꮐ', &['ꮐ' + ]), ('Ꮑ', &['ꮑ']), ('Ꮒ', &['ꮒ']), ('Ꮓ', &['ꮓ']), ('Ꮔ', &['ꮔ' + ]), ('Ꮕ', &['ꮕ']), ('Ꮖ', &['ꮖ']), ('Ꮗ', &['ꮗ']), ('Ꮘ', &['ꮘ' + ]), ('Ꮙ', &['ꮙ']), ('Ꮚ', &['ꮚ']), ('Ꮛ', &['ꮛ']), ('Ꮜ', &['ꮜ' + ]), ('Ꮝ', &['ꮝ']), ('Ꮞ', &['ꮞ']), ('Ꮟ', &['ꮟ']), ('Ꮠ', &['ꮠ' + ]), ('Ꮡ', &['ꮡ']), ('Ꮢ', &['ꮢ']), ('Ꮣ', &['ꮣ']), ('Ꮤ', &['ꮤ' + ]), ('Ꮥ', &['ꮥ']), ('Ꮦ', &['ꮦ']), ('Ꮧ', &['ꮧ']), ('Ꮨ', &['ꮨ' + ]), ('Ꮩ', &['ꮩ']), ('Ꮪ', &['ꮪ']), ('Ꮫ', &['ꮫ']), ('Ꮬ', &['ꮬ' + ]), ('Ꮭ', &['ꮭ']), ('Ꮮ', &['ꮮ']), ('Ꮯ', &['ꮯ']), ('Ꮰ', &['ꮰ' + ]), ('Ꮱ', &['ꮱ']), ('Ꮲ', &['ꮲ']), ('Ꮳ', &['ꮳ']), ('Ꮴ', &['ꮴ' + ]), ('Ꮵ', &['ꮵ']), ('Ꮶ', &['ꮶ']), ('Ꮷ', &['ꮷ']), ('Ꮸ', &['ꮸ' + ]), ('Ꮹ', &['ꮹ']), ('Ꮺ', &['ꮺ']), ('Ꮻ', &['ꮻ']), ('Ꮼ', &['ꮼ' + ]), ('Ꮽ', &['ꮽ']), ('Ꮾ', &['ꮾ']), ('Ꮿ', &['ꮿ']), ('Ᏸ', &['ᏸ' + ]), ('Ᏹ', &['ᏹ']), ('Ᏺ', &['ᏺ']), ('Ᏻ', &['ᏻ']), ('Ᏼ', &['ᏼ' + ]), ('Ᏽ', &['ᏽ']), ('ᏸ', &['Ᏸ']), ('ᏹ', &['Ᏹ']), ('ᏺ', &['Ᏺ' + ]), ('ᏻ', &['Ᏻ']), ('ᏼ', &['Ᏼ']), ('ᏽ', &['Ᏽ']), ('ᲀ', &['В', + 'в', ]), ('ᲁ', &['Д', 'д', ]), ('ᲂ', &['О', 'о', ]), ('ᲃ', &[ + 'С', 'с', ]), ('ᲄ', &['Т', 'т', 'ᲅ', ]), ('ᲅ', &['Т', 'т', + 'ᲄ', ]), ('ᲆ', &['Ъ', 'ъ', ]), ('ᲇ', &['Ѣ', 'ѣ', ]), ('ᲈ', &[ + 'Ꙋ', 'ꙋ', ]), ('ᵹ', &['Ᵹ']), ('ᵽ', &['Ᵽ']), ('Ḁ', &['ḁ']), + ('ḁ', &['Ḁ']), ('Ḃ', &['ḃ']), ('ḃ', &['Ḃ']), ('Ḅ', &['ḅ']), + ('ḅ', &['Ḅ']), ('Ḇ', &['ḇ']), 
('ḇ', &['Ḇ']), ('Ḉ', &['ḉ']), + ('ḉ', &['Ḉ']), ('Ḋ', &['ḋ']), ('ḋ', &['Ḋ']), ('Ḍ', &['ḍ']), + ('ḍ', &['Ḍ']), ('Ḏ', &['ḏ']), ('ḏ', &['Ḏ']), ('Ḑ', &['ḑ']), + ('ḑ', &['Ḑ']), ('Ḓ', &['ḓ']), ('ḓ', &['Ḓ']), ('Ḕ', &['ḕ']), + ('ḕ', &['Ḕ']), ('Ḗ', &['ḗ']), ('ḗ', &['Ḗ']), ('Ḙ', &['ḙ']), + ('ḙ', &['Ḙ']), ('Ḛ', &['ḛ']), ('ḛ', &['Ḛ']), ('Ḝ', &['ḝ']), + ('ḝ', &['Ḝ']), ('Ḟ', &['ḟ']), ('ḟ', &['Ḟ']), ('Ḡ', &['ḡ']), + ('ḡ', &['Ḡ']), ('Ḣ', &['ḣ']), ('ḣ', &['Ḣ']), ('Ḥ', &['ḥ']), + ('ḥ', &['Ḥ']), ('Ḧ', &['ḧ']), ('ḧ', &['Ḧ']), ('Ḩ', &['ḩ']), + ('ḩ', &['Ḩ']), ('Ḫ', &['ḫ']), ('ḫ', &['Ḫ']), ('Ḭ', &['ḭ']), + ('ḭ', &['Ḭ']), ('Ḯ', &['ḯ']), ('ḯ', &['Ḯ']), ('Ḱ', &['ḱ']), + ('ḱ', &['Ḱ']), ('Ḳ', &['ḳ']), ('ḳ', &['Ḳ']), ('Ḵ', &['ḵ']), + ('ḵ', &['Ḵ']), ('Ḷ', &['ḷ']), ('ḷ', &['Ḷ']), ('Ḹ', &['ḹ']), + ('ḹ', &['Ḹ']), ('Ḻ', &['ḻ']), ('ḻ', &['Ḻ']), ('Ḽ', &['ḽ']), + ('ḽ', &['Ḽ']), ('Ḿ', &['ḿ']), ('ḿ', &['Ḿ']), ('Ṁ', &['ṁ']), + ('ṁ', &['Ṁ']), ('Ṃ', &['ṃ']), ('ṃ', &['Ṃ']), ('Ṅ', &['ṅ']), + ('ṅ', &['Ṅ']), ('Ṇ', &['ṇ']), ('ṇ', &['Ṇ']), ('Ṉ', &['ṉ']), + ('ṉ', &['Ṉ']), ('Ṋ', &['ṋ']), ('ṋ', &['Ṋ']), ('Ṍ', &['ṍ']), + ('ṍ', &['Ṍ']), ('Ṏ', &['ṏ']), ('ṏ', &['Ṏ']), ('Ṑ', &['ṑ']), + ('ṑ', &['Ṑ']), ('Ṓ', &['ṓ']), ('ṓ', &['Ṓ']), ('Ṕ', &['ṕ']), + ('ṕ', &['Ṕ']), ('Ṗ', &['ṗ']), ('ṗ', &['Ṗ']), ('Ṙ', &['ṙ']), + ('ṙ', &['Ṙ']), ('Ṛ', &['ṛ']), ('ṛ', &['Ṛ']), ('Ṝ', &['ṝ']), + ('ṝ', &['Ṝ']), ('Ṟ', &['ṟ']), ('ṟ', &['Ṟ']), ('Ṡ', &['ṡ', + 'ẛ', ]), ('ṡ', &['Ṡ', 'ẛ', ]), ('Ṣ', &['ṣ']), ('ṣ', &['Ṣ']), + ('Ṥ', &['ṥ']), ('ṥ', &['Ṥ']), ('Ṧ', &['ṧ']), ('ṧ', &['Ṧ']), + ('Ṩ', &['ṩ']), ('ṩ', &['Ṩ']), ('Ṫ', &['ṫ']), ('ṫ', &['Ṫ']), + ('Ṭ', &['ṭ']), ('ṭ', &['Ṭ']), ('Ṯ', &['ṯ']), ('ṯ', &['Ṯ']), + ('Ṱ', &['ṱ']), ('ṱ', &['Ṱ']), ('Ṳ', &['ṳ']), ('ṳ', &['Ṳ']), + ('Ṵ', &['ṵ']), ('ṵ', &['Ṵ']), ('Ṷ', &['ṷ']), ('ṷ', &['Ṷ']), + ('Ṹ', &['ṹ']), ('ṹ', &['Ṹ']), ('Ṻ', &['ṻ']), ('ṻ', &['Ṻ']), + ('Ṽ', &['ṽ']), ('ṽ', &['Ṽ']), ('Ṿ', &['ṿ']), ('ṿ', &['Ṿ']), + ('Ẁ', &['ẁ']), ('ẁ', &['Ẁ']), ('Ẃ', &['ẃ']), ('ẃ', &['Ẃ']), + ('Ẅ', &['ẅ']), ('ẅ', &['Ẅ']), ('Ẇ', &['ẇ']), 
('ẇ', &['Ẇ']), + ('Ẉ', &['ẉ']), ('ẉ', &['Ẉ']), ('Ẋ', &['ẋ']), ('ẋ', &['Ẋ']), + ('Ẍ', &['ẍ']), ('ẍ', &['Ẍ']), ('Ẏ', &['ẏ']), ('ẏ', &['Ẏ']), + ('Ẑ', &['ẑ']), ('ẑ', &['Ẑ']), ('Ẓ', &['ẓ']), ('ẓ', &['Ẓ']), + ('Ẕ', &['ẕ']), ('ẕ', &['Ẕ']), ('ẛ', &['Ṡ', 'ṡ', ]), ('ẞ', &[ + 'ß']), ('Ạ', &['ạ']), ('ạ', &['Ạ']), ('Ả', &['ả']), ('ả', &[ + 'Ả']), ('Ấ', &['ấ']), ('ấ', &['Ấ']), ('Ầ', &['ầ']), ('ầ', &[ + 'Ầ']), ('Ẩ', &['ẩ']), ('ẩ', &['Ẩ']), ('Ẫ', &['ẫ']), ('ẫ', &[ + 'Ẫ']), ('Ậ', &['ậ']), ('ậ', &['Ậ']), ('Ắ', &['ắ']), ('ắ', &[ + 'Ắ']), ('Ằ', &['ằ']), ('ằ', &['Ằ']), ('Ẳ', &['ẳ']), ('ẳ', &[ + 'Ẳ']), ('Ẵ', &['ẵ']), ('ẵ', &['Ẵ']), ('Ặ', &['ặ']), ('ặ', &[ + 'Ặ']), ('Ẹ', &['ẹ']), ('ẹ', &['Ẹ']), ('Ẻ', &['ẻ']), ('ẻ', &[ + 'Ẻ']), ('Ẽ', &['ẽ']), ('ẽ', &['Ẽ']), ('Ế', &['ế']), ('ế', &[ + 'Ế']), ('Ề', &['ề']), ('ề', &['Ề']), ('Ể', &['ể']), ('ể', &[ + 'Ể']), ('Ễ', &['ễ']), ('ễ', &['Ễ']), ('Ệ', &['ệ']), ('ệ', &[ + 'Ệ']), ('Ỉ', &['ỉ']), ('ỉ', &['Ỉ']), ('Ị', &['ị']), ('ị', &[ + 'Ị']), ('Ọ', &['ọ']), ('ọ', &['Ọ']), ('Ỏ', &['ỏ']), ('ỏ', &[ + 'Ỏ']), ('Ố', &['ố']), ('ố', &['Ố']), ('Ồ', &['ồ']), ('ồ', &[ + 'Ồ']), ('Ổ', &['ổ']), ('ổ', &['Ổ']), ('Ỗ', &['ỗ']), ('ỗ', &[ + 'Ỗ']), ('Ộ', &['ộ']), ('ộ', &['Ộ']), ('Ớ', &['ớ']), ('ớ', &[ + 'Ớ']), ('Ờ', &['ờ']), ('ờ', &['Ờ']), ('Ở', &['ở']), ('ở', &[ + 'Ở']), ('Ỡ', &['ỡ']), ('ỡ', &['Ỡ']), ('Ợ', &['ợ']), ('ợ', &[ + 'Ợ']), ('Ụ', &['ụ']), ('ụ', &['Ụ']), ('Ủ', &['ủ']), ('ủ', &[ + 'Ủ']), ('Ứ', &['ứ']), ('ứ', &['Ứ']), ('Ừ', &['ừ']), ('ừ', &[ + 'Ừ']), ('Ử', &['ử']), ('ử', &['Ử']), ('Ữ', &['ữ']), ('ữ', &[ + 'Ữ']), ('Ự', &['ự']), ('ự', &['Ự']), ('Ỳ', &['ỳ']), ('ỳ', &[ + 'Ỳ']), ('Ỵ', &['ỵ']), ('ỵ', &['Ỵ']), ('Ỷ', &['ỷ']), ('ỷ', &[ + 'Ỷ']), ('Ỹ', &['ỹ']), ('ỹ', &['Ỹ']), ('Ỻ', &['ỻ']), ('ỻ', &[ + 'Ỻ']), ('Ỽ', &['ỽ']), ('ỽ', &['Ỽ']), ('Ỿ', &['ỿ']), ('ỿ', &[ + 'Ỿ']), ('ἀ', &['Ἀ']), ('ἁ', &['Ἁ']), ('ἂ', &['Ἂ']), ('ἃ', &[ + 'Ἃ']), ('ἄ', &['Ἄ']), ('ἅ', &['Ἅ']), ('ἆ', &['Ἆ']), ('ἇ', &[ + 'Ἇ']), ('Ἀ', &['ἀ']), ('Ἁ', &['ἁ']), ('Ἂ', &['ἂ']), ('Ἃ', &[ + 'ἃ']), ('Ἄ', &['ἄ']), ('Ἅ', 
&['ἅ']), ('Ἆ', &['ἆ']), ('Ἇ', &[ + 'ἇ']), ('ἐ', &['Ἐ']), ('ἑ', &['Ἑ']), ('ἒ', &['Ἒ']), ('ἓ', &[ + 'Ἓ']), ('ἔ', &['Ἔ']), ('ἕ', &['Ἕ']), ('Ἐ', &['ἐ']), ('Ἑ', &[ + 'ἑ']), ('Ἒ', &['ἒ']), ('Ἓ', &['ἓ']), ('Ἔ', &['ἔ']), ('Ἕ', &[ + 'ἕ']), ('ἠ', &['Ἠ']), ('ἡ', &['Ἡ']), ('ἢ', &['Ἢ']), ('ἣ', &[ + 'Ἣ']), ('ἤ', &['Ἤ']), ('ἥ', &['Ἥ']), ('ἦ', &['Ἦ']), ('ἧ', &[ + 'Ἧ']), ('Ἠ', &['ἠ']), ('Ἡ', &['ἡ']), ('Ἢ', &['ἢ']), ('Ἣ', &[ + 'ἣ']), ('Ἤ', &['ἤ']), ('Ἥ', &['ἥ']), ('Ἦ', &['ἦ']), ('Ἧ', &[ + 'ἧ']), ('ἰ', &['Ἰ']), ('ἱ', &['Ἱ']), ('ἲ', &['Ἲ']), ('ἳ', &[ + 'Ἳ']), ('ἴ', &['Ἴ']), ('ἵ', &['Ἵ']), ('ἶ', &['Ἶ']), ('ἷ', &[ + 'Ἷ']), ('Ἰ', &['ἰ']), ('Ἱ', &['ἱ']), ('Ἲ', &['ἲ']), ('Ἳ', &[ + 'ἳ']), ('Ἴ', &['ἴ']), ('Ἵ', &['ἵ']), ('Ἶ', &['ἶ']), ('Ἷ', &[ + 'ἷ']), ('ὀ', &['Ὀ']), ('ὁ', &['Ὁ']), ('ὂ', &['Ὂ']), ('ὃ', &[ + 'Ὃ']), ('ὄ', &['Ὄ']), ('ὅ', &['Ὅ']), ('Ὀ', &['ὀ']), ('Ὁ', &[ + 'ὁ']), ('Ὂ', &['ὂ']), ('Ὃ', &['ὃ']), ('Ὄ', &['ὄ']), ('Ὅ', &[ + 'ὅ']), ('ὑ', &['Ὑ']), ('ὓ', &['Ὓ']), ('ὕ', &['Ὕ']), ('ὗ', &[ + 'Ὗ']), ('Ὑ', &['ὑ']), ('Ὓ', &['ὓ']), ('Ὕ', &['ὕ']), ('Ὗ', &[ + 'ὗ']), ('ὠ', &['Ὠ']), ('ὡ', &['Ὡ']), ('ὢ', &['Ὢ']), ('ὣ', &[ + 'Ὣ']), ('ὤ', &['Ὤ']), ('ὥ', &['Ὥ']), ('ὦ', &['Ὦ']), ('ὧ', &[ + 'Ὧ']), ('Ὠ', &['ὠ']), ('Ὡ', &['ὡ']), ('Ὢ', &['ὢ']), ('Ὣ', &[ + 'ὣ']), ('Ὤ', &['ὤ']), ('Ὥ', &['ὥ']), ('Ὦ', &['ὦ']), ('Ὧ', &[ + 'ὧ']), ('ὰ', &['Ὰ']), ('ά', &['Ά']), ('ὲ', &['Ὲ']), ('έ', &[ + 'Έ']), ('ὴ', &['Ὴ']), ('ή', &['Ή']), ('ὶ', &['Ὶ']), ('ί', &[ + 'Ί']), ('ὸ', &['Ὸ']), ('ό', &['Ό']), ('ὺ', &['Ὺ']), ('ύ', &[ + 'Ύ']), ('ὼ', &['Ὼ']), ('ώ', &['Ώ']), ('ᾀ', &['ᾈ']), ('ᾁ', &[ + 'ᾉ']), ('ᾂ', &['ᾊ']), ('ᾃ', &['ᾋ']), ('ᾄ', &['ᾌ']), ('ᾅ', &[ + 'ᾍ']), ('ᾆ', &['ᾎ']), ('ᾇ', &['ᾏ']), ('ᾈ', &['ᾀ']), ('ᾉ', &[ + 'ᾁ']), ('ᾊ', &['ᾂ']), ('ᾋ', &['ᾃ']), ('ᾌ', &['ᾄ']), ('ᾍ', &[ + 'ᾅ']), ('ᾎ', &['ᾆ']), ('ᾏ', &['ᾇ']), ('ᾐ', &['ᾘ']), ('ᾑ', &[ + 'ᾙ']), ('ᾒ', &['ᾚ']), ('ᾓ', &['ᾛ']), ('ᾔ', &['ᾜ']), ('ᾕ', &[ + 'ᾝ']), ('ᾖ', &['ᾞ']), ('ᾗ', &['ᾟ']), ('ᾘ', &['ᾐ']), ('ᾙ', &[ + 'ᾑ']), ('ᾚ', &['ᾒ']), ('ᾛ', &['ᾓ']), ('ᾜ', &['ᾔ']), ('ᾝ', &[ + 'ᾕ']), 
('ᾞ', &['ᾖ']), ('ᾟ', &['ᾗ']), ('ᾠ', &['ᾨ']), ('ᾡ', &[ + 'ᾩ']), ('ᾢ', &['ᾪ']), ('ᾣ', &['ᾫ']), ('ᾤ', &['ᾬ']), ('ᾥ', &[ + 'ᾭ']), ('ᾦ', &['ᾮ']), ('ᾧ', &['ᾯ']), ('ᾨ', &['ᾠ']), ('ᾩ', &[ + 'ᾡ']), ('ᾪ', &['ᾢ']), ('ᾫ', &['ᾣ']), ('ᾬ', &['ᾤ']), ('ᾭ', &[ + 'ᾥ']), ('ᾮ', &['ᾦ']), ('ᾯ', &['ᾧ']), ('ᾰ', &['Ᾰ']), ('ᾱ', &[ + 'Ᾱ']), ('ᾳ', &['ᾼ']), ('Ᾰ', &['ᾰ']), ('Ᾱ', &['ᾱ']), ('Ὰ', &[ + 'ὰ']), ('Ά', &['ά']), ('ᾼ', &['ᾳ']), ('ι', &['ͅ', 'Ι', 'ι', + ]), ('ῃ', &['ῌ']), ('Ὲ', &['ὲ']), ('Έ', &['έ']), ('Ὴ', &['ὴ' + ]), ('Ή', &['ή']), ('ῌ', &['ῃ']), ('ῐ', &['Ῐ']), ('ῑ', &['Ῑ' + ]), ('Ῐ', &['ῐ']), ('Ῑ', &['ῑ']), ('Ὶ', &['ὶ']), ('Ί', &['ί' + ]), ('ῠ', &['Ῠ']), ('ῡ', &['Ῡ']), ('ῥ', &['Ῥ']), ('Ῠ', &['ῠ' + ]), ('Ῡ', &['ῡ']), ('Ὺ', &['ὺ']), ('Ύ', &['ύ']), ('Ῥ', &['ῥ' + ]), ('ῳ', &['ῼ']), ('Ὸ', &['ὸ']), ('Ό', &['ό']), ('Ὼ', &['ὼ' + ]), ('Ώ', &['ώ']), ('ῼ', &['ῳ']), ('Ω', &['Ω', 'ω', ]), + ('K', &['K', 'k', ]), ('Å', &['Å', 'å', ]), ('Ⅎ', &['ⅎ']), + ('ⅎ', &['Ⅎ']), ('Ⅰ', &['ⅰ']), ('Ⅱ', &['ⅱ']), ('Ⅲ', &['ⅲ']), + ('Ⅳ', &['ⅳ']), ('Ⅴ', &['ⅴ']), ('Ⅵ', &['ⅵ']), ('Ⅶ', &['ⅶ']), + ('Ⅷ', &['ⅷ']), ('Ⅸ', &['ⅸ']), ('Ⅹ', &['ⅹ']), ('Ⅺ', &['ⅺ']), + ('Ⅻ', &['ⅻ']), ('Ⅼ', &['ⅼ']), ('Ⅽ', &['ⅽ']), ('Ⅾ', &['ⅾ']), + ('Ⅿ', &['ⅿ']), ('ⅰ', &['Ⅰ']), ('ⅱ', &['Ⅱ']), ('ⅲ', &['Ⅲ']), + ('ⅳ', &['Ⅳ']), ('ⅴ', &['Ⅴ']), ('ⅵ', &['Ⅵ']), ('ⅶ', &['Ⅶ']), + ('ⅷ', &['Ⅷ']), ('ⅸ', &['Ⅸ']), ('ⅹ', &['Ⅹ']), ('ⅺ', &['Ⅺ']), + ('ⅻ', &['Ⅻ']), ('ⅼ', &['Ⅼ']), ('ⅽ', &['Ⅽ']), ('ⅾ', &['Ⅾ']), + ('ⅿ', &['Ⅿ']), ('Ↄ', &['ↄ']), ('ↄ', &['Ↄ']), ('Ⓐ', &['ⓐ']), + ('Ⓑ', &['ⓑ']), ('Ⓒ', &['ⓒ']), ('Ⓓ', &['ⓓ']), ('Ⓔ', &['ⓔ']), + ('Ⓕ', &['ⓕ']), ('Ⓖ', &['ⓖ']), ('Ⓗ', &['ⓗ']), ('Ⓘ', &['ⓘ']), + ('Ⓙ', &['ⓙ']), ('Ⓚ', &['ⓚ']), ('Ⓛ', &['ⓛ']), ('Ⓜ', &['ⓜ']), + ('Ⓝ', &['ⓝ']), ('Ⓞ', &['ⓞ']), ('Ⓟ', &['ⓟ']), ('Ⓠ', &['ⓠ']), + ('Ⓡ', &['ⓡ']), ('Ⓢ', &['ⓢ']), ('Ⓣ', &['ⓣ']), ('Ⓤ', &['ⓤ']), + ('Ⓥ', &['ⓥ']), ('Ⓦ', &['ⓦ']), ('Ⓧ', &['ⓧ']), ('Ⓨ', &['ⓨ']), + ('Ⓩ', &['ⓩ']), ('ⓐ', &['Ⓐ']), ('ⓑ', &['Ⓑ']), ('ⓒ', &['Ⓒ']), + ('ⓓ', &['Ⓓ']), ('ⓔ', &['Ⓔ']), ('ⓕ', &['Ⓕ']), ('ⓖ', &['Ⓖ']), + ('ⓗ', &['Ⓗ']), 
('ⓘ', &['Ⓘ']), ('ⓙ', &['Ⓙ']), ('ⓚ', &['Ⓚ']), + ('ⓛ', &['Ⓛ']), ('ⓜ', &['Ⓜ']), ('ⓝ', &['Ⓝ']), ('ⓞ', &['Ⓞ']), + ('ⓟ', &['Ⓟ']), ('ⓠ', &['Ⓠ']), ('ⓡ', &['Ⓡ']), ('ⓢ', &['Ⓢ']), + ('ⓣ', &['Ⓣ']), ('ⓤ', &['Ⓤ']), ('ⓥ', &['Ⓥ']), ('ⓦ', &['Ⓦ']), + ('ⓧ', &['Ⓧ']), ('ⓨ', &['Ⓨ']), ('ⓩ', &['Ⓩ']), ('Ⰰ', &['ⰰ']), + ('Ⰱ', &['ⰱ']), ('Ⰲ', &['ⰲ']), ('Ⰳ', &['ⰳ']), ('Ⰴ', &['ⰴ']), + ('Ⰵ', &['ⰵ']), ('Ⰶ', &['ⰶ']), ('Ⰷ', &['ⰷ']), ('Ⰸ', &['ⰸ']), + ('Ⰹ', &['ⰹ']), ('Ⰺ', &['ⰺ']), ('Ⰻ', &['ⰻ']), ('Ⰼ', &['ⰼ']), + ('Ⰽ', &['ⰽ']), ('Ⰾ', &['ⰾ']), ('Ⰿ', &['ⰿ']), ('Ⱀ', &['ⱀ']), + ('Ⱁ', &['ⱁ']), ('Ⱂ', &['ⱂ']), ('Ⱃ', &['ⱃ']), ('Ⱄ', &['ⱄ']), + ('Ⱅ', &['ⱅ']), ('Ⱆ', &['ⱆ']), ('Ⱇ', &['ⱇ']), ('Ⱈ', &['ⱈ']), + ('Ⱉ', &['ⱉ']), ('Ⱊ', &['ⱊ']), ('Ⱋ', &['ⱋ']), ('Ⱌ', &['ⱌ']), + ('Ⱍ', &['ⱍ']), ('Ⱎ', &['ⱎ']), ('Ⱏ', &['ⱏ']), ('Ⱐ', &['ⱐ']), + ('Ⱑ', &['ⱑ']), ('Ⱒ', &['ⱒ']), ('Ⱓ', &['ⱓ']), ('Ⱔ', &['ⱔ']), + ('Ⱕ', &['ⱕ']), ('Ⱖ', &['ⱖ']), ('Ⱗ', &['ⱗ']), ('Ⱘ', &['ⱘ']), + ('Ⱙ', &['ⱙ']), ('Ⱚ', &['ⱚ']), ('Ⱛ', &['ⱛ']), ('Ⱜ', &['ⱜ']), + ('Ⱝ', &['ⱝ']), ('Ⱞ', &['ⱞ']), ('ⰰ', &['Ⰰ']), ('ⰱ', &['Ⰱ']), + ('ⰲ', &['Ⰲ']), ('ⰳ', &['Ⰳ']), ('ⰴ', &['Ⰴ']), ('ⰵ', &['Ⰵ']), + ('ⰶ', &['Ⰶ']), ('ⰷ', &['Ⰷ']), ('ⰸ', &['Ⰸ']), ('ⰹ', &['Ⰹ']), + ('ⰺ', &['Ⰺ']), ('ⰻ', &['Ⰻ']), ('ⰼ', &['Ⰼ']), ('ⰽ', &['Ⰽ']), + ('ⰾ', &['Ⰾ']), ('ⰿ', &['Ⰿ']), ('ⱀ', &['Ⱀ']), ('ⱁ', &['Ⱁ']), + ('ⱂ', &['Ⱂ']), ('ⱃ', &['Ⱃ']), ('ⱄ', &['Ⱄ']), ('ⱅ', &['Ⱅ']), + ('ⱆ', &['Ⱆ']), ('ⱇ', &['Ⱇ']), ('ⱈ', &['Ⱈ']), ('ⱉ', &['Ⱉ']), + ('ⱊ', &['Ⱊ']), ('ⱋ', &['Ⱋ']), ('ⱌ', &['Ⱌ']), ('ⱍ', &['Ⱍ']), + ('ⱎ', &['Ⱎ']), ('ⱏ', &['Ⱏ']), ('ⱐ', &['Ⱐ']), ('ⱑ', &['Ⱑ']), + ('ⱒ', &['Ⱒ']), ('ⱓ', &['Ⱓ']), ('ⱔ', &['Ⱔ']), ('ⱕ', &['Ⱕ']), + ('ⱖ', &['Ⱖ']), ('ⱗ', &['Ⱗ']), ('ⱘ', &['Ⱘ']), ('ⱙ', &['Ⱙ']), + ('ⱚ', &['Ⱚ']), ('ⱛ', &['Ⱛ']), ('ⱜ', &['Ⱜ']), ('ⱝ', &['Ⱝ']), + ('ⱞ', &['Ⱞ']), ('Ⱡ', &['ⱡ']), ('ⱡ', &['Ⱡ']), ('Ɫ', &['ɫ']), + ('Ᵽ', &['ᵽ']), ('Ɽ', &['ɽ']), ('ⱥ', &['Ⱥ']), ('ⱦ', &['Ⱦ']), + ('Ⱨ', &['ⱨ']), ('ⱨ', &['Ⱨ']), ('Ⱪ', &['ⱪ']), ('ⱪ', &['Ⱪ']), + ('Ⱬ', &['ⱬ']), ('ⱬ', &['Ⱬ']), ('Ɑ', &['ɑ']), ('Ɱ', &['ɱ']), + ('Ɐ', &['ɐ']), ('Ɒ', &['ɒ']), 
('Ⱳ', &['ⱳ']), ('ⱳ', &['Ⱳ']), + ('Ⱶ', &['ⱶ']), ('ⱶ', &['Ⱶ']), ('Ȿ', &['ȿ']), ('Ɀ', &['ɀ']), + ('Ⲁ', &['ⲁ']), ('ⲁ', &['Ⲁ']), ('Ⲃ', &['ⲃ']), ('ⲃ', &['Ⲃ']), + ('Ⲅ', &['ⲅ']), ('ⲅ', &['Ⲅ']), ('Ⲇ', &['ⲇ']), ('ⲇ', &['Ⲇ']), + ('Ⲉ', &['ⲉ']), ('ⲉ', &['Ⲉ']), ('Ⲋ', &['ⲋ']), ('ⲋ', &['Ⲋ']), + ('Ⲍ', &['ⲍ']), ('ⲍ', &['Ⲍ']), ('Ⲏ', &['ⲏ']), ('ⲏ', &['Ⲏ']), + ('Ⲑ', &['ⲑ']), ('ⲑ', &['Ⲑ']), ('Ⲓ', &['ⲓ']), ('ⲓ', &['Ⲓ']), + ('Ⲕ', &['ⲕ']), ('ⲕ', &['Ⲕ']), ('Ⲗ', &['ⲗ']), ('ⲗ', &['Ⲗ']), + ('Ⲙ', &['ⲙ']), ('ⲙ', &['Ⲙ']), ('Ⲛ', &['ⲛ']), ('ⲛ', &['Ⲛ']), + ('Ⲝ', &['ⲝ']), ('ⲝ', &['Ⲝ']), ('Ⲟ', &['ⲟ']), ('ⲟ', &['Ⲟ']), + ('Ⲡ', &['ⲡ']), ('ⲡ', &['Ⲡ']), ('Ⲣ', &['ⲣ']), ('ⲣ', &['Ⲣ']), + ('Ⲥ', &['ⲥ']), ('ⲥ', &['Ⲥ']), ('Ⲧ', &['ⲧ']), ('ⲧ', &['Ⲧ']), + ('Ⲩ', &['ⲩ']), ('ⲩ', &['Ⲩ']), ('Ⲫ', &['ⲫ']), ('ⲫ', &['Ⲫ']), + ('Ⲭ', &['ⲭ']), ('ⲭ', &['Ⲭ']), ('Ⲯ', &['ⲯ']), ('ⲯ', &['Ⲯ']), + ('Ⲱ', &['ⲱ']), ('ⲱ', &['Ⲱ']), ('Ⲳ', &['ⲳ']), ('ⲳ', &['Ⲳ']), + ('Ⲵ', &['ⲵ']), ('ⲵ', &['Ⲵ']), ('Ⲷ', &['ⲷ']), ('ⲷ', &['Ⲷ']), + ('Ⲹ', &['ⲹ']), ('ⲹ', &['Ⲹ']), ('Ⲻ', &['ⲻ']), ('ⲻ', &['Ⲻ']), + ('Ⲽ', &['ⲽ']), ('ⲽ', &['Ⲽ']), ('Ⲿ', &['ⲿ']), ('ⲿ', &['Ⲿ']), + ('Ⳁ', &['ⳁ']), ('ⳁ', &['Ⳁ']), ('Ⳃ', &['ⳃ']), ('ⳃ', &['Ⳃ']), + ('Ⳅ', &['ⳅ']), ('ⳅ', &['Ⳅ']), ('Ⳇ', &['ⳇ']), ('ⳇ', &['Ⳇ']), + ('Ⳉ', &['ⳉ']), ('ⳉ', &['Ⳉ']), ('Ⳋ', &['ⳋ']), ('ⳋ', &['Ⳋ']), + ('Ⳍ', &['ⳍ']), ('ⳍ', &['Ⳍ']), ('Ⳏ', &['ⳏ']), ('ⳏ', &['Ⳏ']), + ('Ⳑ', &['ⳑ']), ('ⳑ', &['Ⳑ']), ('Ⳓ', &['ⳓ']), ('ⳓ', &['Ⳓ']), + ('Ⳕ', &['ⳕ']), ('ⳕ', &['Ⳕ']), ('Ⳗ', &['ⳗ']), ('ⳗ', &['Ⳗ']), + ('Ⳙ', &['ⳙ']), ('ⳙ', &['Ⳙ']), ('Ⳛ', &['ⳛ']), ('ⳛ', &['Ⳛ']), + ('Ⳝ', &['ⳝ']), ('ⳝ', &['Ⳝ']), ('Ⳟ', &['ⳟ']), ('ⳟ', &['Ⳟ']), + ('Ⳡ', &['ⳡ']), ('ⳡ', &['Ⳡ']), ('Ⳣ', &['ⳣ']), ('ⳣ', &['Ⳣ']), + ('Ⳬ', &['ⳬ']), ('ⳬ', &['Ⳬ']), ('Ⳮ', &['ⳮ']), ('ⳮ', &['Ⳮ']), + ('Ⳳ', &['ⳳ']), ('ⳳ', &['Ⳳ']), ('ⴀ', &['Ⴀ']), ('ⴁ', &['Ⴁ']), + ('ⴂ', &['Ⴂ']), ('ⴃ', &['Ⴃ']), ('ⴄ', &['Ⴄ']), ('ⴅ', &['Ⴅ']), + ('ⴆ', &['Ⴆ']), ('ⴇ', &['Ⴇ']), ('ⴈ', &['Ⴈ']), ('ⴉ', &['Ⴉ']), + ('ⴊ', &['Ⴊ']), ('ⴋ', &['Ⴋ']), ('ⴌ', &['Ⴌ']), ('ⴍ', &['Ⴍ']), + ('ⴎ', &['Ⴎ']), ('ⴏ', &['Ⴏ']), ('ⴐ', &['Ⴐ']), 
('ⴑ', &['Ⴑ']), + ('ⴒ', &['Ⴒ']), ('ⴓ', &['Ⴓ']), ('ⴔ', &['Ⴔ']), ('ⴕ', &['Ⴕ']), + ('ⴖ', &['Ⴖ']), ('ⴗ', &['Ⴗ']), ('ⴘ', &['Ⴘ']), ('ⴙ', &['Ⴙ']), + ('ⴚ', &['Ⴚ']), ('ⴛ', &['Ⴛ']), ('ⴜ', &['Ⴜ']), ('ⴝ', &['Ⴝ']), + ('ⴞ', &['Ⴞ']), ('ⴟ', &['Ⴟ']), ('ⴠ', &['Ⴠ']), ('ⴡ', &['Ⴡ']), + ('ⴢ', &['Ⴢ']), ('ⴣ', &['Ⴣ']), ('ⴤ', &['Ⴤ']), ('ⴥ', &['Ⴥ']), + ('ⴧ', &['Ⴧ']), ('ⴭ', &['Ⴭ']), ('Ꙁ', &['ꙁ']), ('ꙁ', &['Ꙁ']), + ('Ꙃ', &['ꙃ']), ('ꙃ', &['Ꙃ']), ('Ꙅ', &['ꙅ']), ('ꙅ', &['Ꙅ']), + ('Ꙇ', &['ꙇ']), ('ꙇ', &['Ꙇ']), ('Ꙉ', &['ꙉ']), ('ꙉ', &['Ꙉ']), + ('Ꙋ', &['ᲈ', 'ꙋ', ]), ('ꙋ', &['ᲈ', 'Ꙋ', ]), ('Ꙍ', &['ꙍ']), + ('ꙍ', &['Ꙍ']), ('Ꙏ', &['ꙏ']), ('ꙏ', &['Ꙏ']), ('Ꙑ', &['ꙑ']), + ('ꙑ', &['Ꙑ']), ('Ꙓ', &['ꙓ']), ('ꙓ', &['Ꙓ']), ('Ꙕ', &['ꙕ']), + ('ꙕ', &['Ꙕ']), ('Ꙗ', &['ꙗ']), ('ꙗ', &['Ꙗ']), ('Ꙙ', &['ꙙ']), + ('ꙙ', &['Ꙙ']), ('Ꙛ', &['ꙛ']), ('ꙛ', &['Ꙛ']), ('Ꙝ', &['ꙝ']), + ('ꙝ', &['Ꙝ']), ('Ꙟ', &['ꙟ']), ('ꙟ', &['Ꙟ']), ('Ꙡ', &['ꙡ']), + ('ꙡ', &['Ꙡ']), ('Ꙣ', &['ꙣ']), ('ꙣ', &['Ꙣ']), ('Ꙥ', &['ꙥ']), + ('ꙥ', &['Ꙥ']), ('Ꙧ', &['ꙧ']), ('ꙧ', &['Ꙧ']), ('Ꙩ', &['ꙩ']), + ('ꙩ', &['Ꙩ']), ('Ꙫ', &['ꙫ']), ('ꙫ', &['Ꙫ']), ('Ꙭ', &['ꙭ']), + ('ꙭ', &['Ꙭ']), ('Ꚁ', &['ꚁ']), ('ꚁ', &['Ꚁ']), ('Ꚃ', &['ꚃ']), + ('ꚃ', &['Ꚃ']), ('Ꚅ', &['ꚅ']), ('ꚅ', &['Ꚅ']), ('Ꚇ', &['ꚇ']), + ('ꚇ', &['Ꚇ']), ('Ꚉ', &['ꚉ']), ('ꚉ', &['Ꚉ']), ('Ꚋ', &['ꚋ']), + ('ꚋ', &['Ꚋ']), ('Ꚍ', &['ꚍ']), ('ꚍ', &['Ꚍ']), ('Ꚏ', &['ꚏ']), + ('ꚏ', &['Ꚏ']), ('Ꚑ', &['ꚑ']), ('ꚑ', &['Ꚑ']), ('Ꚓ', &['ꚓ']), + ('ꚓ', &['Ꚓ']), ('Ꚕ', &['ꚕ']), ('ꚕ', &['Ꚕ']), ('Ꚗ', &['ꚗ']), + ('ꚗ', &['Ꚗ']), ('Ꚙ', &['ꚙ']), ('ꚙ', &['Ꚙ']), ('Ꚛ', &['ꚛ']), + ('ꚛ', &['Ꚛ']), ('Ꜣ', &['ꜣ']), ('ꜣ', &['Ꜣ']), ('Ꜥ', &['ꜥ']), + ('ꜥ', &['Ꜥ']), ('Ꜧ', &['ꜧ']), ('ꜧ', &['Ꜧ']), ('Ꜩ', &['ꜩ']), + ('ꜩ', &['Ꜩ']), ('Ꜫ', &['ꜫ']), ('ꜫ', &['Ꜫ']), ('Ꜭ', &['ꜭ']), + ('ꜭ', &['Ꜭ']), ('Ꜯ', &['ꜯ']), ('ꜯ', &['Ꜯ']), ('Ꜳ', &['ꜳ']), + ('ꜳ', &['Ꜳ']), ('Ꜵ', &['ꜵ']), ('ꜵ', &['Ꜵ']), ('Ꜷ', &['ꜷ']), + ('ꜷ', &['Ꜷ']), ('Ꜹ', &['ꜹ']), ('ꜹ', &['Ꜹ']), ('Ꜻ', &['ꜻ']), + ('ꜻ', &['Ꜻ']), ('Ꜽ', &['ꜽ']), ('ꜽ', &['Ꜽ']), ('Ꜿ', &['ꜿ']), + ('ꜿ', &['Ꜿ']), ('Ꝁ', &['ꝁ']), ('ꝁ', &['Ꝁ']), ('Ꝃ', &['ꝃ']), + 
('ꝃ', &['Ꝃ']), ('Ꝅ', &['ꝅ']), ('ꝅ', &['Ꝅ']), ('Ꝇ', &['ꝇ']), + ('ꝇ', &['Ꝇ']), ('Ꝉ', &['ꝉ']), ('ꝉ', &['Ꝉ']), ('Ꝋ', &['ꝋ']), + ('ꝋ', &['Ꝋ']), ('Ꝍ', &['ꝍ']), ('ꝍ', &['Ꝍ']), ('Ꝏ', &['ꝏ']), + ('ꝏ', &['Ꝏ']), ('Ꝑ', &['ꝑ']), ('ꝑ', &['Ꝑ']), ('Ꝓ', &['ꝓ']), + ('ꝓ', &['Ꝓ']), ('Ꝕ', &['ꝕ']), ('ꝕ', &['Ꝕ']), ('Ꝗ', &['ꝗ']), + ('ꝗ', &['Ꝗ']), ('Ꝙ', &['ꝙ']), ('ꝙ', &['Ꝙ']), ('Ꝛ', &['ꝛ']), + ('ꝛ', &['Ꝛ']), ('Ꝝ', &['ꝝ']), ('ꝝ', &['Ꝝ']), ('Ꝟ', &['ꝟ']), + ('ꝟ', &['Ꝟ']), ('Ꝡ', &['ꝡ']), ('ꝡ', &['Ꝡ']), ('Ꝣ', &['ꝣ']), + ('ꝣ', &['Ꝣ']), ('Ꝥ', &['ꝥ']), ('ꝥ', &['Ꝥ']), ('Ꝧ', &['ꝧ']), + ('ꝧ', &['Ꝧ']), ('Ꝩ', &['ꝩ']), ('ꝩ', &['Ꝩ']), ('Ꝫ', &['ꝫ']), + ('ꝫ', &['Ꝫ']), ('Ꝭ', &['ꝭ']), ('ꝭ', &['Ꝭ']), ('Ꝯ', &['ꝯ']), + ('ꝯ', &['Ꝯ']), ('Ꝺ', &['ꝺ']), ('ꝺ', &['Ꝺ']), ('Ꝼ', &['ꝼ']), + ('ꝼ', &['Ꝼ']), ('Ᵹ', &['ᵹ']), ('Ꝿ', &['ꝿ']), ('ꝿ', &['Ꝿ']), + ('Ꞁ', &['ꞁ']), ('ꞁ', &['Ꞁ']), ('Ꞃ', &['ꞃ']), ('ꞃ', &['Ꞃ']), + ('Ꞅ', &['ꞅ']), ('ꞅ', &['Ꞅ']), ('Ꞇ', &['ꞇ']), ('ꞇ', &['Ꞇ']), + ('Ꞌ', &['ꞌ']), ('ꞌ', &['Ꞌ']), ('Ɥ', &['ɥ']), ('Ꞑ', &['ꞑ']), + ('ꞑ', &['Ꞑ']), ('Ꞓ', &['ꞓ']), ('ꞓ', &['Ꞓ']), ('Ꞗ', &['ꞗ']), + ('ꞗ', &['Ꞗ']), ('Ꞙ', &['ꞙ']), ('ꞙ', &['Ꞙ']), ('Ꞛ', &['ꞛ']), + ('ꞛ', &['Ꞛ']), ('Ꞝ', &['ꞝ']), ('ꞝ', &['Ꞝ']), ('Ꞟ', &['ꞟ']), + ('ꞟ', &['Ꞟ']), ('Ꞡ', &['ꞡ']), ('ꞡ', &['Ꞡ']), ('Ꞣ', &['ꞣ']), + ('ꞣ', &['Ꞣ']), ('Ꞥ', &['ꞥ']), ('ꞥ', &['Ꞥ']), ('Ꞧ', &['ꞧ']), + ('ꞧ', &['Ꞧ']), ('Ꞩ', &['ꞩ']), ('ꞩ', &['Ꞩ']), ('Ɦ', &['ɦ']), + ('Ɜ', &['ɜ']), ('Ɡ', &['ɡ']), ('Ɬ', &['ɬ']), ('Ɪ', &['ɪ']), + ('Ʞ', &['ʞ']), ('Ʇ', &['ʇ']), ('Ʝ', &['ʝ']), ('Ꭓ', &['ꭓ']), + ('Ꞵ', &['ꞵ']), ('ꞵ', &['Ꞵ']), ('Ꞷ', &['ꞷ']), ('ꞷ', &['Ꞷ']), + ('ꭓ', &['Ꭓ']), ('ꭰ', &['Ꭰ']), ('ꭱ', &['Ꭱ']), ('ꭲ', &['Ꭲ']), + ('ꭳ', &['Ꭳ']), ('ꭴ', &['Ꭴ']), ('ꭵ', &['Ꭵ']), ('ꭶ', &['Ꭶ']), + ('ꭷ', &['Ꭷ']), ('ꭸ', &['Ꭸ']), ('ꭹ', &['Ꭹ']), ('ꭺ', &['Ꭺ']), + ('ꭻ', &['Ꭻ']), ('ꭼ', &['Ꭼ']), ('ꭽ', &['Ꭽ']), ('ꭾ', &['Ꭾ']), + ('ꭿ', &['Ꭿ']), ('ꮀ', &['Ꮀ']), ('ꮁ', &['Ꮁ']), ('ꮂ', &['Ꮂ']), + ('ꮃ', &['Ꮃ']), ('ꮄ', &['Ꮄ']), ('ꮅ', &['Ꮅ']), ('ꮆ', &['Ꮆ']), + ('ꮇ', &['Ꮇ']), ('ꮈ', &['Ꮈ']), ('ꮉ', &['Ꮉ']), ('ꮊ', &['Ꮊ']), + ('ꮋ', &['Ꮋ']), 
('ꮌ', &['Ꮌ']), ('ꮍ', &['Ꮍ']), ('ꮎ', &['Ꮎ']), + ('ꮏ', &['Ꮏ']), ('ꮐ', &['Ꮐ']), ('ꮑ', &['Ꮑ']), ('ꮒ', &['Ꮒ']), + ('ꮓ', &['Ꮓ']), ('ꮔ', &['Ꮔ']), ('ꮕ', &['Ꮕ']), ('ꮖ', &['Ꮖ']), + ('ꮗ', &['Ꮗ']), ('ꮘ', &['Ꮘ']), ('ꮙ', &['Ꮙ']), ('ꮚ', &['Ꮚ']), + ('ꮛ', &['Ꮛ']), ('ꮜ', &['Ꮜ']), ('ꮝ', &['Ꮝ']), ('ꮞ', &['Ꮞ']), + ('ꮟ', &['Ꮟ']), ('ꮠ', &['Ꮠ']), ('ꮡ', &['Ꮡ']), ('ꮢ', &['Ꮢ']), + ('ꮣ', &['Ꮣ']), ('ꮤ', &['Ꮤ']), ('ꮥ', &['Ꮥ']), ('ꮦ', &['Ꮦ']), + ('ꮧ', &['Ꮧ']), ('ꮨ', &['Ꮨ']), ('ꮩ', &['Ꮩ']), ('ꮪ', &['Ꮪ']), + ('ꮫ', &['Ꮫ']), ('ꮬ', &['Ꮬ']), ('ꮭ', &['Ꮭ']), ('ꮮ', &['Ꮮ']), + ('ꮯ', &['Ꮯ']), ('ꮰ', &['Ꮰ']), ('ꮱ', &['Ꮱ']), ('ꮲ', &['Ꮲ']), + ('ꮳ', &['Ꮳ']), ('ꮴ', &['Ꮴ']), ('ꮵ', &['Ꮵ']), ('ꮶ', &['Ꮶ']), + ('ꮷ', &['Ꮷ']), ('ꮸ', &['Ꮸ']), ('ꮹ', &['Ꮹ']), ('ꮺ', &['Ꮺ']), + ('ꮻ', &['Ꮻ']), ('ꮼ', &['Ꮼ']), ('ꮽ', &['Ꮽ']), ('ꮾ', &['Ꮾ']), + ('ꮿ', &['Ꮿ']), ('A', &['a']), ('B', &['b']), ('C', &['c']), + ('D', &['d']), ('E', &['e']), ('F', &['f']), ('G', &['g']), + ('H', &['h']), ('I', &['i']), ('J', &['j']), ('K', &['k']), + ('L', &['l']), ('M', &['m']), ('N', &['n']), ('O', &['o']), + ('P', &['p']), ('Q', &['q']), ('R', &['r']), ('S', &['s']), + ('T', &['t']), ('U', &['u']), ('V', &['v']), ('W', &['w']), + ('X', &['x']), ('Y', &['y']), ('Z', &['z']), ('a', &['A']), + ('b', &['B']), ('c', &['C']), ('d', &['D']), ('e', &['E']), + ('f', &['F']), ('g', &['G']), ('h', &['H']), ('i', &['I']), + ('j', &['J']), ('k', &['K']), ('l', &['L']), ('m', &['M']), + ('n', &['N']), ('o', &['O']), ('p', &['P']), ('q', &['Q']), + ('r', &['R']), ('s', &['S']), ('t', &['T']), ('u', &['U']), + ('v', &['V']), ('w', &['W']), ('x', &['X']), ('y', &['Y']), + ('z', &['Z']), ('𐐀', &['𐐨']), ('𐐁', &['𐐩']), ('𐐂', &[ + '𐐪']), ('𐐃', &['𐐫']), ('𐐄', &['𐐬']), ('𐐅', &['𐐭']), + ('𐐆', &['𐐮']), ('𐐇', &['𐐯']), ('𐐈', &['𐐰']), ('𐐉', &[ + '𐐱']), ('𐐊', &['𐐲']), ('𐐋', &['𐐳']), ('𐐌', &['𐐴']), + ('𐐍', &['𐐵']), ('𐐎', &['𐐶']), ('𐐏', &['𐐷']), ('𐐐', &[ + '𐐸']), ('𐐑', &['𐐹']), ('𐐒', &['𐐺']), ('𐐓', &['𐐻']), + ('𐐔', &['𐐼']), ('𐐕', &['𐐽']), ('𐐖', &['𐐾']), ('𐐗', &[ + '𐐿']), ('𐐘', 
&['𐑀']), ('𐐙', &['𐑁']), ('𐐚', &['𐑂']), + ('𐐛', &['𐑃']), ('𐐜', &['𐑄']), ('𐐝', &['𐑅']), ('𐐞', &[ + '𐑆']), ('𐐟', &['𐑇']), ('𐐠', &['𐑈']), ('𐐡', &['𐑉']), + ('𐐢', &['𐑊']), ('𐐣', &['𐑋']), ('𐐤', &['𐑌']), ('𐐥', &[ + '𐑍']), ('𐐦', &['𐑎']), ('𐐧', &['𐑏']), ('𐐨', &['𐐀']), + ('𐐩', &['𐐁']), ('𐐪', &['𐐂']), ('𐐫', &['𐐃']), ('𐐬', &[ + '𐐄']), ('𐐭', &['𐐅']), ('𐐮', &['𐐆']), ('𐐯', &['𐐇']), + ('𐐰', &['𐐈']), ('𐐱', &['𐐉']), ('𐐲', &['𐐊']), ('𐐳', &[ + '𐐋']), ('𐐴', &['𐐌']), ('𐐵', &['𐐍']), ('𐐶', &['𐐎']), + ('𐐷', &['𐐏']), ('𐐸', &['𐐐']), ('𐐹', &['𐐑']), ('𐐺', &[ + '𐐒']), ('𐐻', &['𐐓']), ('𐐼', &['𐐔']), ('𐐽', &['𐐕']), + ('𐐾', &['𐐖']), ('𐐿', &['𐐗']), ('𐑀', &['𐐘']), ('𐑁', &[ + '𐐙']), ('𐑂', &['𐐚']), ('𐑃', &['𐐛']), ('𐑄', &['𐐜']), + ('𐑅', &['𐐝']), ('𐑆', &['𐐞']), ('𐑇', &['𐐟']), ('𐑈', &[ + '𐐠']), ('𐑉', &['𐐡']), ('𐑊', &['𐐢']), ('𐑋', &['𐐣']), + ('𐑌', &['𐐤']), ('𐑍', &['𐐥']), ('𐑎', &['𐐦']), ('𐑏', &[ + '𐐧']), ('𐒰', &['𐓘']), ('𐒱', &['𐓙']), ('𐒲', &['𐓚']), + ('𐒳', &['𐓛']), ('𐒴', &['𐓜']), ('𐒵', &['𐓝']), ('𐒶', &[ + '𐓞']), ('𐒷', &['𐓟']), ('𐒸', &['𐓠']), ('𐒹', &['𐓡']), + ('𐒺', &['𐓢']), ('𐒻', &['𐓣']), ('𐒼', &['𐓤']), ('𐒽', &[ + '𐓥']), ('𐒾', &['𐓦']), ('𐒿', &['𐓧']), ('𐓀', &['𐓨']), + ('𐓁', &['𐓩']), ('𐓂', &['𐓪']), ('𐓃', &['𐓫']), ('𐓄', &[ + '𐓬']), ('𐓅', &['𐓭']), ('𐓆', &['𐓮']), ('𐓇', &['𐓯']), + ('𐓈', &['𐓰']), ('𐓉', &['𐓱']), ('𐓊', &['𐓲']), ('𐓋', &[ + '𐓳']), ('𐓌', &['𐓴']), ('𐓍', &['𐓵']), ('𐓎', &['𐓶']), + ('𐓏', &['𐓷']), ('𐓐', &['𐓸']), ('𐓑', &['𐓹']), ('𐓒', &[ + '𐓺']), ('𐓓', &['𐓻']), ('𐓘', &['𐒰']), ('𐓙', &['𐒱']), + ('𐓚', &['𐒲']), ('𐓛', &['𐒳']), ('𐓜', &['𐒴']), ('𐓝', &[ + '𐒵']), ('𐓞', &['𐒶']), ('𐓟', &['𐒷']), ('𐓠', &['𐒸']), + ('𐓡', &['𐒹']), ('𐓢', &['𐒺']), ('𐓣', &['𐒻']), ('𐓤', &[ + '𐒼']), ('𐓥', &['𐒽']), ('𐓦', &['𐒾']), ('𐓧', &['𐒿']), + ('𐓨', &['𐓀']), ('𐓩', &['𐓁']), ('𐓪', &['𐓂']), ('𐓫', &[ + '𐓃']), ('𐓬', &['𐓄']), ('𐓭', &['𐓅']), ('𐓮', &['𐓆']), + ('𐓯', &['𐓇']), ('𐓰', &['𐓈']), ('𐓱', &['𐓉']), ('𐓲', &[ + '𐓊']), ('𐓳', &['𐓋']), ('𐓴', &['𐓌']), ('𐓵', &['𐓍']), + ('𐓶', &['𐓎']), ('𐓷', &['𐓏']), ('𐓸', &['𐓐']), ('𐓹', &[ + '𐓑']), ('𐓺', &['𐓒']), ('𐓻', 
&['𐓓']), ('𐲀', &['𐳀']), + ('𐲁', &['𐳁']), ('𐲂', &['𐳂']), ('𐲃', &['𐳃']), ('𐲄', &[ + '𐳄']), ('𐲅', &['𐳅']), ('𐲆', &['𐳆']), ('𐲇', &['𐳇']), + ('𐲈', &['𐳈']), ('𐲉', &['𐳉']), ('𐲊', &['𐳊']), ('𐲋', &[ + '𐳋']), ('𐲌', &['𐳌']), ('𐲍', &['𐳍']), ('𐲎', &['𐳎']), + ('𐲏', &['𐳏']), ('𐲐', &['𐳐']), ('𐲑', &['𐳑']), ('𐲒', &[ + '𐳒']), ('𐲓', &['𐳓']), ('𐲔', &['𐳔']), ('𐲕', &['𐳕']), + ('𐲖', &['𐳖']), ('𐲗', &['𐳗']), ('𐲘', &['𐳘']), ('𐲙', &[ + '𐳙']), ('𐲚', &['𐳚']), ('𐲛', &['𐳛']), ('𐲜', &['𐳜']), + ('𐲝', &['𐳝']), ('𐲞', &['𐳞']), ('𐲟', &['𐳟']), ('𐲠', &[ + '𐳠']), ('𐲡', &['𐳡']), ('𐲢', &['𐳢']), ('𐲣', &['𐳣']), + ('𐲤', &['𐳤']), ('𐲥', &['𐳥']), ('𐲦', &['𐳦']), ('𐲧', &[ + '𐳧']), ('𐲨', &['𐳨']), ('𐲩', &['𐳩']), ('𐲪', &['𐳪']), + ('𐲫', &['𐳫']), ('𐲬', &['𐳬']), ('𐲭', &['𐳭']), ('𐲮', &[ + '𐳮']), ('𐲯', &['𐳯']), ('𐲰', &['𐳰']), ('𐲱', &['𐳱']), + ('𐲲', &['𐳲']), ('𐳀', &['𐲀']), ('𐳁', &['𐲁']), ('𐳂', &[ + '𐲂']), ('𐳃', &['𐲃']), ('𐳄', &['𐲄']), ('𐳅', &['𐲅']), + ('𐳆', &['𐲆']), ('𐳇', &['𐲇']), ('𐳈', &['𐲈']), ('𐳉', &[ + '𐲉']), ('𐳊', &['𐲊']), ('𐳋', &['𐲋']), ('𐳌', &['𐲌']), + ('𐳍', &['𐲍']), ('𐳎', &['𐲎']), ('𐳏', &['𐲏']), ('𐳐', &[ + '𐲐']), ('𐳑', &['𐲑']), ('𐳒', &['𐲒']), ('𐳓', &['𐲓']), + ('𐳔', &['𐲔']), ('𐳕', &['𐲕']), ('𐳖', &['𐲖']), ('𐳗', &[ + '𐲗']), ('𐳘', &['𐲘']), ('𐳙', &['𐲙']), ('𐳚', &['𐲚']), + ('𐳛', &['𐲛']), ('𐳜', &['𐲜']), ('𐳝', &['𐲝']), ('𐳞', &[ + '𐲞']), ('𐳟', &['𐲟']), ('𐳠', &['𐲠']), ('𐳡', &['𐲡']), + ('𐳢', &['𐲢']), ('𐳣', &['𐲣']), ('𐳤', &['𐲤']), ('𐳥', &[ + '𐲥']), ('𐳦', &['𐲦']), ('𐳧', &['𐲧']), ('𐳨', &['𐲨']), + ('𐳩', &['𐲩']), ('𐳪', &['𐲪']), ('𐳫', &['𐲫']), ('𐳬', &[ + '𐲬']), ('𐳭', &['𐲭']), ('𐳮', &['𐲮']), ('𐳯', &['𐲯']), + ('𐳰', &['𐲰']), ('𐳱', &['𐲱']), ('𐳲', &['𐲲']), ('𑢠', &[ + '𑣀']), ('𑢡', &['𑣁']), ('𑢢', &['𑣂']), ('𑢣', &['𑣃']), + ('𑢤', &['𑣄']), ('𑢥', &['𑣅']), ('𑢦', &['𑣆']), ('𑢧', &[ + '𑣇']), ('𑢨', &['𑣈']), ('𑢩', &['𑣉']), ('𑢪', &['𑣊']), + ('𑢫', &['𑣋']), ('𑢬', &['𑣌']), ('𑢭', &['𑣍']), ('𑢮', &[ + '𑣎']), ('𑢯', &['𑣏']), ('𑢰', &['𑣐']), ('𑢱', &['𑣑']), + ('𑢲', &['𑣒']), ('𑢳', &['𑣓']), ('𑢴', &['𑣔']), ('𑢵', &[ + '𑣕']), ('𑢶', &['𑣖']), ('𑢷', &['𑣗']), ('𑢸', 
&['𑣘']), + ('𑢹', &['𑣙']), ('𑢺', &['𑣚']), ('𑢻', &['𑣛']), ('𑢼', &[ + '𑣜']), ('𑢽', &['𑣝']), ('𑢾', &['𑣞']), ('𑢿', &['𑣟']), + ('𑣀', &['𑢠']), ('𑣁', &['𑢡']), ('𑣂', &['𑢢']), ('𑣃', &[ + '𑢣']), ('𑣄', &['𑢤']), ('𑣅', &['𑢥']), ('𑣆', &['𑢦']), + ('𑣇', &['𑢧']), ('𑣈', &['𑢨']), ('𑣉', &['𑢩']), ('𑣊', &[ + '𑢪']), ('𑣋', &['𑢫']), ('𑣌', &['𑢬']), ('𑣍', &['𑢭']), + ('𑣎', &['𑢮']), ('𑣏', &['𑢯']), ('𑣐', &['𑢰']), ('𑣑', &[ + '𑢱']), ('𑣒', &['𑢲']), ('𑣓', &['𑢳']), ('𑣔', &['𑢴']), + ('𑣕', &['𑢵']), ('𑣖', &['𑢶']), ('𑣗', &['𑢷']), ('𑣘', &[ + '𑢸']), ('𑣙', &['𑢹']), ('𑣚', &['𑢺']), ('𑣛', &['𑢻']), + ('𑣜', &['𑢼']), ('𑣝', &['𑢽']), ('𑣞', &['𑢾']), ('𑣟', &[ + '𑢿']), ('𞤀', &['𞤢']), ('𞤁', &['𞤣']), ('𞤂', &['𞤤']), + ('𞤃', &['𞤥']), ('𞤄', &['𞤦']), ('𞤅', &['𞤧']), ('𞤆', &[ + '𞤨']), ('𞤇', &['𞤩']), ('𞤈', &['𞤪']), ('𞤉', &['𞤫']), + ('𞤊', &['𞤬']), ('𞤋', &['𞤭']), ('𞤌', &['𞤮']), ('𞤍', &[ + '𞤯']), ('𞤎', &['𞤰']), ('𞤏', &['𞤱']), ('𞤐', &['𞤲']), + ('𞤑', &['𞤳']), ('𞤒', &['𞤴']), ('𞤓', &['𞤵']), ('𞤔', &[ + '𞤶']), ('𞤕', &['𞤷']), ('𞤖', &['𞤸']), ('𞤗', &['𞤹']), + ('𞤘', &['𞤺']), ('𞤙', &['𞤻']), ('𞤚', &['𞤼']), ('𞤛', &[ + '𞤽']), ('𞤜', &['𞤾']), ('𞤝', &['𞤿']), ('𞤞', &['𞥀']), + ('𞤟', &['𞥁']), ('𞤠', &['𞥂']), ('𞤡', &['𞥃']), ('𞤢', &[ + '𞤀']), ('𞤣', &['𞤁']), ('𞤤', &['𞤂']), ('𞤥', &['𞤃']), + ('𞤦', &['𞤄']), ('𞤧', &['𞤅']), ('𞤨', &['𞤆']), ('𞤩', &[ + '𞤇']), ('𞤪', &['𞤈']), ('𞤫', &['𞤉']), ('𞤬', &['𞤊']), + ('𞤭', &['𞤋']), ('𞤮', &['𞤌']), ('𞤯', &['𞤍']), ('𞤰', &[ + '𞤎']), ('𞤱', &['𞤏']), ('𞤲', &['𞤐']), ('𞤳', &['𞤑']), + ('𞤴', &['𞤒']), ('𞤵', &['𞤓']), ('𞤶', &['𞤔']), ('𞤷', &[ + '𞤕']), ('𞤸', &['𞤖']), ('𞤹', &['𞤗']), ('𞤺', &['𞤘']), + ('𞤻', &['𞤙']), ('𞤼', &['𞤚']), ('𞤽', &['𞤛']), ('𞤾', &[ + '𞤜']), ('𞤿', &['𞤝']), ('𞥀', &['𞤞']), ('𞥁', &['𞤟']), + ('𞥂', &['𞤠']), ('𞥃', &['𞤡']), +]; diff --git a/regex-syntax-2/src/unicode_tables/general_category.rs b/regex-syntax-2/src/unicode_tables/general_category.rs new file mode 100644 index 0000000000..451a0b27c7 --- /dev/null +++ b/regex-syntax-2/src/unicode_tables/general_category.rs @@ -0,0 +1,1844 @@ +// DO NOT EDIT THIS FILE. 
IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate general-category tmp/ucd-10.0.0/ --chars --exclude surrogate +// +// ucd-generate is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("Cased_Letter", CASED_LETTER), ("Close_Punctuation", CLOSE_PUNCTUATION), + ("Connector_Punctuation", CONNECTOR_PUNCTUATION), ("Control", CONTROL), + ("Currency_Symbol", CURRENCY_SYMBOL), + ("Dash_Punctuation", DASH_PUNCTUATION), ("Decimal_Number", DECIMAL_NUMBER), + ("Enclosing_Mark", ENCLOSING_MARK), + ("Final_Punctuation", FINAL_PUNCTUATION), ("Format", FORMAT), + ("Initial_Punctuation", INITIAL_PUNCTUATION), ("Letter", LETTER), + ("Letter_Number", LETTER_NUMBER), ("Line_Separator", LINE_SEPARATOR), + ("Lowercase_Letter", LOWERCASE_LETTER), ("Mark", MARK), + ("Math_Symbol", MATH_SYMBOL), ("Modifier_Letter", MODIFIER_LETTER), + ("Modifier_Symbol", MODIFIER_SYMBOL), ("Nonspacing_Mark", NONSPACING_MARK), + ("Number", NUMBER), ("Open_Punctuation", OPEN_PUNCTUATION), + ("Other", OTHER), ("Other_Letter", OTHER_LETTER), + ("Other_Number", OTHER_NUMBER), ("Other_Punctuation", OTHER_PUNCTUATION), + ("Other_Symbol", OTHER_SYMBOL), + ("Paragraph_Separator", PARAGRAPH_SEPARATOR), ("Private_Use", PRIVATE_USE), + ("Punctuation", PUNCTUATION), ("Separator", SEPARATOR), + ("Space_Separator", SPACE_SEPARATOR), ("Spacing_Mark", SPACING_MARK), + ("Symbol", SYMBOL), ("Titlecase_Letter", TITLECASE_LETTER), + ("Unassigned", UNASSIGNED), ("Uppercase_Letter", UPPERCASE_LETTER), +]; + +pub const CASED_LETTER: &'static [(char, char)] = &[ + ('A', 'Z'), ('a', 'z'), ('µ', 'µ'), ('À', 'Ö'), ('Ø', 'ö'), + ('ø', 'ƺ'), ('Ƽ', 'ƿ'), ('DŽ', 'ʓ'), ('ʕ', 'ʯ'), ('Ͱ', 'ͳ'), + ('Ͷ', 'ͷ'), ('ͻ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), + ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), ('ա', 'և'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), + ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('ᴀ', 'ᴫ'), + ('ᵫ', 'ᵷ'), ('ᵹ', 'ᶚ'), ('Ḁ', 'ἕ'), ('Ἐ', 'Ἕ'), + 
('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), + ('ℨ', 'ℨ'), ('K', 'ℭ'), ('ℯ', 'ℴ'), ('ℹ', 'ℹ'), + ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ↄ', 'ↄ'), + ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), ('Ⱡ', 'ⱻ'), ('Ȿ', 'ⳤ'), + ('Ⳬ', 'ⳮ'), ('Ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), ('Ꙁ', 'ꙭ'), ('Ꚁ', 'ꚛ'), ('Ꜣ', 'ꝯ'), + ('ꝱ', 'ꞇ'), ('Ꞌ', 'ꞎ'), ('Ꞑ', 'Ɪ'), ('Ʞ', 'ꞷ'), + ('ꟺ', 'ꟺ'), ('ꬰ', 'ꭚ'), ('ꭠ', 'ꭥ'), ('ꭰ', 'ꮿ'), + ('ff', 'st'), ('ﬓ', 'ﬗ'), ('A', 'Z'), ('a', 'z'), + ('𐐀', '𐑏'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐲀', '𐲲'), + ('𐳀', '𐳲'), ('𑢠', '𑣟'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), + ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), + ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), + ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), + ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), + ('𝕒', '𝚥'), ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), + ('𝛼', '𝜔'), ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), + ('𝝰', '𝞈'), ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), + ('𞤀', '𞥃'), +]; + +pub const CLOSE_PUNCTUATION: &'static [(char, char)] = &[ + (')', ')'), (']', ']'), ('}', '}'), ('༻', '༻'), ('༽', '༽'), + ('᚜', '᚜'), ('⁆', '⁆'), ('⁾', '⁾'), ('₎', '₎'), + ('⌉', '⌉'), ('⌋', '⌋'), ('〉', '〉'), ('❩', '❩'), + ('❫', '❫'), ('❭', '❭'), ('❯', '❯'), ('❱', '❱'), + ('❳', '❳'), ('❵', '❵'), ('⟆', '⟆'), ('⟧', '⟧'), + ('⟩', '⟩'), ('⟫', '⟫'), ('⟭', '⟭'), ('⟯', '⟯'), + ('⦄', '⦄'), ('⦆', '⦆'), ('⦈', '⦈'), ('⦊', '⦊'), + ('⦌', '⦌'), ('⦎', '⦎'), ('⦐', '⦐'), ('⦒', '⦒'), + ('⦔', '⦔'), ('⦖', '⦖'), ('⦘', '⦘'), ('⧙', '⧙'), + ('⧛', '⧛'), ('⧽', '⧽'), ('⸣', '⸣'), ('⸥', '⸥'), + ('⸧', '⸧'), ('⸩', '⸩'), ('〉', '〉'), ('》', '》'), + ('」', '」'), ('』', '』'), ('】', '】'), ('〕', '〕'), + ('〗', '〗'), ('〙', '〙'), ('〛', '〛'), ('〞', '〟'), + ('﴾', '﴾'), ('︘', '︘'), ('︶', '︶'), ('︸', '︸'), + ('︺', '︺'), ('︼', '︼'), ('︾', '︾'), ('﹀', '﹀'), + ('﹂', 
'﹂'), ('﹄', '﹄'), ('﹈', '﹈'), ('﹚', '﹚'), + ('﹜', '﹜'), ('﹞', '﹞'), (')', ')'), (']', ']'), + ('}', '}'), ('⦆', '⦆'), ('」', '」'), +]; + +pub const CONNECTOR_PUNCTUATION: &'static [(char, char)] = &[ + ('_', '_'), ('‿', '⁀'), ('⁔', '⁔'), ('︳', '︴'), ('﹍', '﹏'), + ('_', '_'), +]; + +pub const CONTROL: &'static [(char, char)] = &[ + ('\u{0}', '\u{1f}'), ('\u{7f}', '\u{9f}'), +]; + +pub const CURRENCY_SYMBOL: &'static [(char, char)] = &[ + ('$', '$'), ('¢', '¥'), ('֏', '֏'), ('؋', '؋'), ('৲', '৳'), + ('৻', '৻'), ('૱', '૱'), ('௹', '௹'), ('฿', '฿'), + ('៛', '៛'), ('₠', '₿'), ('꠸', '꠸'), ('﷼', '﷼'), + ('﹩', '﹩'), ('$', '$'), ('¢', '£'), ('¥', '₩'), +]; + +pub const DASH_PUNCTUATION: &'static [(char, char)] = &[ + ('-', '-'), ('֊', '֊'), ('־', '־'), ('᐀', '᐀'), ('᠆', '᠆'), + ('‐', '―'), ('⸗', '⸗'), ('⸚', '⸚'), ('⸺', '⸻'), + ('⹀', '⹀'), ('〜', '〜'), ('〰', '〰'), ('゠', '゠'), + ('︱', '︲'), ('﹘', '﹘'), ('﹣', '﹣'), ('-', '-'), +]; + +pub const DECIMAL_NUMBER: &'static [(char, char)] = &[ + ('0', '9'), ('٠', '٩'), ('۰', '۹'), ('߀', '߉'), ('०', '९'), + ('০', '৯'), ('੦', '੯'), ('૦', '૯'), ('୦', '୯'), + ('௦', '௯'), ('౦', '౯'), ('೦', '೯'), ('൦', '൯'), + ('෦', '෯'), ('๐', '๙'), ('໐', '໙'), ('༠', '༩'), + ('၀', '၉'), ('႐', '႙'), ('០', '៩'), ('᠐', '᠙'), + ('᥆', '᥏'), ('᧐', '᧙'), ('᪀', '᪉'), ('᪐', '᪙'), + ('᭐', '᭙'), ('᮰', '᮹'), ('᱀', '᱉'), ('᱐', '᱙'), + ('꘠', '꘩'), ('꣐', '꣙'), ('꤀', '꤉'), ('꧐', '꧙'), + ('꧰', '꧹'), ('꩐', '꩙'), ('꯰', '꯹'), ('0', '9'), + ('𐒠', '𐒩'), ('𑁦', '𑁯'), ('𑃰', '𑃹'), ('𑄶', '𑄿'), + ('𑇐', '𑇙'), ('𑋰', '𑋹'), ('𑑐', '𑑙'), ('𑓐', '𑓙'), + ('𑙐', '𑙙'), ('𑛀', '𑛉'), ('𑜰', '𑜹'), ('𑣠', '𑣩'), + ('𑱐', '𑱙'), ('𑵐', '𑵙'), ('𖩠', '𖩩'), ('𖭐', '𖭙'), + ('𝟎', '𝟿'), ('𞥐', '𞥙'), +]; + +pub const ENCLOSING_MARK: &'static [(char, char)] = &[ + ('҈', '҉'), ('᪾', '᪾'), ('⃝', '⃠'), ('⃢', '⃤'), + ('꙰', '꙲'), +]; + +pub const FINAL_PUNCTUATION: &'static [(char, char)] = &[ + ('»', '»'), ('’', '’'), ('”', '”'), ('›', '›'), + ('⸃', '⸃'), ('⸅', '⸅'), ('⸊', '⸊'), ('⸍', '⸍'), + ('⸝', '⸝'), ('⸡', '⸡'), +]; + 
+pub const FORMAT: &'static [(char, char)] = &[ + ('\u{ad}', '\u{ad}'), ('\u{600}', '\u{605}'), ('\u{61c}', '\u{61c}'), + ('\u{6dd}', '\u{6dd}'), ('\u{70f}', '\u{70f}'), ('\u{8e2}', '\u{8e2}'), + ('\u{180e}', '\u{180e}'), ('\u{200b}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), ('\u{2060}', '\u{2064}'), + ('\u{2066}', '\u{206f}'), ('\u{feff}', '\u{feff}'), + ('\u{fff9}', '\u{fffb}'), ('\u{110bd}', '\u{110bd}'), + ('\u{1bca0}', '\u{1bca3}'), ('\u{1d173}', '\u{1d17a}'), + ('\u{e0001}', '\u{e0001}'), ('\u{e0020}', '\u{e007f}'), +]; + +pub const INITIAL_PUNCTUATION: &'static [(char, char)] = &[ + ('«', '«'), ('‘', '‘'), ('‛', '“'), ('‟', '‟'), + ('‹', '‹'), ('⸂', '⸂'), ('⸄', '⸄'), ('⸉', '⸉'), + ('⸌', '⸌'), ('⸜', '⸜'), ('⸠', '⸠'), +]; + +pub const LETTER: &'static [(char, char)] = &[ + ('A', 'Z'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('º', 'º'), + ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('Ͱ', 'ʹ'), ('Ͷ', 'ͷ'), ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('Ҋ', 'ԯ'), ('Ա', 'Ֆ'), ('ՙ', 'ՙ'), + ('ա', 'և'), ('א', 'ת'), ('װ', 'ײ'), ('ؠ', 'ي'), ('ٮ', 'ٯ'), + ('ٱ', 'ۓ'), ('ە', 'ە'), ('ۥ', 'ۦ'), ('ۮ', 'ۯ'), ('ۺ', 'ۼ'), + ('ۿ', 'ۿ'), ('ܐ', 'ܐ'), ('ܒ', 'ܯ'), ('ݍ', 'ޥ'), ('ޱ', 'ޱ'), + ('ߊ', 'ߪ'), ('ߴ', 'ߵ'), ('ߺ', 'ߺ'), ('ࠀ', 'ࠕ'), ('ࠚ', 'ࠚ'), + ('ࠤ', 'ࠤ'), ('ࠨ', 'ࠨ'), ('ࡀ', 'ࡘ'), ('ࡠ', 'ࡪ'), + ('ࢠ', 'ࢴ'), ('ࢶ', 'ࢽ'), ('ऄ', 'ह'), ('ऽ', 'ऽ'), + ('ॐ', 'ॐ'), ('क़', 'ॡ'), ('ॱ', 'ঀ'), ('অ', 'ঌ'), + ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), + ('শ', 'হ'), ('ঽ', 'ঽ'), ('ৎ', 'ৎ'), ('ড়', 'ঢ়'), + ('য়', 'ৡ'), ('ৰ', 'ৱ'), ('ৼ', 'ৼ'), ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), + ('ੲ', 'ੴ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), + ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('ઽ', 'ઽ'), + ('ૐ', 'ૐ'), ('ૠ', 'ૡ'), ('ૹ', 'ૹ'), ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), ('ଽ', 'ଽ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 
'ୡ'), + ('ୱ', 'ୱ'), ('ஃ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), + ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), + ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('ௐ', 'ௐ'), + ('అ', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), + ('ఽ', 'ఽ'), ('ౘ', 'ౚ'), ('ౠ', 'ౡ'), ('ಀ', 'ಀ'), + ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), ('ಽ', 'ಽ'), ('ೞ', 'ೞ'), ('ೠ', 'ೡ'), + ('ೱ', 'ೲ'), ('അ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ഺ'), + ('ഽ', 'ഽ'), ('ൎ', 'ൎ'), ('ൔ', 'ൖ'), ('ൟ', 'ൡ'), + ('ൺ', 'ൿ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), + ('ල', 'ල'), ('ව', 'ෆ'), ('ก', 'ะ'), ('า', 'ำ'), + ('เ', 'ๆ'), ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ງ', 'ຈ'), + ('ຊ', 'ຊ'), ('ຍ', 'ຍ'), ('ດ', 'ທ'), ('ນ', 'ຟ'), + ('ມ', 'ຣ'), ('ລ', 'ລ'), ('ວ', 'ວ'), ('ສ', 'ຫ'), + ('ອ', 'ະ'), ('າ', 'ຳ'), ('ຽ', 'ຽ'), ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('ཀ', 'ཇ'), + ('ཉ', 'ཬ'), ('ྈ', 'ྌ'), ('က', 'ဪ'), ('ဿ', 'ဿ'), + ('ၐ', 'ၕ'), ('ၚ', 'ၝ'), ('ၡ', 'ၡ'), ('ၥ', 'ၦ'), + ('ၮ', 'ၰ'), ('ၵ', 'ႁ'), ('ႎ', 'ႎ'), ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), + ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛱ', 'ᛸ'), ('ᜀ', 'ᜌ'), + ('ᜎ', 'ᜑ'), ('ᜠ', 'ᜱ'), ('ᝀ', 'ᝑ'), ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), ('ក', 'ឳ'), ('ៗ', 'ៗ'), ('ៜ', 'ៜ'), + ('ᠠ', 'ᡷ'), ('ᢀ', 'ᢄ'), ('ᢇ', 'ᢨ'), ('ᢪ', 'ᢪ'), + ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('ᨀ', 'ᨖ'), ('ᨠ', 'ᩔ'), + ('ᪧ', 'ᪧ'), ('ᬅ', 'ᬳ'), ('ᭅ', 'ᭋ'), ('ᮃ', 'ᮠ'), + ('ᮮ', 'ᮯ'), ('ᮺ', 'ᯥ'), ('ᰀ', 'ᰣ'), ('ᱍ', 'ᱏ'), + ('ᱚ', 'ᱽ'), ('ᲀ', 'ᲈ'), ('ᳩ', 'ᳬ'), ('ᳮ', 'ᳱ'), + ('ᳵ', 'ᳶ'), ('ᴀ', 'ᶿ'), ('Ḁ', 'ἕ'), ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 
'ₜ'), + ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), + ('K', 'ℭ'), ('ℯ', 'ℹ'), ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), ('Ↄ', 'ↄ'), ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), + ('Ⱡ', 'ⳤ'), ('Ⳬ', 'ⳮ'), ('Ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ⴰ', 'ⵧ'), ('ⵯ', 'ⵯ'), + ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), ('ⸯ', 'ⸯ'), ('々', '〆'), ('〱', '〵'), + ('〻', '〼'), ('ぁ', 'ゖ'), ('ゝ', 'ゟ'), ('ァ', 'ヺ'), + ('ー', 'ヿ'), ('ㄅ', 'ㄮ'), ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆺ'), + ('ㇰ', 'ㇿ'), ('㐀', '䶵'), ('一', '鿪'), ('ꀀ', 'ꒌ'), + ('ꓐ', 'ꓽ'), ('ꔀ', 'ꘌ'), ('ꘐ', 'ꘟ'), ('ꘪ', 'ꘫ'), + ('Ꙁ', 'ꙮ'), ('ꙿ', 'ꚝ'), ('ꚠ', 'ꛥ'), ('ꜗ', 'ꜟ'), + ('Ꜣ', 'ꞈ'), ('Ꞌ', 'Ɪ'), ('Ʞ', 'ꞷ'), ('ꟷ', 'ꠁ'), + ('ꠃ', 'ꠅ'), ('ꠇ', 'ꠊ'), ('ꠌ', 'ꠢ'), ('ꡀ', 'ꡳ'), + ('ꢂ', 'ꢳ'), ('ꣲ', 'ꣷ'), ('ꣻ', 'ꣻ'), ('ꣽ', 'ꣽ'), + ('ꤊ', 'ꤥ'), ('ꤰ', 'ꥆ'), ('ꥠ', 'ꥼ'), ('ꦄ', 'ꦲ'), + ('ꧏ', 'ꧏ'), ('ꧠ', 'ꧤ'), ('ꧦ', 'ꧯ'), ('ꧺ', 'ꧾ'), + ('ꨀ', 'ꨨ'), ('ꩀ', 'ꩂ'), ('ꩄ', 'ꩋ'), ('ꩠ', 'ꩶ'), + ('ꩺ', 'ꩺ'), ('ꩾ', 'ꪯ'), ('ꪱ', 'ꪱ'), ('ꪵ', 'ꪶ'), + ('ꪹ', 'ꪽ'), ('ꫀ', 'ꫀ'), ('ꫂ', 'ꫂ'), ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫪ'), ('ꫲ', 'ꫴ'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭥ'), ('ꭰ', 'ꯢ'), ('가', '힣'), ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), ('豈', '舘'), ('並', '龎'), ('ff', 'st'), + ('ﬓ', 'ﬗ'), ('יִ', 'יִ'), ('ײַ', 'ﬨ'), ('שׁ', 'זּ'), + ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), ('ﯓ', 'ﴽ'), ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), ('A', 'Z'), + ('a', 'z'), ('ヲ', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('𐀀', '𐀋'), ('𐀍', '𐀦'), + ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), ('𐁐', '𐁝'), + ('𐂀', '𐃺'), ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('𐌀', '𐌟'), + ('𐌭', '𐍀'), ('𐍂', '𐍉'), ('𐍐', '𐍵'), ('𐎀', '𐎝'), + ('𐎠', '𐏃'), ('𐏈', '𐏏'), ('𐐀', '𐒝'), ('𐒰', '𐓓'), + ('𐓘', '𐓻'), ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐘀', '𐜶'), + ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), + ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), + ('𐡠', '𐡶'), ('𐢀', '𐢞'), 
('𐣠', '𐣲'), ('𐣴', '𐣵'), + ('𐤀', '𐤕'), ('𐤠', '𐤹'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), + ('𐨀', '𐨀'), ('𐨐', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨳'), + ('𐩠', '𐩼'), ('𐪀', '𐪜'), ('𐫀', '𐫇'), ('𐫉', '𐫤'), + ('𐬀', '𐬵'), ('𐭀', '𐭕'), ('𐭠', '𐭲'), ('𐮀', '𐮑'), + ('𐰀', '𐱈'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𑀃', '𑀷'), + ('𑂃', '𑂯'), ('𑃐', '𑃨'), ('𑄃', '𑄦'), ('𑅐', '𑅲'), + ('𑅶', '𑅶'), ('𑆃', '𑆲'), ('𑇁', '𑇄'), ('𑇚', '𑇚'), + ('𑇜', '𑇜'), ('𑈀', '𑈑'), ('𑈓', '𑈫'), ('𑊀', '𑊆'), + ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊨'), + ('𑊰', '𑋞'), ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), + ('𑌪', '𑌰'), ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('𑌽', '𑌽'), + ('𑍐', '𑍐'), ('𑍝', '𑍡'), ('𑐀', '𑐴'), ('𑑇', '𑑊'), + ('𑒀', '𑒯'), ('𑓄', '𑓅'), ('𑓇', '𑓇'), ('𑖀', '𑖮'), + ('𑗘', '𑗛'), ('𑘀', '𑘯'), ('𑙄', '𑙄'), ('𑚀', '𑚪'), + ('𑜀', '𑜙'), ('𑢠', '𑣟'), ('𑣿', '𑣿'), ('𑨀', '𑨀'), + ('𑨋', '𑨲'), ('𑨺', '𑨺'), ('𑩐', '𑩐'), ('𑩜', '𑪃'), + ('𑪆', '𑪉'), ('𑫀', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '𑰮'), + ('𑱀', '𑱀'), ('𑱲', '𑲏'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), + ('𑴋', '𑴰'), ('𑵆', '𑵆'), ('𒀀', '𒎙'), ('𒒀', '𒕃'), + ('𓀀', '𓐮'), ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), + ('𖫐', '𖫭'), ('𖬀', '𖬯'), ('𖭀', '𖭃'), ('𖭣', '𖭷'), + ('𖭽', '𖮏'), ('𖼀', '𖽄'), ('𖽐', '𖽐'), ('𖾓', '𖾟'), + ('𖿠', '𖿡'), ('𗀀', '𘟬'), ('𘠀', '𘫲'), ('𛀀', '𛄞'), + ('𛅰', '𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), + ('𛲐', '𛲙'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), + ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), + ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), + ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), + ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), + ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), + ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), + ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𞠀', '𞣄'), + ('𞤀', '𞥃'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), + ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), + ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), + ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), + ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), + ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), + ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), + ('𞹾', '𞹾'), ('𞺀', '𞺉'), 
('𞺋', '𞺛'), ('𞺡', '𞺣'), + ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𠀀', '𪛖'), ('𪜀', '𫜴'), + ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), +]; + +pub const LETTER_NUMBER: &'static [(char, char)] = &[ + ('ᛮ', 'ᛰ'), ('Ⅰ', 'ↂ'), ('ↅ', 'ↈ'), ('〇', '〇'), + ('〡', '〩'), ('〸', '〺'), ('ꛦ', 'ꛯ'), ('𐅀', '𐅴'), + ('𐍁', '𐍁'), ('𐍊', '𐍊'), ('𐏑', '𐏕'), ('𒐀', '𒑮'), +]; + +pub const LINE_SEPARATOR: &'static [(char, char)] = &[ + ('\u{2028}', '\u{2028}'), +]; + +pub const LOWERCASE_LETTER: &'static [(char, char)] = &[ + ('a', 'z'), ('µ', 'µ'), ('ß', 'ö'), ('ø', 'ÿ'), ('ā', 'ā'), + ('ă', 'ă'), ('ą', 'ą'), ('ć', 'ć'), ('ĉ', 'ĉ'), ('ċ', 'ċ'), + ('č', 'č'), ('ď', 'ď'), ('đ', 'đ'), ('ē', 'ē'), ('ĕ', 'ĕ'), + ('ė', 'ė'), ('ę', 'ę'), ('ě', 'ě'), ('ĝ', 'ĝ'), ('ğ', 'ğ'), + ('ġ', 'ġ'), ('ģ', 'ģ'), ('ĥ', 'ĥ'), ('ħ', 'ħ'), ('ĩ', 'ĩ'), + ('ī', 'ī'), ('ĭ', 'ĭ'), ('į', 'į'), ('ı', 'ı'), ('ij', 'ij'), + ('ĵ', 'ĵ'), ('ķ', 'ĸ'), ('ĺ', 'ĺ'), ('ļ', 'ļ'), ('ľ', 'ľ'), + ('ŀ', 'ŀ'), ('ł', 'ł'), ('ń', 'ń'), ('ņ', 'ņ'), ('ň', 'ʼn'), + ('ŋ', 'ŋ'), ('ō', 'ō'), ('ŏ', 'ŏ'), ('ő', 'ő'), ('œ', 'œ'), + ('ŕ', 'ŕ'), ('ŗ', 'ŗ'), ('ř', 'ř'), ('ś', 'ś'), ('ŝ', 'ŝ'), + ('ş', 'ş'), ('š', 'š'), ('ţ', 'ţ'), ('ť', 'ť'), ('ŧ', 'ŧ'), + ('ũ', 'ũ'), ('ū', 'ū'), ('ŭ', 'ŭ'), ('ů', 'ů'), ('ű', 'ű'), + ('ų', 'ų'), ('ŵ', 'ŵ'), ('ŷ', 'ŷ'), ('ź', 'ź'), ('ż', 'ż'), + ('ž', 'ƀ'), ('ƃ', 'ƃ'), ('ƅ', 'ƅ'), ('ƈ', 'ƈ'), ('ƌ', 'ƍ'), + ('ƒ', 'ƒ'), ('ƕ', 'ƕ'), ('ƙ', 'ƛ'), ('ƞ', 'ƞ'), ('ơ', 'ơ'), + ('ƣ', 'ƣ'), ('ƥ', 'ƥ'), ('ƨ', 'ƨ'), ('ƪ', 'ƫ'), ('ƭ', 'ƭ'), + ('ư', 'ư'), ('ƴ', 'ƴ'), ('ƶ', 'ƶ'), ('ƹ', 'ƺ'), ('ƽ', 'ƿ'), + ('dž', 'dž'), ('lj', 'lj'), ('nj', 'nj'), ('ǎ', 'ǎ'), ('ǐ', 'ǐ'), + ('ǒ', 'ǒ'), ('ǔ', 'ǔ'), ('ǖ', 'ǖ'), ('ǘ', 'ǘ'), ('ǚ', 'ǚ'), + ('ǜ', 'ǝ'), ('ǟ', 'ǟ'), ('ǡ', 'ǡ'), ('ǣ', 'ǣ'), ('ǥ', 'ǥ'), + ('ǧ', 'ǧ'), ('ǩ', 'ǩ'), ('ǫ', 'ǫ'), ('ǭ', 'ǭ'), ('ǯ', 'ǰ'), + ('dz', 'dz'), ('ǵ', 'ǵ'), ('ǹ', 'ǹ'), ('ǻ', 'ǻ'), ('ǽ', 'ǽ'), + ('ǿ', 'ǿ'), ('ȁ', 'ȁ'), ('ȃ', 'ȃ'), ('ȅ', 'ȅ'), ('ȇ', 'ȇ'), + ('ȉ', 'ȉ'), ('ȋ', 'ȋ'), ('ȍ', 'ȍ'), ('ȏ', 'ȏ'), ('ȑ', 'ȑ'), + ('ȓ', 
'ȓ'), ('ȕ', 'ȕ'), ('ȗ', 'ȗ'), ('ș', 'ș'), ('ț', 'ț'), + ('ȝ', 'ȝ'), ('ȟ', 'ȟ'), ('ȡ', 'ȡ'), ('ȣ', 'ȣ'), ('ȥ', 'ȥ'), + ('ȧ', 'ȧ'), ('ȩ', 'ȩ'), ('ȫ', 'ȫ'), ('ȭ', 'ȭ'), ('ȯ', 'ȯ'), + ('ȱ', 'ȱ'), ('ȳ', 'ȹ'), ('ȼ', 'ȼ'), ('ȿ', 'ɀ'), ('ɂ', 'ɂ'), + ('ɇ', 'ɇ'), ('ɉ', 'ɉ'), ('ɋ', 'ɋ'), ('ɍ', 'ɍ'), ('ɏ', 'ʓ'), + ('ʕ', 'ʯ'), ('ͱ', 'ͱ'), ('ͳ', 'ͳ'), ('ͷ', 'ͷ'), ('ͻ', 'ͽ'), + ('ΐ', 'ΐ'), ('ά', 'ώ'), ('ϐ', 'ϑ'), ('ϕ', 'ϗ'), ('ϙ', 'ϙ'), + ('ϛ', 'ϛ'), ('ϝ', 'ϝ'), ('ϟ', 'ϟ'), ('ϡ', 'ϡ'), ('ϣ', 'ϣ'), + ('ϥ', 'ϥ'), ('ϧ', 'ϧ'), ('ϩ', 'ϩ'), ('ϫ', 'ϫ'), ('ϭ', 'ϭ'), + ('ϯ', 'ϳ'), ('ϵ', 'ϵ'), ('ϸ', 'ϸ'), ('ϻ', 'ϼ'), ('а', 'џ'), + ('ѡ', 'ѡ'), ('ѣ', 'ѣ'), ('ѥ', 'ѥ'), ('ѧ', 'ѧ'), ('ѩ', 'ѩ'), + ('ѫ', 'ѫ'), ('ѭ', 'ѭ'), ('ѯ', 'ѯ'), ('ѱ', 'ѱ'), ('ѳ', 'ѳ'), + ('ѵ', 'ѵ'), ('ѷ', 'ѷ'), ('ѹ', 'ѹ'), ('ѻ', 'ѻ'), ('ѽ', 'ѽ'), + ('ѿ', 'ѿ'), ('ҁ', 'ҁ'), ('ҋ', 'ҋ'), ('ҍ', 'ҍ'), ('ҏ', 'ҏ'), + ('ґ', 'ґ'), ('ғ', 'ғ'), ('ҕ', 'ҕ'), ('җ', 'җ'), ('ҙ', 'ҙ'), + ('қ', 'қ'), ('ҝ', 'ҝ'), ('ҟ', 'ҟ'), ('ҡ', 'ҡ'), ('ң', 'ң'), + ('ҥ', 'ҥ'), ('ҧ', 'ҧ'), ('ҩ', 'ҩ'), ('ҫ', 'ҫ'), ('ҭ', 'ҭ'), + ('ү', 'ү'), ('ұ', 'ұ'), ('ҳ', 'ҳ'), ('ҵ', 'ҵ'), ('ҷ', 'ҷ'), + ('ҹ', 'ҹ'), ('һ', 'һ'), ('ҽ', 'ҽ'), ('ҿ', 'ҿ'), ('ӂ', 'ӂ'), + ('ӄ', 'ӄ'), ('ӆ', 'ӆ'), ('ӈ', 'ӈ'), ('ӊ', 'ӊ'), ('ӌ', 'ӌ'), + ('ӎ', 'ӏ'), ('ӑ', 'ӑ'), ('ӓ', 'ӓ'), ('ӕ', 'ӕ'), ('ӗ', 'ӗ'), + ('ә', 'ә'), ('ӛ', 'ӛ'), ('ӝ', 'ӝ'), ('ӟ', 'ӟ'), ('ӡ', 'ӡ'), + ('ӣ', 'ӣ'), ('ӥ', 'ӥ'), ('ӧ', 'ӧ'), ('ө', 'ө'), ('ӫ', 'ӫ'), + ('ӭ', 'ӭ'), ('ӯ', 'ӯ'), ('ӱ', 'ӱ'), ('ӳ', 'ӳ'), ('ӵ', 'ӵ'), + ('ӷ', 'ӷ'), ('ӹ', 'ӹ'), ('ӻ', 'ӻ'), ('ӽ', 'ӽ'), ('ӿ', 'ӿ'), + ('ԁ', 'ԁ'), ('ԃ', 'ԃ'), ('ԅ', 'ԅ'), ('ԇ', 'ԇ'), ('ԉ', 'ԉ'), + ('ԋ', 'ԋ'), ('ԍ', 'ԍ'), ('ԏ', 'ԏ'), ('ԑ', 'ԑ'), ('ԓ', 'ԓ'), + ('ԕ', 'ԕ'), ('ԗ', 'ԗ'), ('ԙ', 'ԙ'), ('ԛ', 'ԛ'), ('ԝ', 'ԝ'), + ('ԟ', 'ԟ'), ('ԡ', 'ԡ'), ('ԣ', 'ԣ'), ('ԥ', 'ԥ'), ('ԧ', 'ԧ'), + ('ԩ', 'ԩ'), ('ԫ', 'ԫ'), ('ԭ', 'ԭ'), ('ԯ', 'ԯ'), ('ա', 'և'), + ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('ᴀ', 'ᴫ'), ('ᵫ', 'ᵷ'), + ('ᵹ', 'ᶚ'), ('ḁ', 'ḁ'), ('ḃ', 'ḃ'), ('ḅ', 'ḅ'), + ('ḇ', 'ḇ'), ('ḉ', 'ḉ'), ('ḋ', 'ḋ'), ('ḍ', 
'ḍ'), + ('ḏ', 'ḏ'), ('ḑ', 'ḑ'), ('ḓ', 'ḓ'), ('ḕ', 'ḕ'), + ('ḗ', 'ḗ'), ('ḙ', 'ḙ'), ('ḛ', 'ḛ'), ('ḝ', 'ḝ'), + ('ḟ', 'ḟ'), ('ḡ', 'ḡ'), ('ḣ', 'ḣ'), ('ḥ', 'ḥ'), + ('ḧ', 'ḧ'), ('ḩ', 'ḩ'), ('ḫ', 'ḫ'), ('ḭ', 'ḭ'), + ('ḯ', 'ḯ'), ('ḱ', 'ḱ'), ('ḳ', 'ḳ'), ('ḵ', 'ḵ'), + ('ḷ', 'ḷ'), ('ḹ', 'ḹ'), ('ḻ', 'ḻ'), ('ḽ', 'ḽ'), + ('ḿ', 'ḿ'), ('ṁ', 'ṁ'), ('ṃ', 'ṃ'), ('ṅ', 'ṅ'), + ('ṇ', 'ṇ'), ('ṉ', 'ṉ'), ('ṋ', 'ṋ'), ('ṍ', 'ṍ'), + ('ṏ', 'ṏ'), ('ṑ', 'ṑ'), ('ṓ', 'ṓ'), ('ṕ', 'ṕ'), + ('ṗ', 'ṗ'), ('ṙ', 'ṙ'), ('ṛ', 'ṛ'), ('ṝ', 'ṝ'), + ('ṟ', 'ṟ'), ('ṡ', 'ṡ'), ('ṣ', 'ṣ'), ('ṥ', 'ṥ'), + ('ṧ', 'ṧ'), ('ṩ', 'ṩ'), ('ṫ', 'ṫ'), ('ṭ', 'ṭ'), + ('ṯ', 'ṯ'), ('ṱ', 'ṱ'), ('ṳ', 'ṳ'), ('ṵ', 'ṵ'), + ('ṷ', 'ṷ'), ('ṹ', 'ṹ'), ('ṻ', 'ṻ'), ('ṽ', 'ṽ'), + ('ṿ', 'ṿ'), ('ẁ', 'ẁ'), ('ẃ', 'ẃ'), ('ẅ', 'ẅ'), + ('ẇ', 'ẇ'), ('ẉ', 'ẉ'), ('ẋ', 'ẋ'), ('ẍ', 'ẍ'), + ('ẏ', 'ẏ'), ('ẑ', 'ẑ'), ('ẓ', 'ẓ'), ('ẕ', 'ẝ'), + ('ẟ', 'ẟ'), ('ạ', 'ạ'), ('ả', 'ả'), ('ấ', 'ấ'), + ('ầ', 'ầ'), ('ẩ', 'ẩ'), ('ẫ', 'ẫ'), ('ậ', 'ậ'), + ('ắ', 'ắ'), ('ằ', 'ằ'), ('ẳ', 'ẳ'), ('ẵ', 'ẵ'), + ('ặ', 'ặ'), ('ẹ', 'ẹ'), ('ẻ', 'ẻ'), ('ẽ', 'ẽ'), + ('ế', 'ế'), ('ề', 'ề'), ('ể', 'ể'), ('ễ', 'ễ'), + ('ệ', 'ệ'), ('ỉ', 'ỉ'), ('ị', 'ị'), ('ọ', 'ọ'), + ('ỏ', 'ỏ'), ('ố', 'ố'), ('ồ', 'ồ'), ('ổ', 'ổ'), + ('ỗ', 'ỗ'), ('ộ', 'ộ'), ('ớ', 'ớ'), ('ờ', 'ờ'), + ('ở', 'ở'), ('ỡ', 'ỡ'), ('ợ', 'ợ'), ('ụ', 'ụ'), + ('ủ', 'ủ'), ('ứ', 'ứ'), ('ừ', 'ừ'), ('ử', 'ử'), + ('ữ', 'ữ'), ('ự', 'ự'), ('ỳ', 'ỳ'), ('ỵ', 'ỵ'), + ('ỷ', 'ỷ'), ('ỹ', 'ỹ'), ('ỻ', 'ỻ'), ('ỽ', 'ỽ'), + ('ỿ', 'ἇ'), ('ἐ', 'ἕ'), ('ἠ', 'ἧ'), ('ἰ', 'ἷ'), + ('ὀ', 'ὅ'), ('ὐ', 'ὗ'), ('ὠ', 'ὧ'), ('ὰ', 'ώ'), + ('ᾀ', 'ᾇ'), ('ᾐ', 'ᾗ'), ('ᾠ', 'ᾧ'), ('ᾰ', 'ᾴ'), + ('ᾶ', 'ᾷ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῇ'), + ('ῐ', 'ΐ'), ('ῖ', 'ῗ'), ('ῠ', 'ῧ'), ('ῲ', 'ῴ'), + ('ῶ', 'ῷ'), ('ℊ', 'ℊ'), ('ℎ', 'ℏ'), ('ℓ', 'ℓ'), + ('ℯ', 'ℯ'), ('ℴ', 'ℴ'), ('ℹ', 'ℹ'), ('ℼ', 'ℽ'), + ('ⅆ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('ↄ', 'ↄ'), ('ⰰ', 'ⱞ'), + ('ⱡ', 'ⱡ'), ('ⱥ', 'ⱦ'), ('ⱨ', 'ⱨ'), ('ⱪ', 'ⱪ'), + ('ⱬ', 'ⱬ'), ('ⱱ', 'ⱱ'), ('ⱳ', 'ⱴ'), ('ⱶ', 'ⱻ'), + ('ⲁ', 'ⲁ'), ('ⲃ', 'ⲃ'), ('ⲅ', 'ⲅ'), ('ⲇ', 
'ⲇ'), + ('ⲉ', 'ⲉ'), ('ⲋ', 'ⲋ'), ('ⲍ', 'ⲍ'), ('ⲏ', 'ⲏ'), + ('ⲑ', 'ⲑ'), ('ⲓ', 'ⲓ'), ('ⲕ', 'ⲕ'), ('ⲗ', 'ⲗ'), + ('ⲙ', 'ⲙ'), ('ⲛ', 'ⲛ'), ('ⲝ', 'ⲝ'), ('ⲟ', 'ⲟ'), + ('ⲡ', 'ⲡ'), ('ⲣ', 'ⲣ'), ('ⲥ', 'ⲥ'), ('ⲧ', 'ⲧ'), + ('ⲩ', 'ⲩ'), ('ⲫ', 'ⲫ'), ('ⲭ', 'ⲭ'), ('ⲯ', 'ⲯ'), + ('ⲱ', 'ⲱ'), ('ⲳ', 'ⲳ'), ('ⲵ', 'ⲵ'), ('ⲷ', 'ⲷ'), + ('ⲹ', 'ⲹ'), ('ⲻ', 'ⲻ'), ('ⲽ', 'ⲽ'), ('ⲿ', 'ⲿ'), + ('ⳁ', 'ⳁ'), ('ⳃ', 'ⳃ'), ('ⳅ', 'ⳅ'), ('ⳇ', 'ⳇ'), + ('ⳉ', 'ⳉ'), ('ⳋ', 'ⳋ'), ('ⳍ', 'ⳍ'), ('ⳏ', 'ⳏ'), + ('ⳑ', 'ⳑ'), ('ⳓ', 'ⳓ'), ('ⳕ', 'ⳕ'), ('ⳗ', 'ⳗ'), + ('ⳙ', 'ⳙ'), ('ⳛ', 'ⳛ'), ('ⳝ', 'ⳝ'), ('ⳟ', 'ⳟ'), + ('ⳡ', 'ⳡ'), ('ⳣ', 'ⳤ'), ('ⳬ', 'ⳬ'), ('ⳮ', 'ⳮ'), + ('ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), + ('ꙁ', 'ꙁ'), ('ꙃ', 'ꙃ'), ('ꙅ', 'ꙅ'), ('ꙇ', 'ꙇ'), + ('ꙉ', 'ꙉ'), ('ꙋ', 'ꙋ'), ('ꙍ', 'ꙍ'), ('ꙏ', 'ꙏ'), + ('ꙑ', 'ꙑ'), ('ꙓ', 'ꙓ'), ('ꙕ', 'ꙕ'), ('ꙗ', 'ꙗ'), + ('ꙙ', 'ꙙ'), ('ꙛ', 'ꙛ'), ('ꙝ', 'ꙝ'), ('ꙟ', 'ꙟ'), + ('ꙡ', 'ꙡ'), ('ꙣ', 'ꙣ'), ('ꙥ', 'ꙥ'), ('ꙧ', 'ꙧ'), + ('ꙩ', 'ꙩ'), ('ꙫ', 'ꙫ'), ('ꙭ', 'ꙭ'), ('ꚁ', 'ꚁ'), + ('ꚃ', 'ꚃ'), ('ꚅ', 'ꚅ'), ('ꚇ', 'ꚇ'), ('ꚉ', 'ꚉ'), + ('ꚋ', 'ꚋ'), ('ꚍ', 'ꚍ'), ('ꚏ', 'ꚏ'), ('ꚑ', 'ꚑ'), + ('ꚓ', 'ꚓ'), ('ꚕ', 'ꚕ'), ('ꚗ', 'ꚗ'), ('ꚙ', 'ꚙ'), + ('ꚛ', 'ꚛ'), ('ꜣ', 'ꜣ'), ('ꜥ', 'ꜥ'), ('ꜧ', 'ꜧ'), + ('ꜩ', 'ꜩ'), ('ꜫ', 'ꜫ'), ('ꜭ', 'ꜭ'), ('ꜯ', 'ꜱ'), + ('ꜳ', 'ꜳ'), ('ꜵ', 'ꜵ'), ('ꜷ', 'ꜷ'), ('ꜹ', 'ꜹ'), + ('ꜻ', 'ꜻ'), ('ꜽ', 'ꜽ'), ('ꜿ', 'ꜿ'), ('ꝁ', 'ꝁ'), + ('ꝃ', 'ꝃ'), ('ꝅ', 'ꝅ'), ('ꝇ', 'ꝇ'), ('ꝉ', 'ꝉ'), + ('ꝋ', 'ꝋ'), ('ꝍ', 'ꝍ'), ('ꝏ', 'ꝏ'), ('ꝑ', 'ꝑ'), + ('ꝓ', 'ꝓ'), ('ꝕ', 'ꝕ'), ('ꝗ', 'ꝗ'), ('ꝙ', 'ꝙ'), + ('ꝛ', 'ꝛ'), ('ꝝ', 'ꝝ'), ('ꝟ', 'ꝟ'), ('ꝡ', 'ꝡ'), + ('ꝣ', 'ꝣ'), ('ꝥ', 'ꝥ'), ('ꝧ', 'ꝧ'), ('ꝩ', 'ꝩ'), + ('ꝫ', 'ꝫ'), ('ꝭ', 'ꝭ'), ('ꝯ', 'ꝯ'), ('ꝱ', 'ꝸ'), + ('ꝺ', 'ꝺ'), ('ꝼ', 'ꝼ'), ('ꝿ', 'ꝿ'), ('ꞁ', 'ꞁ'), + ('ꞃ', 'ꞃ'), ('ꞅ', 'ꞅ'), ('ꞇ', 'ꞇ'), ('ꞌ', 'ꞌ'), + ('ꞎ', 'ꞎ'), ('ꞑ', 'ꞑ'), ('ꞓ', 'ꞕ'), ('ꞗ', 'ꞗ'), + ('ꞙ', 'ꞙ'), ('ꞛ', 'ꞛ'), ('ꞝ', 'ꞝ'), ('ꞟ', 'ꞟ'), + ('ꞡ', 'ꞡ'), ('ꞣ', 'ꞣ'), ('ꞥ', 'ꞥ'), ('ꞧ', 'ꞧ'), + ('ꞩ', 'ꞩ'), ('ꞵ', 'ꞵ'), ('ꞷ', 'ꞷ'), ('ꟺ', 'ꟺ'), + ('ꬰ', 'ꭚ'), ('ꭠ', 'ꭥ'), ('ꭰ', 'ꮿ'), ('ff', 'st'), + ('ﬓ', 'ﬗ'), ('a', 'z'), ('𐐨', '𐑏'), 
('𐓘', '𐓻'), + ('𐳀', '𐳲'), ('𑣀', '𑣟'), ('𝐚', '𝐳'), ('𝑎', '𝑔'), + ('𝑖', '𝑧'), ('𝒂', '𝒛'), ('𝒶', '𝒹'), ('𝒻', '𝒻'), + ('𝒽', '𝓃'), ('𝓅', '𝓏'), ('𝓪', '𝔃'), ('𝔞', '𝔷'), + ('𝕒', '𝕫'), ('𝖆', '𝖟'), ('𝖺', '𝗓'), ('𝗮', '𝘇'), + ('𝘢', '𝘻'), ('𝙖', '𝙯'), ('𝚊', '𝚥'), ('𝛂', '𝛚'), + ('𝛜', '𝛡'), ('𝛼', '𝜔'), ('𝜖', '𝜛'), ('𝜶', '𝝎'), + ('𝝐', '𝝕'), ('𝝰', '𝞈'), ('𝞊', '𝞏'), ('𝞪', '𝟂'), + ('𝟄', '𝟉'), ('𝟋', '𝟋'), ('𞤢', '𞥃'), +]; + +pub const MARK: &'static [(char, char)] = &[ + ('̀', 'ͯ'), ('҃', '҉'), ('֑', 'ֽ'), ('ֿ', 'ֿ'), ('ׁ', 'ׂ'), + ('ׄ', 'ׅ'), ('ׇ', 'ׇ'), ('ؐ', 'ؚ'), ('ً', 'ٟ'), ('ٰ', 'ٰ'), + ('ۖ', 'ۜ'), ('۟', 'ۤ'), ('ۧ', 'ۨ'), ('۪', 'ۭ'), ('ܑ', 'ܑ'), + ('ܰ', '݊'), ('ަ', 'ް'), ('߫', '߳'), ('ࠖ', '࠙'), ('ࠛ', 'ࠣ'), + ('ࠥ', 'ࠧ'), ('ࠩ', '࠭'), ('࡙', '࡛'), ('ࣔ', '࣡'), + ('ࣣ', 'ः'), ('ऺ', '़'), ('ा', 'ॏ'), ('॑', 'ॗ'), + ('ॢ', 'ॣ'), ('ঁ', 'ঃ'), ('়', '়'), ('া', 'ৄ'), + ('ে', 'ৈ'), ('ো', '্'), ('ৗ', 'ৗ'), ('ৢ', 'ৣ'), + ('ਁ', 'ਃ'), ('਼', '਼'), ('ਾ', 'ੂ'), ('ੇ', 'ੈ'), + ('ੋ', '੍'), ('ੑ', 'ੑ'), ('ੰ', 'ੱ'), ('ੵ', 'ੵ'), + ('ઁ', 'ઃ'), ('઼', '઼'), ('ા', 'ૅ'), ('ે', 'ૉ'), + ('ો', '્'), ('ૢ', 'ૣ'), ('ૺ', '૿'), ('ଁ', 'ଃ'), + ('଼', '଼'), ('ା', 'ୄ'), ('େ', 'ୈ'), ('ୋ', '୍'), + ('ୖ', 'ୗ'), ('ୢ', 'ୣ'), ('ஂ', 'ஂ'), ('ா', 'ூ'), + ('ெ', 'ை'), ('ொ', '்'), ('ௗ', 'ௗ'), ('ఀ', 'ః'), + ('ా', 'ౄ'), ('ె', 'ై'), ('ొ', '్'), ('ౕ', 'ౖ'), + ('ౢ', 'ౣ'), ('ಁ', 'ಃ'), ('಼', '಼'), ('ಾ', 'ೄ'), + ('ೆ', 'ೈ'), ('ೊ', '್'), ('ೕ', 'ೖ'), ('ೢ', 'ೣ'), + ('ഀ', 'ഃ'), ('഻', '഼'), ('ാ', 'ൄ'), ('െ', 'ൈ'), + ('ൊ', '്'), ('ൗ', 'ൗ'), ('ൢ', 'ൣ'), ('ං', 'ඃ'), + ('්', '්'), ('ා', 'ු'), ('ූ', 'ූ'), ('ෘ', 'ෟ'), + ('ෲ', 'ෳ'), ('ั', 'ั'), ('ิ', 'ฺ'), ('็', '๎'), + ('ັ', 'ັ'), ('ິ', 'ູ'), ('ົ', 'ຼ'), ('່', 'ໍ'), + ('༘', '༙'), ('༵', '༵'), ('༷', '༷'), ('༹', '༹'), + ('༾', '༿'), ('ཱ', '྄'), ('྆', '྇'), ('ྍ', 'ྗ'), + ('ྙ', 'ྼ'), ('࿆', '࿆'), ('ါ', 'ှ'), ('ၖ', 'ၙ'), + ('ၞ', 'ၠ'), ('ၢ', 'ၤ'), ('ၧ', 'ၭ'), ('ၱ', 'ၴ'), + ('ႂ', 'ႍ'), ('ႏ', 'ႏ'), ('ႚ', 'ႝ'), ('፝', '፟'), + ('ᜒ', '᜔'), ('ᜲ', '᜴'), ('ᝒ', 'ᝓ'), ('ᝲ', 'ᝳ'), + ('឴', '៓'), ('៝', '៝'), ('᠋', '᠍'), ('ᢅ', 'ᢆ'), 
+ ('ᢩ', 'ᢩ'), ('ᤠ', 'ᤫ'), ('ᤰ', '᤻'), ('ᨗ', 'ᨛ'), + ('ᩕ', 'ᩞ'), ('᩠', '᩼'), ('᩿', '᩿'), ('᪰', '᪾'), + ('ᬀ', 'ᬄ'), ('᬴', '᭄'), ('᭫', '᭳'), ('ᮀ', 'ᮂ'), + ('ᮡ', 'ᮭ'), ('᯦', '᯳'), ('ᰤ', '᰷'), ('᳐', '᳒'), + ('᳔', '᳨'), ('᳭', '᳭'), ('ᳲ', '᳴'), ('᳷', '᳹'), + ('᷀', '᷹'), ('᷻', '᷿'), ('⃐', '⃰'), ('⳯', '⳱'), + ('⵿', '⵿'), ('ⷠ', 'ⷿ'), ('〪', '〯'), ('゙', '゚'), + ('꙯', '꙲'), ('ꙴ', '꙽'), ('ꚞ', 'ꚟ'), ('꛰', '꛱'), + ('ꠂ', 'ꠂ'), ('꠆', '꠆'), ('ꠋ', 'ꠋ'), ('ꠣ', 'ꠧ'), + ('ꢀ', 'ꢁ'), ('ꢴ', 'ꣅ'), ('꣠', '꣱'), ('ꤦ', '꤭'), + ('ꥇ', '꥓'), ('ꦀ', 'ꦃ'), ('꦳', '꧀'), ('ꧥ', 'ꧥ'), + ('ꨩ', 'ꨶ'), ('ꩃ', 'ꩃ'), ('ꩌ', 'ꩍ'), ('ꩻ', 'ꩽ'), + ('ꪰ', 'ꪰ'), ('ꪲ', 'ꪴ'), ('ꪷ', 'ꪸ'), ('ꪾ', '꪿'), + ('꫁', '꫁'), ('ꫫ', 'ꫯ'), ('ꫵ', '꫶'), ('ꯣ', 'ꯪ'), + ('꯬', '꯭'), ('ﬞ', 'ﬞ'), ('︀', '️'), ('︠', '︯'), + ('𐇽', '𐇽'), ('𐋠', '𐋠'), ('𐍶', '𐍺'), ('𐨁', '𐨃'), + ('𐨅', '𐨆'), ('𐨌', '𐨏'), ('𐨸', '𐨺'), ('𐨿', '𐨿'), + ('𐫥', '𐫦'), ('𑀀', '𑀂'), ('𑀸', '𑁆'), ('𑁿', '𑂂'), + ('𑂰', '𑂺'), ('𑄀', '𑄂'), ('𑄧', '𑄴'), ('𑅳', '𑅳'), + ('𑆀', '𑆂'), ('𑆳', '𑇀'), ('𑇊', '𑇌'), ('𑈬', '𑈷'), + ('𑈾', '𑈾'), ('𑋟', '𑋪'), ('𑌀', '𑌃'), ('𑌼', '𑌼'), + ('𑌾', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍍'), ('𑍗', '𑍗'), + ('𑍢', '𑍣'), ('𑍦', '𑍬'), ('𑍰', '𑍴'), ('𑐵', '𑑆'), + ('𑒰', '𑓃'), ('𑖯', '𑖵'), ('𑖸', '𑗀'), ('𑗜', '𑗝'), + ('𑘰', '𑙀'), ('𑚫', '𑚷'), ('𑜝', '𑜫'), ('𑨁', '𑨊'), + ('𑨳', '𑨹'), ('𑨻', '𑨾'), ('𑩇', '𑩇'), ('𑩑', '𑩛'), + ('𑪊', '𑪙'), ('𑰯', '𑰶'), ('𑰸', '𑰿'), ('𑲒', '𑲧'), + ('𑲩', '𑲶'), ('𑴱', '𑴶'), ('𑴺', '𑴺'), ('𑴼', '𑴽'), + ('𑴿', '𑵅'), ('𑵇', '𑵇'), ('𖫰', '𖫴'), ('𖬰', '𖬶'), + ('𖽑', '𖽾'), ('𖾏', '𖾒'), ('𛲝', '𛲞'), ('𝅥', '𝅩'), + ('𝅭', '𝅲'), ('𝅻', '𝆂'), ('𝆅', '𝆋'), ('𝆪', '𝆭'), + ('𝉂', '𝉄'), ('𝨀', '𝨶'), ('𝨻', '𝩬'), ('𝩵', '𝩵'), + ('𝪄', '𝪄'), ('𝪛', '𝪟'), ('𝪡', '𝪯'), ('𞀀', '𞀆'), + ('𞀈', '𞀘'), ('𞀛', '𞀡'), ('𞀣', '𞀤'), ('𞀦', '𞀪'), + ('𞣐', '𞣖'), ('𞥄', '𞥊'), ('󠄀', '󠇯'), +]; + +pub const MATH_SYMBOL: &'static [(char, char)] = &[ + ('+', '+'), ('<', '>'), ('|', '|'), ('~', '~'), ('¬', '¬'), ('±', '±'), + ('×', '×'), ('÷', '÷'), ('϶', '϶'), ('؆', '؈'), ('⁄', '⁄'), + ('⁒', '⁒'), ('⁺', '⁼'), ('₊', '₌'), ('℘', '℘'), + ('⅀', '⅄'), 
('⅋', '⅋'), ('←', '↔'), ('↚', '↛'), + ('↠', '↠'), ('↣', '↣'), ('↦', '↦'), ('↮', '↮'), + ('⇎', '⇏'), ('⇒', '⇒'), ('⇔', '⇔'), ('⇴', '⋿'), + ('⌠', '⌡'), ('⍼', '⍼'), ('⎛', '⎳'), ('⏜', '⏡'), + ('▷', '▷'), ('◁', '◁'), ('◸', '◿'), ('♯', '♯'), + ('⟀', '⟄'), ('⟇', '⟥'), ('⟰', '⟿'), ('⤀', '⦂'), + ('⦙', '⧗'), ('⧜', '⧻'), ('⧾', '⫿'), ('⬰', '⭄'), + ('⭇', '⭌'), ('﬩', '﬩'), ('﹢', '﹢'), ('﹤', '﹦'), + ('+', '+'), ('<', '>'), ('|', '|'), ('~', '~'), + ('¬', '¬'), ('←', '↓'), ('𝛁', '𝛁'), ('𝛛', '𝛛'), + ('𝛻', '𝛻'), ('𝜕', '𝜕'), ('𝜵', '𝜵'), ('𝝏', '𝝏'), + ('𝝯', '𝝯'), ('𝞉', '𝞉'), ('𝞩', '𝞩'), ('𝟃', '𝟃'), + ('𞻰', '𞻱'), +]; + +pub const MODIFIER_LETTER: &'static [(char, char)] = &[ + ('ʰ', 'ˁ'), ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), + ('ʹ', 'ʹ'), ('ͺ', 'ͺ'), ('ՙ', 'ՙ'), ('ـ', 'ـ'), ('ۥ', 'ۦ'), + ('ߴ', 'ߵ'), ('ߺ', 'ߺ'), ('ࠚ', 'ࠚ'), ('ࠤ', 'ࠤ'), ('ࠨ', 'ࠨ'), + ('ॱ', 'ॱ'), ('ๆ', 'ๆ'), ('ໆ', 'ໆ'), ('ჼ', 'ჼ'), + ('ៗ', 'ៗ'), ('ᡃ', 'ᡃ'), ('ᪧ', 'ᪧ'), ('ᱸ', 'ᱽ'), + ('ᴬ', 'ᵪ'), ('ᵸ', 'ᵸ'), ('ᶛ', 'ᶿ'), ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ⱼ', 'ⱽ'), ('ⵯ', 'ⵯ'), + ('ⸯ', 'ⸯ'), ('々', '々'), ('〱', '〵'), ('〻', '〻'), + ('ゝ', 'ゞ'), ('ー', 'ヾ'), ('ꀕ', 'ꀕ'), ('ꓸ', 'ꓽ'), + ('ꘌ', 'ꘌ'), ('ꙿ', 'ꙿ'), ('ꚜ', 'ꚝ'), ('ꜗ', 'ꜟ'), + ('ꝰ', 'ꝰ'), ('ꞈ', 'ꞈ'), ('ꟸ', 'ꟹ'), ('ꧏ', 'ꧏ'), + ('ꧦ', 'ꧦ'), ('ꩰ', 'ꩰ'), ('ꫝ', 'ꫝ'), ('ꫳ', 'ꫴ'), + ('ꭜ', 'ꭟ'), ('ー', 'ー'), ('゙', '゚'), ('𖭀', '𖭃'), + ('𖾓', '𖾟'), ('𖿠', '𖿡'), +]; + +pub const MODIFIER_SYMBOL: &'static [(char, char)] = &[ + ('^', '^'), ('`', '`'), ('¨', '¨'), ('¯', '¯'), ('´', '´'), + ('¸', '¸'), ('˂', '˅'), ('˒', '˟'), ('˥', '˫'), ('˭', '˭'), + ('˯', '˿'), ('͵', '͵'), ('΄', '΅'), ('᾽', '᾽'), ('᾿', '῁'), + ('῍', '῏'), ('῝', '῟'), ('῭', '`'), ('´', '῾'), + ('゛', '゜'), ('꜀', '꜖'), ('꜠', '꜡'), ('꞉', '꞊'), + ('꭛', '꭛'), ('﮲', '﯁'), ('^', '^'), ('`', '`'), + (' ̄', ' ̄'), ('🏻', '🏿'), +]; + +pub const NONSPACING_MARK: &'static [(char, char)] = &[ + ('̀', 'ͯ'), ('҃', '҇'), ('֑', 'ֽ'), ('ֿ', 'ֿ'), ('ׁ', 'ׂ'), + ('ׄ', 'ׅ'), ('ׇ', 'ׇ'), ('ؐ', 'ؚ'), ('ً', 'ٟ'), ('ٰ', 'ٰ'), + ('ۖ', 
'ۜ'), ('۟', 'ۤ'), ('ۧ', 'ۨ'), ('۪', 'ۭ'), ('ܑ', 'ܑ'), + ('ܰ', '݊'), ('ަ', 'ް'), ('߫', '߳'), ('ࠖ', '࠙'), ('ࠛ', 'ࠣ'), + ('ࠥ', 'ࠧ'), ('ࠩ', '࠭'), ('࡙', '࡛'), ('ࣔ', '࣡'), + ('ࣣ', 'ं'), ('ऺ', 'ऺ'), ('़', '़'), ('ु', 'ै'), + ('्', '्'), ('॑', 'ॗ'), ('ॢ', 'ॣ'), ('ঁ', 'ঁ'), + ('়', '়'), ('ু', 'ৄ'), ('্', '্'), ('ৢ', 'ৣ'), + ('ਁ', 'ਂ'), ('਼', '਼'), ('ੁ', 'ੂ'), ('ੇ', 'ੈ'), + ('ੋ', '੍'), ('ੑ', 'ੑ'), ('ੰ', 'ੱ'), ('ੵ', 'ੵ'), + ('ઁ', 'ં'), ('઼', '઼'), ('ુ', 'ૅ'), ('ે', 'ૈ'), + ('્', '્'), ('ૢ', 'ૣ'), ('ૺ', '૿'), ('ଁ', 'ଁ'), + ('଼', '଼'), ('ି', 'ି'), ('ୁ', 'ୄ'), ('୍', '୍'), + ('ୖ', 'ୖ'), ('ୢ', 'ୣ'), ('ஂ', 'ஂ'), ('ீ', 'ீ'), + ('்', '்'), ('ఀ', 'ఀ'), ('ా', 'ీ'), ('ె', 'ై'), + ('ొ', '్'), ('ౕ', 'ౖ'), ('ౢ', 'ౣ'), ('ಁ', 'ಁ'), + ('಼', '಼'), ('ಿ', 'ಿ'), ('ೆ', 'ೆ'), ('ೌ', '್'), + ('ೢ', 'ೣ'), ('ഀ', 'ഁ'), ('഻', '഼'), ('ു', 'ൄ'), + ('്', '്'), ('ൢ', 'ൣ'), ('්', '්'), ('ි', 'ු'), + ('ූ', 'ූ'), ('ั', 'ั'), ('ิ', 'ฺ'), ('็', '๎'), + ('ັ', 'ັ'), ('ິ', 'ູ'), ('ົ', 'ຼ'), ('່', 'ໍ'), + ('༘', '༙'), ('༵', '༵'), ('༷', '༷'), ('༹', '༹'), + ('ཱ', 'ཾ'), ('ྀ', '྄'), ('྆', '྇'), ('ྍ', 'ྗ'), + ('ྙ', 'ྼ'), ('࿆', '࿆'), ('ိ', 'ူ'), ('ဲ', '့'), + ('္', '်'), ('ွ', 'ှ'), ('ၘ', 'ၙ'), ('ၞ', 'ၠ'), + ('ၱ', 'ၴ'), ('ႂ', 'ႂ'), ('ႅ', 'ႆ'), ('ႍ', 'ႍ'), + ('ႝ', 'ႝ'), ('፝', '፟'), ('ᜒ', '᜔'), ('ᜲ', '᜴'), + ('ᝒ', 'ᝓ'), ('ᝲ', 'ᝳ'), ('឴', '឵'), ('ិ', 'ួ'), + ('ំ', 'ំ'), ('៉', '៓'), ('៝', '៝'), ('᠋', '᠍'), + ('ᢅ', 'ᢆ'), ('ᢩ', 'ᢩ'), ('ᤠ', 'ᤢ'), ('ᤧ', 'ᤨ'), + ('ᤲ', 'ᤲ'), ('᤹', '᤻'), ('ᨗ', 'ᨘ'), ('ᨛ', 'ᨛ'), + ('ᩖ', 'ᩖ'), ('ᩘ', 'ᩞ'), ('᩠', '᩠'), ('ᩢ', 'ᩢ'), + ('ᩥ', 'ᩬ'), ('ᩳ', '᩼'), ('᩿', '᩿'), ('᪰', '᪽'), + ('ᬀ', 'ᬃ'), ('᬴', '᬴'), ('ᬶ', 'ᬺ'), ('ᬼ', 'ᬼ'), + ('ᭂ', 'ᭂ'), ('᭫', '᭳'), ('ᮀ', 'ᮁ'), ('ᮢ', 'ᮥ'), + ('ᮨ', 'ᮩ'), ('᮫', 'ᮭ'), ('᯦', '᯦'), ('ᯨ', 'ᯩ'), + ('ᯭ', 'ᯭ'), ('ᯯ', 'ᯱ'), ('ᰬ', 'ᰳ'), ('ᰶ', '᰷'), + ('᳐', '᳒'), ('᳔', '᳠'), ('᳢', '᳨'), ('᳭', '᳭'), + ('᳴', '᳴'), ('᳸', '᳹'), ('᷀', '᷹'), ('᷻', '᷿'), + ('⃐', '⃜'), ('⃡', '⃡'), ('⃥', '⃰'), ('⳯', '⳱'), + ('⵿', '⵿'), ('ⷠ', 'ⷿ'), ('〪', '〭'), ('゙', '゚'), + ('꙯', '꙯'), ('ꙴ', '꙽'), ('ꚞ', 
'ꚟ'), ('꛰', '꛱'), + ('ꠂ', 'ꠂ'), ('꠆', '꠆'), ('ꠋ', 'ꠋ'), ('ꠥ', 'ꠦ'), + ('꣄', 'ꣅ'), ('꣠', '꣱'), ('ꤦ', '꤭'), ('ꥇ', 'ꥑ'), + ('ꦀ', 'ꦂ'), ('꦳', '꦳'), ('ꦶ', 'ꦹ'), ('ꦼ', 'ꦼ'), + ('ꧥ', 'ꧥ'), ('ꨩ', 'ꨮ'), ('ꨱ', 'ꨲ'), ('ꨵ', 'ꨶ'), + ('ꩃ', 'ꩃ'), ('ꩌ', 'ꩌ'), ('ꩼ', 'ꩼ'), ('ꪰ', 'ꪰ'), + ('ꪲ', 'ꪴ'), ('ꪷ', 'ꪸ'), ('ꪾ', '꪿'), ('꫁', '꫁'), + ('ꫬ', 'ꫭ'), ('꫶', '꫶'), ('ꯥ', 'ꯥ'), ('ꯨ', 'ꯨ'), + ('꯭', '꯭'), ('ﬞ', 'ﬞ'), ('︀', '️'), ('︠', '︯'), + ('𐇽', '𐇽'), ('𐋠', '𐋠'), ('𐍶', '𐍺'), ('𐨁', '𐨃'), + ('𐨅', '𐨆'), ('𐨌', '𐨏'), ('𐨸', '𐨺'), ('𐨿', '𐨿'), + ('𐫥', '𐫦'), ('𑀁', '𑀁'), ('𑀸', '𑁆'), ('𑁿', '𑂁'), + ('𑂳', '𑂶'), ('𑂹', '𑂺'), ('𑄀', '𑄂'), ('𑄧', '𑄫'), + ('𑄭', '𑄴'), ('𑅳', '𑅳'), ('𑆀', '𑆁'), ('𑆶', '𑆾'), + ('𑇊', '𑇌'), ('𑈯', '𑈱'), ('𑈴', '𑈴'), ('𑈶', '𑈷'), + ('𑈾', '𑈾'), ('𑋟', '𑋟'), ('𑋣', '𑋪'), ('𑌀', '𑌁'), + ('𑌼', '𑌼'), ('𑍀', '𑍀'), ('𑍦', '𑍬'), ('𑍰', '𑍴'), + ('𑐸', '𑐿'), ('𑑂', '𑑄'), ('𑑆', '𑑆'), ('𑒳', '𑒸'), + ('𑒺', '𑒺'), ('𑒿', '𑓀'), ('𑓂', '𑓃'), ('𑖲', '𑖵'), + ('𑖼', '𑖽'), ('𑖿', '𑗀'), ('𑗜', '𑗝'), ('𑘳', '𑘺'), + ('𑘽', '𑘽'), ('𑘿', '𑙀'), ('𑚫', '𑚫'), ('𑚭', '𑚭'), + ('𑚰', '𑚵'), ('𑚷', '𑚷'), ('𑜝', '𑜟'), ('𑜢', '𑜥'), + ('𑜧', '𑜫'), ('𑨁', '𑨆'), ('𑨉', '𑨊'), ('𑨳', '𑨸'), + ('𑨻', '𑨾'), ('𑩇', '𑩇'), ('𑩑', '𑩖'), ('𑩙', '𑩛'), + ('𑪊', '𑪖'), ('𑪘', '𑪙'), ('𑰰', '𑰶'), ('𑰸', '𑰽'), + ('𑰿', '𑰿'), ('𑲒', '𑲧'), ('𑲪', '𑲰'), ('𑲲', '𑲳'), + ('𑲵', '𑲶'), ('𑴱', '𑴶'), ('𑴺', '𑴺'), ('𑴼', '𑴽'), + ('𑴿', '𑵅'), ('𑵇', '𑵇'), ('𖫰', '𖫴'), ('𖬰', '𖬶'), + ('𖾏', '𖾒'), ('𛲝', '𛲞'), ('𝅧', '𝅩'), ('𝅻', '𝆂'), + ('𝆅', '𝆋'), ('𝆪', '𝆭'), ('𝉂', '𝉄'), ('𝨀', '𝨶'), + ('𝨻', '𝩬'), ('𝩵', '𝩵'), ('𝪄', '𝪄'), ('𝪛', '𝪟'), + ('𝪡', '𝪯'), ('𞀀', '𞀆'), ('𞀈', '𞀘'), ('𞀛', '𞀡'), + ('𞀣', '𞀤'), ('𞀦', '𞀪'), ('𞣐', '𞣖'), ('𞥄', '𞥊'), + ('󠄀', '󠇯'), +]; + +pub const NUMBER: &'static [(char, char)] = &[ + ('0', '9'), ('²', '³'), ('¹', '¹'), ('¼', '¾'), ('٠', '٩'), + ('۰', '۹'), ('߀', '߉'), ('०', '९'), ('০', '৯'), ('৴', '৹'), + ('੦', '੯'), ('૦', '૯'), ('୦', '୯'), ('୲', '୷'), + ('௦', '௲'), ('౦', '౯'), ('౸', '౾'), ('೦', '೯'), + ('൘', '൞'), ('൦', '൸'), ('෦', '෯'), ('๐', '๙'), + ('໐', '໙'), ('༠', '༳'), ('၀', '၉'), 
('႐', '႙'), + ('፩', '፼'), ('ᛮ', 'ᛰ'), ('០', '៩'), ('៰', '៹'), + ('᠐', '᠙'), ('᥆', '᥏'), ('᧐', '᧚'), ('᪀', '᪉'), + ('᪐', '᪙'), ('᭐', '᭙'), ('᮰', '᮹'), ('᱀', '᱉'), + ('᱐', '᱙'), ('⁰', '⁰'), ('⁴', '⁹'), ('₀', '₉'), + ('⅐', 'ↂ'), ('ↅ', '↉'), ('①', '⒛'), ('⓪', '⓿'), + ('❶', '➓'), ('⳽', '⳽'), ('〇', '〇'), ('〡', '〩'), + ('〸', '〺'), ('㆒', '㆕'), ('㈠', '㈩'), ('㉈', '㉏'), + ('㉑', '㉟'), ('㊀', '㊉'), ('㊱', '㊿'), ('꘠', '꘩'), + ('ꛦ', 'ꛯ'), ('꠰', '꠵'), ('꣐', '꣙'), ('꤀', '꤉'), + ('꧐', '꧙'), ('꧰', '꧹'), ('꩐', '꩙'), ('꯰', '꯹'), + ('0', '9'), ('𐄇', '𐄳'), ('𐅀', '𐅸'), ('𐆊', '𐆋'), + ('𐋡', '𐋻'), ('𐌠', '𐌣'), ('𐍁', '𐍁'), ('𐍊', '𐍊'), + ('𐏑', '𐏕'), ('𐒠', '𐒩'), ('𐡘', '𐡟'), ('𐡹', '𐡿'), + ('𐢧', '𐢯'), ('𐣻', '𐣿'), ('𐤖', '𐤛'), ('𐦼', '𐦽'), + ('𐧀', '𐧏'), ('𐧒', '𐧿'), ('𐩀', '𐩇'), ('𐩽', '𐩾'), + ('𐪝', '𐪟'), ('𐫫', '𐫯'), ('𐭘', '𐭟'), ('𐭸', '𐭿'), + ('𐮩', '𐮯'), ('𐳺', '𐳿'), ('𐹠', '𐹾'), ('𑁒', '𑁯'), + ('𑃰', '𑃹'), ('𑄶', '𑄿'), ('𑇐', '𑇙'), ('𑇡', '𑇴'), + ('𑋰', '𑋹'), ('𑑐', '𑑙'), ('𑓐', '𑓙'), ('𑙐', '𑙙'), + ('𑛀', '𑛉'), ('𑜰', '𑜻'), ('𑣠', '𑣲'), ('𑱐', '𑱬'), + ('𑵐', '𑵙'), ('𒐀', '𒑮'), ('𖩠', '𖩩'), ('𖭐', '𖭙'), + ('𖭛', '𖭡'), ('𝍠', '𝍱'), ('𝟎', '𝟿'), ('𞣇', '𞣏'), + ('𞥐', '𞥙'), ('🄀', '🄌'), +]; + +pub const OPEN_PUNCTUATION: &'static [(char, char)] = &[ + ('(', '('), ('[', '['), ('{', '{'), ('༺', '༺'), ('༼', '༼'), + ('᚛', '᚛'), ('‚', '‚'), ('„', '„'), ('⁅', '⁅'), + ('⁽', '⁽'), ('₍', '₍'), ('⌈', '⌈'), ('⌊', '⌊'), + ('〈', '〈'), ('❨', '❨'), ('❪', '❪'), ('❬', '❬'), + ('❮', '❮'), ('❰', '❰'), ('❲', '❲'), ('❴', '❴'), + ('⟅', '⟅'), ('⟦', '⟦'), ('⟨', '⟨'), ('⟪', '⟪'), + ('⟬', '⟬'), ('⟮', '⟮'), ('⦃', '⦃'), ('⦅', '⦅'), + ('⦇', '⦇'), ('⦉', '⦉'), ('⦋', '⦋'), ('⦍', '⦍'), + ('⦏', '⦏'), ('⦑', '⦑'), ('⦓', '⦓'), ('⦕', '⦕'), + ('⦗', '⦗'), ('⧘', '⧘'), ('⧚', '⧚'), ('⧼', '⧼'), + ('⸢', '⸢'), ('⸤', '⸤'), ('⸦', '⸦'), ('⸨', '⸨'), + ('⹂', '⹂'), ('〈', '〈'), ('《', '《'), ('「', '「'), + ('『', '『'), ('【', '【'), ('〔', '〔'), ('〖', '〖'), + ('〘', '〘'), ('〚', '〚'), ('〝', '〝'), ('﴿', '﴿'), + ('︗', '︗'), ('︵', '︵'), ('︷', '︷'), ('︹', '︹'), + ('︻', '︻'), ('︽', '︽'), ('︿', 
'︿'), ('﹁', '﹁'), + ('﹃', '﹃'), ('﹇', '﹇'), ('﹙', '﹙'), ('﹛', '﹛'), + ('﹝', '﹝'), ('(', '('), ('[', '['), ('{', '{'), + ('⦅', '⦅'), ('「', '「'), +]; + +pub const OTHER: &'static [(char, char)] = &[ + ('\u{0}', '\u{1f}'), ('\u{7f}', '\u{9f}'), ('\u{ad}', '\u{ad}'), + ('\u{378}', '\u{379}'), ('\u{380}', '\u{383}'), ('\u{38b}', '\u{38b}'), + ('\u{38d}', '\u{38d}'), ('\u{3a2}', '\u{3a2}'), ('\u{530}', '\u{530}'), + ('\u{557}', '\u{558}'), ('\u{560}', '\u{560}'), ('\u{588}', '\u{588}'), + ('\u{58b}', '\u{58c}'), ('\u{590}', '\u{590}'), ('\u{5c8}', '\u{5cf}'), + ('\u{5eb}', '\u{5ef}'), ('\u{5f5}', '\u{605}'), ('\u{61c}', '\u{61d}'), + ('\u{6dd}', '\u{6dd}'), ('\u{70e}', '\u{70f}'), ('\u{74b}', '\u{74c}'), + ('\u{7b2}', '\u{7bf}'), ('\u{7fb}', '\u{7ff}'), ('\u{82e}', '\u{82f}'), + ('\u{83f}', '\u{83f}'), ('\u{85c}', '\u{85d}'), ('\u{85f}', '\u{85f}'), + ('\u{86b}', '\u{89f}'), ('\u{8b5}', '\u{8b5}'), ('\u{8be}', '\u{8d3}'), + ('\u{8e2}', '\u{8e2}'), ('\u{984}', '\u{984}'), ('\u{98d}', '\u{98e}'), + ('\u{991}', '\u{992}'), ('\u{9a9}', '\u{9a9}'), ('\u{9b1}', '\u{9b1}'), + ('\u{9b3}', '\u{9b5}'), ('\u{9ba}', '\u{9bb}'), ('\u{9c5}', '\u{9c6}'), + ('\u{9c9}', '\u{9ca}'), ('\u{9cf}', '\u{9d6}'), ('\u{9d8}', '\u{9db}'), + ('\u{9de}', '\u{9de}'), ('\u{9e4}', '\u{9e5}'), ('\u{9fe}', '\u{a00}'), + ('\u{a04}', '\u{a04}'), ('\u{a0b}', '\u{a0e}'), ('\u{a11}', '\u{a12}'), + ('\u{a29}', '\u{a29}'), ('\u{a31}', '\u{a31}'), ('\u{a34}', '\u{a34}'), + ('\u{a37}', '\u{a37}'), ('\u{a3a}', '\u{a3b}'), ('\u{a3d}', '\u{a3d}'), + ('\u{a43}', '\u{a46}'), ('\u{a49}', '\u{a4a}'), ('\u{a4e}', '\u{a50}'), + ('\u{a52}', '\u{a58}'), ('\u{a5d}', '\u{a5d}'), ('\u{a5f}', '\u{a65}'), + ('\u{a76}', '\u{a80}'), ('\u{a84}', '\u{a84}'), ('\u{a8e}', '\u{a8e}'), + ('\u{a92}', '\u{a92}'), ('\u{aa9}', '\u{aa9}'), ('\u{ab1}', '\u{ab1}'), + ('\u{ab4}', '\u{ab4}'), ('\u{aba}', '\u{abb}'), ('\u{ac6}', '\u{ac6}'), + ('\u{aca}', '\u{aca}'), ('\u{ace}', '\u{acf}'), ('\u{ad1}', '\u{adf}'), + ('\u{ae4}', '\u{ae5}'), 
('\u{af2}', '\u{af8}'), ('\u{b00}', '\u{b00}'), + ('\u{b04}', '\u{b04}'), ('\u{b0d}', '\u{b0e}'), ('\u{b11}', '\u{b12}'), + ('\u{b29}', '\u{b29}'), ('\u{b31}', '\u{b31}'), ('\u{b34}', '\u{b34}'), + ('\u{b3a}', '\u{b3b}'), ('\u{b45}', '\u{b46}'), ('\u{b49}', '\u{b4a}'), + ('\u{b4e}', '\u{b55}'), ('\u{b58}', '\u{b5b}'), ('\u{b5e}', '\u{b5e}'), + ('\u{b64}', '\u{b65}'), ('\u{b78}', '\u{b81}'), ('\u{b84}', '\u{b84}'), + ('\u{b8b}', '\u{b8d}'), ('\u{b91}', '\u{b91}'), ('\u{b96}', '\u{b98}'), + ('\u{b9b}', '\u{b9b}'), ('\u{b9d}', '\u{b9d}'), ('\u{ba0}', '\u{ba2}'), + ('\u{ba5}', '\u{ba7}'), ('\u{bab}', '\u{bad}'), ('\u{bba}', '\u{bbd}'), + ('\u{bc3}', '\u{bc5}'), ('\u{bc9}', '\u{bc9}'), ('\u{bce}', '\u{bcf}'), + ('\u{bd1}', '\u{bd6}'), ('\u{bd8}', '\u{be5}'), ('\u{bfb}', '\u{bff}'), + ('\u{c04}', '\u{c04}'), ('\u{c0d}', '\u{c0d}'), ('\u{c11}', '\u{c11}'), + ('\u{c29}', '\u{c29}'), ('\u{c3a}', '\u{c3c}'), ('\u{c45}', '\u{c45}'), + ('\u{c49}', '\u{c49}'), ('\u{c4e}', '\u{c54}'), ('\u{c57}', '\u{c57}'), + ('\u{c5b}', '\u{c5f}'), ('\u{c64}', '\u{c65}'), ('\u{c70}', '\u{c77}'), + ('\u{c84}', '\u{c84}'), ('\u{c8d}', '\u{c8d}'), ('\u{c91}', '\u{c91}'), + ('\u{ca9}', '\u{ca9}'), ('\u{cb4}', '\u{cb4}'), ('\u{cba}', '\u{cbb}'), + ('\u{cc5}', '\u{cc5}'), ('\u{cc9}', '\u{cc9}'), ('\u{cce}', '\u{cd4}'), + ('\u{cd7}', '\u{cdd}'), ('\u{cdf}', '\u{cdf}'), ('\u{ce4}', '\u{ce5}'), + ('\u{cf0}', '\u{cf0}'), ('\u{cf3}', '\u{cff}'), ('\u{d04}', '\u{d04}'), + ('\u{d0d}', '\u{d0d}'), ('\u{d11}', '\u{d11}'), ('\u{d45}', '\u{d45}'), + ('\u{d49}', '\u{d49}'), ('\u{d50}', '\u{d53}'), ('\u{d64}', '\u{d65}'), + ('\u{d80}', '\u{d81}'), ('\u{d84}', '\u{d84}'), ('\u{d97}', '\u{d99}'), + ('\u{db2}', '\u{db2}'), ('\u{dbc}', '\u{dbc}'), ('\u{dbe}', '\u{dbf}'), + ('\u{dc7}', '\u{dc9}'), ('\u{dcb}', '\u{dce}'), ('\u{dd5}', '\u{dd5}'), + ('\u{dd7}', '\u{dd7}'), ('\u{de0}', '\u{de5}'), ('\u{df0}', '\u{df1}'), + ('\u{df5}', '\u{e00}'), ('\u{e3b}', '\u{e3e}'), ('\u{e5c}', '\u{e80}'), + ('\u{e83}', '\u{e83}'), 
('\u{e85}', '\u{e86}'), ('\u{e89}', '\u{e89}'), + ('\u{e8b}', '\u{e8c}'), ('\u{e8e}', '\u{e93}'), ('\u{e98}', '\u{e98}'), + ('\u{ea0}', '\u{ea0}'), ('\u{ea4}', '\u{ea4}'), ('\u{ea6}', '\u{ea6}'), + ('\u{ea8}', '\u{ea9}'), ('\u{eac}', '\u{eac}'), ('\u{eba}', '\u{eba}'), + ('\u{ebe}', '\u{ebf}'), ('\u{ec5}', '\u{ec5}'), ('\u{ec7}', '\u{ec7}'), + ('\u{ece}', '\u{ecf}'), ('\u{eda}', '\u{edb}'), ('\u{ee0}', '\u{eff}'), + ('\u{f48}', '\u{f48}'), ('\u{f6d}', '\u{f70}'), ('\u{f98}', '\u{f98}'), + ('\u{fbd}', '\u{fbd}'), ('\u{fcd}', '\u{fcd}'), ('\u{fdb}', '\u{fff}'), + ('\u{10c6}', '\u{10c6}'), ('\u{10c8}', '\u{10cc}'), + ('\u{10ce}', '\u{10cf}'), ('\u{1249}', '\u{1249}'), + ('\u{124e}', '\u{124f}'), ('\u{1257}', '\u{1257}'), + ('\u{1259}', '\u{1259}'), ('\u{125e}', '\u{125f}'), + ('\u{1289}', '\u{1289}'), ('\u{128e}', '\u{128f}'), + ('\u{12b1}', '\u{12b1}'), ('\u{12b6}', '\u{12b7}'), + ('\u{12bf}', '\u{12bf}'), ('\u{12c1}', '\u{12c1}'), + ('\u{12c6}', '\u{12c7}'), ('\u{12d7}', '\u{12d7}'), + ('\u{1311}', '\u{1311}'), ('\u{1316}', '\u{1317}'), + ('\u{135b}', '\u{135c}'), ('\u{137d}', '\u{137f}'), + ('\u{139a}', '\u{139f}'), ('\u{13f6}', '\u{13f7}'), + ('\u{13fe}', '\u{13ff}'), ('\u{169d}', '\u{169f}'), + ('\u{16f9}', '\u{16ff}'), ('\u{170d}', '\u{170d}'), + ('\u{1715}', '\u{171f}'), ('\u{1737}', '\u{173f}'), + ('\u{1754}', '\u{175f}'), ('\u{176d}', '\u{176d}'), + ('\u{1771}', '\u{1771}'), ('\u{1774}', '\u{177f}'), + ('\u{17de}', '\u{17df}'), ('\u{17ea}', '\u{17ef}'), + ('\u{17fa}', '\u{17ff}'), ('\u{180e}', '\u{180f}'), + ('\u{181a}', '\u{181f}'), ('\u{1878}', '\u{187f}'), + ('\u{18ab}', '\u{18af}'), ('\u{18f6}', '\u{18ff}'), + ('\u{191f}', '\u{191f}'), ('\u{192c}', '\u{192f}'), + ('\u{193c}', '\u{193f}'), ('\u{1941}', '\u{1943}'), + ('\u{196e}', '\u{196f}'), ('\u{1975}', '\u{197f}'), + ('\u{19ac}', '\u{19af}'), ('\u{19ca}', '\u{19cf}'), + ('\u{19db}', '\u{19dd}'), ('\u{1a1c}', '\u{1a1d}'), + ('\u{1a5f}', '\u{1a5f}'), ('\u{1a7d}', '\u{1a7e}'), + ('\u{1a8a}', '\u{1a8f}'), 
('\u{1a9a}', '\u{1a9f}'), + ('\u{1aae}', '\u{1aaf}'), ('\u{1abf}', '\u{1aff}'), + ('\u{1b4c}', '\u{1b4f}'), ('\u{1b7d}', '\u{1b7f}'), + ('\u{1bf4}', '\u{1bfb}'), ('\u{1c38}', '\u{1c3a}'), + ('\u{1c4a}', '\u{1c4c}'), ('\u{1c89}', '\u{1cbf}'), + ('\u{1cc8}', '\u{1ccf}'), ('\u{1cfa}', '\u{1cff}'), + ('\u{1dfa}', '\u{1dfa}'), ('\u{1f16}', '\u{1f17}'), + ('\u{1f1e}', '\u{1f1f}'), ('\u{1f46}', '\u{1f47}'), + ('\u{1f4e}', '\u{1f4f}'), ('\u{1f58}', '\u{1f58}'), + ('\u{1f5a}', '\u{1f5a}'), ('\u{1f5c}', '\u{1f5c}'), + ('\u{1f5e}', '\u{1f5e}'), ('\u{1f7e}', '\u{1f7f}'), + ('\u{1fb5}', '\u{1fb5}'), ('\u{1fc5}', '\u{1fc5}'), + ('\u{1fd4}', '\u{1fd5}'), ('\u{1fdc}', '\u{1fdc}'), + ('\u{1ff0}', '\u{1ff1}'), ('\u{1ff5}', '\u{1ff5}'), + ('\u{1fff}', '\u{1fff}'), ('\u{200b}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), ('\u{2060}', '\u{206f}'), + ('\u{2072}', '\u{2073}'), ('\u{208f}', '\u{208f}'), + ('\u{209d}', '\u{209f}'), ('\u{20c0}', '\u{20cf}'), + ('\u{20f1}', '\u{20ff}'), ('\u{218c}', '\u{218f}'), + ('\u{2427}', '\u{243f}'), ('\u{244b}', '\u{245f}'), + ('\u{2b74}', '\u{2b75}'), ('\u{2b96}', '\u{2b97}'), + ('\u{2bba}', '\u{2bbc}'), ('\u{2bc9}', '\u{2bc9}'), + ('\u{2bd3}', '\u{2beb}'), ('\u{2bf0}', '\u{2bff}'), + ('\u{2c2f}', '\u{2c2f}'), ('\u{2c5f}', '\u{2c5f}'), + ('\u{2cf4}', '\u{2cf8}'), ('\u{2d26}', '\u{2d26}'), + ('\u{2d28}', '\u{2d2c}'), ('\u{2d2e}', '\u{2d2f}'), + ('\u{2d68}', '\u{2d6e}'), ('\u{2d71}', '\u{2d7e}'), + ('\u{2d97}', '\u{2d9f}'), ('\u{2da7}', '\u{2da7}'), + ('\u{2daf}', '\u{2daf}'), ('\u{2db7}', '\u{2db7}'), + ('\u{2dbf}', '\u{2dbf}'), ('\u{2dc7}', '\u{2dc7}'), + ('\u{2dcf}', '\u{2dcf}'), ('\u{2dd7}', '\u{2dd7}'), + ('\u{2ddf}', '\u{2ddf}'), ('\u{2e4a}', '\u{2e7f}'), + ('\u{2e9a}', '\u{2e9a}'), ('\u{2ef4}', '\u{2eff}'), + ('\u{2fd6}', '\u{2fef}'), ('\u{2ffc}', '\u{2fff}'), + ('\u{3040}', '\u{3040}'), ('\u{3097}', '\u{3098}'), + ('\u{3100}', '\u{3104}'), ('\u{312f}', '\u{3130}'), + ('\u{318f}', '\u{318f}'), ('\u{31bb}', '\u{31bf}'), + ('\u{31e4}', '\u{31ef}'), 
('\u{321f}', '\u{321f}'), + ('\u{32ff}', '\u{32ff}'), ('\u{4db6}', '\u{4dbf}'), + ('\u{9feb}', '\u{9fff}'), ('\u{a48d}', '\u{a48f}'), + ('\u{a4c7}', '\u{a4cf}'), ('\u{a62c}', '\u{a63f}'), + ('\u{a6f8}', '\u{a6ff}'), ('\u{a7af}', '\u{a7af}'), + ('\u{a7b8}', '\u{a7f6}'), ('\u{a82c}', '\u{a82f}'), + ('\u{a83a}', '\u{a83f}'), ('\u{a878}', '\u{a87f}'), + ('\u{a8c6}', '\u{a8cd}'), ('\u{a8da}', '\u{a8df}'), + ('\u{a8fe}', '\u{a8ff}'), ('\u{a954}', '\u{a95e}'), + ('\u{a97d}', '\u{a97f}'), ('\u{a9ce}', '\u{a9ce}'), + ('\u{a9da}', '\u{a9dd}'), ('\u{a9ff}', '\u{a9ff}'), + ('\u{aa37}', '\u{aa3f}'), ('\u{aa4e}', '\u{aa4f}'), + ('\u{aa5a}', '\u{aa5b}'), ('\u{aac3}', '\u{aada}'), + ('\u{aaf7}', '\u{ab00}'), ('\u{ab07}', '\u{ab08}'), + ('\u{ab0f}', '\u{ab10}'), ('\u{ab17}', '\u{ab1f}'), + ('\u{ab27}', '\u{ab27}'), ('\u{ab2f}', '\u{ab2f}'), + ('\u{ab66}', '\u{ab6f}'), ('\u{abee}', '\u{abef}'), + ('\u{abfa}', '\u{abff}'), ('\u{d7a4}', '\u{d7af}'), + ('\u{d7c7}', '\u{d7ca}'), ('\u{d7fc}', '\u{f8ff}'), + ('\u{fa6e}', '\u{fa6f}'), ('\u{fada}', '\u{faff}'), + ('\u{fb07}', '\u{fb12}'), ('\u{fb18}', '\u{fb1c}'), + ('\u{fb37}', '\u{fb37}'), ('\u{fb3d}', '\u{fb3d}'), + ('\u{fb3f}', '\u{fb3f}'), ('\u{fb42}', '\u{fb42}'), + ('\u{fb45}', '\u{fb45}'), ('\u{fbc2}', '\u{fbd2}'), + ('\u{fd40}', '\u{fd4f}'), ('\u{fd90}', '\u{fd91}'), + ('\u{fdc8}', '\u{fdef}'), ('\u{fdfe}', '\u{fdff}'), + ('\u{fe1a}', '\u{fe1f}'), ('\u{fe53}', '\u{fe53}'), + ('\u{fe67}', '\u{fe67}'), ('\u{fe6c}', '\u{fe6f}'), + ('\u{fe75}', '\u{fe75}'), ('\u{fefd}', '\u{ff00}'), + ('\u{ffbf}', '\u{ffc1}'), ('\u{ffc8}', '\u{ffc9}'), + ('\u{ffd0}', '\u{ffd1}'), ('\u{ffd8}', '\u{ffd9}'), + ('\u{ffdd}', '\u{ffdf}'), ('\u{ffe7}', '\u{ffe7}'), + ('\u{ffef}', '\u{fffb}'), ('\u{fffe}', '\u{ffff}'), + ('\u{1000c}', '\u{1000c}'), ('\u{10027}', '\u{10027}'), + ('\u{1003b}', '\u{1003b}'), ('\u{1003e}', '\u{1003e}'), + ('\u{1004e}', '\u{1004f}'), ('\u{1005e}', '\u{1007f}'), + ('\u{100fb}', '\u{100ff}'), ('\u{10103}', '\u{10106}'), + 
('\u{10134}', '\u{10136}'), ('\u{1018f}', '\u{1018f}'), + ('\u{1019c}', '\u{1019f}'), ('\u{101a1}', '\u{101cf}'), + ('\u{101fe}', '\u{1027f}'), ('\u{1029d}', '\u{1029f}'), + ('\u{102d1}', '\u{102df}'), ('\u{102fc}', '\u{102ff}'), + ('\u{10324}', '\u{1032c}'), ('\u{1034b}', '\u{1034f}'), + ('\u{1037b}', '\u{1037f}'), ('\u{1039e}', '\u{1039e}'), + ('\u{103c4}', '\u{103c7}'), ('\u{103d6}', '\u{103ff}'), + ('\u{1049e}', '\u{1049f}'), ('\u{104aa}', '\u{104af}'), + ('\u{104d4}', '\u{104d7}'), ('\u{104fc}', '\u{104ff}'), + ('\u{10528}', '\u{1052f}'), ('\u{10564}', '\u{1056e}'), + ('\u{10570}', '\u{105ff}'), ('\u{10737}', '\u{1073f}'), + ('\u{10756}', '\u{1075f}'), ('\u{10768}', '\u{107ff}'), + ('\u{10806}', '\u{10807}'), ('\u{10809}', '\u{10809}'), + ('\u{10836}', '\u{10836}'), ('\u{10839}', '\u{1083b}'), + ('\u{1083d}', '\u{1083e}'), ('\u{10856}', '\u{10856}'), + ('\u{1089f}', '\u{108a6}'), ('\u{108b0}', '\u{108df}'), + ('\u{108f3}', '\u{108f3}'), ('\u{108f6}', '\u{108fa}'), + ('\u{1091c}', '\u{1091e}'), ('\u{1093a}', '\u{1093e}'), + ('\u{10940}', '\u{1097f}'), ('\u{109b8}', '\u{109bb}'), + ('\u{109d0}', '\u{109d1}'), ('\u{10a04}', '\u{10a04}'), + ('\u{10a07}', '\u{10a0b}'), ('\u{10a14}', '\u{10a14}'), + ('\u{10a18}', '\u{10a18}'), ('\u{10a34}', '\u{10a37}'), + ('\u{10a3b}', '\u{10a3e}'), ('\u{10a48}', '\u{10a4f}'), + ('\u{10a59}', '\u{10a5f}'), ('\u{10aa0}', '\u{10abf}'), + ('\u{10ae7}', '\u{10aea}'), ('\u{10af7}', '\u{10aff}'), + ('\u{10b36}', '\u{10b38}'), ('\u{10b56}', '\u{10b57}'), + ('\u{10b73}', '\u{10b77}'), ('\u{10b92}', '\u{10b98}'), + ('\u{10b9d}', '\u{10ba8}'), ('\u{10bb0}', '\u{10bff}'), + ('\u{10c49}', '\u{10c7f}'), ('\u{10cb3}', '\u{10cbf}'), + ('\u{10cf3}', '\u{10cf9}'), ('\u{10d00}', '\u{10e5f}'), + ('\u{10e7f}', '\u{10fff}'), ('\u{1104e}', '\u{11051}'), + ('\u{11070}', '\u{1107e}'), ('\u{110bd}', '\u{110bd}'), + ('\u{110c2}', '\u{110cf}'), ('\u{110e9}', '\u{110ef}'), + ('\u{110fa}', '\u{110ff}'), ('\u{11135}', '\u{11135}'), + ('\u{11144}', '\u{1114f}'), 
('\u{11177}', '\u{1117f}'), + ('\u{111ce}', '\u{111cf}'), ('\u{111e0}', '\u{111e0}'), + ('\u{111f5}', '\u{111ff}'), ('\u{11212}', '\u{11212}'), + ('\u{1123f}', '\u{1127f}'), ('\u{11287}', '\u{11287}'), + ('\u{11289}', '\u{11289}'), ('\u{1128e}', '\u{1128e}'), + ('\u{1129e}', '\u{1129e}'), ('\u{112aa}', '\u{112af}'), + ('\u{112eb}', '\u{112ef}'), ('\u{112fa}', '\u{112ff}'), + ('\u{11304}', '\u{11304}'), ('\u{1130d}', '\u{1130e}'), + ('\u{11311}', '\u{11312}'), ('\u{11329}', '\u{11329}'), + ('\u{11331}', '\u{11331}'), ('\u{11334}', '\u{11334}'), + ('\u{1133a}', '\u{1133b}'), ('\u{11345}', '\u{11346}'), + ('\u{11349}', '\u{1134a}'), ('\u{1134e}', '\u{1134f}'), + ('\u{11351}', '\u{11356}'), ('\u{11358}', '\u{1135c}'), + ('\u{11364}', '\u{11365}'), ('\u{1136d}', '\u{1136f}'), + ('\u{11375}', '\u{113ff}'), ('\u{1145a}', '\u{1145a}'), + ('\u{1145c}', '\u{1145c}'), ('\u{1145e}', '\u{1147f}'), + ('\u{114c8}', '\u{114cf}'), ('\u{114da}', '\u{1157f}'), + ('\u{115b6}', '\u{115b7}'), ('\u{115de}', '\u{115ff}'), + ('\u{11645}', '\u{1164f}'), ('\u{1165a}', '\u{1165f}'), + ('\u{1166d}', '\u{1167f}'), ('\u{116b8}', '\u{116bf}'), + ('\u{116ca}', '\u{116ff}'), ('\u{1171a}', '\u{1171c}'), + ('\u{1172c}', '\u{1172f}'), ('\u{11740}', '\u{1189f}'), + ('\u{118f3}', '\u{118fe}'), ('\u{11900}', '\u{119ff}'), + ('\u{11a48}', '\u{11a4f}'), ('\u{11a84}', '\u{11a85}'), + ('\u{11a9d}', '\u{11a9d}'), ('\u{11aa3}', '\u{11abf}'), + ('\u{11af9}', '\u{11bff}'), ('\u{11c09}', '\u{11c09}'), + ('\u{11c37}', '\u{11c37}'), ('\u{11c46}', '\u{11c4f}'), + ('\u{11c6d}', '\u{11c6f}'), ('\u{11c90}', '\u{11c91}'), + ('\u{11ca8}', '\u{11ca8}'), ('\u{11cb7}', '\u{11cff}'), + ('\u{11d07}', '\u{11d07}'), ('\u{11d0a}', '\u{11d0a}'), + ('\u{11d37}', '\u{11d39}'), ('\u{11d3b}', '\u{11d3b}'), + ('\u{11d3e}', '\u{11d3e}'), ('\u{11d48}', '\u{11d4f}'), + ('\u{11d5a}', '\u{11fff}'), ('\u{1239a}', '\u{123ff}'), + ('\u{1246f}', '\u{1246f}'), ('\u{12475}', '\u{1247f}'), + ('\u{12544}', '\u{12fff}'), ('\u{1342f}', '\u{143ff}'), 
+ ('\u{14647}', '\u{167ff}'), ('\u{16a39}', '\u{16a3f}'), + ('\u{16a5f}', '\u{16a5f}'), ('\u{16a6a}', '\u{16a6d}'), + ('\u{16a70}', '\u{16acf}'), ('\u{16aee}', '\u{16aef}'), + ('\u{16af6}', '\u{16aff}'), ('\u{16b46}', '\u{16b4f}'), + ('\u{16b5a}', '\u{16b5a}'), ('\u{16b62}', '\u{16b62}'), + ('\u{16b78}', '\u{16b7c}'), ('\u{16b90}', '\u{16eff}'), + ('\u{16f45}', '\u{16f4f}'), ('\u{16f7f}', '\u{16f8e}'), + ('\u{16fa0}', '\u{16fdf}'), ('\u{16fe2}', '\u{16fff}'), + ('\u{187ed}', '\u{187ff}'), ('\u{18af3}', '\u{1afff}'), + ('\u{1b11f}', '\u{1b16f}'), ('\u{1b2fc}', '\u{1bbff}'), + ('\u{1bc6b}', '\u{1bc6f}'), ('\u{1bc7d}', '\u{1bc7f}'), + ('\u{1bc89}', '\u{1bc8f}'), ('\u{1bc9a}', '\u{1bc9b}'), + ('\u{1bca0}', '\u{1cfff}'), ('\u{1d0f6}', '\u{1d0ff}'), + ('\u{1d127}', '\u{1d128}'), ('\u{1d173}', '\u{1d17a}'), + ('\u{1d1e9}', '\u{1d1ff}'), ('\u{1d246}', '\u{1d2ff}'), + ('\u{1d357}', '\u{1d35f}'), ('\u{1d372}', '\u{1d3ff}'), + ('\u{1d455}', '\u{1d455}'), ('\u{1d49d}', '\u{1d49d}'), + ('\u{1d4a0}', '\u{1d4a1}'), ('\u{1d4a3}', '\u{1d4a4}'), + ('\u{1d4a7}', '\u{1d4a8}'), ('\u{1d4ad}', '\u{1d4ad}'), + ('\u{1d4ba}', '\u{1d4ba}'), ('\u{1d4bc}', '\u{1d4bc}'), + ('\u{1d4c4}', '\u{1d4c4}'), ('\u{1d506}', '\u{1d506}'), + ('\u{1d50b}', '\u{1d50c}'), ('\u{1d515}', '\u{1d515}'), + ('\u{1d51d}', '\u{1d51d}'), ('\u{1d53a}', '\u{1d53a}'), + ('\u{1d53f}', '\u{1d53f}'), ('\u{1d545}', '\u{1d545}'), + ('\u{1d547}', '\u{1d549}'), ('\u{1d551}', '\u{1d551}'), + ('\u{1d6a6}', '\u{1d6a7}'), ('\u{1d7cc}', '\u{1d7cd}'), + ('\u{1da8c}', '\u{1da9a}'), ('\u{1daa0}', '\u{1daa0}'), + ('\u{1dab0}', '\u{1dfff}'), ('\u{1e007}', '\u{1e007}'), + ('\u{1e019}', '\u{1e01a}'), ('\u{1e022}', '\u{1e022}'), + ('\u{1e025}', '\u{1e025}'), ('\u{1e02b}', '\u{1e7ff}'), + ('\u{1e8c5}', '\u{1e8c6}'), ('\u{1e8d7}', '\u{1e8ff}'), + ('\u{1e94b}', '\u{1e94f}'), ('\u{1e95a}', '\u{1e95d}'), + ('\u{1e960}', '\u{1edff}'), ('\u{1ee04}', '\u{1ee04}'), + ('\u{1ee20}', '\u{1ee20}'), ('\u{1ee23}', '\u{1ee23}'), + ('\u{1ee25}', 
'\u{1ee26}'), ('\u{1ee28}', '\u{1ee28}'), + ('\u{1ee33}', '\u{1ee33}'), ('\u{1ee38}', '\u{1ee38}'), + ('\u{1ee3a}', '\u{1ee3a}'), ('\u{1ee3c}', '\u{1ee41}'), + ('\u{1ee43}', '\u{1ee46}'), ('\u{1ee48}', '\u{1ee48}'), + ('\u{1ee4a}', '\u{1ee4a}'), ('\u{1ee4c}', '\u{1ee4c}'), + ('\u{1ee50}', '\u{1ee50}'), ('\u{1ee53}', '\u{1ee53}'), + ('\u{1ee55}', '\u{1ee56}'), ('\u{1ee58}', '\u{1ee58}'), + ('\u{1ee5a}', '\u{1ee5a}'), ('\u{1ee5c}', '\u{1ee5c}'), + ('\u{1ee5e}', '\u{1ee5e}'), ('\u{1ee60}', '\u{1ee60}'), + ('\u{1ee63}', '\u{1ee63}'), ('\u{1ee65}', '\u{1ee66}'), + ('\u{1ee6b}', '\u{1ee6b}'), ('\u{1ee73}', '\u{1ee73}'), + ('\u{1ee78}', '\u{1ee78}'), ('\u{1ee7d}', '\u{1ee7d}'), + ('\u{1ee7f}', '\u{1ee7f}'), ('\u{1ee8a}', '\u{1ee8a}'), + ('\u{1ee9c}', '\u{1eea0}'), ('\u{1eea4}', '\u{1eea4}'), + ('\u{1eeaa}', '\u{1eeaa}'), ('\u{1eebc}', '\u{1eeef}'), + ('\u{1eef2}', '\u{1efff}'), ('\u{1f02c}', '\u{1f02f}'), + ('\u{1f094}', '\u{1f09f}'), ('\u{1f0af}', '\u{1f0b0}'), + ('\u{1f0c0}', '\u{1f0c0}'), ('\u{1f0d0}', '\u{1f0d0}'), + ('\u{1f0f6}', '\u{1f0ff}'), ('\u{1f10d}', '\u{1f10f}'), + ('\u{1f12f}', '\u{1f12f}'), ('\u{1f16c}', '\u{1f16f}'), + ('\u{1f1ad}', '\u{1f1e5}'), ('\u{1f203}', '\u{1f20f}'), + ('\u{1f23c}', '\u{1f23f}'), ('\u{1f249}', '\u{1f24f}'), + ('\u{1f252}', '\u{1f25f}'), ('\u{1f266}', '\u{1f2ff}'), + ('\u{1f6d5}', '\u{1f6df}'), ('\u{1f6ed}', '\u{1f6ef}'), + ('\u{1f6f9}', '\u{1f6ff}'), ('\u{1f774}', '\u{1f77f}'), + ('\u{1f7d5}', '\u{1f7ff}'), ('\u{1f80c}', '\u{1f80f}'), + ('\u{1f848}', '\u{1f84f}'), ('\u{1f85a}', '\u{1f85f}'), + ('\u{1f888}', '\u{1f88f}'), ('\u{1f8ae}', '\u{1f8ff}'), + ('\u{1f90c}', '\u{1f90f}'), ('\u{1f93f}', '\u{1f93f}'), + ('\u{1f94d}', '\u{1f94f}'), ('\u{1f96c}', '\u{1f97f}'), + ('\u{1f998}', '\u{1f9bf}'), ('\u{1f9c1}', '\u{1f9cf}'), + ('\u{1f9e7}', '\u{1ffff}'), ('\u{2a6d7}', '\u{2a6ff}'), + ('\u{2b735}', '\u{2b73f}'), ('\u{2b81e}', '\u{2b81f}'), + ('\u{2cea2}', '\u{2ceaf}'), ('\u{2ebe1}', '\u{2f7ff}'), + ('\u{2fa1e}', '\u{e00ff}'), ('\u{e01f0}', 
'\u{10ffff}'), +]; + +pub const OTHER_LETTER: &'static [(char, char)] = &[ + ('ª', 'ª'), ('º', 'º'), ('ƻ', 'ƻ'), ('ǀ', 'ǃ'), ('ʔ', 'ʔ'), + ('א', 'ת'), ('װ', 'ײ'), ('ؠ', 'ؿ'), ('ف', 'ي'), ('ٮ', 'ٯ'), + ('ٱ', 'ۓ'), ('ە', 'ە'), ('ۮ', 'ۯ'), ('ۺ', 'ۼ'), ('ۿ', 'ۿ'), + ('ܐ', 'ܐ'), ('ܒ', 'ܯ'), ('ݍ', 'ޥ'), ('ޱ', 'ޱ'), ('ߊ', 'ߪ'), + ('ࠀ', 'ࠕ'), ('ࡀ', 'ࡘ'), ('ࡠ', 'ࡪ'), ('ࢠ', 'ࢴ'), + ('ࢶ', 'ࢽ'), ('ऄ', 'ह'), ('ऽ', 'ऽ'), ('ॐ', 'ॐ'), + ('क़', 'ॡ'), ('ॲ', 'ঀ'), ('অ', 'ঌ'), ('এ', 'ঐ'), + ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), + ('ঽ', 'ঽ'), ('ৎ', 'ৎ'), ('ড়', 'ঢ়'), ('য়', 'ৡ'), + ('ৰ', 'ৱ'), ('ৼ', 'ৼ'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('ੲ', 'ੴ'), + ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), + ('લ', 'ળ'), ('વ', 'હ'), ('ઽ', 'ઽ'), ('ૐ', 'ૐ'), + ('ૠ', 'ૡ'), ('ૹ', 'ૹ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'), + ('ଽ', 'ଽ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୡ'), ('ୱ', 'ୱ'), + ('ஃ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), + ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), + ('ந', 'ப'), ('ம', 'ஹ'), ('ௐ', 'ௐ'), ('అ', 'ఌ'), + ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), ('ఽ', 'ఽ'), + ('ౘ', 'ౚ'), ('ౠ', 'ౡ'), ('ಀ', 'ಀ'), ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), + ('ಽ', 'ಽ'), ('ೞ', 'ೞ'), ('ೠ', 'ೡ'), ('ೱ', 'ೲ'), + ('അ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ഺ'), ('ഽ', 'ഽ'), + ('ൎ', 'ൎ'), ('ൔ', 'ൖ'), ('ൟ', 'ൡ'), ('ൺ', 'ൿ'), + ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), ('ල', 'ල'), + ('ව', 'ෆ'), ('ก', 'ะ'), ('า', 'ำ'), ('เ', 'ๅ'), + ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ງ', 'ຈ'), ('ຊ', 'ຊ'), + ('ຍ', 'ຍ'), ('ດ', 'ທ'), ('ນ', 'ຟ'), ('ມ', 'ຣ'), + ('ລ', 'ລ'), ('ວ', 'ວ'), ('ສ', 'ຫ'), ('ອ', 'ະ'), + ('າ', 'ຳ'), ('ຽ', 'ຽ'), ('ເ', 'ໄ'), ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), ('ཀ', 'ཇ'), ('ཉ', 'ཬ'), ('ྈ', 'ྌ'), + ('က', 'ဪ'), ('ဿ', 'ဿ'), ('ၐ', 'ၕ'), ('ၚ', 'ၝ'), + ('ၡ', 'ၡ'), ('ၥ', 'ၦ'), ('ၮ', 'ၰ'), ('ၵ', 'ႁ'), + ('ႎ', 'ႎ'), ('ა', 'ჺ'), ('ჽ', 'ቈ'), ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኈ'), + ('ኊ', 'ኍ'), 
('ነ', 'ኰ'), ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('ᎀ', 'ᎏ'), ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛱ', 'ᛸ'), + ('ᜀ', 'ᜌ'), ('ᜎ', 'ᜑ'), ('ᜠ', 'ᜱ'), ('ᝀ', 'ᝑ'), + ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('ក', 'ឳ'), ('ៜ', 'ៜ'), + ('ᠠ', 'ᡂ'), ('ᡄ', 'ᡷ'), ('ᢀ', 'ᢄ'), ('ᢇ', 'ᢨ'), + ('ᢪ', 'ᢪ'), ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('ᥐ', 'ᥭ'), + ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('ᨀ', 'ᨖ'), + ('ᨠ', 'ᩔ'), ('ᬅ', 'ᬳ'), ('ᭅ', 'ᭋ'), ('ᮃ', 'ᮠ'), + ('ᮮ', 'ᮯ'), ('ᮺ', 'ᯥ'), ('ᰀ', 'ᰣ'), ('ᱍ', 'ᱏ'), + ('ᱚ', 'ᱷ'), ('ᳩ', 'ᳬ'), ('ᳮ', 'ᳱ'), ('ᳵ', 'ᳶ'), + ('ℵ', 'ℸ'), ('ⴰ', 'ⵧ'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('〆', '〆'), + ('〼', '〼'), ('ぁ', 'ゖ'), ('ゟ', 'ゟ'), ('ァ', 'ヺ'), + ('ヿ', 'ヿ'), ('ㄅ', 'ㄮ'), ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆺ'), + ('ㇰ', 'ㇿ'), ('㐀', '䶵'), ('一', '鿪'), ('ꀀ', 'ꀔ'), + ('ꀖ', 'ꒌ'), ('ꓐ', 'ꓷ'), ('ꔀ', 'ꘋ'), ('ꘐ', 'ꘟ'), + ('ꘪ', 'ꘫ'), ('ꙮ', 'ꙮ'), ('ꚠ', 'ꛥ'), ('ꞏ', 'ꞏ'), + ('ꟷ', 'ꟷ'), ('ꟻ', 'ꠁ'), ('ꠃ', 'ꠅ'), ('ꠇ', 'ꠊ'), + ('ꠌ', 'ꠢ'), ('ꡀ', 'ꡳ'), ('ꢂ', 'ꢳ'), ('ꣲ', 'ꣷ'), + ('ꣻ', 'ꣻ'), ('ꣽ', 'ꣽ'), ('ꤊ', 'ꤥ'), ('ꤰ', 'ꥆ'), + ('ꥠ', 'ꥼ'), ('ꦄ', 'ꦲ'), ('ꧠ', 'ꧤ'), ('ꧧ', 'ꧯ'), + ('ꧺ', 'ꧾ'), ('ꨀ', 'ꨨ'), ('ꩀ', 'ꩂ'), ('ꩄ', 'ꩋ'), + ('ꩠ', 'ꩯ'), ('ꩱ', 'ꩶ'), ('ꩺ', 'ꩺ'), ('ꩾ', 'ꪯ'), + ('ꪱ', 'ꪱ'), ('ꪵ', 'ꪶ'), ('ꪹ', 'ꪽ'), ('ꫀ', 'ꫀ'), + ('ꫂ', 'ꫂ'), ('ꫛ', 'ꫜ'), ('ꫠ', 'ꫪ'), ('ꫲ', 'ꫲ'), + ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), ('ꯀ', 'ꯢ'), ('가', '힣'), ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), ('豈', '舘'), ('並', '龎'), ('יִ', 'יִ'), + ('ײַ', 'ﬨ'), ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), + ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷻ'), ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), ('ヲ', 'ッ'), ('ア', 'ン'), ('ᅠ', 'ᄒ'), + ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), + ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐊀', '𐊜'), + ('𐊠', '𐋐'), ('𐌀', '𐌟'), ('𐌭', '𐍀'), ('𐍂', '𐍉'), + ('𐍐', '𐍵'), ('𐎀', '𐎝'), ('𐎠', '𐏃'), ('𐏈', '𐏏'), 
+ ('𐑐', '𐒝'), ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐘀', '𐜶'), + ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), + ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), + ('𐡠', '𐡶'), ('𐢀', '𐢞'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), + ('𐤀', '𐤕'), ('𐤠', '𐤹'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), + ('𐨀', '𐨀'), ('𐨐', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨳'), + ('𐩠', '𐩼'), ('𐪀', '𐪜'), ('𐫀', '𐫇'), ('𐫉', '𐫤'), + ('𐬀', '𐬵'), ('𐭀', '𐭕'), ('𐭠', '𐭲'), ('𐮀', '𐮑'), + ('𐰀', '𐱈'), ('𑀃', '𑀷'), ('𑂃', '𑂯'), ('𑃐', '𑃨'), + ('𑄃', '𑄦'), ('𑅐', '𑅲'), ('𑅶', '𑅶'), ('𑆃', '𑆲'), + ('𑇁', '𑇄'), ('𑇚', '𑇚'), ('𑇜', '𑇜'), ('𑈀', '𑈑'), + ('𑈓', '𑈫'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), + ('𑊏', '𑊝'), ('𑊟', '𑊨'), ('𑊰', '𑋞'), ('𑌅', '𑌌'), + ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), + ('𑌵', '𑌹'), ('𑌽', '𑌽'), ('𑍐', '𑍐'), ('𑍝', '𑍡'), + ('𑐀', '𑐴'), ('𑑇', '𑑊'), ('𑒀', '𑒯'), ('𑓄', '𑓅'), + ('𑓇', '𑓇'), ('𑖀', '𑖮'), ('𑗘', '𑗛'), ('𑘀', '𑘯'), + ('𑙄', '𑙄'), ('𑚀', '𑚪'), ('𑜀', '𑜙'), ('𑣿', '𑣿'), + ('𑨀', '𑨀'), ('𑨋', '𑨲'), ('𑨺', '𑨺'), ('𑩐', '𑩐'), + ('𑩜', '𑪃'), ('𑪆', '𑪉'), ('𑫀', '𑫸'), ('𑰀', '𑰈'), + ('𑰊', '𑰮'), ('𑱀', '𑱀'), ('𑱲', '𑲏'), ('𑴀', '𑴆'), + ('𑴈', '𑴉'), ('𑴋', '𑴰'), ('𑵆', '𑵆'), ('𒀀', '𒎙'), + ('𒒀', '𒕃'), ('𓀀', '𓐮'), ('𔐀', '𔙆'), ('𖠀', '𖨸'), + ('𖩀', '𖩞'), ('𖫐', '𖫭'), ('𖬀', '𖬯'), ('𖭣', '𖭷'), + ('𖭽', '𖮏'), ('𖼀', '𖽄'), ('𖽐', '𖽐'), ('𗀀', '𘟬'), + ('𘠀', '𘫲'), ('𛀀', '𛄞'), ('𛅰', '𛋻'), ('𛰀', '𛱪'), + ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𞠀', '𞣄'), + ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), + ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), + ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), + ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), + ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), + ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), + ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), + ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), + ('𞺫', '𞺻'), ('𠀀', '𪛖'), ('𪜀', '𫜴'), ('𫝀', '𫠝'), + ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), +]; + +pub const OTHER_NUMBER: &'static [(char, char)] = &[ + ('²', '³'), ('¹', '¹'), ('¼', '¾'), ('৴', '৹'), ('୲', '୷'), + ('௰', '௲'), ('౸', '౾'), ('൘', '൞'), 
('൰', '൸'), + ('༪', '༳'), ('፩', '፼'), ('៰', '៹'), ('᧚', '᧚'), + ('⁰', '⁰'), ('⁴', '⁹'), ('₀', '₉'), ('⅐', '⅟'), + ('↉', '↉'), ('①', '⒛'), ('⓪', '⓿'), ('❶', '➓'), + ('⳽', '⳽'), ('㆒', '㆕'), ('㈠', '㈩'), ('㉈', '㉏'), + ('㉑', '㉟'), ('㊀', '㊉'), ('㊱', '㊿'), ('꠰', '꠵'), + ('𐄇', '𐄳'), ('𐅵', '𐅸'), ('𐆊', '𐆋'), ('𐋡', '𐋻'), + ('𐌠', '𐌣'), ('𐡘', '𐡟'), ('𐡹', '𐡿'), ('𐢧', '𐢯'), + ('𐣻', '𐣿'), ('𐤖', '𐤛'), ('𐦼', '𐦽'), ('𐧀', '𐧏'), + ('𐧒', '𐧿'), ('𐩀', '𐩇'), ('𐩽', '𐩾'), ('𐪝', '𐪟'), + ('𐫫', '𐫯'), ('𐭘', '𐭟'), ('𐭸', '𐭿'), ('𐮩', '𐮯'), + ('𐳺', '𐳿'), ('𐹠', '𐹾'), ('𑁒', '𑁥'), ('𑇡', '𑇴'), + ('𑜺', '𑜻'), ('𑣪', '𑣲'), ('𑱚', '𑱬'), ('𖭛', '𖭡'), + ('𝍠', '𝍱'), ('𞣇', '𞣏'), ('🄀', '🄌'), +]; + +pub const OTHER_PUNCTUATION: &'static [(char, char)] = &[ + ('!', '#'), ('%', '\''), ('*', '*'), (',', ','), ('.', '/'), (':', ';'), + ('?', '@'), ('\\', '\\'), ('¡', '¡'), ('§', '§'), ('¶', '·'), + ('¿', '¿'), (';', ';'), ('·', '·'), ('՚', '՟'), ('։', '։'), + ('׀', '׀'), ('׃', '׃'), ('׆', '׆'), ('׳', '״'), ('؉', '؊'), + ('،', '؍'), ('؛', '؛'), ('؞', '؟'), ('٪', '٭'), ('۔', '۔'), + ('܀', '܍'), ('߷', '߹'), ('࠰', '࠾'), ('࡞', '࡞'), ('।', '॥'), + ('॰', '॰'), ('৽', '৽'), ('૰', '૰'), ('෴', '෴'), + ('๏', '๏'), ('๚', '๛'), ('༄', '༒'), ('༔', '༔'), + ('྅', '྅'), ('࿐', '࿔'), ('࿙', '࿚'), ('၊', '၏'), + ('჻', '჻'), ('፠', '፨'), ('᙭', '᙮'), ('᛫', '᛭'), + ('᜵', '᜶'), ('។', '៖'), ('៘', '៚'), ('᠀', '᠅'), + ('᠇', '᠊'), ('᥄', '᥅'), ('᨞', '᨟'), ('᪠', '᪦'), + ('᪨', '᪭'), ('᭚', '᭠'), ('᯼', '᯿'), ('᰻', '᰿'), + ('᱾', '᱿'), ('᳀', '᳇'), ('᳓', '᳓'), ('‖', '‗'), + ('†', '‧'), ('‰', '‸'), ('※', '‾'), ('⁁', '⁃'), + ('⁇', '⁑'), ('⁓', '⁓'), ('⁕', '⁞'), ('⳹', '⳼'), + ('⳾', '⳿'), ('⵰', '⵰'), ('⸀', '⸁'), ('⸆', '⸈'), + ('⸋', '⸋'), ('⸎', '⸖'), ('⸘', '⸙'), ('⸛', '⸛'), + ('⸞', '⸟'), ('⸪', '⸮'), ('⸰', '⸹'), ('⸼', '⸿'), + ('⹁', '⹁'), ('⹃', '⹉'), ('、', '〃'), ('〽', '〽'), + ('・', '・'), ('꓾', '꓿'), ('꘍', '꘏'), ('꙳', '꙳'), + ('꙾', '꙾'), ('꛲', '꛷'), ('꡴', '꡷'), ('꣎', '꣏'), + ('꣸', '꣺'), ('꣼', '꣼'), ('꤮', '꤯'), ('꥟', '꥟'), + ('꧁', '꧍'), ('꧞', '꧟'), ('꩜', '꩟'), ('꫞', 
'꫟'), + ('꫰', '꫱'), ('꯫', '꯫'), ('︐', '︖'), ('︙', '︙'), + ('︰', '︰'), ('﹅', '﹆'), ('﹉', '﹌'), ('﹐', '﹒'), + ('﹔', '﹗'), ('﹟', '﹡'), ('﹨', '﹨'), ('﹪', '﹫'), + ('!', '#'), ('%', '''), ('*', '*'), (',', ','), + ('.', '/'), (':', ';'), ('?', '@'), ('\', '\'), + ('。', '。'), ('、', '・'), ('𐄀', '𐄂'), ('𐎟', '𐎟'), + ('𐏐', '𐏐'), ('𐕯', '𐕯'), ('𐡗', '𐡗'), ('𐤟', '𐤟'), + ('𐤿', '𐤿'), ('𐩐', '𐩘'), ('𐩿', '𐩿'), ('𐫰', '𐫶'), + ('𐬹', '𐬿'), ('𐮙', '𐮜'), ('𑁇', '𑁍'), ('𑂻', '𑂼'), + ('𑂾', '𑃁'), ('𑅀', '𑅃'), ('𑅴', '𑅵'), ('𑇅', '𑇉'), + ('𑇍', '𑇍'), ('𑇛', '𑇛'), ('𑇝', '𑇟'), ('𑈸', '𑈽'), + ('𑊩', '𑊩'), ('𑑋', '𑑏'), ('𑑛', '𑑛'), ('𑑝', '𑑝'), + ('𑓆', '𑓆'), ('𑗁', '𑗗'), ('𑙁', '𑙃'), ('𑙠', '𑙬'), + ('𑜼', '𑜾'), ('𑨿', '𑩆'), ('𑪚', '𑪜'), ('𑪞', '𑪢'), + ('𑱁', '𑱅'), ('𑱰', '𑱱'), ('𒑰', '𒑴'), ('𖩮', '𖩯'), + ('𖫵', '𖫵'), ('𖬷', '𖬻'), ('𖭄', '𖭄'), ('𛲟', '𛲟'), + ('𝪇', '𝪋'), ('𞥞', '𞥟'), +]; + +pub const OTHER_SYMBOL: &'static [(char, char)] = &[ + ('¦', '¦'), ('©', '©'), ('®', '®'), ('°', '°'), ('҂', '҂'), + ('֍', '֎'), ('؎', '؏'), ('۞', '۞'), ('۩', '۩'), ('۽', '۾'), + ('߶', '߶'), ('৺', '৺'), ('୰', '୰'), ('௳', '௸'), + ('௺', '௺'), ('౿', '౿'), ('൏', '൏'), ('൹', '൹'), + ('༁', '༃'), ('༓', '༓'), ('༕', '༗'), ('༚', '༟'), + ('༴', '༴'), ('༶', '༶'), ('༸', '༸'), ('྾', '࿅'), + ('࿇', '࿌'), ('࿎', '࿏'), ('࿕', '࿘'), ('႞', '႟'), + ('᎐', '᎙'), ('᥀', '᥀'), ('᧞', '᧿'), ('᭡', '᭪'), + ('᭴', '᭼'), ('℀', '℁'), ('℃', '℆'), ('℈', '℉'), + ('℔', '℔'), ('№', '℗'), ('℞', '℣'), ('℥', '℥'), + ('℧', '℧'), ('℩', '℩'), ('℮', '℮'), ('℺', '℻'), + ('⅊', '⅊'), ('⅌', '⅍'), ('⅏', '⅏'), ('↊', '↋'), + ('↕', '↙'), ('↜', '↟'), ('↡', '↢'), ('↤', '↥'), + ('↧', '↭'), ('↯', '⇍'), ('⇐', '⇑'), ('⇓', '⇓'), + ('⇕', '⇳'), ('⌀', '⌇'), ('⌌', '⌟'), ('⌢', '⌨'), + ('⌫', '⍻'), ('⍽', '⎚'), ('⎴', '⏛'), ('⏢', '␦'), + ('⑀', '⑊'), ('⒜', 'ⓩ'), ('─', '▶'), ('▸', '◀'), + ('◂', '◷'), ('☀', '♮'), ('♰', '❧'), ('➔', '➿'), + ('⠀', '⣿'), ('⬀', '⬯'), ('⭅', '⭆'), ('⭍', '⭳'), + ('⭶', '⮕'), ('⮘', '⮹'), ('⮽', '⯈'), ('⯊', '⯒'), + ('⯬', '⯯'), ('⳥', '⳪'), ('⺀', '⺙'), ('⺛', '⻳'), + ('⼀', '⿕'), ('⿰', '⿻'), ('〄', 
'〄'), ('〒', '〓'), + ('〠', '〠'), ('〶', '〷'), ('〾', '〿'), ('㆐', '㆑'), + ('㆖', '㆟'), ('㇀', '㇣'), ('㈀', '㈞'), ('㈪', '㉇'), + ('㉐', '㉐'), ('㉠', '㉿'), ('㊊', '㊰'), ('㋀', '㋾'), + ('㌀', '㏿'), ('䷀', '䷿'), ('꒐', '꓆'), ('꠨', '꠫'), + ('꠶', '꠷'), ('꠹', '꠹'), ('꩷', '꩹'), ('﷽', '﷽'), + ('¦', '¦'), ('│', '│'), ('■', '○'), ('', '�'), + ('𐄷', '𐄿'), ('𐅹', '𐆉'), ('𐆌', '𐆎'), ('𐆐', '𐆛'), + ('𐆠', '𐆠'), ('𐇐', '𐇼'), ('𐡷', '𐡸'), ('𐫈', '𐫈'), + ('𑜿', '𑜿'), ('𖬼', '𖬿'), ('𖭅', '𖭅'), ('𛲜', '𛲜'), + ('𝀀', '𝃵'), ('𝄀', '𝄦'), ('𝄩', '𝅘𝅥𝅲'), ('𝅪', '𝅬'), + ('𝆃', '𝆄'), ('𝆌', '𝆩'), ('𝆮', '𝇨'), ('𝈀', '𝉁'), + ('𝉅', '𝉅'), ('𝌀', '𝍖'), ('𝠀', '𝧿'), ('𝨷', '𝨺'), + ('𝩭', '𝩴'), ('𝩶', '𝪃'), ('𝪅', '𝪆'), ('🀀', '🀫'), + ('🀰', '🂓'), ('🂠', '🂮'), ('🂱', '🂿'), ('🃁', '🃏'), + ('🃑', '🃵'), ('🄐', '🄮'), ('🄰', '🅫'), ('🅰', '🆬'), + ('🇦', '🈂'), ('🈐', '🈻'), ('🉀', '🉈'), ('🉐', '🉑'), + ('🉠', '🉥'), ('🌀', '🏺'), ('🐀', '🛔'), ('🛠', '🛬'), + ('🛰', '🛸'), ('🜀', '🝳'), ('🞀', '🟔'), ('🠀', '🠋'), + ('🠐', '🡇'), ('🡐', '🡙'), ('🡠', '🢇'), ('🢐', '🢭'), + ('🤀', '🤋'), ('🤐', '🤾'), ('🥀', '🥌'), ('🥐', '🥫'), + ('🦀', '🦗'), ('🧀', '🧀'), ('🧐', '🧦'), +]; + +pub const PARAGRAPH_SEPARATOR: &'static [(char, char)] = &[ + ('\u{2029}', '\u{2029}'), +]; + +pub const PRIVATE_USE: &'static [(char, char)] = &[ + ('\u{e000}', '\u{f8ff}'), ('\u{f0000}', '\u{ffffd}'), + ('\u{100000}', '\u{10fffd}'), +]; + +pub const PUNCTUATION: &'static [(char, char)] = &[ + ('!', '#'), ('%', '*'), (',', '/'), (':', ';'), ('?', '@'), ('[', ']'), + ('_', '_'), ('{', '{'), ('}', '}'), ('¡', '¡'), ('§', '§'), + ('«', '«'), ('¶', '·'), ('»', '»'), ('¿', '¿'), (';', ';'), + ('·', '·'), ('՚', '՟'), ('։', '֊'), ('־', '־'), ('׀', '׀'), + ('׃', '׃'), ('׆', '׆'), ('׳', '״'), ('؉', '؊'), ('،', '؍'), + ('؛', '؛'), ('؞', '؟'), ('٪', '٭'), ('۔', '۔'), ('܀', '܍'), + ('߷', '߹'), ('࠰', '࠾'), ('࡞', '࡞'), ('।', '॥'), + ('॰', '॰'), ('৽', '৽'), ('૰', '૰'), ('෴', '෴'), + ('๏', '๏'), ('๚', '๛'), ('༄', '༒'), ('༔', '༔'), + ('༺', '༽'), ('྅', '྅'), ('࿐', '࿔'), ('࿙', '࿚'), + ('၊', '၏'), ('჻', '჻'), ('፠', '፨'), ('᐀', '᐀'), + ('᙭', 
'᙮'), ('᚛', '᚜'), ('᛫', '᛭'), ('᜵', '᜶'), + ('។', '៖'), ('៘', '៚'), ('᠀', '᠊'), ('᥄', '᥅'), + ('᨞', '᨟'), ('᪠', '᪦'), ('᪨', '᪭'), ('᭚', '᭠'), + ('᯼', '᯿'), ('᰻', '᰿'), ('᱾', '᱿'), ('᳀', '᳇'), + ('᳓', '᳓'), ('‐', '‧'), ('‰', '⁃'), ('⁅', '⁑'), + ('⁓', '⁞'), ('⁽', '⁾'), ('₍', '₎'), ('⌈', '⌋'), + ('〈', '〉'), ('❨', '❵'), ('⟅', '⟆'), ('⟦', '⟯'), + ('⦃', '⦘'), ('⧘', '⧛'), ('⧼', '⧽'), ('⳹', '⳼'), + ('⳾', '⳿'), ('⵰', '⵰'), ('⸀', '⸮'), ('⸰', '⹉'), + ('、', '〃'), ('〈', '】'), ('〔', '〟'), ('〰', '〰'), + ('〽', '〽'), ('゠', '゠'), ('・', '・'), ('꓾', '꓿'), + ('꘍', '꘏'), ('꙳', '꙳'), ('꙾', '꙾'), ('꛲', '꛷'), + ('꡴', '꡷'), ('꣎', '꣏'), ('꣸', '꣺'), ('꣼', '꣼'), + ('꤮', '꤯'), ('꥟', '꥟'), ('꧁', '꧍'), ('꧞', '꧟'), + ('꩜', '꩟'), ('꫞', '꫟'), ('꫰', '꫱'), ('꯫', '꯫'), + ('﴾', '﴿'), ('︐', '︙'), ('︰', '﹒'), ('﹔', '﹡'), + ('﹣', '﹣'), ('﹨', '﹨'), ('﹪', '﹫'), ('!', '#'), + ('%', '*'), (',', '/'), (':', ';'), ('?', '@'), + ('[', ']'), ('_', '_'), ('{', '{'), ('}', '}'), + ('⦅', '・'), ('𐄀', '𐄂'), ('𐎟', '𐎟'), ('𐏐', '𐏐'), + ('𐕯', '𐕯'), ('𐡗', '𐡗'), ('𐤟', '𐤟'), ('𐤿', '𐤿'), + ('𐩐', '𐩘'), ('𐩿', '𐩿'), ('𐫰', '𐫶'), ('𐬹', '𐬿'), + ('𐮙', '𐮜'), ('𑁇', '𑁍'), ('𑂻', '𑂼'), ('𑂾', '𑃁'), + ('𑅀', '𑅃'), ('𑅴', '𑅵'), ('𑇅', '𑇉'), ('𑇍', '𑇍'), + ('𑇛', '𑇛'), ('𑇝', '𑇟'), ('𑈸', '𑈽'), ('𑊩', '𑊩'), + ('𑑋', '𑑏'), ('𑑛', '𑑛'), ('𑑝', '𑑝'), ('𑓆', '𑓆'), + ('𑗁', '𑗗'), ('𑙁', '𑙃'), ('𑙠', '𑙬'), ('𑜼', '𑜾'), + ('𑨿', '𑩆'), ('𑪚', '𑪜'), ('𑪞', '𑪢'), ('𑱁', '𑱅'), + ('𑱰', '𑱱'), ('𒑰', '𒑴'), ('𖩮', '𖩯'), ('𖫵', '𖫵'), + ('𖬷', '𖬻'), ('𖭄', '𖭄'), ('𛲟', '𛲟'), ('𝪇', '𝪋'), + ('𞥞', '𞥟'), +]; + +pub const SEPARATOR: &'static [(char, char)] = &[ + (' ', ' '), ('\u{a0}', '\u{a0}'), ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{200a}'), ('\u{2028}', '\u{2029}'), + ('\u{202f}', '\u{202f}'), ('\u{205f}', '\u{205f}'), + ('\u{3000}', '\u{3000}'), +]; + +pub const SPACE_SEPARATOR: &'static [(char, char)] = &[ + (' ', ' '), ('\u{a0}', '\u{a0}'), ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{200a}'), ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'), +]; + 
+pub const SPACING_MARK: &'static [(char, char)] = &[ + ('ः', 'ः'), ('ऻ', 'ऻ'), ('ा', 'ी'), ('ॉ', 'ौ'), + ('ॎ', 'ॏ'), ('ং', 'ঃ'), ('া', 'ী'), ('ে', 'ৈ'), + ('ো', 'ৌ'), ('ৗ', 'ৗ'), ('ਃ', 'ਃ'), ('ਾ', 'ੀ'), + ('ઃ', 'ઃ'), ('ા', 'ી'), ('ૉ', 'ૉ'), ('ો', 'ૌ'), + ('ଂ', 'ଃ'), ('ା', 'ା'), ('ୀ', 'ୀ'), ('େ', 'ୈ'), + ('ୋ', 'ୌ'), ('ୗ', 'ୗ'), ('ா', 'ி'), ('ு', 'ூ'), + ('ெ', 'ை'), ('ொ', 'ௌ'), ('ௗ', 'ௗ'), ('ఁ', 'ః'), + ('ు', 'ౄ'), ('ಂ', 'ಃ'), ('ಾ', 'ಾ'), ('ೀ', 'ೄ'), + ('ೇ', 'ೈ'), ('ೊ', 'ೋ'), ('ೕ', 'ೖ'), ('ം', 'ഃ'), + ('ാ', 'ീ'), ('െ', 'ൈ'), ('ൊ', 'ൌ'), ('ൗ', 'ൗ'), + ('ං', 'ඃ'), ('ා', 'ෑ'), ('ෘ', 'ෟ'), ('ෲ', 'ෳ'), + ('༾', '༿'), ('ཿ', 'ཿ'), ('ါ', 'ာ'), ('ေ', 'ေ'), + ('း', 'း'), ('ျ', 'ြ'), ('ၖ', 'ၗ'), ('ၢ', 'ၤ'), + ('ၧ', 'ၭ'), ('ႃ', 'ႄ'), ('ႇ', 'ႌ'), ('ႏ', 'ႏ'), + ('ႚ', 'ႜ'), ('ា', 'ា'), ('ើ', 'ៅ'), ('ះ', 'ៈ'), + ('ᤣ', 'ᤦ'), ('ᤩ', 'ᤫ'), ('ᤰ', 'ᤱ'), ('ᤳ', 'ᤸ'), + ('ᨙ', 'ᨚ'), ('ᩕ', 'ᩕ'), ('ᩗ', 'ᩗ'), ('ᩡ', 'ᩡ'), + ('ᩣ', 'ᩤ'), ('ᩭ', 'ᩲ'), ('ᬄ', 'ᬄ'), ('ᬵ', 'ᬵ'), + ('ᬻ', 'ᬻ'), ('ᬽ', 'ᭁ'), ('ᭃ', '᭄'), ('ᮂ', 'ᮂ'), + ('ᮡ', 'ᮡ'), ('ᮦ', 'ᮧ'), ('᮪', '᮪'), ('ᯧ', 'ᯧ'), + ('ᯪ', 'ᯬ'), ('ᯮ', 'ᯮ'), ('᯲', '᯳'), ('ᰤ', 'ᰫ'), + ('ᰴ', 'ᰵ'), ('᳡', '᳡'), ('ᳲ', 'ᳳ'), ('᳷', '᳷'), + ('〮', '〯'), ('ꠣ', 'ꠤ'), ('ꠧ', 'ꠧ'), ('ꢀ', 'ꢁ'), + ('ꢴ', 'ꣃ'), ('ꥒ', '꥓'), ('ꦃ', 'ꦃ'), ('ꦴ', 'ꦵ'), + ('ꦺ', 'ꦻ'), ('ꦽ', '꧀'), ('ꨯ', 'ꨰ'), ('ꨳ', 'ꨴ'), + ('ꩍ', 'ꩍ'), ('ꩻ', 'ꩻ'), ('ꩽ', 'ꩽ'), ('ꫫ', 'ꫫ'), + ('ꫮ', 'ꫯ'), ('ꫵ', 'ꫵ'), ('ꯣ', 'ꯤ'), ('ꯦ', 'ꯧ'), + ('ꯩ', 'ꯪ'), ('꯬', '꯬'), ('𑀀', '𑀀'), ('𑀂', '𑀂'), + ('𑂂', '𑂂'), ('𑂰', '𑂲'), ('𑂷', '𑂸'), ('𑄬', '𑄬'), + ('𑆂', '𑆂'), ('𑆳', '𑆵'), ('𑆿', '𑇀'), ('𑈬', '𑈮'), + ('𑈲', '𑈳'), ('𑈵', '𑈵'), ('𑋠', '𑋢'), ('𑌂', '𑌃'), + ('𑌾', '𑌿'), ('𑍁', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍍'), + ('𑍗', '𑍗'), ('𑍢', '𑍣'), ('𑐵', '𑐷'), ('𑑀', '𑑁'), + ('𑑅', '𑑅'), ('𑒰', '𑒲'), ('𑒹', '𑒹'), ('𑒻', '𑒾'), + ('𑓁', '𑓁'), ('𑖯', '𑖱'), ('𑖸', '𑖻'), ('𑖾', '𑖾'), + ('𑘰', '𑘲'), ('𑘻', '𑘼'), ('𑘾', '𑘾'), ('𑚬', '𑚬'), + ('𑚮', '𑚯'), ('𑚶', '𑚶'), ('𑜠', '𑜡'), ('𑜦', '𑜦'), + ('𑨇', '𑨈'), ('𑨹', '𑨹'), ('𑩗', '𑩘'), ('𑪗', '𑪗'), + ('𑰯', '𑰯'), ('𑰾', '𑰾'), ('𑲩', '𑲩'), ('𑲱', 
'𑲱'), + ('𑲴', '𑲴'), ('𖽑', '𖽾'), ('𝅥', '𝅦'), ('𝅭', '𝅲'), +]; + +pub const SYMBOL: &'static [(char, char)] = &[ + ('$', '$'), ('+', '+'), ('<', '>'), ('^', '^'), ('`', '`'), ('|', '|'), + ('~', '~'), ('¢', '¦'), ('¨', '©'), ('¬', '¬'), ('®', '±'), + ('´', '´'), ('¸', '¸'), ('×', '×'), ('÷', '÷'), ('˂', '˅'), + ('˒', '˟'), ('˥', '˫'), ('˭', '˭'), ('˯', '˿'), ('͵', '͵'), + ('΄', '΅'), ('϶', '϶'), ('҂', '҂'), ('֍', '֏'), ('؆', '؈'), + ('؋', '؋'), ('؎', '؏'), ('۞', '۞'), ('۩', '۩'), ('۽', '۾'), + ('߶', '߶'), ('৲', '৳'), ('৺', '৻'), ('૱', '૱'), + ('୰', '୰'), ('௳', '௺'), ('౿', '౿'), ('൏', '൏'), + ('൹', '൹'), ('฿', '฿'), ('༁', '༃'), ('༓', '༓'), + ('༕', '༗'), ('༚', '༟'), ('༴', '༴'), ('༶', '༶'), + ('༸', '༸'), ('྾', '࿅'), ('࿇', '࿌'), ('࿎', '࿏'), + ('࿕', '࿘'), ('႞', '႟'), ('᎐', '᎙'), ('៛', '៛'), + ('᥀', '᥀'), ('᧞', '᧿'), ('᭡', '᭪'), ('᭴', '᭼'), + ('᾽', '᾽'), ('᾿', '῁'), ('῍', '῏'), ('῝', '῟'), + ('῭', '`'), ('´', '῾'), ('⁄', '⁄'), ('⁒', '⁒'), + ('⁺', '⁼'), ('₊', '₌'), ('₠', '₿'), ('℀', '℁'), + ('℃', '℆'), ('℈', '℉'), ('℔', '℔'), ('№', '℘'), + ('℞', '℣'), ('℥', '℥'), ('℧', '℧'), ('℩', '℩'), + ('℮', '℮'), ('℺', '℻'), ('⅀', '⅄'), ('⅊', '⅍'), + ('⅏', '⅏'), ('↊', '↋'), ('←', '⌇'), ('⌌', '⌨'), + ('⌫', '␦'), ('⑀', '⑊'), ('⒜', 'ⓩ'), ('─', '❧'), + ('➔', '⟄'), ('⟇', '⟥'), ('⟰', '⦂'), ('⦙', '⧗'), + ('⧜', '⧻'), ('⧾', '⭳'), ('⭶', '⮕'), ('⮘', '⮹'), + ('⮽', '⯈'), ('⯊', '⯒'), ('⯬', '⯯'), ('⳥', '⳪'), + ('⺀', '⺙'), ('⺛', '⻳'), ('⼀', '⿕'), ('⿰', '⿻'), + ('〄', '〄'), ('〒', '〓'), ('〠', '〠'), ('〶', '〷'), + ('〾', '〿'), ('゛', '゜'), ('㆐', '㆑'), ('㆖', '㆟'), + ('㇀', '㇣'), ('㈀', '㈞'), ('㈪', '㉇'), ('㉐', '㉐'), + ('㉠', '㉿'), ('㊊', '㊰'), ('㋀', '㋾'), ('㌀', '㏿'), + ('䷀', '䷿'), ('꒐', '꓆'), ('꜀', '꜖'), ('꜠', '꜡'), + ('꞉', '꞊'), ('꠨', '꠫'), ('꠶', '꠹'), ('꩷', '꩹'), + ('꭛', '꭛'), ('﬩', '﬩'), ('﮲', '﯁'), ('﷼', '﷽'), + ('﹢', '﹢'), ('﹤', '﹦'), ('﹩', '﹩'), ('$', '$'), + ('+', '+'), ('<', '>'), ('^', '^'), ('`', '`'), + ('|', '|'), ('~', '~'), ('¢', '₩'), ('│', '○'), + ('', '�'), ('𐄷', '𐄿'), ('𐅹', '𐆉'), ('𐆌', '𐆎'), + 
('𐆐', '𐆛'), ('𐆠', '𐆠'), ('𐇐', '𐇼'), ('𐡷', '𐡸'), + ('𐫈', '𐫈'), ('𑜿', '𑜿'), ('𖬼', '𖬿'), ('𖭅', '𖭅'), + ('𛲜', '𛲜'), ('𝀀', '𝃵'), ('𝄀', '𝄦'), ('𝄩', '𝅘𝅥𝅲'), + ('𝅪', '𝅬'), ('𝆃', '𝆄'), ('𝆌', '𝆩'), ('𝆮', '𝇨'), + ('𝈀', '𝉁'), ('𝉅', '𝉅'), ('𝌀', '𝍖'), ('𝛁', '𝛁'), + ('𝛛', '𝛛'), ('𝛻', '𝛻'), ('𝜕', '𝜕'), ('𝜵', '𝜵'), + ('𝝏', '𝝏'), ('𝝯', '𝝯'), ('𝞉', '𝞉'), ('𝞩', '𝞩'), + ('𝟃', '𝟃'), ('𝠀', '𝧿'), ('𝨷', '𝨺'), ('𝩭', '𝩴'), + ('𝩶', '𝪃'), ('𝪅', '𝪆'), ('𞻰', '𞻱'), ('🀀', '🀫'), + ('🀰', '🂓'), ('🂠', '🂮'), ('🂱', '🂿'), ('🃁', '🃏'), + ('🃑', '🃵'), ('🄐', '🄮'), ('🄰', '🅫'), ('🅰', '🆬'), + ('🇦', '🈂'), ('🈐', '🈻'), ('🉀', '🉈'), ('🉐', '🉑'), + ('🉠', '🉥'), ('🌀', '🛔'), ('🛠', '🛬'), ('🛰', '🛸'), + ('🜀', '🝳'), ('🞀', '🟔'), ('🠀', '🠋'), ('🠐', '🡇'), + ('🡐', '🡙'), ('🡠', '🢇'), ('🢐', '🢭'), ('🤀', '🤋'), + ('🤐', '🤾'), ('🥀', '🥌'), ('🥐', '🥫'), ('🦀', '🦗'), + ('🧀', '🧀'), ('🧐', '🧦'), +]; + +pub const TITLECASE_LETTER: &'static [(char, char)] = &[ + ('Dž', 'Dž'), ('Lj', 'Lj'), ('Nj', 'Nj'), ('Dz', 'Dz'), ('ᾈ', 'ᾏ'), + ('ᾘ', 'ᾟ'), ('ᾨ', 'ᾯ'), ('ᾼ', 'ᾼ'), ('ῌ', 'ῌ'), + ('ῼ', 'ῼ'), +]; + +pub const UNASSIGNED: &'static [(char, char)] = &[ + ('\u{378}', '\u{379}'), ('\u{380}', '\u{383}'), ('\u{38b}', '\u{38b}'), + ('\u{38d}', '\u{38d}'), ('\u{3a2}', '\u{3a2}'), ('\u{530}', '\u{530}'), + ('\u{557}', '\u{558}'), ('\u{560}', '\u{560}'), ('\u{588}', '\u{588}'), + ('\u{58b}', '\u{58c}'), ('\u{590}', '\u{590}'), ('\u{5c8}', '\u{5cf}'), + ('\u{5eb}', '\u{5ef}'), ('\u{5f5}', '\u{5ff}'), ('\u{61d}', '\u{61d}'), + ('\u{70e}', '\u{70e}'), ('\u{74b}', '\u{74c}'), ('\u{7b2}', '\u{7bf}'), + ('\u{7fb}', '\u{7ff}'), ('\u{82e}', '\u{82f}'), ('\u{83f}', '\u{83f}'), + ('\u{85c}', '\u{85d}'), ('\u{85f}', '\u{85f}'), ('\u{86b}', '\u{89f}'), + ('\u{8b5}', '\u{8b5}'), ('\u{8be}', '\u{8d3}'), ('\u{984}', '\u{984}'), + ('\u{98d}', '\u{98e}'), ('\u{991}', '\u{992}'), ('\u{9a9}', '\u{9a9}'), + ('\u{9b1}', '\u{9b1}'), ('\u{9b3}', '\u{9b5}'), ('\u{9ba}', '\u{9bb}'), + ('\u{9c5}', '\u{9c6}'), ('\u{9c9}', '\u{9ca}'), ('\u{9cf}', '\u{9d6}'), + ('\u{9d8}', '\u{9db}'), 
('\u{9de}', '\u{9de}'), ('\u{9e4}', '\u{9e5}'), + ('\u{9fe}', '\u{a00}'), ('\u{a04}', '\u{a04}'), ('\u{a0b}', '\u{a0e}'), + ('\u{a11}', '\u{a12}'), ('\u{a29}', '\u{a29}'), ('\u{a31}', '\u{a31}'), + ('\u{a34}', '\u{a34}'), ('\u{a37}', '\u{a37}'), ('\u{a3a}', '\u{a3b}'), + ('\u{a3d}', '\u{a3d}'), ('\u{a43}', '\u{a46}'), ('\u{a49}', '\u{a4a}'), + ('\u{a4e}', '\u{a50}'), ('\u{a52}', '\u{a58}'), ('\u{a5d}', '\u{a5d}'), + ('\u{a5f}', '\u{a65}'), ('\u{a76}', '\u{a80}'), ('\u{a84}', '\u{a84}'), + ('\u{a8e}', '\u{a8e}'), ('\u{a92}', '\u{a92}'), ('\u{aa9}', '\u{aa9}'), + ('\u{ab1}', '\u{ab1}'), ('\u{ab4}', '\u{ab4}'), ('\u{aba}', '\u{abb}'), + ('\u{ac6}', '\u{ac6}'), ('\u{aca}', '\u{aca}'), ('\u{ace}', '\u{acf}'), + ('\u{ad1}', '\u{adf}'), ('\u{ae4}', '\u{ae5}'), ('\u{af2}', '\u{af8}'), + ('\u{b00}', '\u{b00}'), ('\u{b04}', '\u{b04}'), ('\u{b0d}', '\u{b0e}'), + ('\u{b11}', '\u{b12}'), ('\u{b29}', '\u{b29}'), ('\u{b31}', '\u{b31}'), + ('\u{b34}', '\u{b34}'), ('\u{b3a}', '\u{b3b}'), ('\u{b45}', '\u{b46}'), + ('\u{b49}', '\u{b4a}'), ('\u{b4e}', '\u{b55}'), ('\u{b58}', '\u{b5b}'), + ('\u{b5e}', '\u{b5e}'), ('\u{b64}', '\u{b65}'), ('\u{b78}', '\u{b81}'), + ('\u{b84}', '\u{b84}'), ('\u{b8b}', '\u{b8d}'), ('\u{b91}', '\u{b91}'), + ('\u{b96}', '\u{b98}'), ('\u{b9b}', '\u{b9b}'), ('\u{b9d}', '\u{b9d}'), + ('\u{ba0}', '\u{ba2}'), ('\u{ba5}', '\u{ba7}'), ('\u{bab}', '\u{bad}'), + ('\u{bba}', '\u{bbd}'), ('\u{bc3}', '\u{bc5}'), ('\u{bc9}', '\u{bc9}'), + ('\u{bce}', '\u{bcf}'), ('\u{bd1}', '\u{bd6}'), ('\u{bd8}', '\u{be5}'), + ('\u{bfb}', '\u{bff}'), ('\u{c04}', '\u{c04}'), ('\u{c0d}', '\u{c0d}'), + ('\u{c11}', '\u{c11}'), ('\u{c29}', '\u{c29}'), ('\u{c3a}', '\u{c3c}'), + ('\u{c45}', '\u{c45}'), ('\u{c49}', '\u{c49}'), ('\u{c4e}', '\u{c54}'), + ('\u{c57}', '\u{c57}'), ('\u{c5b}', '\u{c5f}'), ('\u{c64}', '\u{c65}'), + ('\u{c70}', '\u{c77}'), ('\u{c84}', '\u{c84}'), ('\u{c8d}', '\u{c8d}'), + ('\u{c91}', '\u{c91}'), ('\u{ca9}', '\u{ca9}'), ('\u{cb4}', '\u{cb4}'), + ('\u{cba}', '\u{cbb}'), 
('\u{cc5}', '\u{cc5}'), ('\u{cc9}', '\u{cc9}'), + ('\u{cce}', '\u{cd4}'), ('\u{cd7}', '\u{cdd}'), ('\u{cdf}', '\u{cdf}'), + ('\u{ce4}', '\u{ce5}'), ('\u{cf0}', '\u{cf0}'), ('\u{cf3}', '\u{cff}'), + ('\u{d04}', '\u{d04}'), ('\u{d0d}', '\u{d0d}'), ('\u{d11}', '\u{d11}'), + ('\u{d45}', '\u{d45}'), ('\u{d49}', '\u{d49}'), ('\u{d50}', '\u{d53}'), + ('\u{d64}', '\u{d65}'), ('\u{d80}', '\u{d81}'), ('\u{d84}', '\u{d84}'), + ('\u{d97}', '\u{d99}'), ('\u{db2}', '\u{db2}'), ('\u{dbc}', '\u{dbc}'), + ('\u{dbe}', '\u{dbf}'), ('\u{dc7}', '\u{dc9}'), ('\u{dcb}', '\u{dce}'), + ('\u{dd5}', '\u{dd5}'), ('\u{dd7}', '\u{dd7}'), ('\u{de0}', '\u{de5}'), + ('\u{df0}', '\u{df1}'), ('\u{df5}', '\u{e00}'), ('\u{e3b}', '\u{e3e}'), + ('\u{e5c}', '\u{e80}'), ('\u{e83}', '\u{e83}'), ('\u{e85}', '\u{e86}'), + ('\u{e89}', '\u{e89}'), ('\u{e8b}', '\u{e8c}'), ('\u{e8e}', '\u{e93}'), + ('\u{e98}', '\u{e98}'), ('\u{ea0}', '\u{ea0}'), ('\u{ea4}', '\u{ea4}'), + ('\u{ea6}', '\u{ea6}'), ('\u{ea8}', '\u{ea9}'), ('\u{eac}', '\u{eac}'), + ('\u{eba}', '\u{eba}'), ('\u{ebe}', '\u{ebf}'), ('\u{ec5}', '\u{ec5}'), + ('\u{ec7}', '\u{ec7}'), ('\u{ece}', '\u{ecf}'), ('\u{eda}', '\u{edb}'), + ('\u{ee0}', '\u{eff}'), ('\u{f48}', '\u{f48}'), ('\u{f6d}', '\u{f70}'), + ('\u{f98}', '\u{f98}'), ('\u{fbd}', '\u{fbd}'), ('\u{fcd}', '\u{fcd}'), + ('\u{fdb}', '\u{fff}'), ('\u{10c6}', '\u{10c6}'), ('\u{10c8}', '\u{10cc}'), + ('\u{10ce}', '\u{10cf}'), ('\u{1249}', '\u{1249}'), + ('\u{124e}', '\u{124f}'), ('\u{1257}', '\u{1257}'), + ('\u{1259}', '\u{1259}'), ('\u{125e}', '\u{125f}'), + ('\u{1289}', '\u{1289}'), ('\u{128e}', '\u{128f}'), + ('\u{12b1}', '\u{12b1}'), ('\u{12b6}', '\u{12b7}'), + ('\u{12bf}', '\u{12bf}'), ('\u{12c1}', '\u{12c1}'), + ('\u{12c6}', '\u{12c7}'), ('\u{12d7}', '\u{12d7}'), + ('\u{1311}', '\u{1311}'), ('\u{1316}', '\u{1317}'), + ('\u{135b}', '\u{135c}'), ('\u{137d}', '\u{137f}'), + ('\u{139a}', '\u{139f}'), ('\u{13f6}', '\u{13f7}'), + ('\u{13fe}', '\u{13ff}'), ('\u{169d}', '\u{169f}'), + ('\u{16f9}', 
'\u{16ff}'), ('\u{170d}', '\u{170d}'), + ('\u{1715}', '\u{171f}'), ('\u{1737}', '\u{173f}'), + ('\u{1754}', '\u{175f}'), ('\u{176d}', '\u{176d}'), + ('\u{1771}', '\u{1771}'), ('\u{1774}', '\u{177f}'), + ('\u{17de}', '\u{17df}'), ('\u{17ea}', '\u{17ef}'), + ('\u{17fa}', '\u{17ff}'), ('\u{180f}', '\u{180f}'), + ('\u{181a}', '\u{181f}'), ('\u{1878}', '\u{187f}'), + ('\u{18ab}', '\u{18af}'), ('\u{18f6}', '\u{18ff}'), + ('\u{191f}', '\u{191f}'), ('\u{192c}', '\u{192f}'), + ('\u{193c}', '\u{193f}'), ('\u{1941}', '\u{1943}'), + ('\u{196e}', '\u{196f}'), ('\u{1975}', '\u{197f}'), + ('\u{19ac}', '\u{19af}'), ('\u{19ca}', '\u{19cf}'), + ('\u{19db}', '\u{19dd}'), ('\u{1a1c}', '\u{1a1d}'), + ('\u{1a5f}', '\u{1a5f}'), ('\u{1a7d}', '\u{1a7e}'), + ('\u{1a8a}', '\u{1a8f}'), ('\u{1a9a}', '\u{1a9f}'), + ('\u{1aae}', '\u{1aaf}'), ('\u{1abf}', '\u{1aff}'), + ('\u{1b4c}', '\u{1b4f}'), ('\u{1b7d}', '\u{1b7f}'), + ('\u{1bf4}', '\u{1bfb}'), ('\u{1c38}', '\u{1c3a}'), + ('\u{1c4a}', '\u{1c4c}'), ('\u{1c89}', '\u{1cbf}'), + ('\u{1cc8}', '\u{1ccf}'), ('\u{1cfa}', '\u{1cff}'), + ('\u{1dfa}', '\u{1dfa}'), ('\u{1f16}', '\u{1f17}'), + ('\u{1f1e}', '\u{1f1f}'), ('\u{1f46}', '\u{1f47}'), + ('\u{1f4e}', '\u{1f4f}'), ('\u{1f58}', '\u{1f58}'), + ('\u{1f5a}', '\u{1f5a}'), ('\u{1f5c}', '\u{1f5c}'), + ('\u{1f5e}', '\u{1f5e}'), ('\u{1f7e}', '\u{1f7f}'), + ('\u{1fb5}', '\u{1fb5}'), ('\u{1fc5}', '\u{1fc5}'), + ('\u{1fd4}', '\u{1fd5}'), ('\u{1fdc}', '\u{1fdc}'), + ('\u{1ff0}', '\u{1ff1}'), ('\u{1ff5}', '\u{1ff5}'), + ('\u{1fff}', '\u{1fff}'), ('\u{2065}', '\u{2065}'), + ('\u{2072}', '\u{2073}'), ('\u{208f}', '\u{208f}'), + ('\u{209d}', '\u{209f}'), ('\u{20c0}', '\u{20cf}'), + ('\u{20f1}', '\u{20ff}'), ('\u{218c}', '\u{218f}'), + ('\u{2427}', '\u{243f}'), ('\u{244b}', '\u{245f}'), + ('\u{2b74}', '\u{2b75}'), ('\u{2b96}', '\u{2b97}'), + ('\u{2bba}', '\u{2bbc}'), ('\u{2bc9}', '\u{2bc9}'), + ('\u{2bd3}', '\u{2beb}'), ('\u{2bf0}', '\u{2bff}'), + ('\u{2c2f}', '\u{2c2f}'), ('\u{2c5f}', '\u{2c5f}'), + ('\u{2cf4}', 
'\u{2cf8}'), ('\u{2d26}', '\u{2d26}'), + ('\u{2d28}', '\u{2d2c}'), ('\u{2d2e}', '\u{2d2f}'), + ('\u{2d68}', '\u{2d6e}'), ('\u{2d71}', '\u{2d7e}'), + ('\u{2d97}', '\u{2d9f}'), ('\u{2da7}', '\u{2da7}'), + ('\u{2daf}', '\u{2daf}'), ('\u{2db7}', '\u{2db7}'), + ('\u{2dbf}', '\u{2dbf}'), ('\u{2dc7}', '\u{2dc7}'), + ('\u{2dcf}', '\u{2dcf}'), ('\u{2dd7}', '\u{2dd7}'), + ('\u{2ddf}', '\u{2ddf}'), ('\u{2e4a}', '\u{2e7f}'), + ('\u{2e9a}', '\u{2e9a}'), ('\u{2ef4}', '\u{2eff}'), + ('\u{2fd6}', '\u{2fef}'), ('\u{2ffc}', '\u{2fff}'), + ('\u{3040}', '\u{3040}'), ('\u{3097}', '\u{3098}'), + ('\u{3100}', '\u{3104}'), ('\u{312f}', '\u{3130}'), + ('\u{318f}', '\u{318f}'), ('\u{31bb}', '\u{31bf}'), + ('\u{31e4}', '\u{31ef}'), ('\u{321f}', '\u{321f}'), + ('\u{32ff}', '\u{32ff}'), ('\u{4db6}', '\u{4dbf}'), + ('\u{9feb}', '\u{9fff}'), ('\u{a48d}', '\u{a48f}'), + ('\u{a4c7}', '\u{a4cf}'), ('\u{a62c}', '\u{a63f}'), + ('\u{a6f8}', '\u{a6ff}'), ('\u{a7af}', '\u{a7af}'), + ('\u{a7b8}', '\u{a7f6}'), ('\u{a82c}', '\u{a82f}'), + ('\u{a83a}', '\u{a83f}'), ('\u{a878}', '\u{a87f}'), + ('\u{a8c6}', '\u{a8cd}'), ('\u{a8da}', '\u{a8df}'), + ('\u{a8fe}', '\u{a8ff}'), ('\u{a954}', '\u{a95e}'), + ('\u{a97d}', '\u{a97f}'), ('\u{a9ce}', '\u{a9ce}'), + ('\u{a9da}', '\u{a9dd}'), ('\u{a9ff}', '\u{a9ff}'), + ('\u{aa37}', '\u{aa3f}'), ('\u{aa4e}', '\u{aa4f}'), + ('\u{aa5a}', '\u{aa5b}'), ('\u{aac3}', '\u{aada}'), + ('\u{aaf7}', '\u{ab00}'), ('\u{ab07}', '\u{ab08}'), + ('\u{ab0f}', '\u{ab10}'), ('\u{ab17}', '\u{ab1f}'), + ('\u{ab27}', '\u{ab27}'), ('\u{ab2f}', '\u{ab2f}'), + ('\u{ab66}', '\u{ab6f}'), ('\u{abee}', '\u{abef}'), + ('\u{abfa}', '\u{abff}'), ('\u{d7a4}', '\u{d7af}'), + ('\u{d7c7}', '\u{d7ca}'), ('\u{d7fc}', '\u{d7ff}'), + ('\u{fa6e}', '\u{fa6f}'), ('\u{fada}', '\u{faff}'), + ('\u{fb07}', '\u{fb12}'), ('\u{fb18}', '\u{fb1c}'), + ('\u{fb37}', '\u{fb37}'), ('\u{fb3d}', '\u{fb3d}'), + ('\u{fb3f}', '\u{fb3f}'), ('\u{fb42}', '\u{fb42}'), + ('\u{fb45}', '\u{fb45}'), ('\u{fbc2}', '\u{fbd2}'), + ('\u{fd40}', 
'\u{fd4f}'), ('\u{fd90}', '\u{fd91}'), + ('\u{fdc8}', '\u{fdef}'), ('\u{fdfe}', '\u{fdff}'), + ('\u{fe1a}', '\u{fe1f}'), ('\u{fe53}', '\u{fe53}'), + ('\u{fe67}', '\u{fe67}'), ('\u{fe6c}', '\u{fe6f}'), + ('\u{fe75}', '\u{fe75}'), ('\u{fefd}', '\u{fefe}'), + ('\u{ff00}', '\u{ff00}'), ('\u{ffbf}', '\u{ffc1}'), + ('\u{ffc8}', '\u{ffc9}'), ('\u{ffd0}', '\u{ffd1}'), + ('\u{ffd8}', '\u{ffd9}'), ('\u{ffdd}', '\u{ffdf}'), + ('\u{ffe7}', '\u{ffe7}'), ('\u{ffef}', '\u{fff8}'), + ('\u{fffe}', '\u{ffff}'), ('\u{1000c}', '\u{1000c}'), + ('\u{10027}', '\u{10027}'), ('\u{1003b}', '\u{1003b}'), + ('\u{1003e}', '\u{1003e}'), ('\u{1004e}', '\u{1004f}'), + ('\u{1005e}', '\u{1007f}'), ('\u{100fb}', '\u{100ff}'), + ('\u{10103}', '\u{10106}'), ('\u{10134}', '\u{10136}'), + ('\u{1018f}', '\u{1018f}'), ('\u{1019c}', '\u{1019f}'), + ('\u{101a1}', '\u{101cf}'), ('\u{101fe}', '\u{1027f}'), + ('\u{1029d}', '\u{1029f}'), ('\u{102d1}', '\u{102df}'), + ('\u{102fc}', '\u{102ff}'), ('\u{10324}', '\u{1032c}'), + ('\u{1034b}', '\u{1034f}'), ('\u{1037b}', '\u{1037f}'), + ('\u{1039e}', '\u{1039e}'), ('\u{103c4}', '\u{103c7}'), + ('\u{103d6}', '\u{103ff}'), ('\u{1049e}', '\u{1049f}'), + ('\u{104aa}', '\u{104af}'), ('\u{104d4}', '\u{104d7}'), + ('\u{104fc}', '\u{104ff}'), ('\u{10528}', '\u{1052f}'), + ('\u{10564}', '\u{1056e}'), ('\u{10570}', '\u{105ff}'), + ('\u{10737}', '\u{1073f}'), ('\u{10756}', '\u{1075f}'), + ('\u{10768}', '\u{107ff}'), ('\u{10806}', '\u{10807}'), + ('\u{10809}', '\u{10809}'), ('\u{10836}', '\u{10836}'), + ('\u{10839}', '\u{1083b}'), ('\u{1083d}', '\u{1083e}'), + ('\u{10856}', '\u{10856}'), ('\u{1089f}', '\u{108a6}'), + ('\u{108b0}', '\u{108df}'), ('\u{108f3}', '\u{108f3}'), + ('\u{108f6}', '\u{108fa}'), ('\u{1091c}', '\u{1091e}'), + ('\u{1093a}', '\u{1093e}'), ('\u{10940}', '\u{1097f}'), + ('\u{109b8}', '\u{109bb}'), ('\u{109d0}', '\u{109d1}'), + ('\u{10a04}', '\u{10a04}'), ('\u{10a07}', '\u{10a0b}'), + ('\u{10a14}', '\u{10a14}'), ('\u{10a18}', '\u{10a18}'), + ('\u{10a34}', 
'\u{10a37}'), ('\u{10a3b}', '\u{10a3e}'), + ('\u{10a48}', '\u{10a4f}'), ('\u{10a59}', '\u{10a5f}'), + ('\u{10aa0}', '\u{10abf}'), ('\u{10ae7}', '\u{10aea}'), + ('\u{10af7}', '\u{10aff}'), ('\u{10b36}', '\u{10b38}'), + ('\u{10b56}', '\u{10b57}'), ('\u{10b73}', '\u{10b77}'), + ('\u{10b92}', '\u{10b98}'), ('\u{10b9d}', '\u{10ba8}'), + ('\u{10bb0}', '\u{10bff}'), ('\u{10c49}', '\u{10c7f}'), + ('\u{10cb3}', '\u{10cbf}'), ('\u{10cf3}', '\u{10cf9}'), + ('\u{10d00}', '\u{10e5f}'), ('\u{10e7f}', '\u{10fff}'), + ('\u{1104e}', '\u{11051}'), ('\u{11070}', '\u{1107e}'), + ('\u{110c2}', '\u{110cf}'), ('\u{110e9}', '\u{110ef}'), + ('\u{110fa}', '\u{110ff}'), ('\u{11135}', '\u{11135}'), + ('\u{11144}', '\u{1114f}'), ('\u{11177}', '\u{1117f}'), + ('\u{111ce}', '\u{111cf}'), ('\u{111e0}', '\u{111e0}'), + ('\u{111f5}', '\u{111ff}'), ('\u{11212}', '\u{11212}'), + ('\u{1123f}', '\u{1127f}'), ('\u{11287}', '\u{11287}'), + ('\u{11289}', '\u{11289}'), ('\u{1128e}', '\u{1128e}'), + ('\u{1129e}', '\u{1129e}'), ('\u{112aa}', '\u{112af}'), + ('\u{112eb}', '\u{112ef}'), ('\u{112fa}', '\u{112ff}'), + ('\u{11304}', '\u{11304}'), ('\u{1130d}', '\u{1130e}'), + ('\u{11311}', '\u{11312}'), ('\u{11329}', '\u{11329}'), + ('\u{11331}', '\u{11331}'), ('\u{11334}', '\u{11334}'), + ('\u{1133a}', '\u{1133b}'), ('\u{11345}', '\u{11346}'), + ('\u{11349}', '\u{1134a}'), ('\u{1134e}', '\u{1134f}'), + ('\u{11351}', '\u{11356}'), ('\u{11358}', '\u{1135c}'), + ('\u{11364}', '\u{11365}'), ('\u{1136d}', '\u{1136f}'), + ('\u{11375}', '\u{113ff}'), ('\u{1145a}', '\u{1145a}'), + ('\u{1145c}', '\u{1145c}'), ('\u{1145e}', '\u{1147f}'), + ('\u{114c8}', '\u{114cf}'), ('\u{114da}', '\u{1157f}'), + ('\u{115b6}', '\u{115b7}'), ('\u{115de}', '\u{115ff}'), + ('\u{11645}', '\u{1164f}'), ('\u{1165a}', '\u{1165f}'), + ('\u{1166d}', '\u{1167f}'), ('\u{116b8}', '\u{116bf}'), + ('\u{116ca}', '\u{116ff}'), ('\u{1171a}', '\u{1171c}'), + ('\u{1172c}', '\u{1172f}'), ('\u{11740}', '\u{1189f}'), + ('\u{118f3}', '\u{118fe}'), ('\u{11900}', 
'\u{119ff}'), + ('\u{11a48}', '\u{11a4f}'), ('\u{11a84}', '\u{11a85}'), + ('\u{11a9d}', '\u{11a9d}'), ('\u{11aa3}', '\u{11abf}'), + ('\u{11af9}', '\u{11bff}'), ('\u{11c09}', '\u{11c09}'), + ('\u{11c37}', '\u{11c37}'), ('\u{11c46}', '\u{11c4f}'), + ('\u{11c6d}', '\u{11c6f}'), ('\u{11c90}', '\u{11c91}'), + ('\u{11ca8}', '\u{11ca8}'), ('\u{11cb7}', '\u{11cff}'), + ('\u{11d07}', '\u{11d07}'), ('\u{11d0a}', '\u{11d0a}'), + ('\u{11d37}', '\u{11d39}'), ('\u{11d3b}', '\u{11d3b}'), + ('\u{11d3e}', '\u{11d3e}'), ('\u{11d48}', '\u{11d4f}'), + ('\u{11d5a}', '\u{11fff}'), ('\u{1239a}', '\u{123ff}'), + ('\u{1246f}', '\u{1246f}'), ('\u{12475}', '\u{1247f}'), + ('\u{12544}', '\u{12fff}'), ('\u{1342f}', '\u{143ff}'), + ('\u{14647}', '\u{167ff}'), ('\u{16a39}', '\u{16a3f}'), + ('\u{16a5f}', '\u{16a5f}'), ('\u{16a6a}', '\u{16a6d}'), + ('\u{16a70}', '\u{16acf}'), ('\u{16aee}', '\u{16aef}'), + ('\u{16af6}', '\u{16aff}'), ('\u{16b46}', '\u{16b4f}'), + ('\u{16b5a}', '\u{16b5a}'), ('\u{16b62}', '\u{16b62}'), + ('\u{16b78}', '\u{16b7c}'), ('\u{16b90}', '\u{16eff}'), + ('\u{16f45}', '\u{16f4f}'), ('\u{16f7f}', '\u{16f8e}'), + ('\u{16fa0}', '\u{16fdf}'), ('\u{16fe2}', '\u{16fff}'), + ('\u{187ed}', '\u{187ff}'), ('\u{18af3}', '\u{1afff}'), + ('\u{1b11f}', '\u{1b16f}'), ('\u{1b2fc}', '\u{1bbff}'), + ('\u{1bc6b}', '\u{1bc6f}'), ('\u{1bc7d}', '\u{1bc7f}'), + ('\u{1bc89}', '\u{1bc8f}'), ('\u{1bc9a}', '\u{1bc9b}'), + ('\u{1bca4}', '\u{1cfff}'), ('\u{1d0f6}', '\u{1d0ff}'), + ('\u{1d127}', '\u{1d128}'), ('\u{1d1e9}', '\u{1d1ff}'), + ('\u{1d246}', '\u{1d2ff}'), ('\u{1d357}', '\u{1d35f}'), + ('\u{1d372}', '\u{1d3ff}'), ('\u{1d455}', '\u{1d455}'), + ('\u{1d49d}', '\u{1d49d}'), ('\u{1d4a0}', '\u{1d4a1}'), + ('\u{1d4a3}', '\u{1d4a4}'), ('\u{1d4a7}', '\u{1d4a8}'), + ('\u{1d4ad}', '\u{1d4ad}'), ('\u{1d4ba}', '\u{1d4ba}'), + ('\u{1d4bc}', '\u{1d4bc}'), ('\u{1d4c4}', '\u{1d4c4}'), + ('\u{1d506}', '\u{1d506}'), ('\u{1d50b}', '\u{1d50c}'), + ('\u{1d515}', '\u{1d515}'), ('\u{1d51d}', '\u{1d51d}'), + 
('\u{1d53a}', '\u{1d53a}'), ('\u{1d53f}', '\u{1d53f}'), + ('\u{1d545}', '\u{1d545}'), ('\u{1d547}', '\u{1d549}'), + ('\u{1d551}', '\u{1d551}'), ('\u{1d6a6}', '\u{1d6a7}'), + ('\u{1d7cc}', '\u{1d7cd}'), ('\u{1da8c}', '\u{1da9a}'), + ('\u{1daa0}', '\u{1daa0}'), ('\u{1dab0}', '\u{1dfff}'), + ('\u{1e007}', '\u{1e007}'), ('\u{1e019}', '\u{1e01a}'), + ('\u{1e022}', '\u{1e022}'), ('\u{1e025}', '\u{1e025}'), + ('\u{1e02b}', '\u{1e7ff}'), ('\u{1e8c5}', '\u{1e8c6}'), + ('\u{1e8d7}', '\u{1e8ff}'), ('\u{1e94b}', '\u{1e94f}'), + ('\u{1e95a}', '\u{1e95d}'), ('\u{1e960}', '\u{1edff}'), + ('\u{1ee04}', '\u{1ee04}'), ('\u{1ee20}', '\u{1ee20}'), + ('\u{1ee23}', '\u{1ee23}'), ('\u{1ee25}', '\u{1ee26}'), + ('\u{1ee28}', '\u{1ee28}'), ('\u{1ee33}', '\u{1ee33}'), + ('\u{1ee38}', '\u{1ee38}'), ('\u{1ee3a}', '\u{1ee3a}'), + ('\u{1ee3c}', '\u{1ee41}'), ('\u{1ee43}', '\u{1ee46}'), + ('\u{1ee48}', '\u{1ee48}'), ('\u{1ee4a}', '\u{1ee4a}'), + ('\u{1ee4c}', '\u{1ee4c}'), ('\u{1ee50}', '\u{1ee50}'), + ('\u{1ee53}', '\u{1ee53}'), ('\u{1ee55}', '\u{1ee56}'), + ('\u{1ee58}', '\u{1ee58}'), ('\u{1ee5a}', '\u{1ee5a}'), + ('\u{1ee5c}', '\u{1ee5c}'), ('\u{1ee5e}', '\u{1ee5e}'), + ('\u{1ee60}', '\u{1ee60}'), ('\u{1ee63}', '\u{1ee63}'), + ('\u{1ee65}', '\u{1ee66}'), ('\u{1ee6b}', '\u{1ee6b}'), + ('\u{1ee73}', '\u{1ee73}'), ('\u{1ee78}', '\u{1ee78}'), + ('\u{1ee7d}', '\u{1ee7d}'), ('\u{1ee7f}', '\u{1ee7f}'), + ('\u{1ee8a}', '\u{1ee8a}'), ('\u{1ee9c}', '\u{1eea0}'), + ('\u{1eea4}', '\u{1eea4}'), ('\u{1eeaa}', '\u{1eeaa}'), + ('\u{1eebc}', '\u{1eeef}'), ('\u{1eef2}', '\u{1efff}'), + ('\u{1f02c}', '\u{1f02f}'), ('\u{1f094}', '\u{1f09f}'), + ('\u{1f0af}', '\u{1f0b0}'), ('\u{1f0c0}', '\u{1f0c0}'), + ('\u{1f0d0}', '\u{1f0d0}'), ('\u{1f0f6}', '\u{1f0ff}'), + ('\u{1f10d}', '\u{1f10f}'), ('\u{1f12f}', '\u{1f12f}'), + ('\u{1f16c}', '\u{1f16f}'), ('\u{1f1ad}', '\u{1f1e5}'), + ('\u{1f203}', '\u{1f20f}'), ('\u{1f23c}', '\u{1f23f}'), + ('\u{1f249}', '\u{1f24f}'), ('\u{1f252}', '\u{1f25f}'), + ('\u{1f266}', '\u{1f2ff}'), 
('\u{1f6d5}', '\u{1f6df}'), + ('\u{1f6ed}', '\u{1f6ef}'), ('\u{1f6f9}', '\u{1f6ff}'), + ('\u{1f774}', '\u{1f77f}'), ('\u{1f7d5}', '\u{1f7ff}'), + ('\u{1f80c}', '\u{1f80f}'), ('\u{1f848}', '\u{1f84f}'), + ('\u{1f85a}', '\u{1f85f}'), ('\u{1f888}', '\u{1f88f}'), + ('\u{1f8ae}', '\u{1f8ff}'), ('\u{1f90c}', '\u{1f90f}'), + ('\u{1f93f}', '\u{1f93f}'), ('\u{1f94d}', '\u{1f94f}'), + ('\u{1f96c}', '\u{1f97f}'), ('\u{1f998}', '\u{1f9bf}'), + ('\u{1f9c1}', '\u{1f9cf}'), ('\u{1f9e7}', '\u{1ffff}'), + ('\u{2a6d7}', '\u{2a6ff}'), ('\u{2b735}', '\u{2b73f}'), + ('\u{2b81e}', '\u{2b81f}'), ('\u{2cea2}', '\u{2ceaf}'), + ('\u{2ebe1}', '\u{2f7ff}'), ('\u{2fa1e}', '\u{e0000}'), + ('\u{e0002}', '\u{e001f}'), ('\u{e0080}', '\u{e00ff}'), + ('\u{e01f0}', '\u{effff}'), ('\u{ffffe}', '\u{fffff}'), + ('\u{10fffe}', '\u{10ffff}'), +]; + +pub const UPPERCASE_LETTER: &'static [(char, char)] = &[ + ('A', 'Z'), ('À', 'Ö'), ('Ø', 'Þ'), ('Ā', 'Ā'), ('Ă', 'Ă'), + ('Ą', 'Ą'), ('Ć', 'Ć'), ('Ĉ', 'Ĉ'), ('Ċ', 'Ċ'), ('Č', 'Č'), + ('Ď', 'Ď'), ('Đ', 'Đ'), ('Ē', 'Ē'), ('Ĕ', 'Ĕ'), ('Ė', 'Ė'), + ('Ę', 'Ę'), ('Ě', 'Ě'), ('Ĝ', 'Ĝ'), ('Ğ', 'Ğ'), ('Ġ', 'Ġ'), + ('Ģ', 'Ģ'), ('Ĥ', 'Ĥ'), ('Ħ', 'Ħ'), ('Ĩ', 'Ĩ'), ('Ī', 'Ī'), + ('Ĭ', 'Ĭ'), ('Į', 'Į'), ('İ', 'İ'), ('IJ', 'IJ'), ('Ĵ', 'Ĵ'), + ('Ķ', 'Ķ'), ('Ĺ', 'Ĺ'), ('Ļ', 'Ļ'), ('Ľ', 'Ľ'), ('Ŀ', 'Ŀ'), + ('Ł', 'Ł'), ('Ń', 'Ń'), ('Ņ', 'Ņ'), ('Ň', 'Ň'), ('Ŋ', 'Ŋ'), + ('Ō', 'Ō'), ('Ŏ', 'Ŏ'), ('Ő', 'Ő'), ('Œ', 'Œ'), ('Ŕ', 'Ŕ'), + ('Ŗ', 'Ŗ'), ('Ř', 'Ř'), ('Ś', 'Ś'), ('Ŝ', 'Ŝ'), ('Ş', 'Ş'), + ('Š', 'Š'), ('Ţ', 'Ţ'), ('Ť', 'Ť'), ('Ŧ', 'Ŧ'), ('Ũ', 'Ũ'), + ('Ū', 'Ū'), ('Ŭ', 'Ŭ'), ('Ů', 'Ů'), ('Ű', 'Ű'), ('Ų', 'Ų'), + ('Ŵ', 'Ŵ'), ('Ŷ', 'Ŷ'), ('Ÿ', 'Ź'), ('Ż', 'Ż'), ('Ž', 'Ž'), + ('Ɓ', 'Ƃ'), ('Ƅ', 'Ƅ'), ('Ɔ', 'Ƈ'), ('Ɖ', 'Ƌ'), ('Ǝ', 'Ƒ'), + ('Ɠ', 'Ɣ'), ('Ɩ', 'Ƙ'), ('Ɯ', 'Ɲ'), ('Ɵ', 'Ơ'), ('Ƣ', 'Ƣ'), + ('Ƥ', 'Ƥ'), ('Ʀ', 'Ƨ'), ('Ʃ', 'Ʃ'), ('Ƭ', 'Ƭ'), ('Ʈ', 'Ư'), + ('Ʊ', 'Ƴ'), ('Ƶ', 'Ƶ'), ('Ʒ', 'Ƹ'), ('Ƽ', 'Ƽ'), ('DŽ', 'DŽ'), + ('LJ', 'LJ'), ('NJ', 'NJ'), ('Ǎ', 'Ǎ'), ('Ǐ', 'Ǐ'), ('Ǒ', 
'Ǒ'), + ('Ǔ', 'Ǔ'), ('Ǖ', 'Ǖ'), ('Ǘ', 'Ǘ'), ('Ǚ', 'Ǚ'), ('Ǜ', 'Ǜ'), + ('Ǟ', 'Ǟ'), ('Ǡ', 'Ǡ'), ('Ǣ', 'Ǣ'), ('Ǥ', 'Ǥ'), ('Ǧ', 'Ǧ'), + ('Ǩ', 'Ǩ'), ('Ǫ', 'Ǫ'), ('Ǭ', 'Ǭ'), ('Ǯ', 'Ǯ'), ('DZ', 'DZ'), + ('Ǵ', 'Ǵ'), ('Ƕ', 'Ǹ'), ('Ǻ', 'Ǻ'), ('Ǽ', 'Ǽ'), ('Ǿ', 'Ǿ'), + ('Ȁ', 'Ȁ'), ('Ȃ', 'Ȃ'), ('Ȅ', 'Ȅ'), ('Ȇ', 'Ȇ'), ('Ȉ', 'Ȉ'), + ('Ȋ', 'Ȋ'), ('Ȍ', 'Ȍ'), ('Ȏ', 'Ȏ'), ('Ȑ', 'Ȑ'), ('Ȓ', 'Ȓ'), + ('Ȕ', 'Ȕ'), ('Ȗ', 'Ȗ'), ('Ș', 'Ș'), ('Ț', 'Ț'), ('Ȝ', 'Ȝ'), + ('Ȟ', 'Ȟ'), ('Ƞ', 'Ƞ'), ('Ȣ', 'Ȣ'), ('Ȥ', 'Ȥ'), ('Ȧ', 'Ȧ'), + ('Ȩ', 'Ȩ'), ('Ȫ', 'Ȫ'), ('Ȭ', 'Ȭ'), ('Ȯ', 'Ȯ'), ('Ȱ', 'Ȱ'), + ('Ȳ', 'Ȳ'), ('Ⱥ', 'Ȼ'), ('Ƚ', 'Ⱦ'), ('Ɂ', 'Ɂ'), ('Ƀ', 'Ɇ'), + ('Ɉ', 'Ɉ'), ('Ɋ', 'Ɋ'), ('Ɍ', 'Ɍ'), ('Ɏ', 'Ɏ'), ('Ͱ', 'Ͱ'), + ('Ͳ', 'Ͳ'), ('Ͷ', 'Ͷ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), + ('Ό', 'Ό'), ('Ύ', 'Ώ'), ('Α', 'Ρ'), ('Σ', 'Ϋ'), ('Ϗ', 'Ϗ'), + ('ϒ', 'ϔ'), ('Ϙ', 'Ϙ'), ('Ϛ', 'Ϛ'), ('Ϝ', 'Ϝ'), ('Ϟ', 'Ϟ'), + ('Ϡ', 'Ϡ'), ('Ϣ', 'Ϣ'), ('Ϥ', 'Ϥ'), ('Ϧ', 'Ϧ'), ('Ϩ', 'Ϩ'), + ('Ϫ', 'Ϫ'), ('Ϭ', 'Ϭ'), ('Ϯ', 'Ϯ'), ('ϴ', 'ϴ'), ('Ϸ', 'Ϸ'), + ('Ϲ', 'Ϻ'), ('Ͻ', 'Я'), ('Ѡ', 'Ѡ'), ('Ѣ', 'Ѣ'), ('Ѥ', 'Ѥ'), + ('Ѧ', 'Ѧ'), ('Ѩ', 'Ѩ'), ('Ѫ', 'Ѫ'), ('Ѭ', 'Ѭ'), ('Ѯ', 'Ѯ'), + ('Ѱ', 'Ѱ'), ('Ѳ', 'Ѳ'), ('Ѵ', 'Ѵ'), ('Ѷ', 'Ѷ'), ('Ѹ', 'Ѹ'), + ('Ѻ', 'Ѻ'), ('Ѽ', 'Ѽ'), ('Ѿ', 'Ѿ'), ('Ҁ', 'Ҁ'), ('Ҋ', 'Ҋ'), + ('Ҍ', 'Ҍ'), ('Ҏ', 'Ҏ'), ('Ґ', 'Ґ'), ('Ғ', 'Ғ'), ('Ҕ', 'Ҕ'), + ('Җ', 'Җ'), ('Ҙ', 'Ҙ'), ('Қ', 'Қ'), ('Ҝ', 'Ҝ'), ('Ҟ', 'Ҟ'), + ('Ҡ', 'Ҡ'), ('Ң', 'Ң'), ('Ҥ', 'Ҥ'), ('Ҧ', 'Ҧ'), ('Ҩ', 'Ҩ'), + ('Ҫ', 'Ҫ'), ('Ҭ', 'Ҭ'), ('Ү', 'Ү'), ('Ұ', 'Ұ'), ('Ҳ', 'Ҳ'), + ('Ҵ', 'Ҵ'), ('Ҷ', 'Ҷ'), ('Ҹ', 'Ҹ'), ('Һ', 'Һ'), ('Ҽ', 'Ҽ'), + ('Ҿ', 'Ҿ'), ('Ӏ', 'Ӂ'), ('Ӄ', 'Ӄ'), ('Ӆ', 'Ӆ'), ('Ӈ', 'Ӈ'), + ('Ӊ', 'Ӊ'), ('Ӌ', 'Ӌ'), ('Ӎ', 'Ӎ'), ('Ӑ', 'Ӑ'), ('Ӓ', 'Ӓ'), + ('Ӕ', 'Ӕ'), ('Ӗ', 'Ӗ'), ('Ә', 'Ә'), ('Ӛ', 'Ӛ'), ('Ӝ', 'Ӝ'), + ('Ӟ', 'Ӟ'), ('Ӡ', 'Ӡ'), ('Ӣ', 'Ӣ'), ('Ӥ', 'Ӥ'), ('Ӧ', 'Ӧ'), + ('Ө', 'Ө'), ('Ӫ', 'Ӫ'), ('Ӭ', 'Ӭ'), ('Ӯ', 'Ӯ'), ('Ӱ', 'Ӱ'), + ('Ӳ', 'Ӳ'), ('Ӵ', 'Ӵ'), ('Ӷ', 'Ӷ'), ('Ӹ', 'Ӹ'), ('Ӻ', 'Ӻ'), + ('Ӽ', 'Ӽ'), ('Ӿ', 'Ӿ'), ('Ԁ', 'Ԁ'), ('Ԃ', 'Ԃ'), ('Ԅ', 'Ԅ'), + ('Ԇ', 
'Ԇ'), ('Ԉ', 'Ԉ'), ('Ԋ', 'Ԋ'), ('Ԍ', 'Ԍ'), ('Ԏ', 'Ԏ'), + ('Ԑ', 'Ԑ'), ('Ԓ', 'Ԓ'), ('Ԕ', 'Ԕ'), ('Ԗ', 'Ԗ'), ('Ԙ', 'Ԙ'), + ('Ԛ', 'Ԛ'), ('Ԝ', 'Ԝ'), ('Ԟ', 'Ԟ'), ('Ԡ', 'Ԡ'), ('Ԣ', 'Ԣ'), + ('Ԥ', 'Ԥ'), ('Ԧ', 'Ԧ'), ('Ԩ', 'Ԩ'), ('Ԫ', 'Ԫ'), ('Ԭ', 'Ԭ'), + ('Ԯ', 'Ԯ'), ('Ա', 'Ֆ'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), + ('Ꭰ', 'Ᏽ'), ('Ḁ', 'Ḁ'), ('Ḃ', 'Ḃ'), ('Ḅ', 'Ḅ'), + ('Ḇ', 'Ḇ'), ('Ḉ', 'Ḉ'), ('Ḋ', 'Ḋ'), ('Ḍ', 'Ḍ'), + ('Ḏ', 'Ḏ'), ('Ḑ', 'Ḑ'), ('Ḓ', 'Ḓ'), ('Ḕ', 'Ḕ'), + ('Ḗ', 'Ḗ'), ('Ḙ', 'Ḙ'), ('Ḛ', 'Ḛ'), ('Ḝ', 'Ḝ'), + ('Ḟ', 'Ḟ'), ('Ḡ', 'Ḡ'), ('Ḣ', 'Ḣ'), ('Ḥ', 'Ḥ'), + ('Ḧ', 'Ḧ'), ('Ḩ', 'Ḩ'), ('Ḫ', 'Ḫ'), ('Ḭ', 'Ḭ'), + ('Ḯ', 'Ḯ'), ('Ḱ', 'Ḱ'), ('Ḳ', 'Ḳ'), ('Ḵ', 'Ḵ'), + ('Ḷ', 'Ḷ'), ('Ḹ', 'Ḹ'), ('Ḻ', 'Ḻ'), ('Ḽ', 'Ḽ'), + ('Ḿ', 'Ḿ'), ('Ṁ', 'Ṁ'), ('Ṃ', 'Ṃ'), ('Ṅ', 'Ṅ'), + ('Ṇ', 'Ṇ'), ('Ṉ', 'Ṉ'), ('Ṋ', 'Ṋ'), ('Ṍ', 'Ṍ'), + ('Ṏ', 'Ṏ'), ('Ṑ', 'Ṑ'), ('Ṓ', 'Ṓ'), ('Ṕ', 'Ṕ'), + ('Ṗ', 'Ṗ'), ('Ṙ', 'Ṙ'), ('Ṛ', 'Ṛ'), ('Ṝ', 'Ṝ'), + ('Ṟ', 'Ṟ'), ('Ṡ', 'Ṡ'), ('Ṣ', 'Ṣ'), ('Ṥ', 'Ṥ'), + ('Ṧ', 'Ṧ'), ('Ṩ', 'Ṩ'), ('Ṫ', 'Ṫ'), ('Ṭ', 'Ṭ'), + ('Ṯ', 'Ṯ'), ('Ṱ', 'Ṱ'), ('Ṳ', 'Ṳ'), ('Ṵ', 'Ṵ'), + ('Ṷ', 'Ṷ'), ('Ṹ', 'Ṹ'), ('Ṻ', 'Ṻ'), ('Ṽ', 'Ṽ'), + ('Ṿ', 'Ṿ'), ('Ẁ', 'Ẁ'), ('Ẃ', 'Ẃ'), ('Ẅ', 'Ẅ'), + ('Ẇ', 'Ẇ'), ('Ẉ', 'Ẉ'), ('Ẋ', 'Ẋ'), ('Ẍ', 'Ẍ'), + ('Ẏ', 'Ẏ'), ('Ẑ', 'Ẑ'), ('Ẓ', 'Ẓ'), ('Ẕ', 'Ẕ'), + ('ẞ', 'ẞ'), ('Ạ', 'Ạ'), ('Ả', 'Ả'), ('Ấ', 'Ấ'), + ('Ầ', 'Ầ'), ('Ẩ', 'Ẩ'), ('Ẫ', 'Ẫ'), ('Ậ', 'Ậ'), + ('Ắ', 'Ắ'), ('Ằ', 'Ằ'), ('Ẳ', 'Ẳ'), ('Ẵ', 'Ẵ'), + ('Ặ', 'Ặ'), ('Ẹ', 'Ẹ'), ('Ẻ', 'Ẻ'), ('Ẽ', 'Ẽ'), + ('Ế', 'Ế'), ('Ề', 'Ề'), ('Ể', 'Ể'), ('Ễ', 'Ễ'), + ('Ệ', 'Ệ'), ('Ỉ', 'Ỉ'), ('Ị', 'Ị'), ('Ọ', 'Ọ'), + ('Ỏ', 'Ỏ'), ('Ố', 'Ố'), ('Ồ', 'Ồ'), ('Ổ', 'Ổ'), + ('Ỗ', 'Ỗ'), ('Ộ', 'Ộ'), ('Ớ', 'Ớ'), ('Ờ', 'Ờ'), + ('Ở', 'Ở'), ('Ỡ', 'Ỡ'), ('Ợ', 'Ợ'), ('Ụ', 'Ụ'), + ('Ủ', 'Ủ'), ('Ứ', 'Ứ'), ('Ừ', 'Ừ'), ('Ử', 'Ử'), + ('Ữ', 'Ữ'), ('Ự', 'Ự'), ('Ỳ', 'Ỳ'), ('Ỵ', 'Ỵ'), + ('Ỷ', 'Ỷ'), ('Ỹ', 'Ỹ'), ('Ỻ', 'Ỻ'), ('Ỽ', 'Ỽ'), + ('Ỿ', 'Ỿ'), ('Ἀ', 'Ἇ'), ('Ἐ', 'Ἕ'), ('Ἠ', 'Ἧ'), + ('Ἰ', 'Ἷ'), ('Ὀ', 'Ὅ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), ('Ὗ', 'Ὗ'), ('Ὠ', 'Ὧ'), ('Ᾰ', 
'Ά'), + ('Ὲ', 'Ή'), ('Ῐ', 'Ί'), ('Ῠ', 'Ῥ'), ('Ὸ', 'Ώ'), + ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℋ', 'ℍ'), ('ℐ', 'ℒ'), + ('ℕ', 'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), + ('ℨ', 'ℨ'), ('K', 'ℭ'), ('ℰ', 'ℳ'), ('ℾ', 'ℿ'), + ('ⅅ', 'ⅅ'), ('Ↄ', 'Ↄ'), ('Ⰰ', 'Ⱞ'), ('Ⱡ', 'Ⱡ'), + ('Ɫ', 'Ɽ'), ('Ⱨ', 'Ⱨ'), ('Ⱪ', 'Ⱪ'), ('Ⱬ', 'Ⱬ'), + ('Ɑ', 'Ɒ'), ('Ⱳ', 'Ⱳ'), ('Ⱶ', 'Ⱶ'), ('Ȿ', 'Ⲁ'), + ('Ⲃ', 'Ⲃ'), ('Ⲅ', 'Ⲅ'), ('Ⲇ', 'Ⲇ'), ('Ⲉ', 'Ⲉ'), + ('Ⲋ', 'Ⲋ'), ('Ⲍ', 'Ⲍ'), ('Ⲏ', 'Ⲏ'), ('Ⲑ', 'Ⲑ'), + ('Ⲓ', 'Ⲓ'), ('Ⲕ', 'Ⲕ'), ('Ⲗ', 'Ⲗ'), ('Ⲙ', 'Ⲙ'), + ('Ⲛ', 'Ⲛ'), ('Ⲝ', 'Ⲝ'), ('Ⲟ', 'Ⲟ'), ('Ⲡ', 'Ⲡ'), + ('Ⲣ', 'Ⲣ'), ('Ⲥ', 'Ⲥ'), ('Ⲧ', 'Ⲧ'), ('Ⲩ', 'Ⲩ'), + ('Ⲫ', 'Ⲫ'), ('Ⲭ', 'Ⲭ'), ('Ⲯ', 'Ⲯ'), ('Ⲱ', 'Ⲱ'), + ('Ⲳ', 'Ⲳ'), ('Ⲵ', 'Ⲵ'), ('Ⲷ', 'Ⲷ'), ('Ⲹ', 'Ⲹ'), + ('Ⲻ', 'Ⲻ'), ('Ⲽ', 'Ⲽ'), ('Ⲿ', 'Ⲿ'), ('Ⳁ', 'Ⳁ'), + ('Ⳃ', 'Ⳃ'), ('Ⳅ', 'Ⳅ'), ('Ⳇ', 'Ⳇ'), ('Ⳉ', 'Ⳉ'), + ('Ⳋ', 'Ⳋ'), ('Ⳍ', 'Ⳍ'), ('Ⳏ', 'Ⳏ'), ('Ⳑ', 'Ⳑ'), + ('Ⳓ', 'Ⳓ'), ('Ⳕ', 'Ⳕ'), ('Ⳗ', 'Ⳗ'), ('Ⳙ', 'Ⳙ'), + ('Ⳛ', 'Ⳛ'), ('Ⳝ', 'Ⳝ'), ('Ⳟ', 'Ⳟ'), ('Ⳡ', 'Ⳡ'), + ('Ⳣ', 'Ⳣ'), ('Ⳬ', 'Ⳬ'), ('Ⳮ', 'Ⳮ'), ('Ⳳ', 'Ⳳ'), + ('Ꙁ', 'Ꙁ'), ('Ꙃ', 'Ꙃ'), ('Ꙅ', 'Ꙅ'), ('Ꙇ', 'Ꙇ'), + ('Ꙉ', 'Ꙉ'), ('Ꙋ', 'Ꙋ'), ('Ꙍ', 'Ꙍ'), ('Ꙏ', 'Ꙏ'), + ('Ꙑ', 'Ꙑ'), ('Ꙓ', 'Ꙓ'), ('Ꙕ', 'Ꙕ'), ('Ꙗ', 'Ꙗ'), + ('Ꙙ', 'Ꙙ'), ('Ꙛ', 'Ꙛ'), ('Ꙝ', 'Ꙝ'), ('Ꙟ', 'Ꙟ'), + ('Ꙡ', 'Ꙡ'), ('Ꙣ', 'Ꙣ'), ('Ꙥ', 'Ꙥ'), ('Ꙧ', 'Ꙧ'), + ('Ꙩ', 'Ꙩ'), ('Ꙫ', 'Ꙫ'), ('Ꙭ', 'Ꙭ'), ('Ꚁ', 'Ꚁ'), + ('Ꚃ', 'Ꚃ'), ('Ꚅ', 'Ꚅ'), ('Ꚇ', 'Ꚇ'), ('Ꚉ', 'Ꚉ'), + ('Ꚋ', 'Ꚋ'), ('Ꚍ', 'Ꚍ'), ('Ꚏ', 'Ꚏ'), ('Ꚑ', 'Ꚑ'), + ('Ꚓ', 'Ꚓ'), ('Ꚕ', 'Ꚕ'), ('Ꚗ', 'Ꚗ'), ('Ꚙ', 'Ꚙ'), + ('Ꚛ', 'Ꚛ'), ('Ꜣ', 'Ꜣ'), ('Ꜥ', 'Ꜥ'), ('Ꜧ', 'Ꜧ'), + ('Ꜩ', 'Ꜩ'), ('Ꜫ', 'Ꜫ'), ('Ꜭ', 'Ꜭ'), ('Ꜯ', 'Ꜯ'), + ('Ꜳ', 'Ꜳ'), ('Ꜵ', 'Ꜵ'), ('Ꜷ', 'Ꜷ'), ('Ꜹ', 'Ꜹ'), + ('Ꜻ', 'Ꜻ'), ('Ꜽ', 'Ꜽ'), ('Ꜿ', 'Ꜿ'), ('Ꝁ', 'Ꝁ'), + ('Ꝃ', 'Ꝃ'), ('Ꝅ', 'Ꝅ'), ('Ꝇ', 'Ꝇ'), ('Ꝉ', 'Ꝉ'), + ('Ꝋ', 'Ꝋ'), ('Ꝍ', 'Ꝍ'), ('Ꝏ', 'Ꝏ'), ('Ꝑ', 'Ꝑ'), + ('Ꝓ', 'Ꝓ'), ('Ꝕ', 'Ꝕ'), ('Ꝗ', 'Ꝗ'), ('Ꝙ', 'Ꝙ'), + ('Ꝛ', 'Ꝛ'), ('Ꝝ', 'Ꝝ'), ('Ꝟ', 'Ꝟ'), ('Ꝡ', 'Ꝡ'), + ('Ꝣ', 'Ꝣ'), ('Ꝥ', 'Ꝥ'), ('Ꝧ', 'Ꝧ'), ('Ꝩ', 'Ꝩ'), + ('Ꝫ', 'Ꝫ'), ('Ꝭ', 'Ꝭ'), ('Ꝯ', 'Ꝯ'), ('Ꝺ', 'Ꝺ'), + ('Ꝼ', 'Ꝼ'), ('Ᵹ', 'Ꝿ'), ('Ꞁ', 'Ꞁ'), ('Ꞃ', 
'Ꞃ'), + ('Ꞅ', 'Ꞅ'), ('Ꞇ', 'Ꞇ'), ('Ꞌ', 'Ꞌ'), ('Ɥ', 'Ɥ'), + ('Ꞑ', 'Ꞑ'), ('Ꞓ', 'Ꞓ'), ('Ꞗ', 'Ꞗ'), ('Ꞙ', 'Ꞙ'), + ('Ꞛ', 'Ꞛ'), ('Ꞝ', 'Ꞝ'), ('Ꞟ', 'Ꞟ'), ('Ꞡ', 'Ꞡ'), + ('Ꞣ', 'Ꞣ'), ('Ꞥ', 'Ꞥ'), ('Ꞧ', 'Ꞧ'), ('Ꞩ', 'Ꞩ'), + ('Ɦ', 'Ɪ'), ('Ʞ', 'Ꞵ'), ('Ꞷ', 'Ꞷ'), ('A', 'Z'), + ('𐐀', '𐐧'), ('𐒰', '𐓓'), ('𐲀', '𐲲'), ('𑢠', '𑢿'), + ('𝐀', '𝐙'), ('𝐴', '𝑍'), ('𝑨', '𝒁'), ('𝒜', '𝒜'), + ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), + ('𝒮', '𝒵'), ('𝓐', '𝓩'), ('𝔄', '𝔅'), ('𝔇', '𝔊'), + ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔸', '𝔹'), ('𝔻', '𝔾'), + ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕬', '𝖅'), + ('𝖠', '𝖹'), ('𝗔', '𝗭'), ('𝘈', '𝘡'), ('𝘼', '𝙕'), + ('𝙰', '𝚉'), ('𝚨', '𝛀'), ('𝛢', '𝛺'), ('𝜜', '𝜴'), + ('𝝖', '𝝮'), ('𝞐', '𝞨'), ('𝟊', '𝟊'), ('𞤀', '𞤡'), +]; diff --git a/regex-syntax-2/src/unicode_tables/mod.rs b/regex-syntax-2/src/unicode_tables/mod.rs new file mode 100644 index 0000000000..6c2e9e7736 --- /dev/null +++ b/regex-syntax-2/src/unicode_tables/mod.rs @@ -0,0 +1,9 @@ +pub mod age; +pub mod case_folding_simple; +pub mod general_category; +pub mod perl_word; +pub mod property_bool; +pub mod property_names; +pub mod property_values; +pub mod script_extension; +pub mod script; diff --git a/regex-syntax-2/src/unicode_tables/perl_word.rs b/regex-syntax-2/src/unicode_tables/perl_word.rs new file mode 100644 index 0000000000..d33f79a02b --- /dev/null +++ b/regex-syntax-2/src/unicode_tables/perl_word.rs @@ -0,0 +1,179 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate perl-word /home/andrew/tmp/ucd-10.0.0/ --chars +// +// ucd-generate is available on crates.io. 
+ +pub const PERL_WORD: &'static [(char, char)] = &[ + ('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), + ('º', 'º'), ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), ('ˆ', 'ˑ'), + ('ˠ', 'ˤ'), ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('̀', 'ʹ'), ('Ͷ', 'ͷ'), + ('ͺ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), + ('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('҃', 'ԯ'), ('Ա', 'Ֆ'), + ('ՙ', 'ՙ'), ('ա', 'և'), ('֑', 'ֽ'), ('ֿ', 'ֿ'), ('ׁ', 'ׂ'), + ('ׄ', 'ׅ'), ('ׇ', 'ׇ'), ('א', 'ת'), ('װ', 'ײ'), ('ؐ', 'ؚ'), + ('ؠ', '٩'), ('ٮ', 'ۓ'), ('ە', 'ۜ'), ('۟', 'ۨ'), ('۪', 'ۼ'), + ('ۿ', 'ۿ'), ('ܐ', '݊'), ('ݍ', 'ޱ'), ('߀', 'ߵ'), ('ߺ', 'ߺ'), + ('ࠀ', '࠭'), ('ࡀ', '࡛'), ('ࡠ', 'ࡪ'), ('ࢠ', 'ࢴ'), + ('ࢶ', 'ࢽ'), ('ࣔ', '࣡'), ('ࣣ', 'ॣ'), ('०', '९'), + ('ॱ', 'ঃ'), ('অ', 'ঌ'), ('এ', 'ঐ'), ('ও', 'ন'), + ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), ('়', 'ৄ'), + ('ে', 'ৈ'), ('ো', 'ৎ'), ('ৗ', 'ৗ'), ('ড়', 'ঢ়'), + ('য়', 'ৣ'), ('০', 'ৱ'), ('ৼ', 'ৼ'), ('ਁ', 'ਃ'), + ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('਼', '਼'), + ('ਾ', 'ੂ'), ('ੇ', 'ੈ'), ('ੋ', '੍'), ('ੑ', 'ੑ'), + ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('੦', 'ੵ'), ('ઁ', 'ઃ'), + ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), + ('લ', 'ળ'), ('વ', 'હ'), ('઼', 'ૅ'), ('ે', 'ૉ'), + ('ો', '્'), ('ૐ', 'ૐ'), ('ૠ', 'ૣ'), ('૦', '૯'), + ('ૹ', '૿'), ('ଁ', 'ଃ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'), + ('଼', 'ୄ'), ('େ', 'ୈ'), ('ୋ', '୍'), ('ୖ', 'ୗ'), + ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୣ'), ('୦', '୯'), ('ୱ', 'ୱ'), + ('ஂ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), + ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), + ('ந', 'ப'), ('ம', 'ஹ'), ('ா', 'ூ'), ('ெ', 'ை'), + ('ொ', '்'), ('ௐ', 'ௐ'), ('ௗ', 'ௗ'), ('௦', '௯'), + ('ఀ', 'ః'), ('అ', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), + ('ప', 'హ'), ('ఽ', 'ౄ'), ('ె', 'ై'), ('ొ', '్'), + ('ౕ', 'ౖ'), ('ౘ', 'ౚ'), ('ౠ', 'ౣ'), ('౦', '౯'), + ('ಀ', 'ಃ'), ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('಼', 'ೄ'), ('ೆ', 'ೈ'), + ('ೊ', '್'), ('ೕ', 'ೖ'), ('ೞ', 'ೞ'), ('ೠ', 'ೣ'), + ('೦', '೯'), 
('ೱ', 'ೲ'), ('ഀ', 'ഃ'), ('അ', 'ഌ'), + ('എ', 'ഐ'), ('ഒ', 'ൄ'), ('െ', 'ൈ'), ('ൊ', 'ൎ'), + ('ൔ', 'ൗ'), ('ൟ', 'ൣ'), ('൦', '൯'), ('ൺ', 'ൿ'), + ('ං', 'ඃ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), + ('ල', 'ල'), ('ව', 'ෆ'), ('්', '්'), ('ා', 'ු'), + ('ූ', 'ූ'), ('ෘ', 'ෟ'), ('෦', '෯'), ('ෲ', 'ෳ'), + ('ก', 'ฺ'), ('เ', '๎'), ('๐', '๙'), ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), ('ງ', 'ຈ'), ('ຊ', 'ຊ'), ('ຍ', 'ຍ'), + ('ດ', 'ທ'), ('ນ', 'ຟ'), ('ມ', 'ຣ'), ('ລ', 'ລ'), + ('ວ', 'ວ'), ('ສ', 'ຫ'), ('ອ', 'ູ'), ('ົ', 'ຽ'), + ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('່', 'ໍ'), ('໐', '໙'), + ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('༘', '༙'), ('༠', '༩'), + ('༵', '༵'), ('༷', '༷'), ('༹', '༹'), ('༾', 'ཇ'), + ('ཉ', 'ཬ'), ('ཱ', '྄'), ('྆', 'ྗ'), ('ྙ', 'ྼ'), + ('࿆', '࿆'), ('က', '၉'), ('ၐ', 'ႝ'), ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), + ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('፝', '፟'), + ('ᎀ', 'ᎏ'), ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), + ('ᜀ', 'ᜌ'), ('ᜎ', '᜔'), ('ᜠ', '᜴'), ('ᝀ', 'ᝓ'), + ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('ᝲ', 'ᝳ'), ('ក', '៓'), + ('ៗ', 'ៗ'), ('ៜ', '៝'), ('០', '៩'), ('᠋', '᠍'), + ('᠐', '᠙'), ('ᠠ', 'ᡷ'), ('ᢀ', 'ᢪ'), ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), ('ᤠ', 'ᤫ'), ('ᤰ', '᤻'), ('᥆', 'ᥭ'), + ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧙'), + ('ᨀ', 'ᨛ'), ('ᨠ', 'ᩞ'), ('᩠', '᩼'), ('᩿', '᪉'), + ('᪐', '᪙'), ('ᪧ', 'ᪧ'), ('᪰', '᪾'), ('ᬀ', 'ᭋ'), + ('᭐', '᭙'), ('᭫', '᭳'), ('ᮀ', '᯳'), ('ᰀ', '᰷'), + ('᱀', '᱉'), ('ᱍ', 'ᱽ'), ('ᲀ', 'ᲈ'), ('᳐', '᳒'), + ('᳔', '᳹'), ('ᴀ', '᷹'), ('᷻', 'ἕ'), ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), ('\u{200c}', '\u{200d}'), ('‿', '⁀'), ('⁔', '⁔'), + ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('⃐', '⃰'), + ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), 
+ ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), + ('K', 'ℭ'), ('ℯ', 'ℹ'), ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), ('Ⓐ', 'ⓩ'), ('Ⰰ', 'Ⱞ'), + ('ⰰ', 'ⱞ'), ('Ⱡ', 'ⳤ'), ('Ⳬ', 'ⳳ'), ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ⴰ', 'ⵧ'), ('ⵯ', 'ⵯ'), + ('⵿', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), ('ⷠ', 'ⷿ'), ('ⸯ', 'ⸯ'), ('々', '〇'), + ('〡', '〯'), ('〱', '〵'), ('〸', '〼'), ('ぁ', 'ゖ'), + ('゙', '゚'), ('ゝ', 'ゟ'), ('ァ', 'ヺ'), ('ー', 'ヿ'), + ('ㄅ', 'ㄮ'), ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆺ'), ('ㇰ', 'ㇿ'), + ('㐀', '䶵'), ('一', '鿪'), ('ꀀ', 'ꒌ'), ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), ('ꘐ', 'ꘫ'), ('Ꙁ', '꙲'), ('ꙴ', '꙽'), + ('ꙿ', '꛱'), ('ꜗ', 'ꜟ'), ('Ꜣ', 'ꞈ'), ('Ꞌ', 'Ɪ'), + ('Ʞ', 'ꞷ'), ('ꟷ', 'ꠧ'), ('ꡀ', 'ꡳ'), ('ꢀ', 'ꣅ'), + ('꣐', '꣙'), ('꣠', 'ꣷ'), ('ꣻ', 'ꣻ'), ('ꣽ', 'ꣽ'), + ('꤀', '꤭'), ('ꤰ', '꥓'), ('ꥠ', 'ꥼ'), ('ꦀ', '꧀'), + ('ꧏ', '꧙'), ('ꧠ', 'ꧾ'), ('ꨀ', 'ꨶ'), ('ꩀ', 'ꩍ'), + ('꩐', '꩙'), ('ꩠ', 'ꩶ'), ('ꩺ', 'ꫂ'), ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫯ'), ('ꫲ', '꫶'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭥ'), ('ꭰ', 'ꯪ'), ('꯬', '꯭'), ('꯰', '꯹'), + ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('豈', '舘'), + ('並', '龎'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('יִ', 'ﬨ'), + ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), + ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', 'ﴽ'), ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷻ'), ('︀', '️'), ('︠', '︯'), + ('︳', '︴'), ('﹍', '﹏'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), + ('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z'), + ('ヲ', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), + ('𐀼', '𐀽'), ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), + ('𐅀', '𐅴'), ('𐇽', '𐇽'), ('𐊀', '𐊜'), ('𐊠', '𐋐'), + ('𐋠', '𐋠'), ('𐌀', '𐌟'), ('𐌭', '𐍊'), ('𐍐', '𐍺'), + ('𐎀', '𐎝'), ('𐎠', '𐏃'), ('𐏈', '𐏏'), ('𐏑', '𐏕'), + ('𐐀', '𐒝'), ('𐒠', '𐒩'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), + ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), + ('𐝠', '𐝧'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), + ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), ('𐡠', '𐡶'), + ('𐢀', '𐢞'), ('𐣠', '𐣲'), ('𐣴', 
'𐣵'), ('𐤀', '𐤕'), + ('𐤠', '𐤹'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𐨀', '𐨃'), + ('𐨅', '𐨆'), ('𐨌', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨳'), + ('𐨸', '𐨺'), ('𐨿', '𐨿'), ('𐩠', '𐩼'), ('𐪀', '𐪜'), + ('𐫀', '𐫇'), ('𐫉', '𐫦'), ('𐬀', '𐬵'), ('𐭀', '𐭕'), + ('𐭠', '𐭲'), ('𐮀', '𐮑'), ('𐰀', '𐱈'), ('𐲀', '𐲲'), + ('𐳀', '𐳲'), ('𑀀', '𑁆'), ('𑁦', '𑁯'), ('𑁿', '𑂺'), + ('𑃐', '𑃨'), ('𑃰', '𑃹'), ('𑄀', '𑄴'), ('𑄶', '𑄿'), + ('𑅐', '𑅳'), ('𑅶', '𑅶'), ('𑆀', '𑇄'), ('𑇊', '𑇌'), + ('𑇐', '𑇚'), ('𑇜', '𑇜'), ('𑈀', '𑈑'), ('𑈓', '𑈷'), + ('𑈾', '𑈾'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), + ('𑊏', '𑊝'), ('𑊟', '𑊨'), ('𑊰', '𑋪'), ('𑋰', '𑋹'), + ('𑌀', '𑌃'), ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), + ('𑌪', '𑌰'), ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('𑌼', '𑍄'), + ('𑍇', '𑍈'), ('𑍋', '𑍍'), ('𑍐', '𑍐'), ('𑍗', '𑍗'), + ('𑍝', '𑍣'), ('𑍦', '𑍬'), ('𑍰', '𑍴'), ('𑐀', '𑑊'), + ('𑑐', '𑑙'), ('𑒀', '𑓅'), ('𑓇', '𑓇'), ('𑓐', '𑓙'), + ('𑖀', '𑖵'), ('𑖸', '𑗀'), ('𑗘', '𑗝'), ('𑘀', '𑙀'), + ('𑙄', '𑙄'), ('𑙐', '𑙙'), ('𑚀', '𑚷'), ('𑛀', '𑛉'), + ('𑜀', '𑜙'), ('𑜝', '𑜫'), ('𑜰', '𑜹'), ('𑢠', '𑣩'), + ('𑣿', '𑣿'), ('𑨀', '𑨾'), ('𑩇', '𑩇'), ('𑩐', '𑪃'), + ('𑪆', '𑪙'), ('𑫀', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '𑰶'), + ('𑰸', '𑱀'), ('𑱐', '𑱙'), ('𑱲', '𑲏'), ('𑲒', '𑲧'), + ('𑲩', '𑲶'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '𑴶'), + ('𑴺', '𑴺'), ('𑴼', '𑴽'), ('𑴿', '𑵇'), ('𑵐', '𑵙'), + ('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒒀', '𒕃'), ('𓀀', '𓐮'), + ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), ('𖩠', '𖩩'), + ('𖫐', '𖫭'), ('𖫰', '𖫴'), ('𖬀', '𖬶'), ('𖭀', '𖭃'), + ('𖭐', '𖭙'), ('𖭣', '𖭷'), ('𖭽', '𖮏'), ('𖼀', '𖽄'), + ('𖽐', '𖽾'), ('𖾏', '𖾟'), ('𖿠', '𖿡'), ('𗀀', '𘟬'), + ('𘠀', '𘫲'), ('𛀀', '𛄞'), ('𛅰', '𛋻'), ('𛰀', '𛱪'), + ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𛲝', '𛲞'), + ('𝅥', '𝅩'), ('𝅭', '𝅲'), ('𝅻', '𝆂'), ('𝆅', '𝆋'), + ('𝆪', '𝆭'), ('𝉂', '𝉄'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), + ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), + ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), + ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), + ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), + ('𝕒', '𝚥'), ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), + ('𝛼', '𝜔'), ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), + ('𝝰', '𝞈'), ('𝞊', '𝞨'), ('𝞪', 
'𝟂'), ('𝟄', '𝟋'), + ('𝟎', '𝟿'), ('𝨀', '𝨶'), ('𝨻', '𝩬'), ('𝩵', '𝩵'), + ('𝪄', '𝪄'), ('𝪛', '𝪟'), ('𝪡', '𝪯'), ('𞀀', '𞀆'), + ('𞀈', '𞀘'), ('𞀛', '𞀡'), ('𞀣', '𞀤'), ('𞀦', '𞀪'), + ('𞠀', '𞣄'), ('𞣐', '𞣖'), ('𞤀', '𞥊'), ('𞥐', '𞥙'), + ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), + ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), + ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), + ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), + ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), + ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), + ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), + ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), + ('𞺫', '𞺻'), ('🄰', '🅉'), ('🅐', '🅩'), ('🅰', '🆉'), + ('𠀀', '𪛖'), ('𪜀', '𫜴'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), + ('𬺰', '𮯠'), ('丽', '𪘀'), ('󠄀', '󠇯'), +]; diff --git a/regex-syntax-2/src/unicode_tables/property_bool.rs b/regex-syntax-2/src/unicode_tables/property_bool.rs new file mode 100644 index 0000000000..ae867e3007 --- /dev/null +++ b/regex-syntax-2/src/unicode_tables/property_bool.rs @@ -0,0 +1,2576 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate property-bool tmp/ucd-10.0.0/ --chars +// +// ucd-generate is available on crates.io. 
+ +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("ASCII_Hex_Digit", ASCII_HEX_DIGIT), ("Alphabetic", ALPHABETIC), + ("Bidi_Control", BIDI_CONTROL), ("Case_Ignorable", CASE_IGNORABLE), + ("Cased", CASED), ("Changes_When_Casefolded", CHANGES_WHEN_CASEFOLDED), + ("Changes_When_Casemapped", CHANGES_WHEN_CASEMAPPED), + ("Changes_When_Lowercased", CHANGES_WHEN_LOWERCASED), + ("Changes_When_Titlecased", CHANGES_WHEN_TITLECASED), + ("Changes_When_Uppercased", CHANGES_WHEN_UPPERCASED), ("Dash", DASH), + ("Default_Ignorable_Code_Point", DEFAULT_IGNORABLE_CODE_POINT), + ("Deprecated", DEPRECATED), ("Diacritic", DIACRITIC), + ("Extender", EXTENDER), ("Grapheme_Base", GRAPHEME_BASE), + ("Grapheme_Extend", GRAPHEME_EXTEND), ("Grapheme_Link", GRAPHEME_LINK), + ("Hex_Digit", HEX_DIGIT), ("Hyphen", HYPHEN), + ("IDS_Binary_Operator", IDS_BINARY_OPERATOR), + ("IDS_Trinary_Operator", IDS_TRINARY_OPERATOR), + ("ID_Continue", ID_CONTINUE), ("ID_Start", ID_START), + ("Ideographic", IDEOGRAPHIC), ("Join_Control", JOIN_CONTROL), + ("Logical_Order_Exception", LOGICAL_ORDER_EXCEPTION), + ("Lowercase", LOWERCASE), ("Math", MATH), + ("Noncharacter_Code_Point", NONCHARACTER_CODE_POINT), + ("Other_Alphabetic", OTHER_ALPHABETIC), + ("Other_Default_Ignorable_Code_Point", OTHER_DEFAULT_IGNORABLE_CODE_POINT), + ("Other_Grapheme_Extend", OTHER_GRAPHEME_EXTEND), + ("Other_ID_Continue", OTHER_ID_CONTINUE), + ("Other_ID_Start", OTHER_ID_START), ("Other_Lowercase", OTHER_LOWERCASE), + ("Other_Math", OTHER_MATH), ("Other_Uppercase", OTHER_UPPERCASE), + ("Pattern_Syntax", PATTERN_SYNTAX), + ("Pattern_White_Space", PATTERN_WHITE_SPACE), + ("Prepended_Concatenation_Mark", PREPENDED_CONCATENATION_MARK), + ("Quotation_Mark", QUOTATION_MARK), ("Radical", RADICAL), + ("Regional_Indicator", REGIONAL_INDICATOR), + ("Sentence_Terminal", SENTENCE_TERMINAL), ("Soft_Dotted", SOFT_DOTTED), + ("Terminal_Punctuation", TERMINAL_PUNCTUATION), + ("Unified_Ideograph", UNIFIED_IDEOGRAPH), 
("Uppercase", UPPERCASE), + ("Variation_Selector", VARIATION_SELECTOR), ("White_Space", WHITE_SPACE), + ("XID_Continue", XID_CONTINUE), ("XID_Start", XID_START), +]; + +pub const ASCII_HEX_DIGIT: &'static [(char, char)] = &[ + ('0', '9'), ('A', 'F'), ('a', 'f'), +]; + +pub const ALPHABETIC: &'static [(char, char)] = &[ + ('A', 'Z'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('º', 'º'), + ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('ͅ', 'ͅ'), ('Ͱ', 'ʹ'), ('Ͷ', 'ͷ'), + ('ͺ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), + ('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('Ҋ', 'ԯ'), ('Ա', 'Ֆ'), + ('ՙ', 'ՙ'), ('ա', 'և'), ('ְ', 'ֽ'), ('ֿ', 'ֿ'), ('ׁ', 'ׂ'), + ('ׄ', 'ׅ'), ('ׇ', 'ׇ'), ('א', 'ת'), ('װ', 'ײ'), ('ؐ', 'ؚ'), + ('ؠ', 'ٗ'), ('ٙ', 'ٟ'), ('ٮ', 'ۓ'), ('ە', 'ۜ'), ('ۡ', 'ۨ'), + ('ۭ', 'ۯ'), ('ۺ', 'ۼ'), ('ۿ', 'ۿ'), ('ܐ', 'ܿ'), ('ݍ', 'ޱ'), + ('ߊ', 'ߪ'), ('ߴ', 'ߵ'), ('ߺ', 'ߺ'), ('ࠀ', 'ࠗ'), ('ࠚ', 'ࠬ'), + ('ࡀ', 'ࡘ'), ('ࡠ', 'ࡪ'), ('ࢠ', 'ࢴ'), ('ࢶ', 'ࢽ'), + ('ࣔ', 'ࣟ'), ('ࣣ', 'ࣩ'), ('ࣰ', 'ऻ'), ('ऽ', 'ौ'), + ('ॎ', 'ॐ'), ('ॕ', 'ॣ'), ('ॱ', 'ঃ'), ('অ', 'ঌ'), + ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), + ('শ', 'হ'), ('ঽ', 'ৄ'), ('ে', 'ৈ'), ('ো', 'ৌ'), + ('ৎ', 'ৎ'), ('ৗ', 'ৗ'), ('ড়', 'ঢ়'), ('য়', 'ৣ'), + ('ৰ', 'ৱ'), ('ৼ', 'ৼ'), ('ਁ', 'ਃ'), ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('ਾ', 'ੂ'), ('ੇ', 'ੈ'), + ('ੋ', 'ੌ'), ('ੑ', 'ੑ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), + ('ੰ', 'ੵ'), ('ઁ', 'ઃ'), ('અ', 'ઍ'), ('એ', 'ઑ'), + ('ઓ', 'ન'), ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), + ('ઽ', 'ૅ'), ('ે', 'ૉ'), ('ો', 'ૌ'), ('ૐ', 'ૐ'), + ('ૠ', 'ૣ'), ('ૹ', 'ૼ'), ('ଁ', 'ଃ'), ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), ('ଽ', 'ୄ'), ('େ', 'ୈ'), ('ୋ', 'ୌ'), + ('ୖ', 'ୗ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୣ'), ('ୱ', 'ୱ'), + ('ஂ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), + ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), + ('ந', 'ப'), ('ம', 'ஹ'), ('ா', 'ூ'), ('ெ', 'ை'), + ('ொ', 'ௌ'), ('ௐ', 'ௐ'), ('ௗ', 'ௗ'), ('ఀ', 
'ః'), + ('అ', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), + ('ఽ', 'ౄ'), ('ె', 'ై'), ('ొ', 'ౌ'), ('ౕ', 'ౖ'), + ('ౘ', 'ౚ'), ('ౠ', 'ౣ'), ('ಀ', 'ಃ'), ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), + ('ಽ', 'ೄ'), ('ೆ', 'ೈ'), ('ೊ', 'ೌ'), ('ೕ', 'ೖ'), + ('ೞ', 'ೞ'), ('ೠ', 'ೣ'), ('ೱ', 'ೲ'), ('ഀ', 'ഃ'), + ('അ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ഺ'), ('ഽ', 'ൄ'), + ('െ', 'ൈ'), ('ൊ', 'ൌ'), ('ൎ', 'ൎ'), ('ൔ', 'ൗ'), + ('ൟ', 'ൣ'), ('ൺ', 'ൿ'), ('ං', 'ඃ'), ('අ', 'ඖ'), + ('ක', 'න'), ('ඳ', 'ර'), ('ල', 'ල'), ('ව', 'ෆ'), + ('ා', 'ු'), ('ූ', 'ූ'), ('ෘ', 'ෟ'), ('ෲ', 'ෳ'), + ('ก', 'ฺ'), ('เ', 'ๆ'), ('ํ', 'ํ'), ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), ('ງ', 'ຈ'), ('ຊ', 'ຊ'), ('ຍ', 'ຍ'), + ('ດ', 'ທ'), ('ນ', 'ຟ'), ('ມ', 'ຣ'), ('ລ', 'ລ'), + ('ວ', 'ວ'), ('ສ', 'ຫ'), ('ອ', 'ູ'), ('ົ', 'ຽ'), + ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('ໍ', 'ໍ'), ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), ('ཀ', 'ཇ'), ('ཉ', 'ཬ'), ('ཱ', 'ཱྀ'), + ('ྈ', 'ྗ'), ('ྙ', 'ྼ'), ('က', 'ံ'), ('း', 'း'), + ('ျ', 'ဿ'), ('ၐ', 'ၢ'), ('ၥ', 'ၨ'), ('ၮ', 'ႆ'), + ('ႎ', 'ႎ'), ('ႜ', 'ႝ'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჼ', 'ቈ'), ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኈ'), + ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('፟', '፟'), ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), ('ᜀ', 'ᜌ'), + ('ᜎ', 'ᜓ'), ('ᜠ', 'ᜳ'), ('ᝀ', 'ᝓ'), ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), ('ᝲ', 'ᝳ'), ('ក', 'ឳ'), ('ា', 'ៈ'), + ('ៗ', 'ៗ'), ('ៜ', 'ៜ'), ('ᠠ', 'ᡷ'), ('ᢀ', 'ᢪ'), + ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('ᤠ', 'ᤫ'), ('ᤰ', 'ᤸ'), + ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), + ('ᨀ', 'ᨛ'), ('ᨠ', 'ᩞ'), ('ᩡ', 'ᩴ'), ('ᪧ', 'ᪧ'), + ('ᬀ', 'ᬳ'), ('ᬵ', 'ᭃ'), ('ᭅ', 'ᭋ'), ('ᮀ', 'ᮩ'), + ('ᮬ', 'ᮯ'), ('ᮺ', 'ᯥ'), ('ᯧ', 'ᯱ'), ('ᰀ', 'ᰵ'), + ('ᱍ', 'ᱏ'), ('ᱚ', 'ᱽ'), ('ᲀ', 'ᲈ'), ('ᳩ', 'ᳬ'), + ('ᳮ', 'ᳳ'), ('ᳵ', 'ᳶ'), ('ᴀ', 'ᶿ'), ('ᷧ', 'ᷴ'), + ('Ḁ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), 
('ι', 'ι'), + ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), + ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℭ'), ('ℯ', 'ℹ'), + ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), + ('Ⓐ', 'ⓩ'), ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), ('Ⱡ', 'ⳤ'), + ('Ⳬ', 'ⳮ'), ('Ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), ('ⴰ', 'ⵧ'), ('ⵯ', 'ⵯ'), ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), + ('ⷠ', 'ⷿ'), ('ⸯ', 'ⸯ'), ('々', '〇'), ('〡', '〩'), + ('〱', '〵'), ('〸', '〼'), ('ぁ', 'ゖ'), ('ゝ', 'ゟ'), + ('ァ', 'ヺ'), ('ー', 'ヿ'), ('ㄅ', 'ㄮ'), ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆺ'), ('ㇰ', 'ㇿ'), ('㐀', '䶵'), ('一', '鿪'), + ('ꀀ', 'ꒌ'), ('ꓐ', 'ꓽ'), ('ꔀ', 'ꘌ'), ('ꘐ', 'ꘟ'), + ('ꘪ', 'ꘫ'), ('Ꙁ', 'ꙮ'), ('ꙴ', 'ꙻ'), ('ꙿ', 'ꛯ'), + ('ꜗ', 'ꜟ'), ('Ꜣ', 'ꞈ'), ('Ꞌ', 'Ɪ'), ('Ʞ', 'ꞷ'), + ('ꟷ', 'ꠁ'), ('ꠃ', 'ꠅ'), ('ꠇ', 'ꠊ'), ('ꠌ', 'ꠧ'), + ('ꡀ', 'ꡳ'), ('ꢀ', 'ꣃ'), ('ꣅ', 'ꣅ'), ('ꣲ', 'ꣷ'), + ('ꣻ', 'ꣻ'), ('ꣽ', 'ꣽ'), ('ꤊ', 'ꤪ'), ('ꤰ', 'ꥒ'), + ('ꥠ', 'ꥼ'), ('ꦀ', 'ꦲ'), ('ꦴ', 'ꦿ'), ('ꧏ', 'ꧏ'), + ('ꧠ', 'ꧤ'), ('ꧦ', 'ꧯ'), ('ꧺ', 'ꧾ'), ('ꨀ', 'ꨶ'), + ('ꩀ', 'ꩍ'), ('ꩠ', 'ꩶ'), ('ꩺ', 'ꩺ'), ('ꩾ', 'ꪾ'), + ('ꫀ', 'ꫀ'), ('ꫂ', 'ꫂ'), ('ꫛ', 'ꫝ'), ('ꫠ', 'ꫯ'), + ('ꫲ', 'ꫵ'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭥ'), + ('ꭰ', 'ꯪ'), ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), + ('豈', '舘'), ('並', '龎'), ('ff', 'st'), ('ﬓ', 'ﬗ'), + ('יִ', 'ﬨ'), ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), + ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷻ'), ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), ('A', 'Z'), ('a', 'z'), ('ヲ', 'ᄒ'), + ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), + ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐅀', '𐅴'), + ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('𐌀', '𐌟'), ('𐌭', '𐍊'), + ('𐍐', '𐍺'), ('𐎀', '𐎝'), ('𐎠', '𐏃'), ('𐏈', '𐏏'), + ('𐏑', '𐏕'), ('𐐀', '𐒝'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), + ('𐔀', '𐔧'), ('𐔰', 
'𐕣'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), + ('𐝠', '𐝧'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), + ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), ('𐡠', '𐡶'), + ('𐢀', '𐢞'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐤀', '𐤕'), + ('𐤠', '𐤹'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𐨀', '𐨃'), + ('𐨅', '𐨆'), ('𐨌', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨳'), + ('𐩠', '𐩼'), ('𐪀', '𐪜'), ('𐫀', '𐫇'), ('𐫉', '𐫤'), + ('𐬀', '𐬵'), ('𐭀', '𐭕'), ('𐭠', '𐭲'), ('𐮀', '𐮑'), + ('𐰀', '𐱈'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𑀀', '𑁅'), + ('𑂂', '𑂸'), ('𑃐', '𑃨'), ('𑄀', '𑄲'), ('𑅐', '𑅲'), + ('𑅶', '𑅶'), ('𑆀', '𑆿'), ('𑇁', '𑇄'), ('𑇚', '𑇚'), + ('𑇜', '𑇜'), ('𑈀', '𑈑'), ('𑈓', '𑈴'), ('𑈷', '𑈷'), + ('𑈾', '𑈾'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), + ('𑊏', '𑊝'), ('𑊟', '𑊨'), ('𑊰', '𑋨'), ('𑌀', '𑌃'), + ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), + ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('𑌽', '𑍄'), ('𑍇', '𑍈'), + ('𑍋', '𑍌'), ('𑍐', '𑍐'), ('𑍗', '𑍗'), ('𑍝', '𑍣'), + ('𑐀', '𑑁'), ('𑑃', '𑑅'), ('𑑇', '𑑊'), ('𑒀', '𑓁'), + ('𑓄', '𑓅'), ('𑓇', '𑓇'), ('𑖀', '𑖵'), ('𑖸', '𑖾'), + ('𑗘', '𑗝'), ('𑘀', '𑘾'), ('𑙀', '𑙀'), ('𑙄', '𑙄'), + ('𑚀', '𑚵'), ('𑜀', '𑜙'), ('𑜝', '𑜪'), ('𑢠', '𑣟'), + ('𑣿', '𑣿'), ('𑨀', '𑨲'), ('𑨵', '𑨾'), ('𑩐', '𑪃'), + ('𑪆', '𑪗'), ('𑫀', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '𑰶'), + ('𑰸', '𑰾'), ('𑱀', '𑱀'), ('𑱲', '𑲏'), ('𑲒', '𑲧'), + ('𑲩', '𑲶'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '𑴶'), + ('𑴺', '𑴺'), ('𑴼', '𑴽'), ('𑴿', '𑵁'), ('𑵃', '𑵃'), + ('𑵆', '𑵇'), ('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒒀', '𒕃'), + ('𓀀', '𓐮'), ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), + ('𖫐', '𖫭'), ('𖬀', '𖬶'), ('𖭀', '𖭃'), ('𖭣', '𖭷'), + ('𖭽', '𖮏'), ('𖼀', '𖽄'), ('𖽐', '𖽾'), ('𖾓', '𖾟'), + ('𖿠', '𖿡'), ('𗀀', '𘟬'), ('𘠀', '𘫲'), ('𛀀', '𛄞'), + ('𛅰', '𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), + ('𛲐', '𛲙'), ('𛲞', '𛲞'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), + ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), + ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), + ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), + ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), + ('𝕒', '𝚥'), ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), + ('𝛼', '𝜔'), ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), + ('𝝰', '𝞈'), ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), + ('𞀀', '𞀆'), ('𞀈', 
'𞀘'), ('𞀛', '𞀡'), ('𞀣', '𞀤'), + ('𞀦', '𞀪'), ('𞠀', '𞣄'), ('𞤀', '𞥃'), ('𞥇', '𞥇'), + ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), + ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), + ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), + ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), + ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), + ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), + ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), + ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), + ('𞺫', '𞺻'), ('🄰', '🅉'), ('🅐', '🅩'), ('🅰', '🆉'), + ('𠀀', '𪛖'), ('𪜀', '𫜴'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), + ('𬺰', '𮯠'), ('丽', '𪘀'), +]; + +pub const BIDI_CONTROL: &'static [(char, char)] = &[ + ('\u{61c}', '\u{61c}'), ('\u{200e}', '\u{200f}'), ('\u{202a}', '\u{202e}'), + ('\u{2066}', '\u{2069}'), +]; + +pub const CASE_IGNORABLE: &'static [(char, char)] = &[ + ('\'', '\''), ('.', '.'), (':', ':'), ('^', '^'), ('`', '`'), ('¨', '¨'), + ('\u{ad}', '\u{ad}'), ('¯', '¯'), ('´', '´'), ('·', '¸'), + ('ʰ', 'ͯ'), ('ʹ', '͵'), ('ͺ', 'ͺ'), ('΄', '΅'), ('·', '·'), + ('҃', '҉'), ('ՙ', 'ՙ'), ('֑', 'ֽ'), ('ֿ', 'ֿ'), ('ׁ', 'ׂ'), + ('ׄ', 'ׅ'), ('ׇ', 'ׇ'), ('״', '״'), ('\u{600}', '\u{605}'), + ('ؐ', 'ؚ'), ('\u{61c}', '\u{61c}'), ('ـ', 'ـ'), ('ً', 'ٟ'), + ('ٰ', 'ٰ'), ('ۖ', '\u{6dd}'), ('۟', 'ۨ'), ('۪', 'ۭ'), + ('\u{70f}', '\u{70f}'), ('ܑ', 'ܑ'), ('ܰ', '݊'), ('ަ', 'ް'), + ('߫', 'ߵ'), ('ߺ', 'ߺ'), ('ࠖ', '࠭'), ('࡙', '࡛'), ('ࣔ', 'ं'), + ('ऺ', 'ऺ'), ('़', '़'), ('ु', 'ै'), ('्', '्'), + ('॑', 'ॗ'), ('ॢ', 'ॣ'), ('ॱ', 'ॱ'), ('ঁ', 'ঁ'), + ('়', '়'), ('ু', 'ৄ'), ('্', '্'), ('ৢ', 'ৣ'), + ('ਁ', 'ਂ'), ('਼', '਼'), ('ੁ', 'ੂ'), ('ੇ', 'ੈ'), + ('ੋ', '੍'), ('ੑ', 'ੑ'), ('ੰ', 'ੱ'), ('ੵ', 'ੵ'), + ('ઁ', 'ં'), ('઼', '઼'), ('ુ', 'ૅ'), ('ે', 'ૈ'), + ('્', '્'), ('ૢ', 'ૣ'), ('ૺ', '૿'), ('ଁ', 'ଁ'), + ('଼', '଼'), ('ି', 'ି'), ('ୁ', 'ୄ'), ('୍', '୍'), + ('ୖ', 'ୖ'), ('ୢ', 'ୣ'), ('ஂ', 'ஂ'), ('ீ', 'ீ'), + ('்', '்'), ('ఀ', 'ఀ'), ('ా', 'ీ'), ('ె', 'ై'), + ('ొ', '్'), ('ౕ', 'ౖ'), ('ౢ', 'ౣ'), ('ಁ', 'ಁ'), + ('಼', '಼'), ('ಿ', 'ಿ'), ('ೆ', 'ೆ'), ('ೌ', '್'), + 
('ೢ', 'ೣ'), ('ഀ', 'ഁ'), ('഻', '഼'), ('ു', 'ൄ'), + ('്', '്'), ('ൢ', 'ൣ'), ('්', '්'), ('ි', 'ු'), + ('ූ', 'ූ'), ('ั', 'ั'), ('ิ', 'ฺ'), ('ๆ', '๎'), + ('ັ', 'ັ'), ('ິ', 'ູ'), ('ົ', 'ຼ'), ('ໆ', 'ໆ'), + ('່', 'ໍ'), ('༘', '༙'), ('༵', '༵'), ('༷', '༷'), + ('༹', '༹'), ('ཱ', 'ཾ'), ('ྀ', '྄'), ('྆', '྇'), + ('ྍ', 'ྗ'), ('ྙ', 'ྼ'), ('࿆', '࿆'), ('ိ', 'ူ'), + ('ဲ', '့'), ('္', '်'), ('ွ', 'ှ'), ('ၘ', 'ၙ'), + ('ၞ', 'ၠ'), ('ၱ', 'ၴ'), ('ႂ', 'ႂ'), ('ႅ', 'ႆ'), + ('ႍ', 'ႍ'), ('ႝ', 'ႝ'), ('ჼ', 'ჼ'), ('፝', '፟'), + ('ᜒ', '᜔'), ('ᜲ', '᜴'), ('ᝒ', 'ᝓ'), ('ᝲ', 'ᝳ'), + ('឴', '឵'), ('ិ', 'ួ'), ('ំ', 'ំ'), ('៉', '៓'), + ('ៗ', 'ៗ'), ('៝', '៝'), ('᠋', '\u{180e}'), ('ᡃ', 'ᡃ'), + ('ᢅ', 'ᢆ'), ('ᢩ', 'ᢩ'), ('ᤠ', 'ᤢ'), ('ᤧ', 'ᤨ'), + ('ᤲ', 'ᤲ'), ('᤹', '᤻'), ('ᨗ', 'ᨘ'), ('ᨛ', 'ᨛ'), + ('ᩖ', 'ᩖ'), ('ᩘ', 'ᩞ'), ('᩠', '᩠'), ('ᩢ', 'ᩢ'), + ('ᩥ', 'ᩬ'), ('ᩳ', '᩼'), ('᩿', '᩿'), ('ᪧ', 'ᪧ'), + ('᪰', '᪾'), ('ᬀ', 'ᬃ'), ('᬴', '᬴'), ('ᬶ', 'ᬺ'), + ('ᬼ', 'ᬼ'), ('ᭂ', 'ᭂ'), ('᭫', '᭳'), ('ᮀ', 'ᮁ'), + ('ᮢ', 'ᮥ'), ('ᮨ', 'ᮩ'), ('᮫', 'ᮭ'), ('᯦', '᯦'), + ('ᯨ', 'ᯩ'), ('ᯭ', 'ᯭ'), ('ᯯ', 'ᯱ'), ('ᰬ', 'ᰳ'), + ('ᰶ', '᰷'), ('ᱸ', 'ᱽ'), ('᳐', '᳒'), ('᳔', '᳠'), + ('᳢', '᳨'), ('᳭', '᳭'), ('᳴', '᳴'), ('᳸', '᳹'), + ('ᴬ', 'ᵪ'), ('ᵸ', 'ᵸ'), ('ᶛ', '᷹'), ('᷻', '᷿'), + ('᾽', '᾽'), ('᾿', '῁'), ('῍', '῏'), ('῝', '῟'), + ('῭', '`'), ('´', '῾'), ('\u{200b}', '\u{200f}'), ('‘', '’'), + ('․', '․'), ('‧', '‧'), ('\u{202a}', '\u{202e}'), + ('\u{2060}', '\u{2064}'), ('\u{2066}', '\u{206f}'), ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('⃐', '⃰'), ('ⱼ', 'ⱽ'), + ('⳯', '⳱'), ('ⵯ', 'ⵯ'), ('⵿', '⵿'), ('ⷠ', 'ⷿ'), + ('ⸯ', 'ⸯ'), ('々', '々'), ('〪', '〭'), ('〱', '〵'), + ('〻', '〻'), ('゙', 'ゞ'), ('ー', 'ヾ'), ('ꀕ', 'ꀕ'), + ('ꓸ', 'ꓽ'), ('ꘌ', 'ꘌ'), ('꙯', '꙲'), ('ꙴ', '꙽'), + ('ꙿ', 'ꙿ'), ('ꚜ', 'ꚟ'), ('꛰', '꛱'), ('꜀', '꜡'), + ('ꝰ', 'ꝰ'), ('ꞈ', '꞊'), ('ꟸ', 'ꟹ'), ('ꠂ', 'ꠂ'), + ('꠆', '꠆'), ('ꠋ', 'ꠋ'), ('ꠥ', 'ꠦ'), ('꣄', 'ꣅ'), + ('꣠', '꣱'), ('ꤦ', '꤭'), ('ꥇ', 'ꥑ'), ('ꦀ', 'ꦂ'), + ('꦳', '꦳'), ('ꦶ', 'ꦹ'), ('ꦼ', 'ꦼ'), ('ꧏ', 'ꧏ'), + ('ꧥ', 'ꧦ'), ('ꨩ', 'ꨮ'), ('ꨱ', 'ꨲ'), ('ꨵ', 'ꨶ'), + ('ꩃ', 
'ꩃ'), ('ꩌ', 'ꩌ'), ('ꩰ', 'ꩰ'), ('ꩼ', 'ꩼ'), + ('ꪰ', 'ꪰ'), ('ꪲ', 'ꪴ'), ('ꪷ', 'ꪸ'), ('ꪾ', '꪿'), + ('꫁', '꫁'), ('ꫝ', 'ꫝ'), ('ꫬ', 'ꫭ'), ('ꫳ', 'ꫴ'), + ('꫶', '꫶'), ('꭛', 'ꭟ'), ('ꯥ', 'ꯥ'), ('ꯨ', 'ꯨ'), + ('꯭', '꯭'), ('ﬞ', 'ﬞ'), ('﮲', '﯁'), ('︀', '️'), + ('︓', '︓'), ('︠', '︯'), ('﹒', '﹒'), ('﹕', '﹕'), + ('\u{feff}', '\u{feff}'), (''', '''), ('.', '.'), (':', ':'), + ('^', '^'), ('`', '`'), ('ー', 'ー'), ('゙', '゚'), + (' ̄', ' ̄'), ('\u{fff9}', '\u{fffb}'), ('𐇽', '𐇽'), + ('𐋠', '𐋠'), ('𐍶', '𐍺'), ('𐨁', '𐨃'), ('𐨅', '𐨆'), + ('𐨌', '𐨏'), ('𐨸', '𐨺'), ('𐨿', '𐨿'), ('𐫥', '𐫦'), + ('𑀁', '𑀁'), ('𑀸', '𑁆'), ('𑁿', '𑂁'), ('𑂳', '𑂶'), + ('𑂹', '𑂺'), ('\u{110bd}', '\u{110bd}'), ('𑄀', '𑄂'), + ('𑄧', '𑄫'), ('𑄭', '𑄴'), ('𑅳', '𑅳'), ('𑆀', '𑆁'), + ('𑆶', '𑆾'), ('𑇊', '𑇌'), ('𑈯', '𑈱'), ('𑈴', '𑈴'), + ('𑈶', '𑈷'), ('𑈾', '𑈾'), ('𑋟', '𑋟'), ('𑋣', '𑋪'), + ('𑌀', '𑌁'), ('𑌼', '𑌼'), ('𑍀', '𑍀'), ('𑍦', '𑍬'), + ('𑍰', '𑍴'), ('𑐸', '𑐿'), ('𑑂', '𑑄'), ('𑑆', '𑑆'), + ('𑒳', '𑒸'), ('𑒺', '𑒺'), ('𑒿', '𑓀'), ('𑓂', '𑓃'), + ('𑖲', '𑖵'), ('𑖼', '𑖽'), ('𑖿', '𑗀'), ('𑗜', '𑗝'), + ('𑘳', '𑘺'), ('𑘽', '𑘽'), ('𑘿', '𑙀'), ('𑚫', '𑚫'), + ('𑚭', '𑚭'), ('𑚰', '𑚵'), ('𑚷', '𑚷'), ('𑜝', '𑜟'), + ('𑜢', '𑜥'), ('𑜧', '𑜫'), ('𑨁', '𑨆'), ('𑨉', '𑨊'), + ('𑨳', '𑨸'), ('𑨻', '𑨾'), ('𑩇', '𑩇'), ('𑩑', '𑩖'), + ('𑩙', '𑩛'), ('𑪊', '𑪖'), ('𑪘', '𑪙'), ('𑰰', '𑰶'), + ('𑰸', '𑰽'), ('𑰿', '𑰿'), ('𑲒', '𑲧'), ('𑲪', '𑲰'), + ('𑲲', '𑲳'), ('𑲵', '𑲶'), ('𑴱', '𑴶'), ('𑴺', '𑴺'), + ('𑴼', '𑴽'), ('𑴿', '𑵅'), ('𑵇', '𑵇'), ('𖫰', '𖫴'), + ('𖬰', '𖬶'), ('𖭀', '𖭃'), ('𖾏', '𖾟'), ('𖿠', '𖿡'), + ('𛲝', '𛲞'), ('\u{1bca0}', '\u{1bca3}'), ('𝅧', '𝅩'), + ('\u{1d173}', '𝆂'), ('𝆅', '𝆋'), ('𝆪', '𝆭'), ('𝉂', '𝉄'), + ('𝨀', '𝨶'), ('𝨻', '𝩬'), ('𝩵', '𝩵'), ('𝪄', '𝪄'), + ('𝪛', '𝪟'), ('𝪡', '𝪯'), ('𞀀', '𞀆'), ('𞀈', '𞀘'), + ('𞀛', '𞀡'), ('𞀣', '𞀤'), ('𞀦', '𞀪'), ('𞣐', '𞣖'), + ('𞥄', '𞥊'), ('🏻', '🏿'), ('\u{e0001}', '\u{e0001}'), + ('\u{e0020}', '\u{e007f}'), ('󠄀', '󠇯'), +]; + +pub const CASED: &'static [(char, char)] = &[ + ('A', 'Z'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('º', 'º'), + ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ƺ'), ('Ƽ', 'ƿ'), ('DŽ', 'ʓ'), 
+ ('ʕ', 'ʸ'), ('ˀ', 'ˁ'), ('ˠ', 'ˤ'), ('ͅ', 'ͅ'), ('Ͱ', 'ͳ'), + ('Ͷ', 'ͷ'), ('ͺ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), + ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), ('ա', 'և'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), + ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('ᴀ', 'ᶿ'), + ('Ḁ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), ('ι', 'ι'), + ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), + ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℭ'), ('ℯ', 'ℴ'), + ('ℹ', 'ℹ'), ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ⅿ'), ('Ↄ', 'ↄ'), ('Ⓐ', 'ⓩ'), ('Ⰰ', 'Ⱞ'), + ('ⰰ', 'ⱞ'), ('Ⱡ', 'ⳤ'), ('Ⳬ', 'ⳮ'), ('Ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('Ꙁ', 'ꙭ'), + ('Ꚁ', 'ꚝ'), ('Ꜣ', 'ꞇ'), ('Ꞌ', 'ꞎ'), ('Ꞑ', 'Ɪ'), + ('Ʞ', 'ꞷ'), ('ꟸ', 'ꟺ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭥ'), + ('ꭰ', 'ꮿ'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('A', 'Z'), + ('a', 'z'), ('𐐀', '𐑏'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), + ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𑢠', '𑣟'), ('𝐀', '𝑔'), + ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), + ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), + ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), + ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), + ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝛀'), ('𝛂', '𝛚'), + ('𝛜', '𝛺'), ('𝛼', '𝜔'), ('𝜖', '𝜴'), ('𝜶', '𝝎'), + ('𝝐', '𝝮'), ('𝝰', '𝞈'), ('𝞊', '𝞨'), ('𝞪', '𝟂'), + ('𝟄', '𝟋'), ('𞤀', '𞥃'), ('🄰', '🅉'), ('🅐', '🅩'), + ('🅰', '🆉'), +]; + +pub const CHANGES_WHEN_CASEFOLDED: &'static [(char, char)] = &[ + ('A', 'Z'), ('µ', 'µ'), ('À', 'Ö'), ('Ø', 'ß'), ('Ā', 'Ā'), + ('Ă', 'Ă'), ('Ą', 'Ą'), ('Ć', 'Ć'), ('Ĉ', 'Ĉ'), ('Ċ', 'Ċ'), + ('Č', 'Č'), ('Ď', 'Ď'), ('Đ', 'Đ'), ('Ē', 'Ē'), ('Ĕ', 'Ĕ'), + ('Ė', 'Ė'), ('Ę', 'Ę'), ('Ě', 'Ě'), ('Ĝ', 'Ĝ'), ('Ğ', 'Ğ'), + ('Ġ', 'Ġ'), ('Ģ', 'Ģ'), ('Ĥ', 'Ĥ'), ('Ħ', 'Ħ'), ('Ĩ', 'Ĩ'), + ('Ī', 'Ī'), ('Ĭ', 'Ĭ'), ('Į', 'Į'), ('İ', 'İ'), 
('IJ', 'IJ'), + ('Ĵ', 'Ĵ'), ('Ķ', 'Ķ'), ('Ĺ', 'Ĺ'), ('Ļ', 'Ļ'), ('Ľ', 'Ľ'), + ('Ŀ', 'Ŀ'), ('Ł', 'Ł'), ('Ń', 'Ń'), ('Ņ', 'Ņ'), ('Ň', 'Ň'), + ('ʼn', 'Ŋ'), ('Ō', 'Ō'), ('Ŏ', 'Ŏ'), ('Ő', 'Ő'), ('Œ', 'Œ'), + ('Ŕ', 'Ŕ'), ('Ŗ', 'Ŗ'), ('Ř', 'Ř'), ('Ś', 'Ś'), ('Ŝ', 'Ŝ'), + ('Ş', 'Ş'), ('Š', 'Š'), ('Ţ', 'Ţ'), ('Ť', 'Ť'), ('Ŧ', 'Ŧ'), + ('Ũ', 'Ũ'), ('Ū', 'Ū'), ('Ŭ', 'Ŭ'), ('Ů', 'Ů'), ('Ű', 'Ű'), + ('Ų', 'Ų'), ('Ŵ', 'Ŵ'), ('Ŷ', 'Ŷ'), ('Ÿ', 'Ź'), ('Ż', 'Ż'), + ('Ž', 'Ž'), ('ſ', 'ſ'), ('Ɓ', 'Ƃ'), ('Ƅ', 'Ƅ'), ('Ɔ', 'Ƈ'), + ('Ɖ', 'Ƌ'), ('Ǝ', 'Ƒ'), ('Ɠ', 'Ɣ'), ('Ɩ', 'Ƙ'), ('Ɯ', 'Ɲ'), + ('Ɵ', 'Ơ'), ('Ƣ', 'Ƣ'), ('Ƥ', 'Ƥ'), ('Ʀ', 'Ƨ'), ('Ʃ', 'Ʃ'), + ('Ƭ', 'Ƭ'), ('Ʈ', 'Ư'), ('Ʊ', 'Ƴ'), ('Ƶ', 'Ƶ'), ('Ʒ', 'Ƹ'), + ('Ƽ', 'Ƽ'), ('DŽ', 'Dž'), ('LJ', 'Lj'), ('NJ', 'Nj'), ('Ǎ', 'Ǎ'), + ('Ǐ', 'Ǐ'), ('Ǒ', 'Ǒ'), ('Ǔ', 'Ǔ'), ('Ǖ', 'Ǖ'), ('Ǘ', 'Ǘ'), + ('Ǚ', 'Ǚ'), ('Ǜ', 'Ǜ'), ('Ǟ', 'Ǟ'), ('Ǡ', 'Ǡ'), ('Ǣ', 'Ǣ'), + ('Ǥ', 'Ǥ'), ('Ǧ', 'Ǧ'), ('Ǩ', 'Ǩ'), ('Ǫ', 'Ǫ'), ('Ǭ', 'Ǭ'), + ('Ǯ', 'Ǯ'), ('DZ', 'Dz'), ('Ǵ', 'Ǵ'), ('Ƕ', 'Ǹ'), ('Ǻ', 'Ǻ'), + ('Ǽ', 'Ǽ'), ('Ǿ', 'Ǿ'), ('Ȁ', 'Ȁ'), ('Ȃ', 'Ȃ'), ('Ȅ', 'Ȅ'), + ('Ȇ', 'Ȇ'), ('Ȉ', 'Ȉ'), ('Ȋ', 'Ȋ'), ('Ȍ', 'Ȍ'), ('Ȏ', 'Ȏ'), + ('Ȑ', 'Ȑ'), ('Ȓ', 'Ȓ'), ('Ȕ', 'Ȕ'), ('Ȗ', 'Ȗ'), ('Ș', 'Ș'), + ('Ț', 'Ț'), ('Ȝ', 'Ȝ'), ('Ȟ', 'Ȟ'), ('Ƞ', 'Ƞ'), ('Ȣ', 'Ȣ'), + ('Ȥ', 'Ȥ'), ('Ȧ', 'Ȧ'), ('Ȩ', 'Ȩ'), ('Ȫ', 'Ȫ'), ('Ȭ', 'Ȭ'), + ('Ȯ', 'Ȯ'), ('Ȱ', 'Ȱ'), ('Ȳ', 'Ȳ'), ('Ⱥ', 'Ȼ'), ('Ƚ', 'Ⱦ'), + ('Ɂ', 'Ɂ'), ('Ƀ', 'Ɇ'), ('Ɉ', 'Ɉ'), ('Ɋ', 'Ɋ'), ('Ɍ', 'Ɍ'), + ('Ɏ', 'Ɏ'), ('ͅ', 'ͅ'), ('Ͱ', 'Ͱ'), ('Ͳ', 'Ͳ'), ('Ͷ', 'Ͷ'), + ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ώ'), + ('Α', 'Ρ'), ('Σ', 'Ϋ'), ('ς', 'ς'), ('Ϗ', 'ϑ'), ('ϕ', 'ϖ'), + ('Ϙ', 'Ϙ'), ('Ϛ', 'Ϛ'), ('Ϝ', 'Ϝ'), ('Ϟ', 'Ϟ'), ('Ϡ', 'Ϡ'), + ('Ϣ', 'Ϣ'), ('Ϥ', 'Ϥ'), ('Ϧ', 'Ϧ'), ('Ϩ', 'Ϩ'), ('Ϫ', 'Ϫ'), + ('Ϭ', 'Ϭ'), ('Ϯ', 'Ϯ'), ('ϰ', 'ϱ'), ('ϴ', 'ϵ'), ('Ϸ', 'Ϸ'), + ('Ϲ', 'Ϻ'), ('Ͻ', 'Я'), ('Ѡ', 'Ѡ'), ('Ѣ', 'Ѣ'), ('Ѥ', 'Ѥ'), + ('Ѧ', 'Ѧ'), ('Ѩ', 'Ѩ'), ('Ѫ', 'Ѫ'), ('Ѭ', 'Ѭ'), ('Ѯ', 'Ѯ'), + ('Ѱ', 'Ѱ'), ('Ѳ', 'Ѳ'), ('Ѵ', 'Ѵ'), ('Ѷ', 'Ѷ'), 
('Ѹ', 'Ѹ'), + ('Ѻ', 'Ѻ'), ('Ѽ', 'Ѽ'), ('Ѿ', 'Ѿ'), ('Ҁ', 'Ҁ'), ('Ҋ', 'Ҋ'), + ('Ҍ', 'Ҍ'), ('Ҏ', 'Ҏ'), ('Ґ', 'Ґ'), ('Ғ', 'Ғ'), ('Ҕ', 'Ҕ'), + ('Җ', 'Җ'), ('Ҙ', 'Ҙ'), ('Қ', 'Қ'), ('Ҝ', 'Ҝ'), ('Ҟ', 'Ҟ'), + ('Ҡ', 'Ҡ'), ('Ң', 'Ң'), ('Ҥ', 'Ҥ'), ('Ҧ', 'Ҧ'), ('Ҩ', 'Ҩ'), + ('Ҫ', 'Ҫ'), ('Ҭ', 'Ҭ'), ('Ү', 'Ү'), ('Ұ', 'Ұ'), ('Ҳ', 'Ҳ'), + ('Ҵ', 'Ҵ'), ('Ҷ', 'Ҷ'), ('Ҹ', 'Ҹ'), ('Һ', 'Һ'), ('Ҽ', 'Ҽ'), + ('Ҿ', 'Ҿ'), ('Ӏ', 'Ӂ'), ('Ӄ', 'Ӄ'), ('Ӆ', 'Ӆ'), ('Ӈ', 'Ӈ'), + ('Ӊ', 'Ӊ'), ('Ӌ', 'Ӌ'), ('Ӎ', 'Ӎ'), ('Ӑ', 'Ӑ'), ('Ӓ', 'Ӓ'), + ('Ӕ', 'Ӕ'), ('Ӗ', 'Ӗ'), ('Ә', 'Ә'), ('Ӛ', 'Ӛ'), ('Ӝ', 'Ӝ'), + ('Ӟ', 'Ӟ'), ('Ӡ', 'Ӡ'), ('Ӣ', 'Ӣ'), ('Ӥ', 'Ӥ'), ('Ӧ', 'Ӧ'), + ('Ө', 'Ө'), ('Ӫ', 'Ӫ'), ('Ӭ', 'Ӭ'), ('Ӯ', 'Ӯ'), ('Ӱ', 'Ӱ'), + ('Ӳ', 'Ӳ'), ('Ӵ', 'Ӵ'), ('Ӷ', 'Ӷ'), ('Ӹ', 'Ӹ'), ('Ӻ', 'Ӻ'), + ('Ӽ', 'Ӽ'), ('Ӿ', 'Ӿ'), ('Ԁ', 'Ԁ'), ('Ԃ', 'Ԃ'), ('Ԅ', 'Ԅ'), + ('Ԇ', 'Ԇ'), ('Ԉ', 'Ԉ'), ('Ԋ', 'Ԋ'), ('Ԍ', 'Ԍ'), ('Ԏ', 'Ԏ'), + ('Ԑ', 'Ԑ'), ('Ԓ', 'Ԓ'), ('Ԕ', 'Ԕ'), ('Ԗ', 'Ԗ'), ('Ԙ', 'Ԙ'), + ('Ԛ', 'Ԛ'), ('Ԝ', 'Ԝ'), ('Ԟ', 'Ԟ'), ('Ԡ', 'Ԡ'), ('Ԣ', 'Ԣ'), + ('Ԥ', 'Ԥ'), ('Ԧ', 'Ԧ'), ('Ԩ', 'Ԩ'), ('Ԫ', 'Ԫ'), ('Ԭ', 'Ԭ'), + ('Ԯ', 'Ԯ'), ('Ա', 'Ֆ'), ('և', 'և'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('Ḁ', 'Ḁ'), + ('Ḃ', 'Ḃ'), ('Ḅ', 'Ḅ'), ('Ḇ', 'Ḇ'), ('Ḉ', 'Ḉ'), + ('Ḋ', 'Ḋ'), ('Ḍ', 'Ḍ'), ('Ḏ', 'Ḏ'), ('Ḑ', 'Ḑ'), + ('Ḓ', 'Ḓ'), ('Ḕ', 'Ḕ'), ('Ḗ', 'Ḗ'), ('Ḙ', 'Ḙ'), + ('Ḛ', 'Ḛ'), ('Ḝ', 'Ḝ'), ('Ḟ', 'Ḟ'), ('Ḡ', 'Ḡ'), + ('Ḣ', 'Ḣ'), ('Ḥ', 'Ḥ'), ('Ḧ', 'Ḧ'), ('Ḩ', 'Ḩ'), + ('Ḫ', 'Ḫ'), ('Ḭ', 'Ḭ'), ('Ḯ', 'Ḯ'), ('Ḱ', 'Ḱ'), + ('Ḳ', 'Ḳ'), ('Ḵ', 'Ḵ'), ('Ḷ', 'Ḷ'), ('Ḹ', 'Ḹ'), + ('Ḻ', 'Ḻ'), ('Ḽ', 'Ḽ'), ('Ḿ', 'Ḿ'), ('Ṁ', 'Ṁ'), + ('Ṃ', 'Ṃ'), ('Ṅ', 'Ṅ'), ('Ṇ', 'Ṇ'), ('Ṉ', 'Ṉ'), + ('Ṋ', 'Ṋ'), ('Ṍ', 'Ṍ'), ('Ṏ', 'Ṏ'), ('Ṑ', 'Ṑ'), + ('Ṓ', 'Ṓ'), ('Ṕ', 'Ṕ'), ('Ṗ', 'Ṗ'), ('Ṙ', 'Ṙ'), + ('Ṛ', 'Ṛ'), ('Ṝ', 'Ṝ'), ('Ṟ', 'Ṟ'), ('Ṡ', 'Ṡ'), + ('Ṣ', 'Ṣ'), ('Ṥ', 'Ṥ'), ('Ṧ', 'Ṧ'), ('Ṩ', 'Ṩ'), + ('Ṫ', 'Ṫ'), ('Ṭ', 'Ṭ'), ('Ṯ', 'Ṯ'), ('Ṱ', 'Ṱ'), + ('Ṳ', 'Ṳ'), ('Ṵ', 'Ṵ'), ('Ṷ', 'Ṷ'), ('Ṹ', 'Ṹ'), + ('Ṻ', 'Ṻ'), ('Ṽ', 'Ṽ'), ('Ṿ', 'Ṿ'), ('Ẁ', 'Ẁ'), + ('Ẃ', 'Ẃ'), ('Ẅ', 
'Ẅ'), ('Ẇ', 'Ẇ'), ('Ẉ', 'Ẉ'), + ('Ẋ', 'Ẋ'), ('Ẍ', 'Ẍ'), ('Ẏ', 'Ẏ'), ('Ẑ', 'Ẑ'), + ('Ẓ', 'Ẓ'), ('Ẕ', 'Ẕ'), ('ẚ', 'ẛ'), ('ẞ', 'ẞ'), + ('Ạ', 'Ạ'), ('Ả', 'Ả'), ('Ấ', 'Ấ'), ('Ầ', 'Ầ'), + ('Ẩ', 'Ẩ'), ('Ẫ', 'Ẫ'), ('Ậ', 'Ậ'), ('Ắ', 'Ắ'), + ('Ằ', 'Ằ'), ('Ẳ', 'Ẳ'), ('Ẵ', 'Ẵ'), ('Ặ', 'Ặ'), + ('Ẹ', 'Ẹ'), ('Ẻ', 'Ẻ'), ('Ẽ', 'Ẽ'), ('Ế', 'Ế'), + ('Ề', 'Ề'), ('Ể', 'Ể'), ('Ễ', 'Ễ'), ('Ệ', 'Ệ'), + ('Ỉ', 'Ỉ'), ('Ị', 'Ị'), ('Ọ', 'Ọ'), ('Ỏ', 'Ỏ'), + ('Ố', 'Ố'), ('Ồ', 'Ồ'), ('Ổ', 'Ổ'), ('Ỗ', 'Ỗ'), + ('Ộ', 'Ộ'), ('Ớ', 'Ớ'), ('Ờ', 'Ờ'), ('Ở', 'Ở'), + ('Ỡ', 'Ỡ'), ('Ợ', 'Ợ'), ('Ụ', 'Ụ'), ('Ủ', 'Ủ'), + ('Ứ', 'Ứ'), ('Ừ', 'Ừ'), ('Ử', 'Ử'), ('Ữ', 'Ữ'), + ('Ự', 'Ự'), ('Ỳ', 'Ỳ'), ('Ỵ', 'Ỵ'), ('Ỷ', 'Ỷ'), + ('Ỹ', 'Ỹ'), ('Ỻ', 'Ỻ'), ('Ỽ', 'Ỽ'), ('Ỿ', 'Ỿ'), + ('Ἀ', 'Ἇ'), ('Ἐ', 'Ἕ'), ('Ἠ', 'Ἧ'), ('Ἰ', 'Ἷ'), + ('Ὀ', 'Ὅ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), + ('Ὗ', 'Ὗ'), ('Ὠ', 'Ὧ'), ('ᾀ', 'ᾯ'), ('ᾲ', 'ᾴ'), + ('ᾷ', 'ᾼ'), ('ῂ', 'ῄ'), ('ῇ', 'ῌ'), ('Ῐ', 'Ί'), + ('Ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῷ', 'ῼ'), ('Ω', 'Ω'), + ('K', 'Å'), ('Ⅎ', 'Ⅎ'), ('Ⅰ', 'Ⅿ'), ('Ↄ', 'Ↄ'), + ('Ⓐ', 'Ⓩ'), ('Ⰰ', 'Ⱞ'), ('Ⱡ', 'Ⱡ'), ('Ɫ', 'Ɽ'), + ('Ⱨ', 'Ⱨ'), ('Ⱪ', 'Ⱪ'), ('Ⱬ', 'Ⱬ'), ('Ɑ', 'Ɒ'), + ('Ⱳ', 'Ⱳ'), ('Ⱶ', 'Ⱶ'), ('Ȿ', 'Ⲁ'), ('Ⲃ', 'Ⲃ'), + ('Ⲅ', 'Ⲅ'), ('Ⲇ', 'Ⲇ'), ('Ⲉ', 'Ⲉ'), ('Ⲋ', 'Ⲋ'), + ('Ⲍ', 'Ⲍ'), ('Ⲏ', 'Ⲏ'), ('Ⲑ', 'Ⲑ'), ('Ⲓ', 'Ⲓ'), + ('Ⲕ', 'Ⲕ'), ('Ⲗ', 'Ⲗ'), ('Ⲙ', 'Ⲙ'), ('Ⲛ', 'Ⲛ'), + ('Ⲝ', 'Ⲝ'), ('Ⲟ', 'Ⲟ'), ('Ⲡ', 'Ⲡ'), ('Ⲣ', 'Ⲣ'), + ('Ⲥ', 'Ⲥ'), ('Ⲧ', 'Ⲧ'), ('Ⲩ', 'Ⲩ'), ('Ⲫ', 'Ⲫ'), + ('Ⲭ', 'Ⲭ'), ('Ⲯ', 'Ⲯ'), ('Ⲱ', 'Ⲱ'), ('Ⲳ', 'Ⲳ'), + ('Ⲵ', 'Ⲵ'), ('Ⲷ', 'Ⲷ'), ('Ⲹ', 'Ⲹ'), ('Ⲻ', 'Ⲻ'), + ('Ⲽ', 'Ⲽ'), ('Ⲿ', 'Ⲿ'), ('Ⳁ', 'Ⳁ'), ('Ⳃ', 'Ⳃ'), + ('Ⳅ', 'Ⳅ'), ('Ⳇ', 'Ⳇ'), ('Ⳉ', 'Ⳉ'), ('Ⳋ', 'Ⳋ'), + ('Ⳍ', 'Ⳍ'), ('Ⳏ', 'Ⳏ'), ('Ⳑ', 'Ⳑ'), ('Ⳓ', 'Ⳓ'), + ('Ⳕ', 'Ⳕ'), ('Ⳗ', 'Ⳗ'), ('Ⳙ', 'Ⳙ'), ('Ⳛ', 'Ⳛ'), + ('Ⳝ', 'Ⳝ'), ('Ⳟ', 'Ⳟ'), ('Ⳡ', 'Ⳡ'), ('Ⳣ', 'Ⳣ'), + ('Ⳬ', 'Ⳬ'), ('Ⳮ', 'Ⳮ'), ('Ⳳ', 'Ⳳ'), ('Ꙁ', 'Ꙁ'), + ('Ꙃ', 'Ꙃ'), ('Ꙅ', 'Ꙅ'), ('Ꙇ', 'Ꙇ'), ('Ꙉ', 'Ꙉ'), + ('Ꙋ', 'Ꙋ'), ('Ꙍ', 'Ꙍ'), ('Ꙏ', 'Ꙏ'), ('Ꙑ', 'Ꙑ'), + ('Ꙓ', 'Ꙓ'), ('Ꙕ', 'Ꙕ'), ('Ꙗ', 'Ꙗ'), ('Ꙙ', 'Ꙙ'), + ('Ꙛ', 'Ꙛ'), ('Ꙝ', 
'Ꙝ'), ('Ꙟ', 'Ꙟ'), ('Ꙡ', 'Ꙡ'), + ('Ꙣ', 'Ꙣ'), ('Ꙥ', 'Ꙥ'), ('Ꙧ', 'Ꙧ'), ('Ꙩ', 'Ꙩ'), + ('Ꙫ', 'Ꙫ'), ('Ꙭ', 'Ꙭ'), ('Ꚁ', 'Ꚁ'), ('Ꚃ', 'Ꚃ'), + ('Ꚅ', 'Ꚅ'), ('Ꚇ', 'Ꚇ'), ('Ꚉ', 'Ꚉ'), ('Ꚋ', 'Ꚋ'), + ('Ꚍ', 'Ꚍ'), ('Ꚏ', 'Ꚏ'), ('Ꚑ', 'Ꚑ'), ('Ꚓ', 'Ꚓ'), + ('Ꚕ', 'Ꚕ'), ('Ꚗ', 'Ꚗ'), ('Ꚙ', 'Ꚙ'), ('Ꚛ', 'Ꚛ'), + ('Ꜣ', 'Ꜣ'), ('Ꜥ', 'Ꜥ'), ('Ꜧ', 'Ꜧ'), ('Ꜩ', 'Ꜩ'), + ('Ꜫ', 'Ꜫ'), ('Ꜭ', 'Ꜭ'), ('Ꜯ', 'Ꜯ'), ('Ꜳ', 'Ꜳ'), + ('Ꜵ', 'Ꜵ'), ('Ꜷ', 'Ꜷ'), ('Ꜹ', 'Ꜹ'), ('Ꜻ', 'Ꜻ'), + ('Ꜽ', 'Ꜽ'), ('Ꜿ', 'Ꜿ'), ('Ꝁ', 'Ꝁ'), ('Ꝃ', 'Ꝃ'), + ('Ꝅ', 'Ꝅ'), ('Ꝇ', 'Ꝇ'), ('Ꝉ', 'Ꝉ'), ('Ꝋ', 'Ꝋ'), + ('Ꝍ', 'Ꝍ'), ('Ꝏ', 'Ꝏ'), ('Ꝑ', 'Ꝑ'), ('Ꝓ', 'Ꝓ'), + ('Ꝕ', 'Ꝕ'), ('Ꝗ', 'Ꝗ'), ('Ꝙ', 'Ꝙ'), ('Ꝛ', 'Ꝛ'), + ('Ꝝ', 'Ꝝ'), ('Ꝟ', 'Ꝟ'), ('Ꝡ', 'Ꝡ'), ('Ꝣ', 'Ꝣ'), + ('Ꝥ', 'Ꝥ'), ('Ꝧ', 'Ꝧ'), ('Ꝩ', 'Ꝩ'), ('Ꝫ', 'Ꝫ'), + ('Ꝭ', 'Ꝭ'), ('Ꝯ', 'Ꝯ'), ('Ꝺ', 'Ꝺ'), ('Ꝼ', 'Ꝼ'), + ('Ᵹ', 'Ꝿ'), ('Ꞁ', 'Ꞁ'), ('Ꞃ', 'Ꞃ'), ('Ꞅ', 'Ꞅ'), + ('Ꞇ', 'Ꞇ'), ('Ꞌ', 'Ꞌ'), ('Ɥ', 'Ɥ'), ('Ꞑ', 'Ꞑ'), + ('Ꞓ', 'Ꞓ'), ('Ꞗ', 'Ꞗ'), ('Ꞙ', 'Ꞙ'), ('Ꞛ', 'Ꞛ'), + ('Ꞝ', 'Ꞝ'), ('Ꞟ', 'Ꞟ'), ('Ꞡ', 'Ꞡ'), ('Ꞣ', 'Ꞣ'), + ('Ꞥ', 'Ꞥ'), ('Ꞧ', 'Ꞧ'), ('Ꞩ', 'Ꞩ'), ('Ɦ', 'Ɪ'), + ('Ʞ', 'Ꞵ'), ('Ꞷ', 'Ꞷ'), ('ꭰ', 'ꮿ'), ('ff', 'st'), + ('ﬓ', 'ﬗ'), ('A', 'Z'), ('𐐀', '𐐧'), ('𐒰', '𐓓'), + ('𐲀', '𐲲'), ('𑢠', '𑢿'), ('𞤀', '𞤡'), +]; + +pub const CHANGES_WHEN_CASEMAPPED: &'static [(char, char)] = &[ + ('A', 'Z'), ('a', 'z'), ('µ', 'µ'), ('À', 'Ö'), ('Ø', 'ö'), + ('ø', 'ķ'), ('Ĺ', 'ƌ'), ('Ǝ', 'ƚ'), ('Ɯ', 'Ʃ'), ('Ƭ', 'ƹ'), + ('Ƽ', 'ƽ'), ('ƿ', 'ƿ'), ('DŽ', 'Ƞ'), ('Ȣ', 'ȳ'), ('Ⱥ', 'ɔ'), + ('ɖ', 'ɗ'), ('ə', 'ə'), ('ɛ', 'ɜ'), ('ɠ', 'ɡ'), ('ɣ', 'ɣ'), + ('ɥ', 'ɦ'), ('ɨ', 'ɬ'), ('ɯ', 'ɯ'), ('ɱ', 'ɲ'), ('ɵ', 'ɵ'), + ('ɽ', 'ɽ'), ('ʀ', 'ʀ'), ('ʃ', 'ʃ'), ('ʇ', 'ʌ'), ('ʒ', 'ʒ'), + ('ʝ', 'ʞ'), ('ͅ', 'ͅ'), ('Ͱ', 'ͳ'), ('Ͷ', 'ͷ'), ('ͻ', 'ͽ'), + ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), + ('Σ', 'ϑ'), ('ϕ', 'ϵ'), ('Ϸ', 'ϻ'), ('Ͻ', 'ҁ'), ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), ('ա', 'և'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), + ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('ᵹ', 'ᵹ'), + ('ᵽ', 'ᵽ'), ('Ḁ', 'ẛ'), ('ẞ', 'ẞ'), ('Ạ', 'ἕ'), + ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), 
('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), ('ι', 'ι'), ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), ('Ω', 'Ω'), ('K', 'Å'), + ('Ⅎ', 'Ⅎ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ⅿ'), ('Ↄ', 'ↄ'), + ('Ⓐ', 'ⓩ'), ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), ('Ⱡ', 'Ɒ'), + ('Ⱳ', 'ⱳ'), ('Ⱶ', 'ⱶ'), ('Ȿ', 'ⳣ'), ('Ⳬ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), + ('Ꙁ', 'ꙭ'), ('Ꚁ', 'ꚛ'), ('Ꜣ', 'ꜯ'), ('Ꜳ', 'ꝯ'), + ('Ꝺ', 'ꞇ'), ('Ꞌ', 'Ɥ'), ('Ꞑ', 'ꞓ'), ('Ꞗ', 'Ɪ'), + ('Ʞ', 'ꞷ'), ('ꭓ', 'ꭓ'), ('ꭰ', 'ꮿ'), ('ff', 'st'), + ('ﬓ', 'ﬗ'), ('A', 'Z'), ('a', 'z'), ('𐐀', '𐑏'), + ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), + ('𑢠', '𑣟'), ('𞤀', '𞥃'), +]; + +pub const CHANGES_WHEN_LOWERCASED: &'static [(char, char)] = &[ + ('A', 'Z'), ('À', 'Ö'), ('Ø', 'Þ'), ('Ā', 'Ā'), ('Ă', 'Ă'), + ('Ą', 'Ą'), ('Ć', 'Ć'), ('Ĉ', 'Ĉ'), ('Ċ', 'Ċ'), ('Č', 'Č'), + ('Ď', 'Ď'), ('Đ', 'Đ'), ('Ē', 'Ē'), ('Ĕ', 'Ĕ'), ('Ė', 'Ė'), + ('Ę', 'Ę'), ('Ě', 'Ě'), ('Ĝ', 'Ĝ'), ('Ğ', 'Ğ'), ('Ġ', 'Ġ'), + ('Ģ', 'Ģ'), ('Ĥ', 'Ĥ'), ('Ħ', 'Ħ'), ('Ĩ', 'Ĩ'), ('Ī', 'Ī'), + ('Ĭ', 'Ĭ'), ('Į', 'Į'), ('İ', 'İ'), ('IJ', 'IJ'), ('Ĵ', 'Ĵ'), + ('Ķ', 'Ķ'), ('Ĺ', 'Ĺ'), ('Ļ', 'Ļ'), ('Ľ', 'Ľ'), ('Ŀ', 'Ŀ'), + ('Ł', 'Ł'), ('Ń', 'Ń'), ('Ņ', 'Ņ'), ('Ň', 'Ň'), ('Ŋ', 'Ŋ'), + ('Ō', 'Ō'), ('Ŏ', 'Ŏ'), ('Ő', 'Ő'), ('Œ', 'Œ'), ('Ŕ', 'Ŕ'), + ('Ŗ', 'Ŗ'), ('Ř', 'Ř'), ('Ś', 'Ś'), ('Ŝ', 'Ŝ'), ('Ş', 'Ş'), + ('Š', 'Š'), ('Ţ', 'Ţ'), ('Ť', 'Ť'), ('Ŧ', 'Ŧ'), ('Ũ', 'Ũ'), + ('Ū', 'Ū'), ('Ŭ', 'Ŭ'), ('Ů', 'Ů'), ('Ű', 'Ű'), ('Ų', 'Ų'), + ('Ŵ', 'Ŵ'), ('Ŷ', 'Ŷ'), ('Ÿ', 'Ź'), ('Ż', 'Ż'), ('Ž', 'Ž'), + ('Ɓ', 'Ƃ'), ('Ƅ', 'Ƅ'), ('Ɔ', 'Ƈ'), ('Ɖ', 'Ƌ'), ('Ǝ', 'Ƒ'), + ('Ɠ', 'Ɣ'), ('Ɩ', 'Ƙ'), ('Ɯ', 'Ɲ'), ('Ɵ', 'Ơ'), ('Ƣ', 'Ƣ'), + ('Ƥ', 'Ƥ'), ('Ʀ', 'Ƨ'), ('Ʃ', 'Ʃ'), ('Ƭ', 'Ƭ'), ('Ʈ', 'Ư'), + ('Ʊ', 'Ƴ'), ('Ƶ', 'Ƶ'), ('Ʒ', 'Ƹ'), ('Ƽ', 'Ƽ'), ('DŽ', 'Dž'), + ('LJ', 'Lj'), ('NJ', 'Nj'), ('Ǎ', 'Ǎ'), ('Ǐ', 'Ǐ'), ('Ǒ', 'Ǒ'), + ('Ǔ', 'Ǔ'), ('Ǖ', 'Ǖ'), ('Ǘ', 'Ǘ'), ('Ǚ', 'Ǚ'), ('Ǜ', 'Ǜ'), + ('Ǟ', 'Ǟ'), ('Ǡ', 'Ǡ'), ('Ǣ', 'Ǣ'), ('Ǥ', 'Ǥ'), 
('Ǧ', 'Ǧ'), + ('Ǩ', 'Ǩ'), ('Ǫ', 'Ǫ'), ('Ǭ', 'Ǭ'), ('Ǯ', 'Ǯ'), ('DZ', 'Dz'), + ('Ǵ', 'Ǵ'), ('Ƕ', 'Ǹ'), ('Ǻ', 'Ǻ'), ('Ǽ', 'Ǽ'), ('Ǿ', 'Ǿ'), + ('Ȁ', 'Ȁ'), ('Ȃ', 'Ȃ'), ('Ȅ', 'Ȅ'), ('Ȇ', 'Ȇ'), ('Ȉ', 'Ȉ'), + ('Ȋ', 'Ȋ'), ('Ȍ', 'Ȍ'), ('Ȏ', 'Ȏ'), ('Ȑ', 'Ȑ'), ('Ȓ', 'Ȓ'), + ('Ȕ', 'Ȕ'), ('Ȗ', 'Ȗ'), ('Ș', 'Ș'), ('Ț', 'Ț'), ('Ȝ', 'Ȝ'), + ('Ȟ', 'Ȟ'), ('Ƞ', 'Ƞ'), ('Ȣ', 'Ȣ'), ('Ȥ', 'Ȥ'), ('Ȧ', 'Ȧ'), + ('Ȩ', 'Ȩ'), ('Ȫ', 'Ȫ'), ('Ȭ', 'Ȭ'), ('Ȯ', 'Ȯ'), ('Ȱ', 'Ȱ'), + ('Ȳ', 'Ȳ'), ('Ⱥ', 'Ȼ'), ('Ƚ', 'Ⱦ'), ('Ɂ', 'Ɂ'), ('Ƀ', 'Ɇ'), + ('Ɉ', 'Ɉ'), ('Ɋ', 'Ɋ'), ('Ɍ', 'Ɍ'), ('Ɏ', 'Ɏ'), ('Ͱ', 'Ͱ'), + ('Ͳ', 'Ͳ'), ('Ͷ', 'Ͷ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), + ('Ό', 'Ό'), ('Ύ', 'Ώ'), ('Α', 'Ρ'), ('Σ', 'Ϋ'), ('Ϗ', 'Ϗ'), + ('Ϙ', 'Ϙ'), ('Ϛ', 'Ϛ'), ('Ϝ', 'Ϝ'), ('Ϟ', 'Ϟ'), ('Ϡ', 'Ϡ'), + ('Ϣ', 'Ϣ'), ('Ϥ', 'Ϥ'), ('Ϧ', 'Ϧ'), ('Ϩ', 'Ϩ'), ('Ϫ', 'Ϫ'), + ('Ϭ', 'Ϭ'), ('Ϯ', 'Ϯ'), ('ϴ', 'ϴ'), ('Ϸ', 'Ϸ'), ('Ϲ', 'Ϻ'), + ('Ͻ', 'Я'), ('Ѡ', 'Ѡ'), ('Ѣ', 'Ѣ'), ('Ѥ', 'Ѥ'), ('Ѧ', 'Ѧ'), + ('Ѩ', 'Ѩ'), ('Ѫ', 'Ѫ'), ('Ѭ', 'Ѭ'), ('Ѯ', 'Ѯ'), ('Ѱ', 'Ѱ'), + ('Ѳ', 'Ѳ'), ('Ѵ', 'Ѵ'), ('Ѷ', 'Ѷ'), ('Ѹ', 'Ѹ'), ('Ѻ', 'Ѻ'), + ('Ѽ', 'Ѽ'), ('Ѿ', 'Ѿ'), ('Ҁ', 'Ҁ'), ('Ҋ', 'Ҋ'), ('Ҍ', 'Ҍ'), + ('Ҏ', 'Ҏ'), ('Ґ', 'Ґ'), ('Ғ', 'Ғ'), ('Ҕ', 'Ҕ'), ('Җ', 'Җ'), + ('Ҙ', 'Ҙ'), ('Қ', 'Қ'), ('Ҝ', 'Ҝ'), ('Ҟ', 'Ҟ'), ('Ҡ', 'Ҡ'), + ('Ң', 'Ң'), ('Ҥ', 'Ҥ'), ('Ҧ', 'Ҧ'), ('Ҩ', 'Ҩ'), ('Ҫ', 'Ҫ'), + ('Ҭ', 'Ҭ'), ('Ү', 'Ү'), ('Ұ', 'Ұ'), ('Ҳ', 'Ҳ'), ('Ҵ', 'Ҵ'), + ('Ҷ', 'Ҷ'), ('Ҹ', 'Ҹ'), ('Һ', 'Һ'), ('Ҽ', 'Ҽ'), ('Ҿ', 'Ҿ'), + ('Ӏ', 'Ӂ'), ('Ӄ', 'Ӄ'), ('Ӆ', 'Ӆ'), ('Ӈ', 'Ӈ'), ('Ӊ', 'Ӊ'), + ('Ӌ', 'Ӌ'), ('Ӎ', 'Ӎ'), ('Ӑ', 'Ӑ'), ('Ӓ', 'Ӓ'), ('Ӕ', 'Ӕ'), + ('Ӗ', 'Ӗ'), ('Ә', 'Ә'), ('Ӛ', 'Ӛ'), ('Ӝ', 'Ӝ'), ('Ӟ', 'Ӟ'), + ('Ӡ', 'Ӡ'), ('Ӣ', 'Ӣ'), ('Ӥ', 'Ӥ'), ('Ӧ', 'Ӧ'), ('Ө', 'Ө'), + ('Ӫ', 'Ӫ'), ('Ӭ', 'Ӭ'), ('Ӯ', 'Ӯ'), ('Ӱ', 'Ӱ'), ('Ӳ', 'Ӳ'), + ('Ӵ', 'Ӵ'), ('Ӷ', 'Ӷ'), ('Ӹ', 'Ӹ'), ('Ӻ', 'Ӻ'), ('Ӽ', 'Ӽ'), + ('Ӿ', 'Ӿ'), ('Ԁ', 'Ԁ'), ('Ԃ', 'Ԃ'), ('Ԅ', 'Ԅ'), ('Ԇ', 'Ԇ'), + ('Ԉ', 'Ԉ'), ('Ԋ', 'Ԋ'), ('Ԍ', 'Ԍ'), ('Ԏ', 'Ԏ'), ('Ԑ', 'Ԑ'), + ('Ԓ', 'Ԓ'), ('Ԕ', 'Ԕ'), ('Ԗ', 'Ԗ'), ('Ԙ', 'Ԙ'), ('Ԛ', 'Ԛ'), + 
('Ԝ', 'Ԝ'), ('Ԟ', 'Ԟ'), ('Ԡ', 'Ԡ'), ('Ԣ', 'Ԣ'), ('Ԥ', 'Ԥ'), + ('Ԧ', 'Ԧ'), ('Ԩ', 'Ԩ'), ('Ԫ', 'Ԫ'), ('Ԭ', 'Ԭ'), ('Ԯ', 'Ԯ'), + ('Ա', 'Ֆ'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), + ('Ꭰ', 'Ᏽ'), ('Ḁ', 'Ḁ'), ('Ḃ', 'Ḃ'), ('Ḅ', 'Ḅ'), + ('Ḇ', 'Ḇ'), ('Ḉ', 'Ḉ'), ('Ḋ', 'Ḋ'), ('Ḍ', 'Ḍ'), + ('Ḏ', 'Ḏ'), ('Ḑ', 'Ḑ'), ('Ḓ', 'Ḓ'), ('Ḕ', 'Ḕ'), + ('Ḗ', 'Ḗ'), ('Ḙ', 'Ḙ'), ('Ḛ', 'Ḛ'), ('Ḝ', 'Ḝ'), + ('Ḟ', 'Ḟ'), ('Ḡ', 'Ḡ'), ('Ḣ', 'Ḣ'), ('Ḥ', 'Ḥ'), + ('Ḧ', 'Ḧ'), ('Ḩ', 'Ḩ'), ('Ḫ', 'Ḫ'), ('Ḭ', 'Ḭ'), + ('Ḯ', 'Ḯ'), ('Ḱ', 'Ḱ'), ('Ḳ', 'Ḳ'), ('Ḵ', 'Ḵ'), + ('Ḷ', 'Ḷ'), ('Ḹ', 'Ḹ'), ('Ḻ', 'Ḻ'), ('Ḽ', 'Ḽ'), + ('Ḿ', 'Ḿ'), ('Ṁ', 'Ṁ'), ('Ṃ', 'Ṃ'), ('Ṅ', 'Ṅ'), + ('Ṇ', 'Ṇ'), ('Ṉ', 'Ṉ'), ('Ṋ', 'Ṋ'), ('Ṍ', 'Ṍ'), + ('Ṏ', 'Ṏ'), ('Ṑ', 'Ṑ'), ('Ṓ', 'Ṓ'), ('Ṕ', 'Ṕ'), + ('Ṗ', 'Ṗ'), ('Ṙ', 'Ṙ'), ('Ṛ', 'Ṛ'), ('Ṝ', 'Ṝ'), + ('Ṟ', 'Ṟ'), ('Ṡ', 'Ṡ'), ('Ṣ', 'Ṣ'), ('Ṥ', 'Ṥ'), + ('Ṧ', 'Ṧ'), ('Ṩ', 'Ṩ'), ('Ṫ', 'Ṫ'), ('Ṭ', 'Ṭ'), + ('Ṯ', 'Ṯ'), ('Ṱ', 'Ṱ'), ('Ṳ', 'Ṳ'), ('Ṵ', 'Ṵ'), + ('Ṷ', 'Ṷ'), ('Ṹ', 'Ṹ'), ('Ṻ', 'Ṻ'), ('Ṽ', 'Ṽ'), + ('Ṿ', 'Ṿ'), ('Ẁ', 'Ẁ'), ('Ẃ', 'Ẃ'), ('Ẅ', 'Ẅ'), + ('Ẇ', 'Ẇ'), ('Ẉ', 'Ẉ'), ('Ẋ', 'Ẋ'), ('Ẍ', 'Ẍ'), + ('Ẏ', 'Ẏ'), ('Ẑ', 'Ẑ'), ('Ẓ', 'Ẓ'), ('Ẕ', 'Ẕ'), + ('ẞ', 'ẞ'), ('Ạ', 'Ạ'), ('Ả', 'Ả'), ('Ấ', 'Ấ'), + ('Ầ', 'Ầ'), ('Ẩ', 'Ẩ'), ('Ẫ', 'Ẫ'), ('Ậ', 'Ậ'), + ('Ắ', 'Ắ'), ('Ằ', 'Ằ'), ('Ẳ', 'Ẳ'), ('Ẵ', 'Ẵ'), + ('Ặ', 'Ặ'), ('Ẹ', 'Ẹ'), ('Ẻ', 'Ẻ'), ('Ẽ', 'Ẽ'), + ('Ế', 'Ế'), ('Ề', 'Ề'), ('Ể', 'Ể'), ('Ễ', 'Ễ'), + ('Ệ', 'Ệ'), ('Ỉ', 'Ỉ'), ('Ị', 'Ị'), ('Ọ', 'Ọ'), + ('Ỏ', 'Ỏ'), ('Ố', 'Ố'), ('Ồ', 'Ồ'), ('Ổ', 'Ổ'), + ('Ỗ', 'Ỗ'), ('Ộ', 'Ộ'), ('Ớ', 'Ớ'), ('Ờ', 'Ờ'), + ('Ở', 'Ở'), ('Ỡ', 'Ỡ'), ('Ợ', 'Ợ'), ('Ụ', 'Ụ'), + ('Ủ', 'Ủ'), ('Ứ', 'Ứ'), ('Ừ', 'Ừ'), ('Ử', 'Ử'), + ('Ữ', 'Ữ'), ('Ự', 'Ự'), ('Ỳ', 'Ỳ'), ('Ỵ', 'Ỵ'), + ('Ỷ', 'Ỷ'), ('Ỹ', 'Ỹ'), ('Ỻ', 'Ỻ'), ('Ỽ', 'Ỽ'), + ('Ỿ', 'Ỿ'), ('Ἀ', 'Ἇ'), ('Ἐ', 'Ἕ'), ('Ἠ', 'Ἧ'), + ('Ἰ', 'Ἷ'), ('Ὀ', 'Ὅ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), ('Ὗ', 'Ὗ'), ('Ὠ', 'Ὧ'), ('ᾈ', 'ᾏ'), + ('ᾘ', 'ᾟ'), ('ᾨ', 'ᾯ'), ('Ᾰ', 'ᾼ'), ('Ὲ', 'ῌ'), + ('Ῐ', 'Ί'), ('Ῠ', 'Ῥ'), ('Ὸ', 'ῼ'), ('Ω', 'Ω'), + ('K', 'Å'), ('Ⅎ', 'Ⅎ'), 
('Ⅰ', 'Ⅿ'), ('Ↄ', 'Ↄ'), + ('Ⓐ', 'Ⓩ'), ('Ⰰ', 'Ⱞ'), ('Ⱡ', 'Ⱡ'), ('Ɫ', 'Ɽ'), + ('Ⱨ', 'Ⱨ'), ('Ⱪ', 'Ⱪ'), ('Ⱬ', 'Ⱬ'), ('Ɑ', 'Ɒ'), + ('Ⱳ', 'Ⱳ'), ('Ⱶ', 'Ⱶ'), ('Ȿ', 'Ⲁ'), ('Ⲃ', 'Ⲃ'), + ('Ⲅ', 'Ⲅ'), ('Ⲇ', 'Ⲇ'), ('Ⲉ', 'Ⲉ'), ('Ⲋ', 'Ⲋ'), + ('Ⲍ', 'Ⲍ'), ('Ⲏ', 'Ⲏ'), ('Ⲑ', 'Ⲑ'), ('Ⲓ', 'Ⲓ'), + ('Ⲕ', 'Ⲕ'), ('Ⲗ', 'Ⲗ'), ('Ⲙ', 'Ⲙ'), ('Ⲛ', 'Ⲛ'), + ('Ⲝ', 'Ⲝ'), ('Ⲟ', 'Ⲟ'), ('Ⲡ', 'Ⲡ'), ('Ⲣ', 'Ⲣ'), + ('Ⲥ', 'Ⲥ'), ('Ⲧ', 'Ⲧ'), ('Ⲩ', 'Ⲩ'), ('Ⲫ', 'Ⲫ'), + ('Ⲭ', 'Ⲭ'), ('Ⲯ', 'Ⲯ'), ('Ⲱ', 'Ⲱ'), ('Ⲳ', 'Ⲳ'), + ('Ⲵ', 'Ⲵ'), ('Ⲷ', 'Ⲷ'), ('Ⲹ', 'Ⲹ'), ('Ⲻ', 'Ⲻ'), + ('Ⲽ', 'Ⲽ'), ('Ⲿ', 'Ⲿ'), ('Ⳁ', 'Ⳁ'), ('Ⳃ', 'Ⳃ'), + ('Ⳅ', 'Ⳅ'), ('Ⳇ', 'Ⳇ'), ('Ⳉ', 'Ⳉ'), ('Ⳋ', 'Ⳋ'), + ('Ⳍ', 'Ⳍ'), ('Ⳏ', 'Ⳏ'), ('Ⳑ', 'Ⳑ'), ('Ⳓ', 'Ⳓ'), + ('Ⳕ', 'Ⳕ'), ('Ⳗ', 'Ⳗ'), ('Ⳙ', 'Ⳙ'), ('Ⳛ', 'Ⳛ'), + ('Ⳝ', 'Ⳝ'), ('Ⳟ', 'Ⳟ'), ('Ⳡ', 'Ⳡ'), ('Ⳣ', 'Ⳣ'), + ('Ⳬ', 'Ⳬ'), ('Ⳮ', 'Ⳮ'), ('Ⳳ', 'Ⳳ'), ('Ꙁ', 'Ꙁ'), + ('Ꙃ', 'Ꙃ'), ('Ꙅ', 'Ꙅ'), ('Ꙇ', 'Ꙇ'), ('Ꙉ', 'Ꙉ'), + ('Ꙋ', 'Ꙋ'), ('Ꙍ', 'Ꙍ'), ('Ꙏ', 'Ꙏ'), ('Ꙑ', 'Ꙑ'), + ('Ꙓ', 'Ꙓ'), ('Ꙕ', 'Ꙕ'), ('Ꙗ', 'Ꙗ'), ('Ꙙ', 'Ꙙ'), + ('Ꙛ', 'Ꙛ'), ('Ꙝ', 'Ꙝ'), ('Ꙟ', 'Ꙟ'), ('Ꙡ', 'Ꙡ'), + ('Ꙣ', 'Ꙣ'), ('Ꙥ', 'Ꙥ'), ('Ꙧ', 'Ꙧ'), ('Ꙩ', 'Ꙩ'), + ('Ꙫ', 'Ꙫ'), ('Ꙭ', 'Ꙭ'), ('Ꚁ', 'Ꚁ'), ('Ꚃ', 'Ꚃ'), + ('Ꚅ', 'Ꚅ'), ('Ꚇ', 'Ꚇ'), ('Ꚉ', 'Ꚉ'), ('Ꚋ', 'Ꚋ'), + ('Ꚍ', 'Ꚍ'), ('Ꚏ', 'Ꚏ'), ('Ꚑ', 'Ꚑ'), ('Ꚓ', 'Ꚓ'), + ('Ꚕ', 'Ꚕ'), ('Ꚗ', 'Ꚗ'), ('Ꚙ', 'Ꚙ'), ('Ꚛ', 'Ꚛ'), + ('Ꜣ', 'Ꜣ'), ('Ꜥ', 'Ꜥ'), ('Ꜧ', 'Ꜧ'), ('Ꜩ', 'Ꜩ'), + ('Ꜫ', 'Ꜫ'), ('Ꜭ', 'Ꜭ'), ('Ꜯ', 'Ꜯ'), ('Ꜳ', 'Ꜳ'), + ('Ꜵ', 'Ꜵ'), ('Ꜷ', 'Ꜷ'), ('Ꜹ', 'Ꜹ'), ('Ꜻ', 'Ꜻ'), + ('Ꜽ', 'Ꜽ'), ('Ꜿ', 'Ꜿ'), ('Ꝁ', 'Ꝁ'), ('Ꝃ', 'Ꝃ'), + ('Ꝅ', 'Ꝅ'), ('Ꝇ', 'Ꝇ'), ('Ꝉ', 'Ꝉ'), ('Ꝋ', 'Ꝋ'), + ('Ꝍ', 'Ꝍ'), ('Ꝏ', 'Ꝏ'), ('Ꝑ', 'Ꝑ'), ('Ꝓ', 'Ꝓ'), + ('Ꝕ', 'Ꝕ'), ('Ꝗ', 'Ꝗ'), ('Ꝙ', 'Ꝙ'), ('Ꝛ', 'Ꝛ'), + ('Ꝝ', 'Ꝝ'), ('Ꝟ', 'Ꝟ'), ('Ꝡ', 'Ꝡ'), ('Ꝣ', 'Ꝣ'), + ('Ꝥ', 'Ꝥ'), ('Ꝧ', 'Ꝧ'), ('Ꝩ', 'Ꝩ'), ('Ꝫ', 'Ꝫ'), + ('Ꝭ', 'Ꝭ'), ('Ꝯ', 'Ꝯ'), ('Ꝺ', 'Ꝺ'), ('Ꝼ', 'Ꝼ'), + ('Ᵹ', 'Ꝿ'), ('Ꞁ', 'Ꞁ'), ('Ꞃ', 'Ꞃ'), ('Ꞅ', 'Ꞅ'), + ('Ꞇ', 'Ꞇ'), ('Ꞌ', 'Ꞌ'), ('Ɥ', 'Ɥ'), ('Ꞑ', 'Ꞑ'), + ('Ꞓ', 'Ꞓ'), ('Ꞗ', 'Ꞗ'), ('Ꞙ', 'Ꞙ'), ('Ꞛ', 'Ꞛ'), + ('Ꞝ', 'Ꞝ'), ('Ꞟ', 'Ꞟ'), ('Ꞡ', 'Ꞡ'), ('Ꞣ', 'Ꞣ'), + ('Ꞥ', 'Ꞥ'), ('Ꞧ', 'Ꞧ'), 
('Ꞩ', 'Ꞩ'), ('Ɦ', 'Ɪ'), + ('Ʞ', 'Ꞵ'), ('Ꞷ', 'Ꞷ'), ('A', 'Z'), ('𐐀', '𐐧'), + ('𐒰', '𐓓'), ('𐲀', '𐲲'), ('𑢠', '𑢿'), ('𞤀', '𞤡'), +]; + +pub const CHANGES_WHEN_TITLECASED: &'static [(char, char)] = &[ + ('a', 'z'), ('µ', 'µ'), ('ß', 'ö'), ('ø', 'ÿ'), ('ā', 'ā'), + ('ă', 'ă'), ('ą', 'ą'), ('ć', 'ć'), ('ĉ', 'ĉ'), ('ċ', 'ċ'), + ('č', 'č'), ('ď', 'ď'), ('đ', 'đ'), ('ē', 'ē'), ('ĕ', 'ĕ'), + ('ė', 'ė'), ('ę', 'ę'), ('ě', 'ě'), ('ĝ', 'ĝ'), ('ğ', 'ğ'), + ('ġ', 'ġ'), ('ģ', 'ģ'), ('ĥ', 'ĥ'), ('ħ', 'ħ'), ('ĩ', 'ĩ'), + ('ī', 'ī'), ('ĭ', 'ĭ'), ('į', 'į'), ('ı', 'ı'), ('ij', 'ij'), + ('ĵ', 'ĵ'), ('ķ', 'ķ'), ('ĺ', 'ĺ'), ('ļ', 'ļ'), ('ľ', 'ľ'), + ('ŀ', 'ŀ'), ('ł', 'ł'), ('ń', 'ń'), ('ņ', 'ņ'), ('ň', 'ʼn'), + ('ŋ', 'ŋ'), ('ō', 'ō'), ('ŏ', 'ŏ'), ('ő', 'ő'), ('œ', 'œ'), + ('ŕ', 'ŕ'), ('ŗ', 'ŗ'), ('ř', 'ř'), ('ś', 'ś'), ('ŝ', 'ŝ'), + ('ş', 'ş'), ('š', 'š'), ('ţ', 'ţ'), ('ť', 'ť'), ('ŧ', 'ŧ'), + ('ũ', 'ũ'), ('ū', 'ū'), ('ŭ', 'ŭ'), ('ů', 'ů'), ('ű', 'ű'), + ('ų', 'ų'), ('ŵ', 'ŵ'), ('ŷ', 'ŷ'), ('ź', 'ź'), ('ż', 'ż'), + ('ž', 'ƀ'), ('ƃ', 'ƃ'), ('ƅ', 'ƅ'), ('ƈ', 'ƈ'), ('ƌ', 'ƌ'), + ('ƒ', 'ƒ'), ('ƕ', 'ƕ'), ('ƙ', 'ƚ'), ('ƞ', 'ƞ'), ('ơ', 'ơ'), + ('ƣ', 'ƣ'), ('ƥ', 'ƥ'), ('ƨ', 'ƨ'), ('ƭ', 'ƭ'), ('ư', 'ư'), + ('ƴ', 'ƴ'), ('ƶ', 'ƶ'), ('ƹ', 'ƹ'), ('ƽ', 'ƽ'), ('ƿ', 'ƿ'), + ('DŽ', 'DŽ'), ('dž', 'LJ'), ('lj', 'NJ'), ('nj', 'nj'), ('ǎ', 'ǎ'), + ('ǐ', 'ǐ'), ('ǒ', 'ǒ'), ('ǔ', 'ǔ'), ('ǖ', 'ǖ'), ('ǘ', 'ǘ'), + ('ǚ', 'ǚ'), ('ǜ', 'ǝ'), ('ǟ', 'ǟ'), ('ǡ', 'ǡ'), ('ǣ', 'ǣ'), + ('ǥ', 'ǥ'), ('ǧ', 'ǧ'), ('ǩ', 'ǩ'), ('ǫ', 'ǫ'), ('ǭ', 'ǭ'), + ('ǯ', 'DZ'), ('dz', 'dz'), ('ǵ', 'ǵ'), ('ǹ', 'ǹ'), ('ǻ', 'ǻ'), + ('ǽ', 'ǽ'), ('ǿ', 'ǿ'), ('ȁ', 'ȁ'), ('ȃ', 'ȃ'), ('ȅ', 'ȅ'), + ('ȇ', 'ȇ'), ('ȉ', 'ȉ'), ('ȋ', 'ȋ'), ('ȍ', 'ȍ'), ('ȏ', 'ȏ'), + ('ȑ', 'ȑ'), ('ȓ', 'ȓ'), ('ȕ', 'ȕ'), ('ȗ', 'ȗ'), ('ș', 'ș'), + ('ț', 'ț'), ('ȝ', 'ȝ'), ('ȟ', 'ȟ'), ('ȣ', 'ȣ'), ('ȥ', 'ȥ'), + ('ȧ', 'ȧ'), ('ȩ', 'ȩ'), ('ȫ', 'ȫ'), ('ȭ', 'ȭ'), ('ȯ', 'ȯ'), + ('ȱ', 'ȱ'), ('ȳ', 'ȳ'), ('ȼ', 'ȼ'), ('ȿ', 'ɀ'), ('ɂ', 'ɂ'), + ('ɇ', 'ɇ'), ('ɉ', 'ɉ'), ('ɋ', 'ɋ'), ('ɍ', 'ɍ'), 
('ɏ', 'ɔ'), + ('ɖ', 'ɗ'), ('ə', 'ə'), ('ɛ', 'ɜ'), ('ɠ', 'ɡ'), ('ɣ', 'ɣ'), + ('ɥ', 'ɦ'), ('ɨ', 'ɬ'), ('ɯ', 'ɯ'), ('ɱ', 'ɲ'), ('ɵ', 'ɵ'), + ('ɽ', 'ɽ'), ('ʀ', 'ʀ'), ('ʃ', 'ʃ'), ('ʇ', 'ʌ'), ('ʒ', 'ʒ'), + ('ʝ', 'ʞ'), ('ͅ', 'ͅ'), ('ͱ', 'ͱ'), ('ͳ', 'ͳ'), ('ͷ', 'ͷ'), + ('ͻ', 'ͽ'), ('ΐ', 'ΐ'), ('ά', 'ώ'), ('ϐ', 'ϑ'), ('ϕ', 'ϗ'), + ('ϙ', 'ϙ'), ('ϛ', 'ϛ'), ('ϝ', 'ϝ'), ('ϟ', 'ϟ'), ('ϡ', 'ϡ'), + ('ϣ', 'ϣ'), ('ϥ', 'ϥ'), ('ϧ', 'ϧ'), ('ϩ', 'ϩ'), ('ϫ', 'ϫ'), + ('ϭ', 'ϭ'), ('ϯ', 'ϳ'), ('ϵ', 'ϵ'), ('ϸ', 'ϸ'), ('ϻ', 'ϻ'), + ('а', 'џ'), ('ѡ', 'ѡ'), ('ѣ', 'ѣ'), ('ѥ', 'ѥ'), ('ѧ', 'ѧ'), + ('ѩ', 'ѩ'), ('ѫ', 'ѫ'), ('ѭ', 'ѭ'), ('ѯ', 'ѯ'), ('ѱ', 'ѱ'), + ('ѳ', 'ѳ'), ('ѵ', 'ѵ'), ('ѷ', 'ѷ'), ('ѹ', 'ѹ'), ('ѻ', 'ѻ'), + ('ѽ', 'ѽ'), ('ѿ', 'ѿ'), ('ҁ', 'ҁ'), ('ҋ', 'ҋ'), ('ҍ', 'ҍ'), + ('ҏ', 'ҏ'), ('ґ', 'ґ'), ('ғ', 'ғ'), ('ҕ', 'ҕ'), ('җ', 'җ'), + ('ҙ', 'ҙ'), ('қ', 'қ'), ('ҝ', 'ҝ'), ('ҟ', 'ҟ'), ('ҡ', 'ҡ'), + ('ң', 'ң'), ('ҥ', 'ҥ'), ('ҧ', 'ҧ'), ('ҩ', 'ҩ'), ('ҫ', 'ҫ'), + ('ҭ', 'ҭ'), ('ү', 'ү'), ('ұ', 'ұ'), ('ҳ', 'ҳ'), ('ҵ', 'ҵ'), + ('ҷ', 'ҷ'), ('ҹ', 'ҹ'), ('һ', 'һ'), ('ҽ', 'ҽ'), ('ҿ', 'ҿ'), + ('ӂ', 'ӂ'), ('ӄ', 'ӄ'), ('ӆ', 'ӆ'), ('ӈ', 'ӈ'), ('ӊ', 'ӊ'), + ('ӌ', 'ӌ'), ('ӎ', 'ӏ'), ('ӑ', 'ӑ'), ('ӓ', 'ӓ'), ('ӕ', 'ӕ'), + ('ӗ', 'ӗ'), ('ә', 'ә'), ('ӛ', 'ӛ'), ('ӝ', 'ӝ'), ('ӟ', 'ӟ'), + ('ӡ', 'ӡ'), ('ӣ', 'ӣ'), ('ӥ', 'ӥ'), ('ӧ', 'ӧ'), ('ө', 'ө'), + ('ӫ', 'ӫ'), ('ӭ', 'ӭ'), ('ӯ', 'ӯ'), ('ӱ', 'ӱ'), ('ӳ', 'ӳ'), + ('ӵ', 'ӵ'), ('ӷ', 'ӷ'), ('ӹ', 'ӹ'), ('ӻ', 'ӻ'), ('ӽ', 'ӽ'), + ('ӿ', 'ӿ'), ('ԁ', 'ԁ'), ('ԃ', 'ԃ'), ('ԅ', 'ԅ'), ('ԇ', 'ԇ'), + ('ԉ', 'ԉ'), ('ԋ', 'ԋ'), ('ԍ', 'ԍ'), ('ԏ', 'ԏ'), ('ԑ', 'ԑ'), + ('ԓ', 'ԓ'), ('ԕ', 'ԕ'), ('ԗ', 'ԗ'), ('ԙ', 'ԙ'), ('ԛ', 'ԛ'), + ('ԝ', 'ԝ'), ('ԟ', 'ԟ'), ('ԡ', 'ԡ'), ('ԣ', 'ԣ'), ('ԥ', 'ԥ'), + ('ԧ', 'ԧ'), ('ԩ', 'ԩ'), ('ԫ', 'ԫ'), ('ԭ', 'ԭ'), ('ԯ', 'ԯ'), + ('ա', 'և'), ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('ᵹ', 'ᵹ'), + ('ᵽ', 'ᵽ'), ('ḁ', 'ḁ'), ('ḃ', 'ḃ'), ('ḅ', 'ḅ'), + ('ḇ', 'ḇ'), ('ḉ', 'ḉ'), ('ḋ', 'ḋ'), ('ḍ', 'ḍ'), + ('ḏ', 'ḏ'), ('ḑ', 'ḑ'), ('ḓ', 'ḓ'), ('ḕ', 'ḕ'), + ('ḗ', 'ḗ'), ('ḙ', 'ḙ'), ('ḛ', 'ḛ'), ('ḝ', 'ḝ'), + 
('ḟ', 'ḟ'), ('ḡ', 'ḡ'), ('ḣ', 'ḣ'), ('ḥ', 'ḥ'), + ('ḧ', 'ḧ'), ('ḩ', 'ḩ'), ('ḫ', 'ḫ'), ('ḭ', 'ḭ'), + ('ḯ', 'ḯ'), ('ḱ', 'ḱ'), ('ḳ', 'ḳ'), ('ḵ', 'ḵ'), + ('ḷ', 'ḷ'), ('ḹ', 'ḹ'), ('ḻ', 'ḻ'), ('ḽ', 'ḽ'), + ('ḿ', 'ḿ'), ('ṁ', 'ṁ'), ('ṃ', 'ṃ'), ('ṅ', 'ṅ'), + ('ṇ', 'ṇ'), ('ṉ', 'ṉ'), ('ṋ', 'ṋ'), ('ṍ', 'ṍ'), + ('ṏ', 'ṏ'), ('ṑ', 'ṑ'), ('ṓ', 'ṓ'), ('ṕ', 'ṕ'), + ('ṗ', 'ṗ'), ('ṙ', 'ṙ'), ('ṛ', 'ṛ'), ('ṝ', 'ṝ'), + ('ṟ', 'ṟ'), ('ṡ', 'ṡ'), ('ṣ', 'ṣ'), ('ṥ', 'ṥ'), + ('ṧ', 'ṧ'), ('ṩ', 'ṩ'), ('ṫ', 'ṫ'), ('ṭ', 'ṭ'), + ('ṯ', 'ṯ'), ('ṱ', 'ṱ'), ('ṳ', 'ṳ'), ('ṵ', 'ṵ'), + ('ṷ', 'ṷ'), ('ṹ', 'ṹ'), ('ṻ', 'ṻ'), ('ṽ', 'ṽ'), + ('ṿ', 'ṿ'), ('ẁ', 'ẁ'), ('ẃ', 'ẃ'), ('ẅ', 'ẅ'), + ('ẇ', 'ẇ'), ('ẉ', 'ẉ'), ('ẋ', 'ẋ'), ('ẍ', 'ẍ'), + ('ẏ', 'ẏ'), ('ẑ', 'ẑ'), ('ẓ', 'ẓ'), ('ẕ', 'ẛ'), + ('ạ', 'ạ'), ('ả', 'ả'), ('ấ', 'ấ'), ('ầ', 'ầ'), + ('ẩ', 'ẩ'), ('ẫ', 'ẫ'), ('ậ', 'ậ'), ('ắ', 'ắ'), + ('ằ', 'ằ'), ('ẳ', 'ẳ'), ('ẵ', 'ẵ'), ('ặ', 'ặ'), + ('ẹ', 'ẹ'), ('ẻ', 'ẻ'), ('ẽ', 'ẽ'), ('ế', 'ế'), + ('ề', 'ề'), ('ể', 'ể'), ('ễ', 'ễ'), ('ệ', 'ệ'), + ('ỉ', 'ỉ'), ('ị', 'ị'), ('ọ', 'ọ'), ('ỏ', 'ỏ'), + ('ố', 'ố'), ('ồ', 'ồ'), ('ổ', 'ổ'), ('ỗ', 'ỗ'), + ('ộ', 'ộ'), ('ớ', 'ớ'), ('ờ', 'ờ'), ('ở', 'ở'), + ('ỡ', 'ỡ'), ('ợ', 'ợ'), ('ụ', 'ụ'), ('ủ', 'ủ'), + ('ứ', 'ứ'), ('ừ', 'ừ'), ('ử', 'ử'), ('ữ', 'ữ'), + ('ự', 'ự'), ('ỳ', 'ỳ'), ('ỵ', 'ỵ'), ('ỷ', 'ỷ'), + ('ỹ', 'ỹ'), ('ỻ', 'ỻ'), ('ỽ', 'ỽ'), ('ỿ', 'ἇ'), + ('ἐ', 'ἕ'), ('ἠ', 'ἧ'), ('ἰ', 'ἷ'), ('ὀ', 'ὅ'), + ('ὐ', 'ὗ'), ('ὠ', 'ὧ'), ('ὰ', 'ώ'), ('ᾀ', 'ᾇ'), + ('ᾐ', 'ᾗ'), ('ᾠ', 'ᾧ'), ('ᾰ', 'ᾴ'), ('ᾶ', 'ᾷ'), + ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῇ'), ('ῐ', 'ΐ'), + ('ῖ', 'ῗ'), ('ῠ', 'ῧ'), ('ῲ', 'ῴ'), ('ῶ', 'ῷ'), + ('ⅎ', 'ⅎ'), ('ⅰ', 'ⅿ'), ('ↄ', 'ↄ'), ('ⓐ', 'ⓩ'), + ('ⰰ', 'ⱞ'), ('ⱡ', 'ⱡ'), ('ⱥ', 'ⱦ'), ('ⱨ', 'ⱨ'), + ('ⱪ', 'ⱪ'), ('ⱬ', 'ⱬ'), ('ⱳ', 'ⱳ'), ('ⱶ', 'ⱶ'), + ('ⲁ', 'ⲁ'), ('ⲃ', 'ⲃ'), ('ⲅ', 'ⲅ'), ('ⲇ', 'ⲇ'), + ('ⲉ', 'ⲉ'), ('ⲋ', 'ⲋ'), ('ⲍ', 'ⲍ'), ('ⲏ', 'ⲏ'), + ('ⲑ', 'ⲑ'), ('ⲓ', 'ⲓ'), ('ⲕ', 'ⲕ'), ('ⲗ', 'ⲗ'), + ('ⲙ', 'ⲙ'), ('ⲛ', 'ⲛ'), ('ⲝ', 'ⲝ'), ('ⲟ', 'ⲟ'), + ('ⲡ', 'ⲡ'), ('ⲣ', 'ⲣ'), ('ⲥ', 'ⲥ'), ('ⲧ', 'ⲧ'), + 
('ⲩ', 'ⲩ'), ('ⲫ', 'ⲫ'), ('ⲭ', 'ⲭ'), ('ⲯ', 'ⲯ'), + ('ⲱ', 'ⲱ'), ('ⲳ', 'ⲳ'), ('ⲵ', 'ⲵ'), ('ⲷ', 'ⲷ'), + ('ⲹ', 'ⲹ'), ('ⲻ', 'ⲻ'), ('ⲽ', 'ⲽ'), ('ⲿ', 'ⲿ'), + ('ⳁ', 'ⳁ'), ('ⳃ', 'ⳃ'), ('ⳅ', 'ⳅ'), ('ⳇ', 'ⳇ'), + ('ⳉ', 'ⳉ'), ('ⳋ', 'ⳋ'), ('ⳍ', 'ⳍ'), ('ⳏ', 'ⳏ'), + ('ⳑ', 'ⳑ'), ('ⳓ', 'ⳓ'), ('ⳕ', 'ⳕ'), ('ⳗ', 'ⳗ'), + ('ⳙ', 'ⳙ'), ('ⳛ', 'ⳛ'), ('ⳝ', 'ⳝ'), ('ⳟ', 'ⳟ'), + ('ⳡ', 'ⳡ'), ('ⳣ', 'ⳣ'), ('ⳬ', 'ⳬ'), ('ⳮ', 'ⳮ'), + ('ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), + ('ꙁ', 'ꙁ'), ('ꙃ', 'ꙃ'), ('ꙅ', 'ꙅ'), ('ꙇ', 'ꙇ'), + ('ꙉ', 'ꙉ'), ('ꙋ', 'ꙋ'), ('ꙍ', 'ꙍ'), ('ꙏ', 'ꙏ'), + ('ꙑ', 'ꙑ'), ('ꙓ', 'ꙓ'), ('ꙕ', 'ꙕ'), ('ꙗ', 'ꙗ'), + ('ꙙ', 'ꙙ'), ('ꙛ', 'ꙛ'), ('ꙝ', 'ꙝ'), ('ꙟ', 'ꙟ'), + ('ꙡ', 'ꙡ'), ('ꙣ', 'ꙣ'), ('ꙥ', 'ꙥ'), ('ꙧ', 'ꙧ'), + ('ꙩ', 'ꙩ'), ('ꙫ', 'ꙫ'), ('ꙭ', 'ꙭ'), ('ꚁ', 'ꚁ'), + ('ꚃ', 'ꚃ'), ('ꚅ', 'ꚅ'), ('ꚇ', 'ꚇ'), ('ꚉ', 'ꚉ'), + ('ꚋ', 'ꚋ'), ('ꚍ', 'ꚍ'), ('ꚏ', 'ꚏ'), ('ꚑ', 'ꚑ'), + ('ꚓ', 'ꚓ'), ('ꚕ', 'ꚕ'), ('ꚗ', 'ꚗ'), ('ꚙ', 'ꚙ'), + ('ꚛ', 'ꚛ'), ('ꜣ', 'ꜣ'), ('ꜥ', 'ꜥ'), ('ꜧ', 'ꜧ'), + ('ꜩ', 'ꜩ'), ('ꜫ', 'ꜫ'), ('ꜭ', 'ꜭ'), ('ꜯ', 'ꜯ'), + ('ꜳ', 'ꜳ'), ('ꜵ', 'ꜵ'), ('ꜷ', 'ꜷ'), ('ꜹ', 'ꜹ'), + ('ꜻ', 'ꜻ'), ('ꜽ', 'ꜽ'), ('ꜿ', 'ꜿ'), ('ꝁ', 'ꝁ'), + ('ꝃ', 'ꝃ'), ('ꝅ', 'ꝅ'), ('ꝇ', 'ꝇ'), ('ꝉ', 'ꝉ'), + ('ꝋ', 'ꝋ'), ('ꝍ', 'ꝍ'), ('ꝏ', 'ꝏ'), ('ꝑ', 'ꝑ'), + ('ꝓ', 'ꝓ'), ('ꝕ', 'ꝕ'), ('ꝗ', 'ꝗ'), ('ꝙ', 'ꝙ'), + ('ꝛ', 'ꝛ'), ('ꝝ', 'ꝝ'), ('ꝟ', 'ꝟ'), ('ꝡ', 'ꝡ'), + ('ꝣ', 'ꝣ'), ('ꝥ', 'ꝥ'), ('ꝧ', 'ꝧ'), ('ꝩ', 'ꝩ'), + ('ꝫ', 'ꝫ'), ('ꝭ', 'ꝭ'), ('ꝯ', 'ꝯ'), ('ꝺ', 'ꝺ'), + ('ꝼ', 'ꝼ'), ('ꝿ', 'ꝿ'), ('ꞁ', 'ꞁ'), ('ꞃ', 'ꞃ'), + ('ꞅ', 'ꞅ'), ('ꞇ', 'ꞇ'), ('ꞌ', 'ꞌ'), ('ꞑ', 'ꞑ'), + ('ꞓ', 'ꞓ'), ('ꞗ', 'ꞗ'), ('ꞙ', 'ꞙ'), ('ꞛ', 'ꞛ'), + ('ꞝ', 'ꞝ'), ('ꞟ', 'ꞟ'), ('ꞡ', 'ꞡ'), ('ꞣ', 'ꞣ'), + ('ꞥ', 'ꞥ'), ('ꞧ', 'ꞧ'), ('ꞩ', 'ꞩ'), ('ꞵ', 'ꞵ'), + ('ꞷ', 'ꞷ'), ('ꭓ', 'ꭓ'), ('ꭰ', 'ꮿ'), ('ff', 'st'), + ('ﬓ', 'ﬗ'), ('a', 'z'), ('𐐨', '𐑏'), ('𐓘', '𐓻'), + ('𐳀', '𐳲'), ('𑣀', '𑣟'), ('𞤢', '𞥃'), +]; + +pub const CHANGES_WHEN_UPPERCASED: &'static [(char, char)] = &[ + ('a', 'z'), ('µ', 'µ'), ('ß', 'ö'), ('ø', 'ÿ'), ('ā', 'ā'), + ('ă', 'ă'), ('ą', 'ą'), ('ć', 'ć'), ('ĉ', 'ĉ'), ('ċ', 'ċ'), + ('č', 'č'), 
('ď', 'ď'), ('đ', 'đ'), ('ē', 'ē'), ('ĕ', 'ĕ'), + ('ė', 'ė'), ('ę', 'ę'), ('ě', 'ě'), ('ĝ', 'ĝ'), ('ğ', 'ğ'), + ('ġ', 'ġ'), ('ģ', 'ģ'), ('ĥ', 'ĥ'), ('ħ', 'ħ'), ('ĩ', 'ĩ'), + ('ī', 'ī'), ('ĭ', 'ĭ'), ('į', 'į'), ('ı', 'ı'), ('ij', 'ij'), + ('ĵ', 'ĵ'), ('ķ', 'ķ'), ('ĺ', 'ĺ'), ('ļ', 'ļ'), ('ľ', 'ľ'), + ('ŀ', 'ŀ'), ('ł', 'ł'), ('ń', 'ń'), ('ņ', 'ņ'), ('ň', 'ʼn'), + ('ŋ', 'ŋ'), ('ō', 'ō'), ('ŏ', 'ŏ'), ('ő', 'ő'), ('œ', 'œ'), + ('ŕ', 'ŕ'), ('ŗ', 'ŗ'), ('ř', 'ř'), ('ś', 'ś'), ('ŝ', 'ŝ'), + ('ş', 'ş'), ('š', 'š'), ('ţ', 'ţ'), ('ť', 'ť'), ('ŧ', 'ŧ'), + ('ũ', 'ũ'), ('ū', 'ū'), ('ŭ', 'ŭ'), ('ů', 'ů'), ('ű', 'ű'), + ('ų', 'ų'), ('ŵ', 'ŵ'), ('ŷ', 'ŷ'), ('ź', 'ź'), ('ż', 'ż'), + ('ž', 'ƀ'), ('ƃ', 'ƃ'), ('ƅ', 'ƅ'), ('ƈ', 'ƈ'), ('ƌ', 'ƌ'), + ('ƒ', 'ƒ'), ('ƕ', 'ƕ'), ('ƙ', 'ƚ'), ('ƞ', 'ƞ'), ('ơ', 'ơ'), + ('ƣ', 'ƣ'), ('ƥ', 'ƥ'), ('ƨ', 'ƨ'), ('ƭ', 'ƭ'), ('ư', 'ư'), + ('ƴ', 'ƴ'), ('ƶ', 'ƶ'), ('ƹ', 'ƹ'), ('ƽ', 'ƽ'), ('ƿ', 'ƿ'), + ('Dž', 'dž'), ('Lj', 'lj'), ('Nj', 'nj'), ('ǎ', 'ǎ'), ('ǐ', 'ǐ'), + ('ǒ', 'ǒ'), ('ǔ', 'ǔ'), ('ǖ', 'ǖ'), ('ǘ', 'ǘ'), ('ǚ', 'ǚ'), + ('ǜ', 'ǝ'), ('ǟ', 'ǟ'), ('ǡ', 'ǡ'), ('ǣ', 'ǣ'), ('ǥ', 'ǥ'), + ('ǧ', 'ǧ'), ('ǩ', 'ǩ'), ('ǫ', 'ǫ'), ('ǭ', 'ǭ'), ('ǯ', 'ǰ'), + ('Dz', 'dz'), ('ǵ', 'ǵ'), ('ǹ', 'ǹ'), ('ǻ', 'ǻ'), ('ǽ', 'ǽ'), + ('ǿ', 'ǿ'), ('ȁ', 'ȁ'), ('ȃ', 'ȃ'), ('ȅ', 'ȅ'), ('ȇ', 'ȇ'), + ('ȉ', 'ȉ'), ('ȋ', 'ȋ'), ('ȍ', 'ȍ'), ('ȏ', 'ȏ'), ('ȑ', 'ȑ'), + ('ȓ', 'ȓ'), ('ȕ', 'ȕ'), ('ȗ', 'ȗ'), ('ș', 'ș'), ('ț', 'ț'), + ('ȝ', 'ȝ'), ('ȟ', 'ȟ'), ('ȣ', 'ȣ'), ('ȥ', 'ȥ'), ('ȧ', 'ȧ'), + ('ȩ', 'ȩ'), ('ȫ', 'ȫ'), ('ȭ', 'ȭ'), ('ȯ', 'ȯ'), ('ȱ', 'ȱ'), + ('ȳ', 'ȳ'), ('ȼ', 'ȼ'), ('ȿ', 'ɀ'), ('ɂ', 'ɂ'), ('ɇ', 'ɇ'), + ('ɉ', 'ɉ'), ('ɋ', 'ɋ'), ('ɍ', 'ɍ'), ('ɏ', 'ɔ'), ('ɖ', 'ɗ'), + ('ə', 'ə'), ('ɛ', 'ɜ'), ('ɠ', 'ɡ'), ('ɣ', 'ɣ'), ('ɥ', 'ɦ'), + ('ɨ', 'ɬ'), ('ɯ', 'ɯ'), ('ɱ', 'ɲ'), ('ɵ', 'ɵ'), ('ɽ', 'ɽ'), + ('ʀ', 'ʀ'), ('ʃ', 'ʃ'), ('ʇ', 'ʌ'), ('ʒ', 'ʒ'), ('ʝ', 'ʞ'), + ('ͅ', 'ͅ'), ('ͱ', 'ͱ'), ('ͳ', 'ͳ'), ('ͷ', 'ͷ'), ('ͻ', 'ͽ'), + ('ΐ', 'ΐ'), ('ά', 'ώ'), ('ϐ', 'ϑ'), ('ϕ', 'ϗ'), ('ϙ', 'ϙ'), + ('ϛ', 'ϛ'), 
('ϝ', 'ϝ'), ('ϟ', 'ϟ'), ('ϡ', 'ϡ'), ('ϣ', 'ϣ'), + ('ϥ', 'ϥ'), ('ϧ', 'ϧ'), ('ϩ', 'ϩ'), ('ϫ', 'ϫ'), ('ϭ', 'ϭ'), + ('ϯ', 'ϳ'), ('ϵ', 'ϵ'), ('ϸ', 'ϸ'), ('ϻ', 'ϻ'), ('а', 'џ'), + ('ѡ', 'ѡ'), ('ѣ', 'ѣ'), ('ѥ', 'ѥ'), ('ѧ', 'ѧ'), ('ѩ', 'ѩ'), + ('ѫ', 'ѫ'), ('ѭ', 'ѭ'), ('ѯ', 'ѯ'), ('ѱ', 'ѱ'), ('ѳ', 'ѳ'), + ('ѵ', 'ѵ'), ('ѷ', 'ѷ'), ('ѹ', 'ѹ'), ('ѻ', 'ѻ'), ('ѽ', 'ѽ'), + ('ѿ', 'ѿ'), ('ҁ', 'ҁ'), ('ҋ', 'ҋ'), ('ҍ', 'ҍ'), ('ҏ', 'ҏ'), + ('ґ', 'ґ'), ('ғ', 'ғ'), ('ҕ', 'ҕ'), ('җ', 'җ'), ('ҙ', 'ҙ'), + ('қ', 'қ'), ('ҝ', 'ҝ'), ('ҟ', 'ҟ'), ('ҡ', 'ҡ'), ('ң', 'ң'), + ('ҥ', 'ҥ'), ('ҧ', 'ҧ'), ('ҩ', 'ҩ'), ('ҫ', 'ҫ'), ('ҭ', 'ҭ'), + ('ү', 'ү'), ('ұ', 'ұ'), ('ҳ', 'ҳ'), ('ҵ', 'ҵ'), ('ҷ', 'ҷ'), + ('ҹ', 'ҹ'), ('һ', 'һ'), ('ҽ', 'ҽ'), ('ҿ', 'ҿ'), ('ӂ', 'ӂ'), + ('ӄ', 'ӄ'), ('ӆ', 'ӆ'), ('ӈ', 'ӈ'), ('ӊ', 'ӊ'), ('ӌ', 'ӌ'), + ('ӎ', 'ӏ'), ('ӑ', 'ӑ'), ('ӓ', 'ӓ'), ('ӕ', 'ӕ'), ('ӗ', 'ӗ'), + ('ә', 'ә'), ('ӛ', 'ӛ'), ('ӝ', 'ӝ'), ('ӟ', 'ӟ'), ('ӡ', 'ӡ'), + ('ӣ', 'ӣ'), ('ӥ', 'ӥ'), ('ӧ', 'ӧ'), ('ө', 'ө'), ('ӫ', 'ӫ'), + ('ӭ', 'ӭ'), ('ӯ', 'ӯ'), ('ӱ', 'ӱ'), ('ӳ', 'ӳ'), ('ӵ', 'ӵ'), + ('ӷ', 'ӷ'), ('ӹ', 'ӹ'), ('ӻ', 'ӻ'), ('ӽ', 'ӽ'), ('ӿ', 'ӿ'), + ('ԁ', 'ԁ'), ('ԃ', 'ԃ'), ('ԅ', 'ԅ'), ('ԇ', 'ԇ'), ('ԉ', 'ԉ'), + ('ԋ', 'ԋ'), ('ԍ', 'ԍ'), ('ԏ', 'ԏ'), ('ԑ', 'ԑ'), ('ԓ', 'ԓ'), + ('ԕ', 'ԕ'), ('ԗ', 'ԗ'), ('ԙ', 'ԙ'), ('ԛ', 'ԛ'), ('ԝ', 'ԝ'), + ('ԟ', 'ԟ'), ('ԡ', 'ԡ'), ('ԣ', 'ԣ'), ('ԥ', 'ԥ'), ('ԧ', 'ԧ'), + ('ԩ', 'ԩ'), ('ԫ', 'ԫ'), ('ԭ', 'ԭ'), ('ԯ', 'ԯ'), ('ա', 'և'), + ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('ᵹ', 'ᵹ'), ('ᵽ', 'ᵽ'), + ('ḁ', 'ḁ'), ('ḃ', 'ḃ'), ('ḅ', 'ḅ'), ('ḇ', 'ḇ'), + ('ḉ', 'ḉ'), ('ḋ', 'ḋ'), ('ḍ', 'ḍ'), ('ḏ', 'ḏ'), + ('ḑ', 'ḑ'), ('ḓ', 'ḓ'), ('ḕ', 'ḕ'), ('ḗ', 'ḗ'), + ('ḙ', 'ḙ'), ('ḛ', 'ḛ'), ('ḝ', 'ḝ'), ('ḟ', 'ḟ'), + ('ḡ', 'ḡ'), ('ḣ', 'ḣ'), ('ḥ', 'ḥ'), ('ḧ', 'ḧ'), + ('ḩ', 'ḩ'), ('ḫ', 'ḫ'), ('ḭ', 'ḭ'), ('ḯ', 'ḯ'), + ('ḱ', 'ḱ'), ('ḳ', 'ḳ'), ('ḵ', 'ḵ'), ('ḷ', 'ḷ'), + ('ḹ', 'ḹ'), ('ḻ', 'ḻ'), ('ḽ', 'ḽ'), ('ḿ', 'ḿ'), + ('ṁ', 'ṁ'), ('ṃ', 'ṃ'), ('ṅ', 'ṅ'), ('ṇ', 'ṇ'), + ('ṉ', 'ṉ'), ('ṋ', 'ṋ'), ('ṍ', 'ṍ'), ('ṏ', 'ṏ'), + ('ṑ', 'ṑ'), ('ṓ', 'ṓ'), ('ṕ', 'ṕ'), 
('ṗ', 'ṗ'), + ('ṙ', 'ṙ'), ('ṛ', 'ṛ'), ('ṝ', 'ṝ'), ('ṟ', 'ṟ'), + ('ṡ', 'ṡ'), ('ṣ', 'ṣ'), ('ṥ', 'ṥ'), ('ṧ', 'ṧ'), + ('ṩ', 'ṩ'), ('ṫ', 'ṫ'), ('ṭ', 'ṭ'), ('ṯ', 'ṯ'), + ('ṱ', 'ṱ'), ('ṳ', 'ṳ'), ('ṵ', 'ṵ'), ('ṷ', 'ṷ'), + ('ṹ', 'ṹ'), ('ṻ', 'ṻ'), ('ṽ', 'ṽ'), ('ṿ', 'ṿ'), + ('ẁ', 'ẁ'), ('ẃ', 'ẃ'), ('ẅ', 'ẅ'), ('ẇ', 'ẇ'), + ('ẉ', 'ẉ'), ('ẋ', 'ẋ'), ('ẍ', 'ẍ'), ('ẏ', 'ẏ'), + ('ẑ', 'ẑ'), ('ẓ', 'ẓ'), ('ẕ', 'ẛ'), ('ạ', 'ạ'), + ('ả', 'ả'), ('ấ', 'ấ'), ('ầ', 'ầ'), ('ẩ', 'ẩ'), + ('ẫ', 'ẫ'), ('ậ', 'ậ'), ('ắ', 'ắ'), ('ằ', 'ằ'), + ('ẳ', 'ẳ'), ('ẵ', 'ẵ'), ('ặ', 'ặ'), ('ẹ', 'ẹ'), + ('ẻ', 'ẻ'), ('ẽ', 'ẽ'), ('ế', 'ế'), ('ề', 'ề'), + ('ể', 'ể'), ('ễ', 'ễ'), ('ệ', 'ệ'), ('ỉ', 'ỉ'), + ('ị', 'ị'), ('ọ', 'ọ'), ('ỏ', 'ỏ'), ('ố', 'ố'), + ('ồ', 'ồ'), ('ổ', 'ổ'), ('ỗ', 'ỗ'), ('ộ', 'ộ'), + ('ớ', 'ớ'), ('ờ', 'ờ'), ('ở', 'ở'), ('ỡ', 'ỡ'), + ('ợ', 'ợ'), ('ụ', 'ụ'), ('ủ', 'ủ'), ('ứ', 'ứ'), + ('ừ', 'ừ'), ('ử', 'ử'), ('ữ', 'ữ'), ('ự', 'ự'), + ('ỳ', 'ỳ'), ('ỵ', 'ỵ'), ('ỷ', 'ỷ'), ('ỹ', 'ỹ'), + ('ỻ', 'ỻ'), ('ỽ', 'ỽ'), ('ỿ', 'ἇ'), ('ἐ', 'ἕ'), + ('ἠ', 'ἧ'), ('ἰ', 'ἷ'), ('ὀ', 'ὅ'), ('ὐ', 'ὗ'), + ('ὠ', 'ὧ'), ('ὰ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾷ'), + ('ᾼ', 'ᾼ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῇ'), + ('ῌ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'ῗ'), ('ῠ', 'ῧ'), + ('ῲ', 'ῴ'), ('ῶ', 'ῷ'), ('ῼ', 'ῼ'), ('ⅎ', 'ⅎ'), + ('ⅰ', 'ⅿ'), ('ↄ', 'ↄ'), ('ⓐ', 'ⓩ'), ('ⰰ', 'ⱞ'), + ('ⱡ', 'ⱡ'), ('ⱥ', 'ⱦ'), ('ⱨ', 'ⱨ'), ('ⱪ', 'ⱪ'), + ('ⱬ', 'ⱬ'), ('ⱳ', 'ⱳ'), ('ⱶ', 'ⱶ'), ('ⲁ', 'ⲁ'), + ('ⲃ', 'ⲃ'), ('ⲅ', 'ⲅ'), ('ⲇ', 'ⲇ'), ('ⲉ', 'ⲉ'), + ('ⲋ', 'ⲋ'), ('ⲍ', 'ⲍ'), ('ⲏ', 'ⲏ'), ('ⲑ', 'ⲑ'), + ('ⲓ', 'ⲓ'), ('ⲕ', 'ⲕ'), ('ⲗ', 'ⲗ'), ('ⲙ', 'ⲙ'), + ('ⲛ', 'ⲛ'), ('ⲝ', 'ⲝ'), ('ⲟ', 'ⲟ'), ('ⲡ', 'ⲡ'), + ('ⲣ', 'ⲣ'), ('ⲥ', 'ⲥ'), ('ⲧ', 'ⲧ'), ('ⲩ', 'ⲩ'), + ('ⲫ', 'ⲫ'), ('ⲭ', 'ⲭ'), ('ⲯ', 'ⲯ'), ('ⲱ', 'ⲱ'), + ('ⲳ', 'ⲳ'), ('ⲵ', 'ⲵ'), ('ⲷ', 'ⲷ'), ('ⲹ', 'ⲹ'), + ('ⲻ', 'ⲻ'), ('ⲽ', 'ⲽ'), ('ⲿ', 'ⲿ'), ('ⳁ', 'ⳁ'), + ('ⳃ', 'ⳃ'), ('ⳅ', 'ⳅ'), ('ⳇ', 'ⳇ'), ('ⳉ', 'ⳉ'), + ('ⳋ', 'ⳋ'), ('ⳍ', 'ⳍ'), ('ⳏ', 'ⳏ'), ('ⳑ', 'ⳑ'), + ('ⳓ', 'ⳓ'), ('ⳕ', 'ⳕ'), ('ⳗ', 'ⳗ'), ('ⳙ', 'ⳙ'), + ('ⳛ', 'ⳛ'), ('ⳝ', 'ⳝ'), ('ⳟ', 'ⳟ'), 
('ⳡ', 'ⳡ'), + ('ⳣ', 'ⳣ'), ('ⳬ', 'ⳬ'), ('ⳮ', 'ⳮ'), ('ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ꙁ', 'ꙁ'), + ('ꙃ', 'ꙃ'), ('ꙅ', 'ꙅ'), ('ꙇ', 'ꙇ'), ('ꙉ', 'ꙉ'), + ('ꙋ', 'ꙋ'), ('ꙍ', 'ꙍ'), ('ꙏ', 'ꙏ'), ('ꙑ', 'ꙑ'), + ('ꙓ', 'ꙓ'), ('ꙕ', 'ꙕ'), ('ꙗ', 'ꙗ'), ('ꙙ', 'ꙙ'), + ('ꙛ', 'ꙛ'), ('ꙝ', 'ꙝ'), ('ꙟ', 'ꙟ'), ('ꙡ', 'ꙡ'), + ('ꙣ', 'ꙣ'), ('ꙥ', 'ꙥ'), ('ꙧ', 'ꙧ'), ('ꙩ', 'ꙩ'), + ('ꙫ', 'ꙫ'), ('ꙭ', 'ꙭ'), ('ꚁ', 'ꚁ'), ('ꚃ', 'ꚃ'), + ('ꚅ', 'ꚅ'), ('ꚇ', 'ꚇ'), ('ꚉ', 'ꚉ'), ('ꚋ', 'ꚋ'), + ('ꚍ', 'ꚍ'), ('ꚏ', 'ꚏ'), ('ꚑ', 'ꚑ'), ('ꚓ', 'ꚓ'), + ('ꚕ', 'ꚕ'), ('ꚗ', 'ꚗ'), ('ꚙ', 'ꚙ'), ('ꚛ', 'ꚛ'), + ('ꜣ', 'ꜣ'), ('ꜥ', 'ꜥ'), ('ꜧ', 'ꜧ'), ('ꜩ', 'ꜩ'), + ('ꜫ', 'ꜫ'), ('ꜭ', 'ꜭ'), ('ꜯ', 'ꜯ'), ('ꜳ', 'ꜳ'), + ('ꜵ', 'ꜵ'), ('ꜷ', 'ꜷ'), ('ꜹ', 'ꜹ'), ('ꜻ', 'ꜻ'), + ('ꜽ', 'ꜽ'), ('ꜿ', 'ꜿ'), ('ꝁ', 'ꝁ'), ('ꝃ', 'ꝃ'), + ('ꝅ', 'ꝅ'), ('ꝇ', 'ꝇ'), ('ꝉ', 'ꝉ'), ('ꝋ', 'ꝋ'), + ('ꝍ', 'ꝍ'), ('ꝏ', 'ꝏ'), ('ꝑ', 'ꝑ'), ('ꝓ', 'ꝓ'), + ('ꝕ', 'ꝕ'), ('ꝗ', 'ꝗ'), ('ꝙ', 'ꝙ'), ('ꝛ', 'ꝛ'), + ('ꝝ', 'ꝝ'), ('ꝟ', 'ꝟ'), ('ꝡ', 'ꝡ'), ('ꝣ', 'ꝣ'), + ('ꝥ', 'ꝥ'), ('ꝧ', 'ꝧ'), ('ꝩ', 'ꝩ'), ('ꝫ', 'ꝫ'), + ('ꝭ', 'ꝭ'), ('ꝯ', 'ꝯ'), ('ꝺ', 'ꝺ'), ('ꝼ', 'ꝼ'), + ('ꝿ', 'ꝿ'), ('ꞁ', 'ꞁ'), ('ꞃ', 'ꞃ'), ('ꞅ', 'ꞅ'), + ('ꞇ', 'ꞇ'), ('ꞌ', 'ꞌ'), ('ꞑ', 'ꞑ'), ('ꞓ', 'ꞓ'), + ('ꞗ', 'ꞗ'), ('ꞙ', 'ꞙ'), ('ꞛ', 'ꞛ'), ('ꞝ', 'ꞝ'), + ('ꞟ', 'ꞟ'), ('ꞡ', 'ꞡ'), ('ꞣ', 'ꞣ'), ('ꞥ', 'ꞥ'), + ('ꞧ', 'ꞧ'), ('ꞩ', 'ꞩ'), ('ꞵ', 'ꞵ'), ('ꞷ', 'ꞷ'), + ('ꭓ', 'ꭓ'), ('ꭰ', 'ꮿ'), ('ff', 'st'), ('ﬓ', 'ﬗ'), + ('a', 'z'), ('𐐨', '𐑏'), ('𐓘', '𐓻'), ('𐳀', '𐳲'), + ('𑣀', '𑣟'), ('𞤢', '𞥃'), +]; + +pub const DASH: &'static [(char, char)] = &[ + ('-', '-'), ('֊', '֊'), ('־', '־'), ('᐀', '᐀'), ('᠆', '᠆'), + ('‐', '―'), ('⁓', '⁓'), ('⁻', '⁻'), ('₋', '₋'), + ('−', '−'), ('⸗', '⸗'), ('⸚', '⸚'), ('⸺', '⸻'), + ('⹀', '⹀'), ('〜', '〜'), ('〰', '〰'), ('゠', '゠'), + ('︱', '︲'), ('﹘', '﹘'), ('﹣', '﹣'), ('-', '-'), +]; + +pub const DEFAULT_IGNORABLE_CODE_POINT: &'static [(char, char)] = &[ + ('\u{ad}', '\u{ad}'), ('͏', '͏'), ('\u{61c}', '\u{61c}'), ('ᅟ', 'ᅠ'), + ('឴', '឵'), ('᠋', '\u{180e}'), ('\u{200b}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), 
('\u{2060}', '\u{206f}'), ('ㅤ', 'ㅤ'), + ('︀', '️'), ('\u{feff}', '\u{feff}'), ('ᅠ', 'ᅠ'), + ('\u{fff0}', '\u{fff8}'), ('\u{1bca0}', '\u{1bca3}'), + ('\u{1d173}', '\u{1d17a}'), ('\u{e0000}', '\u{e0fff}'), +]; + +pub const DEPRECATED: &'static [(char, char)] = &[ + ('ʼn', 'ʼn'), ('ٳ', 'ٳ'), ('ཷ', 'ཷ'), ('ཹ', 'ཹ'), ('ឣ', 'ឤ'), + ('\u{206a}', '\u{206f}'), ('〈', '〉'), ('\u{e0001}', '\u{e0001}'), +]; + +pub const DIACRITIC: &'static [(char, char)] = &[ + ('^', '^'), ('`', '`'), ('¨', '¨'), ('¯', '¯'), ('´', '´'), + ('·', '¸'), ('ʰ', '͎'), ('͐', '͗'), ('͝', '͢'), ('ʹ', '͵'), + ('ͺ', 'ͺ'), ('΄', '΅'), ('҃', '҇'), ('ՙ', 'ՙ'), ('֑', '֡'), + ('֣', 'ֽ'), ('ֿ', 'ֿ'), ('ׁ', 'ׂ'), ('ׄ', 'ׄ'), ('ً', 'ْ'), + ('ٗ', '٘'), ('۟', '۠'), ('ۥ', 'ۦ'), ('۪', '۬'), ('ܰ', '݊'), + ('ަ', 'ް'), ('߫', 'ߵ'), ('࠘', '࠙'), ('ࣣ', 'ࣾ'), ('़', '़'), + ('्', '्'), ('॑', '॔'), ('ॱ', 'ॱ'), ('়', '়'), + ('্', '্'), ('਼', '਼'), ('੍', '੍'), ('઼', '઼'), + ('્', '્'), ('૽', '૿'), ('଼', '଼'), ('୍', '୍'), + ('்', '்'), ('్', '్'), ('಼', '಼'), ('್', '್'), + ('഻', '഼'), ('്', '്'), ('්', '්'), ('็', '์'), + ('๎', '๎'), ('່', '໌'), ('༘', '༙'), ('༵', '༵'), + ('༷', '༷'), ('༹', '༹'), ('༾', '༿'), ('ྂ', '྄'), + ('྆', '྇'), ('࿆', '࿆'), ('့', '့'), ('္', '်'), + ('ႇ', 'ႍ'), ('ႏ', 'ႏ'), ('ႚ', 'ႛ'), ('៉', '៓'), + ('៝', '៝'), ('᤹', '᤻'), ('᩵', '᩼'), ('᩿', '᩿'), + ('᪰', '᪽'), ('᬴', '᬴'), ('᭄', '᭄'), ('᭫', '᭳'), + ('᮪', '᮫'), ('ᰶ', '᰷'), ('ᱸ', 'ᱽ'), ('᳐', '᳨'), + ('᳭', '᳭'), ('᳴', '᳴'), ('᳷', '᳹'), ('ᴬ', 'ᵪ'), + ('᷄', '᷏'), ('᷵', '᷹'), ('᷽', '᷿'), ('᾽', '᾽'), + ('᾿', '῁'), ('῍', '῏'), ('῝', '῟'), ('῭', '`'), + ('´', '῾'), ('⳯', '⳱'), ('ⸯ', 'ⸯ'), ('〪', '〯'), + ('゙', '゜'), ('ー', 'ー'), ('꙯', '꙯'), ('꙼', '꙽'), + ('ꙿ', 'ꙿ'), ('ꚜ', 'ꚝ'), ('꛰', '꛱'), ('ꜗ', '꜡'), + ('ꞈ', 'ꞈ'), ('ꟸ', 'ꟹ'), ('꣄', '꣄'), ('꣠', '꣱'), + ('꤫', '꤮'), ('꥓', '꥓'), ('꦳', '꦳'), ('꧀', '꧀'), + ('ꧥ', 'ꧥ'), ('ꩻ', 'ꩽ'), ('꪿', 'ꫂ'), ('꫶', '꫶'), + ('꭛', 'ꭟ'), ('꯬', '꯭'), ('ﬞ', 'ﬞ'), ('︠', '︯'), + ('^', '^'), ('`', '`'), ('ー', 'ー'), ('゙', '゚'), + (' ̄', ' ̄'), ('𐋠', 
'𐋠'), ('𐫥', '𐫦'), ('𑂹', '𑂺'), + ('𑄳', '𑄴'), ('𑅳', '𑅳'), ('𑇀', '𑇀'), ('𑇊', '𑇌'), + ('𑈵', '𑈶'), ('𑋩', '𑋪'), ('𑌼', '𑌼'), ('𑍍', '𑍍'), + ('𑍦', '𑍬'), ('𑍰', '𑍴'), ('𑑂', '𑑂'), ('𑑆', '𑑆'), + ('𑓂', '𑓃'), ('𑖿', '𑗀'), ('𑘿', '𑘿'), ('𑚶', '𑚷'), + ('𑜫', '𑜫'), ('𑨴', '𑨴'), ('𑩇', '𑩇'), ('𑪙', '𑪙'), + ('𑰿', '𑰿'), ('𑵂', '𑵂'), ('𑵄', '𑵅'), ('𖫰', '𖫴'), + ('𖾏', '𖾟'), ('𝅧', '𝅩'), ('𝅭', '𝅲'), ('𝅻', '𝆂'), + ('𝆅', '𝆋'), ('𝆪', '𝆭'), ('𞣐', '𞣖'), ('𞥄', '𞥆'), + ('𞥈', '𞥊'), +]; + +pub const EXTENDER: &'static [(char, char)] = &[ + ('·', '·'), ('ː', 'ˑ'), ('ـ', 'ـ'), ('ߺ', 'ߺ'), ('ๆ', 'ๆ'), + ('ໆ', 'ໆ'), ('᠊', '᠊'), ('ᡃ', 'ᡃ'), ('ᪧ', 'ᪧ'), + ('ᰶ', 'ᰶ'), ('ᱻ', 'ᱻ'), ('々', '々'), ('〱', '〵'), + ('ゝ', 'ゞ'), ('ー', 'ヾ'), ('ꀕ', 'ꀕ'), ('ꘌ', 'ꘌ'), + ('ꧏ', 'ꧏ'), ('ꧦ', 'ꧦ'), ('ꩰ', 'ꩰ'), ('ꫝ', 'ꫝ'), + ('ꫳ', 'ꫴ'), ('ー', 'ー'), ('𑍝', '𑍝'), ('𑗆', '𑗈'), + ('𑪘', '𑪘'), ('𖭂', '𖭃'), ('𖿠', '𖿡'), ('𞥄', '𞥆'), +]; + +pub const GRAPHEME_BASE: &'static [(char, char)] = &[ + (' ', '~'), ('\u{a0}', '¬'), ('®', '˿'), ('Ͱ', 'ͷ'), ('ͺ', 'Ϳ'), + ('΄', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', '҂'), ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), ('ՙ', '՟'), ('ա', 'և'), ('։', '֊'), ('֍', '֏'), + ('־', '־'), ('׀', '׀'), ('׃', '׃'), ('׆', '׆'), ('א', 'ת'), + ('װ', '״'), ('؆', '؏'), ('؛', '؛'), ('؞', 'ي'), ('٠', 'ٯ'), + ('ٱ', 'ە'), ('۞', '۞'), ('ۥ', 'ۦ'), ('۩', '۩'), ('ۮ', '܍'), + ('ܐ', 'ܐ'), ('ܒ', 'ܯ'), ('ݍ', 'ޥ'), ('ޱ', 'ޱ'), ('߀', 'ߪ'), + ('ߴ', 'ߺ'), ('ࠀ', 'ࠕ'), ('ࠚ', 'ࠚ'), ('ࠤ', 'ࠤ'), + ('ࠨ', 'ࠨ'), ('࠰', '࠾'), ('ࡀ', 'ࡘ'), ('࡞', '࡞'), + ('ࡠ', 'ࡪ'), ('ࢠ', 'ࢴ'), ('ࢶ', 'ࢽ'), ('ः', 'ह'), + ('ऻ', 'ऻ'), ('ऽ', 'ी'), ('ॉ', 'ौ'), ('ॎ', 'ॐ'), + ('क़', 'ॡ'), ('।', 'ঀ'), ('ং', 'ঃ'), ('অ', 'ঌ'), + ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), + ('শ', 'হ'), ('ঽ', 'ঽ'), ('ি', 'ী'), ('ে', 'ৈ'), + ('ো', 'ৌ'), ('ৎ', 'ৎ'), ('ড়', 'ঢ়'), ('য়', 'ৡ'), + ('০', '৽'), ('ਃ', 'ਃ'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), ('ਾ', 'ੀ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), + ('੦', '੯'), ('ੲ', 'ੴ'), ('ઃ', 'ઃ'), ('અ', 'ઍ'), + ('એ', 'ઑ'), ('ઓ', 'ન'), 
('પ', 'ર'), ('લ', 'ળ'), + ('વ', 'હ'), ('ઽ', 'ી'), ('ૉ', 'ૉ'), ('ો', 'ૌ'), + ('ૐ', 'ૐ'), ('ૠ', 'ૡ'), ('૦', '૱'), ('ૹ', 'ૹ'), + ('ଂ', 'ଃ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), ('ଓ', 'ନ'), + ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'), ('ଽ', 'ଽ'), + ('ୀ', 'ୀ'), ('େ', 'ୈ'), ('ୋ', 'ୌ'), ('ଡ଼', 'ଢ଼'), + ('ୟ', 'ୡ'), ('୦', '୷'), ('ஃ', 'ஃ'), ('அ', 'ஊ'), + ('எ', 'ஐ'), ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), + ('ஞ', 'ட'), ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), + ('ி', 'ி'), ('ு', 'ூ'), ('ெ', 'ை'), ('ொ', 'ௌ'), + ('ௐ', 'ௐ'), ('௦', '௺'), ('ఁ', 'ః'), ('అ', 'ఌ'), + ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), ('ఽ', 'ఽ'), + ('ు', 'ౄ'), ('ౘ', 'ౚ'), ('ౠ', 'ౡ'), ('౦', '౯'), + ('౸', 'ಀ'), ('ಂ', 'ಃ'), ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('ಽ', 'ಾ'), + ('ೀ', 'ು'), ('ೃ', 'ೄ'), ('ೇ', 'ೈ'), ('ೊ', 'ೋ'), + ('ೞ', 'ೞ'), ('ೠ', 'ೡ'), ('೦', '೯'), ('ೱ', 'ೲ'), + ('ം', 'ഃ'), ('അ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ഺ'), + ('ഽ', 'ഽ'), ('ി', 'ീ'), ('െ', 'ൈ'), ('ൊ', 'ൌ'), + ('ൎ', '൏'), ('ൔ', 'ൖ'), ('൘', 'ൡ'), ('൦', 'ൿ'), + ('ං', 'ඃ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), + ('ල', 'ල'), ('ව', 'ෆ'), ('ැ', 'ෑ'), ('ෘ', 'ෞ'), + ('෦', '෯'), ('ෲ', '෴'), ('ก', 'ะ'), ('า', 'ำ'), + ('฿', 'ๆ'), ('๏', '๛'), ('ກ', 'ຂ'), ('ຄ', 'ຄ'), + ('ງ', 'ຈ'), ('ຊ', 'ຊ'), ('ຍ', 'ຍ'), ('ດ', 'ທ'), + ('ນ', 'ຟ'), ('ມ', 'ຣ'), ('ລ', 'ລ'), ('ວ', 'ວ'), + ('ສ', 'ຫ'), ('ອ', 'ະ'), ('າ', 'ຳ'), ('ຽ', 'ຽ'), + ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('໐', '໙'), ('ໜ', 'ໟ'), + ('ༀ', '༗'), ('༚', '༴'), ('༶', '༶'), ('༸', '༸'), + ('༺', 'ཇ'), ('ཉ', 'ཬ'), ('ཿ', 'ཿ'), ('྅', '྅'), + ('ྈ', 'ྌ'), ('྾', '࿅'), ('࿇', '࿌'), ('࿎', '࿚'), + ('က', 'ာ'), ('ေ', 'ေ'), ('း', 'း'), ('ျ', 'ြ'), + ('ဿ', 'ၗ'), ('ၚ', 'ၝ'), ('ၡ', 'ၰ'), ('ၵ', 'ႁ'), + ('ႃ', 'ႄ'), ('ႇ', 'ႌ'), ('ႎ', 'ႜ'), ('႞', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ቈ'), ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኈ'), + ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('፠', '፼'), ('ᎀ', '᎙'), + ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('᐀', '᚜'), ('ᚠ', 'ᛸ'), + ('ᜀ', 'ᜌ'), ('ᜎ', 
'ᜑ'), ('ᜠ', 'ᜱ'), ('᜵', '᜶'), + ('ᝀ', 'ᝑ'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('ក', 'ឳ'), + ('ា', 'ា'), ('ើ', 'ៅ'), ('ះ', 'ៈ'), ('។', 'ៜ'), + ('០', '៩'), ('៰', '៹'), ('᠀', '᠊'), ('᠐', '᠙'), + ('ᠠ', 'ᡷ'), ('ᢀ', 'ᢄ'), ('ᢇ', 'ᢨ'), ('ᢪ', 'ᢪ'), + ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('ᤣ', 'ᤦ'), ('ᤩ', 'ᤫ'), + ('ᤰ', 'ᤱ'), ('ᤳ', 'ᤸ'), ('᥀', '᥀'), ('᥄', 'ᥭ'), + ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧚'), + ('᧞', 'ᨖ'), ('ᨙ', 'ᨚ'), ('᨞', 'ᩕ'), ('ᩗ', 'ᩗ'), + ('ᩡ', 'ᩡ'), ('ᩣ', 'ᩤ'), ('ᩭ', 'ᩲ'), ('᪀', '᪉'), + ('᪐', '᪙'), ('᪠', '᪭'), ('ᬄ', 'ᬳ'), ('ᬵ', 'ᬵ'), + ('ᬻ', 'ᬻ'), ('ᬽ', 'ᭁ'), ('ᭃ', 'ᭋ'), ('᭐', '᭪'), + ('᭴', '᭼'), ('ᮂ', 'ᮡ'), ('ᮦ', 'ᮧ'), ('᮪', '᮪'), + ('ᮮ', 'ᯥ'), ('ᯧ', 'ᯧ'), ('ᯪ', 'ᯬ'), ('ᯮ', 'ᯮ'), + ('᯲', '᯳'), ('᯼', 'ᰫ'), ('ᰴ', 'ᰵ'), ('᰻', '᱉'), + ('ᱍ', 'ᲈ'), ('᳀', '᳇'), ('᳓', '᳓'), ('᳡', '᳡'), + ('ᳩ', 'ᳬ'), ('ᳮ', 'ᳳ'), ('ᳵ', '᳷'), ('ᴀ', 'ᶿ'), + ('Ḁ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ῄ'), ('ῆ', 'ΐ'), + ('ῖ', 'Ί'), ('῝', '`'), ('ῲ', 'ῴ'), ('ῶ', '῾'), + ('\u{2000}', '\u{200a}'), ('‐', '‧'), ('\u{202f}', '\u{205f}'), + ('⁰', 'ⁱ'), ('⁴', '₎'), ('ₐ', 'ₜ'), ('₠', '₿'), + ('℀', '↋'), ('←', '␦'), ('⑀', '⑊'), ('①', '⭳'), + ('⭶', '⮕'), ('⮘', '⮹'), ('⮽', '⯈'), ('⯊', '⯒'), + ('⯬', '⯯'), ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), ('Ⱡ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), ('⳹', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), ('ⵯ', '⵰'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('⸀', '⹉'), + ('⺀', '⺙'), ('⺛', '⻳'), ('⼀', '⿕'), ('⿰', '⿻'), + ('\u{3000}', '〩'), ('〰', '〿'), ('ぁ', 'ゖ'), ('゛', 'ヿ'), + ('ㄅ', 'ㄮ'), ('ㄱ', 'ㆎ'), ('㆐', 'ㆺ'), ('㇀', '㇣'), + ('ㇰ', '㈞'), ('㈠', '㋾'), ('㌀', '䶵'), ('䷀', '鿪'), + ('ꀀ', 'ꒌ'), ('꒐', '꓆'), ('ꓐ', 'ꘫ'), ('Ꙁ', 'ꙮ'), + ('꙳', '꙳'), ('꙾', 'ꚝ'), ('ꚠ', 'ꛯ'), ('꛲', '꛷'), + ('꜀', 'Ɪ'), ('Ʞ', 'ꞷ'), ('ꟷ', 'ꠁ'), ('ꠃ', 'ꠅ'), + ('ꠇ', 'ꠊ'), ('ꠌ', 'ꠤ'), ('ꠧ', '꠫'), ('꠰', '꠹'), + ('ꡀ', '꡷'), ('ꢀ', 'ꣃ'), ('꣎', '꣙'), ('ꣲ', 'ꣽ'), + ('꤀', 'ꤥ'), ('꤮', 'ꥆ'), ('ꥒ', '꥓'), ('꥟', 
'ꥼ'), + ('ꦃ', 'ꦲ'), ('ꦴ', 'ꦵ'), ('ꦺ', 'ꦻ'), ('ꦽ', '꧍'), + ('ꧏ', '꧙'), ('꧞', 'ꧤ'), ('ꧦ', 'ꧾ'), ('ꨀ', 'ꨨ'), + ('ꨯ', 'ꨰ'), ('ꨳ', 'ꨴ'), ('ꩀ', 'ꩂ'), ('ꩄ', 'ꩋ'), + ('ꩍ', 'ꩍ'), ('꩐', '꩙'), ('꩜', 'ꩻ'), ('ꩽ', 'ꪯ'), + ('ꪱ', 'ꪱ'), ('ꪵ', 'ꪶ'), ('ꪹ', 'ꪽ'), ('ꫀ', 'ꫀ'), + ('ꫂ', 'ꫂ'), ('ꫛ', 'ꫫ'), ('ꫮ', 'ꫵ'), ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), + ('ꬰ', 'ꭥ'), ('ꭰ', 'ꯤ'), ('ꯦ', 'ꯧ'), ('ꯩ', '꯬'), + ('꯰', '꯹'), ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), + ('豈', '舘'), ('並', '龎'), ('ff', 'st'), ('ﬓ', 'ﬗ'), + ('יִ', 'יִ'), ('ײַ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), + ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', '﯁'), ('ﯓ', '﴿'), + ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', '﷽'), ('︐', '︙'), + ('︰', '﹒'), ('﹔', '﹦'), ('﹨', '﹫'), ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), ('!', 'ン'), ('ᅠ', 'ᄒ'), ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('¢', '₩'), + ('│', '○'), ('', '�'), ('𐀀', '𐀋'), ('𐀍', '𐀦'), + ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), ('𐁐', '𐁝'), + ('𐂀', '𐃺'), ('𐄀', '𐄂'), ('𐄇', '𐄳'), ('𐄷', '𐆎'), + ('𐆐', '𐆛'), ('𐆠', '𐆠'), ('𐇐', '𐇼'), ('𐊀', '𐊜'), + ('𐊠', '𐋐'), ('𐋡', '𐋻'), ('𐌀', '𐌣'), ('𐌭', '𐍊'), + ('𐍐', '𐍵'), ('𐎀', '𐎝'), ('𐎟', '𐏃'), ('𐏈', '𐏕'), + ('𐐀', '𐒝'), ('𐒠', '𐒩'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), + ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐕯', '𐕯'), ('𐘀', '𐜶'), + ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), + ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), + ('𐡗', '𐢞'), ('𐢧', '𐢯'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), + ('𐣻', '𐤛'), ('𐤟', '𐤹'), ('𐤿', '𐤿'), ('𐦀', '𐦷'), + ('𐦼', '𐧏'), ('𐧒', '𐨀'), ('𐨐', '𐨓'), ('𐨕', '𐨗'), + ('𐨙', '𐨳'), ('𐩀', '𐩇'), ('𐩐', '𐩘'), ('𐩠', '𐪟'), + ('𐫀', '𐫤'), ('𐫫', '𐫶'), ('𐬀', '𐬵'), ('𐬹', '𐭕'), + ('𐭘', '𐭲'), ('𐭸', '𐮑'), ('𐮙', '𐮜'), ('𐮩', '𐮯'), + ('𐰀', '𐱈'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐳺', '𐳿'), + ('𐹠', '𐹾'), ('𑀀', '𑀀'), ('𑀂', '𑀷'), ('𑁇', '𑁍'), + ('𑁒', '𑁯'), ('𑂂', '𑂲'), ('𑂷', '𑂸'), ('𑂻', '𑂼'), + ('𑂾', '𑃁'), ('𑃐', '𑃨'), ('𑃰', '𑃹'), ('𑄃', '𑄦'), + ('𑄬', '𑄬'), ('𑄶', '𑅃'), ('𑅐', '𑅲'), ('𑅴', '𑅶'), + ('𑆂', '𑆵'), ('𑆿', '𑇉'), ('𑇍', '𑇍'), ('𑇐', '𑇟'), + ('𑇡', '𑇴'), ('𑈀', '𑈑'), ('𑈓', '𑈮'), ('𑈲', '𑈳'), + ('𑈵', '𑈵'), ('𑈸', '𑈽'), 
('𑊀', '𑊆'), ('𑊈', '𑊈'), + ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊩'), ('𑊰', '𑋞'), + ('𑋠', '𑋢'), ('𑋰', '𑋹'), ('𑌂', '𑌃'), ('𑌅', '𑌌'), + ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), + ('𑌵', '𑌹'), ('𑌽', '𑌽'), ('𑌿', '𑌿'), ('𑍁', '𑍄'), + ('𑍇', '𑍈'), ('𑍋', '𑍍'), ('𑍐', '𑍐'), ('𑍝', '𑍣'), + ('𑐀', '𑐷'), ('𑑀', '𑑁'), ('𑑅', '𑑅'), ('𑑇', '𑑙'), + ('𑑛', '𑑛'), ('𑑝', '𑑝'), ('𑒀', '𑒯'), ('𑒱', '𑒲'), + ('𑒹', '𑒹'), ('𑒻', '𑒼'), ('𑒾', '𑒾'), ('𑓁', '𑓁'), + ('𑓄', '𑓇'), ('𑓐', '𑓙'), ('𑖀', '𑖮'), ('𑖰', '𑖱'), + ('𑖸', '𑖻'), ('𑖾', '𑖾'), ('𑗁', '𑗛'), ('𑘀', '𑘲'), + ('𑘻', '𑘼'), ('𑘾', '𑘾'), ('𑙁', '𑙄'), ('𑙐', '𑙙'), + ('𑙠', '𑙬'), ('𑚀', '𑚪'), ('𑚬', '𑚬'), ('𑚮', '𑚯'), + ('𑚶', '𑚶'), ('𑛀', '𑛉'), ('𑜀', '𑜙'), ('𑜠', '𑜡'), + ('𑜦', '𑜦'), ('𑜰', '𑜿'), ('𑢠', '𑣲'), ('𑣿', '𑣿'), + ('𑨀', '𑨀'), ('𑨇', '𑨈'), ('𑨋', '𑨲'), ('𑨹', '𑨺'), + ('𑨿', '𑩆'), ('𑩐', '𑩐'), ('𑩗', '𑩘'), ('𑩜', '𑪃'), + ('𑪆', '𑪉'), ('𑪗', '𑪗'), ('𑪚', '𑪜'), ('𑪞', '𑪢'), + ('𑫀', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '𑰯'), ('𑰾', '𑰾'), + ('𑱀', '𑱅'), ('𑱐', '𑱬'), ('𑱰', '𑲏'), ('𑲩', '𑲩'), + ('𑲱', '𑲱'), ('𑲴', '𑲴'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), + ('𑴋', '𑴰'), ('𑵆', '𑵆'), ('𑵐', '𑵙'), ('𒀀', '𒎙'), + ('𒐀', '𒑮'), ('𒑰', '𒑴'), ('𒒀', '𒕃'), ('𓀀', '𓐮'), + ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), ('𖩠', '𖩩'), + ('𖩮', '𖩯'), ('𖫐', '𖫭'), ('𖫵', '𖫵'), ('𖬀', '𖬯'), + ('𖬷', '𖭅'), ('𖭐', '𖭙'), ('𖭛', '𖭡'), ('𖭣', '𖭷'), + ('𖭽', '𖮏'), ('𖼀', '𖽄'), ('𖽐', '𖽾'), ('𖾓', '𖾟'), + ('𖿠', '𖿡'), ('𗀀', '𘟬'), ('𘠀', '𘫲'), ('𛀀', '𛄞'), + ('𛅰', '𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), + ('𛲐', '𛲙'), ('𛲜', '𛲜'), ('𛲟', '𛲟'), ('𝀀', '𝃵'), + ('𝄀', '𝄦'), ('𝄩', '𝅘𝅥𝅲'), ('𝅦', '𝅦'), ('𝅪', '𝅭'), + ('𝆃', '𝆄'), ('𝆌', '𝆩'), ('𝆮', '𝇨'), ('𝈀', '𝉁'), + ('𝉅', '𝉅'), ('𝌀', '𝍖'), ('𝍠', '𝍱'), ('𝐀', '𝑔'), + ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), + ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), + ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), + ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), + ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝟋'), ('𝟎', '𝧿'), + ('𝨷', '𝨺'), ('𝩭', '𝩴'), ('𝩶', '𝪃'), ('𝪅', '𝪋'), + ('𞠀', '𞣄'), ('𞣇', '𞣏'), ('𞤀', '𞥃'), ('𞥐', '𞥙'), + ('𞥞', '𞥟'), ('𞸀', 
'𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), + ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), + ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), + ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), + ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), + ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), + ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), + ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), + ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𞻰', '𞻱'), ('🀀', '🀫'), + ('🀰', '🂓'), ('🂠', '🂮'), ('🂱', '🂿'), ('🃁', '🃏'), + ('🃑', '🃵'), ('🄀', '🄌'), ('🄐', '🄮'), ('🄰', '🅫'), + ('🅰', '🆬'), ('🇦', '🈂'), ('🈐', '🈻'), ('🉀', '🉈'), + ('🉐', '🉑'), ('🉠', '🉥'), ('🌀', '🛔'), ('🛠', '🛬'), + ('🛰', '🛸'), ('🜀', '🝳'), ('🞀', '🟔'), ('🠀', '🠋'), + ('🠐', '🡇'), ('🡐', '🡙'), ('🡠', '🢇'), ('🢐', '🢭'), + ('🤀', '🤋'), ('🤐', '🤾'), ('🥀', '🥌'), ('🥐', '🥫'), + ('🦀', '🦗'), ('🧀', '🧀'), ('🧐', '🧦'), ('𠀀', '𪛖'), + ('𪜀', '𫜴'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), + ('丽', '𪘀'), +]; + +pub const GRAPHEME_EXTEND: &'static [(char, char)] = &[ + ('̀', 'ͯ'), ('҃', '҉'), ('֑', 'ֽ'), ('ֿ', 'ֿ'), ('ׁ', 'ׂ'), + ('ׄ', 'ׅ'), ('ׇ', 'ׇ'), ('ؐ', 'ؚ'), ('ً', 'ٟ'), ('ٰ', 'ٰ'), + ('ۖ', 'ۜ'), ('۟', 'ۤ'), ('ۧ', 'ۨ'), ('۪', 'ۭ'), ('ܑ', 'ܑ'), + ('ܰ', '݊'), ('ަ', 'ް'), ('߫', '߳'), ('ࠖ', '࠙'), ('ࠛ', 'ࠣ'), + ('ࠥ', 'ࠧ'), ('ࠩ', '࠭'), ('࡙', '࡛'), ('ࣔ', '࣡'), + ('ࣣ', 'ं'), ('ऺ', 'ऺ'), ('़', '़'), ('ु', 'ै'), + ('्', '्'), ('॑', 'ॗ'), ('ॢ', 'ॣ'), ('ঁ', 'ঁ'), + ('়', '়'), ('া', 'া'), ('ু', 'ৄ'), ('্', '্'), + ('ৗ', 'ৗ'), ('ৢ', 'ৣ'), ('ਁ', 'ਂ'), ('਼', '਼'), + ('ੁ', 'ੂ'), ('ੇ', 'ੈ'), ('ੋ', '੍'), ('ੑ', 'ੑ'), + ('ੰ', 'ੱ'), ('ੵ', 'ੵ'), ('ઁ', 'ં'), ('઼', '઼'), + ('ુ', 'ૅ'), ('ે', 'ૈ'), ('્', '્'), ('ૢ', 'ૣ'), + ('ૺ', '૿'), ('ଁ', 'ଁ'), ('଼', '଼'), ('ା', 'ି'), + ('ୁ', 'ୄ'), ('୍', '୍'), ('ୖ', 'ୗ'), ('ୢ', 'ୣ'), + ('ஂ', 'ஂ'), ('ா', 'ா'), ('ீ', 'ீ'), ('்', '்'), + ('ௗ', 'ௗ'), ('ఀ', 'ఀ'), ('ా', 'ీ'), ('ె', 'ై'), + ('ొ', '్'), ('ౕ', 'ౖ'), ('ౢ', 'ౣ'), ('ಁ', 'ಁ'), + ('಼', '಼'), ('ಿ', 'ಿ'), ('ೂ', 'ೂ'), ('ೆ', 'ೆ'), + ('ೌ', '್'), ('ೕ', 'ೖ'), ('ೢ', 'ೣ'), ('ഀ', 'ഁ'), + ('഻', '഼'), ('ാ', 'ാ'), ('ു', 'ൄ'), ('്', 
'്'), + ('ൗ', 'ൗ'), ('ൢ', 'ൣ'), ('්', '්'), ('ා', 'ා'), + ('ි', 'ු'), ('ූ', 'ූ'), ('ෟ', 'ෟ'), ('ั', 'ั'), + ('ิ', 'ฺ'), ('็', '๎'), ('ັ', 'ັ'), ('ິ', 'ູ'), + ('ົ', 'ຼ'), ('່', 'ໍ'), ('༘', '༙'), ('༵', '༵'), + ('༷', '༷'), ('༹', '༹'), ('ཱ', 'ཾ'), ('ྀ', '྄'), + ('྆', '྇'), ('ྍ', 'ྗ'), ('ྙ', 'ྼ'), ('࿆', '࿆'), + ('ိ', 'ူ'), ('ဲ', '့'), ('္', '်'), ('ွ', 'ှ'), + ('ၘ', 'ၙ'), ('ၞ', 'ၠ'), ('ၱ', 'ၴ'), ('ႂ', 'ႂ'), + ('ႅ', 'ႆ'), ('ႍ', 'ႍ'), ('ႝ', 'ႝ'), ('፝', '፟'), + ('ᜒ', '᜔'), ('ᜲ', '᜴'), ('ᝒ', 'ᝓ'), ('ᝲ', 'ᝳ'), + ('឴', '឵'), ('ិ', 'ួ'), ('ំ', 'ំ'), ('៉', '៓'), + ('៝', '៝'), ('᠋', '᠍'), ('ᢅ', 'ᢆ'), ('ᢩ', 'ᢩ'), + ('ᤠ', 'ᤢ'), ('ᤧ', 'ᤨ'), ('ᤲ', 'ᤲ'), ('᤹', '᤻'), + ('ᨗ', 'ᨘ'), ('ᨛ', 'ᨛ'), ('ᩖ', 'ᩖ'), ('ᩘ', 'ᩞ'), + ('᩠', '᩠'), ('ᩢ', 'ᩢ'), ('ᩥ', 'ᩬ'), ('ᩳ', '᩼'), + ('᩿', '᩿'), ('᪰', '᪾'), ('ᬀ', 'ᬃ'), ('᬴', '᬴'), + ('ᬶ', 'ᬺ'), ('ᬼ', 'ᬼ'), ('ᭂ', 'ᭂ'), ('᭫', '᭳'), + ('ᮀ', 'ᮁ'), ('ᮢ', 'ᮥ'), ('ᮨ', 'ᮩ'), ('᮫', 'ᮭ'), + ('᯦', '᯦'), ('ᯨ', 'ᯩ'), ('ᯭ', 'ᯭ'), ('ᯯ', 'ᯱ'), + ('ᰬ', 'ᰳ'), ('ᰶ', '᰷'), ('᳐', '᳒'), ('᳔', '᳠'), + ('᳢', '᳨'), ('᳭', '᳭'), ('᳴', '᳴'), ('᳸', '᳹'), + ('᷀', '᷹'), ('᷻', '᷿'), ('\u{200c}', '\u{200c}'), ('⃐', '⃰'), + ('⳯', '⳱'), ('⵿', '⵿'), ('ⷠ', 'ⷿ'), ('〪', '〯'), + ('゙', '゚'), ('꙯', '꙲'), ('ꙴ', '꙽'), ('ꚞ', 'ꚟ'), + ('꛰', '꛱'), ('ꠂ', 'ꠂ'), ('꠆', '꠆'), ('ꠋ', 'ꠋ'), + ('ꠥ', 'ꠦ'), ('꣄', 'ꣅ'), ('꣠', '꣱'), ('ꤦ', '꤭'), + ('ꥇ', 'ꥑ'), ('ꦀ', 'ꦂ'), ('꦳', '꦳'), ('ꦶ', 'ꦹ'), + ('ꦼ', 'ꦼ'), ('ꧥ', 'ꧥ'), ('ꨩ', 'ꨮ'), ('ꨱ', 'ꨲ'), + ('ꨵ', 'ꨶ'), ('ꩃ', 'ꩃ'), ('ꩌ', 'ꩌ'), ('ꩼ', 'ꩼ'), + ('ꪰ', 'ꪰ'), ('ꪲ', 'ꪴ'), ('ꪷ', 'ꪸ'), ('ꪾ', '꪿'), + ('꫁', '꫁'), ('ꫬ', 'ꫭ'), ('꫶', '꫶'), ('ꯥ', 'ꯥ'), + ('ꯨ', 'ꯨ'), ('꯭', '꯭'), ('ﬞ', 'ﬞ'), ('︀', '️'), + ('︠', '︯'), ('゙', '゚'), ('𐇽', '𐇽'), ('𐋠', '𐋠'), + ('𐍶', '𐍺'), ('𐨁', '𐨃'), ('𐨅', '𐨆'), ('𐨌', '𐨏'), + ('𐨸', '𐨺'), ('𐨿', '𐨿'), ('𐫥', '𐫦'), ('𑀁', '𑀁'), + ('𑀸', '𑁆'), ('𑁿', '𑂁'), ('𑂳', '𑂶'), ('𑂹', '𑂺'), + ('𑄀', '𑄂'), ('𑄧', '𑄫'), ('𑄭', '𑄴'), ('𑅳', '𑅳'), + ('𑆀', '𑆁'), ('𑆶', '𑆾'), ('𑇊', '𑇌'), ('𑈯', '𑈱'), + ('𑈴', '𑈴'), ('𑈶', '𑈷'), ('𑈾', '𑈾'), ('𑋟', '𑋟'), + ('𑋣', '𑋪'), ('𑌀', '𑌁'), 
('𑌼', '𑌼'), ('𑌾', '𑌾'), + ('𑍀', '𑍀'), ('𑍗', '𑍗'), ('𑍦', '𑍬'), ('𑍰', '𑍴'), + ('𑐸', '𑐿'), ('𑑂', '𑑄'), ('𑑆', '𑑆'), ('𑒰', '𑒰'), + ('𑒳', '𑒸'), ('𑒺', '𑒺'), ('𑒽', '𑒽'), ('𑒿', '𑓀'), + ('𑓂', '𑓃'), ('𑖯', '𑖯'), ('𑖲', '𑖵'), ('𑖼', '𑖽'), + ('𑖿', '𑗀'), ('𑗜', '𑗝'), ('𑘳', '𑘺'), ('𑘽', '𑘽'), + ('𑘿', '𑙀'), ('𑚫', '𑚫'), ('𑚭', '𑚭'), ('𑚰', '𑚵'), + ('𑚷', '𑚷'), ('𑜝', '𑜟'), ('𑜢', '𑜥'), ('𑜧', '𑜫'), + ('𑨁', '𑨆'), ('𑨉', '𑨊'), ('𑨳', '𑨸'), ('𑨻', '𑨾'), + ('𑩇', '𑩇'), ('𑩑', '𑩖'), ('𑩙', '𑩛'), ('𑪊', '𑪖'), + ('𑪘', '𑪙'), ('𑰰', '𑰶'), ('𑰸', '𑰽'), ('𑰿', '𑰿'), + ('𑲒', '𑲧'), ('𑲪', '𑲰'), ('𑲲', '𑲳'), ('𑲵', '𑲶'), + ('𑴱', '𑴶'), ('𑴺', '𑴺'), ('𑴼', '𑴽'), ('𑴿', '𑵅'), + ('𑵇', '𑵇'), ('𖫰', '𖫴'), ('𖬰', '𖬶'), ('𖾏', '𖾒'), + ('𛲝', '𛲞'), ('𝅥', '𝅥'), ('𝅧', '𝅩'), ('𝅮', '𝅲'), + ('𝅻', '𝆂'), ('𝆅', '𝆋'), ('𝆪', '𝆭'), ('𝉂', '𝉄'), + ('𝨀', '𝨶'), ('𝨻', '𝩬'), ('𝩵', '𝩵'), ('𝪄', '𝪄'), + ('𝪛', '𝪟'), ('𝪡', '𝪯'), ('𞀀', '𞀆'), ('𞀈', '𞀘'), + ('𞀛', '𞀡'), ('𞀣', '𞀤'), ('𞀦', '𞀪'), ('𞣐', '𞣖'), + ('𞥄', '𞥊'), ('\u{e0020}', '\u{e007f}'), ('󠄀', '󠇯'), +]; + +pub const GRAPHEME_LINK: &'static [(char, char)] = &[ + ('्', '्'), ('্', '্'), ('੍', '੍'), ('્', '્'), + ('୍', '୍'), ('்', '்'), ('్', '్'), ('್', '್'), + ('഻', '഼'), ('്', '്'), ('්', '්'), ('ฺ', 'ฺ'), + ('྄', '྄'), ('္', '်'), ('᜔', '᜔'), ('᜴', '᜴'), + ('្', '្'), ('᩠', '᩠'), ('᭄', '᭄'), ('᮪', '᮫'), + ('᯲', '᯳'), ('⵿', '⵿'), ('꠆', '꠆'), ('꣄', '꣄'), + ('꥓', '꥓'), ('꧀', '꧀'), ('꫶', '꫶'), ('꯭', '꯭'), + ('𐨿', '𐨿'), ('𑁆', '𑁆'), ('𑁿', '𑁿'), ('𑂹', '𑂹'), + ('𑄳', '𑄴'), ('𑇀', '𑇀'), ('𑈵', '𑈵'), ('𑋪', '𑋪'), + ('𑍍', '𑍍'), ('𑑂', '𑑂'), ('𑓂', '𑓂'), ('𑖿', '𑖿'), + ('𑘿', '𑘿'), ('𑚶', '𑚶'), ('𑜫', '𑜫'), ('𑨴', '𑨴'), + ('𑩇', '𑩇'), ('𑪙', '𑪙'), ('𑰿', '𑰿'), ('𑵄', '𑵅'), +]; + +pub const HEX_DIGIT: &'static [(char, char)] = &[ + ('0', '9'), ('A', 'F'), ('a', 'f'), ('0', '9'), ('A', 'F'), + ('a', 'f'), +]; + +pub const HYPHEN: &'static [(char, char)] = &[ + ('-', '-'), ('\u{ad}', '\u{ad}'), ('֊', '֊'), ('᠆', '᠆'), + ('‐', '‑'), ('⸗', '⸗'), ('・', '・'), ('﹣', '﹣'), + ('-', '-'), ('・', '・'), +]; + +pub const IDS_BINARY_OPERATOR: 
&'static [(char, char)] = &[ + ('⿰', '⿱'), ('⿴', '⿻'), +]; + +pub const IDS_TRINARY_OPERATOR: &'static [(char, char)] = &[ + ('⿲', '⿳'), +]; + +pub const ID_CONTINUE: &'static [(char, char)] = &[ + ('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), + ('·', '·'), ('º', 'º'), ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), + ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('̀', 'ʹ'), + ('Ͷ', 'ͷ'), ('ͺ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ί'), ('Ό', 'Ό'), + ('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('҃', '҇'), ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), ('ՙ', 'ՙ'), ('ա', 'և'), ('֑', 'ֽ'), ('ֿ', 'ֿ'), + ('ׁ', 'ׂ'), ('ׄ', 'ׅ'), ('ׇ', 'ׇ'), ('א', 'ת'), ('װ', 'ײ'), + ('ؐ', 'ؚ'), ('ؠ', '٩'), ('ٮ', 'ۓ'), ('ە', 'ۜ'), ('۟', 'ۨ'), + ('۪', 'ۼ'), ('ۿ', 'ۿ'), ('ܐ', '݊'), ('ݍ', 'ޱ'), ('߀', 'ߵ'), + ('ߺ', 'ߺ'), ('ࠀ', '࠭'), ('ࡀ', '࡛'), ('ࡠ', 'ࡪ'), + ('ࢠ', 'ࢴ'), ('ࢶ', 'ࢽ'), ('ࣔ', '࣡'), ('ࣣ', 'ॣ'), + ('०', '९'), ('ॱ', 'ঃ'), ('অ', 'ঌ'), ('এ', 'ঐ'), + ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), + ('়', 'ৄ'), ('ে', 'ৈ'), ('ো', 'ৎ'), ('ৗ', 'ৗ'), + ('ড়', 'ঢ়'), ('য়', 'ৣ'), ('০', 'ৱ'), ('ৼ', 'ৼ'), + ('ਁ', 'ਃ'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), + ('਼', '਼'), ('ਾ', 'ੂ'), ('ੇ', 'ੈ'), ('ੋ', '੍'), + ('ੑ', 'ੑ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('੦', 'ੵ'), + ('ઁ', 'ઃ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), + ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('઼', 'ૅ'), + ('ે', 'ૉ'), ('ો', '્'), ('ૐ', 'ૐ'), ('ૠ', 'ૣ'), + ('૦', '૯'), ('ૹ', '૿'), ('ଁ', 'ଃ'), ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), ('଼', 'ୄ'), ('େ', 'ୈ'), ('ୋ', '୍'), + ('ୖ', 'ୗ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୣ'), ('୦', '୯'), + ('ୱ', 'ୱ'), ('ஂ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), + ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), + ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('ா', 'ூ'), + ('ெ', 'ை'), ('ொ', '்'), ('ௐ', 'ௐ'), ('ௗ', 'ௗ'), + ('௦', '௯'), ('ఀ', 'ః'), ('అ', 'ఌ'), ('ఎ', 'ఐ'), + ('ఒ', 'న'), ('ప', 'హ'), ('ఽ', 'ౄ'), ('ె', 'ై'), + ('ొ', '్'), ('ౕ', 'ౖ'), ('ౘ', 'ౚ'), ('ౠ', 'ౣ'), + ('౦', '౯'), ('ಀ', 
'ಃ'), ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('಼', 'ೄ'), + ('ೆ', 'ೈ'), ('ೊ', '್'), ('ೕ', 'ೖ'), ('ೞ', 'ೞ'), + ('ೠ', 'ೣ'), ('೦', '೯'), ('ೱ', 'ೲ'), ('ഀ', 'ഃ'), + ('അ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ൄ'), ('െ', 'ൈ'), + ('ൊ', 'ൎ'), ('ൔ', 'ൗ'), ('ൟ', 'ൣ'), ('൦', '൯'), + ('ൺ', 'ൿ'), ('ං', 'ඃ'), ('අ', 'ඖ'), ('ක', 'න'), + ('ඳ', 'ර'), ('ල', 'ල'), ('ව', 'ෆ'), ('්', '්'), + ('ා', 'ු'), ('ූ', 'ූ'), ('ෘ', 'ෟ'), ('෦', '෯'), + ('ෲ', 'ෳ'), ('ก', 'ฺ'), ('เ', '๎'), ('๐', '๙'), + ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ງ', 'ຈ'), ('ຊ', 'ຊ'), + ('ຍ', 'ຍ'), ('ດ', 'ທ'), ('ນ', 'ຟ'), ('ມ', 'ຣ'), + ('ລ', 'ລ'), ('ວ', 'ວ'), ('ສ', 'ຫ'), ('ອ', 'ູ'), + ('ົ', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('່', 'ໍ'), + ('໐', '໙'), ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('༘', '༙'), + ('༠', '༩'), ('༵', '༵'), ('༷', '༷'), ('༹', '༹'), + ('༾', 'ཇ'), ('ཉ', 'ཬ'), ('ཱ', '྄'), ('྆', 'ྗ'), + ('ྙ', 'ྼ'), ('࿆', '࿆'), ('က', '၉'), ('ၐ', 'ႝ'), + ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), + ('ჼ', 'ቈ'), ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), + ('፝', '፟'), ('፩', '፱'), ('ᎀ', 'ᎏ'), ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), ('ᜀ', 'ᜌ'), ('ᜎ', '᜔'), + ('ᜠ', '᜴'), ('ᝀ', 'ᝓ'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), + ('ᝲ', 'ᝳ'), ('ក', '៓'), ('ៗ', 'ៗ'), ('ៜ', '៝'), + ('០', '៩'), ('᠋', '᠍'), ('᠐', '᠙'), ('ᠠ', 'ᡷ'), + ('ᢀ', 'ᢪ'), ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('ᤠ', 'ᤫ'), + ('ᤰ', '᤻'), ('᥆', 'ᥭ'), ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), ('᧐', '᧚'), ('ᨀ', 'ᨛ'), ('ᨠ', 'ᩞ'), + ('᩠', '᩼'), ('᩿', '᪉'), ('᪐', '᪙'), ('ᪧ', 'ᪧ'), + ('᪰', '᪽'), ('ᬀ', 'ᭋ'), ('᭐', '᭙'), ('᭫', '᭳'), + ('ᮀ', '᯳'), ('ᰀ', '᰷'), ('᱀', '᱉'), ('ᱍ', 'ᱽ'), + ('ᲀ', 'ᲈ'), ('᳐', '᳒'), ('᳔', '᳹'), ('ᴀ', '᷹'), + ('᷻', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), ('ι', 'ι'), + ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), ('ῲ', 
'ῴ'), ('ῶ', 'ῼ'), ('‿', '⁀'), + ('⁔', '⁔'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), + ('⃐', '⃜'), ('⃡', '⃡'), ('⃥', '⃰'), ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('℘', 'ℝ'), + ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℹ'), + ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), + ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), ('Ⱡ', 'ⳤ'), ('Ⳬ', 'ⳳ'), + ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), ('⵿', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('ⷠ', 'ⷿ'), ('々', '〇'), + ('〡', '〯'), ('〱', '〵'), ('〸', '〼'), ('ぁ', 'ゖ'), + ('゙', 'ゟ'), ('ァ', 'ヺ'), ('ー', 'ヿ'), ('ㄅ', 'ㄮ'), + ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆺ'), ('ㇰ', 'ㇿ'), ('㐀', '䶵'), + ('一', '鿪'), ('ꀀ', 'ꒌ'), ('ꓐ', 'ꓽ'), ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘫ'), ('Ꙁ', '꙯'), ('ꙴ', '꙽'), ('ꙿ', '꛱'), + ('ꜗ', 'ꜟ'), ('Ꜣ', 'ꞈ'), ('Ꞌ', 'Ɪ'), ('Ʞ', 'ꞷ'), + ('ꟷ', 'ꠧ'), ('ꡀ', 'ꡳ'), ('ꢀ', 'ꣅ'), ('꣐', '꣙'), + ('꣠', 'ꣷ'), ('ꣻ', 'ꣻ'), ('ꣽ', 'ꣽ'), ('꤀', '꤭'), + ('ꤰ', '꥓'), ('ꥠ', 'ꥼ'), ('ꦀ', '꧀'), ('ꧏ', '꧙'), + ('ꧠ', 'ꧾ'), ('ꨀ', 'ꨶ'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), + ('ꩠ', 'ꩶ'), ('ꩺ', 'ꫂ'), ('ꫛ', 'ꫝ'), ('ꫠ', 'ꫯ'), + ('ꫲ', '꫶'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭥ'), + ('ꭰ', 'ꯪ'), ('꯬', '꯭'), ('꯰', '꯹'), ('가', '힣'), + ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('豈', '舘'), ('並', '龎'), + ('ff', 'st'), ('ﬓ', 'ﬗ'), ('יִ', 'ﬨ'), ('שׁ', 'זּ'), + ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), ('ﯓ', 'ﴽ'), ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), ('︀', '️'), ('︠', '︯'), ('︳', '︴'), + ('﹍', '﹏'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), ('0', '9'), + ('A', 'Z'), ('_', '_'), ('a', 'z'), ('ヲ', 'ᄒ'), + ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), + ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐅀', '𐅴'), + ('𐇽', '𐇽'), ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('𐋠', '𐋠'), + ('𐌀', '𐌟'), ('𐌭', '𐍊'), ('𐍐', '𐍺'), ('𐎀', '𐎝'), + ('𐎠', '𐏃'), ('𐏈', '𐏏'), ('𐏑', '𐏕'), ('𐐀', '𐒝'), + ('𐒠', '𐒩'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐔀', '𐔧'), + ('𐔰', '𐕣'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), + 
('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), + ('𐠼', '𐠼'), ('𐠿', '𐡕'), ('𐡠', '𐡶'), ('𐢀', '𐢞'), + ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐤀', '𐤕'), ('𐤠', '𐤹'), + ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𐨀', '𐨃'), ('𐨅', '𐨆'), + ('𐨌', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨳'), ('𐨸', '𐨺'), + ('𐨿', '𐨿'), ('𐩠', '𐩼'), ('𐪀', '𐪜'), ('𐫀', '𐫇'), + ('𐫉', '𐫦'), ('𐬀', '𐬵'), ('𐭀', '𐭕'), ('𐭠', '𐭲'), + ('𐮀', '𐮑'), ('𐰀', '𐱈'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), + ('𑀀', '𑁆'), ('𑁦', '𑁯'), ('𑁿', '𑂺'), ('𑃐', '𑃨'), + ('𑃰', '𑃹'), ('𑄀', '𑄴'), ('𑄶', '𑄿'), ('𑅐', '𑅳'), + ('𑅶', '𑅶'), ('𑆀', '𑇄'), ('𑇊', '𑇌'), ('𑇐', '𑇚'), + ('𑇜', '𑇜'), ('𑈀', '𑈑'), ('𑈓', '𑈷'), ('𑈾', '𑈾'), + ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), + ('𑊟', '𑊨'), ('𑊰', '𑋪'), ('𑋰', '𑋹'), ('𑌀', '𑌃'), + ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), + ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('𑌼', '𑍄'), ('𑍇', '𑍈'), + ('𑍋', '𑍍'), ('𑍐', '𑍐'), ('𑍗', '𑍗'), ('𑍝', '𑍣'), + ('𑍦', '𑍬'), ('𑍰', '𑍴'), ('𑐀', '𑑊'), ('𑑐', '𑑙'), + ('𑒀', '𑓅'), ('𑓇', '𑓇'), ('𑓐', '𑓙'), ('𑖀', '𑖵'), + ('𑖸', '𑗀'), ('𑗘', '𑗝'), ('𑘀', '𑙀'), ('𑙄', '𑙄'), + ('𑙐', '𑙙'), ('𑚀', '𑚷'), ('𑛀', '𑛉'), ('𑜀', '𑜙'), + ('𑜝', '𑜫'), ('𑜰', '𑜹'), ('𑢠', '𑣩'), ('𑣿', '𑣿'), + ('𑨀', '𑨾'), ('𑩇', '𑩇'), ('𑩐', '𑪃'), ('𑪆', '𑪙'), + ('𑫀', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '𑰶'), ('𑰸', '𑱀'), + ('𑱐', '𑱙'), ('𑱲', '𑲏'), ('𑲒', '𑲧'), ('𑲩', '𑲶'), + ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '𑴶'), ('𑴺', '𑴺'), + ('𑴼', '𑴽'), ('𑴿', '𑵇'), ('𑵐', '𑵙'), ('𒀀', '𒎙'), + ('𒐀', '𒑮'), ('𒒀', '𒕃'), ('𓀀', '𓐮'), ('𔐀', '𔙆'), + ('𖠀', '𖨸'), ('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖫐', '𖫭'), + ('𖫰', '𖫴'), ('𖬀', '𖬶'), ('𖭀', '𖭃'), ('𖭐', '𖭙'), + ('𖭣', '𖭷'), ('𖭽', '𖮏'), ('𖼀', '𖽄'), ('𖽐', '𖽾'), + ('𖾏', '𖾟'), ('𖿠', '𖿡'), ('𗀀', '𘟬'), ('𘠀', '𘫲'), + ('𛀀', '𛄞'), ('𛅰', '𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), + ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𛲝', '𛲞'), ('𝅥', '𝅩'), + ('𝅭', '𝅲'), ('𝅻', '𝆂'), ('𝆅', '𝆋'), ('𝆪', '𝆭'), + ('𝉂', '𝉄'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), + ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), + ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), + ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), + ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), + 
('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), + ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), + ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𝟎', '𝟿'), + ('𝨀', '𝨶'), ('𝨻', '𝩬'), ('𝩵', '𝩵'), ('𝪄', '𝪄'), + ('𝪛', '𝪟'), ('𝪡', '𝪯'), ('𞀀', '𞀆'), ('𞀈', '𞀘'), + ('𞀛', '𞀡'), ('𞀣', '𞀤'), ('𞀦', '𞀪'), ('𞠀', '𞣄'), + ('𞣐', '𞣖'), ('𞤀', '𞥊'), ('𞥐', '𞥙'), ('𞸀', '𞸃'), + ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), + ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), + ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), + ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), + ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), + ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), + ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), + ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), + ('𠀀', '𪛖'), ('𪜀', '𫜴'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), + ('𬺰', '𮯠'), ('丽', '𪘀'), ('󠄀', '󠇯'), +]; + +pub const ID_START: &'static [(char, char)] = &[ + ('A', 'Z'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('º', 'º'), + ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('Ͱ', 'ʹ'), ('Ͷ', 'ͷ'), ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('Ҋ', 'ԯ'), ('Ա', 'Ֆ'), ('ՙ', 'ՙ'), + ('ա', 'և'), ('א', 'ת'), ('װ', 'ײ'), ('ؠ', 'ي'), ('ٮ', 'ٯ'), + ('ٱ', 'ۓ'), ('ە', 'ە'), ('ۥ', 'ۦ'), ('ۮ', 'ۯ'), ('ۺ', 'ۼ'), + ('ۿ', 'ۿ'), ('ܐ', 'ܐ'), ('ܒ', 'ܯ'), ('ݍ', 'ޥ'), ('ޱ', 'ޱ'), + ('ߊ', 'ߪ'), ('ߴ', 'ߵ'), ('ߺ', 'ߺ'), ('ࠀ', 'ࠕ'), ('ࠚ', 'ࠚ'), + ('ࠤ', 'ࠤ'), ('ࠨ', 'ࠨ'), ('ࡀ', 'ࡘ'), ('ࡠ', 'ࡪ'), + ('ࢠ', 'ࢴ'), ('ࢶ', 'ࢽ'), ('ऄ', 'ह'), ('ऽ', 'ऽ'), + ('ॐ', 'ॐ'), ('क़', 'ॡ'), ('ॱ', 'ঀ'), ('অ', 'ঌ'), + ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), + ('শ', 'হ'), ('ঽ', 'ঽ'), ('ৎ', 'ৎ'), ('ড়', 'ঢ়'), + ('য়', 'ৡ'), ('ৰ', 'ৱ'), ('ৼ', 'ৼ'), ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), + ('ੲ', 'ੴ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), + ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('ઽ', 'ઽ'), + ('ૐ', 'ૐ'), ('ૠ', 'ૡ'), ('ૹ', 'ૹ'), 
('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), ('ଽ', 'ଽ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୡ'), + ('ୱ', 'ୱ'), ('ஃ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), + ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), + ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('ௐ', 'ௐ'), + ('అ', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), + ('ఽ', 'ఽ'), ('ౘ', 'ౚ'), ('ౠ', 'ౡ'), ('ಀ', 'ಀ'), + ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), ('ಽ', 'ಽ'), ('ೞ', 'ೞ'), ('ೠ', 'ೡ'), + ('ೱ', 'ೲ'), ('അ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ഺ'), + ('ഽ', 'ഽ'), ('ൎ', 'ൎ'), ('ൔ', 'ൖ'), ('ൟ', 'ൡ'), + ('ൺ', 'ൿ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), + ('ල', 'ල'), ('ව', 'ෆ'), ('ก', 'ะ'), ('า', 'ำ'), + ('เ', 'ๆ'), ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ງ', 'ຈ'), + ('ຊ', 'ຊ'), ('ຍ', 'ຍ'), ('ດ', 'ທ'), ('ນ', 'ຟ'), + ('ມ', 'ຣ'), ('ລ', 'ລ'), ('ວ', 'ວ'), ('ສ', 'ຫ'), + ('ອ', 'ະ'), ('າ', 'ຳ'), ('ຽ', 'ຽ'), ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('ཀ', 'ཇ'), + ('ཉ', 'ཬ'), ('ྈ', 'ྌ'), ('က', 'ဪ'), ('ဿ', 'ဿ'), + ('ၐ', 'ၕ'), ('ၚ', 'ၝ'), ('ၡ', 'ၡ'), ('ၥ', 'ၦ'), + ('ၮ', 'ၰ'), ('ၵ', 'ႁ'), ('ႎ', 'ႎ'), ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), + ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), ('ᜀ', 'ᜌ'), + ('ᜎ', 'ᜑ'), ('ᜠ', 'ᜱ'), ('ᝀ', 'ᝑ'), ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), ('ក', 'ឳ'), ('ៗ', 'ៗ'), ('ៜ', 'ៜ'), + ('ᠠ', 'ᡷ'), ('ᢀ', 'ᢨ'), ('ᢪ', 'ᢪ'), ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), ('ᨀ', 'ᨖ'), ('ᨠ', 'ᩔ'), ('ᪧ', 'ᪧ'), + ('ᬅ', 'ᬳ'), ('ᭅ', 'ᭋ'), ('ᮃ', 'ᮠ'), ('ᮮ', 'ᮯ'), + ('ᮺ', 'ᯥ'), ('ᰀ', 'ᰣ'), ('ᱍ', 'ᱏ'), ('ᱚ', 'ᱽ'), + ('ᲀ', 'ᲈ'), ('ᳩ', 'ᳬ'), ('ᳮ', 'ᳱ'), ('ᳵ', 'ᳶ'), + ('ᴀ', 'ᶿ'), ('Ḁ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), + ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 
'ῌ'), ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), + ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('℘', 'ℝ'), + ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℹ'), + ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), + ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), ('Ⱡ', 'ⳤ'), ('Ⳬ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), ('ⵯ', 'ⵯ'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('々', '〇'), + ('〡', '〩'), ('〱', '〵'), ('〸', '〼'), ('ぁ', 'ゖ'), + ('゛', 'ゟ'), ('ァ', 'ヺ'), ('ー', 'ヿ'), ('ㄅ', 'ㄮ'), + ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆺ'), ('ㇰ', 'ㇿ'), ('㐀', '䶵'), + ('一', '鿪'), ('ꀀ', 'ꒌ'), ('ꓐ', 'ꓽ'), ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘟ'), ('ꘪ', 'ꘫ'), ('Ꙁ', 'ꙮ'), ('ꙿ', 'ꚝ'), + ('ꚠ', 'ꛯ'), ('ꜗ', 'ꜟ'), ('Ꜣ', 'ꞈ'), ('Ꞌ', 'Ɪ'), + ('Ʞ', 'ꞷ'), ('ꟷ', 'ꠁ'), ('ꠃ', 'ꠅ'), ('ꠇ', 'ꠊ'), + ('ꠌ', 'ꠢ'), ('ꡀ', 'ꡳ'), ('ꢂ', 'ꢳ'), ('ꣲ', 'ꣷ'), + ('ꣻ', 'ꣻ'), ('ꣽ', 'ꣽ'), ('ꤊ', 'ꤥ'), ('ꤰ', 'ꥆ'), + ('ꥠ', 'ꥼ'), ('ꦄ', 'ꦲ'), ('ꧏ', 'ꧏ'), ('ꧠ', 'ꧤ'), + ('ꧦ', 'ꧯ'), ('ꧺ', 'ꧾ'), ('ꨀ', 'ꨨ'), ('ꩀ', 'ꩂ'), + ('ꩄ', 'ꩋ'), ('ꩠ', 'ꩶ'), ('ꩺ', 'ꩺ'), ('ꩾ', 'ꪯ'), + ('ꪱ', 'ꪱ'), ('ꪵ', 'ꪶ'), ('ꪹ', 'ꪽ'), ('ꫀ', 'ꫀ'), + ('ꫂ', 'ꫂ'), ('ꫛ', 'ꫝ'), ('ꫠ', 'ꫪ'), ('ꫲ', 'ꫴ'), + ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭥ'), ('ꭰ', 'ꯢ'), + ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('豈', '舘'), + ('並', '龎'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('יִ', 'יִ'), + ('ײַ', 'ﬨ'), ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), + ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷻ'), ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), ('A', 'Z'), ('a', 'z'), ('ヲ', 'ᄒ'), + ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), + ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐅀', '𐅴'), + ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('𐌀', '𐌟'), ('𐌭', '𐍊'), + ('𐍐', '𐍵'), ('𐎀', '𐎝'), ('𐎠', '𐏃'), ('𐏈', '𐏏'), + ('𐏑', '𐏕'), ('𐐀', '𐒝'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), + ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), + ('𐝠', '𐝧'), 
('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), + ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), ('𐡠', '𐡶'), + ('𐢀', '𐢞'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐤀', '𐤕'), + ('𐤠', '𐤹'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𐨀', '𐨀'), + ('𐨐', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨳'), ('𐩠', '𐩼'), + ('𐪀', '𐪜'), ('𐫀', '𐫇'), ('𐫉', '𐫤'), ('𐬀', '𐬵'), + ('𐭀', '𐭕'), ('𐭠', '𐭲'), ('𐮀', '𐮑'), ('𐰀', '𐱈'), + ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𑀃', '𑀷'), ('𑂃', '𑂯'), + ('𑃐', '𑃨'), ('𑄃', '𑄦'), ('𑅐', '𑅲'), ('𑅶', '𑅶'), + ('𑆃', '𑆲'), ('𑇁', '𑇄'), ('𑇚', '𑇚'), ('𑇜', '𑇜'), + ('𑈀', '𑈑'), ('𑈓', '𑈫'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), + ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊨'), ('𑊰', '𑋞'), + ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), + ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('𑌽', '𑌽'), ('𑍐', '𑍐'), + ('𑍝', '𑍡'), ('𑐀', '𑐴'), ('𑑇', '𑑊'), ('𑒀', '𑒯'), + ('𑓄', '𑓅'), ('𑓇', '𑓇'), ('𑖀', '𑖮'), ('𑗘', '𑗛'), + ('𑘀', '𑘯'), ('𑙄', '𑙄'), ('𑚀', '𑚪'), ('𑜀', '𑜙'), + ('𑢠', '𑣟'), ('𑣿', '𑣿'), ('𑨀', '𑨀'), ('𑨋', '𑨲'), + ('𑨺', '𑨺'), ('𑩐', '𑩐'), ('𑩜', '𑪃'), ('𑪆', '𑪉'), + ('𑫀', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '𑰮'), ('𑱀', '𑱀'), + ('𑱲', '𑲏'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '𑴰'), + ('𑵆', '𑵆'), ('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒒀', '𒕃'), + ('𓀀', '𓐮'), ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), + ('𖫐', '𖫭'), ('𖬀', '𖬯'), ('𖭀', '𖭃'), ('𖭣', '𖭷'), + ('𖭽', '𖮏'), ('𖼀', '𖽄'), ('𖽐', '𖽐'), ('𖾓', '𖾟'), + ('𖿠', '𖿡'), ('𗀀', '𘟬'), ('𘠀', '𘫲'), ('𛀀', '𛄞'), + ('𛅰', '𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), + ('𛲐', '𛲙'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), + ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), + ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), + ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), + ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), + ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), + ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), + ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𞠀', '𞣄'), + ('𞤀', '𞥃'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), + ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), + ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), + ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), + ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), + ('𞹝', '𞹝'), 
('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), + ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), + ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), + ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𠀀', '𪛖'), ('𪜀', '𫜴'), + ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), +]; + +pub const IDEOGRAPHIC: &'static [(char, char)] = &[ + ('〆', '〇'), ('〡', '〩'), ('〸', '〺'), ('㐀', '䶵'), + ('一', '鿪'), ('豈', '舘'), ('並', '龎'), ('𗀀', '𘟬'), + ('𘠀', '𘫲'), ('𛅰', '𛋻'), ('𠀀', '𪛖'), ('𪜀', '𫜴'), + ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), +]; + +pub const JOIN_CONTROL: &'static [(char, char)] = &[ + ('\u{200c}', '\u{200d}'), +]; + +pub const LOGICAL_ORDER_EXCEPTION: &'static [(char, char)] = &[ + ('เ', 'ไ'), ('ເ', 'ໄ'), ('ᦵ', 'ᦷ'), ('ᦺ', 'ᦺ'), + ('ꪵ', 'ꪶ'), ('ꪹ', 'ꪹ'), ('ꪻ', 'ꪼ'), +]; + +pub const LOWERCASE: &'static [(char, char)] = &[ + ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('º', 'º'), ('ß', 'ö'), + ('ø', 'ÿ'), ('ā', 'ā'), ('ă', 'ă'), ('ą', 'ą'), ('ć', 'ć'), + ('ĉ', 'ĉ'), ('ċ', 'ċ'), ('č', 'č'), ('ď', 'ď'), ('đ', 'đ'), + ('ē', 'ē'), ('ĕ', 'ĕ'), ('ė', 'ė'), ('ę', 'ę'), ('ě', 'ě'), + ('ĝ', 'ĝ'), ('ğ', 'ğ'), ('ġ', 'ġ'), ('ģ', 'ģ'), ('ĥ', 'ĥ'), + ('ħ', 'ħ'), ('ĩ', 'ĩ'), ('ī', 'ī'), ('ĭ', 'ĭ'), ('į', 'į'), + ('ı', 'ı'), ('ij', 'ij'), ('ĵ', 'ĵ'), ('ķ', 'ĸ'), ('ĺ', 'ĺ'), + ('ļ', 'ļ'), ('ľ', 'ľ'), ('ŀ', 'ŀ'), ('ł', 'ł'), ('ń', 'ń'), + ('ņ', 'ņ'), ('ň', 'ʼn'), ('ŋ', 'ŋ'), ('ō', 'ō'), ('ŏ', 'ŏ'), + ('ő', 'ő'), ('œ', 'œ'), ('ŕ', 'ŕ'), ('ŗ', 'ŗ'), ('ř', 'ř'), + ('ś', 'ś'), ('ŝ', 'ŝ'), ('ş', 'ş'), ('š', 'š'), ('ţ', 'ţ'), + ('ť', 'ť'), ('ŧ', 'ŧ'), ('ũ', 'ũ'), ('ū', 'ū'), ('ŭ', 'ŭ'), + ('ů', 'ů'), ('ű', 'ű'), ('ų', 'ų'), ('ŵ', 'ŵ'), ('ŷ', 'ŷ'), + ('ź', 'ź'), ('ż', 'ż'), ('ž', 'ƀ'), ('ƃ', 'ƃ'), ('ƅ', 'ƅ'), + ('ƈ', 'ƈ'), ('ƌ', 'ƍ'), ('ƒ', 'ƒ'), ('ƕ', 'ƕ'), ('ƙ', 'ƛ'), + ('ƞ', 'ƞ'), ('ơ', 'ơ'), ('ƣ', 'ƣ'), ('ƥ', 'ƥ'), ('ƨ', 'ƨ'), + ('ƪ', 'ƫ'), ('ƭ', 'ƭ'), ('ư', 'ư'), ('ƴ', 'ƴ'), ('ƶ', 'ƶ'), + ('ƹ', 'ƺ'), ('ƽ', 'ƿ'), ('dž', 'dž'), ('lj', 'lj'), ('nj', 'nj'), + ('ǎ', 'ǎ'), ('ǐ', 'ǐ'), ('ǒ', 'ǒ'), ('ǔ', 'ǔ'), ('ǖ', 'ǖ'), + ('ǘ', 'ǘ'), 
('ǚ', 'ǚ'), ('ǜ', 'ǝ'), ('ǟ', 'ǟ'), ('ǡ', 'ǡ'), + ('ǣ', 'ǣ'), ('ǥ', 'ǥ'), ('ǧ', 'ǧ'), ('ǩ', 'ǩ'), ('ǫ', 'ǫ'), + ('ǭ', 'ǭ'), ('ǯ', 'ǰ'), ('dz', 'dz'), ('ǵ', 'ǵ'), ('ǹ', 'ǹ'), + ('ǻ', 'ǻ'), ('ǽ', 'ǽ'), ('ǿ', 'ǿ'), ('ȁ', 'ȁ'), ('ȃ', 'ȃ'), + ('ȅ', 'ȅ'), ('ȇ', 'ȇ'), ('ȉ', 'ȉ'), ('ȋ', 'ȋ'), ('ȍ', 'ȍ'), + ('ȏ', 'ȏ'), ('ȑ', 'ȑ'), ('ȓ', 'ȓ'), ('ȕ', 'ȕ'), ('ȗ', 'ȗ'), + ('ș', 'ș'), ('ț', 'ț'), ('ȝ', 'ȝ'), ('ȟ', 'ȟ'), ('ȡ', 'ȡ'), + ('ȣ', 'ȣ'), ('ȥ', 'ȥ'), ('ȧ', 'ȧ'), ('ȩ', 'ȩ'), ('ȫ', 'ȫ'), + ('ȭ', 'ȭ'), ('ȯ', 'ȯ'), ('ȱ', 'ȱ'), ('ȳ', 'ȹ'), ('ȼ', 'ȼ'), + ('ȿ', 'ɀ'), ('ɂ', 'ɂ'), ('ɇ', 'ɇ'), ('ɉ', 'ɉ'), ('ɋ', 'ɋ'), + ('ɍ', 'ɍ'), ('ɏ', 'ʓ'), ('ʕ', 'ʸ'), ('ˀ', 'ˁ'), ('ˠ', 'ˤ'), + ('ͅ', 'ͅ'), ('ͱ', 'ͱ'), ('ͳ', 'ͳ'), ('ͷ', 'ͷ'), ('ͺ', 'ͽ'), + ('ΐ', 'ΐ'), ('ά', 'ώ'), ('ϐ', 'ϑ'), ('ϕ', 'ϗ'), ('ϙ', 'ϙ'), + ('ϛ', 'ϛ'), ('ϝ', 'ϝ'), ('ϟ', 'ϟ'), ('ϡ', 'ϡ'), ('ϣ', 'ϣ'), + ('ϥ', 'ϥ'), ('ϧ', 'ϧ'), ('ϩ', 'ϩ'), ('ϫ', 'ϫ'), ('ϭ', 'ϭ'), + ('ϯ', 'ϳ'), ('ϵ', 'ϵ'), ('ϸ', 'ϸ'), ('ϻ', 'ϼ'), ('а', 'џ'), + ('ѡ', 'ѡ'), ('ѣ', 'ѣ'), ('ѥ', 'ѥ'), ('ѧ', 'ѧ'), ('ѩ', 'ѩ'), + ('ѫ', 'ѫ'), ('ѭ', 'ѭ'), ('ѯ', 'ѯ'), ('ѱ', 'ѱ'), ('ѳ', 'ѳ'), + ('ѵ', 'ѵ'), ('ѷ', 'ѷ'), ('ѹ', 'ѹ'), ('ѻ', 'ѻ'), ('ѽ', 'ѽ'), + ('ѿ', 'ѿ'), ('ҁ', 'ҁ'), ('ҋ', 'ҋ'), ('ҍ', 'ҍ'), ('ҏ', 'ҏ'), + ('ґ', 'ґ'), ('ғ', 'ғ'), ('ҕ', 'ҕ'), ('җ', 'җ'), ('ҙ', 'ҙ'), + ('қ', 'қ'), ('ҝ', 'ҝ'), ('ҟ', 'ҟ'), ('ҡ', 'ҡ'), ('ң', 'ң'), + ('ҥ', 'ҥ'), ('ҧ', 'ҧ'), ('ҩ', 'ҩ'), ('ҫ', 'ҫ'), ('ҭ', 'ҭ'), + ('ү', 'ү'), ('ұ', 'ұ'), ('ҳ', 'ҳ'), ('ҵ', 'ҵ'), ('ҷ', 'ҷ'), + ('ҹ', 'ҹ'), ('һ', 'һ'), ('ҽ', 'ҽ'), ('ҿ', 'ҿ'), ('ӂ', 'ӂ'), + ('ӄ', 'ӄ'), ('ӆ', 'ӆ'), ('ӈ', 'ӈ'), ('ӊ', 'ӊ'), ('ӌ', 'ӌ'), + ('ӎ', 'ӏ'), ('ӑ', 'ӑ'), ('ӓ', 'ӓ'), ('ӕ', 'ӕ'), ('ӗ', 'ӗ'), + ('ә', 'ә'), ('ӛ', 'ӛ'), ('ӝ', 'ӝ'), ('ӟ', 'ӟ'), ('ӡ', 'ӡ'), + ('ӣ', 'ӣ'), ('ӥ', 'ӥ'), ('ӧ', 'ӧ'), ('ө', 'ө'), ('ӫ', 'ӫ'), + ('ӭ', 'ӭ'), ('ӯ', 'ӯ'), ('ӱ', 'ӱ'), ('ӳ', 'ӳ'), ('ӵ', 'ӵ'), + ('ӷ', 'ӷ'), ('ӹ', 'ӹ'), ('ӻ', 'ӻ'), ('ӽ', 'ӽ'), ('ӿ', 'ӿ'), + ('ԁ', 'ԁ'), ('ԃ', 'ԃ'), ('ԅ', 'ԅ'), ('ԇ', 'ԇ'), ('ԉ', 'ԉ'), + ('ԋ', 'ԋ'), ('ԍ', 'ԍ'), 
('ԏ', 'ԏ'), ('ԑ', 'ԑ'), ('ԓ', 'ԓ'), + ('ԕ', 'ԕ'), ('ԗ', 'ԗ'), ('ԙ', 'ԙ'), ('ԛ', 'ԛ'), ('ԝ', 'ԝ'), + ('ԟ', 'ԟ'), ('ԡ', 'ԡ'), ('ԣ', 'ԣ'), ('ԥ', 'ԥ'), ('ԧ', 'ԧ'), + ('ԩ', 'ԩ'), ('ԫ', 'ԫ'), ('ԭ', 'ԭ'), ('ԯ', 'ԯ'), ('ա', 'և'), + ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('ᴀ', 'ᶿ'), ('ḁ', 'ḁ'), + ('ḃ', 'ḃ'), ('ḅ', 'ḅ'), ('ḇ', 'ḇ'), ('ḉ', 'ḉ'), + ('ḋ', 'ḋ'), ('ḍ', 'ḍ'), ('ḏ', 'ḏ'), ('ḑ', 'ḑ'), + ('ḓ', 'ḓ'), ('ḕ', 'ḕ'), ('ḗ', 'ḗ'), ('ḙ', 'ḙ'), + ('ḛ', 'ḛ'), ('ḝ', 'ḝ'), ('ḟ', 'ḟ'), ('ḡ', 'ḡ'), + ('ḣ', 'ḣ'), ('ḥ', 'ḥ'), ('ḧ', 'ḧ'), ('ḩ', 'ḩ'), + ('ḫ', 'ḫ'), ('ḭ', 'ḭ'), ('ḯ', 'ḯ'), ('ḱ', 'ḱ'), + ('ḳ', 'ḳ'), ('ḵ', 'ḵ'), ('ḷ', 'ḷ'), ('ḹ', 'ḹ'), + ('ḻ', 'ḻ'), ('ḽ', 'ḽ'), ('ḿ', 'ḿ'), ('ṁ', 'ṁ'), + ('ṃ', 'ṃ'), ('ṅ', 'ṅ'), ('ṇ', 'ṇ'), ('ṉ', 'ṉ'), + ('ṋ', 'ṋ'), ('ṍ', 'ṍ'), ('ṏ', 'ṏ'), ('ṑ', 'ṑ'), + ('ṓ', 'ṓ'), ('ṕ', 'ṕ'), ('ṗ', 'ṗ'), ('ṙ', 'ṙ'), + ('ṛ', 'ṛ'), ('ṝ', 'ṝ'), ('ṟ', 'ṟ'), ('ṡ', 'ṡ'), + ('ṣ', 'ṣ'), ('ṥ', 'ṥ'), ('ṧ', 'ṧ'), ('ṩ', 'ṩ'), + ('ṫ', 'ṫ'), ('ṭ', 'ṭ'), ('ṯ', 'ṯ'), ('ṱ', 'ṱ'), + ('ṳ', 'ṳ'), ('ṵ', 'ṵ'), ('ṷ', 'ṷ'), ('ṹ', 'ṹ'), + ('ṻ', 'ṻ'), ('ṽ', 'ṽ'), ('ṿ', 'ṿ'), ('ẁ', 'ẁ'), + ('ẃ', 'ẃ'), ('ẅ', 'ẅ'), ('ẇ', 'ẇ'), ('ẉ', 'ẉ'), + ('ẋ', 'ẋ'), ('ẍ', 'ẍ'), ('ẏ', 'ẏ'), ('ẑ', 'ẑ'), + ('ẓ', 'ẓ'), ('ẕ', 'ẝ'), ('ẟ', 'ẟ'), ('ạ', 'ạ'), + ('ả', 'ả'), ('ấ', 'ấ'), ('ầ', 'ầ'), ('ẩ', 'ẩ'), + ('ẫ', 'ẫ'), ('ậ', 'ậ'), ('ắ', 'ắ'), ('ằ', 'ằ'), + ('ẳ', 'ẳ'), ('ẵ', 'ẵ'), ('ặ', 'ặ'), ('ẹ', 'ẹ'), + ('ẻ', 'ẻ'), ('ẽ', 'ẽ'), ('ế', 'ế'), ('ề', 'ề'), + ('ể', 'ể'), ('ễ', 'ễ'), ('ệ', 'ệ'), ('ỉ', 'ỉ'), + ('ị', 'ị'), ('ọ', 'ọ'), ('ỏ', 'ỏ'), ('ố', 'ố'), + ('ồ', 'ồ'), ('ổ', 'ổ'), ('ỗ', 'ỗ'), ('ộ', 'ộ'), + ('ớ', 'ớ'), ('ờ', 'ờ'), ('ở', 'ở'), ('ỡ', 'ỡ'), + ('ợ', 'ợ'), ('ụ', 'ụ'), ('ủ', 'ủ'), ('ứ', 'ứ'), + ('ừ', 'ừ'), ('ử', 'ử'), ('ữ', 'ữ'), ('ự', 'ự'), + ('ỳ', 'ỳ'), ('ỵ', 'ỵ'), ('ỷ', 'ỷ'), ('ỹ', 'ỹ'), + ('ỻ', 'ỻ'), ('ỽ', 'ỽ'), ('ỿ', 'ἇ'), ('ἐ', 'ἕ'), + ('ἠ', 'ἧ'), ('ἰ', 'ἷ'), ('ὀ', 'ὅ'), ('ὐ', 'ὗ'), + ('ὠ', 'ὧ'), ('ὰ', 'ώ'), ('ᾀ', 'ᾇ'), ('ᾐ', 'ᾗ'), + ('ᾠ', 'ᾧ'), ('ᾰ', 'ᾴ'), ('ᾶ', 'ᾷ'), ('ι', 'ι'), + ('ῂ', 'ῄ'), ('ῆ', 'ῇ'), 
('ῐ', 'ΐ'), ('ῖ', 'ῗ'), + ('ῠ', 'ῧ'), ('ῲ', 'ῴ'), ('ῶ', 'ῷ'), ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ℊ', 'ℊ'), ('ℎ', 'ℏ'), + ('ℓ', 'ℓ'), ('ℯ', 'ℯ'), ('ℴ', 'ℴ'), ('ℹ', 'ℹ'), + ('ℼ', 'ℽ'), ('ⅆ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('ⅰ', 'ⅿ'), + ('ↄ', 'ↄ'), ('ⓐ', 'ⓩ'), ('ⰰ', 'ⱞ'), ('ⱡ', 'ⱡ'), + ('ⱥ', 'ⱦ'), ('ⱨ', 'ⱨ'), ('ⱪ', 'ⱪ'), ('ⱬ', 'ⱬ'), + ('ⱱ', 'ⱱ'), ('ⱳ', 'ⱴ'), ('ⱶ', 'ⱽ'), ('ⲁ', 'ⲁ'), + ('ⲃ', 'ⲃ'), ('ⲅ', 'ⲅ'), ('ⲇ', 'ⲇ'), ('ⲉ', 'ⲉ'), + ('ⲋ', 'ⲋ'), ('ⲍ', 'ⲍ'), ('ⲏ', 'ⲏ'), ('ⲑ', 'ⲑ'), + ('ⲓ', 'ⲓ'), ('ⲕ', 'ⲕ'), ('ⲗ', 'ⲗ'), ('ⲙ', 'ⲙ'), + ('ⲛ', 'ⲛ'), ('ⲝ', 'ⲝ'), ('ⲟ', 'ⲟ'), ('ⲡ', 'ⲡ'), + ('ⲣ', 'ⲣ'), ('ⲥ', 'ⲥ'), ('ⲧ', 'ⲧ'), ('ⲩ', 'ⲩ'), + ('ⲫ', 'ⲫ'), ('ⲭ', 'ⲭ'), ('ⲯ', 'ⲯ'), ('ⲱ', 'ⲱ'), + ('ⲳ', 'ⲳ'), ('ⲵ', 'ⲵ'), ('ⲷ', 'ⲷ'), ('ⲹ', 'ⲹ'), + ('ⲻ', 'ⲻ'), ('ⲽ', 'ⲽ'), ('ⲿ', 'ⲿ'), ('ⳁ', 'ⳁ'), + ('ⳃ', 'ⳃ'), ('ⳅ', 'ⳅ'), ('ⳇ', 'ⳇ'), ('ⳉ', 'ⳉ'), + ('ⳋ', 'ⳋ'), ('ⳍ', 'ⳍ'), ('ⳏ', 'ⳏ'), ('ⳑ', 'ⳑ'), + ('ⳓ', 'ⳓ'), ('ⳕ', 'ⳕ'), ('ⳗ', 'ⳗ'), ('ⳙ', 'ⳙ'), + ('ⳛ', 'ⳛ'), ('ⳝ', 'ⳝ'), ('ⳟ', 'ⳟ'), ('ⳡ', 'ⳡ'), + ('ⳣ', 'ⳤ'), ('ⳬ', 'ⳬ'), ('ⳮ', 'ⳮ'), ('ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ꙁ', 'ꙁ'), + ('ꙃ', 'ꙃ'), ('ꙅ', 'ꙅ'), ('ꙇ', 'ꙇ'), ('ꙉ', 'ꙉ'), + ('ꙋ', 'ꙋ'), ('ꙍ', 'ꙍ'), ('ꙏ', 'ꙏ'), ('ꙑ', 'ꙑ'), + ('ꙓ', 'ꙓ'), ('ꙕ', 'ꙕ'), ('ꙗ', 'ꙗ'), ('ꙙ', 'ꙙ'), + ('ꙛ', 'ꙛ'), ('ꙝ', 'ꙝ'), ('ꙟ', 'ꙟ'), ('ꙡ', 'ꙡ'), + ('ꙣ', 'ꙣ'), ('ꙥ', 'ꙥ'), ('ꙧ', 'ꙧ'), ('ꙩ', 'ꙩ'), + ('ꙫ', 'ꙫ'), ('ꙭ', 'ꙭ'), ('ꚁ', 'ꚁ'), ('ꚃ', 'ꚃ'), + ('ꚅ', 'ꚅ'), ('ꚇ', 'ꚇ'), ('ꚉ', 'ꚉ'), ('ꚋ', 'ꚋ'), + ('ꚍ', 'ꚍ'), ('ꚏ', 'ꚏ'), ('ꚑ', 'ꚑ'), ('ꚓ', 'ꚓ'), + ('ꚕ', 'ꚕ'), ('ꚗ', 'ꚗ'), ('ꚙ', 'ꚙ'), ('ꚛ', 'ꚝ'), + ('ꜣ', 'ꜣ'), ('ꜥ', 'ꜥ'), ('ꜧ', 'ꜧ'), ('ꜩ', 'ꜩ'), + ('ꜫ', 'ꜫ'), ('ꜭ', 'ꜭ'), ('ꜯ', 'ꜱ'), ('ꜳ', 'ꜳ'), + ('ꜵ', 'ꜵ'), ('ꜷ', 'ꜷ'), ('ꜹ', 'ꜹ'), ('ꜻ', 'ꜻ'), + ('ꜽ', 'ꜽ'), ('ꜿ', 'ꜿ'), ('ꝁ', 'ꝁ'), ('ꝃ', 'ꝃ'), + ('ꝅ', 'ꝅ'), ('ꝇ', 'ꝇ'), ('ꝉ', 'ꝉ'), ('ꝋ', 'ꝋ'), + ('ꝍ', 'ꝍ'), ('ꝏ', 'ꝏ'), ('ꝑ', 'ꝑ'), ('ꝓ', 'ꝓ'), + ('ꝕ', 'ꝕ'), ('ꝗ', 'ꝗ'), ('ꝙ', 'ꝙ'), ('ꝛ', 'ꝛ'), + ('ꝝ', 'ꝝ'), ('ꝟ', 'ꝟ'), ('ꝡ', 'ꝡ'), ('ꝣ', 'ꝣ'), + ('ꝥ', 'ꝥ'), ('ꝧ', 'ꝧ'), ('ꝩ', 'ꝩ'), ('ꝫ', 'ꝫ'), + ('ꝭ', 'ꝭ'), ('ꝯ', 'ꝸ'), 
('ꝺ', 'ꝺ'), ('ꝼ', 'ꝼ'), + ('ꝿ', 'ꝿ'), ('ꞁ', 'ꞁ'), ('ꞃ', 'ꞃ'), ('ꞅ', 'ꞅ'), + ('ꞇ', 'ꞇ'), ('ꞌ', 'ꞌ'), ('ꞎ', 'ꞎ'), ('ꞑ', 'ꞑ'), + ('ꞓ', 'ꞕ'), ('ꞗ', 'ꞗ'), ('ꞙ', 'ꞙ'), ('ꞛ', 'ꞛ'), + ('ꞝ', 'ꞝ'), ('ꞟ', 'ꞟ'), ('ꞡ', 'ꞡ'), ('ꞣ', 'ꞣ'), + ('ꞥ', 'ꞥ'), ('ꞧ', 'ꞧ'), ('ꞩ', 'ꞩ'), ('ꞵ', 'ꞵ'), + ('ꞷ', 'ꞷ'), ('ꟸ', 'ꟺ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭥ'), + ('ꭰ', 'ꮿ'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('a', 'z'), + ('𐐨', '𐑏'), ('𐓘', '𐓻'), ('𐳀', '𐳲'), ('𑣀', '𑣟'), + ('𝐚', '𝐳'), ('𝑎', '𝑔'), ('𝑖', '𝑧'), ('𝒂', '𝒛'), + ('𝒶', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝓏'), + ('𝓪', '𝔃'), ('𝔞', '𝔷'), ('𝕒', '𝕫'), ('𝖆', '𝖟'), + ('𝖺', '𝗓'), ('𝗮', '𝘇'), ('𝘢', '𝘻'), ('𝙖', '𝙯'), + ('𝚊', '𝚥'), ('𝛂', '𝛚'), ('𝛜', '𝛡'), ('𝛼', '𝜔'), + ('𝜖', '𝜛'), ('𝜶', '𝝎'), ('𝝐', '𝝕'), ('𝝰', '𝞈'), + ('𝞊', '𝞏'), ('𝞪', '𝟂'), ('𝟄', '𝟉'), ('𝟋', '𝟋'), + ('𞤢', '𞥃'), +]; + +pub const MATH: &'static [(char, char)] = &[ + ('+', '+'), ('<', '>'), ('^', '^'), ('|', '|'), ('~', '~'), ('¬', '¬'), + ('±', '±'), ('×', '×'), ('÷', '÷'), ('ϐ', 'ϒ'), ('ϕ', 'ϕ'), + ('ϰ', 'ϱ'), ('ϴ', '϶'), ('؆', '؈'), ('‖', '‖'), ('′', '‴'), + ('⁀', '⁀'), ('⁄', '⁄'), ('⁒', '⁒'), ('\u{2061}', '\u{2064}'), + ('⁺', '⁾'), ('₊', '₎'), ('⃐', '⃜'), ('⃡', '⃡'), + ('⃥', '⃦'), ('⃫', '⃯'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('℘', 'ℝ'), ('ℤ', 'ℤ'), + ('ℨ', '℩'), ('ℬ', 'ℭ'), ('ℯ', 'ℱ'), ('ℳ', 'ℸ'), + ('ℼ', 'ⅉ'), ('⅋', '⅋'), ('←', '↧'), ('↩', '↮'), + ('↰', '↱'), ('↶', '↷'), ('↼', '⇛'), ('⇝', '⇝'), + ('⇤', '⇥'), ('⇴', '⋿'), ('⌈', '⌋'), ('⌠', '⌡'), + ('⍼', '⍼'), ('⎛', '⎵'), ('⎷', '⎷'), ('⏐', '⏐'), + ('⏜', '⏢'), ('■', '□'), ('▮', '▷'), ('▼', '◁'), + ('◆', '◇'), ('◊', '○'), ('●', '◓'), ('◢', '◢'), + ('◤', '◤'), ('◧', '◬'), ('◸', '◿'), ('★', '☆'), + ('♀', '♀'), ('♂', '♂'), ('♠', '♣'), ('♭', '♯'), + ('⟀', '⟿'), ('⤀', '⫿'), ('⬰', '⭄'), ('⭇', '⭌'), + ('﬩', '﬩'), ('﹡', '﹦'), ('﹨', '﹨'), ('+', '+'), + ('<', '>'), ('\', '\'), ('^', '^'), ('|', '|'), + ('~', '~'), ('¬', '¬'), ('←', '↓'), ('𝐀', '𝑔'), + ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), + ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', 
'𝓃'), + ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), + ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), + ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝟋'), ('𝟎', '𝟿'), + ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), + ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), + ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), + ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), + ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), + ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), + ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), + ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), + ('𞺫', '𞺻'), ('𞻰', '𞻱'), +]; + +pub const NONCHARACTER_CODE_POINT: &'static [(char, char)] = &[ + ('\u{fdd0}', '\u{fdef}'), ('\u{fffe}', '\u{ffff}'), + ('\u{1fffe}', '\u{1ffff}'), ('\u{2fffe}', '\u{2ffff}'), + ('\u{3fffe}', '\u{3ffff}'), ('\u{4fffe}', '\u{4ffff}'), + ('\u{5fffe}', '\u{5ffff}'), ('\u{6fffe}', '\u{6ffff}'), + ('\u{7fffe}', '\u{7ffff}'), ('\u{8fffe}', '\u{8ffff}'), + ('\u{9fffe}', '\u{9ffff}'), ('\u{afffe}', '\u{affff}'), + ('\u{bfffe}', '\u{bffff}'), ('\u{cfffe}', '\u{cffff}'), + ('\u{dfffe}', '\u{dffff}'), ('\u{efffe}', '\u{effff}'), + ('\u{ffffe}', '\u{fffff}'), ('\u{10fffe}', '\u{10ffff}'), +]; + +pub const OTHER_ALPHABETIC: &'static [(char, char)] = &[ + ('ͅ', 'ͅ'), ('ְ', 'ֽ'), ('ֿ', 'ֿ'), ('ׁ', 'ׂ'), ('ׄ', 'ׅ'), + ('ׇ', 'ׇ'), ('ؐ', 'ؚ'), ('ً', 'ٗ'), ('ٙ', 'ٟ'), ('ٰ', 'ٰ'), + ('ۖ', 'ۜ'), ('ۡ', 'ۤ'), ('ۧ', 'ۨ'), ('ۭ', 'ۭ'), ('ܑ', 'ܑ'), + ('ܰ', 'ܿ'), ('ަ', 'ް'), ('ࠖ', 'ࠗ'), ('ࠛ', 'ࠣ'), ('ࠥ', 'ࠧ'), + ('ࠩ', 'ࠬ'), ('ࣔ', 'ࣟ'), ('ࣣ', 'ࣩ'), ('ࣰ', 'ः'), + ('ऺ', 'ऻ'), ('ा', 'ौ'), ('ॎ', 'ॏ'), ('ॕ', 'ॗ'), + ('ॢ', 'ॣ'), ('ঁ', 'ঃ'), ('া', 'ৄ'), ('ে', 'ৈ'), + ('ো', 'ৌ'), ('ৗ', 'ৗ'), ('ৢ', 'ৣ'), ('ਁ', 'ਃ'), + ('ਾ', 'ੂ'), ('ੇ', 'ੈ'), ('ੋ', 'ੌ'), ('ੑ', 'ੑ'), + ('ੰ', 'ੱ'), ('ੵ', 'ੵ'), ('ઁ', 'ઃ'), ('ા', 'ૅ'), + ('ે', 'ૉ'), ('ો', 'ૌ'), ('ૢ', 'ૣ'), ('ૺ', 'ૼ'), + ('ଁ', 'ଃ'), ('ା', 'ୄ'), ('େ', 'ୈ'), ('ୋ', 'ୌ'), + ('ୖ', 'ୗ'), ('ୢ', 'ୣ'), ('ஂ', 'ஂ'), ('ா', 'ூ'), + ('ெ', 'ை'), ('ொ', 'ௌ'), ('ௗ', 'ௗ'), ('ఀ', 'ః'), + ('ా', 'ౄ'), 
('ె', 'ై'), ('ొ', 'ౌ'), ('ౕ', 'ౖ'), + ('ౢ', 'ౣ'), ('ಁ', 'ಃ'), ('ಾ', 'ೄ'), ('ೆ', 'ೈ'), + ('ೊ', 'ೌ'), ('ೕ', 'ೖ'), ('ೢ', 'ೣ'), ('ഀ', 'ഃ'), + ('ാ', 'ൄ'), ('െ', 'ൈ'), ('ൊ', 'ൌ'), ('ൗ', 'ൗ'), + ('ൢ', 'ൣ'), ('ං', 'ඃ'), ('ා', 'ු'), ('ූ', 'ූ'), + ('ෘ', 'ෟ'), ('ෲ', 'ෳ'), ('ั', 'ั'), ('ิ', 'ฺ'), + ('ํ', 'ํ'), ('ັ', 'ັ'), ('ິ', 'ູ'), ('ົ', 'ຼ'), + ('ໍ', 'ໍ'), ('ཱ', 'ཱྀ'), ('ྍ', 'ྗ'), ('ྙ', 'ྼ'), + ('ါ', 'ံ'), ('း', 'း'), ('ျ', 'ှ'), ('ၖ', 'ၙ'), + ('ၞ', 'ၠ'), ('ၢ', 'ၢ'), ('ၧ', 'ၨ'), ('ၱ', 'ၴ'), + ('ႂ', 'ႆ'), ('ႜ', 'ႝ'), ('፟', '፟'), ('ᜒ', 'ᜓ'), + ('ᜲ', 'ᜳ'), ('ᝒ', 'ᝓ'), ('ᝲ', 'ᝳ'), ('ា', 'ៈ'), + ('ᢅ', 'ᢆ'), ('ᢩ', 'ᢩ'), ('ᤠ', 'ᤫ'), ('ᤰ', 'ᤸ'), + ('ᨗ', 'ᨛ'), ('ᩕ', 'ᩞ'), ('ᩡ', 'ᩴ'), ('ᬀ', 'ᬄ'), + ('ᬵ', 'ᭃ'), ('ᮀ', 'ᮂ'), ('ᮡ', 'ᮩ'), ('ᮬ', 'ᮭ'), + ('ᯧ', 'ᯱ'), ('ᰤ', 'ᰵ'), ('ᳲ', 'ᳳ'), ('ᷧ', 'ᷴ'), + ('Ⓐ', 'ⓩ'), ('ⷠ', 'ⷿ'), ('ꙴ', 'ꙻ'), ('ꚞ', 'ꚟ'), + ('ꠣ', 'ꠧ'), ('ꢀ', 'ꢁ'), ('ꢴ', 'ꣃ'), ('ꣅ', 'ꣅ'), + ('ꤦ', 'ꤪ'), ('ꥇ', 'ꥒ'), ('ꦀ', 'ꦃ'), ('ꦴ', 'ꦿ'), + ('ꨩ', 'ꨶ'), ('ꩃ', 'ꩃ'), ('ꩌ', 'ꩍ'), ('ꪰ', 'ꪰ'), + ('ꪲ', 'ꪴ'), ('ꪷ', 'ꪸ'), ('ꪾ', 'ꪾ'), ('ꫫ', 'ꫯ'), + ('ꫵ', 'ꫵ'), ('ꯣ', 'ꯪ'), ('ﬞ', 'ﬞ'), ('𐍶', '𐍺'), + ('𐨁', '𐨃'), ('𐨅', '𐨆'), ('𐨌', '𐨏'), ('𑀀', '𑀂'), + ('𑀸', '𑁅'), ('𑂂', '𑂂'), ('𑂰', '𑂸'), ('𑄀', '𑄂'), + ('𑄧', '𑄲'), ('𑆀', '𑆂'), ('𑆳', '𑆿'), ('𑈬', '𑈴'), + ('𑈷', '𑈷'), ('𑈾', '𑈾'), ('𑋟', '𑋨'), ('𑌀', '𑌃'), + ('𑌾', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍌'), ('𑍗', '𑍗'), + ('𑍢', '𑍣'), ('𑐵', '𑑁'), ('𑑃', '𑑅'), ('𑒰', '𑓁'), + ('𑖯', '𑖵'), ('𑖸', '𑖾'), ('𑗜', '𑗝'), ('𑘰', '𑘾'), + ('𑙀', '𑙀'), ('𑚫', '𑚵'), ('𑜝', '𑜪'), ('𑨁', '𑨊'), + ('𑨵', '𑨹'), ('𑨻', '𑨾'), ('𑩑', '𑩛'), ('𑪊', '𑪗'), + ('𑰯', '𑰶'), ('𑰸', '𑰾'), ('𑲒', '𑲧'), ('𑲩', '𑲶'), + ('𑴱', '𑴶'), ('𑴺', '𑴺'), ('𑴼', '𑴽'), ('𑴿', '𑵁'), + ('𑵃', '𑵃'), ('𑵇', '𑵇'), ('𖬰', '𖬶'), ('𖽑', '𖽾'), + ('𛲞', '𛲞'), ('𞀀', '𞀆'), ('𞀈', '𞀘'), ('𞀛', '𞀡'), + ('𞀣', '𞀤'), ('𞀦', '𞀪'), ('𞥇', '𞥇'), ('🄰', '🅉'), + ('🅐', '🅩'), ('🅰', '🆉'), +]; + +pub const OTHER_DEFAULT_IGNORABLE_CODE_POINT: &'static [(char, char)] = &[ + ('͏', '͏'), ('ᅟ', 'ᅠ'), ('឴', '឵'), ('\u{2065}', '\u{2065}'), + ('ㅤ', 'ㅤ'), ('ᅠ', 'ᅠ'), ('\u{fff0}', 
'\u{fff8}'), + ('\u{e0000}', '\u{e0000}'), ('\u{e0002}', '\u{e001f}'), + ('\u{e0080}', '\u{e00ff}'), ('\u{e01f0}', '\u{e0fff}'), +]; + +pub const OTHER_GRAPHEME_EXTEND: &'static [(char, char)] = &[ + ('া', 'া'), ('ৗ', 'ৗ'), ('ା', 'ା'), ('ୗ', 'ୗ'), + ('ா', 'ா'), ('ௗ', 'ௗ'), ('ೂ', 'ೂ'), ('ೕ', 'ೖ'), + ('ാ', 'ാ'), ('ൗ', 'ൗ'), ('ා', 'ා'), ('ෟ', 'ෟ'), + ('\u{200c}', '\u{200c}'), ('〮', '〯'), ('゙', '゚'), ('𑌾', '𑌾'), + ('𑍗', '𑍗'), ('𑒰', '𑒰'), ('𑒽', '𑒽'), ('𑖯', '𑖯'), + ('𝅥', '𝅥'), ('𝅮', '𝅲'), ('\u{e0020}', '\u{e007f}'), +]; + +pub const OTHER_ID_CONTINUE: &'static [(char, char)] = &[ + ('·', '·'), ('·', '·'), ('፩', '፱'), ('᧚', '᧚'), +]; + +pub const OTHER_ID_START: &'static [(char, char)] = &[ + ('ᢅ', 'ᢆ'), ('℘', '℘'), ('℮', '℮'), ('゛', '゜'), +]; + +pub const OTHER_LOWERCASE: &'static [(char, char)] = &[ + ('ª', 'ª'), ('º', 'º'), ('ʰ', 'ʸ'), ('ˀ', 'ˁ'), ('ˠ', 'ˤ'), + ('ͅ', 'ͅ'), ('ͺ', 'ͺ'), ('ᴬ', 'ᵪ'), ('ᵸ', 'ᵸ'), ('ᶛ', 'ᶿ'), + ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ⅰ', 'ⅿ'), + ('ⓐ', 'ⓩ'), ('ⱼ', 'ⱽ'), ('ꚜ', 'ꚝ'), ('ꝰ', 'ꝰ'), + ('ꟸ', 'ꟹ'), ('ꭜ', 'ꭟ'), +]; + +pub const OTHER_MATH: &'static [(char, char)] = &[ + ('^', '^'), ('ϐ', 'ϒ'), ('ϕ', 'ϕ'), ('ϰ', 'ϱ'), ('ϴ', 'ϵ'), + ('‖', '‖'), ('′', '‴'), ('⁀', '⁀'), ('\u{2061}', '\u{2064}'), + ('⁽', '⁾'), ('₍', '₎'), ('⃐', '⃜'), ('⃡', '⃡'), + ('⃥', '⃦'), ('⃫', '⃯'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), + ('ℨ', '℩'), ('ℬ', 'ℭ'), ('ℯ', 'ℱ'), ('ℳ', 'ℸ'), + ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('↕', '↙'), ('↜', '↟'), + ('↡', '↢'), ('↤', '↥'), ('↧', '↧'), ('↩', '↭'), + ('↰', '↱'), ('↶', '↷'), ('↼', '⇍'), ('⇐', '⇑'), + ('⇓', '⇓'), ('⇕', '⇛'), ('⇝', '⇝'), ('⇤', '⇥'), + ('⌈', '⌋'), ('⎴', '⎵'), ('⎷', '⎷'), ('⏐', '⏐'), + ('⏢', '⏢'), ('■', '□'), ('▮', '▶'), ('▼', '◀'), + ('◆', '◇'), ('◊', '○'), ('●', '◓'), ('◢', '◢'), + ('◤', '◤'), ('◧', '◬'), ('★', '☆'), ('♀', '♀'), + ('♂', '♂'), ('♠', '♣'), ('♭', '♮'), ('⟅', '⟆'), + ('⟦', '⟯'), ('⦃', '⦘'), ('⧘', '⧛'), ('⧼', '⧽'), + ('﹡', '﹡'), ('﹣', '﹣'), ('﹨', '﹨'), ('\', '\'), + ('^', 
'^'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), + ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), + ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), + ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), + ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), + ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), + ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), + ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𝟎', '𝟿'), + ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), + ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), + ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), + ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), + ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), + ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), + ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), + ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), + ('𞺫', '𞺻'), +]; + +pub const OTHER_UPPERCASE: &'static [(char, char)] = &[ + ('Ⅰ', 'Ⅿ'), ('Ⓐ', 'Ⓩ'), ('🄰', '🅉'), ('🅐', '🅩'), + ('🅰', '🆉'), +]; + +pub const PATTERN_SYNTAX: &'static [(char, char)] = &[ + ('!', '/'), (':', '@'), ('[', '^'), ('`', '`'), ('{', '~'), ('¡', '§'), + ('©', '©'), ('«', '¬'), ('®', '®'), ('°', '±'), ('¶', '¶'), + ('»', '»'), ('¿', '¿'), ('×', '×'), ('÷', '÷'), ('‐', '‧'), + ('‰', '‾'), ('⁁', '⁓'), ('⁕', '⁞'), ('←', '\u{245f}'), + ('─', '❵'), ('➔', '\u{2bff}'), ('⸀', '\u{2e7f}'), ('、', '〃'), + ('〈', '〠'), ('〰', '〰'), ('﴾', '﴿'), ('﹅', '﹆'), +]; + +pub const PATTERN_WHITE_SPACE: &'static [(char, char)] = &[ + ('\t', '\r'), (' ', ' '), ('\u{85}', '\u{85}'), ('\u{200e}', '\u{200f}'), + ('\u{2028}', '\u{2029}'), +]; + +pub const PREPENDED_CONCATENATION_MARK: &'static [(char, char)] = &[ + ('\u{600}', '\u{605}'), ('\u{6dd}', '\u{6dd}'), ('\u{70f}', '\u{70f}'), + ('\u{8e2}', '\u{8e2}'), ('\u{110bd}', '\u{110bd}'), +]; + +pub const QUOTATION_MARK: &'static [(char, char)] = &[ + ('\"', '\"'), ('\'', '\''), ('«', '«'), ('»', '»'), ('‘', '‟'), + ('‹', '›'), ('⹂', '⹂'), ('「', '』'), ('〝', '〟'), + ('﹁', '﹄'), ('"', '"'), (''', '''), ('「', '」'), +]; + +pub const RADICAL: 
&'static [(char, char)] = &[ + ('⺀', '⺙'), ('⺛', '⻳'), ('⼀', '⿕'), +]; + +pub const REGIONAL_INDICATOR: &'static [(char, char)] = &[ + ('🇦', '🇿'), +]; + +pub const SENTENCE_TERMINAL: &'static [(char, char)] = &[ + ('!', '!'), ('.', '.'), ('?', '?'), ('։', '։'), ('؟', '؟'), + ('۔', '۔'), ('܀', '܂'), ('߹', '߹'), ('।', '॥'), ('၊', '။'), + ('።', '።'), ('፧', '፨'), ('᙮', '᙮'), ('᜵', '᜶'), + ('᠃', '᠃'), ('᠉', '᠉'), ('᥄', '᥅'), ('᪨', '᪫'), + ('᭚', '᭛'), ('᭞', '᭟'), ('᰻', '᰼'), ('᱾', '᱿'), + ('‼', '‽'), ('⁇', '⁉'), ('⸮', '⸮'), ('⸼', '⸼'), + ('。', '。'), ('꓿', '꓿'), ('꘎', '꘏'), ('꛳', '꛳'), + ('꛷', '꛷'), ('꡶', '꡷'), ('꣎', '꣏'), ('꤯', '꤯'), + ('꧈', '꧉'), ('꩝', '꩟'), ('꫰', '꫱'), ('꯫', '꯫'), + ('﹒', '﹒'), ('﹖', '﹗'), ('!', '!'), ('.', '.'), + ('?', '?'), ('。', '。'), ('𐩖', '𐩗'), ('𑁇', '𑁈'), + ('𑂾', '𑃁'), ('𑅁', '𑅃'), ('𑇅', '𑇆'), ('𑇍', '𑇍'), + ('𑇞', '𑇟'), ('𑈸', '𑈹'), ('𑈻', '𑈼'), ('𑊩', '𑊩'), + ('𑑋', '𑑌'), ('𑗂', '𑗃'), ('𑗉', '𑗗'), ('𑙁', '𑙂'), + ('𑜼', '𑜾'), ('𑩂', '𑩃'), ('𑪛', '𑪜'), ('𑱁', '𑱂'), + ('𖩮', '𖩯'), ('𖫵', '𖫵'), ('𖬷', '𖬸'), ('𖭄', '𖭄'), + ('𛲟', '𛲟'), ('𝪈', '𝪈'), +]; + +pub const SOFT_DOTTED: &'static [(char, char)] = &[ + ('i', 'j'), ('į', 'į'), ('ɉ', 'ɉ'), ('ɨ', 'ɨ'), ('ʝ', 'ʝ'), + ('ʲ', 'ʲ'), ('ϳ', 'ϳ'), ('і', 'і'), ('ј', 'ј'), ('ᵢ', 'ᵢ'), + ('ᶖ', 'ᶖ'), ('ᶤ', 'ᶤ'), ('ᶨ', 'ᶨ'), ('ḭ', 'ḭ'), + ('ị', 'ị'), ('ⁱ', 'ⁱ'), ('ⅈ', 'ⅉ'), ('ⱼ', 'ⱼ'), + ('𝐢', '𝐣'), ('𝑖', '𝑗'), ('𝒊', '𝒋'), ('𝒾', '𝒿'), + ('𝓲', '𝓳'), ('𝔦', '𝔧'), ('𝕚', '𝕛'), ('𝖎', '𝖏'), + ('𝗂', '𝗃'), ('𝗶', '𝗷'), ('𝘪', '𝘫'), ('𝙞', '𝙟'), + ('𝚒', '𝚓'), +]; + +pub const TERMINAL_PUNCTUATION: &'static [(char, char)] = &[ + ('!', '!'), (',', ','), ('.', '.'), (':', ';'), ('?', '?'), (';', ';'), + ('·', '·'), ('։', '։'), ('׃', '׃'), ('،', '،'), ('؛', '؛'), + ('؟', '؟'), ('۔', '۔'), ('܀', '܊'), ('܌', '܌'), ('߸', '߹'), + ('࠰', '࠾'), ('࡞', '࡞'), ('।', '॥'), ('๚', '๛'), + ('༈', '༈'), ('།', '༒'), ('၊', '။'), ('፡', '፨'), + ('᙭', '᙮'), ('᛫', '᛭'), ('᜵', '᜶'), ('។', '៖'), + ('៚', '៚'), ('᠂', '᠅'), ('᠈', '᠉'), ('᥄', '᥅'), + ('᪨', '᪫'), ('᭚', 
'᭛'), ('᭝', '᭟'), ('᰻', '᰿'), + ('᱾', '᱿'), ('‼', '‽'), ('⁇', '⁉'), ('⸮', '⸮'), + ('⸼', '⸼'), ('⹁', '⹁'), ('、', '。'), ('꓾', '꓿'), + ('꘍', '꘏'), ('꛳', '꛷'), ('꡶', '꡷'), ('꣎', '꣏'), + ('꤯', '꤯'), ('꧇', '꧉'), ('꩝', '꩟'), ('꫟', '꫟'), + ('꫰', '꫱'), ('꯫', '꯫'), ('﹐', '﹒'), ('﹔', '﹗'), + ('!', '!'), (',', ','), ('.', '.'), (':', ';'), + ('?', '?'), ('。', '。'), ('、', '、'), ('𐎟', '𐎟'), + ('𐏐', '𐏐'), ('𐡗', '𐡗'), ('𐤟', '𐤟'), ('𐩖', '𐩗'), + ('𐫰', '𐫵'), ('𐬺', '𐬿'), ('𐮙', '𐮜'), ('𑁇', '𑁍'), + ('𑂾', '𑃁'), ('𑅁', '𑅃'), ('𑇅', '𑇆'), ('𑇍', '𑇍'), + ('𑇞', '𑇟'), ('𑈸', '𑈼'), ('𑊩', '𑊩'), ('𑑋', '𑑍'), + ('𑑛', '𑑛'), ('𑗂', '𑗅'), ('𑗉', '𑗗'), ('𑙁', '𑙂'), + ('𑜼', '𑜾'), ('𑩂', '𑩃'), ('𑪛', '𑪜'), ('𑪡', '𑪢'), + ('𑱁', '𑱃'), ('𑱱', '𑱱'), ('𒑰', '𒑴'), ('𖩮', '𖩯'), + ('𖫵', '𖫵'), ('𖬷', '𖬹'), ('𖭄', '𖭄'), ('𛲟', '𛲟'), + ('𝪇', '𝪊'), +]; + +pub const UNIFIED_IDEOGRAPH: &'static [(char, char)] = &[ + ('㐀', '䶵'), ('一', '鿪'), ('﨎', '﨏'), ('﨑', '﨑'), + ('﨓', '﨔'), ('﨟', '﨟'), ('﨡', '﨡'), ('﨣', '﨤'), + ('﨧', '﨩'), ('𠀀', '𪛖'), ('𪜀', '𫜴'), ('𫝀', '𫠝'), + ('𫠠', '𬺡'), ('𬺰', '𮯠'), +]; + +pub const UPPERCASE: &'static [(char, char)] = &[ + ('A', 'Z'), ('À', 'Ö'), ('Ø', 'Þ'), ('Ā', 'Ā'), ('Ă', 'Ă'), + ('Ą', 'Ą'), ('Ć', 'Ć'), ('Ĉ', 'Ĉ'), ('Ċ', 'Ċ'), ('Č', 'Č'), + ('Ď', 'Ď'), ('Đ', 'Đ'), ('Ē', 'Ē'), ('Ĕ', 'Ĕ'), ('Ė', 'Ė'), + ('Ę', 'Ę'), ('Ě', 'Ě'), ('Ĝ', 'Ĝ'), ('Ğ', 'Ğ'), ('Ġ', 'Ġ'), + ('Ģ', 'Ģ'), ('Ĥ', 'Ĥ'), ('Ħ', 'Ħ'), ('Ĩ', 'Ĩ'), ('Ī', 'Ī'), + ('Ĭ', 'Ĭ'), ('Į', 'Į'), ('İ', 'İ'), ('IJ', 'IJ'), ('Ĵ', 'Ĵ'), + ('Ķ', 'Ķ'), ('Ĺ', 'Ĺ'), ('Ļ', 'Ļ'), ('Ľ', 'Ľ'), ('Ŀ', 'Ŀ'), + ('Ł', 'Ł'), ('Ń', 'Ń'), ('Ņ', 'Ņ'), ('Ň', 'Ň'), ('Ŋ', 'Ŋ'), + ('Ō', 'Ō'), ('Ŏ', 'Ŏ'), ('Ő', 'Ő'), ('Œ', 'Œ'), ('Ŕ', 'Ŕ'), + ('Ŗ', 'Ŗ'), ('Ř', 'Ř'), ('Ś', 'Ś'), ('Ŝ', 'Ŝ'), ('Ş', 'Ş'), + ('Š', 'Š'), ('Ţ', 'Ţ'), ('Ť', 'Ť'), ('Ŧ', 'Ŧ'), ('Ũ', 'Ũ'), + ('Ū', 'Ū'), ('Ŭ', 'Ŭ'), ('Ů', 'Ů'), ('Ű', 'Ű'), ('Ų', 'Ų'), + ('Ŵ', 'Ŵ'), ('Ŷ', 'Ŷ'), ('Ÿ', 'Ź'), ('Ż', 'Ż'), ('Ž', 'Ž'), + ('Ɓ', 'Ƃ'), ('Ƅ', 'Ƅ'), ('Ɔ', 'Ƈ'), ('Ɖ', 'Ƌ'), ('Ǝ', 'Ƒ'), + ('Ɠ', 'Ɣ'), ('Ɩ', 'Ƙ'), ('Ɯ', 'Ɲ'), 
('Ɵ', 'Ơ'), ('Ƣ', 'Ƣ'), + ('Ƥ', 'Ƥ'), ('Ʀ', 'Ƨ'), ('Ʃ', 'Ʃ'), ('Ƭ', 'Ƭ'), ('Ʈ', 'Ư'), + ('Ʊ', 'Ƴ'), ('Ƶ', 'Ƶ'), ('Ʒ', 'Ƹ'), ('Ƽ', 'Ƽ'), ('DŽ', 'DŽ'), + ('LJ', 'LJ'), ('NJ', 'NJ'), ('Ǎ', 'Ǎ'), ('Ǐ', 'Ǐ'), ('Ǒ', 'Ǒ'), + ('Ǔ', 'Ǔ'), ('Ǖ', 'Ǖ'), ('Ǘ', 'Ǘ'), ('Ǚ', 'Ǚ'), ('Ǜ', 'Ǜ'), + ('Ǟ', 'Ǟ'), ('Ǡ', 'Ǡ'), ('Ǣ', 'Ǣ'), ('Ǥ', 'Ǥ'), ('Ǧ', 'Ǧ'), + ('Ǩ', 'Ǩ'), ('Ǫ', 'Ǫ'), ('Ǭ', 'Ǭ'), ('Ǯ', 'Ǯ'), ('DZ', 'DZ'), + ('Ǵ', 'Ǵ'), ('Ƕ', 'Ǹ'), ('Ǻ', 'Ǻ'), ('Ǽ', 'Ǽ'), ('Ǿ', 'Ǿ'), + ('Ȁ', 'Ȁ'), ('Ȃ', 'Ȃ'), ('Ȅ', 'Ȅ'), ('Ȇ', 'Ȇ'), ('Ȉ', 'Ȉ'), + ('Ȋ', 'Ȋ'), ('Ȍ', 'Ȍ'), ('Ȏ', 'Ȏ'), ('Ȑ', 'Ȑ'), ('Ȓ', 'Ȓ'), + ('Ȕ', 'Ȕ'), ('Ȗ', 'Ȗ'), ('Ș', 'Ș'), ('Ț', 'Ț'), ('Ȝ', 'Ȝ'), + ('Ȟ', 'Ȟ'), ('Ƞ', 'Ƞ'), ('Ȣ', 'Ȣ'), ('Ȥ', 'Ȥ'), ('Ȧ', 'Ȧ'), + ('Ȩ', 'Ȩ'), ('Ȫ', 'Ȫ'), ('Ȭ', 'Ȭ'), ('Ȯ', 'Ȯ'), ('Ȱ', 'Ȱ'), + ('Ȳ', 'Ȳ'), ('Ⱥ', 'Ȼ'), ('Ƚ', 'Ⱦ'), ('Ɂ', 'Ɂ'), ('Ƀ', 'Ɇ'), + ('Ɉ', 'Ɉ'), ('Ɋ', 'Ɋ'), ('Ɍ', 'Ɍ'), ('Ɏ', 'Ɏ'), ('Ͱ', 'Ͱ'), + ('Ͳ', 'Ͳ'), ('Ͷ', 'Ͷ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), + ('Ό', 'Ό'), ('Ύ', 'Ώ'), ('Α', 'Ρ'), ('Σ', 'Ϋ'), ('Ϗ', 'Ϗ'), + ('ϒ', 'ϔ'), ('Ϙ', 'Ϙ'), ('Ϛ', 'Ϛ'), ('Ϝ', 'Ϝ'), ('Ϟ', 'Ϟ'), + ('Ϡ', 'Ϡ'), ('Ϣ', 'Ϣ'), ('Ϥ', 'Ϥ'), ('Ϧ', 'Ϧ'), ('Ϩ', 'Ϩ'), + ('Ϫ', 'Ϫ'), ('Ϭ', 'Ϭ'), ('Ϯ', 'Ϯ'), ('ϴ', 'ϴ'), ('Ϸ', 'Ϸ'), + ('Ϲ', 'Ϻ'), ('Ͻ', 'Я'), ('Ѡ', 'Ѡ'), ('Ѣ', 'Ѣ'), ('Ѥ', 'Ѥ'), + ('Ѧ', 'Ѧ'), ('Ѩ', 'Ѩ'), ('Ѫ', 'Ѫ'), ('Ѭ', 'Ѭ'), ('Ѯ', 'Ѯ'), + ('Ѱ', 'Ѱ'), ('Ѳ', 'Ѳ'), ('Ѵ', 'Ѵ'), ('Ѷ', 'Ѷ'), ('Ѹ', 'Ѹ'), + ('Ѻ', 'Ѻ'), ('Ѽ', 'Ѽ'), ('Ѿ', 'Ѿ'), ('Ҁ', 'Ҁ'), ('Ҋ', 'Ҋ'), + ('Ҍ', 'Ҍ'), ('Ҏ', 'Ҏ'), ('Ґ', 'Ґ'), ('Ғ', 'Ғ'), ('Ҕ', 'Ҕ'), + ('Җ', 'Җ'), ('Ҙ', 'Ҙ'), ('Қ', 'Қ'), ('Ҝ', 'Ҝ'), ('Ҟ', 'Ҟ'), + ('Ҡ', 'Ҡ'), ('Ң', 'Ң'), ('Ҥ', 'Ҥ'), ('Ҧ', 'Ҧ'), ('Ҩ', 'Ҩ'), + ('Ҫ', 'Ҫ'), ('Ҭ', 'Ҭ'), ('Ү', 'Ү'), ('Ұ', 'Ұ'), ('Ҳ', 'Ҳ'), + ('Ҵ', 'Ҵ'), ('Ҷ', 'Ҷ'), ('Ҹ', 'Ҹ'), ('Һ', 'Һ'), ('Ҽ', 'Ҽ'), + ('Ҿ', 'Ҿ'), ('Ӏ', 'Ӂ'), ('Ӄ', 'Ӄ'), ('Ӆ', 'Ӆ'), ('Ӈ', 'Ӈ'), + ('Ӊ', 'Ӊ'), ('Ӌ', 'Ӌ'), ('Ӎ', 'Ӎ'), ('Ӑ', 'Ӑ'), ('Ӓ', 'Ӓ'), + ('Ӕ', 'Ӕ'), ('Ӗ', 'Ӗ'), ('Ә', 'Ә'), ('Ӛ', 'Ӛ'), ('Ӝ', 'Ӝ'), + ('Ӟ', 'Ӟ'), ('Ӡ', 'Ӡ'), ('Ӣ', 'Ӣ'), ('Ӥ', 
'Ӥ'), ('Ӧ', 'Ӧ'), + ('Ө', 'Ө'), ('Ӫ', 'Ӫ'), ('Ӭ', 'Ӭ'), ('Ӯ', 'Ӯ'), ('Ӱ', 'Ӱ'), + ('Ӳ', 'Ӳ'), ('Ӵ', 'Ӵ'), ('Ӷ', 'Ӷ'), ('Ӹ', 'Ӹ'), ('Ӻ', 'Ӻ'), + ('Ӽ', 'Ӽ'), ('Ӿ', 'Ӿ'), ('Ԁ', 'Ԁ'), ('Ԃ', 'Ԃ'), ('Ԅ', 'Ԅ'), + ('Ԇ', 'Ԇ'), ('Ԉ', 'Ԉ'), ('Ԋ', 'Ԋ'), ('Ԍ', 'Ԍ'), ('Ԏ', 'Ԏ'), + ('Ԑ', 'Ԑ'), ('Ԓ', 'Ԓ'), ('Ԕ', 'Ԕ'), ('Ԗ', 'Ԗ'), ('Ԙ', 'Ԙ'), + ('Ԛ', 'Ԛ'), ('Ԝ', 'Ԝ'), ('Ԟ', 'Ԟ'), ('Ԡ', 'Ԡ'), ('Ԣ', 'Ԣ'), + ('Ԥ', 'Ԥ'), ('Ԧ', 'Ԧ'), ('Ԩ', 'Ԩ'), ('Ԫ', 'Ԫ'), ('Ԭ', 'Ԭ'), + ('Ԯ', 'Ԯ'), ('Ա', 'Ֆ'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), + ('Ꭰ', 'Ᏽ'), ('Ḁ', 'Ḁ'), ('Ḃ', 'Ḃ'), ('Ḅ', 'Ḅ'), + ('Ḇ', 'Ḇ'), ('Ḉ', 'Ḉ'), ('Ḋ', 'Ḋ'), ('Ḍ', 'Ḍ'), + ('Ḏ', 'Ḏ'), ('Ḑ', 'Ḑ'), ('Ḓ', 'Ḓ'), ('Ḕ', 'Ḕ'), + ('Ḗ', 'Ḗ'), ('Ḙ', 'Ḙ'), ('Ḛ', 'Ḛ'), ('Ḝ', 'Ḝ'), + ('Ḟ', 'Ḟ'), ('Ḡ', 'Ḡ'), ('Ḣ', 'Ḣ'), ('Ḥ', 'Ḥ'), + ('Ḧ', 'Ḧ'), ('Ḩ', 'Ḩ'), ('Ḫ', 'Ḫ'), ('Ḭ', 'Ḭ'), + ('Ḯ', 'Ḯ'), ('Ḱ', 'Ḱ'), ('Ḳ', 'Ḳ'), ('Ḵ', 'Ḵ'), + ('Ḷ', 'Ḷ'), ('Ḹ', 'Ḹ'), ('Ḻ', 'Ḻ'), ('Ḽ', 'Ḽ'), + ('Ḿ', 'Ḿ'), ('Ṁ', 'Ṁ'), ('Ṃ', 'Ṃ'), ('Ṅ', 'Ṅ'), + ('Ṇ', 'Ṇ'), ('Ṉ', 'Ṉ'), ('Ṋ', 'Ṋ'), ('Ṍ', 'Ṍ'), + ('Ṏ', 'Ṏ'), ('Ṑ', 'Ṑ'), ('Ṓ', 'Ṓ'), ('Ṕ', 'Ṕ'), + ('Ṗ', 'Ṗ'), ('Ṙ', 'Ṙ'), ('Ṛ', 'Ṛ'), ('Ṝ', 'Ṝ'), + ('Ṟ', 'Ṟ'), ('Ṡ', 'Ṡ'), ('Ṣ', 'Ṣ'), ('Ṥ', 'Ṥ'), + ('Ṧ', 'Ṧ'), ('Ṩ', 'Ṩ'), ('Ṫ', 'Ṫ'), ('Ṭ', 'Ṭ'), + ('Ṯ', 'Ṯ'), ('Ṱ', 'Ṱ'), ('Ṳ', 'Ṳ'), ('Ṵ', 'Ṵ'), + ('Ṷ', 'Ṷ'), ('Ṹ', 'Ṹ'), ('Ṻ', 'Ṻ'), ('Ṽ', 'Ṽ'), + ('Ṿ', 'Ṿ'), ('Ẁ', 'Ẁ'), ('Ẃ', 'Ẃ'), ('Ẅ', 'Ẅ'), + ('Ẇ', 'Ẇ'), ('Ẉ', 'Ẉ'), ('Ẋ', 'Ẋ'), ('Ẍ', 'Ẍ'), + ('Ẏ', 'Ẏ'), ('Ẑ', 'Ẑ'), ('Ẓ', 'Ẓ'), ('Ẕ', 'Ẕ'), + ('ẞ', 'ẞ'), ('Ạ', 'Ạ'), ('Ả', 'Ả'), ('Ấ', 'Ấ'), + ('Ầ', 'Ầ'), ('Ẩ', 'Ẩ'), ('Ẫ', 'Ẫ'), ('Ậ', 'Ậ'), + ('Ắ', 'Ắ'), ('Ằ', 'Ằ'), ('Ẳ', 'Ẳ'), ('Ẵ', 'Ẵ'), + ('Ặ', 'Ặ'), ('Ẹ', 'Ẹ'), ('Ẻ', 'Ẻ'), ('Ẽ', 'Ẽ'), + ('Ế', 'Ế'), ('Ề', 'Ề'), ('Ể', 'Ể'), ('Ễ', 'Ễ'), + ('Ệ', 'Ệ'), ('Ỉ', 'Ỉ'), ('Ị', 'Ị'), ('Ọ', 'Ọ'), + ('Ỏ', 'Ỏ'), ('Ố', 'Ố'), ('Ồ', 'Ồ'), ('Ổ', 'Ổ'), + ('Ỗ', 'Ỗ'), ('Ộ', 'Ộ'), ('Ớ', 'Ớ'), ('Ờ', 'Ờ'), + ('Ở', 'Ở'), ('Ỡ', 'Ỡ'), ('Ợ', 'Ợ'), ('Ụ', 'Ụ'), + ('Ủ', 'Ủ'), ('Ứ', 'Ứ'), ('Ừ', 'Ừ'), ('Ử', 'Ử'), + ('Ữ', 'Ữ'), ('Ự', 'Ự'), ('Ỳ', 
'Ỳ'), ('Ỵ', 'Ỵ'), + ('Ỷ', 'Ỷ'), ('Ỹ', 'Ỹ'), ('Ỻ', 'Ỻ'), ('Ỽ', 'Ỽ'), + ('Ỿ', 'Ỿ'), ('Ἀ', 'Ἇ'), ('Ἐ', 'Ἕ'), ('Ἠ', 'Ἧ'), + ('Ἰ', 'Ἷ'), ('Ὀ', 'Ὅ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), ('Ὗ', 'Ὗ'), ('Ὠ', 'Ὧ'), ('Ᾰ', 'Ά'), + ('Ὲ', 'Ή'), ('Ῐ', 'Ί'), ('Ῠ', 'Ῥ'), ('Ὸ', 'Ώ'), + ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℋ', 'ℍ'), ('ℐ', 'ℒ'), + ('ℕ', 'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), + ('ℨ', 'ℨ'), ('K', 'ℭ'), ('ℰ', 'ℳ'), ('ℾ', 'ℿ'), + ('ⅅ', 'ⅅ'), ('Ⅰ', 'Ⅿ'), ('Ↄ', 'Ↄ'), ('Ⓐ', 'Ⓩ'), + ('Ⰰ', 'Ⱞ'), ('Ⱡ', 'Ⱡ'), ('Ɫ', 'Ɽ'), ('Ⱨ', 'Ⱨ'), + ('Ⱪ', 'Ⱪ'), ('Ⱬ', 'Ⱬ'), ('Ɑ', 'Ɒ'), ('Ⱳ', 'Ⱳ'), + ('Ⱶ', 'Ⱶ'), ('Ȿ', 'Ⲁ'), ('Ⲃ', 'Ⲃ'), ('Ⲅ', 'Ⲅ'), + ('Ⲇ', 'Ⲇ'), ('Ⲉ', 'Ⲉ'), ('Ⲋ', 'Ⲋ'), ('Ⲍ', 'Ⲍ'), + ('Ⲏ', 'Ⲏ'), ('Ⲑ', 'Ⲑ'), ('Ⲓ', 'Ⲓ'), ('Ⲕ', 'Ⲕ'), + ('Ⲗ', 'Ⲗ'), ('Ⲙ', 'Ⲙ'), ('Ⲛ', 'Ⲛ'), ('Ⲝ', 'Ⲝ'), + ('Ⲟ', 'Ⲟ'), ('Ⲡ', 'Ⲡ'), ('Ⲣ', 'Ⲣ'), ('Ⲥ', 'Ⲥ'), + ('Ⲧ', 'Ⲧ'), ('Ⲩ', 'Ⲩ'), ('Ⲫ', 'Ⲫ'), ('Ⲭ', 'Ⲭ'), + ('Ⲯ', 'Ⲯ'), ('Ⲱ', 'Ⲱ'), ('Ⲳ', 'Ⲳ'), ('Ⲵ', 'Ⲵ'), + ('Ⲷ', 'Ⲷ'), ('Ⲹ', 'Ⲹ'), ('Ⲻ', 'Ⲻ'), ('Ⲽ', 'Ⲽ'), + ('Ⲿ', 'Ⲿ'), ('Ⳁ', 'Ⳁ'), ('Ⳃ', 'Ⳃ'), ('Ⳅ', 'Ⳅ'), + ('Ⳇ', 'Ⳇ'), ('Ⳉ', 'Ⳉ'), ('Ⳋ', 'Ⳋ'), ('Ⳍ', 'Ⳍ'), + ('Ⳏ', 'Ⳏ'), ('Ⳑ', 'Ⳑ'), ('Ⳓ', 'Ⳓ'), ('Ⳕ', 'Ⳕ'), + ('Ⳗ', 'Ⳗ'), ('Ⳙ', 'Ⳙ'), ('Ⳛ', 'Ⳛ'), ('Ⳝ', 'Ⳝ'), + ('Ⳟ', 'Ⳟ'), ('Ⳡ', 'Ⳡ'), ('Ⳣ', 'Ⳣ'), ('Ⳬ', 'Ⳬ'), + ('Ⳮ', 'Ⳮ'), ('Ⳳ', 'Ⳳ'), ('Ꙁ', 'Ꙁ'), ('Ꙃ', 'Ꙃ'), + ('Ꙅ', 'Ꙅ'), ('Ꙇ', 'Ꙇ'), ('Ꙉ', 'Ꙉ'), ('Ꙋ', 'Ꙋ'), + ('Ꙍ', 'Ꙍ'), ('Ꙏ', 'Ꙏ'), ('Ꙑ', 'Ꙑ'), ('Ꙓ', 'Ꙓ'), + ('Ꙕ', 'Ꙕ'), ('Ꙗ', 'Ꙗ'), ('Ꙙ', 'Ꙙ'), ('Ꙛ', 'Ꙛ'), + ('Ꙝ', 'Ꙝ'), ('Ꙟ', 'Ꙟ'), ('Ꙡ', 'Ꙡ'), ('Ꙣ', 'Ꙣ'), + ('Ꙥ', 'Ꙥ'), ('Ꙧ', 'Ꙧ'), ('Ꙩ', 'Ꙩ'), ('Ꙫ', 'Ꙫ'), + ('Ꙭ', 'Ꙭ'), ('Ꚁ', 'Ꚁ'), ('Ꚃ', 'Ꚃ'), ('Ꚅ', 'Ꚅ'), + ('Ꚇ', 'Ꚇ'), ('Ꚉ', 'Ꚉ'), ('Ꚋ', 'Ꚋ'), ('Ꚍ', 'Ꚍ'), + ('Ꚏ', 'Ꚏ'), ('Ꚑ', 'Ꚑ'), ('Ꚓ', 'Ꚓ'), ('Ꚕ', 'Ꚕ'), + ('Ꚗ', 'Ꚗ'), ('Ꚙ', 'Ꚙ'), ('Ꚛ', 'Ꚛ'), ('Ꜣ', 'Ꜣ'), + ('Ꜥ', 'Ꜥ'), ('Ꜧ', 'Ꜧ'), ('Ꜩ', 'Ꜩ'), ('Ꜫ', 'Ꜫ'), + ('Ꜭ', 'Ꜭ'), ('Ꜯ', 'Ꜯ'), ('Ꜳ', 'Ꜳ'), ('Ꜵ', 'Ꜵ'), + ('Ꜷ', 'Ꜷ'), ('Ꜹ', 'Ꜹ'), ('Ꜻ', 'Ꜻ'), ('Ꜽ', 'Ꜽ'), + ('Ꜿ', 'Ꜿ'), ('Ꝁ', 'Ꝁ'), ('Ꝃ', 'Ꝃ'), ('Ꝅ', 'Ꝅ'), + ('Ꝇ', 'Ꝇ'), ('Ꝉ', 'Ꝉ'), ('Ꝋ', 'Ꝋ'), ('Ꝍ', 'Ꝍ'), + ('Ꝏ', 'Ꝏ'), ('Ꝑ', 'Ꝑ'), ('Ꝓ', 
'Ꝓ'), ('Ꝕ', 'Ꝕ'), + ('Ꝗ', 'Ꝗ'), ('Ꝙ', 'Ꝙ'), ('Ꝛ', 'Ꝛ'), ('Ꝝ', 'Ꝝ'), + ('Ꝟ', 'Ꝟ'), ('Ꝡ', 'Ꝡ'), ('Ꝣ', 'Ꝣ'), ('Ꝥ', 'Ꝥ'), + ('Ꝧ', 'Ꝧ'), ('Ꝩ', 'Ꝩ'), ('Ꝫ', 'Ꝫ'), ('Ꝭ', 'Ꝭ'), + ('Ꝯ', 'Ꝯ'), ('Ꝺ', 'Ꝺ'), ('Ꝼ', 'Ꝼ'), ('Ᵹ', 'Ꝿ'), + ('Ꞁ', 'Ꞁ'), ('Ꞃ', 'Ꞃ'), ('Ꞅ', 'Ꞅ'), ('Ꞇ', 'Ꞇ'), + ('Ꞌ', 'Ꞌ'), ('Ɥ', 'Ɥ'), ('Ꞑ', 'Ꞑ'), ('Ꞓ', 'Ꞓ'), + ('Ꞗ', 'Ꞗ'), ('Ꞙ', 'Ꞙ'), ('Ꞛ', 'Ꞛ'), ('Ꞝ', 'Ꞝ'), + ('Ꞟ', 'Ꞟ'), ('Ꞡ', 'Ꞡ'), ('Ꞣ', 'Ꞣ'), ('Ꞥ', 'Ꞥ'), + ('Ꞧ', 'Ꞧ'), ('Ꞩ', 'Ꞩ'), ('Ɦ', 'Ɪ'), ('Ʞ', 'Ꞵ'), + ('Ꞷ', 'Ꞷ'), ('A', 'Z'), ('𐐀', '𐐧'), ('𐒰', '𐓓'), + ('𐲀', '𐲲'), ('𑢠', '𑢿'), ('𝐀', '𝐙'), ('𝐴', '𝑍'), + ('𝑨', '𝒁'), ('𝒜', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), + ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒵'), ('𝓐', '𝓩'), + ('𝔄', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), + ('𝔸', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), + ('𝕊', '𝕐'), ('𝕬', '𝖅'), ('𝖠', '𝖹'), ('𝗔', '𝗭'), + ('𝘈', '𝘡'), ('𝘼', '𝙕'), ('𝙰', '𝚉'), ('𝚨', '𝛀'), + ('𝛢', '𝛺'), ('𝜜', '𝜴'), ('𝝖', '𝝮'), ('𝞐', '𝞨'), + ('𝟊', '𝟊'), ('𞤀', '𞤡'), ('🄰', '🅉'), ('🅐', '🅩'), + ('🅰', '🆉'), +]; + +pub const VARIATION_SELECTOR: &'static [(char, char)] = &[ + ('᠋', '᠍'), ('︀', '️'), ('󠄀', '󠇯'), +]; + +pub const WHITE_SPACE: &'static [(char, char)] = &[ + ('\t', '\r'), (' ', ' '), ('\u{85}', '\u{85}'), ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'), + ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'), +]; + +pub const XID_CONTINUE: &'static [(char, char)] = &[ + ('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), + ('·', '·'), ('º', 'º'), ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), + ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('̀', 'ʹ'), + ('Ͷ', 'ͷ'), ('ͻ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ί'), ('Ό', 'Ό'), + ('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('҃', '҇'), ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), ('ՙ', 'ՙ'), ('ա', 'և'), ('֑', 'ֽ'), ('ֿ', 'ֿ'), + ('ׁ', 'ׂ'), ('ׄ', 'ׅ'), ('ׇ', 'ׇ'), ('א', 'ת'), ('װ', 'ײ'), + ('ؐ', 'ؚ'), ('ؠ', '٩'), ('ٮ', 'ۓ'), ('ە', 'ۜ'), ('۟', 'ۨ'), + ('۪', 'ۼ'), ('ۿ', 'ۿ'), ('ܐ', '݊'), ('ݍ', 'ޱ'), 
('߀', 'ߵ'), + ('ߺ', 'ߺ'), ('ࠀ', '࠭'), ('ࡀ', '࡛'), ('ࡠ', 'ࡪ'), + ('ࢠ', 'ࢴ'), ('ࢶ', 'ࢽ'), ('ࣔ', '࣡'), ('ࣣ', 'ॣ'), + ('०', '९'), ('ॱ', 'ঃ'), ('অ', 'ঌ'), ('এ', 'ঐ'), + ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), + ('়', 'ৄ'), ('ে', 'ৈ'), ('ো', 'ৎ'), ('ৗ', 'ৗ'), + ('ড়', 'ঢ়'), ('য়', 'ৣ'), ('০', 'ৱ'), ('ৼ', 'ৼ'), + ('ਁ', 'ਃ'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), + ('਼', '਼'), ('ਾ', 'ੂ'), ('ੇ', 'ੈ'), ('ੋ', '੍'), + ('ੑ', 'ੑ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('੦', 'ੵ'), + ('ઁ', 'ઃ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), + ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('઼', 'ૅ'), + ('ે', 'ૉ'), ('ો', '્'), ('ૐ', 'ૐ'), ('ૠ', 'ૣ'), + ('૦', '૯'), ('ૹ', '૿'), ('ଁ', 'ଃ'), ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), ('଼', 'ୄ'), ('େ', 'ୈ'), ('ୋ', '୍'), + ('ୖ', 'ୗ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୣ'), ('୦', '୯'), + ('ୱ', 'ୱ'), ('ஂ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), + ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), + ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('ா', 'ூ'), + ('ெ', 'ை'), ('ொ', '்'), ('ௐ', 'ௐ'), ('ௗ', 'ௗ'), + ('௦', '௯'), ('ఀ', 'ః'), ('అ', 'ఌ'), ('ఎ', 'ఐ'), + ('ఒ', 'న'), ('ప', 'హ'), ('ఽ', 'ౄ'), ('ె', 'ై'), + ('ొ', '్'), ('ౕ', 'ౖ'), ('ౘ', 'ౚ'), ('ౠ', 'ౣ'), + ('౦', '౯'), ('ಀ', 'ಃ'), ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('಼', 'ೄ'), + ('ೆ', 'ೈ'), ('ೊ', '್'), ('ೕ', 'ೖ'), ('ೞ', 'ೞ'), + ('ೠ', 'ೣ'), ('೦', '೯'), ('ೱ', 'ೲ'), ('ഀ', 'ഃ'), + ('അ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ൄ'), ('െ', 'ൈ'), + ('ൊ', 'ൎ'), ('ൔ', 'ൗ'), ('ൟ', 'ൣ'), ('൦', '൯'), + ('ൺ', 'ൿ'), ('ං', 'ඃ'), ('අ', 'ඖ'), ('ක', 'න'), + ('ඳ', 'ර'), ('ල', 'ල'), ('ව', 'ෆ'), ('්', '්'), + ('ා', 'ු'), ('ූ', 'ූ'), ('ෘ', 'ෟ'), ('෦', '෯'), + ('ෲ', 'ෳ'), ('ก', 'ฺ'), ('เ', '๎'), ('๐', '๙'), + ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ງ', 'ຈ'), ('ຊ', 'ຊ'), + ('ຍ', 'ຍ'), ('ດ', 'ທ'), ('ນ', 'ຟ'), ('ມ', 'ຣ'), + ('ລ', 'ລ'), ('ວ', 'ວ'), ('ສ', 'ຫ'), ('ອ', 'ູ'), + ('ົ', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('່', 'ໍ'), + ('໐', '໙'), ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('༘', '༙'), + ('༠', '༩'), ('༵', '༵'), 
('༷', '༷'), ('༹', '༹'), + ('༾', 'ཇ'), ('ཉ', 'ཬ'), ('ཱ', '྄'), ('྆', 'ྗ'), + ('ྙ', 'ྼ'), ('࿆', '࿆'), ('က', '၉'), ('ၐ', 'ႝ'), + ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), + ('ჼ', 'ቈ'), ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), + ('፝', '፟'), ('፩', '፱'), ('ᎀ', 'ᎏ'), ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), ('ᜀ', 'ᜌ'), ('ᜎ', '᜔'), + ('ᜠ', '᜴'), ('ᝀ', 'ᝓ'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), + ('ᝲ', 'ᝳ'), ('ក', '៓'), ('ៗ', 'ៗ'), ('ៜ', '៝'), + ('០', '៩'), ('᠋', '᠍'), ('᠐', '᠙'), ('ᠠ', 'ᡷ'), + ('ᢀ', 'ᢪ'), ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('ᤠ', 'ᤫ'), + ('ᤰ', '᤻'), ('᥆', 'ᥭ'), ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), ('᧐', '᧚'), ('ᨀ', 'ᨛ'), ('ᨠ', 'ᩞ'), + ('᩠', '᩼'), ('᩿', '᪉'), ('᪐', '᪙'), ('ᪧ', 'ᪧ'), + ('᪰', '᪽'), ('ᬀ', 'ᭋ'), ('᭐', '᭙'), ('᭫', '᭳'), + ('ᮀ', '᯳'), ('ᰀ', '᰷'), ('᱀', '᱉'), ('ᱍ', 'ᱽ'), + ('ᲀ', 'ᲈ'), ('᳐', '᳒'), ('᳔', '᳹'), ('ᴀ', '᷹'), + ('᷻', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), ('ι', 'ι'), + ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), ('‿', '⁀'), + ('⁔', '⁔'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), + ('⃐', '⃜'), ('⃡', '⃡'), ('⃥', '⃰'), ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('℘', 'ℝ'), + ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℹ'), + ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), + ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), ('Ⱡ', 'ⳤ'), ('Ⳬ', 'ⳳ'), + ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), ('⵿', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('ⷠ', 'ⷿ'), ('々', '〇'), + ('〡', '〯'), ('〱', '〵'), ('〸', '〼'), ('ぁ', 'ゖ'), + ('゙', '゚'), ('ゝ', 'ゟ'), ('ァ', 'ヺ'), ('ー', 'ヿ'), + ('ㄅ', 'ㄮ'), ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆺ'), ('ㇰ', 'ㇿ'), + ('㐀', '䶵'), ('一', '鿪'), ('ꀀ', 'ꒌ'), ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), ('ꘐ', 'ꘫ'), 
('Ꙁ', '꙯'), ('ꙴ', '꙽'), + ('ꙿ', '꛱'), ('ꜗ', 'ꜟ'), ('Ꜣ', 'ꞈ'), ('Ꞌ', 'Ɪ'), + ('Ʞ', 'ꞷ'), ('ꟷ', 'ꠧ'), ('ꡀ', 'ꡳ'), ('ꢀ', 'ꣅ'), + ('꣐', '꣙'), ('꣠', 'ꣷ'), ('ꣻ', 'ꣻ'), ('ꣽ', 'ꣽ'), + ('꤀', '꤭'), ('ꤰ', '꥓'), ('ꥠ', 'ꥼ'), ('ꦀ', '꧀'), + ('ꧏ', '꧙'), ('ꧠ', 'ꧾ'), ('ꨀ', 'ꨶ'), ('ꩀ', 'ꩍ'), + ('꩐', '꩙'), ('ꩠ', 'ꩶ'), ('ꩺ', 'ꫂ'), ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫯ'), ('ꫲ', '꫶'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭥ'), ('ꭰ', 'ꯪ'), ('꯬', '꯭'), ('꯰', '꯹'), + ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('豈', '舘'), + ('並', '龎'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('יִ', 'ﬨ'), + ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), + ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', 'ﱝ'), ('ﱤ', 'ﴽ'), + ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷹ'), ('︀', '️'), + ('︠', '︯'), ('︳', '︴'), ('﹍', '﹏'), ('ﹱ', 'ﹱ'), + ('ﹳ', 'ﹳ'), ('ﹷ', 'ﹷ'), ('ﹹ', 'ﹹ'), ('ﹻ', 'ﹻ'), + ('ﹽ', 'ﹽ'), ('ﹿ', 'ﻼ'), ('0', '9'), ('A', 'Z'), + ('_', '_'), ('a', 'z'), ('ヲ', 'ᄒ'), ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('𐀀', '𐀋'), + ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), + ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐅀', '𐅴'), ('𐇽', '𐇽'), + ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('𐋠', '𐋠'), ('𐌀', '𐌟'), + ('𐌭', '𐍊'), ('𐍐', '𐍺'), ('𐎀', '𐎝'), ('𐎠', '𐏃'), + ('𐏈', '𐏏'), ('𐏑', '𐏕'), ('𐐀', '𐒝'), ('𐒠', '𐒩'), + ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐔀', '𐔧'), ('𐔰', '𐕣'), + ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐠀', '𐠅'), + ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), + ('𐠿', '𐡕'), ('𐡠', '𐡶'), ('𐢀', '𐢞'), ('𐣠', '𐣲'), + ('𐣴', '𐣵'), ('𐤀', '𐤕'), ('𐤠', '𐤹'), ('𐦀', '𐦷'), + ('𐦾', '𐦿'), ('𐨀', '𐨃'), ('𐨅', '𐨆'), ('𐨌', '𐨓'), + ('𐨕', '𐨗'), ('𐨙', '𐨳'), ('𐨸', '𐨺'), ('𐨿', '𐨿'), + ('𐩠', '𐩼'), ('𐪀', '𐪜'), ('𐫀', '𐫇'), ('𐫉', '𐫦'), + ('𐬀', '𐬵'), ('𐭀', '𐭕'), ('𐭠', '𐭲'), ('𐮀', '𐮑'), + ('𐰀', '𐱈'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𑀀', '𑁆'), + ('𑁦', '𑁯'), ('𑁿', '𑂺'), ('𑃐', '𑃨'), ('𑃰', '𑃹'), + ('𑄀', '𑄴'), ('𑄶', '𑄿'), ('𑅐', '𑅳'), ('𑅶', '𑅶'), + ('𑆀', '𑇄'), ('𑇊', '𑇌'), ('𑇐', '𑇚'), ('𑇜', '𑇜'), + ('𑈀', '𑈑'), ('𑈓', '𑈷'), ('𑈾', '𑈾'), ('𑊀', '𑊆'), + ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊨'), + ('𑊰', 
'𑋪'), ('𑋰', '𑋹'), ('𑌀', '𑌃'), ('𑌅', '𑌌'), + ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), + ('𑌵', '𑌹'), ('𑌼', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍍'), + ('𑍐', '𑍐'), ('𑍗', '𑍗'), ('𑍝', '𑍣'), ('𑍦', '𑍬'), + ('𑍰', '𑍴'), ('𑐀', '𑑊'), ('𑑐', '𑑙'), ('𑒀', '𑓅'), + ('𑓇', '𑓇'), ('𑓐', '𑓙'), ('𑖀', '𑖵'), ('𑖸', '𑗀'), + ('𑗘', '𑗝'), ('𑘀', '𑙀'), ('𑙄', '𑙄'), ('𑙐', '𑙙'), + ('𑚀', '𑚷'), ('𑛀', '𑛉'), ('𑜀', '𑜙'), ('𑜝', '𑜫'), + ('𑜰', '𑜹'), ('𑢠', '𑣩'), ('𑣿', '𑣿'), ('𑨀', '𑨾'), + ('𑩇', '𑩇'), ('𑩐', '𑪃'), ('𑪆', '𑪙'), ('𑫀', '𑫸'), + ('𑰀', '𑰈'), ('𑰊', '𑰶'), ('𑰸', '𑱀'), ('𑱐', '𑱙'), + ('𑱲', '𑲏'), ('𑲒', '𑲧'), ('𑲩', '𑲶'), ('𑴀', '𑴆'), + ('𑴈', '𑴉'), ('𑴋', '𑴶'), ('𑴺', '𑴺'), ('𑴼', '𑴽'), + ('𑴿', '𑵇'), ('𑵐', '𑵙'), ('𒀀', '𒎙'), ('𒐀', '𒑮'), + ('𒒀', '𒕃'), ('𓀀', '𓐮'), ('𔐀', '𔙆'), ('𖠀', '𖨸'), + ('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖫐', '𖫭'), ('𖫰', '𖫴'), + ('𖬀', '𖬶'), ('𖭀', '𖭃'), ('𖭐', '𖭙'), ('𖭣', '𖭷'), + ('𖭽', '𖮏'), ('𖼀', '𖽄'), ('𖽐', '𖽾'), ('𖾏', '𖾟'), + ('𖿠', '𖿡'), ('𗀀', '𘟬'), ('𘠀', '𘫲'), ('𛀀', '𛄞'), + ('𛅰', '𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), + ('𛲐', '𛲙'), ('𛲝', '𛲞'), ('𝅥', '𝅩'), ('𝅭', '𝅲'), + ('𝅻', '𝆂'), ('𝆅', '𝆋'), ('𝆪', '𝆭'), ('𝉂', '𝉄'), + ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), + ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), + ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), + ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), + ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝛀'), + ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), ('𝜖', '𝜴'), + ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), ('𝞊', '𝞨'), + ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𝟎', '𝟿'), ('𝨀', '𝨶'), + ('𝨻', '𝩬'), ('𝩵', '𝩵'), ('𝪄', '𝪄'), ('𝪛', '𝪟'), + ('𝪡', '𝪯'), ('𞀀', '𞀆'), ('𞀈', '𞀘'), ('𞀛', '𞀡'), + ('𞀣', '𞀤'), ('𞀦', '𞀪'), ('𞠀', '𞣄'), ('𞣐', '𞣖'), + ('𞤀', '𞥊'), ('𞥐', '𞥙'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), + ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), + ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), + ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), + ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), + ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), + ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), + ('𞹹', 
'𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), + ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𠀀', '𪛖'), + ('𪜀', '𫜴'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), + ('丽', '𪘀'), ('󠄀', '󠇯'), +]; + +pub const XID_START: &'static [(char, char)] = &[ + ('A', 'Z'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('º', 'º'), + ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('Ͱ', 'ʹ'), ('Ͷ', 'ͷ'), ('ͻ', 'ͽ'), + ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('Ҋ', 'ԯ'), ('Ա', 'Ֆ'), ('ՙ', 'ՙ'), + ('ա', 'և'), ('א', 'ת'), ('װ', 'ײ'), ('ؠ', 'ي'), ('ٮ', 'ٯ'), + ('ٱ', 'ۓ'), ('ە', 'ە'), ('ۥ', 'ۦ'), ('ۮ', 'ۯ'), ('ۺ', 'ۼ'), + ('ۿ', 'ۿ'), ('ܐ', 'ܐ'), ('ܒ', 'ܯ'), ('ݍ', 'ޥ'), ('ޱ', 'ޱ'), + ('ߊ', 'ߪ'), ('ߴ', 'ߵ'), ('ߺ', 'ߺ'), ('ࠀ', 'ࠕ'), ('ࠚ', 'ࠚ'), + ('ࠤ', 'ࠤ'), ('ࠨ', 'ࠨ'), ('ࡀ', 'ࡘ'), ('ࡠ', 'ࡪ'), + ('ࢠ', 'ࢴ'), ('ࢶ', 'ࢽ'), ('ऄ', 'ह'), ('ऽ', 'ऽ'), + ('ॐ', 'ॐ'), ('क़', 'ॡ'), ('ॱ', 'ঀ'), ('অ', 'ঌ'), + ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), + ('শ', 'হ'), ('ঽ', 'ঽ'), ('ৎ', 'ৎ'), ('ড়', 'ঢ়'), + ('য়', 'ৡ'), ('ৰ', 'ৱ'), ('ৼ', 'ৼ'), ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), + ('ੲ', 'ੴ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), + ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('ઽ', 'ઽ'), + ('ૐ', 'ૐ'), ('ૠ', 'ૡ'), ('ૹ', 'ૹ'), ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), ('ଽ', 'ଽ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୡ'), + ('ୱ', 'ୱ'), ('ஃ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), + ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), + ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('ௐ', 'ௐ'), + ('అ', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), + ('ఽ', 'ఽ'), ('ౘ', 'ౚ'), ('ౠ', 'ౡ'), ('ಀ', 'ಀ'), + ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), ('ಽ', 'ಽ'), ('ೞ', 'ೞ'), ('ೠ', 'ೡ'), + ('ೱ', 'ೲ'), ('അ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ഺ'), + ('ഽ', 'ഽ'), ('ൎ', 'ൎ'), ('ൔ', 'ൖ'), ('ൟ', 'ൡ'), + ('ൺ', 'ൿ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), + ('ල', 'ල'), ('ව', 'ෆ'), ('ก', 'ะ'), ('า', 'า'), + 
('เ', 'ๆ'), ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ງ', 'ຈ'), + ('ຊ', 'ຊ'), ('ຍ', 'ຍ'), ('ດ', 'ທ'), ('ນ', 'ຟ'), + ('ມ', 'ຣ'), ('ລ', 'ລ'), ('ວ', 'ວ'), ('ສ', 'ຫ'), + ('ອ', 'ະ'), ('າ', 'າ'), ('ຽ', 'ຽ'), ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('ཀ', 'ཇ'), + ('ཉ', 'ཬ'), ('ྈ', 'ྌ'), ('က', 'ဪ'), ('ဿ', 'ဿ'), + ('ၐ', 'ၕ'), ('ၚ', 'ၝ'), ('ၡ', 'ၡ'), ('ၥ', 'ၦ'), + ('ၮ', 'ၰ'), ('ၵ', 'ႁ'), ('ႎ', 'ႎ'), ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), + ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), ('ᜀ', 'ᜌ'), + ('ᜎ', 'ᜑ'), ('ᜠ', 'ᜱ'), ('ᝀ', 'ᝑ'), ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), ('ក', 'ឳ'), ('ៗ', 'ៗ'), ('ៜ', 'ៜ'), + ('ᠠ', 'ᡷ'), ('ᢀ', 'ᢨ'), ('ᢪ', 'ᢪ'), ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), ('ᨀ', 'ᨖ'), ('ᨠ', 'ᩔ'), ('ᪧ', 'ᪧ'), + ('ᬅ', 'ᬳ'), ('ᭅ', 'ᭋ'), ('ᮃ', 'ᮠ'), ('ᮮ', 'ᮯ'), + ('ᮺ', 'ᯥ'), ('ᰀ', 'ᰣ'), ('ᱍ', 'ᱏ'), ('ᱚ', 'ᱽ'), + ('ᲀ', 'ᲈ'), ('ᳩ', 'ᳬ'), ('ᳮ', 'ᳱ'), ('ᳵ', 'ᳶ'), + ('ᴀ', 'ᶿ'), ('Ḁ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), + ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), + ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('℘', 'ℝ'), + ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℹ'), + ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), + ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), ('Ⱡ', 'ⳤ'), ('Ⳬ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), ('ⵯ', 'ⵯ'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('々', '〇'), + ('〡', '〩'), ('〱', '〵'), ('〸', '〼'), ('ぁ', 'ゖ'), + ('ゝ', 'ゟ'), ('ァ', 'ヺ'), ('ー', 'ヿ'), ('ㄅ', 'ㄮ'), + ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆺ'), ('ㇰ', 'ㇿ'), ('㐀', '䶵'), + 
('一', '鿪'), ('ꀀ', 'ꒌ'), ('ꓐ', 'ꓽ'), ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘟ'), ('ꘪ', 'ꘫ'), ('Ꙁ', 'ꙮ'), ('ꙿ', 'ꚝ'), + ('ꚠ', 'ꛯ'), ('ꜗ', 'ꜟ'), ('Ꜣ', 'ꞈ'), ('Ꞌ', 'Ɪ'), + ('Ʞ', 'ꞷ'), ('ꟷ', 'ꠁ'), ('ꠃ', 'ꠅ'), ('ꠇ', 'ꠊ'), + ('ꠌ', 'ꠢ'), ('ꡀ', 'ꡳ'), ('ꢂ', 'ꢳ'), ('ꣲ', 'ꣷ'), + ('ꣻ', 'ꣻ'), ('ꣽ', 'ꣽ'), ('ꤊ', 'ꤥ'), ('ꤰ', 'ꥆ'), + ('ꥠ', 'ꥼ'), ('ꦄ', 'ꦲ'), ('ꧏ', 'ꧏ'), ('ꧠ', 'ꧤ'), + ('ꧦ', 'ꧯ'), ('ꧺ', 'ꧾ'), ('ꨀ', 'ꨨ'), ('ꩀ', 'ꩂ'), + ('ꩄ', 'ꩋ'), ('ꩠ', 'ꩶ'), ('ꩺ', 'ꩺ'), ('ꩾ', 'ꪯ'), + ('ꪱ', 'ꪱ'), ('ꪵ', 'ꪶ'), ('ꪹ', 'ꪽ'), ('ꫀ', 'ꫀ'), + ('ꫂ', 'ꫂ'), ('ꫛ', 'ꫝ'), ('ꫠ', 'ꫪ'), ('ꫲ', 'ꫴ'), + ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭥ'), ('ꭰ', 'ꯢ'), + ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('豈', '舘'), + ('並', '龎'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('יִ', 'יִ'), + ('ײַ', 'ﬨ'), ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), + ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', 'ﱝ'), + ('ﱤ', 'ﴽ'), ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷹ'), + ('ﹱ', 'ﹱ'), ('ﹳ', 'ﹳ'), ('ﹷ', 'ﹷ'), ('ﹹ', 'ﹹ'), + ('ﹻ', 'ﹻ'), ('ﹽ', 'ﹽ'), ('ﹿ', 'ﻼ'), ('A', 'Z'), + ('a', 'z'), ('ヲ', 'ン'), ('ᅠ', 'ᄒ'), ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('𐀀', '𐀋'), + ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), + ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐅀', '𐅴'), ('𐊀', '𐊜'), + ('𐊠', '𐋐'), ('𐌀', '𐌟'), ('𐌭', '𐍊'), ('𐍐', '𐍵'), + ('𐎀', '𐎝'), ('𐎠', '𐏃'), ('𐏈', '𐏏'), ('𐏑', '𐏕'), + ('𐐀', '𐒝'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐔀', '𐔧'), + ('𐔰', '𐕣'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), + ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), + ('𐠼', '𐠼'), ('𐠿', '𐡕'), ('𐡠', '𐡶'), ('𐢀', '𐢞'), + ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐤀', '𐤕'), ('𐤠', '𐤹'), + ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𐨀', '𐨀'), ('𐨐', '𐨓'), + ('𐨕', '𐨗'), ('𐨙', '𐨳'), ('𐩠', '𐩼'), ('𐪀', '𐪜'), + ('𐫀', '𐫇'), ('𐫉', '𐫤'), ('𐬀', '𐬵'), ('𐭀', '𐭕'), + ('𐭠', '𐭲'), ('𐮀', '𐮑'), ('𐰀', '𐱈'), ('𐲀', '𐲲'), + ('𐳀', '𐳲'), ('𑀃', '𑀷'), ('𑂃', '𑂯'), ('𑃐', '𑃨'), + ('𑄃', '𑄦'), ('𑅐', '𑅲'), ('𑅶', '𑅶'), ('𑆃', '𑆲'), + ('𑇁', '𑇄'), ('𑇚', '𑇚'), ('𑇜', '𑇜'), ('𑈀', '𑈑'), + ('𑈓', '𑈫'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), + ('𑊏', '𑊝'), ('𑊟', '𑊨'), ('𑊰', 
'𑋞'), ('𑌅', '𑌌'), + ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), + ('𑌵', '𑌹'), ('𑌽', '𑌽'), ('𑍐', '𑍐'), ('𑍝', '𑍡'), + ('𑐀', '𑐴'), ('𑑇', '𑑊'), ('𑒀', '𑒯'), ('𑓄', '𑓅'), + ('𑓇', '𑓇'), ('𑖀', '𑖮'), ('𑗘', '𑗛'), ('𑘀', '𑘯'), + ('𑙄', '𑙄'), ('𑚀', '𑚪'), ('𑜀', '𑜙'), ('𑢠', '𑣟'), + ('𑣿', '𑣿'), ('𑨀', '𑨀'), ('𑨋', '𑨲'), ('𑨺', '𑨺'), + ('𑩐', '𑩐'), ('𑩜', '𑪃'), ('𑪆', '𑪉'), ('𑫀', '𑫸'), + ('𑰀', '𑰈'), ('𑰊', '𑰮'), ('𑱀', '𑱀'), ('𑱲', '𑲏'), + ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '𑴰'), ('𑵆', '𑵆'), + ('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒒀', '𒕃'), ('𓀀', '𓐮'), + ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), ('𖫐', '𖫭'), + ('𖬀', '𖬯'), ('𖭀', '𖭃'), ('𖭣', '𖭷'), ('𖭽', '𖮏'), + ('𖼀', '𖽄'), ('𖽐', '𖽐'), ('𖾓', '𖾟'), ('𖿠', '𖿡'), + ('𗀀', '𘟬'), ('𘠀', '𘫲'), ('𛀀', '𛄞'), ('𛅰', '𛋻'), + ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), + ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), + ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), + ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), + ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), + ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝛀'), + ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), ('𝜖', '𝜴'), + ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), ('𝞊', '𝞨'), + ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𞠀', '𞣄'), ('𞤀', '𞥃'), + ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), + ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), + ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), + ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), + ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), + ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), + ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), + ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), + ('𞺫', '𞺻'), ('𠀀', '𪛖'), ('𪜀', '𫜴'), ('𫝀', '𫠝'), + ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), +]; diff --git a/regex-syntax-2/src/unicode_tables/property_names.rs b/regex-syntax-2/src/unicode_tables/property_names.rs new file mode 100644 index 0000000000..1d1032d337 --- /dev/null +++ b/regex-syntax-2/src/unicode_tables/property_names.rs @@ -0,0 +1,146 @@ +// DO NOT EDIT THIS FILE. 
IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate property-names tmp/ucd-10.0.0/ +// +// ucd-generate is available on crates.io. + +pub const PROPERTY_NAMES: &'static [(&'static str, &'static str)] = &[ + ("age", "Age"), ("ahex", "ASCII_Hex_Digit"), ("alpha", "Alphabetic"), + ("alphabetic", "Alphabetic"), ("asciihexdigit", "ASCII_Hex_Digit"), + ("bc", "Bidi_Class"), ("bidic", "Bidi_Control"), + ("bidiclass", "Bidi_Class"), ("bidicontrol", "Bidi_Control"), + ("bidim", "Bidi_Mirrored"), ("bidimirrored", "Bidi_Mirrored"), + ("bidimirroringglyph", "Bidi_Mirroring_Glyph"), + ("bidipairedbracket", "Bidi_Paired_Bracket"), + ("bidipairedbrackettype", "Bidi_Paired_Bracket_Type"), ("blk", "Block"), + ("block", "Block"), ("bmg", "Bidi_Mirroring_Glyph"), + ("bpb", "Bidi_Paired_Bracket"), ("bpt", "Bidi_Paired_Bracket_Type"), + ("c", "ISO_Comment"), + ("canonicalcombiningclass", "Canonical_Combining_Class"), + ("cased", "Cased"), ("casefolding", "Case_Folding"), + ("caseignorable", "Case_Ignorable"), ("ccc", "Canonical_Combining_Class"), + ("ce", "Composition_Exclusion"), ("cf", "Case_Folding"), + ("changeswhencasefolded", "Changes_When_Casefolded"), + ("changeswhencasemapped", "Changes_When_Casemapped"), + ("changeswhenlowercased", "Changes_When_Lowercased"), + ("changeswhennfkccasefolded", "Changes_When_NFKC_Casefolded"), + ("changeswhentitlecased", "Changes_When_Titlecased"), + ("changeswhenuppercased", "Changes_When_Uppercased"), + ("ci", "Case_Ignorable"), ("cjkaccountingnumeric", "kAccountingNumeric"), + ("cjkcompatibilityvariant", "kCompatibilityVariant"), + ("cjkiicore", "kIICore"), ("cjkirggsource", "kIRG_GSource"), + ("cjkirghsource", "kIRG_HSource"), ("cjkirgjsource", "kIRG_JSource"), + ("cjkirgkpsource", "kIRG_KPSource"), ("cjkirgksource", "kIRG_KSource"), + ("cjkirgmsource", "kIRG_MSource"), ("cjkirgtsource", "kIRG_TSource"), + ("cjkirgusource", "kIRG_USource"), ("cjkirgvsource", "kIRG_VSource"), + ("cjkothernumeric", "kOtherNumeric"), + ("cjkprimarynumeric", 
"kPrimaryNumeric"), ("cjkrsunicode", "kRSUnicode"), + ("compex", "Full_Composition_Exclusion"), + ("compositionexclusion", "Composition_Exclusion"), + ("cwcf", "Changes_When_Casefolded"), ("cwcm", "Changes_When_Casemapped"), + ("cwkcf", "Changes_When_NFKC_Casefolded"), + ("cwl", "Changes_When_Lowercased"), ("cwt", "Changes_When_Titlecased"), + ("cwu", "Changes_When_Uppercased"), ("dash", "Dash"), + ("decompositionmapping", "Decomposition_Mapping"), + ("decompositiontype", "Decomposition_Type"), + ("defaultignorablecodepoint", "Default_Ignorable_Code_Point"), + ("dep", "Deprecated"), ("deprecated", "Deprecated"), + ("di", "Default_Ignorable_Code_Point"), ("dia", "Diacritic"), + ("diacritic", "Diacritic"), ("dm", "Decomposition_Mapping"), + ("dt", "Decomposition_Type"), ("ea", "East_Asian_Width"), + ("eastasianwidth", "East_Asian_Width"), ("expandsonnfc", "Expands_On_NFC"), + ("expandsonnfd", "Expands_On_NFD"), ("expandsonnfkc", "Expands_On_NFKC"), + ("expandsonnfkd", "Expands_On_NFKD"), ("ext", "Extender"), + ("extender", "Extender"), ("fcnfkc", "FC_NFKC_Closure"), + ("fcnfkcclosure", "FC_NFKC_Closure"), + ("fullcompositionexclusion", "Full_Composition_Exclusion"), + ("gc", "General_Category"), ("gcb", "Grapheme_Cluster_Break"), + ("generalcategory", "General_Category"), ("graphemebase", "Grapheme_Base"), + ("graphemeclusterbreak", "Grapheme_Cluster_Break"), + ("graphemeextend", "Grapheme_Extend"), ("graphemelink", "Grapheme_Link"), + ("grbase", "Grapheme_Base"), ("grext", "Grapheme_Extend"), + ("grlink", "Grapheme_Link"), ("hangulsyllabletype", "Hangul_Syllable_Type"), + ("hex", "Hex_Digit"), ("hexdigit", "Hex_Digit"), + ("hst", "Hangul_Syllable_Type"), ("hyphen", "Hyphen"), + ("idc", "ID_Continue"), ("idcontinue", "ID_Continue"), + ("ideo", "Ideographic"), ("ideographic", "Ideographic"), + ("ids", "ID_Start"), ("idsb", "IDS_Binary_Operator"), + ("idsbinaryoperator", "IDS_Binary_Operator"), + ("idst", "IDS_Trinary_Operator"), ("idstart", "ID_Start"), + 
("idstrinaryoperator", "IDS_Trinary_Operator"), + ("indicpositionalcategory", "Indic_Positional_Category"), + ("indicsyllabiccategory", "Indic_Syllabic_Category"), + ("inpc", "Indic_Positional_Category"), ("insc", "Indic_Syllabic_Category"), + ("jamoshortname", "Jamo_Short_Name"), ("jg", "Joining_Group"), + ("joinc", "Join_Control"), ("joincontrol", "Join_Control"), + ("joininggroup", "Joining_Group"), ("joiningtype", "Joining_Type"), + ("jsn", "Jamo_Short_Name"), ("jt", "Joining_Type"), + ("kaccountingnumeric", "kAccountingNumeric"), + ("kcompatibilityvariant", "kCompatibilityVariant"), ("kiicore", "kIICore"), + ("kirggsource", "kIRG_GSource"), ("kirghsource", "kIRG_HSource"), + ("kirgjsource", "kIRG_JSource"), ("kirgkpsource", "kIRG_KPSource"), + ("kirgksource", "kIRG_KSource"), ("kirgmsource", "kIRG_MSource"), + ("kirgtsource", "kIRG_TSource"), ("kirgusource", "kIRG_USource"), + ("kirgvsource", "kIRG_VSource"), ("kothernumeric", "kOtherNumeric"), + ("kprimarynumeric", "kPrimaryNumeric"), ("krsunicode", "kRSUnicode"), + ("lb", "Line_Break"), ("lc", "Lowercase_Mapping"), + ("linebreak", "Line_Break"), ("loe", "Logical_Order_Exception"), + ("logicalorderexception", "Logical_Order_Exception"), + ("lower", "Lowercase"), ("lowercase", "Lowercase"), + ("lowercasemapping", "Lowercase_Mapping"), ("math", "Math"), ("na", "Name"), + ("na1", "Unicode_1_Name"), ("name", "Name"), ("namealias", "Name_Alias"), + ("nchar", "Noncharacter_Code_Point"), ("nfcqc", "NFC_Quick_Check"), + ("nfcquickcheck", "NFC_Quick_Check"), ("nfdqc", "NFD_Quick_Check"), + ("nfdquickcheck", "NFD_Quick_Check"), ("nfkccasefold", "NFKC_Casefold"), + ("nfkccf", "NFKC_Casefold"), ("nfkcqc", "NFKC_Quick_Check"), + ("nfkcquickcheck", "NFKC_Quick_Check"), ("nfkdqc", "NFKD_Quick_Check"), + ("nfkdquickcheck", "NFKD_Quick_Check"), + ("noncharactercodepoint", "Noncharacter_Code_Point"), + ("nt", "Numeric_Type"), ("numerictype", "Numeric_Type"), + ("numericvalue", "Numeric_Value"), ("nv", "Numeric_Value"), + 
("oalpha", "Other_Alphabetic"), ("ocomment", "ISO_Comment"), + ("odi", "Other_Default_Ignorable_Code_Point"), + ("ogrext", "Other_Grapheme_Extend"), ("oidc", "Other_ID_Continue"), + ("oids", "Other_ID_Start"), ("olower", "Other_Lowercase"), + ("omath", "Other_Math"), ("otheralphabetic", "Other_Alphabetic"), + ("otherdefaultignorablecodepoint", "Other_Default_Ignorable_Code_Point"), + ("othergraphemeextend", "Other_Grapheme_Extend"), + ("otheridcontinue", "Other_ID_Continue"), + ("otheridstart", "Other_ID_Start"), ("otherlowercase", "Other_Lowercase"), + ("othermath", "Other_Math"), ("otheruppercase", "Other_Uppercase"), + ("oupper", "Other_Uppercase"), ("patsyn", "Pattern_Syntax"), + ("patternsyntax", "Pattern_Syntax"), + ("patternwhitespace", "Pattern_White_Space"), + ("patws", "Pattern_White_Space"), ("pcm", "Prepended_Concatenation_Mark"), + ("prependedconcatenationmark", "Prepended_Concatenation_Mark"), + ("qmark", "Quotation_Mark"), ("quotationmark", "Quotation_Mark"), + ("radical", "Radical"), ("regionalindicator", "Regional_Indicator"), + ("ri", "Regional_Indicator"), ("sb", "Sentence_Break"), ("sc", "Script"), + ("scf", "Simple_Case_Folding"), ("script", "Script"), + ("scriptextensions", "Script_Extensions"), ("scx", "Script_Extensions"), + ("sd", "Soft_Dotted"), ("sentencebreak", "Sentence_Break"), + ("sentenceterminal", "Sentence_Terminal"), ("sfc", "Simple_Case_Folding"), + ("simplecasefolding", "Simple_Case_Folding"), + ("simplelowercasemapping", "Simple_Lowercase_Mapping"), + ("simpletitlecasemapping", "Simple_Titlecase_Mapping"), + ("simpleuppercasemapping", "Simple_Uppercase_Mapping"), + ("slc", "Simple_Lowercase_Mapping"), ("softdotted", "Soft_Dotted"), + ("space", "White_Space"), ("stc", "Simple_Titlecase_Mapping"), + ("sterm", "Sentence_Terminal"), ("suc", "Simple_Uppercase_Mapping"), + ("tc", "Titlecase_Mapping"), ("term", "Terminal_Punctuation"), + ("terminalpunctuation", "Terminal_Punctuation"), + ("titlecasemapping", "Titlecase_Mapping"), 
("uc", "Uppercase_Mapping"), + ("uideo", "Unified_Ideograph"), ("unicode1name", "Unicode_1_Name"), + ("unicoderadicalstroke", "kRSUnicode"), + ("unifiedideograph", "Unified_Ideograph"), ("upper", "Uppercase"), + ("uppercase", "Uppercase"), ("uppercasemapping", "Uppercase_Mapping"), + ("urs", "kRSUnicode"), ("variationselector", "Variation_Selector"), + ("verticalorientation", "Vertical_Orientation"), + ("vo", "Vertical_Orientation"), ("vs", "Variation_Selector"), + ("wb", "Word_Break"), ("whitespace", "White_Space"), + ("wordbreak", "Word_Break"), ("wspace", "White_Space"), + ("xidc", "XID_Continue"), ("xidcontinue", "XID_Continue"), + ("xids", "XID_Start"), ("xidstart", "XID_Start"), + ("xonfc", "Expands_On_NFC"), ("xonfd", "Expands_On_NFD"), + ("xonfkc", "Expands_On_NFKC"), ("xonfkd", "Expands_On_NFKD"), +]; diff --git a/regex-syntax-2/src/unicode_tables/property_values.rs b/regex-syntax-2/src/unicode_tables/property_values.rs new file mode 100644 index 0000000000..1ce9795b1c --- /dev/null +++ b/regex-syntax-2/src/unicode_tables/property_values.rs @@ -0,0 +1,277 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate property-values tmp/ucd-10.0.0 --include gc,script,scx,age +// +// ucd-generate is available on crates.io. 
+ +pub const PROPERTY_VALUES: &'static [(&'static str, &'static [(&'static str, &'static str)])] = &[ + ("Age", &[("1.1", "V1_1"), ("10.0", "V10_0"), ("2.0", "V2_0"), + ("2.1", "V2_1"), ("3.0", "V3_0"), ("3.1", "V3_1"), ("3.2", "V3_2"), + ("4.0", "V4_0"), ("4.1", "V4_1"), ("5.0", "V5_0"), ("5.1", "V5_1"), + ("5.2", "V5_2"), ("6.0", "V6_0"), ("6.1", "V6_1"), ("6.2", "V6_2"), + ("6.3", "V6_3"), ("7.0", "V7_0"), ("8.0", "V8_0"), ("9.0", "V9_0"), + ("na", "Unassigned"), ("unassigned", "Unassigned"), ("v100", "V10_0"), + ("v11", "V1_1"), ("v20", "V2_0"), ("v21", "V2_1"), ("v30", "V3_0"), + ("v31", "V3_1"), ("v32", "V3_2"), ("v40", "V4_0"), ("v41", "V4_1"), + ("v50", "V5_0"), ("v51", "V5_1"), ("v52", "V5_2"), ("v60", "V6_0"), + ("v61", "V6_1"), ("v62", "V6_2"), ("v63", "V6_3"), ("v70", "V7_0"), + ("v80", "V8_0"), ("v90", "V9_0"), ]), + + ("General_Category", &[("c", "Other"), ("casedletter", "Cased_Letter"), + ("cc", "Control"), ("cf", "Format"), + ("closepunctuation", "Close_Punctuation"), ("cn", "Unassigned"), + ("cntrl", "Control"), ("co", "Private_Use"), ("combiningmark", "Mark"), + ("connectorpunctuation", "Connector_Punctuation"), ("control", "Control"), + ("cs", "Surrogate"), ("currencysymbol", "Currency_Symbol"), + ("dashpunctuation", "Dash_Punctuation"), + ("decimalnumber", "Decimal_Number"), ("digit", "Decimal_Number"), + ("enclosingmark", "Enclosing_Mark"), + ("finalpunctuation", "Final_Punctuation"), ("format", "Format"), + ("initialpunctuation", "Initial_Punctuation"), ("l", "Letter"), + ("lc", "Cased_Letter"), ("letter", "Letter"), + ("letternumber", "Letter_Number"), ("lineseparator", "Line_Separator"), + ("ll", "Lowercase_Letter"), ("lm", "Modifier_Letter"), + ("lo", "Other_Letter"), ("lowercaseletter", "Lowercase_Letter"), + ("lt", "Titlecase_Letter"), ("lu", "Uppercase_Letter"), ("m", "Mark"), + ("mark", "Mark"), ("mathsymbol", "Math_Symbol"), ("mc", "Spacing_Mark"), + ("me", "Enclosing_Mark"), ("mn", "Nonspacing_Mark"), + ("modifierletter", 
"Modifier_Letter"), + ("modifiersymbol", "Modifier_Symbol"), ("n", "Number"), + ("nd", "Decimal_Number"), ("nl", "Letter_Number"), ("no", "Other_Number"), + ("nonspacingmark", "Nonspacing_Mark"), ("number", "Number"), + ("openpunctuation", "Open_Punctuation"), ("other", "Other"), + ("otherletter", "Other_Letter"), ("othernumber", "Other_Number"), + ("otherpunctuation", "Other_Punctuation"), ("othersymbol", "Other_Symbol"), + ("p", "Punctuation"), ("paragraphseparator", "Paragraph_Separator"), + ("pc", "Connector_Punctuation"), ("pd", "Dash_Punctuation"), + ("pe", "Close_Punctuation"), ("pf", "Final_Punctuation"), + ("pi", "Initial_Punctuation"), ("po", "Other_Punctuation"), + ("privateuse", "Private_Use"), ("ps", "Open_Punctuation"), + ("punct", "Punctuation"), ("punctuation", "Punctuation"), ("s", "Symbol"), + ("sc", "Currency_Symbol"), ("separator", "Separator"), + ("sk", "Modifier_Symbol"), ("sm", "Math_Symbol"), ("so", "Other_Symbol"), + ("spaceseparator", "Space_Separator"), ("spacingmark", "Spacing_Mark"), + ("surrogate", "Surrogate"), ("symbol", "Symbol"), + ("titlecaseletter", "Titlecase_Letter"), ("unassigned", "Unassigned"), + ("uppercaseletter", "Uppercase_Letter"), ("z", "Separator"), + ("zl", "Line_Separator"), ("zp", "Paragraph_Separator"), + ("zs", "Space_Separator"), ]), + + ("Script", &[("adlam", "Adlam"), ("adlm", "Adlam"), + ("aghb", "Caucasian_Albanian"), ("ahom", "Ahom"), + ("anatolianhieroglyphs", "Anatolian_Hieroglyphs"), ("arab", "Arabic"), + ("arabic", "Arabic"), ("armenian", "Armenian"), + ("armi", "Imperial_Aramaic"), ("armn", "Armenian"), ("avestan", "Avestan"), + ("avst", "Avestan"), ("bali", "Balinese"), ("balinese", "Balinese"), + ("bamu", "Bamum"), ("bamum", "Bamum"), ("bass", "Bassa_Vah"), + ("bassavah", "Bassa_Vah"), ("batak", "Batak"), ("batk", "Batak"), + ("beng", "Bengali"), ("bengali", "Bengali"), ("bhaiksuki", "Bhaiksuki"), + ("bhks", "Bhaiksuki"), ("bopo", "Bopomofo"), ("bopomofo", "Bopomofo"), + ("brah", "Brahmi"), 
("brahmi", "Brahmi"), ("brai", "Braille"), + ("braille", "Braille"), ("bugi", "Buginese"), ("buginese", "Buginese"), + ("buhd", "Buhid"), ("buhid", "Buhid"), ("cakm", "Chakma"), + ("canadianaboriginal", "Canadian_Aboriginal"), + ("cans", "Canadian_Aboriginal"), ("cari", "Carian"), ("carian", "Carian"), + ("caucasianalbanian", "Caucasian_Albanian"), ("chakma", "Chakma"), + ("cham", "Cham"), ("cher", "Cherokee"), ("cherokee", "Cherokee"), + ("common", "Common"), ("copt", "Coptic"), ("coptic", "Coptic"), + ("cprt", "Cypriot"), ("cuneiform", "Cuneiform"), ("cypriot", "Cypriot"), + ("cyrillic", "Cyrillic"), ("cyrl", "Cyrillic"), ("deseret", "Deseret"), + ("deva", "Devanagari"), ("devanagari", "Devanagari"), ("dsrt", "Deseret"), + ("dupl", "Duployan"), ("duployan", "Duployan"), + ("egyp", "Egyptian_Hieroglyphs"), + ("egyptianhieroglyphs", "Egyptian_Hieroglyphs"), ("elba", "Elbasan"), + ("elbasan", "Elbasan"), ("ethi", "Ethiopic"), ("ethiopic", "Ethiopic"), + ("geor", "Georgian"), ("georgian", "Georgian"), ("glag", "Glagolitic"), + ("glagolitic", "Glagolitic"), ("gonm", "Masaram_Gondi"), ("goth", "Gothic"), + ("gothic", "Gothic"), ("gran", "Grantha"), ("grantha", "Grantha"), + ("greek", "Greek"), ("grek", "Greek"), ("gujarati", "Gujarati"), + ("gujr", "Gujarati"), ("gurmukhi", "Gurmukhi"), ("guru", "Gurmukhi"), + ("han", "Han"), ("hang", "Hangul"), ("hangul", "Hangul"), ("hani", "Han"), + ("hano", "Hanunoo"), ("hanunoo", "Hanunoo"), ("hatr", "Hatran"), + ("hatran", "Hatran"), ("hebr", "Hebrew"), ("hebrew", "Hebrew"), + ("hira", "Hiragana"), ("hiragana", "Hiragana"), + ("hluw", "Anatolian_Hieroglyphs"), ("hmng", "Pahawh_Hmong"), + ("hrkt", "Katakana_Or_Hiragana"), ("hung", "Old_Hungarian"), + ("imperialaramaic", "Imperial_Aramaic"), ("inherited", "Inherited"), + ("inscriptionalpahlavi", "Inscriptional_Pahlavi"), + ("inscriptionalparthian", "Inscriptional_Parthian"), ("ital", "Old_Italic"), + ("java", "Javanese"), ("javanese", "Javanese"), ("kaithi", "Kaithi"), + ("kali", 
"Kayah_Li"), ("kana", "Katakana"), ("kannada", "Kannada"), + ("katakana", "Katakana"), ("katakanaorhiragana", "Katakana_Or_Hiragana"), + ("kayahli", "Kayah_Li"), ("khar", "Kharoshthi"), + ("kharoshthi", "Kharoshthi"), ("khmer", "Khmer"), ("khmr", "Khmer"), + ("khoj", "Khojki"), ("khojki", "Khojki"), ("khudawadi", "Khudawadi"), + ("knda", "Kannada"), ("kthi", "Kaithi"), ("lana", "Tai_Tham"), + ("lao", "Lao"), ("laoo", "Lao"), ("latin", "Latin"), ("latn", "Latin"), + ("lepc", "Lepcha"), ("lepcha", "Lepcha"), ("limb", "Limbu"), + ("limbu", "Limbu"), ("lina", "Linear_A"), ("linb", "Linear_B"), + ("lineara", "Linear_A"), ("linearb", "Linear_B"), ("lisu", "Lisu"), + ("lyci", "Lycian"), ("lycian", "Lycian"), ("lydi", "Lydian"), + ("lydian", "Lydian"), ("mahajani", "Mahajani"), ("mahj", "Mahajani"), + ("malayalam", "Malayalam"), ("mand", "Mandaic"), ("mandaic", "Mandaic"), + ("mani", "Manichaean"), ("manichaean", "Manichaean"), ("marc", "Marchen"), + ("marchen", "Marchen"), ("masaramgondi", "Masaram_Gondi"), + ("meeteimayek", "Meetei_Mayek"), ("mend", "Mende_Kikakui"), + ("mendekikakui", "Mende_Kikakui"), ("merc", "Meroitic_Cursive"), + ("mero", "Meroitic_Hieroglyphs"), ("meroiticcursive", "Meroitic_Cursive"), + ("meroitichieroglyphs", "Meroitic_Hieroglyphs"), ("miao", "Miao"), + ("mlym", "Malayalam"), ("modi", "Modi"), ("mong", "Mongolian"), + ("mongolian", "Mongolian"), ("mro", "Mro"), ("mroo", "Mro"), + ("mtei", "Meetei_Mayek"), ("mult", "Multani"), ("multani", "Multani"), + ("myanmar", "Myanmar"), ("mymr", "Myanmar"), ("nabataean", "Nabataean"), + ("narb", "Old_North_Arabian"), ("nbat", "Nabataean"), ("newa", "Newa"), + ("newtailue", "New_Tai_Lue"), ("nko", "Nko"), ("nkoo", "Nko"), + ("nshu", "Nushu"), ("nushu", "Nushu"), ("ogam", "Ogham"), + ("ogham", "Ogham"), ("olchiki", "Ol_Chiki"), ("olck", "Ol_Chiki"), + ("oldhungarian", "Old_Hungarian"), ("olditalic", "Old_Italic"), + ("oldnortharabian", "Old_North_Arabian"), ("oldpermic", "Old_Permic"), + ("oldpersian", 
"Old_Persian"), ("oldsoutharabian", "Old_South_Arabian"), + ("oldturkic", "Old_Turkic"), ("oriya", "Oriya"), ("orkh", "Old_Turkic"), + ("orya", "Oriya"), ("osage", "Osage"), ("osge", "Osage"), + ("osma", "Osmanya"), ("osmanya", "Osmanya"), + ("pahawhhmong", "Pahawh_Hmong"), ("palm", "Palmyrene"), + ("palmyrene", "Palmyrene"), ("pauc", "Pau_Cin_Hau"), + ("paucinhau", "Pau_Cin_Hau"), ("perm", "Old_Permic"), ("phag", "Phags_Pa"), + ("phagspa", "Phags_Pa"), ("phli", "Inscriptional_Pahlavi"), + ("phlp", "Psalter_Pahlavi"), ("phnx", "Phoenician"), + ("phoenician", "Phoenician"), ("plrd", "Miao"), + ("prti", "Inscriptional_Parthian"), ("psalterpahlavi", "Psalter_Pahlavi"), + ("qaac", "Coptic"), ("qaai", "Inherited"), ("rejang", "Rejang"), + ("rjng", "Rejang"), ("runic", "Runic"), ("runr", "Runic"), + ("samaritan", "Samaritan"), ("samr", "Samaritan"), + ("sarb", "Old_South_Arabian"), ("saur", "Saurashtra"), + ("saurashtra", "Saurashtra"), ("sgnw", "SignWriting"), + ("sharada", "Sharada"), ("shavian", "Shavian"), ("shaw", "Shavian"), + ("shrd", "Sharada"), ("sidd", "Siddham"), ("siddham", "Siddham"), + ("signwriting", "SignWriting"), ("sind", "Khudawadi"), ("sinh", "Sinhala"), + ("sinhala", "Sinhala"), ("sora", "Sora_Sompeng"), + ("sorasompeng", "Sora_Sompeng"), ("soyo", "Soyombo"), + ("soyombo", "Soyombo"), ("sund", "Sundanese"), ("sundanese", "Sundanese"), + ("sylo", "Syloti_Nagri"), ("sylotinagri", "Syloti_Nagri"), + ("syrc", "Syriac"), ("syriac", "Syriac"), ("tagalog", "Tagalog"), + ("tagb", "Tagbanwa"), ("tagbanwa", "Tagbanwa"), ("taile", "Tai_Le"), + ("taitham", "Tai_Tham"), ("taiviet", "Tai_Viet"), ("takr", "Takri"), + ("takri", "Takri"), ("tale", "Tai_Le"), ("talu", "New_Tai_Lue"), + ("tamil", "Tamil"), ("taml", "Tamil"), ("tang", "Tangut"), + ("tangut", "Tangut"), ("tavt", "Tai_Viet"), ("telu", "Telugu"), + ("telugu", "Telugu"), ("tfng", "Tifinagh"), ("tglg", "Tagalog"), + ("thaa", "Thaana"), ("thaana", "Thaana"), ("thai", "Thai"), + ("tibetan", "Tibetan"), 
("tibt", "Tibetan"), ("tifinagh", "Tifinagh"), + ("tirh", "Tirhuta"), ("tirhuta", "Tirhuta"), ("ugar", "Ugaritic"), + ("ugaritic", "Ugaritic"), ("unknown", "Unknown"), ("vai", "Vai"), + ("vaii", "Vai"), ("wara", "Warang_Citi"), ("warangciti", "Warang_Citi"), + ("xpeo", "Old_Persian"), ("xsux", "Cuneiform"), ("yi", "Yi"), + ("yiii", "Yi"), ("zanabazarsquare", "Zanabazar_Square"), + ("zanb", "Zanabazar_Square"), ("zinh", "Inherited"), ("zyyy", "Common"), + ("zzzz", "Unknown"), ]), + + ("Script_Extensions", &[("adlam", "Adlam"), ("adlm", "Adlam"), + ("aghb", "Caucasian_Albanian"), ("ahom", "Ahom"), + ("anatolianhieroglyphs", "Anatolian_Hieroglyphs"), ("arab", "Arabic"), + ("arabic", "Arabic"), ("armenian", "Armenian"), + ("armi", "Imperial_Aramaic"), ("armn", "Armenian"), ("avestan", "Avestan"), + ("avst", "Avestan"), ("bali", "Balinese"), ("balinese", "Balinese"), + ("bamu", "Bamum"), ("bamum", "Bamum"), ("bass", "Bassa_Vah"), + ("bassavah", "Bassa_Vah"), ("batak", "Batak"), ("batk", "Batak"), + ("beng", "Bengali"), ("bengali", "Bengali"), ("bhaiksuki", "Bhaiksuki"), + ("bhks", "Bhaiksuki"), ("bopo", "Bopomofo"), ("bopomofo", "Bopomofo"), + ("brah", "Brahmi"), ("brahmi", "Brahmi"), ("brai", "Braille"), + ("braille", "Braille"), ("bugi", "Buginese"), ("buginese", "Buginese"), + ("buhd", "Buhid"), ("buhid", "Buhid"), ("cakm", "Chakma"), + ("canadianaboriginal", "Canadian_Aboriginal"), + ("cans", "Canadian_Aboriginal"), ("cari", "Carian"), ("carian", "Carian"), + ("caucasianalbanian", "Caucasian_Albanian"), ("chakma", "Chakma"), + ("cham", "Cham"), ("cher", "Cherokee"), ("cherokee", "Cherokee"), + ("common", "Common"), ("copt", "Coptic"), ("coptic", "Coptic"), + ("cprt", "Cypriot"), ("cuneiform", "Cuneiform"), ("cypriot", "Cypriot"), + ("cyrillic", "Cyrillic"), ("cyrl", "Cyrillic"), ("deseret", "Deseret"), + ("deva", "Devanagari"), ("devanagari", "Devanagari"), ("dsrt", "Deseret"), + ("dupl", "Duployan"), ("duployan", "Duployan"), + ("egyp", "Egyptian_Hieroglyphs"), + 
("egyptianhieroglyphs", "Egyptian_Hieroglyphs"), ("elba", "Elbasan"), + ("elbasan", "Elbasan"), ("ethi", "Ethiopic"), ("ethiopic", "Ethiopic"), + ("geor", "Georgian"), ("georgian", "Georgian"), ("glag", "Glagolitic"), + ("glagolitic", "Glagolitic"), ("gonm", "Masaram_Gondi"), ("goth", "Gothic"), + ("gothic", "Gothic"), ("gran", "Grantha"), ("grantha", "Grantha"), + ("greek", "Greek"), ("grek", "Greek"), ("gujarati", "Gujarati"), + ("gujr", "Gujarati"), ("gurmukhi", "Gurmukhi"), ("guru", "Gurmukhi"), + ("han", "Han"), ("hang", "Hangul"), ("hangul", "Hangul"), ("hani", "Han"), + ("hano", "Hanunoo"), ("hanunoo", "Hanunoo"), ("hatr", "Hatran"), + ("hatran", "Hatran"), ("hebr", "Hebrew"), ("hebrew", "Hebrew"), + ("hira", "Hiragana"), ("hiragana", "Hiragana"), + ("hluw", "Anatolian_Hieroglyphs"), ("hmng", "Pahawh_Hmong"), + ("hrkt", "Katakana_Or_Hiragana"), ("hung", "Old_Hungarian"), + ("imperialaramaic", "Imperial_Aramaic"), ("inherited", "Inherited"), + ("inscriptionalpahlavi", "Inscriptional_Pahlavi"), + ("inscriptionalparthian", "Inscriptional_Parthian"), ("ital", "Old_Italic"), + ("java", "Javanese"), ("javanese", "Javanese"), ("kaithi", "Kaithi"), + ("kali", "Kayah_Li"), ("kana", "Katakana"), ("kannada", "Kannada"), + ("katakana", "Katakana"), ("katakanaorhiragana", "Katakana_Or_Hiragana"), + ("kayahli", "Kayah_Li"), ("khar", "Kharoshthi"), + ("kharoshthi", "Kharoshthi"), ("khmer", "Khmer"), ("khmr", "Khmer"), + ("khoj", "Khojki"), ("khojki", "Khojki"), ("khudawadi", "Khudawadi"), + ("knda", "Kannada"), ("kthi", "Kaithi"), ("lana", "Tai_Tham"), + ("lao", "Lao"), ("laoo", "Lao"), ("latin", "Latin"), ("latn", "Latin"), + ("lepc", "Lepcha"), ("lepcha", "Lepcha"), ("limb", "Limbu"), + ("limbu", "Limbu"), ("lina", "Linear_A"), ("linb", "Linear_B"), + ("lineara", "Linear_A"), ("linearb", "Linear_B"), ("lisu", "Lisu"), + ("lyci", "Lycian"), ("lycian", "Lycian"), ("lydi", "Lydian"), + ("lydian", "Lydian"), ("mahajani", "Mahajani"), ("mahj", "Mahajani"), + ("malayalam", 
"Malayalam"), ("mand", "Mandaic"), ("mandaic", "Mandaic"), + ("mani", "Manichaean"), ("manichaean", "Manichaean"), ("marc", "Marchen"), + ("marchen", "Marchen"), ("masaramgondi", "Masaram_Gondi"), + ("meeteimayek", "Meetei_Mayek"), ("mend", "Mende_Kikakui"), + ("mendekikakui", "Mende_Kikakui"), ("merc", "Meroitic_Cursive"), + ("mero", "Meroitic_Hieroglyphs"), ("meroiticcursive", "Meroitic_Cursive"), + ("meroitichieroglyphs", "Meroitic_Hieroglyphs"), ("miao", "Miao"), + ("mlym", "Malayalam"), ("modi", "Modi"), ("mong", "Mongolian"), + ("mongolian", "Mongolian"), ("mro", "Mro"), ("mroo", "Mro"), + ("mtei", "Meetei_Mayek"), ("mult", "Multani"), ("multani", "Multani"), + ("myanmar", "Myanmar"), ("mymr", "Myanmar"), ("nabataean", "Nabataean"), + ("narb", "Old_North_Arabian"), ("nbat", "Nabataean"), ("newa", "Newa"), + ("newtailue", "New_Tai_Lue"), ("nko", "Nko"), ("nkoo", "Nko"), + ("nshu", "Nushu"), ("nushu", "Nushu"), ("ogam", "Ogham"), + ("ogham", "Ogham"), ("olchiki", "Ol_Chiki"), ("olck", "Ol_Chiki"), + ("oldhungarian", "Old_Hungarian"), ("olditalic", "Old_Italic"), + ("oldnortharabian", "Old_North_Arabian"), ("oldpermic", "Old_Permic"), + ("oldpersian", "Old_Persian"), ("oldsoutharabian", "Old_South_Arabian"), + ("oldturkic", "Old_Turkic"), ("oriya", "Oriya"), ("orkh", "Old_Turkic"), + ("orya", "Oriya"), ("osage", "Osage"), ("osge", "Osage"), + ("osma", "Osmanya"), ("osmanya", "Osmanya"), + ("pahawhhmong", "Pahawh_Hmong"), ("palm", "Palmyrene"), + ("palmyrene", "Palmyrene"), ("pauc", "Pau_Cin_Hau"), + ("paucinhau", "Pau_Cin_Hau"), ("perm", "Old_Permic"), ("phag", "Phags_Pa"), + ("phagspa", "Phags_Pa"), ("phli", "Inscriptional_Pahlavi"), + ("phlp", "Psalter_Pahlavi"), ("phnx", "Phoenician"), + ("phoenician", "Phoenician"), ("plrd", "Miao"), + ("prti", "Inscriptional_Parthian"), ("psalterpahlavi", "Psalter_Pahlavi"), + ("qaac", "Coptic"), ("qaai", "Inherited"), ("rejang", "Rejang"), + ("rjng", "Rejang"), ("runic", "Runic"), ("runr", "Runic"), + ("samaritan", 
"Samaritan"), ("samr", "Samaritan"), + ("sarb", "Old_South_Arabian"), ("saur", "Saurashtra"), + ("saurashtra", "Saurashtra"), ("sgnw", "SignWriting"), + ("sharada", "Sharada"), ("shavian", "Shavian"), ("shaw", "Shavian"), + ("shrd", "Sharada"), ("sidd", "Siddham"), ("siddham", "Siddham"), + ("signwriting", "SignWriting"), ("sind", "Khudawadi"), ("sinh", "Sinhala"), + ("sinhala", "Sinhala"), ("sora", "Sora_Sompeng"), + ("sorasompeng", "Sora_Sompeng"), ("soyo", "Soyombo"), + ("soyombo", "Soyombo"), ("sund", "Sundanese"), ("sundanese", "Sundanese"), + ("sylo", "Syloti_Nagri"), ("sylotinagri", "Syloti_Nagri"), + ("syrc", "Syriac"), ("syriac", "Syriac"), ("tagalog", "Tagalog"), + ("tagb", "Tagbanwa"), ("tagbanwa", "Tagbanwa"), ("taile", "Tai_Le"), + ("taitham", "Tai_Tham"), ("taiviet", "Tai_Viet"), ("takr", "Takri"), + ("takri", "Takri"), ("tale", "Tai_Le"), ("talu", "New_Tai_Lue"), + ("tamil", "Tamil"), ("taml", "Tamil"), ("tang", "Tangut"), + ("tangut", "Tangut"), ("tavt", "Tai_Viet"), ("telu", "Telugu"), + ("telugu", "Telugu"), ("tfng", "Tifinagh"), ("tglg", "Tagalog"), + ("thaa", "Thaana"), ("thaana", "Thaana"), ("thai", "Thai"), + ("tibetan", "Tibetan"), ("tibt", "Tibetan"), ("tifinagh", "Tifinagh"), + ("tirh", "Tirhuta"), ("tirhuta", "Tirhuta"), ("ugar", "Ugaritic"), + ("ugaritic", "Ugaritic"), ("unknown", "Unknown"), ("vai", "Vai"), + ("vaii", "Vai"), ("wara", "Warang_Citi"), ("warangciti", "Warang_Citi"), + ("xpeo", "Old_Persian"), ("xsux", "Cuneiform"), ("yi", "Yi"), + ("yiii", "Yi"), ("zanabazarsquare", "Zanabazar_Square"), + ("zanb", "Zanabazar_Square"), ("zinh", "Inherited"), ("zyyy", "Common"), + ("zzzz", "Unknown"), ]), +]; diff --git a/regex-syntax-2/src/unicode_tables/script.rs b/regex-syntax-2/src/unicode_tables/script.rs new file mode 100644 index 0000000000..99c5786dea --- /dev/null +++ b/regex-syntax-2/src/unicode_tables/script.rs @@ -0,0 +1,765 @@ +// DO NOT EDIT THIS FILE. 
IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate script tmp/ucd-10.0.0/ --chars +// +// ucd-generate is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("Adlam", ADLAM), ("Ahom", AHOM), + ("Anatolian_Hieroglyphs", ANATOLIAN_HIEROGLYPHS), ("Arabic", ARABIC), + ("Armenian", ARMENIAN), ("Avestan", AVESTAN), ("Balinese", BALINESE), + ("Bamum", BAMUM), ("Bassa_Vah", BASSA_VAH), ("Batak", BATAK), + ("Bengali", BENGALI), ("Bhaiksuki", BHAIKSUKI), ("Bopomofo", BOPOMOFO), + ("Brahmi", BRAHMI), ("Braille", BRAILLE), ("Buginese", BUGINESE), + ("Buhid", BUHID), ("Canadian_Aboriginal", CANADIAN_ABORIGINAL), + ("Carian", CARIAN), ("Caucasian_Albanian", CAUCASIAN_ALBANIAN), + ("Chakma", CHAKMA), ("Cham", CHAM), ("Cherokee", CHEROKEE), + ("Common", COMMON), ("Coptic", COPTIC), ("Cuneiform", CUNEIFORM), + ("Cypriot", CYPRIOT), ("Cyrillic", CYRILLIC), ("Deseret", DESERET), + ("Devanagari", DEVANAGARI), ("Duployan", DUPLOYAN), + ("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS), ("Elbasan", ELBASAN), + ("Ethiopic", ETHIOPIC), ("Georgian", GEORGIAN), ("Glagolitic", GLAGOLITIC), + ("Gothic", GOTHIC), ("Grantha", GRANTHA), ("Greek", GREEK), + ("Gujarati", GUJARATI), ("Gurmukhi", GURMUKHI), ("Han", HAN), + ("Hangul", HANGUL), ("Hanunoo", HANUNOO), ("Hatran", HATRAN), + ("Hebrew", HEBREW), ("Hiragana", HIRAGANA), + ("Imperial_Aramaic", IMPERIAL_ARAMAIC), ("Inherited", INHERITED), + ("Inscriptional_Pahlavi", INSCRIPTIONAL_PAHLAVI), + ("Inscriptional_Parthian", INSCRIPTIONAL_PARTHIAN), ("Javanese", JAVANESE), + ("Kaithi", KAITHI), ("Kannada", KANNADA), ("Katakana", KATAKANA), + ("Kayah_Li", KAYAH_LI), ("Kharoshthi", KHAROSHTHI), ("Khmer", KHMER), + ("Khojki", KHOJKI), ("Khudawadi", KHUDAWADI), ("Lao", LAO), + ("Latin", LATIN), ("Lepcha", LEPCHA), ("Limbu", LIMBU), + ("Linear_A", LINEAR_A), ("Linear_B", LINEAR_B), ("Lisu", LISU), + ("Lycian", LYCIAN), ("Lydian", LYDIAN), ("Mahajani", MAHAJANI), + ("Malayalam", MALAYALAM), 
("Mandaic", MANDAIC), ("Manichaean", MANICHAEAN), + ("Marchen", MARCHEN), ("Masaram_Gondi", MASARAM_GONDI), + ("Meetei_Mayek", MEETEI_MAYEK), ("Mende_Kikakui", MENDE_KIKAKUI), + ("Meroitic_Cursive", MEROITIC_CURSIVE), + ("Meroitic_Hieroglyphs", MEROITIC_HIEROGLYPHS), ("Miao", MIAO), + ("Modi", MODI), ("Mongolian", MONGOLIAN), ("Mro", MRO), + ("Multani", MULTANI), ("Myanmar", MYANMAR), ("Nabataean", NABATAEAN), + ("New_Tai_Lue", NEW_TAI_LUE), ("Newa", NEWA), ("Nko", NKO), + ("Nushu", NUSHU), ("Ogham", OGHAM), ("Ol_Chiki", OL_CHIKI), + ("Old_Hungarian", OLD_HUNGARIAN), ("Old_Italic", OLD_ITALIC), + ("Old_North_Arabian", OLD_NORTH_ARABIAN), ("Old_Permic", OLD_PERMIC), + ("Old_Persian", OLD_PERSIAN), ("Old_South_Arabian", OLD_SOUTH_ARABIAN), + ("Old_Turkic", OLD_TURKIC), ("Oriya", ORIYA), ("Osage", OSAGE), + ("Osmanya", OSMANYA), ("Pahawh_Hmong", PAHAWH_HMONG), + ("Palmyrene", PALMYRENE), ("Pau_Cin_Hau", PAU_CIN_HAU), + ("Phags_Pa", PHAGS_PA), ("Phoenician", PHOENICIAN), + ("Psalter_Pahlavi", PSALTER_PAHLAVI), ("Rejang", REJANG), ("Runic", RUNIC), + ("Samaritan", SAMARITAN), ("Saurashtra", SAURASHTRA), ("Sharada", SHARADA), + ("Shavian", SHAVIAN), ("Siddham", SIDDHAM), ("SignWriting", SIGNWRITING), + ("Sinhala", SINHALA), ("Sora_Sompeng", SORA_SOMPENG), ("Soyombo", SOYOMBO), + ("Sundanese", SUNDANESE), ("Syloti_Nagri", SYLOTI_NAGRI), + ("Syriac", SYRIAC), ("Tagalog", TAGALOG), ("Tagbanwa", TAGBANWA), + ("Tai_Le", TAI_LE), ("Tai_Tham", TAI_THAM), ("Tai_Viet", TAI_VIET), + ("Takri", TAKRI), ("Tamil", TAMIL), ("Tangut", TANGUT), ("Telugu", TELUGU), + ("Thaana", THAANA), ("Thai", THAI), ("Tibetan", TIBETAN), + ("Tifinagh", TIFINAGH), ("Tirhuta", TIRHUTA), ("Ugaritic", UGARITIC), + ("Vai", VAI), ("Warang_Citi", WARANG_CITI), ("Yi", YI), + ("Zanabazar_Square", ZANABAZAR_SQUARE), +]; + +pub const ADLAM: &'static [(char, char)] = &[ + ('𞤀', '𞥊'), ('𞥐', '𞥙'), ('𞥞', '𞥟'), +]; + +pub const AHOM: &'static [(char, char)] = &[ + ('𑜀', '𑜙'), ('𑜝', '𑜫'), ('𑜰', '𑜿'), +]; + +pub const 
ANATOLIAN_HIEROGLYPHS: &'static [(char, char)] = &[ + ('𔐀', '𔙆'), +]; + +pub const ARABIC: &'static [(char, char)] = &[ + ('\u{600}', '\u{604}'), ('؆', '؋'), ('؍', 'ؚ'), ('\u{61c}', '\u{61c}'), + ('؞', '؞'), ('ؠ', 'ؿ'), ('ف', 'ي'), ('ٖ', 'ٯ'), ('ٱ', 'ۜ'), + ('۞', 'ۿ'), ('ݐ', 'ݿ'), ('ࢠ', 'ࢴ'), ('ࢶ', 'ࢽ'), ('ࣔ', '࣡'), + ('ࣣ', 'ࣿ'), ('ﭐ', '﯁'), ('ﯓ', 'ﴽ'), ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), ('ﷰ', '﷽'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), + ('𐹠', '𐹾'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), + ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), + ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), + ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), + ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), + ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), + ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), + ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), + ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𞻰', '𞻱'), +]; + +pub const ARMENIAN: &'static [(char, char)] = &[ + ('Ա', 'Ֆ'), ('ՙ', '՟'), ('ա', 'և'), ('֊', '֊'), ('֍', '֏'), + ('ﬓ', 'ﬗ'), +]; + +pub const AVESTAN: &'static [(char, char)] = &[ + ('𐬀', '𐬵'), ('𐬹', '𐬿'), +]; + +pub const BALINESE: &'static [(char, char)] = &[ + ('ᬀ', 'ᭋ'), ('᭐', '᭼'), +]; + +pub const BAMUM: &'static [(char, char)] = &[ + ('ꚠ', '꛷'), ('𖠀', '𖨸'), +]; + +pub const BASSA_VAH: &'static [(char, char)] = &[ + ('𖫐', '𖫭'), ('𖫰', '𖫵'), +]; + +pub const BATAK: &'static [(char, char)] = &[ + ('ᯀ', '᯳'), ('᯼', '᯿'), +]; + +pub const BENGALI: &'static [(char, char)] = &[ + ('ঀ', 'ঃ'), ('অ', 'ঌ'), ('এ', 'ঐ'), ('ও', 'ন'), + ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), ('়', 'ৄ'), + ('ে', 'ৈ'), ('ো', 'ৎ'), ('ৗ', 'ৗ'), ('ড়', 'ঢ়'), + ('য়', 'ৣ'), ('০', '৽'), +]; + +pub const BHAIKSUKI: &'static [(char, char)] = &[ + ('𑰀', '𑰈'), ('𑰊', '𑰶'), ('𑰸', '𑱅'), ('𑱐', '𑱬'), +]; + +pub const BOPOMOFO: &'static [(char, char)] = &[ + ('˪', '˫'), ('ㄅ', 'ㄮ'), ('ㆠ', 'ㆺ'), +]; + +pub const BRAHMI: &'static [(char, char)] = &[ + ('𑀀', '𑁍'), ('𑁒', '𑁯'), ('𑁿', '𑁿'), +]; + +pub const BRAILLE: &'static [(char, char)] = &[ + ('⠀', '⣿'), +]; + +pub 
const BUGINESE: &'static [(char, char)] = &[ + ('ᨀ', 'ᨛ'), ('᨞', '᨟'), +]; + +pub const BUHID: &'static [(char, char)] = &[ + ('ᝀ', 'ᝓ'), +]; + +pub const CANADIAN_ABORIGINAL: &'static [(char, char)] = &[ + ('᐀', 'ᙿ'), ('ᢰ', 'ᣵ'), +]; + +pub const CARIAN: &'static [(char, char)] = &[ + ('𐊠', '𐋐'), +]; + +pub const CAUCASIAN_ALBANIAN: &'static [(char, char)] = &[ + ('𐔰', '𐕣'), ('𐕯', '𐕯'), +]; + +pub const CHAKMA: &'static [(char, char)] = &[ + ('𑄀', '𑄴'), ('𑄶', '𑅃'), +]; + +pub const CHAM: &'static [(char, char)] = &[ + ('ꨀ', 'ꨶ'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), ('꩜', '꩟'), +]; + +pub const CHEROKEE: &'static [(char, char)] = &[ + ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ꭰ', 'ꮿ'), +]; + +pub const COMMON: &'static [(char, char)] = &[ + ('\u{0}', '@'), ('[', '`'), ('{', '©'), ('«', '¹'), ('»', '¿'), + ('×', '×'), ('÷', '÷'), ('ʹ', '˟'), ('˥', '˩'), ('ˬ', '˿'), + ('ʹ', 'ʹ'), (';', ';'), ('΅', '΅'), ('·', '·'), ('։', '։'), + ('\u{605}', '\u{605}'), ('،', '،'), ('؛', '؛'), ('؟', '؟'), + ('ـ', 'ـ'), ('\u{6dd}', '\u{6dd}'), ('\u{8e2}', '\u{8e2}'), + ('।', '॥'), ('฿', '฿'), ('࿕', '࿘'), ('჻', '჻'), + ('᛫', '᛭'), ('᜵', '᜶'), ('᠂', '᠃'), ('᠅', '᠅'), + ('᳓', '᳓'), ('᳡', '᳡'), ('ᳩ', 'ᳬ'), ('ᳮ', 'ᳳ'), + ('ᳵ', '᳷'), ('\u{2000}', '\u{200b}'), ('\u{200e}', '\u{2064}'), + ('\u{2066}', '⁰'), ('⁴', '⁾'), ('₀', '₎'), ('₠', '₿'), + ('℀', '℥'), ('℧', '℩'), ('ℬ', 'ℱ'), ('ℳ', '⅍'), + ('⅏', '⅟'), ('↉', '↋'), ('←', '␦'), ('⑀', '⑊'), + ('①', '⟿'), ('⤀', '⭳'), ('⭶', '⮕'), ('⮘', '⮹'), + ('⮽', '⯈'), ('⯊', '⯒'), ('⯬', '⯯'), ('⸀', '⹉'), + ('⿰', '⿻'), ('\u{3000}', '〄'), ('〆', '〆'), ('〈', '〠'), + ('〰', '〷'), ('〼', '〿'), ('゛', '゜'), ('゠', '゠'), + ('・', 'ー'), ('㆐', '㆟'), ('㇀', '㇣'), ('㈠', '㉟'), + ('㉿', '㋏'), ('㍘', '㏿'), ('䷀', '䷿'), ('꜀', '꜡'), + ('ꞈ', '꞊'), ('꠰', '꠹'), ('꤮', '꤮'), ('ꧏ', 'ꧏ'), + ('꭛', '꭛'), ('﴾', '﴿'), ('︐', '︙'), ('︰', '﹒'), + ('﹔', '﹦'), ('﹨', '﹫'), ('\u{feff}', '\u{feff}'), ('!', '@'), + ('[', '`'), ('{', '・'), ('ー', 'ー'), ('゙', '゚'), + ('¢', '₩'), ('│', '○'), ('\u{fff9}', '�'), ('𐄀', '𐄂'), + ('𐄇', '𐄳'), 
('𐄷', '𐄿'), ('𐆐', '𐆛'), ('𐇐', '𐇼'), + ('𐋡', '𐋻'), ('\u{1bca0}', '\u{1bca3}'), ('𝀀', '𝃵'), + ('𝄀', '𝄦'), ('𝄩', '𝅦'), ('𝅪', '\u{1d17a}'), ('𝆃', '𝆄'), + ('𝆌', '𝆩'), ('𝆮', '𝇨'), ('𝌀', '𝍖'), ('𝍠', '𝍱'), + ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), + ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), + ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), + ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), + ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝟋'), + ('𝟎', '𝟿'), ('🀀', '🀫'), ('🀰', '🂓'), ('🂠', '🂮'), + ('🂱', '🂿'), ('🃁', '🃏'), ('🃑', '🃵'), ('🄀', '🄌'), + ('🄐', '🄮'), ('🄰', '🅫'), ('🅰', '🆬'), ('🇦', '🇿'), + ('🈁', '🈂'), ('🈐', '🈻'), ('🉀', '🉈'), ('🉐', '🉑'), + ('🉠', '🉥'), ('🌀', '🛔'), ('🛠', '🛬'), ('🛰', '🛸'), + ('🜀', '🝳'), ('🞀', '🟔'), ('🠀', '🠋'), ('🠐', '🡇'), + ('🡐', '🡙'), ('🡠', '🢇'), ('🢐', '🢭'), ('🤀', '🤋'), + ('🤐', '🤾'), ('🥀', '🥌'), ('🥐', '🥫'), ('🦀', '🦗'), + ('🧀', '🧀'), ('🧐', '🧦'), ('\u{e0001}', '\u{e0001}'), + ('\u{e0020}', '\u{e007f}'), +]; + +pub const COPTIC: &'static [(char, char)] = &[ + ('Ϣ', 'ϯ'), ('Ⲁ', 'ⳳ'), ('⳹', '⳿'), +]; + +pub const CUNEIFORM: &'static [(char, char)] = &[ + ('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒑰', '𒑴'), ('𒒀', '𒕃'), +]; + +pub const CYPRIOT: &'static [(char, char)] = &[ + ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), + ('𐠼', '𐠼'), ('𐠿', '𐠿'), +]; + +pub const CYRILLIC: &'static [(char, char)] = &[ + ('Ѐ', '҄'), ('҇', 'ԯ'), ('ᲀ', 'ᲈ'), ('ᴫ', 'ᴫ'), ('ᵸ', 'ᵸ'), + ('ⷠ', 'ⷿ'), ('Ꙁ', 'ꚟ'), ('︮', '︯'), +]; + +pub const DESERET: &'static [(char, char)] = &[ + ('𐐀', '𐑏'), +]; + +pub const DEVANAGARI: &'static [(char, char)] = &[ + ('ऀ', 'ॐ'), ('॓', 'ॣ'), ('०', 'ॿ'), ('꣠', 'ꣽ'), +]; + +pub const DUPLOYAN: &'static [(char, char)] = &[ + ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), + ('𛲜', '𛲟'), +]; + +pub const EGYPTIAN_HIEROGLYPHS: &'static [(char, char)] = &[ + ('𓀀', '𓐮'), +]; + +pub const ELBASAN: &'static [(char, char)] = &[ + ('𐔀', '𐔧'), +]; + +pub const ETHIOPIC: &'static [(char, char)] = &[ + ('ሀ', 'ቈ'), ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), ('በ', 'ኈ'), 
('ኊ', 'ኍ'), ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), + ('፝', '፼'), ('ᎀ', '᎙'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), +]; + +pub const GEORGIAN: &'static [(char, char)] = &[ + ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), + ('ჼ', 'ჿ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), +]; + +pub const GLAGOLITIC: &'static [(char, char)] = &[ + ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), ('𞀀', '𞀆'), ('𞀈', '𞀘'), + ('𞀛', '𞀡'), ('𞀣', '𞀤'), ('𞀦', '𞀪'), +]; + +pub const GOTHIC: &'static [(char, char)] = &[ + ('𐌰', '𐍊'), +]; + +pub const GRANTHA: &'static [(char, char)] = &[ + ('𑌀', '𑌃'), ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), + ('𑌪', '𑌰'), ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('𑌼', '𑍄'), + ('𑍇', '𑍈'), ('𑍋', '𑍍'), ('𑍐', '𑍐'), ('𑍗', '𑍗'), + ('𑍝', '𑍣'), ('𑍦', '𑍬'), ('𑍰', '𑍴'), +]; + +pub const GREEK: &'static [(char, char)] = &[ + ('Ͱ', 'ͳ'), ('͵', 'ͷ'), ('ͺ', 'ͽ'), ('Ϳ', 'Ϳ'), ('΄', '΄'), + ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ϡ'), + ('ϰ', 'Ͽ'), ('ᴦ', 'ᴪ'), ('ᵝ', 'ᵡ'), ('ᵦ', 'ᵪ'), + ('ᶿ', 'ᶿ'), ('ἀ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ῄ'), + ('ῆ', 'ΐ'), ('ῖ', 'Ί'), ('῝', '`'), ('ῲ', 'ῴ'), + ('ῶ', '῾'), ('Ω', 'Ω'), ('ꭥ', 'ꭥ'), ('𐅀', '𐆎'), + ('𐆠', '𐆠'), ('𝈀', '𝉅'), +]; + +pub const GUJARATI: &'static [(char, char)] = &[ + ('ઁ', 'ઃ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), + ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('઼', 'ૅ'), + ('ે', 'ૉ'), ('ો', '્'), ('ૐ', 'ૐ'), ('ૠ', 'ૣ'), + ('૦', '૱'), ('ૹ', '૿'), +]; + +pub const GURMUKHI: &'static [(char, char)] = &[ + ('ਁ', 'ਃ'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), + ('਼', '਼'), ('ਾ', 'ੂ'), ('ੇ', 'ੈ'), ('ੋ', '੍'), + ('ੑ', 'ੑ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('੦', 'ੵ'), +]; + +pub const HAN: &'static [(char, char)] = &[ + ('⺀', '⺙'), 
('⺛', '⻳'), ('⼀', '⿕'), ('々', '々'), + ('〇', '〇'), ('〡', '〩'), ('〸', '〻'), ('㐀', '䶵'), + ('一', '鿪'), ('豈', '舘'), ('並', '龎'), ('𠀀', '𪛖'), + ('𪜀', '𫜴'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), + ('丽', '𪘀'), +]; + +pub const HANGUL: &'static [(char, char)] = &[ + ('ᄀ', 'ᇿ'), ('〮', '〯'), ('ㄱ', 'ㆎ'), ('㈀', '㈞'), + ('㉠', '㉾'), ('ꥠ', 'ꥼ'), ('가', '힣'), ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), ('ᅠ', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), +]; + +pub const HANUNOO: &'static [(char, char)] = &[ + ('ᜠ', '᜴'), +]; + +pub const HATRAN: &'static [(char, char)] = &[ + ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐣻', '𐣿'), +]; + +pub const HEBREW: &'static [(char, char)] = &[ + ('֑', 'ׇ'), ('א', 'ת'), ('װ', '״'), ('יִ', 'זּ'), ('טּ', 'לּ'), + ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﭏ'), +]; + +pub const HIRAGANA: &'static [(char, char)] = &[ + ('ぁ', 'ゖ'), ('ゝ', 'ゟ'), ('𛀁', '𛄞'), ('🈀', '🈀'), +]; + +pub const IMPERIAL_ARAMAIC: &'static [(char, char)] = &[ + ('𐡀', '𐡕'), ('𐡗', '𐡟'), +]; + +pub const INHERITED: &'static [(char, char)] = &[ + ('̀', 'ͯ'), ('҅', '҆'), ('ً', 'ٕ'), ('ٰ', 'ٰ'), ('॑', '॒'), + ('᪰', '᪾'), ('᳐', '᳒'), ('᳔', '᳠'), ('᳢', '᳨'), + ('᳭', '᳭'), ('᳴', '᳴'), ('᳸', '᳹'), ('᷀', '᷹'), + ('᷻', '᷿'), ('\u{200c}', '\u{200d}'), ('⃐', '⃰'), ('〪', '〭'), + ('゙', '゚'), ('︀', '️'), ('︠', '︭'), ('𐇽', '𐇽'), + ('𐋠', '𐋠'), ('𝅧', '𝅩'), ('𝅻', '𝆂'), ('𝆅', '𝆋'), + ('𝆪', '𝆭'), ('󠄀', '󠇯'), +]; + +pub const INSCRIPTIONAL_PAHLAVI: &'static [(char, char)] = &[ + ('𐭠', '𐭲'), ('𐭸', '𐭿'), +]; + +pub const INSCRIPTIONAL_PARTHIAN: &'static [(char, char)] = &[ + ('𐭀', '𐭕'), ('𐭘', '𐭟'), +]; + +pub const JAVANESE: &'static [(char, char)] = &[ + ('ꦀ', '꧍'), ('꧐', '꧙'), ('꧞', '꧟'), +]; + +pub const KAITHI: &'static [(char, char)] = &[ + ('𑂀', '𑃁'), +]; + +pub const KANNADA: &'static [(char, char)] = &[ + ('ಀ', 'ಃ'), ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('಼', 'ೄ'), ('ೆ', 'ೈ'), + ('ೊ', '್'), ('ೕ', 'ೖ'), ('ೞ', 'ೞ'), ('ೠ', 'ೣ'), + ('೦', '೯'), ('ೱ', 'ೲ'), +]; + +pub const KATAKANA: &'static 
[(char, char)] = &[ + ('ァ', 'ヺ'), ('ヽ', 'ヿ'), ('ㇰ', 'ㇿ'), ('㋐', '㋾'), + ('㌀', '㍗'), ('ヲ', 'ッ'), ('ア', 'ン'), ('𛀀', '𛀀'), +]; + +pub const KAYAH_LI: &'static [(char, char)] = &[ + ('꤀', '꤭'), ('꤯', '꤯'), +]; + +pub const KHAROSHTHI: &'static [(char, char)] = &[ + ('𐨀', '𐨃'), ('𐨅', '𐨆'), ('𐨌', '𐨓'), ('𐨕', '𐨗'), + ('𐨙', '𐨳'), ('𐨸', '𐨺'), ('𐨿', '𐩇'), ('𐩐', '𐩘'), +]; + +pub const KHMER: &'static [(char, char)] = &[ + ('ក', '៝'), ('០', '៩'), ('៰', '៹'), ('᧠', '᧿'), +]; + +pub const KHOJKI: &'static [(char, char)] = &[ + ('𑈀', '𑈑'), ('𑈓', '𑈾'), +]; + +pub const KHUDAWADI: &'static [(char, char)] = &[ + ('𑊰', '𑋪'), ('𑋰', '𑋹'), +]; + +pub const LAO: &'static [(char, char)] = &[ + ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ງ', 'ຈ'), ('ຊ', 'ຊ'), + ('ຍ', 'ຍ'), ('ດ', 'ທ'), ('ນ', 'ຟ'), ('ມ', 'ຣ'), + ('ລ', 'ລ'), ('ວ', 'ວ'), ('ສ', 'ຫ'), ('ອ', 'ູ'), + ('ົ', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('່', 'ໍ'), + ('໐', '໙'), ('ໜ', 'ໟ'), +]; + +pub const LATIN: &'static [(char, char)] = &[ + ('A', 'Z'), ('a', 'z'), ('ª', 'ª'), ('º', 'º'), ('À', 'Ö'), + ('Ø', 'ö'), ('ø', 'ʸ'), ('ˠ', 'ˤ'), ('ᴀ', 'ᴥ'), ('ᴬ', 'ᵜ'), + ('ᵢ', 'ᵥ'), ('ᵫ', 'ᵷ'), ('ᵹ', 'ᶾ'), ('Ḁ', 'ỿ'), + ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('K', 'Å'), + ('Ⅎ', 'Ⅎ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), ('Ⱡ', 'Ɀ'), + ('Ꜣ', 'ꞇ'), ('Ꞌ', 'Ɪ'), ('Ʞ', 'ꞷ'), ('ꟷ', 'ꟿ'), + ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭤ'), ('ff', 'st'), ('A', 'Z'), + ('a', 'z'), +]; + +pub const LEPCHA: &'static [(char, char)] = &[ + ('ᰀ', '᰷'), ('᰻', '᱉'), ('ᱍ', 'ᱏ'), +]; + +pub const LIMBU: &'static [(char, char)] = &[ + ('ᤀ', 'ᤞ'), ('ᤠ', 'ᤫ'), ('ᤰ', '᤻'), ('᥀', '᥀'), + ('᥄', '᥏'), +]; + +pub const LINEAR_A: &'static [(char, char)] = &[ + ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), +]; + +pub const LINEAR_B: &'static [(char, char)] = &[ + ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), + ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), +]; + +pub const LISU: &'static [(char, char)] = &[ + ('ꓐ', '꓿'), +]; + +pub const LYCIAN: &'static [(char, char)] = &[ + ('𐊀', '𐊜'), +]; + +pub const LYDIAN: &'static [(char, char)] = &[ + ('𐤠', '𐤹'), 
('𐤿', '𐤿'), +]; + +pub const MAHAJANI: &'static [(char, char)] = &[ + ('𑅐', '𑅶'), +]; + +pub const MALAYALAM: &'static [(char, char)] = &[ + ('ഀ', 'ഃ'), ('അ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ൄ'), + ('െ', 'ൈ'), ('ൊ', '൏'), ('ൔ', 'ൣ'), ('൦', 'ൿ'), +]; + +pub const MANDAIC: &'static [(char, char)] = &[ + ('ࡀ', '࡛'), ('࡞', '࡞'), +]; + +pub const MANICHAEAN: &'static [(char, char)] = &[ + ('𐫀', '𐫦'), ('𐫫', '𐫶'), +]; + +pub const MARCHEN: &'static [(char, char)] = &[ + ('𑱰', '𑲏'), ('𑲒', '𑲧'), ('𑲩', '𑲶'), +]; + +pub const MASARAM_GONDI: &'static [(char, char)] = &[ + ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '𑴶'), ('𑴺', '𑴺'), + ('𑴼', '𑴽'), ('𑴿', '𑵇'), ('𑵐', '𑵙'), +]; + +pub const MEETEI_MAYEK: &'static [(char, char)] = &[ + ('ꫠ', '꫶'), ('ꯀ', '꯭'), ('꯰', '꯹'), +]; + +pub const MENDE_KIKAKUI: &'static [(char, char)] = &[ + ('𞠀', '𞣄'), ('𞣇', '𞣖'), +]; + +pub const MEROITIC_CURSIVE: &'static [(char, char)] = &[ + ('𐦠', '𐦷'), ('𐦼', '𐧏'), ('𐧒', '𐧿'), +]; + +pub const MEROITIC_HIEROGLYPHS: &'static [(char, char)] = &[ + ('𐦀', '𐦟'), +]; + +pub const MIAO: &'static [(char, char)] = &[ + ('𖼀', '𖽄'), ('𖽐', '𖽾'), ('𖾏', '𖾟'), +]; + +pub const MODI: &'static [(char, char)] = &[ + ('𑘀', '𑙄'), ('𑙐', '𑙙'), +]; + +pub const MONGOLIAN: &'static [(char, char)] = &[ + ('᠀', '᠁'), ('᠄', '᠄'), ('᠆', '\u{180e}'), ('᠐', '᠙'), + ('ᠠ', 'ᡷ'), ('ᢀ', 'ᢪ'), ('𑙠', '𑙬'), +]; + +pub const MRO: &'static [(char, char)] = &[ + ('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯'), +]; + +pub const MULTANI: &'static [(char, char)] = &[ + ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), + ('𑊟', '𑊩'), +]; + +pub const MYANMAR: &'static [(char, char)] = &[ + ('က', '႟'), ('ꧠ', 'ꧾ'), ('ꩠ', 'ꩿ'), +]; + +pub const NABATAEAN: &'static [(char, char)] = &[ + ('𐢀', '𐢞'), ('𐢧', '𐢯'), +]; + +pub const NEW_TAI_LUE: &'static [(char, char)] = &[ + ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧚'), ('᧞', '᧟'), +]; + +pub const NEWA: &'static [(char, char)] = &[ + ('𑐀', '𑑙'), ('𑑛', '𑑛'), ('𑑝', '𑑝'), +]; + +pub const NKO: &'static [(char, char)] = &[ + ('߀', 'ߺ'), +]; + +pub const 
NUSHU: &'static [(char, char)] = &[ + ('𖿡', '𖿡'), ('𛅰', '𛋻'), +]; + +pub const OGHAM: &'static [(char, char)] = &[ + ('\u{1680}', '᚜'), +]; + +pub const OL_CHIKI: &'static [(char, char)] = &[ + ('᱐', '᱿'), +]; + +pub const OLD_HUNGARIAN: &'static [(char, char)] = &[ + ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐳺', '𐳿'), +]; + +pub const OLD_ITALIC: &'static [(char, char)] = &[ + ('𐌀', '𐌣'), ('𐌭', '𐌯'), +]; + +pub const OLD_NORTH_ARABIAN: &'static [(char, char)] = &[ + ('𐪀', '𐪟'), +]; + +pub const OLD_PERMIC: &'static [(char, char)] = &[ + ('𐍐', '𐍺'), +]; + +pub const OLD_PERSIAN: &'static [(char, char)] = &[ + ('𐎠', '𐏃'), ('𐏈', '𐏕'), +]; + +pub const OLD_SOUTH_ARABIAN: &'static [(char, char)] = &[ + ('𐩠', '𐩿'), +]; + +pub const OLD_TURKIC: &'static [(char, char)] = &[ + ('𐰀', '𐱈'), +]; + +pub const ORIYA: &'static [(char, char)] = &[ + ('ଁ', 'ଃ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), ('ଓ', 'ନ'), + ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'), ('଼', 'ୄ'), + ('େ', 'ୈ'), ('ୋ', '୍'), ('ୖ', 'ୗ'), ('ଡ଼', 'ଢ଼'), + ('ୟ', 'ୣ'), ('୦', '୷'), +]; + +pub const OSAGE: &'static [(char, char)] = &[ + ('𐒰', '𐓓'), ('𐓘', '𐓻'), +]; + +pub const OSMANYA: &'static [(char, char)] = &[ + ('𐒀', '𐒝'), ('𐒠', '𐒩'), +]; + +pub const PAHAWH_HMONG: &'static [(char, char)] = &[ + ('𖬀', '𖭅'), ('𖭐', '𖭙'), ('𖭛', '𖭡'), ('𖭣', '𖭷'), + ('𖭽', '𖮏'), +]; + +pub const PALMYRENE: &'static [(char, char)] = &[ + ('𐡠', '𐡿'), +]; + +pub const PAU_CIN_HAU: &'static [(char, char)] = &[ + ('𑫀', '𑫸'), +]; + +pub const PHAGS_PA: &'static [(char, char)] = &[ + ('ꡀ', '꡷'), +]; + +pub const PHOENICIAN: &'static [(char, char)] = &[ + ('𐤀', '𐤛'), ('𐤟', '𐤟'), +]; + +pub const PSALTER_PAHLAVI: &'static [(char, char)] = &[ + ('𐮀', '𐮑'), ('𐮙', '𐮜'), ('𐮩', '𐮯'), +]; + +pub const REJANG: &'static [(char, char)] = &[ + ('ꤰ', '꥓'), ('꥟', '꥟'), +]; + +pub const RUNIC: &'static [(char, char)] = &[ + ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), +]; + +pub const SAMARITAN: &'static [(char, char)] = &[ + ('ࠀ', '࠭'), ('࠰', '࠾'), +]; + +pub const SAURASHTRA: &'static [(char, char)] = &[ + ('ꢀ', 
'ꣅ'), ('꣎', '꣙'), +]; + +pub const SHARADA: &'static [(char, char)] = &[ + ('𑆀', '𑇍'), ('𑇐', '𑇟'), +]; + +pub const SHAVIAN: &'static [(char, char)] = &[ + ('𐑐', '𐑿'), +]; + +pub const SIDDHAM: &'static [(char, char)] = &[ + ('𑖀', '𑖵'), ('𑖸', '𑗝'), +]; + +pub const SIGNWRITING: &'static [(char, char)] = &[ + ('𝠀', '𝪋'), ('𝪛', '𝪟'), ('𝪡', '𝪯'), +]; + +pub const SINHALA: &'static [(char, char)] = &[ + ('ං', 'ඃ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), + ('ල', 'ල'), ('ව', 'ෆ'), ('්', '්'), ('ා', 'ු'), + ('ූ', 'ූ'), ('ෘ', 'ෟ'), ('෦', '෯'), ('ෲ', '෴'), + ('𑇡', '𑇴'), +]; + +pub const SORA_SOMPENG: &'static [(char, char)] = &[ + ('𑃐', '𑃨'), ('𑃰', '𑃹'), +]; + +pub const SOYOMBO: &'static [(char, char)] = &[ + ('𑩐', '𑪃'), ('𑪆', '𑪜'), ('𑪞', '𑪢'), +]; + +pub const SUNDANESE: &'static [(char, char)] = &[ + ('ᮀ', 'ᮿ'), ('᳀', '᳇'), +]; + +pub const SYLOTI_NAGRI: &'static [(char, char)] = &[ + ('ꠀ', '꠫'), +]; + +pub const SYRIAC: &'static [(char, char)] = &[ + ('܀', '܍'), ('\u{70f}', '݊'), ('ݍ', 'ݏ'), ('ࡠ', 'ࡪ'), +]; + +pub const TAGALOG: &'static [(char, char)] = &[ + ('ᜀ', 'ᜌ'), ('ᜎ', '᜔'), +]; + +pub const TAGBANWA: &'static [(char, char)] = &[ + ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('ᝲ', 'ᝳ'), +]; + +pub const TAI_LE: &'static [(char, char)] = &[ + ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ'), +]; + +pub const TAI_THAM: &'static [(char, char)] = &[ + ('ᨠ', 'ᩞ'), ('᩠', '᩼'), ('᩿', '᪉'), ('᪐', '᪙'), + ('᪠', '᪭'), +]; + +pub const TAI_VIET: &'static [(char, char)] = &[ + ('ꪀ', 'ꫂ'), ('ꫛ', '꫟'), +]; + +pub const TAKRI: &'static [(char, char)] = &[ + ('𑚀', '𑚷'), ('𑛀', '𑛉'), +]; + +pub const TAMIL: &'static [(char, char)] = &[ + ('ஂ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), + ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), + ('ந', 'ப'), ('ம', 'ஹ'), ('ா', 'ூ'), ('ெ', 'ை'), + ('ொ', '்'), ('ௐ', 'ௐ'), ('ௗ', 'ௗ'), ('௦', '௺'), +]; + +pub const TANGUT: &'static [(char, char)] = &[ + ('𖿠', '𖿠'), ('𗀀', '𘟬'), ('𘠀', '𘫲'), +]; + +pub const TELUGU: &'static [(char, char)] = &[ + ('ఀ', 'ః'), ('అ', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), + 
('ప', 'హ'), ('ఽ', 'ౄ'), ('ె', 'ై'), ('ొ', '్'), + ('ౕ', 'ౖ'), ('ౘ', 'ౚ'), ('ౠ', 'ౣ'), ('౦', '౯'), + ('౸', '౿'), +]; + +pub const THAANA: &'static [(char, char)] = &[ + ('ހ', 'ޱ'), +]; + +pub const THAI: &'static [(char, char)] = &[ + ('ก', 'ฺ'), ('เ', '๛'), +]; + +pub const TIBETAN: &'static [(char, char)] = &[ + ('ༀ', 'ཇ'), ('ཉ', 'ཬ'), ('ཱ', 'ྗ'), ('ྙ', 'ྼ'), + ('྾', '࿌'), ('࿎', '࿔'), ('࿙', '࿚'), +]; + +pub const TIFINAGH: &'static [(char, char)] = &[ + ('ⴰ', 'ⵧ'), ('ⵯ', '⵰'), ('⵿', '⵿'), +]; + +pub const TIRHUTA: &'static [(char, char)] = &[ + ('𑒀', '𑓇'), ('𑓐', '𑓙'), +]; + +pub const UGARITIC: &'static [(char, char)] = &[ + ('𐎀', '𐎝'), ('𐎟', '𐎟'), +]; + +pub const VAI: &'static [(char, char)] = &[ + ('ꔀ', 'ꘫ'), +]; + +pub const WARANG_CITI: &'static [(char, char)] = &[ + ('𑢠', '𑣲'), ('𑣿', '𑣿'), +]; + +pub const YI: &'static [(char, char)] = &[ + ('ꀀ', 'ꒌ'), ('꒐', '꓆'), +]; + +pub const ZANABAZAR_SQUARE: &'static [(char, char)] = &[ + ('𑨀', '𑩇'), +]; diff --git a/regex-syntax-2/src/unicode_tables/script_extension.rs b/regex-syntax-2/src/unicode_tables/script_extension.rs new file mode 100644 index 0000000000..10b6c3e03f --- /dev/null +++ b/regex-syntax-2/src/unicode_tables/script_extension.rs @@ -0,0 +1,785 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate script-extension tmp/ucd-10.0.0/ --chars +// +// ucd-generate is available on crates.io. 
+ +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("Adlam", ADLAM), ("Ahom", AHOM), + ("Anatolian_Hieroglyphs", ANATOLIAN_HIEROGLYPHS), ("Arabic", ARABIC), + ("Armenian", ARMENIAN), ("Avestan", AVESTAN), ("Balinese", BALINESE), + ("Bamum", BAMUM), ("Bassa_Vah", BASSA_VAH), ("Batak", BATAK), + ("Bengali", BENGALI), ("Bhaiksuki", BHAIKSUKI), ("Bopomofo", BOPOMOFO), + ("Brahmi", BRAHMI), ("Braille", BRAILLE), ("Buginese", BUGINESE), + ("Buhid", BUHID), ("Canadian_Aboriginal", CANADIAN_ABORIGINAL), + ("Carian", CARIAN), ("Caucasian_Albanian", CAUCASIAN_ALBANIAN), + ("Chakma", CHAKMA), ("Cham", CHAM), ("Cherokee", CHEROKEE), + ("Common", COMMON), ("Coptic", COPTIC), ("Cuneiform", CUNEIFORM), + ("Cypriot", CYPRIOT), ("Cyrillic", CYRILLIC), ("Deseret", DESERET), + ("Devanagari", DEVANAGARI), ("Duployan", DUPLOYAN), + ("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS), ("Elbasan", ELBASAN), + ("Ethiopic", ETHIOPIC), ("Georgian", GEORGIAN), ("Glagolitic", GLAGOLITIC), + ("Gothic", GOTHIC), ("Grantha", GRANTHA), ("Greek", GREEK), + ("Gujarati", GUJARATI), ("Gurmukhi", GURMUKHI), ("Han", HAN), + ("Hangul", HANGUL), ("Hanunoo", HANUNOO), ("Hatran", HATRAN), + ("Hebrew", HEBREW), ("Hiragana", HIRAGANA), + ("Imperial_Aramaic", IMPERIAL_ARAMAIC), ("Inherited", INHERITED), + ("Inscriptional_Pahlavi", INSCRIPTIONAL_PAHLAVI), + ("Inscriptional_Parthian", INSCRIPTIONAL_PARTHIAN), ("Javanese", JAVANESE), + ("Kaithi", KAITHI), ("Kannada", KANNADA), ("Katakana", KATAKANA), + ("Kayah_Li", KAYAH_LI), ("Kharoshthi", KHAROSHTHI), ("Khmer", KHMER), + ("Khojki", KHOJKI), ("Khudawadi", KHUDAWADI), ("Lao", LAO), + ("Latin", LATIN), ("Lepcha", LEPCHA), ("Limbu", LIMBU), + ("Linear_A", LINEAR_A), ("Linear_B", LINEAR_B), ("Lisu", LISU), + ("Lycian", LYCIAN), ("Lydian", LYDIAN), ("Mahajani", MAHAJANI), + ("Malayalam", MALAYALAM), ("Mandaic", MANDAIC), ("Manichaean", MANICHAEAN), + ("Marchen", MARCHEN), ("Masaram_Gondi", MASARAM_GONDI), + ("Meetei_Mayek", MEETEI_MAYEK), 
("Mende_Kikakui", MENDE_KIKAKUI), + ("Meroitic_Cursive", MEROITIC_CURSIVE), + ("Meroitic_Hieroglyphs", MEROITIC_HIEROGLYPHS), ("Miao", MIAO), + ("Modi", MODI), ("Mongolian", MONGOLIAN), ("Mro", MRO), + ("Multani", MULTANI), ("Myanmar", MYANMAR), ("Nabataean", NABATAEAN), + ("New_Tai_Lue", NEW_TAI_LUE), ("Newa", NEWA), ("Nko", NKO), + ("Nushu", NUSHU), ("Ogham", OGHAM), ("Ol_Chiki", OL_CHIKI), + ("Old_Hungarian", OLD_HUNGARIAN), ("Old_Italic", OLD_ITALIC), + ("Old_North_Arabian", OLD_NORTH_ARABIAN), ("Old_Permic", OLD_PERMIC), + ("Old_Persian", OLD_PERSIAN), ("Old_South_Arabian", OLD_SOUTH_ARABIAN), + ("Old_Turkic", OLD_TURKIC), ("Oriya", ORIYA), ("Osage", OSAGE), + ("Osmanya", OSMANYA), ("Pahawh_Hmong", PAHAWH_HMONG), + ("Palmyrene", PALMYRENE), ("Pau_Cin_Hau", PAU_CIN_HAU), + ("Phags_Pa", PHAGS_PA), ("Phoenician", PHOENICIAN), + ("Psalter_Pahlavi", PSALTER_PAHLAVI), ("Rejang", REJANG), ("Runic", RUNIC), + ("Samaritan", SAMARITAN), ("Saurashtra", SAURASHTRA), ("Sharada", SHARADA), + ("Shavian", SHAVIAN), ("Siddham", SIDDHAM), ("SignWriting", SIGNWRITING), + ("Sinhala", SINHALA), ("Sora_Sompeng", SORA_SOMPENG), ("Soyombo", SOYOMBO), + ("Sundanese", SUNDANESE), ("Syloti_Nagri", SYLOTI_NAGRI), + ("Syriac", SYRIAC), ("Tagalog", TAGALOG), ("Tagbanwa", TAGBANWA), + ("Tai_Le", TAI_LE), ("Tai_Tham", TAI_THAM), ("Tai_Viet", TAI_VIET), + ("Takri", TAKRI), ("Tamil", TAMIL), ("Tangut", TANGUT), ("Telugu", TELUGU), + ("Thaana", THAANA), ("Thai", THAI), ("Tibetan", TIBETAN), + ("Tifinagh", TIFINAGH), ("Tirhuta", TIRHUTA), ("Ugaritic", UGARITIC), + ("Vai", VAI), ("Warang_Citi", WARANG_CITI), ("Yi", YI), + ("Zanabazar_Square", ZANABAZAR_SQUARE), +]; + +pub const ADLAM: &'static [(char, char)] = &[ + ('ـ', 'ـ'), ('𞤀', '𞥊'), ('𞥐', '𞥙'), ('𞥞', '𞥟'), +]; + +pub const AHOM: &'static [(char, char)] = &[ + ('𑜀', '𑜙'), ('𑜝', '𑜫'), ('𑜰', '𑜿'), +]; + +pub const ANATOLIAN_HIEROGLYPHS: &'static [(char, char)] = &[ + ('𔐀', '𔙆'), +]; + +pub const ARABIC: &'static [(char, char)] = &[ + 
('\u{600}', '\u{604}'), ('؆', '\u{61c}'), ('؞', 'ۜ'), ('۞', 'ۿ'), + ('ݐ', 'ݿ'), ('ࢠ', 'ࢴ'), ('ࢶ', 'ࢽ'), ('ࣔ', '࣡'), + ('ࣣ', 'ࣿ'), ('ﭐ', '﯁'), ('ﯓ', 'ﴽ'), ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), ('ﷰ', '﷽'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), + ('𐋠', '𐋻'), ('𐹠', '𐹾'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), + ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), + ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), + ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), + ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), + ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), + ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), + ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), + ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𞻰', '𞻱'), +]; + +pub const ARMENIAN: &'static [(char, char)] = &[ + ('Ա', 'Ֆ'), ('ՙ', '՟'), ('ա', 'և'), ('։', '֊'), ('֍', '֏'), + ('ﬓ', 'ﬗ'), +]; + +pub const AVESTAN: &'static [(char, char)] = &[ + ('𐬀', '𐬵'), ('𐬹', '𐬿'), +]; + +pub const BALINESE: &'static [(char, char)] = &[ + ('ᬀ', 'ᭋ'), ('᭐', '᭼'), +]; + +pub const BAMUM: &'static [(char, char)] = &[ + ('ꚠ', '꛷'), ('𖠀', '𖨸'), +]; + +pub const BASSA_VAH: &'static [(char, char)] = &[ + ('𖫐', '𖫭'), ('𖫰', '𖫵'), +]; + +pub const BATAK: &'static [(char, char)] = &[ + ('ᯀ', '᯳'), ('᯼', '᯿'), +]; + +pub const BENGALI: &'static [(char, char)] = &[ + ('॑', '॒'), ('।', '॥'), ('ঀ', 'ঃ'), ('অ', 'ঌ'), + ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), + ('শ', 'হ'), ('়', 'ৄ'), ('ে', 'ৈ'), ('ো', 'ৎ'), + ('ৗ', 'ৗ'), ('ড়', 'ঢ়'), ('য়', 'ৣ'), ('০', '৽'), + ('᳷', '᳷'), ('꣱', '꣱'), +]; + +pub const BHAIKSUKI: &'static [(char, char)] = &[ + ('𑰀', '𑰈'), ('𑰊', '𑰶'), ('𑰸', '𑱅'), ('𑱐', '𑱬'), +]; + +pub const BOPOMOFO: &'static [(char, char)] = &[ + ('˪', '˫'), ('、', '〃'), ('〈', '】'), ('〓', '〟'), + ('〪', '〭'), ('〰', '〰'), ('〷', '〷'), ('・', '・'), + ('ㄅ', 'ㄮ'), ('ㆠ', 'ㆺ'), ('﹅', '﹆'), ('。', '・'), +]; + +pub const BRAHMI: &'static [(char, char)] = &[ + ('𑀀', '𑁍'), ('𑁒', '𑁯'), ('𑁿', '𑁿'), +]; + +pub const BRAILLE: &'static [(char, char)] = &[ + ('⠀', '⣿'), +]; + +pub const BUGINESE: &'static 
[(char, char)] = &[ + ('ᨀ', 'ᨛ'), ('᨞', '᨟'), ('ꧏ', 'ꧏ'), +]; + +pub const BUHID: &'static [(char, char)] = &[ + ('᜵', '᜶'), ('ᝀ', 'ᝓ'), +]; + +pub const CANADIAN_ABORIGINAL: &'static [(char, char)] = &[ + ('᐀', 'ᙿ'), ('ᢰ', 'ᣵ'), +]; + +pub const CARIAN: &'static [(char, char)] = &[ + ('𐊠', '𐋐'), +]; + +pub const CAUCASIAN_ALBANIAN: &'static [(char, char)] = &[ + ('𐔰', '𐕣'), ('𐕯', '𐕯'), +]; + +pub const CHAKMA: &'static [(char, char)] = &[ + ('০', '৯'), ('၀', '၉'), ('𑄀', '𑄴'), ('𑄶', '𑅃'), +]; + +pub const CHAM: &'static [(char, char)] = &[ + ('ꨀ', 'ꨶ'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), ('꩜', '꩟'), +]; + +pub const CHEROKEE: &'static [(char, char)] = &[ + ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ꭰ', 'ꮿ'), +]; + +pub const COMMON: &'static [(char, char)] = &[ + ('\u{0}', '@'), ('[', '`'), ('{', '©'), ('«', '¹'), ('»', '¿'), + ('×', '×'), ('÷', '÷'), ('ʹ', '˟'), ('˥', '˩'), ('ˬ', '˿'), + ('ʹ', 'ʹ'), (';', ';'), ('΅', '΅'), ('·', '·'), + ('\u{605}', '\u{605}'), ('\u{6dd}', '\u{6dd}'), ('\u{8e2}', '\u{8e2}'), + ('฿', '฿'), ('࿕', '࿘'), ('᛫', '᛭'), ('\u{2000}', '\u{200b}'), + ('\u{200e}', '\u{2064}'), ('\u{2066}', '⁰'), ('⁴', '⁾'), + ('₀', '₎'), ('₠', '₿'), ('℀', '℥'), ('℧', '℩'), + ('ℬ', 'ℱ'), ('ℳ', '⅍'), ('⅏', '⅟'), ('↉', '↋'), + ('←', '␦'), ('⑀', '⑊'), ('①', '⟿'), ('⤀', '⭳'), + ('⭶', '⮕'), ('⮘', '⮹'), ('⮽', '⯈'), ('⯊', '⯒'), + ('⯬', '⯯'), ('⸀', '⹂'), ('⹄', '⹉'), ('⿰', '⿻'), + ('\u{3000}', '\u{3000}'), ('〄', '〄'), ('〒', '〒'), ('〠', '〠'), + ('〶', '〶'), ('㉈', '㉟'), ('㉿', '㉿'), ('㊱', '㊿'), + ('㋌', '㋏'), ('㍱', '㍺'), ('㎀', '㏟'), ('㏿', '㏿'), + ('䷀', '䷿'), ('꜀', '꜡'), ('ꞈ', '꞊'), ('꭛', '꭛'), + ('﴾', '﴿'), ('︐', '︙'), ('︰', '﹄'), ('﹇', '﹒'), + ('﹔', '﹦'), ('﹨', '﹫'), ('\u{feff}', '\u{feff}'), ('!', '@'), + ('[', '`'), ('{', '⦆'), ('¢', '₩'), ('│', '○'), + ('\u{fff9}', '�'), ('𐆐', '𐆛'), ('𐇐', '𐇼'), ('𝀀', '𝃵'), + ('𝄀', '𝄦'), ('𝄩', '𝅦'), ('𝅪', '\u{1d17a}'), ('𝆃', '𝆄'), + ('𝆌', '𝆩'), ('𝆮', '𝇨'), ('𝌀', '𝍖'), ('𝐀', '𝑔'), + ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), + ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), 
('𝒽', '𝓃'), + ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), + ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), + ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝟋'), ('𝟎', '𝟿'), + ('🀀', '🀫'), ('🀰', '🂓'), ('🂠', '🂮'), ('🂱', '🂿'), + ('🃁', '🃏'), ('🃑', '🃵'), ('🄀', '🄌'), ('🄐', '🄮'), + ('🄰', '🅫'), ('🅰', '🆬'), ('🇦', '🇿'), ('🈁', '🈂'), + ('🈐', '🈻'), ('🉀', '🉈'), ('🉠', '🉥'), ('🌀', '🛔'), + ('🛠', '🛬'), ('🛰', '🛸'), ('🜀', '🝳'), ('🞀', '🟔'), + ('🠀', '🠋'), ('🠐', '🡇'), ('🡐', '🡙'), ('🡠', '🢇'), + ('🢐', '🢭'), ('🤀', '🤋'), ('🤐', '🤾'), ('🥀', '🥌'), + ('🥐', '🥫'), ('🦀', '🦗'), ('🧀', '🧀'), ('🧐', '🧦'), + ('\u{e0001}', '\u{e0001}'), ('\u{e0020}', '\u{e007f}'), +]; + +pub const COPTIC: &'static [(char, char)] = &[ + ('Ϣ', 'ϯ'), ('Ⲁ', 'ⳳ'), ('⳹', '⳿'), ('𐋠', '𐋻'), +]; + +pub const CUNEIFORM: &'static [(char, char)] = &[ + ('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒑰', '𒑴'), ('𒒀', '𒕃'), +]; + +pub const CYPRIOT: &'static [(char, char)] = &[ + ('𐄀', '𐄂'), ('𐄇', '𐄳'), ('𐄷', '𐄿'), ('𐠀', '𐠅'), + ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), + ('𐠿', '𐠿'), +]; + +pub const CYRILLIC: &'static [(char, char)] = &[ + ('Ѐ', 'ԯ'), ('ᲀ', 'ᲈ'), ('ᴫ', 'ᴫ'), ('ᵸ', 'ᵸ'), + ('ⷠ', 'ⷿ'), ('⹃', '⹃'), ('Ꙁ', 'ꚟ'), ('︮', '︯'), +]; + +pub const DESERET: &'static [(char, char)] = &[ + ('𐐀', '𐑏'), +]; + +pub const DEVANAGARI: &'static [(char, char)] = &[ + ('ऀ', 'ॿ'), ('᳐', 'ᳶ'), ('᳸', '᳹'), ('⃰', '⃰'), + ('꠰', '꠹'), ('꣠', 'ꣽ'), +]; + +pub const DUPLOYAN: &'static [(char, char)] = &[ + ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), + ('𛲜', '\u{1bca3}'), +]; + +pub const EGYPTIAN_HIEROGLYPHS: &'static [(char, char)] = &[ + ('𓀀', '𓐮'), +]; + +pub const ELBASAN: &'static [(char, char)] = &[ + ('𐔀', '𐔧'), +]; + +pub const ETHIOPIC: &'static [(char, char)] = &[ + ('ሀ', 'ቈ'), ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), + ('፝', '፼'), ('ᎀ', '᎙'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), + 
('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), +]; + +pub const GEORGIAN: &'static [(char, char)] = &[ + ('։', '։'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), + ('ა', 'ჿ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), +]; + +pub const GLAGOLITIC: &'static [(char, char)] = &[ + ('҄', '҄'), ('҇', '҇'), ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), ('⹃', '⹃'), + ('꙯', '꙯'), ('𞀀', '𞀆'), ('𞀈', '𞀘'), ('𞀛', '𞀡'), + ('𞀣', '𞀤'), ('𞀦', '𞀪'), +]; + +pub const GOTHIC: &'static [(char, char)] = &[ + ('𐌰', '𐍊'), +]; + +pub const GRANTHA: &'static [(char, char)] = &[ + ('॑', '॒'), ('।', '॥'), ('ப', 'ப'), ('வ', 'வ'), + ('௦', '௲'), ('᳐', '᳐'), ('᳒', '᳓'), ('ᳲ', '᳴'), + ('᳸', '᳹'), ('⃰', '⃰'), ('𑌀', '𑌃'), ('𑌅', '𑌌'), + ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), + ('𑌵', '𑌹'), ('𑌼', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍍'), + ('𑍐', '𑍐'), ('𑍗', '𑍗'), ('𑍝', '𑍣'), ('𑍦', '𑍬'), + ('𑍰', '𑍴'), +]; + +pub const GREEK: &'static [(char, char)] = &[ + ('͂', '͂'), ('ͅ', 'ͅ'), ('Ͱ', 'ͳ'), ('͵', 'ͷ'), ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), ('΄', '΄'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), + ('Ύ', 'Ρ'), ('Σ', 'ϡ'), ('ϰ', 'Ͽ'), ('ᴦ', 'ᴪ'), ('ᵝ', 'ᵡ'), + ('ᵦ', 'ᵪ'), ('ᶿ', '᷁'), ('ἀ', 'ἕ'), ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), + ('ᾶ', 'ῄ'), ('ῆ', 'ΐ'), ('ῖ', 'Ί'), ('῝', '`'), + ('ῲ', 'ῴ'), ('ῶ', '῾'), ('Ω', 'Ω'), ('ꭥ', 'ꭥ'), + ('𐅀', '𐆎'), ('𐆠', '𐆠'), ('𝈀', '𝉅'), +]; + +pub const GUJARATI: &'static [(char, char)] = &[ + ('॑', '॒'), ('।', '॥'), ('ઁ', 'ઃ'), ('અ', 'ઍ'), + ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), ('લ', 'ળ'), + ('વ', 'હ'), ('઼', 'ૅ'), ('ે', 'ૉ'), ('ો', '્'), + ('ૐ', 'ૐ'), ('ૠ', 'ૣ'), ('૦', '૱'), ('ૹ', '૿'), + ('꠰', '꠹'), +]; + +pub const GURMUKHI: &'static [(char, char)] = &[ + ('॑', '॒'), ('।', '॥'), ('ਁ', 'ਃ'), ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('਼', '਼'), ('ਾ', 'ੂ'), + ('ੇ', 'ੈ'), ('ੋ', '੍'), ('ੑ', 'ੑ'), ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), ('੦', 'ੵ'), ('꠰', '꠹'), +]; + +pub const 
HAN: &'static [(char, char)] = &[ + ('⺀', '⺙'), ('⺛', '⻳'), ('⼀', '⿕'), ('、', '〃'), + ('々', '】'), ('〓', '〟'), ('〡', '〭'), ('〰', '〰'), + ('〷', '〿'), ('・', '・'), ('㆐', '㆟'), ('㇀', '㇣'), + ('㈠', '㉇'), ('㊀', '㊰'), ('㋀', '㋋'), ('㍘', '㍰'), + ('㍻', '㍿'), ('㏠', '㏾'), ('㐀', '䶵'), ('一', '鿪'), + ('豈', '舘'), ('並', '龎'), ('﹅', '﹆'), ('。', '・'), + ('𝍠', '𝍱'), ('🉐', '🉑'), ('𠀀', '𪛖'), ('𪜀', '𫜴'), + ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), +]; + +pub const HANGUL: &'static [(char, char)] = &[ + ('ᄀ', 'ᇿ'), ('、', '〃'), ('〈', '】'), ('〓', '〟'), + ('〮', '〰'), ('〷', '〷'), ('・', '・'), ('ㄱ', 'ㆎ'), + ('㈀', '㈞'), ('㉠', '㉾'), ('ꥠ', 'ꥼ'), ('가', '힣'), + ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('﹅', '﹆'), ('。', '・'), + ('ᅠ', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), +]; + +pub const HANUNOO: &'static [(char, char)] = &[ + ('ᜠ', '᜶'), +]; + +pub const HATRAN: &'static [(char, char)] = &[ + ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐣻', '𐣿'), +]; + +pub const HEBREW: &'static [(char, char)] = &[ + ('֑', 'ׇ'), ('א', 'ת'), ('װ', '״'), ('יִ', 'זּ'), ('טּ', 'לּ'), + ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﭏ'), +]; + +pub const HIRAGANA: &'static [(char, char)] = &[ + ('、', '〃'), ('〈', '】'), ('〓', '〟'), ('〰', '〵'), + ('〷', '〷'), ('〼', '〽'), ('ぁ', 'ゖ'), ('゙', '゠'), + ('・', 'ー'), ('﹅', '﹆'), ('。', '・'), ('ー', 'ー'), + ('゙', '゚'), ('𛀁', '𛄞'), ('🈀', '🈀'), +]; + +pub const IMPERIAL_ARAMAIC: &'static [(char, char)] = &[ + ('𐡀', '𐡕'), ('𐡗', '𐡟'), +]; + +pub const INHERITED: &'static [(char, char)] = &[ + ('̀', '́'), ('̓', '̈́'), ('͆', '͢'), ('᪰', '᪾'), ('᷂', '᷹'), + ('᷻', '᷿'), ('\u{200c}', '\u{200d}'), ('⃐', '⃯'), ('︀', '️'), + ('︠', '︭'), ('𐇽', '𐇽'), ('𝅧', '𝅩'), ('𝅻', '𝆂'), + ('𝆅', '𝆋'), ('𝆪', '𝆭'), ('󠄀', '󠇯'), +]; + +pub const INSCRIPTIONAL_PAHLAVI: &'static [(char, char)] = &[ + ('𐭠', '𐭲'), ('𐭸', '𐭿'), +]; + +pub const INSCRIPTIONAL_PARTHIAN: &'static [(char, char)] = &[ + ('𐭀', '𐭕'), ('𐭘', '𐭟'), +]; + +pub const JAVANESE: &'static [(char, char)] = &[ + ('ꦀ', '꧍'), ('ꧏ', '꧙'), ('꧞', '꧟'), +]; + +pub const 
KAITHI: &'static [(char, char)] = &[ + ('०', '९'), ('꠰', '꠹'), ('𑂀', '𑃁'), +]; + +pub const KANNADA: &'static [(char, char)] = &[ + ('॑', '॒'), ('।', '॥'), ('ಀ', 'ಃ'), ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), + ('಼', 'ೄ'), ('ೆ', 'ೈ'), ('ೊ', '್'), ('ೕ', 'ೖ'), + ('ೞ', 'ೞ'), ('ೠ', 'ೣ'), ('೦', '೯'), ('ೱ', 'ೲ'), + ('᳚', '᳚'), ('ᳵ', 'ᳵ'), ('꠰', '꠵'), +]; + +pub const KATAKANA: &'static [(char, char)] = &[ + ('、', '〃'), ('〈', '】'), ('〓', '〟'), ('〰', '〵'), + ('〷', '〷'), ('〼', '〽'), ('゙', '゜'), ('゠', 'ヿ'), + ('ㇰ', 'ㇿ'), ('㋐', '㋾'), ('㌀', '㍗'), ('﹅', '﹆'), + ('。', '゚'), ('𛀀', '𛀀'), +]; + +pub const KAYAH_LI: &'static [(char, char)] = &[ + ('꤀', '꤯'), +]; + +pub const KHAROSHTHI: &'static [(char, char)] = &[ + ('𐨀', '𐨃'), ('𐨅', '𐨆'), ('𐨌', '𐨓'), ('𐨕', '𐨗'), + ('𐨙', '𐨳'), ('𐨸', '𐨺'), ('𐨿', '𐩇'), ('𐩐', '𐩘'), +]; + +pub const KHMER: &'static [(char, char)] = &[ + ('ក', '៝'), ('០', '៩'), ('៰', '៹'), ('᧠', '᧿'), +]; + +pub const KHOJKI: &'static [(char, char)] = &[ + ('૦', '૯'), ('𑈀', '𑈑'), ('𑈓', '𑈾'), +]; + +pub const KHUDAWADI: &'static [(char, char)] = &[ + ('।', '॥'), ('꠰', '꠹'), ('𑊰', '𑋪'), ('𑋰', '𑋹'), +]; + +pub const LAO: &'static [(char, char)] = &[ + ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ງ', 'ຈ'), ('ຊ', 'ຊ'), + ('ຍ', 'ຍ'), ('ດ', 'ທ'), ('ນ', 'ຟ'), ('ມ', 'ຣ'), + ('ລ', 'ລ'), ('ວ', 'ວ'), ('ສ', 'ຫ'), ('ອ', 'ູ'), + ('ົ', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('່', 'ໍ'), + ('໐', '໙'), ('ໜ', 'ໟ'), +]; + +pub const LATIN: &'static [(char, char)] = &[ + ('A', 'Z'), ('a', 'z'), ('ª', 'ª'), ('º', 'º'), ('À', 'Ö'), + ('Ø', 'ö'), ('ø', 'ʸ'), ('ˠ', 'ˤ'), ('ͣ', 'ͯ'), ('҅', '҆'), + ('॑', '॒'), ('჻', '჻'), ('ᴀ', 'ᴥ'), ('ᴬ', 'ᵜ'), + ('ᵢ', 'ᵥ'), ('ᵫ', 'ᵷ'), ('ᵹ', 'ᶾ'), ('Ḁ', 'ỿ'), + ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('⃰', '⃰'), + ('K', 'Å'), ('Ⅎ', 'Ⅎ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), + ('Ⱡ', 'Ɀ'), ('Ꜣ', 'ꞇ'), ('Ꞌ', 'Ɪ'), ('Ʞ', 'ꞷ'), + ('ꟷ', 'ꟿ'), ('꤮', '꤮'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭤ'), + ('ff', 'st'), ('A', 'Z'), ('a', 'z'), +]; + +pub const LEPCHA: &'static [(char, char)] = &[ + ('ᰀ', '᰷'), ('᰻', 
'᱉'), ('ᱍ', 'ᱏ'), +]; + +pub const LIMBU: &'static [(char, char)] = &[ + ('॥', '॥'), ('ᤀ', 'ᤞ'), ('ᤠ', 'ᤫ'), ('ᤰ', '᤻'), + ('᥀', '᥀'), ('᥄', '᥏'), +]; + +pub const LINEAR_A: &'static [(char, char)] = &[ + ('𐄇', '𐄳'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), +]; + +pub const LINEAR_B: &'static [(char, char)] = &[ + ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), + ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐄀', '𐄂'), + ('𐄇', '𐄳'), ('𐄷', '𐄿'), +]; + +pub const LISU: &'static [(char, char)] = &[ + ('ꓐ', '꓿'), +]; + +pub const LYCIAN: &'static [(char, char)] = &[ + ('𐊀', '𐊜'), +]; + +pub const LYDIAN: &'static [(char, char)] = &[ + ('𐤠', '𐤹'), ('𐤿', '𐤿'), +]; + +pub const MAHAJANI: &'static [(char, char)] = &[ + ('।', '९'), ('꠰', '꠹'), ('𑅐', '𑅶'), +]; + +pub const MALAYALAM: &'static [(char, char)] = &[ + ('॑', '॒'), ('।', '॥'), ('ഀ', 'ഃ'), ('അ', 'ഌ'), + ('എ', 'ഐ'), ('ഒ', 'ൄ'), ('െ', 'ൈ'), ('ൊ', '൏'), + ('ൔ', 'ൣ'), ('൦', 'ൿ'), ('᳚', '᳚'), +]; + +pub const MANDAIC: &'static [(char, char)] = &[ + ('ـ', 'ـ'), ('ࡀ', '࡛'), ('࡞', '࡞'), +]; + +pub const MANICHAEAN: &'static [(char, char)] = &[ + ('ـ', 'ـ'), ('𐫀', '𐫦'), ('𐫫', '𐫶'), +]; + +pub const MARCHEN: &'static [(char, char)] = &[ + ('𑱰', '𑲏'), ('𑲒', '𑲧'), ('𑲩', '𑲶'), +]; + +pub const MASARAM_GONDI: &'static [(char, char)] = &[ + ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '𑴶'), ('𑴺', '𑴺'), + ('𑴼', '𑴽'), ('𑴿', '𑵇'), ('𑵐', '𑵙'), +]; + +pub const MEETEI_MAYEK: &'static [(char, char)] = &[ + ('ꫠ', '꫶'), ('ꯀ', '꯭'), ('꯰', '꯹'), +]; + +pub const MENDE_KIKAKUI: &'static [(char, char)] = &[ + ('𞠀', '𞣄'), ('𞣇', '𞣖'), +]; + +pub const MEROITIC_CURSIVE: &'static [(char, char)] = &[ + ('𐦠', '𐦷'), ('𐦼', '𐧏'), ('𐧒', '𐧿'), +]; + +pub const MEROITIC_HIEROGLYPHS: &'static [(char, char)] = &[ + ('𐦀', '𐦟'), +]; + +pub const MIAO: &'static [(char, char)] = &[ + ('𖼀', '𖽄'), ('𖽐', '𖽾'), ('𖾏', '𖾟'), +]; + +pub const MODI: &'static [(char, char)] = &[ + ('꠰', '꠹'), ('𑘀', '𑙄'), ('𑙐', '𑙙'), +]; + +pub const MONGOLIAN: &'static [(char, char)] = &[ + ('᠀', '\u{180e}'), 
('᠐', '᠙'), ('ᠠ', 'ᡷ'), ('ᢀ', 'ᢪ'), + ('𑙠', '𑙬'), +]; + +pub const MRO: &'static [(char, char)] = &[ + ('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯'), +]; + +pub const MULTANI: &'static [(char, char)] = &[ + ('੦', '੯'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), + ('𑊏', '𑊝'), ('𑊟', '𑊩'), +]; + +pub const MYANMAR: &'static [(char, char)] = &[ + ('က', '႟'), ('꤮', '꤮'), ('ꧠ', 'ꧾ'), ('ꩠ', 'ꩿ'), +]; + +pub const NABATAEAN: &'static [(char, char)] = &[ + ('𐢀', '𐢞'), ('𐢧', '𐢯'), +]; + +pub const NEW_TAI_LUE: &'static [(char, char)] = &[ + ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧚'), ('᧞', '᧟'), +]; + +pub const NEWA: &'static [(char, char)] = &[ + ('𑐀', '𑑙'), ('𑑛', '𑑛'), ('𑑝', '𑑝'), +]; + +pub const NKO: &'static [(char, char)] = &[ + ('߀', 'ߺ'), +]; + +pub const NUSHU: &'static [(char, char)] = &[ + ('𖿡', '𖿡'), ('𛅰', '𛋻'), +]; + +pub const OGHAM: &'static [(char, char)] = &[ + ('\u{1680}', '᚜'), +]; + +pub const OL_CHIKI: &'static [(char, char)] = &[ + ('᱐', '᱿'), +]; + +pub const OLD_HUNGARIAN: &'static [(char, char)] = &[ + ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐳺', '𐳿'), +]; + +pub const OLD_ITALIC: &'static [(char, char)] = &[ + ('𐌀', '𐌣'), ('𐌭', '𐌯'), +]; + +pub const OLD_NORTH_ARABIAN: &'static [(char, char)] = &[ + ('𐪀', '𐪟'), +]; + +pub const OLD_PERMIC: &'static [(char, char)] = &[ + ('҃', '҃'), ('𐍐', '𐍺'), +]; + +pub const OLD_PERSIAN: &'static [(char, char)] = &[ + ('𐎠', '𐏃'), ('𐏈', '𐏕'), +]; + +pub const OLD_SOUTH_ARABIAN: &'static [(char, char)] = &[ + ('𐩠', '𐩿'), +]; + +pub const OLD_TURKIC: &'static [(char, char)] = &[ + ('𐰀', '𐱈'), +]; + +pub const ORIYA: &'static [(char, char)] = &[ + ('॑', '॒'), ('।', '॥'), ('ଁ', 'ଃ'), ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), ('଼', 'ୄ'), ('େ', 'ୈ'), ('ୋ', '୍'), + ('ୖ', 'ୗ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୣ'), ('୦', '୷'), +]; + +pub const OSAGE: &'static [(char, char)] = &[ + ('𐒰', '𐓓'), ('𐓘', '𐓻'), +]; + +pub const OSMANYA: &'static [(char, char)] = &[ + ('𐒀', '𐒝'), ('𐒠', '𐒩'), +]; + +pub const PAHAWH_HMONG: &'static [(char, char)] = 
&[ + ('𖬀', '𖭅'), ('𖭐', '𖭙'), ('𖭛', '𖭡'), ('𖭣', '𖭷'), + ('𖭽', '𖮏'), +]; + +pub const PALMYRENE: &'static [(char, char)] = &[ + ('𐡠', '𐡿'), +]; + +pub const PAU_CIN_HAU: &'static [(char, char)] = &[ + ('𑫀', '𑫸'), +]; + +pub const PHAGS_PA: &'static [(char, char)] = &[ + ('᠂', '᠃'), ('᠅', '᠅'), ('ꡀ', '꡷'), +]; + +pub const PHOENICIAN: &'static [(char, char)] = &[ + ('𐤀', '𐤛'), ('𐤟', '𐤟'), +]; + +pub const PSALTER_PAHLAVI: &'static [(char, char)] = &[ + ('ـ', 'ـ'), ('𐮀', '𐮑'), ('𐮙', '𐮜'), ('𐮩', '𐮯'), +]; + +pub const REJANG: &'static [(char, char)] = &[ + ('ꤰ', '꥓'), ('꥟', '꥟'), +]; + +pub const RUNIC: &'static [(char, char)] = &[ + ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), +]; + +pub const SAMARITAN: &'static [(char, char)] = &[ + ('ࠀ', '࠭'), ('࠰', '࠾'), +]; + +pub const SAURASHTRA: &'static [(char, char)] = &[ + ('ꢀ', 'ꣅ'), ('꣎', '꣙'), +]; + +pub const SHARADA: &'static [(char, char)] = &[ + ('॑', '॑'), ('᳗', '᳗'), ('᳙', '᳙'), ('᳜', '᳝'), + ('᳠', '᳠'), ('𑆀', '𑇍'), ('𑇐', '𑇟'), +]; + +pub const SHAVIAN: &'static [(char, char)] = &[ + ('𐑐', '𐑿'), +]; + +pub const SIDDHAM: &'static [(char, char)] = &[ + ('𑖀', '𑖵'), ('𑖸', '𑗝'), +]; + +pub const SIGNWRITING: &'static [(char, char)] = &[ + ('𝠀', '𝪋'), ('𝪛', '𝪟'), ('𝪡', '𝪯'), +]; + +pub const SINHALA: &'static [(char, char)] = &[ + ('।', '॥'), ('ං', 'ඃ'), ('අ', 'ඖ'), ('ක', 'න'), + ('ඳ', 'ර'), ('ල', 'ල'), ('ව', 'ෆ'), ('්', '්'), + ('ා', 'ු'), ('ූ', 'ූ'), ('ෘ', 'ෟ'), ('෦', '෯'), + ('ෲ', '෴'), ('𑇡', '𑇴'), +]; + +pub const SORA_SOMPENG: &'static [(char, char)] = &[ + ('𑃐', '𑃨'), ('𑃰', '𑃹'), +]; + +pub const SOYOMBO: &'static [(char, char)] = &[ + ('𑩐', '𑪃'), ('𑪆', '𑪜'), ('𑪞', '𑪢'), +]; + +pub const SUNDANESE: &'static [(char, char)] = &[ + ('ᮀ', 'ᮿ'), ('᳀', '᳇'), +]; + +pub const SYLOTI_NAGRI: &'static [(char, char)] = &[ + ('।', '॥'), ('০', '৯'), ('ꠀ', '꠫'), +]; + +pub const SYRIAC: &'static [(char, char)] = &[ + ('،', '،'), ('؛', '\u{61c}'), ('؟', '؟'), ('ـ', 'ـ'), ('ً', 'ٕ'), + ('ٰ', 'ٰ'), ('܀', '܍'), ('\u{70f}', '݊'), ('ݍ', 'ݏ'), ('ࡠ', 'ࡪ'), 
+]; + +pub const TAGALOG: &'static [(char, char)] = &[ + ('ᜀ', 'ᜌ'), ('ᜎ', '᜔'), ('᜵', '᜶'), +]; + +pub const TAGBANWA: &'static [(char, char)] = &[ + ('᜵', '᜶'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('ᝲ', 'ᝳ'), +]; + +pub const TAI_LE: &'static [(char, char)] = &[ + ('၀', '၉'), ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ'), +]; + +pub const TAI_THAM: &'static [(char, char)] = &[ + ('ᨠ', 'ᩞ'), ('᩠', '᩼'), ('᩿', '᪉'), ('᪐', '᪙'), + ('᪠', '᪭'), +]; + +pub const TAI_VIET: &'static [(char, char)] = &[ + ('ꪀ', 'ꫂ'), ('ꫛ', '꫟'), +]; + +pub const TAKRI: &'static [(char, char)] = &[ + ('।', '॥'), ('꠰', '꠹'), ('𑚀', '𑚷'), ('𑛀', '𑛉'), +]; + +pub const TAMIL: &'static [(char, char)] = &[ + ('॑', '॒'), ('।', '॥'), ('ஂ', 'ஃ'), ('அ', 'ஊ'), + ('எ', 'ஐ'), ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), + ('ஞ', 'ட'), ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), + ('ா', 'ூ'), ('ெ', 'ை'), ('ொ', '்'), ('ௐ', 'ௐ'), + ('ௗ', 'ௗ'), ('௦', '௺'), ('᳚', '᳚'), ('ꣳ', 'ꣳ'), + ('𑌁', '𑌁'), ('𑌃', '𑌃'), ('𑌼', '𑌼'), +]; + +pub const TANGUT: &'static [(char, char)] = &[ + ('𖿠', '𖿠'), ('𗀀', '𘟬'), ('𘠀', '𘫲'), +]; + +pub const TELUGU: &'static [(char, char)] = &[ + ('॑', '॒'), ('।', '॥'), ('ఀ', 'ః'), ('అ', 'ఌ'), + ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), ('ఽ', 'ౄ'), + ('ె', 'ై'), ('ొ', '్'), ('ౕ', 'ౖ'), ('ౘ', 'ౚ'), + ('ౠ', 'ౣ'), ('౦', '౯'), ('౸', '౿'), ('᳚', '᳚'), +]; + +pub const THAANA: &'static [(char, char)] = &[ + ('،', '،'), ('؛', '\u{61c}'), ('؟', '؟'), ('٠', '٩'), ('ހ', 'ޱ'), + ('ﷲ', 'ﷲ'), ('﷽', '﷽'), +]; + +pub const THAI: &'static [(char, char)] = &[ + ('ก', 'ฺ'), ('เ', '๛'), +]; + +pub const TIBETAN: &'static [(char, char)] = &[ + ('ༀ', 'ཇ'), ('ཉ', 'ཬ'), ('ཱ', 'ྗ'), ('ྙ', 'ྼ'), + ('྾', '࿌'), ('࿎', '࿔'), ('࿙', '࿚'), +]; + +pub const TIFINAGH: &'static [(char, char)] = &[ + ('ⴰ', 'ⵧ'), ('ⵯ', '⵰'), ('⵿', '⵿'), +]; + +pub const TIRHUTA: &'static [(char, char)] = &[ + ('।', '॥'), ('꠰', '꠹'), ('𑒀', '𑓇'), ('𑓐', '𑓙'), +]; + +pub const UGARITIC: &'static [(char, char)] = &[ + ('𐎀', '𐎝'), ('𐎟', '𐎟'), +]; + +pub const VAI: &'static [(char, char)] = &[ + ('ꔀ', 'ꘫ'), 
+]; + +pub const WARANG_CITI: &'static [(char, char)] = &[ + ('𑢠', '𑣲'), ('𑣿', '𑣿'), +]; + +pub const YI: &'static [(char, char)] = &[ + ('、', '。'), ('〈', '】'), ('〔', '〛'), ('・', '・'), + ('ꀀ', 'ꒌ'), ('꒐', '꓆'), ('。', '・'), +]; + +pub const ZANABAZAR_SQUARE: &'static [(char, char)] = &[ + ('𑨀', '𑩇'), +]; diff --git a/regex-syntax/benches/bench.rs b/regex-syntax/benches/bench.rs new file mode 100644 index 0000000000..f887772b27 --- /dev/null +++ b/regex-syntax/benches/bench.rs @@ -0,0 +1,65 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![feature(test)] + +extern crate regex_syntax; +extern crate test; + +use regex_syntax::Expr; +use test::Bencher; + +#[bench] +fn parse_simple1(b: &mut Bencher) { + b.iter(|| { + let re = r"^bc(d|e)*$"; + Expr::parse(re).unwrap() + }); +} + +#[bench] +fn parse_simple2(b: &mut Bencher) { + b.iter(|| { + let re = r"'[a-zA-Z_][a-zA-Z0-9_]*(')\b"; + Expr::parse(re).unwrap() + }); +} + +#[bench] +fn parse_small1(b: &mut Bencher) { + b.iter(|| { + let re = r"\p{L}|\p{N}|\s|.|\d"; + Expr::parse(re).unwrap() + }); +} + +#[bench] +fn parse_medium1(b: &mut Bencher) { + b.iter(|| { + let re = r"\pL\p{Greek}\p{Hiragana}\p{Alphabetic}\p{Hebrew}\p{Arabic}"; + Expr::parse(re).unwrap() + }); +} + +#[bench] +fn parse_medium2(b: &mut Bencher) { + b.iter(|| { + let re = r"\s\S\w\W\d\D"; + Expr::parse(re).unwrap() + }); +} + +#[bench] +fn parse_huge(b: &mut Bencher) { + b.iter(|| { + let re = r"\p{L}{100}"; + Expr::parse(re).unwrap() + }); +}