diff --git a/Cargo.lock b/Cargo.lock index c051c4c7655..75e71ecdfa2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1942,6 +1942,18 @@ dependencies = [ "zerovec", ] +[[package]] +name = "icu_transliterator_parser" +version = "0.0.0" +dependencies = [ + "icu_collections", + "icu_properties", + "icu_provider", + "icu_transliteration", + "icu_unicodeset_parser", + "log", +] + [[package]] name = "icu_unicodeset_parser" version = "0.0.0" diff --git a/Cargo.toml b/Cargo.toml index ae376a0c1c0..aab60c5e231 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,6 +43,7 @@ members = [ "experimental/relativetime", "experimental/relativetime/data", "experimental/transliteration", + "experimental/transliterator_parser", "experimental/unicodeset_parser", "ffi/capi_cdylib", "ffi/capi_staticlib", diff --git a/experimental/transliteration/Cargo.toml b/experimental/transliteration/Cargo.toml index f6ba38b3023..eaee3ca49ef 100644 --- a/experimental/transliteration/Cargo.toml +++ b/experimental/transliteration/Cargo.toml @@ -31,4 +31,4 @@ icu_collections = { version = "1.2.0", path = "../../components/collections", fe serde = { version = "1.0", features = ["derive"] } zerovec = { version = "0.9.4", path = "../../utils/zerovec", features = ["derive"] } -# TODO: Add serde, datagen, compiled_data features \ No newline at end of file +# TODO: Add serde, datagen, compiled_data features diff --git a/experimental/transliterator_parser/Cargo.toml b/experimental/transliterator_parser/Cargo.toml new file mode 100644 index 00000000000..e9243879ca0 --- /dev/null +++ b/experimental/transliterator_parser/Cargo.toml @@ -0,0 +1,37 @@ +# This file is part of ICU4X. For terms of use, please see the file +# called LICENSE at the top level of the ICU4X source tree +# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +[package] +name = "icu_transliterator_parser" +description = "API to parse transform rules into transliterators as defined in UTS35" +version = "0.0.0" +authors = ["The ICU4X Project Developers"] +edition = "2021" +readme = "README.md" +repository = "https://github.com/unicode-org/icu4x" +license = "Unicode-DFS-2016" +categories = ["internationalization"] +# Keep this in sync with other crates unless there are exceptions +include = [ + "src/**/*", + "tests/**/*", + "Cargo.toml", + "LICENSE", + "README.md" +] + +[package.metadata.docs.rs] +all-features = true + +[dependencies] +icu_collections = { path = "../../components/collections" } +icu_properties = { path = "../../components/properties", default-features = false } +icu_provider = { path = "../../provider/core" } +icu_unicodeset_parser = { path = "../unicodeset_parser" } +icu_transliteration = { path = "../transliteration" } + +log = "0.4" + +[features] +compiled_data = ["icu_properties/compiled_data"] diff --git a/experimental/transliterator_parser/LICENSE b/experimental/transliterator_parser/LICENSE new file mode 100644 index 00000000000..9858d01abf5 --- /dev/null +++ b/experimental/transliterator_parser/LICENSE @@ -0,0 +1,51 @@ +UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE + +See Terms of Use +for definitions of Unicode Inc.’s Data Files and Software. + +NOTICE TO USER: Carefully read the following legal agreement. +BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S +DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), +YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE +TERMS AND CONDITIONS OF THIS AGREEMENT. 
+IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE +THE DATA FILES OR SOFTWARE. + +COPYRIGHT AND PERMISSION NOTICE + +Copyright © 1991-2022 Unicode, Inc. All rights reserved. +Distributed under the Terms of Use in https://www.unicode.org/copyright.html. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Unicode data files and any associated documentation +(the "Data Files") or Unicode software and any associated documentation +(the "Software") to deal in the Data Files or Software +without restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, and/or sell copies of +the Data Files or Software, and to permit persons to whom the Data Files +or Software are furnished to do so, provided that either +(a) this copyright and permission notice appear with all copies +of the Data Files or Software, or +(b) this copyright and permission notice appear in associated +Documentation. + +THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT OF THIRD PARTY RIGHTS. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS +NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL +DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, +DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THE DATA FILES OR SOFTWARE. + +Except as contained in this notice, the name of a copyright holder +shall not be used in advertising or otherwise to promote the sale, +use or other dealings in these Data Files or Software without prior +written authorization of the copyright holder. + +— + +Portions of ICU4X may have been adapted from ICU4C and/or ICU4J. +ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others. diff --git a/experimental/transliterator_parser/README.md b/experimental/transliterator_parser/README.md new file mode 100644 index 00000000000..cef9a94d048 --- /dev/null +++ b/experimental/transliterator_parser/README.md @@ -0,0 +1,13 @@ +# icu_transliterator_parser [![crates.io](https://img.shields.io/crates/v/icu_transliterator_parser)](https://crates.io/crates/icu_transliterator_parser) + +`icu_transliterator_parser` is a utility crate of the [`ICU4X`] project. + +This crate provides parsing functionality for [UTS #35 - Transliterators](https://unicode.org/reports/tr35/tr35-general.html#Transforms). + +See [`parse`](crate::parse()) for more information. + +[`ICU4X`]: ../icu/index.html + +## More Information + +For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x). diff --git a/experimental/transliterator_parser/src/compile.rs b/experimental/transliterator_parser/src/compile.rs new file mode 100644 index 00000000000..025ff2e8d8b --- /dev/null +++ b/experimental/transliterator_parser/src/compile.rs @@ -0,0 +1,1244 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! This module has three main functions. First, it validates many aspects of transliterators. +//! 
+//! Second, it compiles them into the zero-copy data struct defined in `icu_transliteration`. Third,
+//! it computes the dependencies of the transliterator.
+//! It is responsible for both directions of a source file, but the rest of this documentation
+//! assumes a single direction. The process is simply repeated for the other direction.
+//!
+//! # Terminology
+//! * The "direction" of a rule: Whether a rule is _forward_ (left-to-right in the source) or
+//! _reverse_ (right-to-left in the source). At runtime, clients will apply a transliterator
+//! in one direction. The transliterator `a <> b` replaces `a` with `b` in the forward direction,
+//! and `b` with `a` in the reverse direction.
+//! * The "side" of a rule: A rule has a _source_ and a _target_ side. The source is replaced
+//! with the target. If we're looking at a definition of a transliterator in the _forward_
+//! direction, the source is on the left and the target is on the right, and vice versa for
+//! the _reverse_ direction.
+//! * "Special matchers" are non-literal items that can appear on the source side of a rule.
+//! This includes, e.g., UnicodeSets and quantifiers.
+//! * "Special replacers" are non-literal items that can appear on the target side of a rule.
+//! This includes, e.g., function calls and back references.
+//! * "Special constructs" are just any non-literal rule item.
+//!
+//! # Conversion rule encoding
+//!
+//! Conversion rules are encoded using `str`s, and private use code points are used to represent
+//! the special constructs that can appear in a conversion rule (UnicodeSets, quantifiers, ...).
+//! This works as follows:
+//! * We use the Plane 15 Private Use Area, i.e., code points U+F0000 to U+FFFFD (inclusive)
+//! * A private use code point simply encodes an integer, obtained by subtracting U+F0000 from it
+//! * The integer is used as an index into `VarTable`
+//! * As a `VarTable` has multiple `VarZeroVec`s (one for each special construct), an index
+//! overflows into the following `VZV`s:
+//! * An index of `vzv1.len() + vzv2.len() + 4` indexes the third `VZV` at index 4
+//! * Thus, if the length of an earlier `VZV` changes, the index of an element in a later `VZV`
+//! will change, and its private use encoding will change
+//! * Therefore we must know the number of elements of each `VZV` before we can start encoding
+//! conversion rules into `str`s.
+//!
+//! # Passes
+//!
+//! This module works by performing multiple passes over the rules.
+//!
+//! ## Pass 1
+//! General validation of the rules and computation of the lengths of the `VZV`s in the `VarTable`.
+//!
+//! Only special constructs for the current direction contribute to the `VZV` lengths,
+//! i.e., the rule `a <> [a-z] { b` will not increment the size of the
+//! `VZV` for UnicodeSets if the current direction is `forward`, but it will if the current
+//! direction is `reverse` (this is because contexts on the target side of a rule are ignored).
+//!
+//! Similarly, only recursive transliterators and variables actually used for this direction are
+//! accounted for.
+//!
+//! ## Pass 2
+//! Encoding of the zero-copy data struct.
+//!
+//! To encode conversion rules into `str`s, we use the previously described encoded `VarTable`
+//! indices. Because we know the lengths of each special construct list (in the form of a `VZV`)
+//! from the first pass, we can store the offsets for each special construct list (i.e., the sum of
+//! the lengths of the previous lists) while encoding the conversion rules, incrementing the
+//! offset of a given special construct whenever we encode an element. The precomputed lengths
+//! mean we never overflow into the indices of the following `VZV`.
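+//!
+//! As an illustration only (these helpers are not part of this module's API, and the names are
+//! made up), the private use encoding described above is a plain offset mapping:
+//!
+//! ```text
+//! fn encode_index(idx: u32) -> char {
+//!     // valid as long as idx <= 0xFFFD, i.e., the `VarTable` stays within plane 15
+//!     char::from_u32(0xF0000 + idx).unwrap()
+//! }
+//! fn decode_index(c: char) -> u32 {
+//!     c as u32 - 0xF0000
+//! }
+//! ```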
+
+// more (data struct compatible) runtime optimization opportunities:
+// - deduplicate special constructs ($a = hello; $b = hello; should only generate one hello element)
+//   - especially important for equivalent unicodesets
+// - inline single-use variables ($a = x; $a > b; => x > b;)
+// - replace uses of single-element variables with the element itself ($a = [a-z]; $a > a; => [a-z] > a;)
+// - flatten single-element sets into literals ([a] > b; => a > b;)
+
+/*
+Encoding example:
+
+    $b = bc+ ;
+    $a = [a-z] $b ;
+    $a > ;
+
+b-data.counts: 1 compound (the definition itself), 1 quantifier plus (c+)
+b-data.used_vars: -
+
+a-data.counts: 1 compound (the definition itself), 1 unicodeset ([a-z])
+a-data.used_vars: b
+
+forward-data.counts: 0 (rules are inlined)
+forward-data.used_vars: a
+
+when collecting the counts (for forward) at the end, we sum over all counts of the transitive
+dependencies of forward (using used_vars), and add the counts of forward itself.
+we also compute the transitive closure of used variables.
+this gives us the `Pass1Result`:
+forward-data.counts: 2 compound, 1 quantifier plus, 1 unicodeset
+forward-data.used_vars: a, b
+
+this `Pass1Result` we give to Pass2, which will produce something like this:
+(note that the integer-indexed maps shown here are only semantic, in actuality the indices are implicit,
+as described in the zero-copy format, and the maps here are just arrays)
+
+    VarTable {
+        compounds: {
+            0: "b<2>", // b's definition, bc+
+            1: "<3><0>", // a's definition, [a-z] $b
+        },
+        quantifier_kleene_plus: {
+            2: "c", // c+
+        },
+        unicode_sets: {
+            3: <set>, // [a-z]
+        }
+    }
+    Rules: [
+        {
+            source: "<1>", // $a
+            target: "",
+        }
+    ]
+*/
+
+use crate::parse;
+use crate::parse::{ElementLocation as EL, HalfRule, QuantifierKind};
+use parse::Result;
+use parse::PEK;
+use std::collections::{HashMap, HashSet};
+
+enum SingleDirection {
+    Forward,
+    Reverse,
+}
+
+/// The number of elements for each `VZV` in the `VarTable`.
+#[derive(Debug, Copy, Clone, Default, PartialEq, Eq)]
+struct SpecialConstructCounts {
+    num_compounds: usize,
+    num_quantifiers_opt: usize,
+    num_quantifiers_kleene: usize,
+    num_quantifiers_kleene_plus: usize,
+    num_segments: usize,
+    num_unicode_sets: usize,
+    num_function_calls: usize,
+}
+
+// Data for a given direction or variable definition (the "key")
+#[derive(Debug, Clone, Default, PartialEq, Eq)]
+struct Pass1Data {
+    counts: SpecialConstructCounts,
+    // the variables used by the associated key
+    used_variables: HashSet<String>,
+    // the recursive transliterators used by the associated key
+    used_transliterators: HashSet<parse::BasicId>,
+}
+
+#[allow(unused)] // TODO: remove annotation
+#[derive(Debug, Clone)]
+struct Pass1Result<'p> {
+    // data with dependencies resolved and counts summed
+    forward_data: Pass1Data,
+    reverse_data: Pass1Data,
+    variable_definitions: HashMap<String, &'p [parse::Element]>,
+}
+
+/// Responsible for the first pass as described in the module-level documentation.
+#[derive(Debug, Clone)]
+struct Pass1<'p> {
+    direction: parse::Direction,
+    // data for *direct* dependencies
+    forward_data: Pass1Data,
+    reverse_data: Pass1Data,
+    variable_data: HashMap<String, Pass1Data>,
+    variable_definitions: HashMap<String, &'p [parse::Element]>,
+    // variables which contain constructs that are only allowed to appear on the source side
+    // e.g., $a = c+; $set = [a-z]; ...
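+    // populated by `validate_variable_definition`; consulted by `TargetValidator` when such a
+    // variable is referenced on a target side, and when it appears in another definition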
+    target_disallowed_variables: HashSet<String>,
+}
+
+impl<'p> Pass1<'p> {
+    fn new(direction: parse::Direction) -> Self {
+        Self {
+            direction,
+            forward_data: Pass1Data::default(),
+            reverse_data: Pass1Data::default(),
+            variable_data: HashMap::new(),
+            variable_definitions: HashMap::new(),
+            target_disallowed_variables: HashSet::new(),
+        }
+    }
+
+    fn run(&mut self, rules: &'p [parse::Rule]) -> Result<Pass1Result<'p>> {
+        // first check global filter/global inverse filter.
+        // after this check, they may not appear anywhere.
+        let rules = self.validate_global_filters(rules)?;
+
+        // iterate through remaining rules and perform checks according to interim specification
+
+        for rule in rules {
+            match rule {
+                parse::Rule::GlobalFilter(_) | parse::Rule::GlobalInverseFilter(_) => {
+                    // the previous step ensures `rules` has no more global filters
+                    return Err(PEK::UnexpectedGlobalFilter.into());
+                }
+                parse::Rule::Transform(forward_id, reverse_id) => {
+                    self.validate_transform(forward_id, reverse_id.as_ref())?;
+                }
+                parse::Rule::VariableDefinition(name, definition) => {
+                    self.validate_variable_definition(name, definition)?;
+                }
+                parse::Rule::Conversion(hr1, dir, hr2) => {
+                    self.validate_conversion(hr1, *dir, hr2)?;
+                }
+            }
+        }
+
+        Pass1ResultGenerator::generate(self)
+    }
+
+    fn validate_global_filters<'a>(&self, rules: &'a [parse::Rule]) -> Result<&'a [parse::Rule]> {
+        let rules = match rules {
+            [parse::Rule::GlobalFilter(filter), rest @ ..] => {
+                if filter.has_strings() {
+                    return Err(PEK::GlobalFilterWithStrings.into());
+                }
+
+                rest
+            }
+            _ => rules,
+        };
+        let rules = match rules {
+            [rest @ .., parse::Rule::GlobalInverseFilter(filter)] => {
+                if filter.has_strings() {
+                    return Err(PEK::GlobalFilterWithStrings.into());
+                }
+
+                rest
+            }
+            _ => rules,
+        };
+
+        Ok(rules)
+    }
+
+    fn validate_transform(
+        &mut self,
+        forward_id: &parse::SingleId,
+        reverse_id: Option<&parse::SingleId>,
+    ) -> Result<()> {
+        let fwd_dep = forward_id.basic_id.clone();
+        if !fwd_dep.is_null() {
+            self.forward_data.used_transliterators.insert(fwd_dep);
+        }
+        let rev_dep = reverse_id
+            .map(|single_id| single_id.basic_id.clone())
+            .unwrap_or_else(|| forward_id.basic_id.clone().reverse());
+        if !rev_dep.is_null() {
+            self.reverse_data.used_transliterators.insert(rev_dep);
+        }
+        Ok(())
+    }
+
+    fn validate_variable_definition(
+        &mut self,
+        name: &String,
+        definition: &'p [parse::Element],
+    ) -> Result<()> {
+        if self.variable_definitions.contains_key(name) {
+            return Err(PEK::DuplicateVariable.into());
+        }
+        self.variable_definitions.insert(name.clone(), definition);
+
+        let mut data = Pass1Data::default();
+        // the variable definition itself is counted here
+        data.counts.num_compounds = 1;
+
+        let mut validator = VariableDefinitionValidator::new(
+            |s| self.variable_definitions.contains_key(s),
+            &mut data,
+            &self.target_disallowed_variables,
+            definition,
+        );
+        validator.validate()?;
+        if validator.used_target_disallowed_construct {
+            self.target_disallowed_variables.insert(name.clone());
+        }
+
+        self.variable_data.insert(name.clone(), data);
+
+        Ok(())
+    }
+
+    fn validate_conversion(
+        &mut self,
+        source: &HalfRule,
+        dir: parse::Direction,
+        target: &HalfRule,
+    ) -> Result<()> {
+        // TODO(#3736): include source location/actual source text in these logs
+        if !self.direction.permits(dir) {
+            // example: metadata defines this transliterator as forward, but a `<>` or `<` rule is found.
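+            // note that this is only logged: the `permits` checks below simply skip the rule
+            // for any direction the metadata does not allow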
+ log::warn!( + "metadata for transliterator specifies direction {:?} but conversion rule specifies {:?}", + self.direction, + dir, + ); + } + // logging for useless contexts + if dir == parse::Direction::Forward && (!target.ante.is_empty() || !target.post.is_empty()) + { + log::warn!("forward conversion rule has ignored context on target side"); + } + if dir == parse::Direction::Reverse && (!source.ante.is_empty() || !source.post.is_empty()) + { + log::warn!("reverse conversion rule has ignored context on target side"); + } + + if self.direction.permits(parse::Direction::Forward) + && dir.permits(parse::Direction::Forward) + { + self.validate_conversion_one_direction(source, target, SingleDirection::Forward)?; + } + if self.direction.permits(parse::Direction::Reverse) + && dir.permits(parse::Direction::Reverse) + { + self.validate_conversion_one_direction(target, source, SingleDirection::Reverse)?; + } + + Ok(()) + } + + fn validate_conversion_one_direction( + &mut self, + source: &HalfRule, + target: &HalfRule, + dir: SingleDirection, + ) -> Result<()> { + let data = match dir { + SingleDirection::Forward => &mut self.forward_data, + SingleDirection::Reverse => &mut self.reverse_data, + }; + let mut source_validator = SourceValidator::new( + |s| self.variable_definitions.contains_key(s), + data, + &source.ante, + &source.key, + &source.post, + ); + source_validator.validate()?; + let num_source_segments = source_validator.num_segments; + + let mut target_validator = TargetValidator::new( + |s| self.variable_definitions.contains_key(s), + &mut self.target_disallowed_variables, + data, + &target.key, + num_source_segments, + ); + target_validator.validate()?; + + Ok(()) + } +} + +struct SourceValidator<'a, 'p, F: Fn(&str) -> bool> { + is_variable_defined: F, + data: &'a mut Pass1Data, + ante: &'p [parse::Element], + key: &'p [parse::Element], + post: &'p [parse::Element], + // the number of segments this rule defines. consumed by TargetValidator. + num_segments: u32, +} + +/// Validates the source side of a rule. +/// +/// Ensures that only special constructs that may appear on the source side of a rule are used. +/// Also validates certain other source-side-only constraints, such as anchors needing to be at the +/// beginning or end of the rule. 
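+///
+/// For example (mirroring the cases exercised in the tests below), `^ a > b ;` is accepted,
+/// while `a ^ > b ;` is rejected with `AnchorStartNotAtStart`.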
+impl<'a, 'p, F: Fn(&str) -> bool> SourceValidator<'a, 'p, F> {
+    fn new(
+        is_variable_defined: F,
+        data: &'a mut Pass1Data,
+        ante: &'p [parse::Element],
+        key: &'p [parse::Element],
+        post: &'p [parse::Element],
+    ) -> Self {
+        Self {
+            is_variable_defined,
+            data,
+            ante,
+            key,
+            post,
+            num_segments: 0,
+        }
+    }
+
+    fn validate(&mut self) -> Result<()> {
+        // first validate position of ^ and $ anchors, if they exist
+        // ^: if ante is non-empty, must be its first element, otherwise must be first element of key
+        // $: if post is non-empty, must be its last element, otherwise must be last element of key
+
+        let sections = [self.ante, self.key, self.post];
+        // split off first element if it is a start anchor
+        let sections = match sections {
+            [[parse::Element::AnchorStart, ante @ ..], key, post] => [ante, key, post],
+            [[], [parse::Element::AnchorStart, key @ ..], post] => [&[], key, post],
+            _ => sections,
+        };
+        // split off last element if it is an end anchor
+        let sections = match sections {
+            [ante, key, [post @ .., parse::Element::AnchorEnd]] => [ante, key, post],
+            [ante, [key @ .., parse::Element::AnchorEnd], []] => [ante, key, &[]],
+            _ => sections,
+        };
+
+        // now neither start nor end anchors may appear anywhere in `sections`
+
+        sections
+            .iter()
+            .try_for_each(|s| self.validate_section(s, true))
+    }
+
+    fn validate_section(&mut self, section: &[parse::Element], top_level: bool) -> Result<()> {
+        section
+            .iter()
+            .try_for_each(|element| self.validate_element(element, top_level))
+    }
+
+    fn validate_element(&mut self, element: &parse::Element, top_level: bool) -> Result<()> {
+        match element {
+            parse::Element::Literal(_) => {}
+            parse::Element::VariableRef(name) => {
+                if !(self.is_variable_defined)(name) {
+                    return Err(PEK::UnknownVariable.into());
+                }
+                self.data.used_variables.insert(name.clone());
+            }
+            parse::Element::Quantifier(kind, inner) => {
+                self.validate_element(inner, false)?;
+                match *kind {
+                    QuantifierKind::ZeroOrOne => self.data.counts.num_quantifiers_opt += 1,
+                    QuantifierKind::ZeroOrMore => self.data.counts.num_quantifiers_kleene += 1,
+                    QuantifierKind::OneOrMore => self.data.counts.num_quantifiers_kleene_plus += 1,
+                }
+            }
+            parse::Element::Segment(inner) => {
+                self.validate_section(inner, false)?;
+                // increment the count for this specific rule
+                self.num_segments += 1;
+                // increment the count for this direction of the entire transliterator
+                self.data.counts.num_segments += 1;
+            }
+            parse::Element::UnicodeSet(_) => {
+                self.data.counts.num_unicode_sets += 1;
+            }
+            parse::Element::Cursor(_, _) => {
+                // while cursors have no effect on the source side, they may appear nonetheless
+                // TargetValidator validates these
+
+                // however, cursors are only allowed at the top level
+                if !top_level {
+                    return Err(PEK::InvalidCursor.into());
+                }
+            }
+            parse::Element::AnchorStart => {
+                // we check for these in `validate`
+                return Err(PEK::AnchorStartNotAtStart.into());
+            }
+            parse::Element::AnchorEnd => {
+                // we check for these in `validate`
+                return Err(PEK::AnchorEndNotAtEnd.into());
+            }
+            elt => {
+                return Err(PEK::UnexpectedElement(elt.kind(), EL::Source).into());
+            }
+        }
+        Ok(())
+    }
+}
+
+/// Validates the target side of a rule.
+///
+/// Ensures that only special constructs (including variables) that may appear on the target side
+/// of a rule are used. Also validates other target-side-only constraints, such as
+/// back references not being allowed to overflow and only one cursor being allowed.
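+///
+/// For example, `a > |@@ b ;` places the cursor (with placeholders) at the edge of the
+/// replacement and is accepted, while `a > b |@ c ;` is rejected with `InvalidCursor`.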
+struct TargetValidator<'a, 'p, F: Fn(&str) -> bool> {
+    is_variable_defined: F,
+    target_disallowed_variables: &'a mut HashSet<String>,
+    data: &'a mut Pass1Data,
+    replacer: &'p [parse::Element],
+    // the number of segments defined on the corresponding source side. produced by SourceValidator
+    num_segments: u32,
+    // true if a cursor has already been encountered, i.e., any further cursors are disallowed
+    encountered_cursor: bool,
+}
+
+impl<'a, 'p, F: Fn(&str) -> bool> TargetValidator<'a, 'p, F> {
+    fn new(
+        is_variable_defined: F,
+        target_disallowed_variables: &'a mut HashSet<String>,
+        data: &'a mut Pass1Data,
+        replacer: &'p [parse::Element],
+        num_segments: u32,
+    ) -> Self {
+        Self {
+            is_variable_defined,
+            target_disallowed_variables,
+            data,
+            replacer,
+            num_segments,
+            encountered_cursor: false,
+        }
+    }
+
+    fn validate(&mut self) -> Result<()> {
+        let section = self.replacer;
+        // special case for a single cursor
+        let section = match section {
+            [parse::Element::Cursor(pre, post)] => {
+                self.encounter_cursor()?;
+                if *pre != 0 && *post != 0 {
+                    // corresponds to `@@@|@@@`, i.e., placeholders on both sides of the cursor
+                    return Err(PEK::InvalidCursor.into());
+                }
+                return Ok(());
+            }
+            _ => section,
+        };
+        // strip |@@@ from beginning
+        let section = match section {
+            [parse::Element::Cursor(pre, _), rest @ ..] => {
+                self.encounter_cursor()?;
+                if *pre != 0 {
+                    // corresponds to `@@@|...`, i.e., placeholders in front of the cursor
+                    return Err(PEK::InvalidCursor.into());
+                }
+                rest
+            }
+            _ => section,
+        };
+        // strip @@@| from end
+        let section = match section {
+            [rest @ .., parse::Element::Cursor(_, post)] => {
+                self.encounter_cursor()?;
+                if *post != 0 {
+                    // corresponds to `...|@@@`, i.e., placeholders after the cursor
+                    return Err(PEK::InvalidCursor.into());
+                }
+                rest
+            }
+            _ => section,
+        };
+
+        self.validate_section(section, true)
+    }
+
+    fn validate_section(&mut self, section: &[parse::Element], top_level: bool) -> Result<()> {
+        section
+            .iter()
+            .try_for_each(|element| self.validate_element(element, top_level))
+    }
+
+    fn validate_element(&mut self, element: &parse::Element, top_level: bool) -> Result<()> {
+        match element {
+            parse::Element::Literal(_) => {}
+            parse::Element::VariableRef(name) => {
+                if !(self.is_variable_defined)(name) {
+                    return Err(PEK::UnknownVariable.into());
+                }
+                if self.target_disallowed_variables.contains(name) {
+                    return Err(PEK::SourceOnlyVariable.into());
+                }
+                self.data.used_variables.insert(name.clone());
+            }
+            parse::Element::BackRef(num) => {
+                if *num > self.num_segments {
+                    return Err(PEK::BackReferenceOutOfRange.into());
+                }
+            }
+            parse::Element::FunctionCall(id, inner) => {
+                self.validate_section(inner, false)?;
+                self.data.used_transliterators.insert(id.basic_id.clone());
+                self.data.counts.num_function_calls += 1;
+            }
+            parse::Element::Cursor(pre, post) => {
+                self.encounter_cursor()?;
+                if !top_level || *pre != 0 || *post != 0 {
+                    // pre and post must be 0 if the cursor does not appear at the very beginning or the very end
+                    // we account for the beginning or the end in `validate`.
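+                    // (so a plain `|` between two elements, as in `a > b | c ;`, is the only
+                    // legal cursor in the middle of the replacement)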
+                    return Err(PEK::InvalidCursor.into());
+                }
+            }
+            parse::Element::AnchorStart => {
+                // while anchors have no effect on the target side, they may still appear
+            }
+            parse::Element::AnchorEnd => {
+                // while anchors have no effect on the target side, they may still appear
+            }
+            elt => {
+                return Err(PEK::UnexpectedElement(elt.kind(), EL::Target).into());
+            }
+        }
+        Ok(())
+    }
+
+    fn encounter_cursor(&mut self) -> Result<()> {
+        if self.encountered_cursor {
+            return Err(PEK::DuplicateCursor.into());
+        }
+        self.encountered_cursor = true;
+        Ok(())
+    }
+}
+
+/// Validates variable definitions.
+///
+/// This checks that only a limited subset of special constructs appear in a variable's definition.
+/// For example, segments, back references, cursors, anchors, and function calls are not allowed.
+///
+/// It also propagates information about whether a variable may appear on the target side of a rule,
+/// as variables are in general allowed on the target side, but only if they only contain
+/// special constructs that are allowed to appear on the target side.
+struct VariableDefinitionValidator<'a, 'p, F: Fn(&str) -> bool> {
+    is_variable_defined: F,
+    target_disallowed_variables: &'a HashSet<String>,
+    data: &'a mut Pass1Data,
+    definition: &'p [parse::Element],
+    used_target_disallowed_construct: bool,
+}
+
+impl<'a, 'p, F: Fn(&str) -> bool> VariableDefinitionValidator<'a, 'p, F> {
+    fn new(
+        is_variable_defined: F,
+        data: &'a mut Pass1Data,
+        target_disallowed_variables: &'a HashSet<String>,
+        definition: &'p [parse::Element],
+    ) -> Self {
+        Self {
+            is_variable_defined,
+            data,
+            target_disallowed_variables,
+            definition,
+            used_target_disallowed_construct: false,
+        }
+    }
+
+    fn validate(&mut self) -> Result<()> {
+        self.validate_section(self.definition)
+    }
+
+    fn validate_section(&mut self, section: &[parse::Element]) -> Result<()> {
+        section
+            .iter()
+            .try_for_each(|element| self.validate_element(element))
+    }
+
+    fn validate_element(&mut self, element: &parse::Element) -> Result<()> {
+        match element {
+            parse::Element::Literal(_) => {}
+            parse::Element::VariableRef(name) => {
+                if !(self.is_variable_defined)(name) {
+                    return Err(PEK::UnknownVariable.into());
+                }
+                if self.target_disallowed_variables.contains(name) {
+                    self.used_target_disallowed_construct = true;
+                }
+                self.data.used_variables.insert(name.clone());
+            }
+            parse::Element::Quantifier(kind, inner) => {
+                self.used_target_disallowed_construct = true;
+                match *kind {
+                    QuantifierKind::ZeroOrOne => self.data.counts.num_quantifiers_opt += 1,
+                    QuantifierKind::ZeroOrMore => self.data.counts.num_quantifiers_kleene += 1,
+                    QuantifierKind::OneOrMore => self.data.counts.num_quantifiers_kleene_plus += 1,
+                }
+                self.validate_element(inner)?;
+            }
+            parse::Element::UnicodeSet(_) => {
+                self.used_target_disallowed_construct = true;
+                self.data.counts.num_unicode_sets += 1;
+            }
+            elt => {
+                return Err(PEK::UnexpectedElement(elt.kind(), EL::VariableDefinition).into());
+            }
+        }
+        Ok(())
+    }
+}
+
+// TODO(#3736): Think about adding a fourth Validator here that is run for
+// all conversion rules in full (i.e., all contexts and the direction of the rule is part of the API)
+// that checks for edge cases that are difficult to validate otherwise:
+// - cursors (could move functionality from TargetValidator here too, but this is mostly intended for:
+//   - any cursors on the source side for unidirectional rules
+//   - any cursors in contexts)
+// - anchors (could move functionality from SourceValidator here too, but this is mostly intended for:
+//   - anchors on the target side for unidirectional rules
+//   - contexts on the target side for unidirectional rules (still need to discuss what exactly, could be disallowed
+//     completely or just disallow target-only matchers (backrefs, function calls))
+// as part of this, it should also be decided whether these edge cases are full-blown errors or
+// merely logged warnings.
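+
+// Resolves the dependency data collected by `Pass1`: computes the transitive closure of used
+// variables per direction and sums the construct counts over it, yielding the `Pass1Result`.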
+struct Pass1ResultGenerator<'a, 'p> {
+    pass: &'a Pass1<'p>,
+    // for cycle-detection
+    current_vars: HashSet<String>,
+    transitive_var_dependencies: HashMap<String, HashSet<String>>,
+}
+
+impl<'a, 'p> Pass1ResultGenerator<'a, 'p> {
+    fn generate(pass: &'a Pass1<'p>) -> Result<Pass1Result<'p>> {
+        let mut generator = Self {
+            pass,
+            current_vars: HashSet::new(),
+            transitive_var_dependencies: HashMap::new(),
+        };
+        generator.generate_result()
+    }
+
+    fn generate_result(&mut self) -> Result<Pass1Result<'p>> {
+        // the result for a given direction is computed by first computing the transitive
+        // used variables for each direction, then using that data to sum over the
+        // special construct counts, and at last filtering the variable definitions based on
+        // the used variables in either direction.
+
+        let forward_data = self.generate_result_one_direction(&self.pass.forward_data)?;
+        let reverse_data = self.generate_result_one_direction(&self.pass.reverse_data)?;
+
+        let variable_definitions = self
+            .pass
+            .variable_definitions
+            .iter()
+            .filter(|&(var, _)| {
+                forward_data.used_variables.contains(var)
+                    || reverse_data.used_variables.contains(var)
+            })
+            .map(|(var, def)| (var.clone(), *def))
+            .collect();
+
+        Ok(Pass1Result {
+            forward_data,
+            reverse_data,
+            variable_definitions,
+        })
+    }
+
+    fn generate_result_one_direction(&mut self, seed_data: &Pass1Data) -> Result<Pass1Data> {
+        let seed_vars = &seed_data.used_variables;
+        let seed_transliterators = &seed_data.used_transliterators;
+
+        let mut used_variables = seed_vars.clone();
+        for var in seed_vars {
+            self.visit_var(var)?;
+            #[allow(clippy::indexing_slicing)] // a non-error `visit_var` ensures this exists
+            let deps = self.transitive_var_dependencies[var].clone();
+            used_variables.extend(deps);
+        }
+
+        // if in the future variables are ever allowed to contain, e.g., function calls, this
+        // will need to take into account recursive dependencies from `used_vars` as well
+        let used_transliterators = seed_transliterators.clone();
+
+        let counts = used_variables
+            .iter()
+            .try_fold(seed_data.counts, |mut counts, var| {
+                // we check for unknown variables during the first pass, so these should exist
+                let var_data = self.pass.variable_data.get(var).ok_or(PEK::Internal)?;
+                counts.num_compounds += var_data.counts.num_compounds;
+                counts.num_segments += var_data.counts.num_segments;
+                counts.num_quantifiers_opt += var_data.counts.num_quantifiers_opt;
+                counts.num_quantifiers_kleene += var_data.counts.num_quantifiers_kleene;
+                counts.num_quantifiers_kleene_plus += var_data.counts.num_quantifiers_kleene_plus;
+                counts.num_unicode_sets += var_data.counts.num_unicode_sets;
+                counts.num_function_calls += var_data.counts.num_function_calls;
+
+                Ok::<_, crate::ParseError>(counts)
+            })?;
+
+        Ok(Pass1Data {
+            used_transliterators,
+            used_variables,
+            counts,
+        })
+    }
+
+    fn visit_var(&mut self, name: &str) -> Result<()> {
+        if self.transitive_var_dependencies.contains_key(name) {
+            return Ok(());
+        }
+        if self.current_vars.contains(name) {
+            // cyclic dependency - should not occur
+            return Err(PEK::Internal.into());
+        }
+        self.current_vars.insert(name.to_owned());
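+        // `name` is now on the recursion stack: re-entering it through the recursive
+        // `visit_var` calls below would trigger the cycle check above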
+        // we check for unknown variables during the first pass, so these should exist
+        let var_data = self.pass.variable_data.get(name).ok_or(PEK::Internal)?;
+        let mut transitive_dependencies = var_data.used_variables.clone();
+        var_data.used_variables.iter().try_for_each(|var| {
+            self.visit_var(var)?;
+            #[allow(clippy::indexing_slicing)] // a non-error `visit_var` ensures this exists
+            let deps = self.transitive_var_dependencies[var].clone();
+            transitive_dependencies.extend(deps);
+
+            Ok::<_, crate::ParseError>(())
+        })?;
+        self.current_vars.remove(name);
+        self.transitive_var_dependencies
+            .insert(name.to_owned(), transitive_dependencies);
+        Ok(())
+    }
+}
+
+pub(crate) fn compile(
+    rules: Vec<parse::Rule>,
+    direction: parse::Direction,
+) -> Result<icu_transliteration::provider::RuleBasedTransliterator<'static>> {
+    // TODO(#3736): decide if validation should be metadata-direction dependent
+    // example: transliterator with metadata-direction "forward", and a rule `[a-z] < b ;` (invalid)
+    //  - if validation is dependent, this rule is valid because it's not used in the forward direction
+    //  - if validation is independent, this rule is invalid because the reverse direction is also checked
+    let mut pass1 = Pass1::new(direction);
+    let _result = pass1.run(&rules)?;
+
+    todo!()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::ops::Deref;
+
+    enum ExpectedOutcome {
+        Pass,
+        Fail,
+    }
+    use ExpectedOutcome::*;
+
+    const BOTH: parse::Direction = parse::Direction::Both;
+
+    fn parse(s: &str) -> Vec<parse::Rule> {
+        match parse::parse(s) {
+            Ok(rules) => rules,
+            Err(e) => panic!("unexpected error parsing rules {s:?}: {:?}", e),
+        }
+    }
+
+    fn pass1data_from_parts(
+        translit_deps: &[(&'static str, &'static str, &'static str)],
+        var_deps: &[&'static str],
+        counts: SpecialConstructCounts,
+    ) -> Pass1Data {
+        let mut data = Pass1Data {
+            counts,
+            ..Default::default()
+        };
+        for &(source, target, variant) in translit_deps {
+            data.used_transliterators.insert(parse::BasicId {
+                source: source.into(),
+                target: target.into(),
+                variant: variant.into(),
+            });
+        }
+        for &var in var_deps {
+            data.used_variables.insert(var.into());
+        }
+        data
+    }
+
+    #[test]
+    fn test_pass1_computed_data() {
+        let source = r"
+        :: [a-z] ;
+        $used_both = [a-z] ; # only transitively used by reverse direction
+        $used_rev = $used_both $used_both+ ;
+        $unused = a+ b+ .? $used_both $used_rev ; # unused
+        $unused2 = $unused ; # unused
+        :: [:L:] Bidi-Dependency/One ;
+        $used_fwd = [just a set] ;
+        ($used_both [a-z]) > &[a-z] Forward-Dependency($1) ;
+        $used_fwd > ;
+        < $used_rev+? ;
+
+        $literal1 = a ;
+        $literal2 = b ;
+        $literal1 <> $literal2 ;
+        :: AnotherForwardDependency () ;
+        :: ([set] Backward-Dependency) ;
+        :: YetAnother-ForwardDependency (AnotherBackwardDependency) ;
+        &Many(&Backwardz(&Deps($2))) < (a(bc)d)+ ;
+
+        :: ([a-z]) ;
+        ";
+
+        let rules = parse(source);
+        let mut pass1 = Pass1::new(BOTH);
+        let result = pass1.run(&rules).expect("pass1 failed");
+
+        {
+            // forward
+            let counts = SpecialConstructCounts {
+                num_segments: 1,
+                num_function_calls: 1,
+                num_unicode_sets: 1,
+                ..Default::default()
+            };
+            let expected_fwd_data = pass1data_from_parts(
+                &[
+                    ("Bidi", "Dependency", "One"),
+                    ("Forward", "Dependency", ""),
+                    ("Any", "AnotherForwardDependency", ""),
+                    ("YetAnother", "ForwardDependency", ""),
+                ],
+                &["used_both", "used_fwd", "literal1", "literal2"],
+                counts,
+            );
+            assert_eq!(expected_fwd_data, pass1.forward_data);
+        }
+        {
+            // reverse
+            let counts = SpecialConstructCounts {
+                num_quantifiers_opt: 1,
+                num_quantifiers_kleene_plus: 2,
+                num_segments: 2,
+                num_function_calls: 3,
+                ..Default::default()
+            };
+            let expected_rev_data = pass1data_from_parts(
+                &[
+                    ("Dependency", "Bidi", "One"),
+                    ("Backward", "Dependency", ""),
+                    ("Any", "AnotherBackwardDependency", ""),
+                    ("Any", "Many", ""),
+                    ("Any", "Backwardz", ""),
+                    ("Any", "Deps", ""),
+                ],
+                &["used_rev", "literal1", "literal2"],
+                counts,
+            );
+            assert_eq!(expected_rev_data, pass1.reverse_data);
+        }
+        {
+            // $used_both
+            let counts = SpecialConstructCounts {
+                num_compounds: 1,
+                num_unicode_sets: 1,
+                ..Default::default()
+            };
+            let expected_data = pass1data_from_parts(&[], &[], counts);
+            assert_eq!(expected_data, pass1.variable_data["used_both"]);
+        }
+        {
+            // $used_rev
+            let counts = SpecialConstructCounts {
+                num_compounds: 1,
+                num_quantifiers_kleene_plus: 1,
+                ..Default::default()
+            };
+            let expected_data = pass1data_from_parts(&[], &["used_both"], counts);
+            assert_eq!(expected_data, pass1.variable_data["used_rev"]);
+        }
+        {
+            // $unused
+            let counts = SpecialConstructCounts {
+                num_compounds: 1,
+                num_unicode_sets: 1,
+                num_quantifiers_opt: 1,
+                num_quantifiers_kleene_plus: 2,
+                ..Default::default()
+            };
+            let expected_data = pass1data_from_parts(&[], &["used_both", "used_rev"], counts);
+            assert_eq!(expected_data, pass1.variable_data["unused"]);
+        }
+        {
+            // $unused2
+            let counts = SpecialConstructCounts {
+                num_compounds: 1,
+                ..Default::default()
+            };
+            let expected_data = pass1data_from_parts(&[], &["unused"], counts);
+            assert_eq!(expected_data, pass1.variable_data["unused2"]);
+        }
+        {
+            // $used_fwd
+            let counts = SpecialConstructCounts {
+                num_compounds: 1,
+                num_unicode_sets: 1,
+                ..Default::default()
+            };
+            let expected_data = pass1data_from_parts(&[], &[], counts);
+            assert_eq!(expected_data, pass1.variable_data["used_fwd"]);
+        }
+        {
+            // $literal1
+            let counts = SpecialConstructCounts {
+                num_compounds: 1,
+                ..Default::default()
+            };
+            let expected_data = pass1data_from_parts(&[], &[], counts);
+            assert_eq!(expected_data, pass1.variable_data["literal1"]);
+        }
+        {
+            // $literal2
+            let counts = SpecialConstructCounts {
+                num_compounds: 1,
+                ..Default::default()
+            };
+            let expected_data = pass1data_from_parts(&[], &[], counts);
+            assert_eq!(expected_data, pass1.variable_data["literal2"]);
+        }
+        {
+            let vars_with_data: HashSet<_> = pass1.variable_data.keys().map(Deref::deref).collect();
+            let expected_vars_with_data = HashSet::from([
+                "used_both",
+                "used_rev",
+                "unused",
+                "unused2",
+                "used_fwd",
+                "literal1",
+                "literal2",
+            ]);
+            assert_eq!(expected_vars_with_data, vars_with_data);
+        }
+        {
+            // check aggregated Pass1Result
+            let fwd_counts = SpecialConstructCounts {
+                num_compounds: 4,
+                num_unicode_sets: 3,
+                num_function_calls: 1,
+                num_segments: 1,
+                ..Default::default()
+            };
+            let fwd_data = pass1data_from_parts(
+                &[
+                    ("Bidi", "Dependency", "One"),
+                    ("Forward", "Dependency", ""),
+                    ("Any", "AnotherForwardDependency", ""),
+                    ("YetAnother", "ForwardDependency", ""),
+                ],
+                &["used_both", "used_fwd", "literal1", "literal2"],
+                fwd_counts,
+            );
+
+            let rev_counts = SpecialConstructCounts {
+                num_compounds: 4,
+                num_unicode_sets: 1,
+                num_quantifiers_kleene_plus: 3,
+                num_quantifiers_opt: 1,
+                num_segments: 2,
+                num_function_calls: 3,
+                ..Default::default()
+            };
+            let rev_data = pass1data_from_parts(
+                &[
+                    ("Dependency", "Bidi", "One"),
+                    ("Backward", "Dependency", ""),
+                    ("Any", "AnotherBackwardDependency", ""),
+                    ("Any", "Many", ""),
+                    ("Any", "Backwardz", ""),
+                    ("Any", "Deps", ""),
+                ],
+                &["used_both", "used_rev", "literal1", "literal2"],
+                rev_counts,
+            );
+
+            assert_eq!(fwd_data, result.forward_data);
+            assert_eq!(rev_data, result.reverse_data);
+
+            let actual_definition_keys: HashSet<_> = result
+                .variable_definitions
+                .keys()
+                .map(Deref::deref)
+                .collect();
+            let expected_definition_keys =
+                HashSet::from(["used_both", "used_fwd", "used_rev", "literal1", "literal2"]);
+            assert_eq!(expected_definition_keys, actual_definition_keys);
+        }
+    }
+
+    #[test]
+    fn test_pass1_validate_conversion() {
+        let sources = [
+            // anchor start must be at the beginning
+            (Pass, r"^ a > ;"),
+            (Pass, r"^ a > ^ b;"),
+            (Pass, r"^ a < ^ b;"),
+            (Pass, r"^ a <> ^ b;"),
+            (Pass, r"^ { a > ;"),
+            (Pass, r"{ ^ a > ;"),
+            (Fail, r"a { ^ a > ;"),
+            // TODO(#3736): do we enforce this?
+            // (Fail, r"{ ^ a > a ^ ;"),
+            (Fail, r"a ^ a > ;"),
+            (Fail, r"a ^ > ;"),
+            (Fail, r"< a ^ ;"),
+            (Fail, r"a } ^ > ;"),
+            (Fail, r"a } ^ a > ;"),
+            (Fail, r"(^) a > ;"),
+            (Fail, r"^+ a > ;"),
+            // anchor end must be at the end
+            (Pass, r"a $ > ;"),
+            (Pass, r"a $ > $;"),
+            (Pass, r"a $ <> a$;"),
+            (Pass, r"a } $ > ;"),
+            (Pass, r"a $ } > ;"),
+            (Fail, r"a $ } a > ;"),
+            (Fail, r"< $ a ;"),
+            (Fail, r"a $ a > ;"),
+            (Fail, r"$ a > ;"),
+            (Fail, r"$ { a > ;"),
+            (Fail, r"a $ { a > ;"),
+            (Fail, r"a ($) > ;"),
+            (Fail, r"a $+ > ;"),
+            // cursor checks
+            (Pass, r"a | b <> c | d ;"),
+            (Fail, r"a | b | <> | c | d ;"),
+            (Fail, r"a > | c | d ;"),
+            (Pass, r"a > | c d ;"),
+            (Pass, r"a > | ;"),
+            (Fail, r"a > || ;"),
+            (Fail, r"a|? > ;"),
+            (Fail, r"a(|) > ;"),
+            (Fail, r"a > &Remove(|) ;"),
+            (Pass, r"a > |@ ;"),
+            (Pass, r"a > @| ;"),
+            (Fail, r"a > @|@ ;"),
+            (Fail, r"a > @|@| ;"),
+            (Pass, r"a > xa @@@| ;"),
+            (Pass, r"a > |@@ xa ;"),
+            (Fail, r"a > x @| a ;"),
+            (Fail, r"a > x |@ a ;"),
+            (Fail, r"a > x @|@ a ;"),
+            // UnicodeSets
+            (Pass, r"[a-z] > a ;"),
+            (Fail, r"[a-z] < a ;"),
+            (Pass, r". > a ;"),
+            (Fail, r". < a ;"),
+            // segments
+            (Fail, r"(a) <> $1 ;"),
+            (Pass, r"(a) > $1 ;"),
+            (Pass, r"(a()) > $1 $2;"),
+            (Pass, r"(a()) > $2;"),
+            (Fail, r"(a) > $2;"),
+            (Pass, r"(a) } (abc) > $2;"),
+            // variables
+            (Fail, r"a > $a;"),
+            // quantifiers
> a;"), + (Fail, r"a > a+;"), + (Fail, r"a > a*;"), + (Fail, r"a > a?;"), + // function calls + (Pass, r"a > &Remove();"), + (Fail, r"a < &Remove();"), + (Pass, r"a (.*)> &[a-z] Latin-Greek/BGN(abc &[a]Remove($1));"), + ]; + + for (expected_outcome, source) in sources { + let rules = parse(source); + let mut pass = Pass1::new(BOTH); + let result = pass.run(&rules); + match (expected_outcome, result) { + (Fail, Ok(_)) => { + panic!("unexpected successful pass1 validation for rules {source:?}") + } + (Pass, Err(e)) => { + panic!("unexpected error in pass1 validation for rules {source:?}: {e:?}") + } + _ => {} + } + } + } + + #[test] + fn test_pass1_validate_variable_definition() { + let sources = [ + (Fail, r"$a = &Remove() ;"), + (Fail, r"$a = (abc) ;"), + (Fail, r"$a = | ;"), + (Fail, r"$a = ^ ;"), + (Fail, r"$a = $ ;"), + (Fail, r"$a = $1 ;"), + (Fail, r"$var = [a-z] ; a > $var ;"), + (Fail, r"$var = a+ ; a > $var ;"), + (Pass, r"$var = [a-z] ; $var > a ;"), + (Pass, r"$var = a+ ; $var > a ;"), + (Pass, r"$b = 'hello'; $var = a+*? [a-z] $b ;"), + ]; + + for (expected_outcome, source) in sources { + let rules = parse(source); + let mut pass = Pass1::new(BOTH); + let result = pass.run(&rules); + match (expected_outcome, result) { + (Fail, Ok(_)) => { + panic!("unexpected successful pass1 validation for rules {source:?}") + } + (Pass, Err(e)) => { + panic!("unexpected error in pass1 validation for rules {source:?}: {e:?}") + } + _ => {} + } + } + } + + #[test] + fn test_pass1_validate_global_filters() { + let sources = [ + (Pass, r":: [a-z];"), + (Pass, r":: ([a-z]);"), + (Pass, r":: [a-z] ; :: ([a-z]);"), + (Fail, r":: [{string}] ;"), + (Fail, r":: ([{string}]);"), + (Fail, r":: [a-z] ; :: [a-z] ;"), + (Fail, r":: ([a-z]) ; :: ([a-z]) ;"), + (Fail, r":: ([a-z]) ; :: [a-z] ;"), + (Pass, r":: [a-z] ; :: Remove ; :: ([a-z]) ;"), + (Fail, r":: Remove ; :: [a-z] ;"), + (Fail, r":: ([a-z]) ; :: Remove ;"), + ]; + + for (expected_outcome, source) in sources { + let rules = parse(source); + let mut pass = Pass1::new(BOTH); + let result = pass.run(&rules); + match (expected_outcome, result) { + (Fail, Ok(_)) => { + panic!("unexpected successful pass1 validation for rules {source:?}") + } + (Pass, Err(e)) => { + panic!("unexpected error in pass1 validation for rules {source:?}: {e:?}") + } + _ => {} + } + } + } +} diff --git a/experimental/transliterator_parser/src/lib.rs b/experimental/transliterator_parser/src/lib.rs new file mode 100644 index 00000000000..993a95285d1 --- /dev/null +++ b/experimental/transliterator_parser/src/lib.rs @@ -0,0 +1,114 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! `icu_transliterator_parser` is a utility crate of the [`ICU4X`] project. +//! +//! This crate provides parsing functionality for [UTS #35 - Transliterators](https://unicode.org/reports/tr35/tr35-general.html#Transforms). +//! +//! See [`parse`](crate::parse()) for more information. +//! +//! 
+//! [`ICU4X`]: ../icu/index.html

// https://github.com/unicode-org/icu4x/blob/main/docs/process/boilerplate.md#library-annotations
+    UnexpectedChar(char),
+    /// A reference to an unknown variable.
+    UnknownVariable,
+    /// The source is incomplete.
+    Eof,
+    /// Something unexpected went wrong with our code. Please file a bug report on GitHub.
+    Internal,
+    /// The provided syntax is not supported by us. Please file an issue on GitHub if you need
+    /// this feature.
+    Unimplemented,
+    /// The provided escape sequence is not a valid Unicode code point.
+    InvalidEscape,
+    /// The provided transform ID is invalid.
+    InvalidId,
+    /// The provided number is invalid, which likely means it's too big.
+    InvalidNumber,
+    /// Duplicate variable definition.
+    DuplicateVariable,
+    /// Invalid UnicodeSet syntax. See `icu_unicodeset_parser`'s [`ParseError`](icu_unicodeset_parser::ParseError).
+    UnicodeSetError(icu_unicodeset_parser::ParseError),
+
+    // errors originating from compilation step
+    /// A global filter (forward or backward) in an unexpected position.
+    UnexpectedGlobalFilter,
+    /// A global filter (forward or backward) may not contain strings.
+    GlobalFilterWithStrings,
+    /// An element of [`ElementKind`] appeared in the given [`ElementLocation`], but that is prohibited.
+    UnexpectedElement(ElementKind, ElementLocation),
+    /// The start anchor `^` was not placed at the beginning of a source.
+    AnchorStartNotAtStart,
+    /// The end anchor `$` was not placed at the end of a source.
+    AnchorEndNotAtEnd,
+    /// A variable that contains source-only matchers (e.g., UnicodeSets) was used on the target side.
+    SourceOnlyVariable,
+    /// No matching segment for this backreference was found.
+    BackReferenceOutOfRange,
+    /// The cursor is in an invalid position.
+    InvalidCursor,
+    /// Multiple cursors were defined.
+    DuplicateCursor,
+}
+pub(crate) use ParseErrorKind as PEK;
+
+impl ParseErrorKind {
+    fn with_offset(self, offset: usize) -> ParseError {
+        ParseError {
+            offset: Some(offset),
+            kind: self,
+        }
+    }
+}
+
+/// The error type returned by the `parse` functions in this crate.
+#[allow(unused)] // TODO(#3736): remove when doing compilation
+#[derive(Debug, Clone, Copy)]
+pub struct ParseError {
+    // offset is the index to an arbitrary byte in the last character in the source that makes sense
+    // to display as location for the error, e.g., the unexpected character itself or
+    // for an unknown property name the last character of the name.
+    offset: Option<usize>,
+    kind: ParseErrorKind,
+}
+
+impl From<ParseErrorKind> for ParseError {
+    fn from(kind: ParseErrorKind) -> Self {
+        ParseError { offset: None, kind }
+    }
+}
+
+impl From<icu_unicodeset_parser::ParseError> for ParseError {
+    fn from(e: icu_unicodeset_parser::ParseError) -> Self {
+        ParseError {
+            offset: None,
+            kind: PEK::UnicodeSetError(e),
+        }
+    }
+}
+
+pub(crate) type Result<T, E = ParseError> = core::result::Result<T, E>;
+
+// the only UnicodeSets used in this crate are parsed, and thus 'static.
+pub(crate) type UnicodeSet = CodePointInversionListAndStringList<'static>;
+
+#[derive(Debug, Clone, Copy)]
+pub(crate) enum QuantifierKind {
+    // ?
+    ZeroOrOne,
+    // *
+    ZeroOrMore,
+    // +
+    OneOrMore,
+}
+
+// source-target/variant
+#[allow(unused)] // TODO(#3736): remove when doing compilation
+#[derive(Debug, Clone, Hash, PartialEq, Eq)]
+pub(crate) struct BasicId {
+    pub(crate) source: String,
+    pub(crate) target: String,
+    pub(crate) variant: String,
+}
+
+impl BasicId {
+    pub(crate) fn is_null(&self) -> bool {
+        self.source == "Any" && self.target == "Null" && self.variant.is_empty()
+    }
+
+    pub(crate) fn reverse(self) -> Self {
+        if self.is_null() {
+            return self;
+        }
+        // TODO(#3736): add hardcoded reverses here
+
+        Self {
+            source: self.target,
+            target: self.source,
+            variant: self.variant,
+        }
+    }
+}
+
+impl Default for BasicId {
+    fn default() -> Self {
+        Self {
+            source: "Any".to_string(),
+            target: "Null".to_string(),
+            variant: "".to_string(),
+        }
+    }
+}
+
+// [set] source-target/variant
+#[allow(unused)] // TODO(#3736): remove when doing compilation
+#[derive(Debug, Clone)]
+pub(crate) struct SingleId {
+    pub(crate) filter: Option<UnicodeSet>,
+    pub(crate) basic_id: BasicId,
+}
+
+#[derive(Debug, Clone)]
+pub(crate) enum Element {
+    // Examples:
+    //  - hello\ world
+    //  - 'hello world'
+    Literal(String),
+    // Example: $my_var
+    VariableRef(String),
+    // Example: $12
+    BackRef(u32),
+    // Examples:
+    //  - ?
+    //  - *
+    //  - +
+    // note: Box instead of Section, because a quantifier only ever refers to the immediately preceding element.
+    // segments or variable refs are used to group multiple elements together.
+    Quantifier(QuantifierKind, Box<Element>),
+    // Example: ( ...)
+    Segment(Section),
+    // Example: [:^L:]
+    UnicodeSet(UnicodeSet),
+    // Example: &[a-z] Any-Remove( ...)
+    // single id, function arguments
+    FunctionCall(SingleId, Section),
+    // Example: @@@@ |, |@@@@
+    Cursor(u32, u32),
+    // '^'
+    AnchorStart,
+    // '$'
+    AnchorEnd,
+}
+
+impl Element {
+    pub(crate) fn kind(&self) -> ElementKind {
+        match self {
+            Element::Literal(..) => ElementKind::Literal,
+            Element::VariableRef(..) => ElementKind::VariableReference,
+            Element::BackRef(..) => ElementKind::BackReference,
+            Element::Quantifier(..) => ElementKind::Quantifier,
+            Element::Segment(..) => ElementKind::Segment,
+            Element::UnicodeSet(..) => ElementKind::UnicodeSet,
+            Element::FunctionCall(..) => ElementKind::FunctionCall,
+            Element::Cursor(..) => ElementKind::Cursor,
+            Element::AnchorStart => ElementKind::AnchorStart,
+            Element::AnchorEnd => ElementKind::AnchorEnd,
+        }
+    }
+}
+
+pub(crate) type Section = Vec<Element>;
+
+#[allow(unused)] // TODO(#3736): remove when doing compilation
+#[derive(Debug, Clone)]
+pub(crate) struct HalfRule {
+    pub(crate) ante: Section,
+    pub(crate) key: Section,
+    pub(crate) post: Section,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(crate) enum Direction {
+    Forward,
+    Reverse,
+    Both,
+}
+
+impl Direction {
+    // whether `self` is a superset of `other` or not
+    pub(crate) fn permits(&self, other: Direction) -> bool {
+        match self {
+            Direction::Forward => other == Direction::Forward,
+            Direction::Reverse => other == Direction::Reverse,
+            Direction::Both => true,
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+#[allow(clippy::large_enum_variant)]
+pub(crate) enum Rule {
+    GlobalFilter(UnicodeSet),
+    GlobalInverseFilter(UnicodeSet),
+    // forward and backward IDs.
+ // "A (B)" is Transform(A, Some(B)), + // "(B)" is Transform(Null, Some(B)), + // "A" is Transform(A, None), which indicates an auto-computed reverse ID, + // "A ()" is Transform(A, Some(Null)) + Transform(SingleId, Option), + Conversion(HalfRule, Direction, HalfRule), + VariableDefinition(String, Section), +} + +struct TransliteratorParser<'a, P: ?Sized> { + iter: Peekable>, + source: &'a str, + // flattened variable map specifically for unicodesets, i.e., only contains variables that + // are chars, strings, or UnicodeSets when all variables are inlined. + variable_map: VariableMap<'static>, + // cached set for the special set . + dot_set: Option, + // for variable identifiers (XID Start, XID Continue) + xid_start: &'a CodePointInversionList<'a>, + xid_continue: &'a CodePointInversionList<'a>, + // for skipped whitespace (Pattern White Space) + pat_ws: &'a CodePointInversionList<'a>, + property_provider: &'a P, +} + +impl<'a, P> TransliteratorParser<'a, P> +where + P: ?Sized + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider + + DataProvider, +{ + // initiates a line comment + const COMMENT: char = '#'; + // terminates a line comment + const COMMENT_END: char = '\n'; + // terminates a rule + const RULE_END: char = ';'; + // initiates a filter or transform rule, as part of '::' + const SPECIAL_START: char = ':'; + // initiates a UnicodeSet + const SET_START: char = '['; + // equivalent to the UnicodeSet [^[:Zp:][:Zl:]\r\n$] + const DOT: char = '.'; + const DOT_SET: &'static str = r"[^[:Zp:][:Zl:]\r\n$]"; + // matches the beginning of the input + const ANCHOR_START: char = '^'; + // initiates a segment or the reverse portion of an ID + const OPEN_PAREN: char = '('; + // terminates a segment or the reverse portion of an ID + const CLOSE_PAREN: char = ')'; + // separates source and target of an ID + const ID_SEP: char = '-'; + // separates variant from ID + const VARIANT_SEP: char = '/'; + // variable reference prefix, and anchor end character + const VAR_PREFIX: char = '$'; + // variable definition operator + const VAR_DEF_OP: char = '='; + // left context + const LEFT_CONTEXT: char = '{'; + // right context + const RIGHT_CONTEXT: char = '}'; + // optional quantifier + const OPTIONAL: char = '?'; + // zero or more quantifier + const ZERO_OR_MORE: char = '*'; + // one or more quantifier + const ONE_OR_MORE: char = '+'; + // function prefix + const FUNCTION_PREFIX: char = '&'; + // quoted literals + const QUOTE: char = '\''; + // escape character + const ESCAPE: char = '\\'; + // cursor + const CURSOR: char = '|'; + // before or after a cursor + const CURSOR_PLACEHOLDER: char = '@'; + + fn new( + source: &'a str, + xid_start: &'a 
+
+    fn new(
+        source: &'a str,
+        xid_start: &'a CodePointInversionList<'a>,
+        xid_continue: &'a CodePointInversionList<'a>,
+        pat_ws: &'a CodePointInversionList<'a>,
+        provider: &'a P,
+    ) -> Self {
+        Self {
+            iter: source.char_indices().peekable(),
+            source,
+            variable_map: Default::default(),
+            dot_set: None,
+            xid_start,
+            xid_continue,
+            pat_ws,
+            property_provider: provider,
+        }
+    }
+
+    fn parse_rules(&mut self) -> Result<Vec<Rule>> {
+        let mut rules = Vec::new();
+
+        loop {
+            self.skip_whitespace();
+            if self.iter.peek().is_none() {
+                break;
+            }
+            // we skipped whitespace and comments, so any other chars must be part of a rule
+            rules.push(self.parse_rule()?);
+        }
+
+        Ok(rules)
+    }
+
+    // expects a rule
+    fn parse_rule(&mut self) -> Result<Rule> {
+        match self.must_peek_char()? {
+            Self::SPECIAL_START => self.parse_filter_or_transform_rule(),
+            // must be a conversion or variable rule
+            _ => self.parse_conversion_or_variable_rule(),
+        }
+    }
+
+    // any rules starting with '::'
+    fn parse_filter_or_transform_rule(&mut self) -> Result<Rule> {
+        // Syntax:
+        // '::' <unicode-set> ';'             # global filter
+        // '::' '(' <unicode-set> ')' ';'     # global inverse filter
+        // '::' <single-id> (<single-id>)? ';' # transform rule
+
+        self.consume(Self::SPECIAL_START)?;
+        self.consume(Self::SPECIAL_START)?;
+
+        // because all three options can start with a UnicodeSet, we just try to parse everything
+        // into options and assemble the rule at the end
+
+        let (forward_filter, forward_basic_id, reverse_filter, reverse_basic_id, has_reverse) =
+            self.parse_filter_or_transform_rule_parts()?;
+
+        self.skip_whitespace();
+
+        // the offset of ';'
+        let meta_err_offset = self.must_peek_index()?;
+        self.consume(Self::RULE_END)?;
+
+        // try to assemble the rule
+        // first try global filters
+        match (
+            forward_filter.is_some(),
+            forward_basic_id.is_some(),
+            reverse_filter.is_some(),
+            reverse_basic_id.is_some(),
+        ) {
+            (true, false, false, false) => {
+                // by the match, forward_filter.is_some() is true
+                #[allow(clippy::unwrap_used)]
+                return Ok(Rule::GlobalFilter(forward_filter.unwrap()));
+            }
+            (false, false, true, false) => {
+                // by the match, reverse_filter.is_some() is true
+                #[allow(clippy::unwrap_used)]
+                return Ok(Rule::GlobalInverseFilter(reverse_filter.unwrap()));
+            }
+            _ => {}
+        }
+
+        // if this is not a global (inverse) filter rule, it must be a transform rule
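+        //
+        // examples (sketch): ":: [a-z] ;" yields Rule::GlobalFilter for [a-z],
+        // ":: ([a-z]) ;" yields Rule::GlobalInverseFilter for [a-z], and
+        // ":: NFC (NFD) ;" yields Rule::Transform(Any-NFC, Some(Any-NFD))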
+
+        // either forward_basic_id or reverse_basic_id must be nonempty
+        if forward_basic_id.is_none() && reverse_basic_id.is_none() {
+            return Err(PEK::InvalidId.with_offset(meta_err_offset));
+        }
+
+        if !has_reverse {
+            // we must have a forward id due to:
+            // 1. !has_reverse implying reverse_basic_id.is_none()
+            // 2. the above none-checks implying forward_basic_id.is_some()
+            // because this is difficult to verify, we return a PEK::Internal
+            // instead of unwrapping, despite it technically being unnecessary
+            let forward_basic_id = forward_basic_id.ok_or(PEK::Internal)?;
+            return Ok(Rule::Transform(
+                SingleId {
+                    basic_id: forward_basic_id,
+                    filter: forward_filter,
+                },
+                None,
+            ));
+        }
+
+        if forward_filter.is_some() && forward_basic_id.is_none()
+            || reverse_filter.is_some() && reverse_basic_id.is_none()
+        {
+            // cannot have a filter without a basic id
+            return Err(PEK::InvalidId.with_offset(meta_err_offset));
+        }
+
+        // an empty forward rule, such as ":: (R) ;", is equivalent to ":: Any-Null (R) ;"
+        let forward_basic_id = forward_basic_id.unwrap_or_default();
+        // an empty reverse rule, such as ":: F () ;", is equivalent to ":: F (Any-Null) ;"
+        let reverse_basic_id = reverse_basic_id.unwrap_or_default();
+
+        let forward_single_id = SingleId {
+            basic_id: forward_basic_id,
+            filter: forward_filter,
+        };
+        let reverse_single_id = SingleId {
+            basic_id: reverse_basic_id,
+            filter: reverse_filter,
+        };
+
+        Ok(Rule::Transform(forward_single_id, Some(reverse_single_id)))
+    }
+
+    // consumes everything between '::' and ';', exclusive
+    #[allow(clippy::type_complexity)] // only used internally in one place
+    fn parse_filter_or_transform_rule_parts(
+        &mut self,
+    ) -> Result<(
+        Option<UnicodeSet>,
+        Option<BasicId>,
+        Option<UnicodeSet>,
+        Option<BasicId>,
+        bool,
+    )> {
+        // parse forward things, i.e., everything until Self::OPEN_PAREN
+        self.skip_whitespace();
+        let forward_filter = self.try_parse_unicode_set()?;
+        self.skip_whitespace();
+        let forward_basic_id = self.try_parse_basic_id()?;
+        self.skip_whitespace();
+
+        let has_reverse = match self.must_peek_char()? {
+            // initiates a reverse id
+            Self::OPEN_PAREN => true,
+            // we're done parsing completely, no reverse id
+            Self::RULE_END => false,
+            _ => return self.unexpected_char_here(),
+        };
+
+        let reverse_filter;
+        let reverse_basic_id;
+
+        if has_reverse {
+            // if we have a reverse, parse it
+            self.consume(Self::OPEN_PAREN)?;
+            self.skip_whitespace();
+            reverse_filter = self.try_parse_unicode_set()?;
+            self.skip_whitespace();
+            reverse_basic_id = self.try_parse_basic_id()?;
+            self.skip_whitespace();
+            self.consume(Self::CLOSE_PAREN)?;
+        } else {
+            reverse_filter = None;
+            reverse_basic_id = None;
+        }
+
+        Ok((
+            forward_filter,
+            forward_basic_id,
+            reverse_filter,
+            reverse_basic_id,
+            has_reverse,
+        ))
+    }
+
+    fn parse_conversion_or_variable_rule(&mut self) -> Result<Rule> {
+        // Syntax:
+        // <var-ref> '=' <section> ';'               # variable rule
+        // <half-rule> <direction> <half-rule> ';'   # conversion rule
+
+        // try parsing into a variable rule
+        let first_elt = if Self::VAR_PREFIX == self.must_peek_char()? {
+            let elt = self.parse_variable_or_backref_or_anchor_end()?;
+            self.skip_whitespace();
+            if Self::VAR_DEF_OP == self.must_peek_char()? {
+                // must be a variable ref
+                let var_name = match elt {
+                    Element::VariableRef(var_name) => var_name,
+                    _ => return self.unexpected_char_here(),
+                };
+                self.iter.next();
+                let section = self.parse_section(None)?;
+                let err_offset = self.must_peek_index()?;
+                self.consume(Self::RULE_END)?;
+                self.add_variable(var_name.clone(), section.clone(), err_offset)?;
+                return Ok(Rule::VariableDefinition(var_name, section));
+            }
+            Some(elt)
+        } else {
+            None
+        };
+
+        // must be a conversion rule
+        // passing down first_elt, which was already parsed for the variable-rule check
+        let first_half = self.parse_half_rule(first_elt)?;
+
+        let dir = self.parse_direction()?;
+
+        let second_half = self.parse_half_rule(None)?;
+        self.consume(Self::RULE_END)?;
+        Ok(Rule::Conversion(first_half, dir, second_half))
+    }
+
+    fn parse_single_id(&mut self) -> Result<SingleId> {
+        // Syntax:
+        // <unicode-set>? <basic-id>
+
+        self.skip_whitespace();
+        let filter = self.try_parse_unicode_set()?;
+        self.skip_whitespace();
+        let basic_id = self.parse_basic_id()?;
+        Ok(SingleId { filter, basic_id })
+    }
+
+    fn try_parse_basic_id(&mut self) -> Result<Option<BasicId>> {
+        if let Some(c) = self.peek_char() {
+            if self.xid_start.contains(c) {
+                return Ok(Some(self.parse_basic_id()?));
+            }
+        }
+        Ok(None)
+    }
+
+    // TODO(#3736): factor this out for runtime ID parsing?
+    fn parse_basic_id(&mut self) -> Result<BasicId> {
+        // Syntax:
+        // <unicode-identifier> ('-' <unicode-identifier>)? ('/' <unicode-identifier>)?
+
+        // we must have at least one identifier. the implicit "Null" id is only allowed
+        // in a '::'-rule, which is handled explicitly.
+        let first_id = self.parse_unicode_identifier()?;
+
+        self.skip_whitespace();
+        let second_id = self.try_parse_sep_and_unicode_identifier(Self::ID_SEP)?;
+        self.skip_whitespace();
+        let variant_id = self.try_parse_sep_and_unicode_identifier(Self::VARIANT_SEP)?;
+
+        let (source, target) = match second_id {
+            None => ("Any".to_string(), first_id),
+            Some(second_id) => (first_id, second_id),
+        };
+
+        Ok(BasicId {
+            source,
+            target,
+            variant: variant_id.unwrap_or_default(),
+        })
+    }
+
+    fn try_parse_sep_and_unicode_identifier(&mut self, sep: char) -> Result<Option<String>> {
+        if Some(sep) == self.peek_char() {
+            self.iter.next();
+            self.skip_whitespace();
+            // at this point we must be parsing an identifier
+            return Ok(Some(self.parse_unicode_identifier()?));
+        }
+        Ok(None)
+    }
+
+    // parses an XID-based identifier
+    fn parse_unicode_identifier(&mut self) -> Result<String> {
+        // Syntax:
+        // <xid-start> (<xid-continue>)*
+
+        let mut id = String::new();
+
+        let (first_offset, first_c) = self.must_peek()?;
+        if !self.xid_start.contains(first_c) {
+            return Err(PEK::UnexpectedChar(first_c).with_offset(first_offset));
+        }
+        self.iter.next();
+        id.push(first_c);
+
+        loop {
+            let c = self.must_peek_char()?;
+            if !self.xid_continue.contains(c) {
+                break;
+            }
+            id.push(c);
+            self.iter.next();
+        }
+
+        Ok(id)
+    }
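+
+    // Illustrative sketch (assumed example): the half-rule `pre { key } post`
+    // produces roughly
+    //
+    //     HalfRule {
+    //         ante: vec![Element::Literal("pre".into())],
+    //         key: vec![Element::Literal("key".into())],
+    //         post: vec![Element::Literal("post".into())],
+    //     }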
+
+    fn parse_half_rule(&mut self, prev_elt: Option<Element>) -> Result<HalfRule> {
+        // Syntax:
+        // (<section> '{')? <section> ('}' <section>)?
+
+        let ante;
+        let key;
+        let post;
+        let first = self.parse_section(prev_elt)?;
+        if Self::LEFT_CONTEXT == self.must_peek_char()? {
+            self.iter.next();
+            ante = first;
+            key = self.parse_section(None)?;
+        } else {
+            ante = vec![];
+            key = first;
+        }
+        if Self::RIGHT_CONTEXT == self.must_peek_char()? {
+            self.iter.next();
+            post = self.parse_section(None)?;
+        } else {
+            post = vec![];
+        }
+
+        Ok(HalfRule { ante, key, post })
+    }
+
+    fn parse_direction(&mut self) -> Result<Direction> {
+        // Syntax:
+        // '<' | '>' | '<>' | '→' | '←' | '↔'
+
+        match self.must_peek_char()? {
+            '>' | '→' => {
+                self.iter.next();
+                Ok(Direction::Forward)
+            }
+            '↔' => {
+                self.iter.next();
+                Ok(Direction::Both)
+            }
+            '←' => {
+                self.iter.next();
+                Ok(Direction::Reverse)
+            }
+            '<' => {
+                self.iter.next();
+                match self.must_peek_char()? {
+                    '>' => {
+                        self.iter.next();
+                        Ok(Direction::Both)
+                    }
+                    _ => Ok(Direction::Reverse),
+                }
+            }
+            _ => self.unexpected_char_here(),
+        }
+    }
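+
+    // note: the previously parsed element is threaded through `prev_elt` so that a
+    // postfix quantifier can take ownership of it (instead of the element having
+    // already been pushed to the section), e.g., the `+` in `(ab)+` binds to the
+    // preceding segment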
+
+    // whitespace before and after is consumed
+    fn parse_section(&mut self, prev_elt: Option<Element>) -> Result<Section> {
+        let mut section = Section::new();
+        let mut prev_elt = prev_elt;
+
+        loop {
+            self.skip_whitespace();
+            let c = self.must_peek_char()?;
+            if self.is_section_end(c) {
+                if let Some(elt) = prev_elt.take() {
+                    section.push(elt);
+                }
+                break;
+            }
+
+            let next_elt = self.parse_element(&mut prev_elt)?;
+
+            if let Some(elt) = prev_elt {
+                section.push(elt);
+            }
+            prev_elt = Some(next_elt);
+        }
+
+        Ok(section)
+    }
+
+    fn parse_quantifier_kind(&mut self) -> Result<QuantifierKind> {
+        match self.must_peek_char()? {
+            Self::OPTIONAL => {
+                self.iter.next();
+                Ok(QuantifierKind::ZeroOrOne)
+            }
+            Self::ZERO_OR_MORE => {
+                self.iter.next();
+                Ok(QuantifierKind::ZeroOrMore)
+            }
+            Self::ONE_OR_MORE => {
+                self.iter.next();
+                Ok(QuantifierKind::OneOrMore)
+            }
+            _ => self.unexpected_char_here(),
+        }
+    }
+
+    fn parse_element(&mut self, prev_elt: &mut Option<Element>) -> Result<Element> {
+        match self.must_peek_char()? {
+            Self::VAR_PREFIX => self.parse_variable_or_backref_or_anchor_end(),
+            Self::ANCHOR_START => {
+                self.iter.next();
+                Ok(Element::AnchorStart)
+            }
+            Self::OPEN_PAREN => self.parse_segment(),
+            Self::DOT => {
+                self.iter.next();
+                Ok(Element::UnicodeSet(self.get_dot_set()?))
+            }
+            Self::OPTIONAL | Self::ZERO_OR_MORE | Self::ONE_OR_MORE => {
+                let quantifier = self.parse_quantifier_kind()?;
+                if let Some(elt) = prev_elt.take() {
+                    Ok(Element::Quantifier(quantifier, Box::new(elt)))
+                } else {
+                    self.unexpected_char_here()
+                }
+            }
+            Self::FUNCTION_PREFIX => self.parse_function_call(),
+            Self::CURSOR_PLACEHOLDER | Self::CURSOR => self.parse_cursor(),
+            Self::QUOTE => Ok(Element::Literal(self.parse_quoted_literal()?)),
+            _ if self.peek_is_unicode_set_start() => {
+                Ok(Element::UnicodeSet(self.parse_unicode_set()?))
+            }
+            c if self.is_valid_unquoted_literal(c) => Ok(Element::Literal(self.parse_literal()?)),
+            _ => self.unexpected_char_here(),
+        }
+    }
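+
+    // Illustrative examples of the `$`-disambiguation below (sketch):
+    //     "$1 "   => Element::BackRef(1)
+    //     "$var " => Element::VariableRef("var")
+    //     "$ ;"   => Element::AnchorEnd (the `$` is followed by neither an ASCII
+    //                digit nor an XID-start character)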
+
+    fn parse_variable_or_backref_or_anchor_end(&mut self) -> Result<Element> {
+        self.consume(Self::VAR_PREFIX)?;
+
+        match self.must_peek_char()? {
+            c if c.is_ascii_digit() => {
+                // we have a backref
+                let num = self.parse_number()?;
+                Ok(Element::BackRef(num))
+            }
+            c if self.xid_start.contains(c) => {
+                // we have a variable
+                let variable_id = self.parse_unicode_identifier()?;
+                Ok(Element::VariableRef(variable_id))
+            }
+            _ => {
+                // this was an anchor end
+                Ok(Element::AnchorEnd)
+            }
+        }
+    }
+
+    fn parse_number(&mut self) -> Result<u32> {
+        let (first_offset, first_c) = self.must_next()?;
+        if !matches!(first_c, '1'..='9') {
+            return Err(PEK::UnexpectedChar(first_c).with_offset(first_offset));
+        }
+        // inclusive end offset
+        let mut end_offset = first_offset;
+
+        loop {
+            let (offset, c) = self.must_peek()?;
+            if !c.is_ascii_digit() {
+                break;
+            }
+            self.iter.next();
+            end_offset = offset;
+        }
+
+        // first_offset is valid by `Chars`, and the inclusive end_offset
+        // is valid because we only set it to the indices of ASCII chars,
+        // which are all exactly 1 UTF-8 byte
+        #[allow(clippy::indexing_slicing)]
+        self.source[first_offset..=end_offset]
+            .parse()
+            .map_err(|_| PEK::InvalidNumber.with_offset(end_offset))
+    }
+
+    fn parse_literal(&mut self) -> Result<String> {
+        let mut buf = String::new();
+        loop {
+            self.skip_whitespace();
+            let c = self.must_peek_char()?;
+            if c == Self::ESCAPE {
+                self.parse_escaped_char_into_buf(&mut buf)?;
+                continue;
+            }
+            if !self.is_valid_unquoted_literal(c) {
+                break;
+            }
+            self.iter.next();
+            buf.push(c);
+        }
+        Ok(buf)
+    }
+
+    fn parse_quoted_literal(&mut self) -> Result<String> {
+        // Syntax:
+        // \' [^']* \'
+
+        let mut buf = String::new();
+        self.consume(Self::QUOTE)?;
+        loop {
+            let c = self.must_next_char()?;
+            if c == Self::QUOTE {
+                break;
+            }
+            buf.push(c);
+        }
+        if buf.is_empty() {
+            // '' is the escaped version of a quote
+            buf.push(Self::QUOTE);
+        }
+        Ok(buf)
+    }
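+
+    // Illustrative examples of the escape handling below (sketch):
+    //     \u0041       pushes 'A'
+    //     \x41         pushes 'A'
+    //     \u{48 65 6C} pushes "Hel" (multiple whitespace-separated code points)
+    //     \q           pushes 'q' (unrecognized escapes push the char itself)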
+
+    // parses all supported escapes. the code is somewhat duplicated from icu_unicodeset_parser;
+    // we might want to deduplicate this with unicodeset_parser somehow
+    fn parse_escaped_char_into_buf(&mut self, buf: &mut String) -> Result<()> {
+        self.consume(Self::ESCAPE)?;
+
+        let (offset, next_char) = self.must_next()?;
+
+        match next_char {
+            'u' | 'x' if self.peek_char() == Some('{') => {
+                // bracketedHex
+                self.iter.next();
+
+                // the first code point is mandatory
+                self.skip_whitespace();
+                let c = self.parse_hex_digits_into_char(1, 6)?;
+                buf.push(c);
+
+                loop {
+                    let skipped = self.skip_whitespace();
+                    let next_char = self.must_peek_char()?;
+                    if next_char == '}' {
+                        self.iter.next();
+                        break;
+                    }
+                    if skipped == 0 {
+                        // multiple code points must be separated in multi escapes
+                        return self.unexpected_char_here();
+                    }
+
+                    let c = self.parse_hex_digits_into_char(1, 6)?;
+                    buf.push(c);
+                }
+            }
+            'u' => {
+                // 'u' hex{4}
+                let c = self.parse_hex_digits_into_char(4, 4)?;
+                buf.push(c);
+            }
+            'x' => {
+                // 'x' hex{2}
+                let c = self.parse_hex_digits_into_char(2, 2)?;
+                buf.push(c);
+            }
+            'U' => {
+                // 'U00' ('0' hex{5} | '10' hex{4})
+                let c = self.parse_hex_digits_into_char(6, 6)?;
+                buf.push(c);
+            }
+            'N' => {
+                // parse code point with name in {}
+                // tracking issue: https://github.com/unicode-org/icu4x/issues/1397
+                return Err(PEK::Unimplemented.with_offset(offset));
+            }
+            'a' => buf.push('\u{0007}'),
+            'b' => buf.push('\u{0008}'),
+            't' => buf.push('\u{0009}'),
+            'n' => buf.push('\u{000A}'),
+            'v' => buf.push('\u{000B}'),
+            'f' => buf.push('\u{000C}'),
+            'r' => buf.push('\u{000D}'),
+            _ => buf.push(next_char),
+        }
+        Ok(())
+    }
+
+    fn parse_hex_digits_into_char(&mut self, min: usize, max: usize) -> Result<char> {
+        let first_offset = self.must_peek_index()?;
+        let end_offset = self.validate_hex_digits(min, max)?;
+
+        // validate_hex_digits ensures that all chars (including the last one) are ASCII hex
+        // digits, which are exactly one UTF-8 byte, so slicing on these offsets always
+        // respects char boundaries
+        #[allow(clippy::indexing_slicing)]
+        let hex_source = &self.source[first_offset..=end_offset];
+        let num = u32::from_str_radix(hex_source, 16).map_err(|_| PEK::Internal)?;
+        char::try_from(num).map_err(|_| PEK::InvalidEscape.with_offset(end_offset))
+    }
+
+    // validates [0-9a-fA-F]{min,max}, consuming the digits in the process;
+    // returns the offset of the last digit
+    fn validate_hex_digits(&mut self, min: usize, max: usize) -> Result<usize> {
+        let mut last_offset = 0;
+        for count in 0..max {
+            let (offset, c) = self.must_peek()?;
+            if !c.is_ascii_hexdigit() {
+                if count < min {
+                    return self.unexpected_char_here();
+                } else {
+                    break;
+                }
+            }
+            self.iter.next();
+            last_offset = offset;
+        }
+        Ok(last_offset)
+    }
+
+    fn parse_segment(&mut self) -> Result<Element> {
+        self.consume(Self::OPEN_PAREN)?;
+        let elt = Element::Segment(self.parse_section(None)?);
+        self.consume(Self::CLOSE_PAREN)?;
+        Ok(elt)
+    }
+
+    fn try_parse_unicode_set(&mut self) -> Result<Option<UnicodeSet>> {
+        if self.peek_is_unicode_set_start() {
+            return Ok(Some(self.parse_unicode_set()?));
+        }
+        Ok(None)
+    }
+
+    fn parse_unicode_set(&mut self) -> Result<UnicodeSet> {
+        let pre_offset = self.must_peek_index()?;
+        // pre_offset is a valid index because self.iter (used in must_peek_index)
+        // was created from self.source
+        #[allow(clippy::indexing_slicing)]
+        let set_source = &self.source[pre_offset..];
+        let (set, consumed_bytes) = self.unicode_set_from_str(set_source).map_err(|mut e| {
+            e.offset.get_or_insert(pre_offset);
+            e
+        })?;
+
+        // advance self.iter by consumed_bytes bytes
+        while let Some(offset) = self.peek_index() {
+            // we can use equality because unicodeset_parser also lexes on char boundaries
+            // note: we must not consume this final token, because it is the first non-consumed char
+            if offset == pre_offset + consumed_bytes {
+                break;
+            }
+            self.iter.next();
+        }
+
+        Ok(set)
+    }
+
+    fn get_dot_set(&mut self) -> Result<UnicodeSet> {
+        match &self.dot_set {
+            Some(set) => Ok(set.clone()),
+            None => {
+                let (set, _) = self
+                    .unicode_set_from_str(Self::DOT_SET)
+                    .map_err(|_| PEK::Internal)?;
+                self.dot_set = Some(set.clone());
+                Ok(set)
+            }
+        }
+    }
+
+    fn unicode_set_from_str(&self, set: &str) -> Result<(UnicodeSet, usize)> {
+        let (set, consumed_bytes) = icu_unicodeset_parser::parse_unstable_with_variables(
+            set,
+            &self.variable_map,
+            self.property_provider,
+        )?;
+        Ok((set, consumed_bytes))
+    }
+
+    fn parse_function_call(&mut self) -> Result<Element> {
+        self.consume(Self::FUNCTION_PREFIX)?;
+
+        // parse the single-id
+        let single_id = self.parse_single_id()?;
+        self.skip_whitespace();
+        self.consume(Self::OPEN_PAREN)?;
+        let section = self.parse_section(None)?;
+        self.consume(Self::CLOSE_PAREN)?;
+
+        Ok(Element::FunctionCall(single_id, section))
+    }
+
+    fn parse_cursor(&mut self) -> Result<Element> {
+        // Syntax:
+        // '@'* '|' '@'*
+
+        let mut num_pre = 0;
+        let mut num_post = 0;
+        // parse pre
+        loop {
+            self.skip_whitespace();
+            match self.must_peek_char()? {
+                Self::CURSOR_PLACEHOLDER => {
+                    self.iter.next();
+                    num_pre += 1;
+                }
+                Self::CURSOR => {
+                    self.iter.next();
+                    break;
+                }
+                _ => return self.unexpected_char_here(),
+            }
+        }
+        // parse post
+        loop {
+            self.skip_whitespace();
+            match self.must_peek_char()? {
+                Self::CURSOR_PLACEHOLDER => {
+                    self.iter.next();
+                    num_post += 1;
+                }
+                _ => break,
+            }
+        }
+
+        Ok(Element::Cursor(num_pre, num_post))
+    }
+
+    fn add_variable(&mut self, name: String, value: Section, offset: usize) -> Result<()> {
+        if let Some(uset_value) = self.try_uset_flatten_section(&value) {
+            self.variable_map
+                .insert(name, uset_value)
+                .map_err(|_| PEK::DuplicateVariable.with_offset(offset))?;
+        }
+        Ok(())
+    }
+
+    fn try_uset_flatten_section(&self, section: &Section) -> Option<VariableValue<'static>> {
+        // note: we could avoid some clones here if the VariableMap stored &T's (or both), but
+        // that is quite the edge case in transliterator source files
+
+        // is this just a unicode set?
+        if let [Element::UnicodeSet(set)] = &section[..] {
+            return Some(VariableValue::UnicodeSet(set.clone()));
+        }
+        // if it's just a variable that is already a valid uset variable, we return that
+        if let [Element::VariableRef(name)] = &section[..] {
+            if let Some(value) = self.variable_map.get(name) {
+                return Some(value.clone());
+            }
+            return None;
+        }
+
+        // if not, this must be a string literal
+        let mut combined_literal = String::new();
+        for elt in section {
+            match elt {
+                Element::Literal(s) => combined_literal.push_str(s),
+                Element::VariableRef(name) => match self.variable_map.get(name) {
+                    Some(VariableValue::String(s)) => combined_literal.push_str(s),
+                    Some(VariableValue::Char(c)) => combined_literal.push(*c),
+                    _ => return None,
+                },
+                _ => return None,
+            }
+        }
+        Some(VariableValue::String(Cow::Owned(combined_literal)))
+    }
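+
+    // Illustrative sketch of the flattening above (assumed example): given
+    //     $x = 'ab' ; $y = $x c ;
+    // `$y` flattens to the string "abc" and is therefore usable inside a
+    // UnicodeSet such as [$y]; a definition containing, e.g., a quantifier or a
+    // segment cannot be flattened and is not available inside UnicodeSets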
+
+    fn consume(&mut self, expected: char) -> Result<()> {
+        match self.must_next()? {
+            (offset, c) if c != expected => Err(PEK::UnexpectedChar(c).with_offset(offset)),
+            _ => Ok(()),
+        }
+    }
+
+    // skips whitespace and comments, returns the number of skipped chars
+    fn skip_whitespace(&mut self) -> usize {
+        let mut count = 0;
+        while let Some(c) = self.peek_char() {
+            if c == Self::COMMENT {
+                count += self.skip_until(Self::COMMENT_END);
+                continue;
+            }
+            if !self.pat_ws.contains(c) {
+                break;
+            }
+            self.iter.next();
+            count += 1;
+        }
+        count
+    }
+
+    // skips until the next occurrence of `end`, which is also consumed;
+    // returns the number of skipped chars
+    fn skip_until(&mut self, end: char) -> usize {
+        let mut count = 0;
+        for (_, c) in self.iter.by_ref() {
+            count += 1;
+            if c == end {
+                break;
+            }
+        }
+        count
+    }
+
+    fn peek_is_unicode_set_start(&mut self) -> bool {
+        match self.peek_char() {
+            Some(Self::SET_START) => true,
+            Some(Self::ESCAPE) => {
+                let mut it = self.iter.clone();
+                // skip past the ESCAPE
+                it.next();
+                matches!(it.next(), Some((_, 'p' | 'P')))
+            }
+            _ => false,
+        }
+    }
+
+    fn peek_char(&mut self) -> Option<char> {
+        self.iter.peek().map(|(_, c)| *c)
+    }
+
+    fn peek_index(&mut self) -> Option<usize> {
+        self.iter.peek().map(|(idx, _)| *idx)
+    }
+
+    // use this whenever an empty iterator would imply an Eof error
+    fn must_next(&mut self) -> Result<(usize, char)> {
+        self.iter.next().ok_or(PEK::Eof.into())
+    }
+
+    // see must_next
+    fn must_next_char(&mut self) -> Result<char> {
+        self.must_next().map(|(_, c)| c)
+    }
+
+    // use this whenever an empty iterator would imply an Eof error
+    fn must_peek(&mut self) -> Result<(usize, char)> {
+        self.iter.peek().copied().ok_or(PEK::Eof.into())
+    }
+
+    // see must_peek
+    fn must_peek_char(&mut self) -> Result<char> {
+        self.must_peek().map(|(_, c)| c)
+    }
+
+    // see must_peek
+    fn must_peek_index(&mut self) -> Result<usize> {
+        self.must_peek().map(|(idx, _)| idx)
+    }
+
+    fn unexpected_char_here<T>(&mut self) -> Result<T> {
+        let (offset, char) = self.must_peek()?;
+        Err(PEK::UnexpectedChar(char).with_offset(offset))
+    }
+
+    fn is_section_end(&self, c: char) -> bool {
+        matches!(
+            c,
+            Self::RULE_END
+                | Self::CLOSE_PAREN
+                | Self::RIGHT_CONTEXT
+                | Self::LEFT_CONTEXT
+                | Self::VAR_DEF_OP
+                | '<'
+                | '>'
+                | '→'
+                | '←'
+                | '↔'
+        )
+    }
+
+    fn is_valid_unquoted_literal(&self, c: char) -> bool {
+        // allowing \ since it's used for escapes, which are allowed in an unquoted context
+        c.is_ascii() && (c.is_ascii_alphanumeric() || c == '\\')
+            || (!c.is_ascii() && c != '→' && c != '←' && c != '↔')
+    }
+}
+
+// used in tests
+#[allow(unused)]
+#[cfg(feature = "compiled_data")]
+pub(crate) fn parse(source: &str) -> Result<Vec<Rule>> {
+    parse_unstable(source, &icu_properties::provider::Baked)
+}
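+
+// Usage sketch (assumes the `compiled_data` feature; both functions are
+// crate-internal):
+//
+//     let rules = parse(":: NFC ; a <> b ;")?;
+//     assert_eq!(rules.len(), 2);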

+pub(crate) fn parse_unstable<P>(source: &str, provider: &P) -> Result<Vec<Rule>>
+where
+    P: ?Sized
+        + DataProvider<XidStartV1Marker>
+        + DataProvider<XidContinueV1Marker>
+        // ... one `DataProvider<...V1Marker>` bound per Unicode property supported by
+        // `icu_unicodeset_parser`; the full list is elided here
+        + DataProvider<PatternWhiteSpaceV1Marker>,
+{
+    let xid_start = load_xid_start(provider).map_err(|_| PEK::Internal)?;
+    let xid_start_list = xid_start.to_code_point_inversion_list();
+    let xid_continue = load_xid_continue(provider).map_err(|_| PEK::Internal)?;
+    let xid_continue_list = xid_continue.to_code_point_inversion_list();
+
+    let pat_ws = load_pattern_white_space(provider).map_err(|_| PEK::Internal)?;
+    let pat_ws_list = pat_ws.to_code_point_inversion_list();
+
+    let mut parser = TransliteratorParser::new(
+        source,
+        &xid_start_list,
+        &xid_continue_list,
+        &pat_ws_list,
+        provider,
+    );
+    parser.parse_rules()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_full() {
+        let source = r##"
+        :: [a-z\]] ; :: [b-z] Latin/BGN ;
+        :: Source-Target/Variant () ;::([b-z]Target-Source/Variant) ;
+        :: [a-z] Any ([b-z] Target-Source/Variant);
+
+        $my_var = an arbitrary section ',' some quantifiers *+? 'and other variables: $var' $var ;
+        $innerMinus = '-' ;
+        $minus = $innerMinus ;
+        $good_set = [a $minus z] ;
+
+        ^ (start) { key ' key '+ $good_set } > $102 } post\-context$;
+        # contexts are optional
+        target < source ;
+        # contexts can be empty
+        { 'source-or-target' } <> { 'target-or-source' } ;
+
+        (nested (sections)+ are () so fun) > ;
+
+        . > ;
+
+        :: ([{Inverse]-filter}]) ;
+        "##;
+
+        if let Err(e) = parse(source) {
+            panic!("Failed to parse {:?}: {:?}", source, e);
+        }
+    }
+
+    #[test]
+    fn test_conversion_rules_ok() {
+        let sources = [
+            r"a > b ;",
+            r"a < b ;",
+            r"a <> b ;",
+            r"a → b ;",
+            r"a ← b ;",
+            r"a ↔ b ;",
+            r"a \> > b ;",
+            r"a \→ > b ;",
+            r"{ a > b ;",
+            r"a { > b ;",
+            r"{ a } > b ;",
+            r"{ a } > { b ;",
+            r"{ a } > { b } ;",
+            r"^ pre [a-z] { a } post [$] $ > ^ [$] pre { b [b-z] } post $ ;",
+            r"[äöü] > ;",
+            r"([äöü]) > &Remove($1) ;",
+            r"[äöü] { ([äöü]+) > &Remove($1) ;",
+            r"|@@@ a <> b @@@@ @ | ;",
+            r"|a <> b ;",
+        ];
+
+        for source in sources {
+            if let Err(e) = parse(source) {
+                panic!("Failed to parse {:?}: {:?}", source, e);
+            }
+        }
+    }
> b ;", + ]; + + for source in sources { + if let Ok(rules) = parse(source) { + panic!("Parsed invalid source {:?}: {:?}", source, rules); + } + } + } + + #[test] + fn test_variable_rules_ok() { + let sources = [ + r" $my_var = [a-z] ;", + r"$my_var = äüöÜ ;", + r"$my_var = [a-z] literal ; $other_var = [A-Z] [b-z];", + r"$my_var = [a-z] ; $other_var = [A-Z] [b-z];", + r"$my_var = [a-z] ; $other_var = $my_var + $2222;", + r"$my_var = [a-z] ; $other_var = $my_var \+\ \$2222 \\ 'hello\';", + r" + $innerMinus = '-' ; + $minus = $innerMinus ; + $good_set = [a $minus z] ; + ", + ]; + + for source in sources { + if let Err(e) = parse(source) { + panic!("Failed to parse {:?}: {:?}", source, e); + } + } + } + + #[test] + fn test_variable_rules_err() { + let sources = [ + r" $ my_var = a ;", + r" $my_var = a_2 ;", + r"$my_var 2 = [a-z] literal ;", + r"$my_var = [$doesnt_exist] ;", + ]; + + for source in sources { + if let Ok(rules) = parse(source) { + panic!("Parsed invalid source {:?}: {:?}", source, rules); + } + } + } + + #[test] + fn test_global_filters_ok() { + let sources = [ + r":: [^\[$] ;", + r":: \p{L} ;", + r":: [^\[{[}$] ;", + r":: [^\[{]}$] ;", + r":: [^\[{]\}]}$] ;", + r":: ([^\[$]) ;", + r":: ( [^\[$] ) ;", + r":: [^[a-z[]][]] ;", + r":: [^[a-z\[\]]\]] ;", + r":: [^\]] ;", + ]; + + for source in sources { + if let Err(e) = parse(source) { + panic!("Failed to parse {:?}: {:?}", source, e); + } + } + } + + #[test] + fn test_global_filters_err() { + let sources = [ + r":: [^\[$ ;", + r":: \p{L ;", + r":: [^[$] ;", + r":: [^\[$]) ;", + r":: ( [^\[$] ;", + r":: [^[a-z[]][]] [] ;", + r":: [^[a-z\[\]]\]] ([a-z]);", + r":: [a$-^\]] ;", + r":: ( [] [] ) ;", + r":: () [] ;", + ]; + + for source in sources { + if let Ok(rules) = parse(source) { + panic!("Parsed invalid source {:?}: {:?}", source, rules); + } + } + } + + #[test] + fn test_function_calls_ok() { + let sources = [ + r"$fn = & Any-Any/Variant ($var literal 'quoted literal' $1) ;", + r"$fn = &[a-z] Any-Any/Variant ($var literal 'quoted literal' $1) ;", + r"$fn = &[a-z]Any-Any/Variant ($var literal 'quoted literal' $1) ;", + r"$fn = &[a-z]Any/Variant ($var literal 'quoted literal' $1) ;", + r"$fn = &Any/Variant ($var literal 'quoted literal' $1) ;", + r"$fn = &[a-z]Any ($var literal 'quoted literal' $1) ;", + r"$fn = &Any($var literal 'quoted literal' $1) ;", + ]; + + for source in sources { + if let Err(e) = parse(source) { + panic!("Failed to parse {:?}: {:?}", source, e); + } + } + } + + #[test] + fn test_function_calls_err() { + let sources = [ + r"$fn = &[a-z]($var literal 'quoted literal' $1) ;", + r"$fn = &[a-z] ($var literal 'quoted literal' $1) ;", + r"$fn = &($var literal 'quoted literal' $1) ;", + ]; + + for source in sources { + if let Ok(rules) = parse(source) { + panic!("Parsed invalid source {:?}: {:?}", source, rules); + } + } + } + + #[test] + fn test_transform_rules_ok() { + let sources = [ + ":: NFD; :: NFKC;", + ":: Latin ;", + ":: any - Latin;", + ":: any - Latin/bgn;", + ":: any - Latin/bgn ();", + ":: any - Latin/bgn ([a-z] a-z);", + ":: ([a-z] a-z);", + ":: (a-z);", + ":: (a-z / variant);", + ":: [a-z] latin/variant (a-z / variant);", + ":: [a-z] latin/variant (a-z / variant) ;", + ":: [a-z] latin ( );", + ":: [a-z] latin ;", + "::[];", + ]; + + for source in sources { + if let Err(e) = parse(source) { + panic!("Failed to parse {:?}: {:?}", source, e); + } + } + } + + #[test] + fn test_transform_rules_err() { + let sources = [ + r":: a a ;", + r":: (a a) ;", + r":: a - z - b ;", + r":: ( a - z - b) ;", + r":: [] 
( a - z) ;", + r":: a-z ( [] ) ;", + r":: a-z / ( [] a-z ) ;", + r":: Latin-ASCII/BGN Arab-Greek/UNGEGN ;", + r":: (Latin-ASCII/BGN Arab-Greek/UNGEGN) ;", + ]; + + for source in sources { + if let Ok(rules) = parse(source) { + panic!("Parsed invalid source {:?}: {:?}", source, rules); + } + } + } +}