Skip to content

Commit

Permalink
Squash of transliterator-datastruct-generation
Browse files Browse the repository at this point in the history
commit 1145a17
Author: Niels Saurer <[email protected]>
Date:   Thu Aug 10 02:06:46 2023 +0200

    Squash merge transliterator-ir

    commit 9d55038
    Author: Niels Saurer <[email protected]>
    Date:   Thu Aug 10 02:03:34 2023 +0200

        fix push_front/push_back mixup

    commit dc8dda7
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 23:02:10 2023 +0200

        remove empty line

    commit bfe5827
    Merge: c85e861 f549131
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 20:57:11 2023 +0200

        Merge branch 'main' into transliterator-ir

    commit c85e861
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 17:40:53 2023 +0200

        borrow SingleID

    commit 06425a1
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 17:22:03 2023 +0200

        fix comment indentation

    commit 2f70922
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 17:09:13 2023 +0200

        update comments

    commit 47444ee
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 17:06:43 2023 +0200

        fmt

    commit c0de3a0
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 17:03:19 2023 +0200

        fix clippy, allow testing of intermediate pass1 values

    commit 227f738
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 16:55:53 2023 +0200

        fix compile errors by introducing 2 small clones per transliterator

    commit 512b158
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 16:49:01 2023 +0200

        doesn't compile - missing self deconstruction

    commit 7848f09
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 16:40:51 2023 +0200

        use rule group aggregation in pass1

    commit 93663e4
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 16:09:29 2023 +0200

        add rule group aggregation

    commit 57666eb
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 14:12:19 2023 +0200

        Squash of transliterator-compiler

        commit d1812b4
        Author: Niels Saurer <[email protected]>
        Date:   Wed Aug 9 13:31:53 2023 +0200

            fix merge mistake

        commit f15f6eb
        Merge: abb91cc a39cfed
        Author: Niels Saurer <[email protected]>
        Date:   Wed Aug 9 13:27:08 2023 +0200

            Merge branch 'main' into transliterator-compiler

        commit abb91cc
        Author: Niels Saurer <[email protected]>
        Date:   Wed Aug 9 01:12:13 2023 +0200

            reformat tests

        commit f6a10f5
        Author: Niels Saurer <[email protected]>
        Date:   Wed Aug 9 00:30:09 2023 +0200

            sizes => counts

        commit 9ffc2f0
        Author: Niels Saurer <[email protected]>
        Date:   Wed Aug 9 00:26:27 2023 +0200

            add more docs

        commit eae5748
        Author: Niels Saurer <[email protected]>
        Date:   Tue Aug 8 23:46:20 2023 +0200

            remove TODO

        commit 6b09689
        Author: Niels Saurer <[email protected]>
        Date:   Tue Aug 8 23:28:42 2023 +0200

            improve docs

        commit c9b16d5
        Author: Niels Saurer <[email protected]>
        Date:   Tue Aug 8 23:15:23 2023 +0200

            clippy

        commit 020a677
        Author: Niels Saurer <[email protected]>
        Date:   Tue Aug 8 22:53:14 2023 +0200

            add result aggregation to first pass

        commit 2d1bfd7
        Author: Niels Saurer <[email protected]>
        Date:   Tue Aug 8 16:28:23 2023 +0200

            add tests

        commit 6f35ea5
        Author: Niels Saurer <[email protected]>
        Date:   Mon Aug 7 22:25:56 2023 +0200

            CI fixes

        commit c6c4844
        Author: Niels Saurer <[email protected]>
        Date:   Sun Aug 6 20:06:31 2023 +0200

            first steps

        commit fb68218
        Author: Niels Saurer <[email protected]>
        Date:   Wed Jul 19 16:21:33 2023 +0000

            Squash transliterator-parser

            structure for transliterator parser

            start parsing ':: ... ;' rules

            complete ::-rule parsing

            add more global filter tests

            add negative tests for '::'-rules, be more restrictive

            update error docs

            add comment about static UnicodeSet type alias

            add variable defs

            escaping and fix unicodeset handling

            fix unicodeset tests

            function calls

            add variable-inside-unicodesets

            update tests

            rewrite parse_section using parse_element

            fix unquoted literal handling

            add cursor/placeholder tests

            add cursor support

            add allow(unused) for this PR

            remove unused dependencies

            add todo about inefficient unicodeset variablemap handling

            allow usage of UnicodeSet's VariableMap directly in TransliteratorParser

            avoid one allocation per parsed unicodeset

            remove done todo about allocation-free unicodeset parser hook

            avoid allocations for number parsing

            invalid num err with offset

            update comment

            switch to allocation free hex parsing (and support for multi escapes)

            fix main merge conflict

            support \p unicodesets

            remove todo for \p unicodeset parsing

            turn low-prio todo about avoiding clones into note

            turn non-memory-safety safety comments into regular comments

            add issue number to TODOs

            add transliteration component crate

commit 208abd7
Author: Niels Saurer <[email protected]>
Date:   Thu Aug 10 02:02:23 2023 +0200

    add data struct generation tests

commit d1f7e7c
Author: Niels Saurer <[email protected]>
Date:   Thu Aug 10 00:58:50 2023 +0200

    fix debug_assert bug

commit 1f5c8dd
Author: Niels Saurer <[email protected]>
Date:   Wed Aug 9 23:25:17 2023 +0200

    refactor pass2 slightly

commit ae14cdc
Author: Niels Saurer <[email protected]>
Date:   Wed Aug 9 21:04:38 2023 +0200

    clippy

commit 8a14e3e
Author: Niels Saurer <[email protected]>
Date:   Wed Aug 9 21:02:28 2023 +0200

    tutorials cargo lock

commit 4256873
Merge: 72cff57 f549131
Author: Niels Saurer <[email protected]>
Date:   Wed Aug 9 20:56:20 2023 +0200

    Merge branch 'main' into transliterator-datastruct-generation

commit 72cff57
Author: Niels Saurer <[email protected]>
Date:   Wed Aug 9 20:42:03 2023 +0200

    refactor pass2 interface

commit 8fa4dfd
Author: Niels Saurer <[email protected]>
Date:   Wed Aug 9 20:31:29 2023 +0200

    skip compilation of cursors on source side, anchors on target side

commit 54b0542
Author: Niels Saurer <[email protected]>
Date:   Wed Aug 9 19:09:50 2023 +0200

    add comment

commit cba53a7
Author: Niels Saurer <[email protected]>
Date:   Wed Aug 9 19:04:27 2023 +0200

    fix clippy warnings

commit 2dd2ec8
Author: Niels Saurer <[email protected]>
Date:   Wed Aug 9 19:01:15 2023 +0200

    fmt

commit 56774fe
Author: Niels Saurer <[email protected]>
Date:   Wed Aug 9 18:45:22 2023 +0200

    refactor MutVarTable

commit 6176769
Author: Niels Saurer <[email protected]>
Date:   Wed Aug 9 18:31:18 2023 +0200

    revamp pass2 API

commit f8459c9
Author: Niels Saurer <[email protected]>
Date:   Wed Aug 9 18:22:47 2023 +0200

    initial final data struct generation

commit d6873b0
Author: Niels Saurer <[email protected]>
Date:   Wed Aug 9 17:48:41 2023 +0200

    Squash of transliterator-ir

    commit c85e861
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 17:40:53 2023 +0200

        borrow SingleID

    commit 06425a1
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 17:22:03 2023 +0200

        fix comment indentation

    commit 2f70922
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 17:09:13 2023 +0200

        update comments

    commit 47444ee
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 17:06:43 2023 +0200

        fmt

    commit c0de3a0
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 17:03:19 2023 +0200

        fix clippy, allow testing of intermediate pass1 values

    commit 227f738
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 16:55:53 2023 +0200

        fix compile errors by introducing 2 small clones per transliterator

    commit 512b158
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 16:49:01 2023 +0200

        doesn't compile - missing self deconstruction

    commit 7848f09
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 16:40:51 2023 +0200

        use rule group aggregation in pass1

    commit 93663e4
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 16:09:29 2023 +0200

        add rule group aggregation

    commit 57666eb
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 14:12:19 2023 +0200

        Squash of transliterator-compiler

        commit d1812b4
        Author: Niels Saurer <[email protected]>
        Date:   Wed Aug 9 13:31:53 2023 +0200

            fix merge mistake

        commit f15f6eb
        Merge: abb91cc a39cfed
        Author: Niels Saurer <[email protected]>
        Date:   Wed Aug 9 13:27:08 2023 +0200

            Merge branch 'main' into transliterator-compiler

        commit abb91cc
        Author: Niels Saurer <[email protected]>
        Date:   Wed Aug 9 01:12:13 2023 +0200

            reformat tests

        commit f6a10f5
        Author: Niels Saurer <[email protected]>
        Date:   Wed Aug 9 00:30:09 2023 +0200

            sizes => counts

        commit 9ffc2f0
        Author: Niels Saurer <[email protected]>
        Date:   Wed Aug 9 00:26:27 2023 +0200

            add more docs

        commit eae5748
        Author: Niels Saurer <[email protected]>
        Date:   Tue Aug 8 23:46:20 2023 +0200

            remove TODO

        commit 6b09689
        Author: Niels Saurer <[email protected]>
        Date:   Tue Aug 8 23:28:42 2023 +0200

            improve docs

        commit c9b16d5
        Author: Niels Saurer <[email protected]>
        Date:   Tue Aug 8 23:15:23 2023 +0200

            clippy

        commit 020a677
        Author: Niels Saurer <[email protected]>
        Date:   Tue Aug 8 22:53:14 2023 +0200

            add result aggregation to first pass

        commit 2d1bfd7
        Author: Niels Saurer <[email protected]>
        Date:   Tue Aug 8 16:28:23 2023 +0200

            add tests

        commit 6f35ea5
        Author: Niels Saurer <[email protected]>
        Date:   Mon Aug 7 22:25:56 2023 +0200

            CI fixes

        commit c6c4844
        Author: Niels Saurer <[email protected]>
        Date:   Sun Aug 6 20:06:31 2023 +0200

            first steps

        commit fb68218
        Author: Niels Saurer <[email protected]>
        Date:   Wed Jul 19 16:21:33 2023 +0000

            Squash transliterator-parser

            structure for transliterator parser

            start parsing ':: ... ;' rules

            complete ::-rule parsing

            add more global filter tests

            add negative tests for '::'-rules, be more restrictive

            update error docs

            add comment about static UnicodeSet type alias

            add variable defs

            escaping and fix unicodeset handling

            fix unicodeset tests

            function calls

            add variable-inside-unicodesets

            update tests

            rewrite parse_section using parse_element

            fix unquoted literal handling

            add cursor/placeholder tests

            add cursor support

            add allow(unused) for this PR

            remove unused dependencies

            add todo about inefficient unicodeset variablemap handling

            allow usage of UnicodeSet's VariableMap directly in TransliteratorParser

            avoid one allocation per parsed unicodeset

            remove done todo about allocation-free unicodeset parser hook

            avoid allocations for number parsing

            invalid num err with offset

            update comment

            switch to allocation free hex parsing (and support for multi escapes)

            fix main merge conflict

            support \p unicodesets

            remove todo for \p unicodeset parsing

            turn low-prio todo about avoiding clones into note

            turn non-memory-safety safety comments into regular comments

            add issue number to TODOs

            add transliteration component crate

    commit a39cfed
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 13:19:28 2023 +0200

        Add Parsing for Rule-Based Transliterators (unicode-org#3730)

    commit 57e9d59
    Author: Andrew Cupps <[email protected]>
    Date:   Tue Aug 8 18:53:26 2023 -0700

        Resolve follow-up comments to unicode-org#3760 (unicode-org#3818)

        * Docs for `U` and `r`

        * Delete empty test and add todo

        * Remove old code and empty era check

        * Add todo

commit c55c641
Author: Niels Saurer <[email protected]>
Date:   Wed Aug 9 02:36:53 2023 +0200

    wip

commit c6cbb0a
Author: Niels Saurer <[email protected]>
Date:   Wed Aug 9 01:20:08 2023 +0200

    Squash of transliterator-compiler

    commit abb91cc
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 01:12:13 2023 +0200

        reformat tests

    commit f6a10f5
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 00:30:09 2023 +0200

        sizes => counts

    commit 9ffc2f0
    Author: Niels Saurer <[email protected]>
    Date:   Wed Aug 9 00:26:27 2023 +0200

        add more docs

    commit eae5748
    Author: Niels Saurer <[email protected]>
    Date:   Tue Aug 8 23:46:20 2023 +0200

        remove TODO

    commit 6b09689
    Author: Niels Saurer <[email protected]>
    Date:   Tue Aug 8 23:28:42 2023 +0200

        improve docs

    commit c9b16d5
    Author: Niels Saurer <[email protected]>
    Date:   Tue Aug 8 23:15:23 2023 +0200

        clippy

    commit 020a677
    Author: Niels Saurer <[email protected]>
    Date:   Tue Aug 8 22:53:14 2023 +0200

        add result aggregation to first pass

    commit 2d1bfd7
    Author: Niels Saurer <[email protected]>
    Date:   Tue Aug 8 16:28:23 2023 +0200

        add tests

    commit 6f35ea5
    Author: Niels Saurer <[email protected]>
    Date:   Mon Aug 7 22:25:56 2023 +0200

        CI fixes

    commit c6c4844
    Author: Niels Saurer <[email protected]>
    Date:   Sun Aug 6 20:06:31 2023 +0200

        first steps

    commit fb68218
    Author: Niels Saurer <[email protected]>
    Date:   Wed Jul 19 16:21:33 2023 +0000

        Squash transliterator-parser

        structure for transliterator parser

        start parsing ':: ... ;' rules

        complete ::-rule parsing

        add more global filter tests

        add negative tests for '::'-rules, be more restrictive

        update error docs

        add comment about static UnicodeSet type alias

        add variable defs

        escaping and fix unicodeset handling

        fix unicodeset tests

        function calls

        add variable-inside-unicodesets

        update tests

        rewrite parse_section using parse_element

        fix unquoted literal handling

        add cursor/placeholder tests

        add cursor support

        add allow(unused) for this PR

        remove unused dependencies

        add todo about inefficient unicodeset variablemap handling

        allow usage of UnicodeSet's VariableMap directly in TransliteratorParser

        avoid one allocation per parsed unicodeset

        remove done todo about allocation-free unicodeset parser hook

        avoid allocations for number parsing

        invalid num err with offset

        update comment

        switch to allocation free hex parsing (and support for multi escapes)

        fix main merge conflict

        support \p unicodesets

        remove todo for \p unicodeset parsing

        turn low-prio todo about avoiding clones into note

        turn non-memory-safety safety comments into regular comments

        add issue number to TODOs

        add transliteration component crate
  • Loading branch information
skius committed Aug 10, 2023
1 parent 28f11dd commit c6505b6
Show file tree
Hide file tree
Showing 5 changed files with 223 additions and 40 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions experimental/transliterator_parser/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ include = [
[package.metadata.docs.rs]
all-features = true

[dev-dependencies]
zerofrom = { version = "0.1.1", path = "../../utils/zerofrom" }

[dependencies]
icu_collections = { path = "../../components/collections" }
icu_properties = { path = "../../components/properties", default-features = false }
Expand Down
74 changes: 42 additions & 32 deletions experimental/transliterator_parser/src/compile/pass2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use icu_collections::codepointinvlist::CodePointInversionList;
use std::fmt::{Display, Formatter};
use zerovec::VarZeroVec;

use crate::compile::rule_group_agg::UniConversionRule;
use icu_transliteration::provider as ds;

macro_rules! impl_insert {
Expand Down Expand Up @@ -194,9 +195,6 @@ pub(super) struct Pass2<'a, 'p> {
var_definitions: &'a HashMap<String, &'p [parse::Element]>,
// the inverse of VarTable.compounds
var_to_char: HashMap<String, char>,

id_group_list: Vec<VarZeroVec<'static, ds::SimpleIdULE>>,
conversion_group_list: Vec<VarZeroVec<'static, ds::RuleULE>>,
}

impl<'a, 'p> Pass2<'a, 'p> {
Expand All @@ -216,8 +214,6 @@ impl<'a, 'p> Pass2<'a, 'p> {
var_table: MutVarTable::try_new_from_counts(counts)?,
var_definitions,
var_to_char: HashMap::new(),
id_group_list: Vec::new(),
conversion_group_list: Vec::new(),
})
}

Expand All @@ -226,47 +222,61 @@ impl<'a, 'p> Pass2<'a, 'p> {
rule_groups: super::RuleGroups<'p>,
global_filter: Option<FilterSet>,
) -> Result<ds::RuleBasedTransliterator<'static>> {
let mut compiled_transform_groups: Vec<VarZeroVec<'static, ds::SimpleIdULE>> = Vec::new();
let mut compiled_conversion_groups: Vec<VarZeroVec<'static, ds::RuleULE>> = Vec::new();

for (transform_group, conversion_group) in rule_groups {
let mut compiled_transform_group = Vec::new();
for id in transform_group {
compiled_transform_group.push(self.compile_single_id(id.into_owned()));
}
self.id_group_list
.push(VarZeroVec::from(&compiled_transform_group));
let compiled_transform_group: Vec<_> = transform_group
.into_iter()
.map(|id| self.compile_single_id(id.into_owned()))
.collect();
compiled_transform_groups.push(VarZeroVec::from(&compiled_transform_group));

let mut compiled_conversion_group = Vec::new();
for rule in conversion_group {
let ante = self.compile_section(rule.ante, parse::ElementLocation::Source);
let key = self.compile_section(rule.key, parse::ElementLocation::Source);
let post = self.compile_section(rule.post, parse::ElementLocation::Source);
let replacer =
self.compile_section(rule.replacement, parse::ElementLocation::Target);
let cursor_offset = rule.cursor_offset;
compiled_conversion_group.push(ds::Rule {
ante: ante.into(),
key: key.into(),
post: post.into(),
replacer: replacer.into(),
cursor_offset,
});
}
self.conversion_group_list
.push(VarZeroVec::from(&compiled_conversion_group));
let compiled_conversion_group: Vec<_> = conversion_group
.into_iter()
.map(|rule| self.compile_conversion_rule(rule))
.collect();
compiled_conversion_groups.push(VarZeroVec::from(&compiled_conversion_group));
}

let res = ds::RuleBasedTransliterator {
visibility: true, // TODO(#3736): use metadata
filter: global_filter.unwrap_or(CodePointInversionList::all()),
id_group_list: VarZeroVec::from(&self.id_group_list),
rule_group_list: VarZeroVec::from(&self.conversion_group_list),
id_group_list: VarZeroVec::from(&compiled_transform_groups),
rule_group_list: VarZeroVec::from(&compiled_conversion_groups),
variable_table: self.var_table.finalize(),
};

Ok(res)
}

// Compiles a single unidirectional conversion rule into its `ds::Rule` form.
//
// The ante, key and post sections are compiled as source-side sections, the
// replacement as a target-side section; the cursor offset is copied over as-is.
fn compile_conversion_rule(&mut self, rule: UniConversionRule<'p>) -> ds::Rule<'static> {
    // Field expressions of a struct literal are evaluated in the order they are
    // written, so the sections are still compiled as ante, key, post, replacement.
    ds::Rule {
        ante: self
            .compile_section(rule.ante, parse::ElementLocation::Source)
            .into(),
        key: self
            .compile_section(rule.key, parse::ElementLocation::Source)
            .into(),
        post: self
            .compile_section(rule.post, parse::ElementLocation::Source)
            .into(),
        replacer: self
            .compile_section(rule.replacement, parse::ElementLocation::Target)
            .into(),
        cursor_offset: rule.cursor_offset,
    }
}

fn compile_single_id(&mut self, id: parse::SingleId) -> ds::SimpleId<'static> {
let id_string = id.basic_id.source.clone(); // TODO(#3736): map legacy ID to internal ID and use here
// TODO(#3736): map legacy ID to internal ID and use here
let id_string = format!(
"{}-{}{}",
id.basic_id.source,
id.basic_id.target,
if let Some(v) = id.basic_id.variant {
format!("/{}", v)
} else {
"".to_owned()
}
);

ds::SimpleId {
id: id_string.into(),
Expand Down
15 changes: 7 additions & 8 deletions experimental/transliterator_parser/src/compile/rule_group_agg.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,6 @@ impl<'p> ForwardRuleGroupAggregator<'p> {

// Represents a non-empty rule group for the forward direction.
#[derive(Debug, Clone)]

enum ForwardRuleGroup<'p> {
Conversion(Vec<UniConversionRule<'p>>),
Transform(Vec<Cow<'p, parse::SingleId>>),
Expand Down Expand Up @@ -182,7 +181,7 @@ impl<'p> ForwardRuleGroup<'p> {
// contiguous C's keep the source order, but contiguous T's are reversed. Also the overall order
// is reversed, of course.
//
// We do this by using VecDeque, push_back, and make_contiguous in the end.
// We do this by using VecDeque, push_front, and make_contiguous in the end.
#[derive(Debug, Clone)]
pub(crate) struct ReverseRuleGroupAggregator<'p> {
current: ReverseRuleGroup<'p>,
Expand Down Expand Up @@ -264,7 +263,7 @@ impl<'p> ReverseRuleGroupAggregator<'p> {
};
let vec_transform_group = transform_group.into(); // non-allocating conversion
self.groups
.push_back((vec_transform_group, associated_conv_group));
.push_front((vec_transform_group, associated_conv_group));
}
}
}
Expand All @@ -277,7 +276,7 @@ impl<'p> ReverseRuleGroupAggregator<'p> {
if let Some(conv_group) = self.preceding_conversion_group.take() {
// a trailing conversion group in source order is the same as having a conversion
// group as the first in-order group. we can just prepend an empty transform group.
self.groups.push_back((Vec::new(), conv_group));
self.groups.push_front((Vec::new(), conv_group));
}

self.groups.into() // non-allocating conversion
Expand All @@ -289,7 +288,7 @@ impl<'p> ReverseRuleGroupAggregator<'p> {
enum ReverseRuleGroup<'p> {
// because contiguous C's are aggregated in source-order, we can just use a Vec
Conversion(Vec<UniConversionRule<'p>>),
// but contiguous T's are aggregated in reverse-order, so we need to use a VecDeque and push_back
// but contiguous T's are aggregated in reverse-order, so we need to use a VecDeque and push_front
Transform(VecDeque<Cow<'p, parse::SingleId>>),
}

Expand All @@ -306,7 +305,7 @@ impl<'p> ReverseRuleGroup<'p> {

fn new_transform(rule: Cow<'p, parse::SingleId>) -> Self {
let mut group = VecDeque::new();
group.push_back(rule);
group.push_front(rule);
Self::Transform(group)
}

Expand All @@ -319,8 +318,8 @@ impl<'p> ReverseRuleGroup<'p> {
}
(Self::Transform(group), UniRule::Transform(rule)) => {
// we receive rules via `push` in source-order, which is the opposite order we want,
// so we push_back.
group.push_back(rule);
// so we push_front.
group.push_front(rule);
None
}
(Self::Conversion(_), UniRule::Transform(new_rule)) => {
Expand Down
170 changes: 170 additions & 0 deletions experimental/transliterator_parser/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,173 @@ where
// TODO(#3736): pass direction from metadata
compile::compile(parsed, parse::Direction::Both)
}

// End-to-end tests: rule source text goes in via `parse`, and the resulting
// data structs are inspected field by field.
#[cfg(test)]
mod tests {
use super::*;
use crate::parse::UnicodeSet;
use icu_collections::codepointinvlist::CodePointInversionList;
use icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList;
use icu_transliteration::provider as ds;
use zerofrom::ZeroFrom;

// Test helper: parses `source` with `icu_unicodeset_parser::parse_unstable`, using
// `icu_properties::provider::Baked` as the data provider, and returns the first
// element of the parser's result tuple. Panics if parsing fails.
fn parse_set(source: &str) -> UnicodeSet {
icu_unicodeset_parser::parse_unstable(source, &icu_properties::provider::Baked)
.expect("Parsing failed")
.0
}

// Parses one rule source containing forward-only, reverse-only and bidirectional
// rules, then checks the filter, variable table, rule groups and ID groups of the
// forward and of the reverse transliterator.
#[test]
fn test_source_to_struct() {
let source = r#"
:: [1] ;
:: Latin-InterIndic ;
$a = [a] [b]+ ;
$unused = [c{string}]+? ;
$b = $a? 'literal chars' ;
x } [a-z] > y ;
$a > ab ;
'reverse output:' &RevFnCall($1 'padding') < ($b) ;
^ left } $ <> ^ { right } [0-9] $ ;
:: [\ ] Remove (AnyRev-AddRandomSpaces/FiftyPercent) ;
# splits up the forward rules
forward rule that > splits up rule groups ;
:: InterIndic-Devanagari ;
"#;

let (forward, reverse) = parse(source).expect("parsing failed");
let forward = forward.expect("forward transliterator expected");
let reverse = reverse.expect("reverse transliterator expected");

// forward direction
{
assert_eq!(&forward.filter, parse_set("[1]").code_points());

// only variables/elements actually used by forward rules are expected in the table
// (`$b` and `$unused` do not show up here)
let vt = &forward.variable_table;
assert_eq!(vt.compounds.len(), 1);
assert_eq!(vt.quantifiers_opt.len(), 0);
assert_eq!(vt.quantifiers_kleene.len(), 0);
assert_eq!(vt.quantifiers_kleene_plus.len(), 1);
assert_eq!(vt.segments.len(), 0);
assert_eq!(vt.unicode_sets.len(), 3);
assert_eq!(vt.function_calls.len(), 0);

assert_eq!(&vt.compounds[0], "\u{F0003}\u{F0001}"); // [a] and [b]+ (the quantifier contains [b])
assert_eq!(&vt.quantifiers_kleene_plus[0], "\u{F0004}"); // [b] from [b]+
let uset1 = CodePointInversionListAndStringList::zero_from(&vt.unicode_sets[0]);
assert_eq!(uset1, parse_set("[a-z]"));
let uset2 = CodePointInversionListAndStringList::zero_from(&vt.unicode_sets[1]);
assert_eq!(uset2, parse_set("[a]"));
let uset3 = CodePointInversionListAndStringList::zero_from(&vt.unicode_sets[2]);
assert_eq!(uset3, parse_set("[b]"));

// ID groups and rule groups are expected to come in pairs
assert_eq!(forward.id_group_list.len(), 3);
assert_eq!(forward.rule_group_list.len(), 3);

assert_eq!(forward.id_group_list[0].len(), 1);
assert_eq!(forward.id_group_list[1].len(), 1);
assert_eq!(forward.id_group_list[2].len(), 1);

assert_eq!(forward.rule_group_list[0].len(), 3);
assert_eq!(forward.rule_group_list[1].len(), 1);
// no conversion rules follow the last `::` rule
assert_eq!(forward.rule_group_list[2].len(), 0);

// x } [a-z] > y ;
let rule1_1 = ds::Rule::zero_from(&forward.rule_group_list[0][0]);
assert_eq!(rule1_1.ante, "");
assert_eq!(rule1_1.key, "x");
assert_eq!(rule1_1.post, "\u{F0002}"); // [a-z]
assert_eq!(rule1_1.replacer, "y");

// $a > ab ;
let rule1_2 = ds::Rule::zero_from(&forward.rule_group_list[0][1]);
assert_eq!(rule1_2.ante, "");
assert_eq!(rule1_2.key, "\u{F0000}"); // $a
assert_eq!(rule1_2.post, "");
assert_eq!(rule1_2.replacer, "ab");

// forward half of: ^ left } $ <> ^ { right } [0-9] $ ;
let rule1_3 = ds::Rule::zero_from(&forward.rule_group_list[0][2]);
assert_eq!(rule1_3.ante, "");
assert_eq!(rule1_3.key, "\u{FFFFC}left"); // start anchor
assert_eq!(rule1_3.post, "\u{FFFFD}"); // end anchor
assert_eq!(rule1_3.replacer, "right");

// forward rule that > splits up rule groups ;
// (the expected strings contain no spaces)
let rule2_1 = ds::Rule::zero_from(&forward.rule_group_list[1][0]);
assert_eq!(rule2_1.ante, "");
assert_eq!(rule2_1.key, "forwardrulethat");
assert_eq!(rule2_1.post, "");
assert_eq!(rule2_1.replacer, "splitsuprulegroups");

let id1 = ds::SimpleId::zero_from(&forward.id_group_list[0][0]);
assert_eq!(id1.id, "Latin-InterIndic");
assert_eq!(id1.filter, CodePointInversionList::all());

// `Remove` without an explicit source is expected as `Any-Remove`
let id2 = ds::SimpleId::zero_from(&forward.id_group_list[1][0]);
assert_eq!(id2.id, "Any-Remove");
assert_eq!(&id2.filter, parse_set(r"[\ ]").code_points());

let id3 = ds::SimpleId::zero_from(&forward.id_group_list[2][0]);
assert_eq!(id3.id, "InterIndic-Devanagari");
assert_eq!(id3.filter, CodePointInversionList::all());
}
// reverse direction
{
// the `:: [1] ;` filter is only expected on the forward transliterator
assert_eq!(&reverse.filter, &CodePointInversionList::all());

let vt = &reverse.variable_table;
assert_eq!(vt.compounds.len(), 2); // base: \u{F0000}
assert_eq!(vt.quantifiers_opt.len(), 1); // base: \u{F0002}
assert_eq!(vt.quantifiers_kleene.len(), 0); // base: \u{F0003}
assert_eq!(vt.quantifiers_kleene_plus.len(), 1); // base: \u{F0003}
assert_eq!(vt.segments.len(), 1); // base: \u{F0004}
assert_eq!(vt.unicode_sets.len(), 3); // base: \u{F0005}
assert_eq!(vt.function_calls.len(), 1); // base: \u{F0008}
// backref base: \u{F0009}

assert_eq!(&vt.compounds[0], "\u{F0005}\u{F0003}"); // $a = [a] [b]+ (quantifier contains [b])
assert_eq!(&vt.compounds[1], "\u{F0002}literal chars"); // $b = $a? (quantifier contains $a)
assert_eq!(&vt.quantifiers_opt[0], "\u{F0000}"); // $a from $a?
assert_eq!(&vt.quantifiers_kleene_plus[0], "\u{F0006}"); // [b] from [b]+
assert_eq!(&vt.segments[0], "\u{F0001}"); // $b from ($b)
let uset1 = CodePointInversionListAndStringList::zero_from(&vt.unicode_sets[0]);
assert_eq!(uset1, parse_set("[a]"));
let uset2 = CodePointInversionListAndStringList::zero_from(&vt.unicode_sets[1]);
assert_eq!(uset2, parse_set("[b]"));
let uset3 = CodePointInversionListAndStringList::zero_from(&vt.unicode_sets[2]);
assert_eq!(uset3, parse_set("[0-9]"));
let fcall = ds::FunctionCall::zero_from(&vt.function_calls[0]);
assert_eq!(fcall.translit.id, "Any-RevFnCall");
assert_eq!(fcall.translit.filter, CodePointInversionList::all());
assert_eq!(fcall.arg, "\u{F0009}padding"); // $1 and 'padding'

assert_eq!(reverse.id_group_list.len(), 2);
assert_eq!(reverse.rule_group_list.len(), 2);

// the last two `::` rules of the source end up in the same reverse ID group
assert_eq!(reverse.id_group_list[0].len(), 2);
assert_eq!(reverse.id_group_list[1].len(), 1);

assert_eq!(reverse.rule_group_list[0].len(), 2);
assert_eq!(reverse.rule_group_list[1].len(), 0);

// 'reverse output:' &RevFnCall($1 'padding') < ($b) ;
let rule1_1 = ds::Rule::zero_from(&reverse.rule_group_list[0][0]);
assert_eq!(rule1_1.ante, "");
assert_eq!(rule1_1.key, "\u{F0004}"); // the segment ($b)
assert_eq!(rule1_1.post, ""); // no post context
assert_eq!(rule1_1.replacer, "reverse output:\u{F0008}"); // function call

// reverse half of: ^ left } $ <> ^ { right } [0-9] $ ;
let rule1_2 = ds::Rule::zero_from(&reverse.rule_group_list[0][1]);
assert_eq!(rule1_2.ante, "\u{FFFFC}"); // start anchor
assert_eq!(rule1_2.key, "right");
assert_eq!(rule1_2.post, "\u{F0007}\u{FFFFD}"); // [0-9] and end anchor
assert_eq!(rule1_2.replacer, "left");

// reverse IDs appear in the opposite order of the `::` rules in the source
let id1_1 = ds::SimpleId::zero_from(&reverse.id_group_list[0][0]);
assert_eq!(id1_1.id, "Devanagari-InterIndic");
assert_eq!(id1_1.filter, CodePointInversionList::all());

// the parenthesized reverse ID of the `Remove` rule; its `[\ ]` filter is not expected here
let id1_2 = ds::SimpleId::zero_from(&reverse.id_group_list[0][1]);
assert_eq!(id1_2.id, "AnyRev-AddRandomSpaces/FiftyPercent");
assert_eq!(id1_2.filter, CodePointInversionList::all());

let id2_1 = ds::SimpleId::zero_from(&reverse.id_group_list[1][0]);
assert_eq!(id2_1.id, "InterIndic-Latin");
assert_eq!(id2_1.filter, CodePointInversionList::all());
}
}
}

0 comments on commit c6505b6

Please sign in to comment.