From 93663e41053c9aeebf85b0b94fb3fcdcae95d780 Mon Sep 17 00:00:00 2001 From: Niels Saurer Date: Wed, 9 Aug 2023 16:09:29 +0200 Subject: [PATCH] add rule group aggregation --- .../transliterator_parser/src/compile.rs | 27 +- .../src/compile/rule_group_agg.rs | 300 ++++++++++++++++++ .../transliterator_parser/src/parse.rs | 9 + 3 files changed, 334 insertions(+), 2 deletions(-) create mode 100644 experimental/transliterator_parser/src/compile/rule_group_agg.rs diff --git a/experimental/transliterator_parser/src/compile.rs b/experimental/transliterator_parser/src/compile.rs index 025ff2e8d8b..f3b35435898 100644 --- a/experimental/transliterator_parser/src/compile.rs +++ b/experimental/transliterator_parser/src/compile.rs @@ -119,16 +119,31 @@ as described in the zero-copy format, and the maps here are just arrays) */ use crate::parse; -use crate::parse::{ElementLocation as EL, HalfRule, QuantifierKind}; +use crate::parse::{ElementLocation as EL, HalfRule, QuantifierKind, UnicodeSet}; use parse::Result; use parse::PEK; use std::collections::{HashMap, HashSet}; +mod rule_group_agg; + enum SingleDirection { Forward, Reverse, } +// parse::Rule::Conversion but unidirectional +#[derive(Debug, Clone)] +struct UniConversionRule<'p> { + ante: &'p [parse::Element], + key: &'p [parse::Element], + post: &'p [parse::Element], + replacement: &'p [parse::Element], + cursor_offset: i32, +} + +// transform + conversion rule groups for a single direction +type RuleGroups<'p> = Vec<(Vec, Vec>)>; + /// The number of elements for each `VZV` in the `VarTable`. #[derive(Debug, Copy, Clone, Default, PartialEq, Eq)] struct SpecialConstructCounts { @@ -168,6 +183,8 @@ struct Pass1<'p> { forward_data: Pass1Data, reverse_data: Pass1Data, variable_data: HashMap, + forward_filter: Option, + reverse_filter: Option, variable_definitions: HashMap, // variables which contain constructs that are only allowed to appear on the source side // e.g., $a = c+; $set = [a-z]; ... @@ -183,6 +200,8 @@ impl<'p> Pass1<'p> { variable_data: HashMap::new(), variable_definitions: HashMap::new(), target_disallowed_variables: HashSet::new(), + forward_filter: None, + reverse_filter: None, } } @@ -193,6 +212,8 @@ impl<'p> Pass1<'p> { // iterate through remaining rules and perform checks according to interim specification + let mut forward_rule_group = Vec::new(); + for rule in rules { match rule { parse::Rule::GlobalFilter(_) | parse::Rule::GlobalInverseFilter(_) => { @@ -214,12 +235,13 @@ impl<'p> Pass1<'p> { Pass1ResultGenerator::generate(self) } - fn validate_global_filters<'a>(&self, rules: &'a [parse::Rule]) -> Result<&'a [parse::Rule]> { + fn validate_global_filters<'a>(&mut self, rules: &'a [parse::Rule]) -> Result<&'a [parse::Rule]> { let rules = match rules { [parse::Rule::GlobalFilter(filter), rest @ ..] => { if filter.has_strings() { return Err(PEK::GlobalFilterWithStrings.into()); } + self.forward_filter = Some(filter.clone()); rest } @@ -230,6 +252,7 @@ impl<'p> Pass1<'p> { if filter.has_strings() { return Err(PEK::GlobalFilterWithStrings.into()); } + self.reverse_filter = Some(filter.clone()); rest } diff --git a/experimental/transliterator_parser/src/compile/rule_group_agg.rs b/experimental/transliterator_parser/src/compile/rule_group_agg.rs new file mode 100644 index 00000000000..a4e954a86ec --- /dev/null +++ b/experimental/transliterator_parser/src/compile/rule_group_agg.rs @@ -0,0 +1,300 @@ +use std::collections::VecDeque; +use crate::compile::UniConversionRule; +use crate::parse; +use crate::parse::SingleId; + +enum UniRule<'p> { + Conversion(super::UniConversionRule<'p>), + Transform(parse::SingleId), +} + +enum ForwardRuleGroup<'p> { + Conversion(Vec>), + Transform(Vec), +} + +impl<'p> ForwardRuleGroup<'p> { + fn new_conversion(rule: super::UniConversionRule<'p>) -> Self { + Self::Conversion(vec![rule]) + } + + fn new_transform(rule: parse::SingleId) -> Self { + Self::Transform(vec![rule]) + } + + // if the group is full return self, and push the rule into a new group + fn push(&mut self, rule: UniRule<'p>) -> Option { + match (self, rule) { + (Self::Conversion(group), UniRule::Conversion(rule)) => { + group.push(rule); + None + } + (Self::Transform(group), UniRule::Transform(rule)) => { + group.push(rule); + None + } + (Self::Conversion(_), UniRule::Transform(new_rule)) => { + Some(std::mem::replace(self, Self::new_transform(new_rule))) + } + (Self::Transform(_), UniRule::Conversion(new_rule)) => { + Some(std::mem::replace(self, Self::new_conversion(new_rule))) + } + } + } +} + +struct ForwardRuleGroupAggregator<'p> { + current: ForwardRuleGroup<'p>, + groups: Vec<(Vec, Vec>)>, + // the transform_group of a group pair appears first + preceding_transform_group: Option>, +} + +impl<'p> ForwardRuleGroupAggregator<'p> { + pub(crate) fn new() -> Self { + Self { + // this is a somewhat important first group. + // we want &[(transform_group), (conversion_group)] in the end, and because we iterate + // in source-order, the first element of that is a transform_group. + current: ForwardRuleGroup::Transform(Vec::new()), + groups: Vec::new(), + preceding_transform_group: None, + } + } + + pub(crate) fn push(&mut self, rule: &'p parse::Rule) { + match rule { + parse::Rule::Conversion(source_half, dir, target_half) => { + if !dir.permits(parse::Direction::Forward) { + return; + } + + let ante = &source_half.ante; + let key = &source_half.key; + let post = &source_half.post; + let replacement = &target_half.key; + + let rule = UniConversionRule { + ante, + key, + post, + replacement, + cursor_offset: 0, // TODO - maybe pass this as an additional parameter to push? + }; + + let finished_group = self.current.push(UniRule::Conversion(rule)); + if let Some(finished_group) = finished_group { + self.push_rule_group(finished_group); + } + } + parse::Rule::Transform(fwd, _) => { + let finished_group = self.current.push(UniRule::Transform(fwd.clone())); + if let Some(finished_group) = finished_group { + self.push_rule_group(finished_group); + } + } + parse::Rule::VariableDefinition(..) => { + // variable definitions are handled in a previous step + } + parse::Rule::GlobalFilter(..) => { + // global filters are handled in a previous step + } + } + } + + fn push_rule_group(&mut self, group: ForwardRuleGroup<'p>) { + match group { + ForwardRuleGroup::Transform(transform_group) => { + // because ForwardRuleGroup returns a different kind of group every time, + // the previous group must have been a conversion group which pushed the + // finished group pair into self.groups. + debug_assert!(self.preceding_transform_group.is_none()); + self.preceding_transform_group = Some(transform_group); + }, + ForwardRuleGroup::Conversion(conversion_group) => { + let associated_transform_group = match self.preceding_transform_group.take() { + Some(transform_group) => transform_group, + // match arm is necessary if the first source-order rule group is a conversion group + None => Vec::new(), + }; + self.groups.push((associated_transform_group, conversion_group)); + }, + } + } + + pub(crate) fn finalize(mut self) -> Vec<(Vec, Vec>)> { + // push the current group + self.push_rule_group(self.current); + // push any remaining group pairs + match self.preceding_transform_group.take() { + Some(transform_group) => { + self.groups.push((transform_group, Vec::new())); + }, + None => {}, + } + + self.groups + } +} + + + +// Rules will be pushed in source-order (i.e., forward order), which means we have to be careful +// in which order we aggregate them. Example: (T = transform rule, C = conversion rule) +// T1 T2 C1 C2 T3 C3 C4 T4 T5 +// should be aggregated as +// (T5, T4), (C3, C4), (T3), (C1, C2), (T2, T1) (assuming all rules apply to the reverse direction) +// note in particular the discrepancy between the order of contiguous T's and contiguous C's: +// contiguous C's keep the source order, but contiguous T's are reversed. Also the overall order +// is reversed, of course. +// +// We do this by using VecDeque, push_back, and make_contiguous in the end. +#[derive(Debug, Clone)] +struct ReverseRuleGroupAggregator<'p> { + current: ReverseRuleGroup<'p>, + // VecDeque because we encounter groups in source-order, but we want to aggregate them in + // reverse-order. + groups: VecDeque<(Vec, Vec>)>, + // the conversion_group of a group pair appears first due to the reverse order + preceding_conversion_group: Option>>, +} + +impl<'p> ReverseRuleGroupAggregator<'p> { + pub(crate) fn new() -> Self { + Self { + // this is a somewhat important first group. + // we want &[(transform_group), (conversion_group)] in the end, and because we iterate + // in opposite order, the last element of that slice is a conversion_group. + current: ReverseRuleGroup::Conversion(Vec::new()), + groups: VecDeque::new(), + preceding_conversion_group: None, + } + } + + pub(crate) fn push(&mut self, rule: &'p parse::Rule) { + match rule { + parse::Rule::Conversion(target_half, dir, source_half) => { + if !dir.permits(parse::Direction::Reverse) { + return; + } + + let ante = &source_half.ante; + let key = &source_half.key; + let post = &source_half.post; + let replacement = &target_half.key; + + let rule = UniConversionRule { + ante, + key, + post, + replacement, + cursor_offset: 0, // TODO - maybe pass this as an additional parameter to push? + }; + + let finished_group = self.current.push(UniRule::Conversion(rule)); + if let Some(finished_group) = finished_group { + self.push_rule_group(finished_group); + } + } + parse::Rule::Transform(fwd, rev) => { + let rev = rev.unwrap_or_else(|| fwd.clone().reverse()); + + let finished_group = self.current.push(UniRule::Transform(rev)); + if let Some(finished_group) = finished_group { + self.push_rule_group(finished_group); + } + } + parse::Rule::VariableDefinition(..) => { + // variable definitions are handled in a previous step + } + parse::Rule::GlobalFilter(..) => { + // global filters are handled in a previous step + } + } + } + + fn push_rule_group(&mut self, group: ReverseRuleGroup<'p>) { + match group { + ReverseRuleGroup::Conversion(conv_group) => { + // because ReverseRuleGroup returns a different kind of group every time, + // the previous group must have been a transform group which pushed the + // finished group pair into self.groups. + debug_assert!(self.preceding_conversion_group.is_none()); + self.preceding_conversion_group = Some(conv_group); + }, + ReverseRuleGroup::Transform(transform_group) => { + let associated_conv_group = match self.preceding_conversion_group.take() { + Some(conv_group) => conv_group, + // match arm is necessary if the first source-order rule group is a transform group + None => Vec::new(), + }; + let vec_transform_group = transform_group.into(); // non-allocating conversion + self.groups.push_back((vec_transform_group, associated_conv_group)); + }, + } + } + + pub(crate) fn finalize(mut self) -> Vec<(Vec, Vec>)> { + // push the current group + self.push_rule_group(self.current); + // push any remaining group pairs + match self.preceding_conversion_group.take() { + Some(conv_group) => { + // a trailing conversion group in source order is the same as having a conversion + // group as the first in-order group. we can just prepend an empty transform group. + self.groups.push_back((Vec::new(), conv_group)); + }, + None => {}, + } + + self.groups.into() // non-allocating conversion + } +} + +#[derive(Debug, Clone)] +enum ReverseRuleGroup<'p> { + // because contiguous C's are aggregated in source-order, we can just use a Vec + Conversion(Vec>), + // but contiguous T's are aggregated in reverse-order, so we need to use a VecDeque and push_back + Transform(VecDeque), +} + +impl<'p> Default for ReverseRuleGroup<'p> { + fn default() -> Self { + + Self::Conversion(Vec::new()) + } +} + +impl<'p> ReverseRuleGroup<'p> { + fn new_conversion(rule: super::UniConversionRule<'p>) -> Self { + Self::Conversion(vec![rule]) + } + + fn new_transform(rule: parse::SingleId) -> Self { + let mut group = VecDeque::new(); + group.push_back(rule); + Self::Transform(group) + } + + fn push(&mut self, rule: UniRule<'p>) -> Option { + match (self, rule) { + (Self::Conversion(group), UniRule::Conversion(rule)) => { + group.push(rule); + None + } + (Self::Transform(group), UniRule::Transform(rule)) => { + // we receive rules via `push` in source-order, which is the opposite order we want, + // so we push_back. + group.push_back(rule); + None + } + (Self::Conversion(_), UniRule::Transform(new_rule)) => { + Some(std::mem::replace(self, Self::new_transform(new_rule))) + } + (Self::Transform(_), UniRule::Conversion(new_rule)) => { + Some(std::mem::replace(self, Self::new_conversion(new_rule))) + } + } + } +} \ No newline at end of file diff --git a/experimental/transliterator_parser/src/parse.rs b/experimental/transliterator_parser/src/parse.rs index f3b0e9d15d2..a69ee23bcc7 100644 --- a/experimental/transliterator_parser/src/parse.rs +++ b/experimental/transliterator_parser/src/parse.rs @@ -198,6 +198,15 @@ pub(crate) struct SingleId { pub(crate) basic_id: BasicId, } +impl SingleId { + pub(crate) fn reverse(self) -> Self { + Self { + basic_id: self.basic_id.reverse(), + ..self + } + } +} + #[derive(Debug, Clone)] pub(crate) enum Element { // Examples: