Skip to content

Commit

Permalink
add rule group aggregation
Browse files Browse the repository at this point in the history
  • Loading branch information
skius committed Aug 9, 2023
1 parent 57666eb commit 93663e4
Show file tree
Hide file tree
Showing 3 changed files with 334 additions and 2 deletions.
27 changes: 25 additions & 2 deletions experimental/transliterator_parser/src/compile.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,16 +119,31 @@ as described in the zero-copy format, and the maps here are just arrays)
*/

use crate::parse;
use crate::parse::{ElementLocation as EL, HalfRule, QuantifierKind};
use crate::parse::{ElementLocation as EL, HalfRule, QuantifierKind, UnicodeSet};
use parse::Result;
use parse::PEK;
use std::collections::{HashMap, HashSet};

mod rule_group_agg;

/// Names a single direction, either forward or reverse.
// NOTE(review): not referenced by any code visible in this change — confirm the intended use.
enum SingleDirection {
Forward,
Reverse,
}

// parse::Rule::Conversion but unidirectional
/// A conversion rule reduced to a single direction.
///
/// All element slices borrow from the parsed rules (lifetime `'p`); nothing is copied.
#[derive(Debug, Clone)]
struct UniConversionRule<'p> {
// `ante` of the half rule that acts as the source in this direction
ante: &'p [parse::Element],
// `key` of the half rule that acts as the source in this direction
key: &'p [parse::Element],
// `post` of the half rule that acts as the source in this direction
post: &'p [parse::Element],
// `key` of the half rule that acts as the target in this direction
replacement: &'p [parse::Element],
// NOTE(review): the rule group aggregators currently always store 0 here (see their TODO);
// presumably the cursor position relative to the replacement — confirm once populated.
cursor_offset: i32,
}

// transform + conversion rule groups for a single direction
/// A list of group pairs. Each pair holds a group of transform rule IDs (first) and a group of
/// unidirectional conversion rules (second).
type RuleGroups<'p> = Vec<(Vec<parse::SingleId>, Vec<UniConversionRule<'p>>)>;

/// The number of elements for each `VZV` in the `VarTable`.
#[derive(Debug, Copy, Clone, Default, PartialEq, Eq)]
struct SpecialConstructCounts {
Expand Down Expand Up @@ -168,6 +183,8 @@ struct Pass1<'p> {
forward_data: Pass1Data,
reverse_data: Pass1Data,
variable_data: HashMap<String, Pass1Data>,
forward_filter: Option<UnicodeSet>,
reverse_filter: Option<UnicodeSet>,
variable_definitions: HashMap<String, &'p [parse::Element]>,
// variables which contain constructs that are only allowed to appear on the source side
// e.g., $a = c+; $set = [a-z]; ...
Expand All @@ -183,6 +200,8 @@ impl<'p> Pass1<'p> {
variable_data: HashMap::new(),
variable_definitions: HashMap::new(),
target_disallowed_variables: HashSet::new(),
forward_filter: None,
reverse_filter: None,
}
}

Expand All @@ -193,6 +212,8 @@ impl<'p> Pass1<'p> {

// iterate through remaining rules and perform checks according to interim specification

let mut forward_rule_group = Vec::new();

for rule in rules {
match rule {
parse::Rule::GlobalFilter(_) | parse::Rule::GlobalInverseFilter(_) => {
Expand All @@ -214,12 +235,13 @@ impl<'p> Pass1<'p> {
Pass1ResultGenerator::generate(self)
}

fn validate_global_filters<'a>(&self, rules: &'a [parse::Rule]) -> Result<&'a [parse::Rule]> {
fn validate_global_filters<'a>(&mut self, rules: &'a [parse::Rule]) -> Result<&'a [parse::Rule]> {
let rules = match rules {
[parse::Rule::GlobalFilter(filter), rest @ ..] => {
if filter.has_strings() {
return Err(PEK::GlobalFilterWithStrings.into());
}
self.forward_filter = Some(filter.clone());

rest
}
Expand All @@ -230,6 +252,7 @@ impl<'p> Pass1<'p> {
if filter.has_strings() {
return Err(PEK::GlobalFilterWithStrings.into());
}
self.reverse_filter = Some(filter.clone());

rest
}
Expand Down
300 changes: 300 additions & 0 deletions experimental/transliterator_parser/src/compile/rule_group_agg.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,300 @@
use std::collections::VecDeque;
use crate::compile::UniConversionRule;
use crate::parse;
use crate::parse::SingleId;

enum UniRule<'p> {
Conversion(super::UniConversionRule<'p>),
Transform(parse::SingleId),
}

enum ForwardRuleGroup<'p> {
Conversion(Vec<super::UniConversionRule<'p>>),
Transform(Vec<parse::SingleId>),
}

impl<'p> ForwardRuleGroup<'p> {
fn new_conversion(rule: super::UniConversionRule<'p>) -> Self {
Self::Conversion(vec![rule])
}

fn new_transform(rule: parse::SingleId) -> Self {
Self::Transform(vec![rule])
}

// if the group is full return self, and push the rule into a new group
fn push(&mut self, rule: UniRule<'p>) -> Option<Self> {
match (self, rule) {
(Self::Conversion(group), UniRule::Conversion(rule)) => {
group.push(rule);
None
}
(Self::Transform(group), UniRule::Transform(rule)) => {
group.push(rule);
None
}
(Self::Conversion(_), UniRule::Transform(new_rule)) => {
Some(std::mem::replace(self, Self::new_transform(new_rule)))
}
(Self::Transform(_), UniRule::Conversion(new_rule)) => {
Some(std::mem::replace(self, Self::new_conversion(new_rule)))
}
}
}
}

/// Aggregates rules, pushed in source order, into `(transform group, conversion group)` pairs
/// for the forward direction.
struct ForwardRuleGroupAggregator<'p> {
    // the group that rules are currently being appended to
    current: ForwardRuleGroup<'p>,
    // the finished group pairs, in source order
    groups: Vec<(Vec<SingleId>, Vec<UniConversionRule<'p>>)>,
    // the transform_group of a group pair appears first
    preceding_transform_group: Option<Vec<SingleId>>,
}

impl<'p> ForwardRuleGroupAggregator<'p> {
    /// Creates an aggregator without any rules.
    pub(crate) fn new() -> Self {
        Self {
            // this is a somewhat important first group.
            // we want &[(transform_group), (conversion_group)] in the end, and because we iterate
            // in source-order, the first element of that is a transform_group.
            current: ForwardRuleGroup::Transform(Vec::new()),
            groups: Vec::new(),
            preceding_transform_group: None,
        }
    }

    /// Feeds the next rule (in source order) into the aggregator.
    ///
    /// Conversion rules that do not permit the forward direction, variable definitions and
    /// global (inverse) filters are ignored.
    pub(crate) fn push(&mut self, rule: &'p parse::Rule) {
        let uni_rule = match rule {
            parse::Rule::Conversion(source_half, dir, target_half) => {
                if !dir.permits(parse::Direction::Forward) {
                    return;
                }

                UniRule::Conversion(UniConversionRule {
                    ante: &source_half.ante,
                    key: &source_half.key,
                    post: &source_half.post,
                    replacement: &target_half.key,
                    cursor_offset: 0, // TODO - maybe pass this as an additional parameter to push?
                })
            }
            parse::Rule::Transform(fwd, _) => UniRule::Transform(fwd.clone()),
            // variable definitions are handled in a previous step
            parse::Rule::VariableDefinition(..) => return,
            // global filters are handled in a previous step
            parse::Rule::GlobalFilter(..) | parse::Rule::GlobalInverseFilter(..) => return,
        };

        if let Some(finished_group) = self.current.push(uni_rule) {
            self.push_rule_group(finished_group);
        }
    }

    /// Records a finished group. A transform group is parked until the conversion group that
    /// follows it arrives; a conversion group completes a pair.
    fn push_rule_group(&mut self, group: ForwardRuleGroup<'p>) {
        match group {
            ForwardRuleGroup::Transform(transform_group) => {
                // because ForwardRuleGroup returns a different kind of group every time,
                // the previous group must have been a conversion group which pushed the
                // finished group pair into self.groups.
                debug_assert!(self.preceding_transform_group.is_none());
                self.preceding_transform_group = Some(transform_group);
            }
            ForwardRuleGroup::Conversion(conversion_group) => {
                // the default (empty group) is necessary if the first source-order rule group
                // is a conversion group
                let associated_transform_group =
                    self.preceding_transform_group.take().unwrap_or_default();
                self.groups
                    .push((associated_transform_group, conversion_group));
            }
        }
    }

    /// Consumes the aggregator and returns all group pairs in source order.
    ///
    /// A trailing transform group is paired with an empty conversion group.
    pub(crate) fn finalize(mut self) -> Vec<(Vec<SingleId>, Vec<UniConversionRule<'p>>)> {
        // push the current group. It has to be moved out of `self` first, because
        // `push_rule_group` needs `self` as a whole.
        let current = std::mem::replace(&mut self.current, ForwardRuleGroup::Transform(Vec::new()));
        self.push_rule_group(current);
        // push any remaining group pairs
        if let Some(transform_group) = self.preceding_transform_group.take() {
            self.groups.push((transform_group, Vec::new()));
        }

        self.groups
    }
}



// Rules will be pushed in source-order (i.e., forward order), which means we have to be careful
// in which order we aggregate them. Example: (T = transform rule, C = conversion rule)
// T1 T2 C1 C2 T3 C3 C4 T4 T5
// should be aggregated as
// (T5, T4), (C3, C4), (T3), (C1, C2), (T2, T1) (assuming all rules apply to the reverse direction)
// note in particular the discrepancy between the order of contiguous T's and contiguous C's:
// contiguous C's keep the source order, but contiguous T's are reversed. Also the overall order
// is reversed, of course.
//
// The overall order is reversed by inserting every finished group pair at the front of a
// VecDeque, which is converted into a Vec in the end.
/// Aggregates rules, pushed in source order, into `(transform group, conversion group)` pairs
/// for the reverse direction.
#[derive(Debug, Clone)]
struct ReverseRuleGroupAggregator<'p> {
    // the group that rules are currently being added to
    current: ReverseRuleGroup<'p>,
    // VecDeque because we encounter groups in source-order, but we want to aggregate them in
    // reverse-order.
    groups: VecDeque<(Vec<SingleId>, Vec<UniConversionRule<'p>>)>,
    // the conversion_group of a group pair appears first due to the reverse order
    preceding_conversion_group: Option<Vec<UniConversionRule<'p>>>,
}

impl<'p> ReverseRuleGroupAggregator<'p> {
    /// Creates an aggregator without any rules.
    pub(crate) fn new() -> Self {
        Self {
            // this is a somewhat important first group.
            // we want &[(transform_group), (conversion_group)] in the end, and because we iterate
            // in opposite order, the last element of that slice is a conversion_group.
            current: ReverseRuleGroup::Conversion(Vec::new()),
            groups: VecDeque::new(),
            preceding_conversion_group: None,
        }
    }

    /// Feeds the next rule (in source order) into the aggregator.
    ///
    /// Conversion rules that do not permit the reverse direction, variable definitions and
    /// global (inverse) filters are ignored. For transform rules the explicit reverse ID is used
    /// if there is one, otherwise the reversed forward ID.
    pub(crate) fn push(&mut self, rule: &'p parse::Rule) {
        let uni_rule = match rule {
            parse::Rule::Conversion(target_half, dir, source_half) => {
                if !dir.permits(parse::Direction::Reverse) {
                    return;
                }

                UniRule::Conversion(UniConversionRule {
                    ante: &source_half.ante,
                    key: &source_half.key,
                    post: &source_half.post,
                    replacement: &target_half.key,
                    cursor_offset: 0, // TODO - maybe pass this as an additional parameter to push?
                })
            }
            parse::Rule::Transform(fwd, rev) => {
                // `rev` is only borrowed, so it has to be cloned before it can be unwrapped
                let rev = rev.clone().unwrap_or_else(|| fwd.clone().reverse());
                UniRule::Transform(rev)
            }
            // variable definitions are handled in a previous step
            parse::Rule::VariableDefinition(..) => return,
            // global filters are handled in a previous step
            parse::Rule::GlobalFilter(..) | parse::Rule::GlobalInverseFilter(..) => return,
        };

        if let Some(finished_group) = self.current.push(uni_rule) {
            self.push_rule_group(finished_group);
        }
    }

    /// Records a finished group. A conversion group is parked until the transform group that
    /// follows it in source order arrives; a transform group completes a pair.
    fn push_rule_group(&mut self, group: ReverseRuleGroup<'p>) {
        match group {
            ReverseRuleGroup::Conversion(conv_group) => {
                // because ReverseRuleGroup returns a different kind of group every time,
                // the previous group must have been a transform group which pushed the
                // finished group pair into self.groups.
                debug_assert!(self.preceding_conversion_group.is_none());
                self.preceding_conversion_group = Some(conv_group);
            }
            ReverseRuleGroup::Transform(transform_group) => {
                // the default (empty group) is necessary if the first source-order rule group
                // is a transform group
                let associated_conv_group =
                    self.preceding_conversion_group.take().unwrap_or_default();
                let vec_transform_group = Vec::from(transform_group); // non-allocating conversion
                // pairs that appear later in the source must come earlier in the result
                self.groups
                    .push_front((vec_transform_group, associated_conv_group));
            }
        }
    }

    /// Consumes the aggregator and returns all group pairs in reverse source order.
    pub(crate) fn finalize(mut self) -> Vec<(Vec<SingleId>, Vec<UniConversionRule<'p>>)> {
        // push the current group. It has to be moved out of `self` first, because
        // `push_rule_group` needs `self` as a whole.
        let current = std::mem::take(&mut self.current);
        self.push_rule_group(current);
        // push any remaining group pairs
        if let Some(conv_group) = self.preceding_conversion_group.take() {
            // a trailing conversion group in source order is the same as having a conversion
            // group as the first in-order group. we can just prepend an empty transform group.
            self.groups.push_front((Vec::new(), conv_group));
        }

        Vec::from(self.groups) // non-allocating conversion
    }
}

/// A run of contiguous rules of the same kind for the reverse direction.
#[derive(Debug, Clone)]
enum ReverseRuleGroup<'p> {
    // because contiguous C's are aggregated in source-order, we can just use a Vec
    Conversion(Vec<UniConversionRule<'p>>),
    // but contiguous T's are aggregated in reverse-order, so we need to use a VecDeque and
    // push_front
    Transform(VecDeque<SingleId>),
}

impl<'p> Default for ReverseRuleGroup<'p> {
    /// Returns an empty conversion group.
    fn default() -> Self {
        Self::Conversion(Vec::new())
    }
}

impl<'p> ReverseRuleGroup<'p> {
    /// Starts a conversion group that contains only `rule`.
    fn new_conversion(rule: UniConversionRule<'p>) -> Self {
        Self::Conversion(vec![rule])
    }

    /// Starts a transform group that contains only `rule`.
    fn new_transform(rule: SingleId) -> Self {
        Self::Transform(VecDeque::from([rule]))
    }

    /// Adds `rule` to this group.
    ///
    /// If `rule` has the same kind as the group, it is added and `None` is returned: conversion
    /// rules are appended, transform rules are prepended. Otherwise the group is finished:
    /// `self` is replaced by a fresh group containing only `rule`, and the finished group is
    /// returned.
    fn push(&mut self, rule: UniRule<'p>) -> Option<Self> {
        // Matching on `self` on its own only reborrows it. Matching on the tuple `(self, rule)`
        // would move the `&mut Self` into the tuple and make `self` unusable for `mem::replace`.
        match rule {
            UniRule::Conversion(rule) => match self {
                Self::Conversion(group) => {
                    group.push(rule);
                    None
                }
                Self::Transform(_) => Some(std::mem::replace(self, Self::new_conversion(rule))),
            },
            UniRule::Transform(rule) => match self {
                Self::Transform(group) => {
                    // we receive rules via `push` in source-order, which is the opposite order
                    // we want, so we push_front.
                    group.push_front(rule);
                    None
                }
                Self::Conversion(_) => Some(std::mem::replace(self, Self::new_transform(rule))),
            },
        }
    }
}
Loading

0 comments on commit 93663e4

Please sign in to comment.