Skip to content

Commit

Permalink
[Draft] Fractional MultipleOf (#3)
Browse files Browse the repository at this point in the history
  • Loading branch information
mmoskal authored Jan 5, 2025
2 parents 68f0e0a + df5abfe commit bfb30e2
Show file tree
Hide file tree
Showing 10 changed files with 240 additions and 57 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ path = "src/derivre.rs"
[features]
default = ["compress"]
# default = []
compress = []
compress = []
48 changes: 35 additions & 13 deletions src/ast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,16 @@ pub enum Expr<'a> {
NoMatch,
Byte(u8),
ByteSet(&'a [u32]),
// RemainderIs(d, r) matches numbers N where N % d == r
// RemainderIs(d, d) is equivalent to RemainderIs(d, 0) \ EmptyString
RemainderIs(u32, u32),
// RemainderIs (with fractional_part=false) matches numbers N where (N*10^scale + remainder*10^len(N)) % divisor == 0,
// with len(N) counting the digits of N before any decimal point.
// For remainder = 0, this is equivalent to N being divisible by divisor*10^-scale.
// The remainder = divisor case is the same, but it excludes the empty string.
// fractional_part = true is only for bookkeeping; it signifies that the decimal point has already been consumed.
RemainderIs {
divisor: u32,
remainder: u32,
scale: u32,
fractional_part: bool,
},
Lookahead(ExprFlags, ExprRef, u32),
Not(ExprFlags, ExprRef),
Repeat(ExprFlags, ExprRef, u32, u32),
Expand Down Expand Up @@ -187,7 +194,7 @@ impl<'a> Expr<'a> {
Expr::Lookahead(_, e, _) | Expr::Not(_, e) | Expr::Repeat(_, e, _, _) => {
std::slice::from_ref(e)
}
Expr::RemainderIs(_, _)
Expr::RemainderIs { .. }
| Expr::EmptyString
| Expr::NoMatch
| Expr::Byte(_)
Expand All @@ -199,8 +206,8 @@ impl<'a> Expr<'a> {
fn get_flags(&self) -> ExprFlags {
match self {
Expr::EmptyString => ExprFlags::POSITIVE_NULLABLE,
Expr::RemainderIs(_, k) => {
if *k == 0 {
Expr::RemainderIs { remainder, .. } => {
if *remainder == 0 {
ExprFlags::POSITIVE_NULLABLE
} else {
ExprFlags::POSITIVE
Expand Down Expand Up @@ -231,7 +238,12 @@ impl<'a> Expr<'a> {
ExprTag::ByteSet => Expr::ByteSet(&s[1..]),
ExprTag::Lookahead => Expr::Lookahead(flags, ExprRef::new(s[1]), s[2]),
ExprTag::Not => Expr::Not(flags, ExprRef::new(s[1])),
ExprTag::RemainderIs => Expr::RemainderIs(s[1], s[2]),
ExprTag::RemainderIs => Expr::RemainderIs {
divisor: s[1],
remainder: s[2],
scale: s[3],
fractional_part: s[4] != 0,
},
ExprTag::Repeat => Expr::Repeat(flags, ExprRef::new(s[1]), s[2], s[3]),
ExprTag::Concat => Expr::Concat(flags, bytemuck::cast_slice(&s[1..])),
ExprTag::Or => Expr::Or(flags, bytemuck::cast_slice(&s[1..])),
Expand All @@ -249,8 +261,19 @@ impl<'a> Expr<'a> {
match self {
Expr::EmptyString => trg.push_u32(flags.encode(ExprTag::EmptyString)),
Expr::NoMatch => trg.push_u32(flags.encode(ExprTag::NoMatch)),
Expr::RemainderIs(d, r) => {
trg.push_slice(&[flags.encode(ExprTag::RemainderIs), *d, *r]);
Expr::RemainderIs {
divisor,
remainder,
scale,
fractional_part,
} => {
trg.push_slice(&[
flags.encode(ExprTag::RemainderIs),
*divisor,
*remainder,
*scale,
*fractional_part as u32,
]);
}
Expr::Byte(b) => {
trg.push_slice(&[flags.encode(ExprTag::Byte), *b as u32]);
Expand Down Expand Up @@ -279,6 +302,7 @@ pub struct ExprSet {
pub(crate) alphabet_size: usize,
pub(crate) alphabet_words: usize,
pub(crate) digits: [u8; 10],
pub(crate) digit_dot: u8,
pub(crate) cost: u64,
pp: PrettyPrinter,
pub(crate) optimize: bool,
Expand All @@ -293,10 +317,8 @@ impl ExprSet {
exprs,
alphabet_size,
alphabet_words,
digits: [
'0' as u8, '1' as u8, '2' as u8, '3' as u8, '4' as u8, '5' as u8, '6' as u8,
'7' as u8, '8' as u8, '9' as u8,
],
digits: [b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9'],
digit_dot: b'.',
cost: 0,
pp: PrettyPrinter::new_simple(alphabet_size),
optimize: true,
Expand Down
19 changes: 15 additions & 4 deletions src/bytecompress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,12 @@ impl ByteCompressor {
Expr::Lookahead(_, _, x) => trg.mk_lookahead(args[0], x),
Expr::Not(_, _) => trg.mk_not(args[0]),
Expr::Repeat(_, _, x, y) => trg.mk_repeat(args[0], x, y),
Expr::RemainderIs(a, b) => trg.mk_remainder_is(a, b),
Expr::RemainderIs {
divisor,
remainder,
scale,
fractional_part,
} => trg.mk_remainder_is(divisor, remainder, scale, fractional_part),
Expr::Concat(_, _) => trg.mk_concat(&mut args),
Expr::Or(_, _) => trg.mk_or(&mut args),
Expr::And(_, _) => trg.mk_and(&mut args),
Expand Down Expand Up @@ -102,10 +107,15 @@ impl ByteCompressor {
match exprset.get(e) {
Expr::Byte(b) => self.add_single_byte(b),
Expr::ByteSet(bs) => self.bytesets.push(bs.to_vec()),
Expr::RemainderIs(_, _) => {
Expr::RemainderIs { scale, .. } => {
for b in exprset.digits {
self.add_single_byte(b);
}
// if scale==0 then it will only match integers
// and we don't need to distinguish the dot
if scale > 0 {
self.add_single_byte(exprset.digit_dot);
}
}
_ => {}
}
Expand All @@ -122,9 +132,10 @@ impl ByteCompressor {
}

let mut trg = ExprSet::new(self.alphabet_size);
for digit in 0..=9 {
trg.digits[digit] = self.mapping['0' as usize + digit as usize];
for digit in 0..trg.digits.len() {
trg.digits[digit] = self.mapping[exprset.digits[digit] as usize];
}
trg.digit_dot = self.mapping[exprset.digit_dot as usize];
let res_exprs: Vec<ExprRef> = rx_list
.iter()
.map(|&e| self.map_expr(&mut trg, exprset, e))
Expand Down
25 changes: 23 additions & 2 deletions src/deriv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,30 @@ impl DerivCache {
ExprRef::NO_MATCH
}
}
Expr::RemainderIs(d, r) => {
Expr::RemainderIs {
divisor,
remainder,
scale,
fractional_part,
} => {
if let Some(idx) = exprs.digits.iter().position(|&x| x == b) {
exprs.mk_remainder_is(d, (r * 10 + idx as u32) % d)
let (remainder, scale) = if !fractional_part {
(remainder * 10, scale)
} else {
if scale == 0 {
// Dead code?
return ExprRef::NO_MATCH;
}
(remainder, scale - 1)
};
exprs.mk_remainder_is(
divisor,
(remainder + (idx as u32) * 10_u32.pow(scale)) % divisor,
scale,
fractional_part,
)
} else if b == exprs.digit_dot && !fractional_part && scale > 0 {
exprs.mk_remainder_is(divisor, remainder, scale, true)
} else {
ExprRef::NO_MATCH
}
Expand Down
2 changes: 1 addition & 1 deletion src/nextbyte.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ pub(crate) fn next_byte_simple(exprs: &ExprSet, mut r: ExprRef) -> NextByte {
Expr::Byte(b) => return NextByte::ForcedByte(b),
Expr::And(_, _) => return NextByte::SomeBytes,
Expr::Not(_, _) => return NextByte::SomeBytes,
Expr::RemainderIs(_, _) => return NextByte::SomeBytes,
Expr::RemainderIs { .. } => return NextByte::SomeBytes,
Expr::Lookahead(_, e, _) => {
r = e;
}
Expand Down
6 changes: 5 additions & 1 deletion src/pp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,11 @@ impl PrettyPrinter {
write!(f, "{{{}, {}}}", min, max)
}
}
Expr::RemainderIs(a, b) => write!(f, "( % {} == {} )", a, b),
Expr::RemainderIs {
divisor, remainder, ..
} => {
write!(f, "( % {} == {} )", divisor, remainder)
}
Expr::Concat(_, es) => self.write_concat(exprset, es, f, max_len),
Expr::Or(_, es) => self.write_exprs(exprset, " | ", es, f, max_len),
Expr::And(_, es) => self.write_exprs(exprset, " & ", es, f, max_len),
Expand Down
22 changes: 13 additions & 9 deletions src/regexbuilder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,9 @@ pub enum RegexAst {
/// Repeat the regex at least min times, at most max times
/// u32::MAX means infinity
Repeat(Box<RegexAst>, u32, u32),
/// MultipleOf(d) matches if the input, interpreted as decimal ASCII number, is a multiple of d.
/// MultipleOf(d, s) matches if the input, interpreted as a decimal ASCII number, is a multiple of d*10^-s.
/// EmptyString is not included.
MultipleOf(u32),
MultipleOf(u32, u32),
/// Matches the empty string. Same as Concat([]).
EmptyString,
/// Matches nothing. Same as Or([]).
Expand Down Expand Up @@ -120,7 +120,7 @@ impl RegexAst {
| RegexAst::Repeat(ast, _, _)
| RegexAst::JsonQuote(ast, _) => std::slice::from_ref(ast),
RegexAst::EmptyString
| RegexAst::MultipleOf(_)
| RegexAst::MultipleOf(_, _)
| RegexAst::NoMatch
| RegexAst::Regex(_)
| RegexAst::Literal(_)
Expand All @@ -147,7 +147,7 @@ impl RegexAst {
RegexAst::Repeat(_, _, _) => "Repeat",
RegexAst::Byte(_) => "Byte",
RegexAst::ByteSet(_) => "ByteSet",
RegexAst::MultipleOf(_) => "MultipleOf",
RegexAst::MultipleOf(_, _) => "MultipleOf",
RegexAst::JsonQuote(_, _) => "JsonQuote",
}
}
Expand Down Expand Up @@ -202,8 +202,12 @@ impl RegexAst {
RegexAst::Repeat(_, min, max) => {
dst.push_str(&format!("{{{},{}}} ", min, max));
}
RegexAst::MultipleOf(d) => {
dst.push_str(&format!(" % {} == 0 ", d));
RegexAst::MultipleOf(d, s) => {
if *s == 0 {
dst.push_str(&format!(" % {} == 0 ", d));
} else {
dst.push_str(&format!(" % {}x10^-{} == 0", d, s));
}
}
RegexAst::JsonQuote(_, opts) => {
dst.push_str(&format!(" {:?}", opts));
Expand Down Expand Up @@ -420,7 +424,7 @@ impl RegexBuilder {
}
}
// always identity
Expr::EmptyString | Expr::NoMatch | Expr::RemainderIs(_, _) => e,
Expr::EmptyString | Expr::NoMatch | Expr::RemainderIs { .. } => e,
// if all args map to themselves, return back the same expression
x if x.args() == args => e,
// otherwise, actually map the args
Expand Down Expand Up @@ -497,9 +501,9 @@ impl RegexBuilder {
RegexAst::Repeat(_, min, max) => {
self.exprset.mk_repeat(new_args[0], *min, *max)
}
RegexAst::MultipleOf(d) => {
RegexAst::MultipleOf(d, s) => {
ensure!(*d > 0, "invalid multiple of");
self.exprset.mk_remainder_is(*d, *d)
self.exprset.mk_remainder_is(*d, *d, *s, false)
}
RegexAst::Byte(b) => self.exprset.mk_byte(*b),
RegexAst::ByteSet(bs) => {
Expand Down
43 changes: 34 additions & 9 deletions src/relevance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,15 +153,40 @@ impl RelevanceCache {
// just unwrap lookaheads
Expr::Lookahead(_, _, _) => deriv.pop().unwrap(),

Expr::RemainderIs(d, r) => (0..10)
.map(|i| {
(
exprs.mk_byte(exprs.digits[i]),
exprs.mk_remainder_is(d, (r * 10 + i as u32) % d),
)
})
.collect(),

Expr::RemainderIs {
divisor,
remainder,
scale,
fractional_part,
} => {
let mut result = vec![];
for i in 0..10 {
let b = exprs.mk_byte(exprs.digits[i]);
let (remainder, scale) = if !fractional_part {
(remainder * 10, scale)
} else {
if scale == 0 {
// Dead code?
result.push((b, ExprRef::NO_MATCH));
continue;
}
(remainder, scale - 1)
};
let r = exprs.mk_remainder_is(
divisor,
(remainder + (i as u32) * 10_u32.pow(scale)) % divisor,
scale,
fractional_part,
);
result.push((b, r));
}
if !fractional_part && scale > 0 {
let b = exprs.mk_byte(exprs.digit_dot);
let r = exprs.mk_remainder_is(divisor, remainder, scale, true);
result.push((b, r));
}
result
}
Expr::And(_, _) => {
let mut acc = deriv.pop().unwrap();
while let Some(other) = deriv.pop() {
Expand Down
53 changes: 49 additions & 4 deletions src/simplify.rs
Original file line number Diff line number Diff line change
Expand Up @@ -384,10 +384,55 @@ impl ExprSet {
}
}

pub fn mk_remainder_is(&mut self, d: u32, r: u32) -> ExprRef {
assert!(d > 0);
assert!(r <= d);
self.mk(Expr::RemainderIs(d, r))
/// Creates a `RemainderIs` expression, simplifying it when the outcome is already determined.
///
/// * `divisor` - the modulus; must be non-zero.
/// * `remainder` - the remainder accumulated so far; must be `<= divisor`.
/// * `scale` - the number of decimal places still allowed (the exponent in `10^-scale`).
/// * `fractional_part` - `true` once the decimal point has been consumed.
///
/// While `fractional_part` is `false` the node is always created as-is. In the fractional
/// part the result may instead be:
/// * `ExprRef::EMPTY_STRING` when no decimal places remain and the remainder is zero;
/// * `ExprRef::NO_MATCH` when the remaining digits cannot make up the missing remainder;
/// * a byte literal of exactly `scale` digits when the remaining digits are fully forced.
///
/// # Panics
///
/// Panics if `divisor == 0` or `remainder > divisor`.
pub fn mk_remainder_is(
    &mut self,
    divisor: u32,
    remainder: u32,
    scale: u32,
    fractional_part: bool,
) -> ExprRef {
    assert!(divisor > 0);
    assert!(remainder <= divisor);
    // NOTE(review): presumably accounts for the work in the `cost` budget - confirm against `pay`.
    self.pay(1);

    if fractional_part {
        if scale == 0 && remainder == 0 {
            // No digits left to read and nothing left to make up: we're done.
            return ExprRef::EMPTY_STRING;
        }

        // Amount the remaining digits still have to contribute (mod divisor).
        let remainder_to_go = (divisor - remainder) % divisor;

        // `checked_pow` returns `None` when 10^scale does not fit in a u32 (scale >= 10).
        // In that case 10^scale is larger than both `divisor` and `remainder_to_go`, so
        // neither simplification below applies and we fall through to the generic node.
        // Plain `pow` would panic in debug builds and wrap around in release builds.
        if let Some(scale_multiplier) = 10u32.checked_pow(scale) {
            if remainder_to_go >= scale_multiplier {
                // The remaining `scale` digits can contribute at most 10^scale - 1.
                return ExprRef::NO_MATCH;
            }
            if scale_multiplier <= divisor {
                // Once 10^scale is no larger than the divisor, the remaining digits must
                // equal `remainder_to_go` exactly, so we can force them.
                let forced_digits =
                    format!("{:0>width$}", remainder_to_go, width = scale as usize);
                // TODO: trim trailing zeros?
                // Translate ASCII digits into this set's (possibly compressed) digit bytes.
                let mapped: Vec<u8> = forced_digits
                    .bytes()
                    .map(|b| self.digits[(b - b'0') as usize])
                    .collect();
                return self.mk_byte_literal(&mapped);
            }
        }
    }

    self.mk(Expr::RemainderIs {
        divisor,
        remainder,
        scale,
        fractional_part,
    })
}

// this avoids allocation when hitting the hash-cons
Expand Down
Loading

0 comments on commit bfb30e2

Please sign in to comment.