Skip to content

Commit

Permalink
Merge pull request #2 from microsoft/reminder
Browse files Browse the repository at this point in the history
Add RemainderIs operator
  • Loading branch information
mmoskal authored Dec 31, 2024
2 parents a629ecc + a72d469 commit 46023cb
Show file tree
Hide file tree
Showing 9 changed files with 131 additions and 12 deletions.
32 changes: 30 additions & 2 deletions src/ast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ pub enum Expr<'a> {
NoMatch,
Byte(u8),
ByteSet(&'a [u32]),
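// RemainderIs(d, r) is the state after reading a decimal prefix whose value is r modulo d;
// it matches digit strings S such that that prefix followed by S is a multiple of d
// (each digit c moves r to (r * 10 + c) % d, and only r == 0 accepts).
// RemainderIs(d, d) is equivalent to RemainderIs(d, 0) \ EmptyString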
RemainderIs(u32, u32),
Lookahead(ExprFlags, ExprRef, u32),
Not(ExprFlags, ExprRef),
Repeat(ExprFlags, ExprRef, u32, u32),
Expand Down Expand Up @@ -92,6 +95,7 @@ pub enum ExprTag {
NoMatch,
Byte,
ByteSet,
RemainderIs,
Lookahead,
Not,
Repeat,
Expand Down Expand Up @@ -183,14 +187,25 @@ impl<'a> Expr<'a> {
Expr::Lookahead(_, e, _) | Expr::Not(_, e) | Expr::Repeat(_, e, _, _) => {
std::slice::from_ref(e)
}
Expr::EmptyString | Expr::NoMatch | Expr::Byte(_) | Expr::ByteSet(_) => &[],
Expr::RemainderIs(_, _)
| Expr::EmptyString
| Expr::NoMatch
| Expr::Byte(_)
| Expr::ByteSet(_) => &[],
}
}

#[inline]
fn get_flags(&self) -> ExprFlags {
match self {
Expr::EmptyString => ExprFlags::POSITIVE_NULLABLE,
Expr::RemainderIs(_, k) => {
if *k == 0 {
ExprFlags::POSITIVE_NULLABLE
} else {
ExprFlags::POSITIVE
}
}
Expr::NoMatch => ExprFlags::ZERO,
Expr::Byte(_) | Expr::ByteSet(_) => ExprFlags::POSITIVE,
Expr::Lookahead(f, _, _) => *f,
Expand All @@ -216,6 +231,7 @@ impl<'a> Expr<'a> {
ExprTag::ByteSet => Expr::ByteSet(&s[1..]),
ExprTag::Lookahead => Expr::Lookahead(flags, ExprRef::new(s[1]), s[2]),
ExprTag::Not => Expr::Not(flags, ExprRef::new(s[1])),
ExprTag::RemainderIs => Expr::RemainderIs(s[1], s[2]),
ExprTag::Repeat => Expr::Repeat(flags, ExprRef::new(s[1]), s[2], s[3]),
ExprTag::Concat => Expr::Concat(flags, bytemuck::cast_slice(&s[1..])),
ExprTag::Or => Expr::Or(flags, bytemuck::cast_slice(&s[1..])),
Expand All @@ -233,6 +249,9 @@ impl<'a> Expr<'a> {
match self {
Expr::EmptyString => trg.push_u32(flags.encode(ExprTag::EmptyString)),
Expr::NoMatch => trg.push_u32(flags.encode(ExprTag::NoMatch)),
Expr::RemainderIs(d, r) => {
trg.push_slice(&[flags.encode(ExprTag::RemainderIs), *d, *r]);
}
Expr::Byte(b) => {
trg.push_slice(&[flags.encode(ExprTag::Byte), *b as u32]);
}
Expand All @@ -259,6 +278,7 @@ pub struct ExprSet {
exprs: VecHashCons,
pub(crate) alphabet_size: usize,
pub(crate) alphabet_words: usize,
pub(crate) digits: [u8; 10],
pub(crate) cost: u64,
pp: PrettyPrinter,
pub(crate) optimize: bool,
Expand All @@ -273,6 +293,10 @@ impl ExprSet {
exprs,
alphabet_size,
alphabet_words,
digits: [
'0' as u8, '1' as u8, '2' as u8, '3' as u8, '4' as u8, '5' as u8, '6' as u8,
'7' as u8, '8' as u8, '9' as u8,
],
cost: 0,
pp: PrettyPrinter::new_simple(alphabet_size),
optimize: true,
Expand Down Expand Up @@ -439,7 +463,11 @@ impl ExprSet {
match tag {
ExprTag::Concat | ExprTag::Or | ExprTag::And => bytemuck::cast_slice(&s[1..]),
ExprTag::Not | ExprTag::Repeat | ExprTag::Lookahead => bytemuck::cast_slice(&s[1..2]),
ExprTag::EmptyString | ExprTag::NoMatch | ExprTag::Byte | ExprTag::ByteSet => &[],
ExprTag::RemainderIs
| ExprTag::EmptyString
| ExprTag::NoMatch
| ExprTag::Byte
| ExprTag::ByteSet => &[],
}
}

Expand Down
27 changes: 17 additions & 10 deletions src/bytecompress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ impl ByteCompressor {
Expr::Lookahead(_, _, x) => trg.mk_lookahead(args[0], x),
Expr::Not(_, _) => trg.mk_not(args[0]),
Expr::Repeat(_, _, x, y) => trg.mk_repeat(args[0], x, y),
Expr::RemainderIs(a, b) => trg.mk_remainder_is(a, b),
Expr::Concat(_, _) => trg.mk_concat(&mut args),
Expr::Or(_, _) => trg.mk_or(&mut args),
Expr::And(_, _) => trg.mk_and(&mut args),
Expand All @@ -80,6 +81,13 @@ impl ByteCompressor {
self.map_cache[&e]
}

/// Gives byte `b` its own entry in `mapping` if it does not have one yet.
///
/// A byte whose entry is still `INVALID_MAPPING` is assigned the current
/// `alphabet_size` (cast to `u8`), and `alphabet_size` is then bumped by one.
/// Bytes that already have an entry are left untouched, so calling this
/// repeatedly with the same byte is harmless.
fn add_single_byte(&mut self, b: u8) {
    let idx = usize::from(b);
    if self.mapping[idx] != INVALID_MAPPING {
        // Already mapped; nothing to do.
        return;
    }
    self.mapping[idx] = self.alphabet_size as u8;
    self.alphabet_size += 1;
}

pub fn compress(&mut self, exprset: &ExprSet, rx_list: &[ExprRef]) -> (ExprSet, Vec<ExprRef>) {
self.mapping = vec![INVALID_MAPPING; exprset.alphabet_size()];

Expand All @@ -92,16 +100,12 @@ impl ByteCompressor {
visited[e.as_usize()] = true;
todo.extend_from_slice(exprset.get_args(e));
match exprset.get(e) {
Expr::Byte(b) => {
assert!(
self.mapping[b as usize] == INVALID_MAPPING,
"visiting the same byte the second time"
);
self.mapping[b as usize] = self.alphabet_size as u8;
self.alphabet_size += 1;
}
Expr::ByteSet(bs) => {
self.bytesets.push(bs.to_vec());
Expr::Byte(b) => self.add_single_byte(b),
Expr::ByteSet(bs) => self.bytesets.push(bs.to_vec()),
Expr::RemainderIs(_, _) => {
for b in exprset.digits {
self.add_single_byte(b);
}
}
_ => {}
}
Expand All @@ -118,6 +122,9 @@ impl ByteCompressor {
}

let mut trg = ExprSet::new(self.alphabet_size);
for digit in 0..=9 {
trg.digits[digit] = self.mapping['0' as usize + digit as usize];
}
let res_exprs: Vec<ExprRef> = rx_list
.iter()
.map(|&e| self.map_expr(&mut trg, exprset, e))
Expand Down
7 changes: 7 additions & 0 deletions src/deriv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,13 @@ impl DerivCache {
ExprRef::NO_MATCH
}
}
Expr::RemainderIs(d, r) => {
if let Some(idx) = exprs.digits.iter().position(|&x| x == b) {
exprs.mk_remainder_is(d, (r * 10 + idx as u32) % d)
} else {
ExprRef::NO_MATCH
}
}
Expr::And(_, _) => exprs.mk_and(deriv),
Expr::Or(_, _) => exprs.mk_or(deriv),
Expr::Not(_, _) => exprs.mk_not(deriv[0]),
Expand Down
1 change: 1 addition & 0 deletions src/nextbyte.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ pub(crate) fn next_byte_simple(exprs: &ExprSet, mut r: ExprRef) -> NextByte {
Expr::Byte(b) => return NextByte::ForcedByte(b),
Expr::And(_, _) => return NextByte::SomeBytes,
Expr::Not(_, _) => return NextByte::SomeBytes,
Expr::RemainderIs(_, _) => return NextByte::SomeBytes,
Expr::Lookahead(_, e, _) => {
r = e;
}
Expand Down
1 change: 1 addition & 0 deletions src/pp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ impl PrettyPrinter {
write!(f, "{{{}, {}}}", min, max)
}
}
Expr::RemainderIs(a, b) => write!(f, "( % {} == {} )", a, b),
Expr::Concat(_, es) => self.write_concat(exprset, es, f, max_len),
Expr::Or(_, es) => self.write_exprs(exprset, " | ", es, f, max_len),
Expr::And(_, es) => self.write_exprs(exprset, " & ", es, f, max_len),
Expand Down
13 changes: 13 additions & 0 deletions src/regexbuilder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ pub enum RegexAst {
/// Repeat the regex at least min times, at most max times
/// u32::MAX means infinity
Repeat(Box<RegexAst>, u32, u32),
/// MultipleOf(d) matches if the input, interpreted as decimal ASCII number, is a multiple of d.
/// EmptyString is not included.
MultipleOf(u32),
/// Matches the empty string. Same as Concat([]).
EmptyString,
/// Matches nothing. Same as Or([]).
Expand Down Expand Up @@ -111,6 +114,7 @@ impl RegexAst {
std::slice::from_ref(ast)
}
RegexAst::EmptyString
| RegexAst::MultipleOf(_)
| RegexAst::NoMatch
| RegexAst::Regex(_)
| RegexAst::Literal(_)
Expand All @@ -137,6 +141,7 @@ impl RegexAst {
RegexAst::Repeat(_, _, _) => "Repeat",
RegexAst::Byte(_) => "Byte",
RegexAst::ByteSet(_) => "ByteSet",
RegexAst::MultipleOf(_) => "MultipleOf",
}
}

Expand Down Expand Up @@ -190,6 +195,9 @@ impl RegexAst {
RegexAst::Repeat(_, min, max) => {
dst.push_str(&format!("{{{},{}}} ", min, max));
}
RegexAst::MultipleOf(d) => {
dst.push_str(&format!(" % {} == 0 ", d));
}
RegexAst::EmptyString | RegexAst::NoMatch => {}
}
for c in ast.get_args().iter().rev() {
Expand Down Expand Up @@ -397,6 +405,7 @@ impl RegexBuilder {
exprset.mk_byte(b)
}
}
Expr::RemainderIs(a, b) => exprset.mk_remainder_is(a, b),
Expr::And(_, _) => exprset.mk_and(args),
Expr::Or(_, _) => exprset.mk_or(args),
Expr::Concat(_, _) => exprset.mk_concat(args),
Expand Down Expand Up @@ -468,6 +477,10 @@ impl RegexBuilder {
RegexAst::Repeat(_, min, max) => {
self.exprset.mk_repeat(new_args[0], *min, *max)
}
RegexAst::MultipleOf(d) => {
ensure!(*d > 0, "invalid multiple of");
self.exprset.mk_remainder_is(*d, *d)
}
RegexAst::Byte(b) => self.exprset.mk_byte(*b),
RegexAst::ByteSet(bs) => {
ensure!(
Expand Down
9 changes: 9 additions & 0 deletions src/relevance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,15 @@ impl RelevanceCache {
// just unwrap lookaheads
Expr::Lookahead(_, _, _) => deriv.pop().unwrap(),

Expr::RemainderIs(d, r) => (0..10)
.map(|i| {
(
exprs.mk_byte(exprs.digits[i]),
exprs.mk_remainder_is(d, (r * 10 + i as u32) % d),
)
})
.collect(),

Expr::And(_, _) => {
let mut acc = deriv.pop().unwrap();
while let Some(other) = deriv.pop() {
Expand Down
6 changes: 6 additions & 0 deletions src/simplify.rs
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,12 @@ impl ExprSet {
}
}

/// Builds an `Expr::RemainderIs(d, r)` node and passes it to `self.mk`.
///
/// # Panics
///
/// Panics if `d` is zero or if `r` is greater than `d`. Note that `r == d`
/// is accepted.
pub fn mk_remainder_is(&mut self, d: u32, r: u32) -> ExprRef {
    assert!(d > 0);
    assert!(r <= d);
    let node = Expr::RemainderIs(d, r);
    self.mk(node)
}

// this avoids allocation when hitting the hash-cons
pub(crate) fn mk_and2(&mut self, a: ExprRef, b: ExprRef) -> ExprRef {
self.pay(2);
Expand Down
47 changes: 47 additions & 0 deletions tests/emptiness.rs
Original file line number Diff line number Diff line change
Expand Up @@ -204,3 +204,50 @@ fn test_prefixes_except() {

check_contains_prefixes_except(r"[a-z]{0,5}", "[a-zB]{0,6}Q", r#"(foo|bar)M"#);
}

#[test]
fn test_multiple_of() {
    // For each divisor, compile a fresh `MultipleOf(d)` regex and compare its
    // verdict on the decimal rendering of t with plain `t % d == 0`.
    for d in 1..=300 {
        let mut builder = RegexBuilder::new();
        let expr_id = builder.mk(&RegexAst::MultipleOf(d)).unwrap();
        let mut rx = builder.to_regex(expr_id);

        // The empty string and "-1" must both be rejected.
        assert!(!rx.is_match(""));
        assert!(!rx.is_match("-1"));

        let upper = 7 * d;
        for t in 0..upper {
            let actual = rx.is_match(&t.to_string());
            let expected = t % d == 0;
            if actual != expected {
                panic!("{} % {} == {}", t, d, t % d);
            }
        }
    }
}

/// Builds the regex `other_rx & MultipleOf(d)` and panics unless the result of
/// `always_empty()` on it equals `should_be_empty`.
fn remainder_is_check(should_be_empty: bool, d: u32, other_rx: &str) {
    let mut bld = RegexBuilder::new();

    // Intersection of the caller-supplied regex with "is a multiple of d".
    let conjunction = RegexAst::And(vec![
        RegexAst::Regex(other_rx.to_string()),
        RegexAst::MultipleOf(d),
    ]);
    let id = bld.mk(&conjunction).unwrap();

    let mut rx = bld.to_regex(id);
    let is_empty = rx.always_empty();
    if is_empty == should_be_empty {
        return;
    }
    panic!("empty({} % & {:?}) != {}", d, other_rx, should_be_empty);
}

/// Checks that `other_rx & MultipleOf(d)` is reported as always empty
/// (delegates to `remainder_is_check` with `should_be_empty = true`).
fn remainder_is_empty(d: u32, other_rx: &str) {
    let should_be_empty = true;
    remainder_is_check(should_be_empty, d, other_rx);
}

/// Checks that `other_rx & MultipleOf(d)` is not reported as always empty
/// (delegates to `remainder_is_check` with `should_be_empty = false`).
fn remainder_is_non_empty(d: u32, other_rx: &str) {
    let should_be_empty = false;
    remainder_is_check(should_be_empty, d, other_rx);
}

#[test]
fn test_remainder_is_relevance() {
    // (divisor, regex) pairs whose intersection with MultipleOf(divisor)
    // must not be reported as always empty.
    for (d, rx) in [(2, "[0-9]+"), (3, "[2]+")] {
        remainder_is_non_empty(d, rx);
    }

    // (divisor, regex) pairs whose intersection with MultipleOf(divisor)
    // must be reported as always empty.
    for (d, rx) in [(3, "[a-z]*"), (2, "[3579]+")] {
        remainder_is_empty(d, rx);
    }
}

0 comments on commit 46023cb

Please sign in to comment.