Skip to content

Commit

Permalink
start on relevance (emptiness) checks
Browse files Browse the repository at this point in the history
  • Loading branch information
mmoskal committed Jul 17, 2024
1 parent 8cb3b0b commit b0b9630
Show file tree
Hide file tree
Showing 4 changed files with 332 additions and 50 deletions.
66 changes: 40 additions & 26 deletions src/ast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,17 +51,30 @@ pub enum Expr<'a> {
pub struct ExprFlags(u32);
impl ExprFlags {
pub const NULLABLE: ExprFlags = ExprFlags(1 << 8);
pub const POSITIVE: ExprFlags = ExprFlags(1 << 9);
pub const ZERO: ExprFlags = ExprFlags(0);

pub const POSITIVE_NULLABLE: ExprFlags =
ExprFlags(ExprFlags::POSITIVE.0 | ExprFlags::NULLABLE.0);

pub fn is_nullable(&self) -> bool {
self.0 & ExprFlags::NULLABLE.0 != 0
}

pub fn from_nullable(nullable: bool) -> Self {
pub fn is_positive(&self) -> bool {
self.0 & ExprFlags::POSITIVE.0 != 0
}

pub fn from_nullable_positive(nullable: bool, positive: bool) -> Self {
if nullable {
Self::NULLABLE
// anything nullable is also positive
Self::POSITIVE_NULLABLE
} else {
Self::ZERO
if positive {
Self::POSITIVE
} else {
Self::ZERO
}
}
}

Expand Down Expand Up @@ -105,6 +118,11 @@ pub fn byteset_set(s: &mut [u32], b: usize) {
s[b / 32] |= 1 << (b % 32);
}

/// Clears bit `b` in the bitset `s`, which stores 32 bits per `u32` word.
///
/// Panics if `b / 32` is out of bounds for `s`.
#[inline(always)]
pub fn byteset_clear(s: &mut [u32], b: usize) {
    let (word, bit) = (b / 32, b % 32);
    s[word] &= !(1u32 << bit);
}

#[inline(always)]
pub fn byteset_set_range(s: &mut [u32], range: RangeInclusive<u8>) {
for elt in range {
Expand All @@ -119,6 +137,13 @@ pub fn byteset_union(s: &mut [u32], other: &[u32]) {
}
}

/// Intersects `s` with `other` in place, word by word.
///
/// Every word of `s` is AND-ed with the word of `other` at the same index;
/// indexing `other` panics if it is shorter than `s`.
#[inline(always)]
pub fn byteset_intersection(s: &mut [u32], other: &[u32]) {
    for (idx, word) in s.iter_mut().enumerate() {
        *word &= other[idx];
    }
}

/// Returns an all-zero bitset with room for 256 bits (eight `u32` words).
pub fn byteset_256() -> Vec<u32> {
    const WORDS: usize = 256 / 32;
    vec![0u32; WORDS]
}
Expand Down Expand Up @@ -150,10 +175,12 @@ impl<'a> Expr<'a> {
}
}

#[inline]
fn get_flags(&self) -> ExprFlags {
match self {
Expr::EmptyString => ExprFlags::NULLABLE,
Expr::NoMatch | Expr::Byte(_) | Expr::ByteSet(_) => ExprFlags::ZERO,
Expr::EmptyString => ExprFlags::POSITIVE_NULLABLE,
Expr::NoMatch => ExprFlags::ZERO,
Expr::Byte(_) | Expr::ByteSet(_) => ExprFlags::POSITIVE,
Expr::Lookahead(f, _, _) => *f,
Expr::Not(f, _) => *f,
Expr::Repeat(f, _, _, _) => *f,
Expand Down Expand Up @@ -190,15 +217,15 @@ impl<'a> Expr<'a> {
trg.push_u32(tag);
trg.push_slice(bytemuck::cast_slice(es));
}
let zf = ExprFlags::ZERO;
let flags = self.get_flags();
match self {
Expr::EmptyString => trg.push_u32(zf.encode(ExprTag::EmptyString)),
Expr::NoMatch => trg.push_u32(zf.encode(ExprTag::NoMatch)),
Expr::EmptyString => trg.push_u32(flags.encode(ExprTag::EmptyString)),
Expr::NoMatch => trg.push_u32(flags.encode(ExprTag::NoMatch)),
Expr::Byte(b) => {
trg.push_slice(&[zf.encode(ExprTag::Byte), *b as u32]);
trg.push_slice(&[flags.encode(ExprTag::Byte), *b as u32]);
}
Expr::ByteSet(s) => {
trg.push_u32(zf.encode(ExprTag::ByteSet));
trg.push_u32(flags.encode(ExprTag::ByteSet));
trg.push_slice(s);
}
Expr::Lookahead(flags, e, n) => {
Expand Down Expand Up @@ -248,21 +275,11 @@ impl ExprSet {
ExprRef::ANY_BYTE,
),
(
r.mk(Expr::Repeat(
ExprFlags::NULLABLE,
ExprRef::ANY_BYTE,
0,
u32::MAX,
)),
r.mk_repeat(ExprRef::ANY_BYTE, 0, u32::MAX),
ExprRef::ANY_STRING,
),
(
r.mk(Expr::Repeat(
ExprFlags::ZERO,
ExprRef::ANY_BYTE,
1,
u32::MAX,
)),
r.mk_repeat(ExprRef::ANY_BYTE, 1, u32::MAX),
ExprRef::NON_EMPTY_STRING,
),
];
Expand Down Expand Up @@ -367,11 +384,8 @@ impl ExprSet {
}
}

fn get_flags(&self, id: ExprRef) -> ExprFlags {
pub fn get_flags(&self, id: ExprRef) -> ExprFlags {
assert!(id.is_valid());
if id == ExprRef::EMPTY_STRING {
return ExprFlags::NULLABLE;
}
ExprFlags(self.exprs.get(id.0)[0] & !0xff)
}

Expand Down
4 changes: 3 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@ mod ast;
mod bytecompress;
mod mapper;
mod pp;
mod regexbuilder;
mod regex;
mod regexbuilder;
mod relevance;
mod simplify;
mod syntax;

Expand All @@ -23,4 +24,5 @@ pub mod raw {
pub use super::deriv::DerivCache;
pub use super::hashcons::VecHashCons;
pub use super::nextbyte::NextByteCache;
pub use super::relevance::RelevanceCache;
}
190 changes: 190 additions & 0 deletions src/relevance.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
use std::collections::HashMap;

use crate::ast::{Expr, ExprRef, ExprSet};

// This is a map (ByteSet => RegExp); ByteSet is Expr::Byte or Expr::ByteSet,
// and expresses the condition; RegExp is the derivative under that condition.
// Conditions can overlap, but should not repeat.
type SymRes = Vec<(ExprRef, ExprRef)>;

/// Caches backing the relevance checks implemented in this file.
#[derive(Clone)]
pub struct RelevanceCache {
    // Answers already computed by `is_relevant`, keyed by expression.
    relevance_cache: HashMap<ExprRef, bool>,
    // Handed to `ExprSet::map` by `deriv`; presumably memoizes the symbolic
    // derivative of each visited expression -- confirm against `ExprSet::map`.
    sym_deriv: HashMap<ExprRef, SymRes>,
}

/// Sorts `s` and collapses every run of pairs that share the same first
/// component into a single pair.
///
/// A run of length one is kept unchanged. For a longer run, the second
/// components (in sorted order, duplicates included) are handed to `f`, and
/// its result becomes the second component of the merged pair. The output is
/// ordered by first component.
fn group_by_first<A: PartialEq + Ord + Copy, B: Ord + Copy>(
    mut s: Vec<(A, B)>,
    mut f: impl FnMut(Vec<B>) -> B,
) -> Vec<(A, B)> {
    s.sort_unstable();
    let mut grouped = Vec::new();
    let mut rest = s.as_slice();
    while let Some(&(key, first)) = rest.first() {
        // The head always belongs to its own run; extend over equal keys.
        let run_len = 1 + rest[1..]
            .iter()
            .take_while(|entry| key == entry.0)
            .count();
        let (run, tail) = rest.split_at(run_len);
        let merged = match run {
            [_] => first,
            _ => f(run.iter().map(|&(_, second)| second).collect()),
        };
        grouped.push((key, merged));
        rest = tail;
    }
    grouped
}

/// Normalizes a symbolic derivative in two grouping passes.
///
/// First, entries that share a condition are merged by combining their
/// results with `mk_or`. Then, entries that share a result are merged by
/// combining their conditions with `mk_byte_set_or`.
fn simplify(exprs: &mut ExprSet, s: SymRes) -> SymRes {
    // Pass 1: keyed on the condition.
    let by_condition = group_by_first(s, |results| exprs.mk_or(results));

    // Pass 2: swap each pair so the result becomes the grouping key.
    let swapped: SymRes = by_condition
        .into_iter()
        .map(|(cond, res)| (res, cond))
        .collect();
    let by_result = group_by_first(swapped, |conds| exprs.mk_byte_set_or(&conds));

    // Swap back to (condition, result) order.
    by_result
        .into_iter()
        .map(|(res, cond)| (cond, res))
        .collect()
}

impl RelevanceCache {
    /// Creates a cache with no stored results.
    pub fn new() -> Self {
        RelevanceCache {
            relevance_cache: HashMap::default(),
            sym_deriv: HashMap::default(),
        }
    }

    /// Rough size estimate: three machine words per `relevance_cache` entry.
    /// Entries of `sym_deriv` are not included in the count.
    pub fn num_bytes(&self) -> usize {
        self.relevance_cache.len() * 3 * std::mem::size_of::<isize>()
    }

    /// Computes the symbolic derivative of `e` as a list of
    /// `(condition, result)` pairs, where the condition is a byte or byte-set
    /// expression and the result is the derivative under that condition.
    ///
    /// The work is delegated to `ExprSet::map` with `self.sym_deriv` as its
    /// table; judging by how the closure uses it, `deriv` holds the already
    /// computed derivatives of the node's arguments, in argument order.
    fn deriv(&mut self, exprs: &mut ExprSet, e: ExprRef) -> SymRes {
        exprs.map(
            e,
            &mut self.sym_deriv,
            |e| e,
            |exprs, mut deriv, e| match exprs.get(e) {
                // Nothing can be consumed from these: no branches at all.
                Expr::EmptyString => vec![],
                Expr::NoMatch => vec![],
                // The expression is its own condition; what remains after
                // consuming the byte is the empty string.
                Expr::Byte(_) => vec![(e, ExprRef::EMPTY_STRING)],
                Expr::ByteSet(_) => vec![(e, ExprRef::EMPTY_STRING)],

                // ignore lookaheads
                Expr::Lookahead(_, _, _) => deriv.pop().unwrap(),

                Expr::And(_, _) => {
                    // Fold the argument derivatives pairwise: intersect the
                    // conditions and AND the results, dropping any branch
                    // whose condition or result collapses to NO_MATCH.
                    let mut acc = deriv.pop().unwrap();
                    while let Some(other) = deriv.pop() {
                        let mut new_acc = vec![];
                        for (b0, r0) in &acc {
                            for (b1, r1) in &other {
                                let b = exprs.mk_byte_set_and(*b0, *b1);
                                if b != ExprRef::NO_MATCH {
                                    let r = exprs.mk_and(vec![*r0, *r1]);
                                    if r != ExprRef::NO_MATCH {
                                        new_acc.push((b, r));
                                    }
                                }
                            }
                        }
                        acc = new_acc;
                    }
                    simplify(exprs, acc)
                }

                // Union of all branches of all arguments.
                Expr::Or(_, _) => simplify(exprs, deriv.into_iter().flatten().collect()),

                Expr::Not(_, _) => {
                    // NOTE(review): this negates both the condition and the
                    // result of every branch, as in the sketch at the bottom
                    // of this file. Bytes not covered by any branch and
                    // overlapping conditions get no special treatment here --
                    // confirm this is the intended semantics.
                    let tmp = deriv[0]
                        .iter()
                        .map(|(b, r)| (exprs.mk_byte_set_not(*b), exprs.mk_not(*r)))
                        .collect();
                    simplify(exprs, tmp)
                }

                Expr::Repeat(_, e, min, max) => {
                    // One iteration is consumed: lower both bounds by one,
                    // keeping u32::MAX as the "unbounded" marker.
                    let max = if max == u32::MAX {
                        u32::MAX
                    } else {
                        max.saturating_sub(1)
                    };
                    let tail = exprs.mk_repeat(e, min.saturating_sub(1), max);
                    let tmp = deriv[0]
                        .iter()
                        .map(|(b, r)| (*b, exprs.mk_concat(vec![*r, tail])))
                        .collect();
                    simplify(exprs, tmp)
                }
                Expr::Concat(_, args) => {
                    let mut or_branches = vec![];
                    let args = args.to_vec();
                    for i in 0..args.len() {
                        let nullable = exprs.is_nullable(args[i]);
                        // Derivative of args[i] followed by the untouched
                        // remainder of the concatenation.
                        or_branches.extend(deriv[i].iter().map(|(b, r)| {
                            let mut cc = vec![*r];
                            cc.extend_from_slice(&args[i + 1..]);
                            (*b, exprs.mk_concat(cc))
                        }));
                        // d(a . rest) = d(a) . rest | (a nullable ? d(rest) : nothing)
                        // A later argument can consume the first byte only if
                        // every argument before it can match the empty
                        // string, so stop at the first non-nullable one.
                        if !nullable {
                            break;
                        }
                    }
                    simplify(exprs, or_branches)
                }
            },
        )
    }

    /// Reports whether `e` is relevant.
    ///
    /// Expressions flagged positive are relevant without further work.
    /// Otherwise the memoized answer is used if present; if not, the symbolic
    /// derivative is computed: any nullable branch means relevant, no
    /// branches at all means not relevant.
    ///
    /// # Panics
    ///
    /// Hits `todo!("BFS")` when the derivative has branches but none of them
    /// is nullable; that search is not implemented yet.
    pub fn is_relevant(&mut self, exprs: &mut ExprSet, e: ExprRef) -> bool {
        let flags = exprs.get_flags(e);
        if flags.is_positive() {
            return true;
        }
        if let Some(r) = self.relevance_cache.get(&e) {
            return *r;
        }

        // TODO limit by size somehow...

        let d = self.deriv(exprs, e);
        if d.iter().any(|(_, e)| exprs.is_nullable(*e)) {
            self.relevance_cache.insert(e, true);
            true
        } else if d.is_empty() {
            self.relevance_cache.insert(e, false);
            false
        } else {
            todo!("BFS")
        }
    }
}

impl Default for RelevanceCache {
    /// Same as [`RelevanceCache::new`].
    fn default() -> Self {
        Self::new()
    }
}

/*
ite(a,E,bot) | ite(b,E,bot)
ite(a,E | ite(b, E, bot),bot | ite(b, E, bot))
ite(a, ite(b, E | E, E | bot), ite(b, bot|E,bot|bot))
ite(a, ite(b, E, E), ite(b, E,bot))
ite(a, E, ite(b, E,bot))
no ITE or ITE at the top
bunch of ITEs
c0 => r0
c1 => r1
...
a=>A,b=>B & c=>C,d=>D ==>
a&c=>A&C, ...
a=>A,b=>B | c=>C,d=>D ==>
a=>A,b=>B , c=>C,d=>D
~ (a=>A, b=>B) ==>
~a=>~A, ~b=>~B
a=>A,b=>B . c=>C,d=>D ==>
*/
Loading

0 comments on commit b0b9630

Please sign in to comment.