code: reduce all NFAs to DFAs

Related to #65 Signed-off-by: Tim Bray <[email protected]>
timbray · Jun 23, 2022 · f0743c2 · f0743c2
1 parent 25beaa4
commit f0743c2
Show file tree

Hide file tree

Showing 8 changed files with 327 additions and 203 deletions.
diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 
 [![Tests](https://github.com/timbray/quamina/actions/workflows/go-unit-tests.yaml/badge.svg)](https://github.com/timbray/quamina/actions/workflows/go-unit-tests.yaml)
 [![Latest Release](https://img.shields.io/github/release/timbray/quamina.svg?logo=github&style=flat-square)](https://github.com/timbray/quamina/releases/latest)
-[![codecov](https://codecov.io/gh/timbray/quamina/branch/main/graph/badge.svg?token=TC7MW723JO)](https://codecov.io/gh/timbray/quamina) 
+[![codecov](https://codecov.io/gh/timbray/quamina/branch/main/graph/badge.svg?token=TC7MW723JO)](https://codecov.io/gh/timbray/quamina)
 [![Go Report Card](https://goreportcard.com/badge/github.com/timbray/quamina)](https://goreportcard.com/report/github.com/timbray/quamina)
 [![timbray/quamina](https://img.shields.io/github/go-mod/go-version/timbray/quamina)](https://github.com/timbray/quamina)
 [![Go Reference](https://pkg.go.dev/badge/github.com/timbray/quamina.svg)](https://pkg.go.dev/github.com/timbray/quamina)
@@ -51,7 +51,6 @@ in RFC 8259:
 ```
 
 The following Patterns would match it:
-
 ```json
 {"Image": {"Width": [800]}}
 ```
@@ -91,7 +90,6 @@ The following Patterns would match it:
   }
 }
 ```
-
 ```json
 {
   "Image": {
@@ -275,25 +273,48 @@ Events through it as is practical.
 
 I used to say that the performance of
 `MatchesForEvent` was `O(1)` in the number of
-Patterns. While that’s probably the right way to think
-about it, it’s not *quite* true,
-as it varies somewhat as a function of the number of
-unique fields that appear in all the Patterns that have
-been added to Quamina, but still remains sublinear
-in that number.
-
-A word of explanation: Quamina compiles the
-Patterns into a somewhat-decorated automaton and uses
-that to find matches in Events; the matching process is
-`O(1)` in the number of Patterns.
-
-However, for this to work, the incoming Event must be
-flattened into a list of pathname/value pairs and
-sorted.  This process exceeds 50% of execution time,
-and is optimized by discarding any fields that
-do not appear in one or more of the Patterns added
-to Quamina. Thus, adding a new Pattern that only
-mentions fields which are already mentioned in previous 
+Patterns. That’s probably a reasonable way to think
+about it, because it’s *almost* right. 
+
+To be correct, the performance is `O(N)` where `N` is 
+the number of unique fields that appear in all the Patterns 
+that have been added to Quamina.  
+
+For example, suppose you have a list of 50,000 words, and
+you add a Pattern for each, of the form:
+```json
+{"word": ["one of the words"]}
+```
+The performance in matching events should be about the same 
+for one word or 50,000, with some marginal loss following on 
+growth in the size of the necessary data structures.
+
+However, adding another pattern that looks like the
+following would, roughly speaking, decrease the
+performance by a factor of 2:
+```json
+{"number": [11, 22, 33]}
+```
+Then adding a few thousand more `"number"` patterns shouldn’t
+decrease the performance observably.
+
+As always, it’s a little more complex than that, with a weak
+dependency on the size of the incoming Events; Quamina has
+to plow through them end-to-end to pull out the interesting
+fields.
+
+A word of explanation: Quamina compiles the Patterns into a
+somewhat-decorated automaton and uses that to find matches in
+Events. For Quamina to work, the incoming Events must be flattened
+into a list of pathname/value pairs and sorted.  This process
+exceeds 50% of execution time, and is optimized by discarding
+any fields that do not appear in one or more of the Patterns
+added to Quamina.  Then, the cost of traversing the automaton
+is at most N, the number of fields left after discarding.
+
+Thus, adding a new Pattern that only
+mentions fields which are already mentioned in previous
 Patterns is effectively free, i.e. `O(1)` in terms of run-time
 performance.
 

diff --git a/list_maker.go b/list_maker.go
@@ -1,7 +1,64 @@
 package quamina
 
 // this needs to exist so that all the lists containing a single step to X, or the triple step to X,Y,Z are the
-//  same list, so that pack/unpack work properly
+// same list, so that pack/unpack work properly. In a large majority of cases, there's only one step in the list, so
+// those are handled straightforwardly with a map. Otherwise, we laboriously look through all the lists for a match.
+// In Java I'd implement a hashCode() method and everything would be a hash, but I haven't learned yet what the Go
+// equivalent is.
+type dfaMemory struct {
+	singletons map[*nfaStep]*dfaStep // single-step lists: the lone step is the map key
+	plurals    []perList             // every other list, scanned linearly on lookup
+}
+// perList pairs a list of NFA steps with the DFA step that was remembered for that list.
+type perList struct {
+	list []*nfaStep // the NFA steps, stored exactly as handed to rememberDfaForList
+	dfa  *dfaStep   // the DFA step recorded for list
+}
+
+// newDfaMemory returns an empty dfaMemory whose singletons map is allocated and ready for writes; plurals starts
+// out nil and grows by append.
+func newDfaMemory() *dfaMemory {
+	mem := new(dfaMemory)
+	mem.singletons = make(map[*nfaStep]*dfaStep)
+	return mem
+}
+
+// rememberDfaForList records dfa as the DFA step belonging to the given list of NFA steps. A list of exactly one
+// step goes into the singletons map; any other list is appended to plurals. The steps slice is stored as-is, not
+// copied.
+func (m *dfaMemory) rememberDfaForList(dfa *dfaStep, steps ...*nfaStep) {
+	if len(steps) != 1 {
+		m.plurals = append(m.plurals, perList{dfa: dfa, list: steps})
+		return
+	}
+	m.singletons[steps[0]] = dfa
+}
+
+// dfaForNfas looks up the DFA step previously remembered for the given list of NFA steps. The boolean result is
+// false when nothing was remembered for the list. A single-step list is answered from the singletons map; any
+// other list is compared against each remembered plural list with nfaListsEqual.
+func (m *dfaMemory) dfaForNfas(steps ...*nfaStep) (*dfaStep, bool) {
+	if len(steps) != 1 {
+		for i := range m.plurals {
+			candidate := &m.plurals[i]
+			if nfaListsEqual(candidate.list, steps) {
+				return candidate.dfa, true
+			}
+		}
+		return nil, false
+	}
+	found, present := m.singletons[steps[0]]
+	return found, present
+}
+
+// nfaListsEqual reports whether l1 and l2 have the same length and each contains every step of the other. Steps
+// are compared by pointer identity and their order within the lists is irrelevant.
+func nfaListsEqual(l1, l2 []*nfaStep) bool {
+	if len(l1) != len(l2) {
+		return false
+	}
+	for _, e1 := range l1 {
+		if !nfaListContains(l2, e1) {
+			return false
+		}
+	}
+	// Equal lengths plus one-way containment is not enough when a list repeats a step: {a, a} would otherwise
+	// compare equal to {a, b}. Checking the opposite direction as well closes that gap.
+	for _, e2 := range l2 {
+		if !nfaListContains(l1, e2) {
+			return false
+		}
+	}
+	return true
+}
+
+// nfaListContains reports whether step, compared by pointer identity, occurs anywhere in list.
+func nfaListContains(list []*nfaStep, step *nfaStep) bool {
+	for i := 0; i < len(list); i++ {
+		if list[i] == step {
+			return true
+		}
+	}
+	return false
+}
 
 type listMaker struct {
 	singletons map[*nfaStep]*nfaStepList

diff --git a/list_maker_test.go b/list_maker_test.go
@@ -4,6 +4,77 @@ import (
 	"testing"
 )
 
+func TestDfaMemory(t *testing.T) {
+	d1 := &dfaStep{}
+	d3 := &dfaStep{}
+	d12 := &dfaStep{}
+	d13 := &dfaStep{}
+	d123 := &dfaStep{}
+	ns1 := &nfaStep{}
+	ns2 := &nfaStep{}
+	ns3 := &nfaStep{}
+	l1 := []*nfaStep{ns1}
+	l3 := []*nfaStep{ns3}
+	l12 := []*nfaStep{ns1, ns2}
+	l13 := []*nfaStep{ns1, ns3}
+	l123 := []*nfaStep{ns1, ns2, ns3}
+
+	mem := newDfaMemory()
+	mem.rememberDfaForList(d1, l1...)
+	mem.rememberDfaForList(d3, l3...)
+	mem.rememberDfaForList(d12, l12...)
+	mem.rememberDfaForList(d13, l13...)
+	mem.rememberDfaForList(d123, l123...)
+
+	var ok bool
+	var d *dfaStep
+	d, ok = mem.dfaForNfas(l1...)
+	if ok == false || d != d1 {
+		t.Error("failed d1")
+	}
+	d, ok = mem.dfaForNfas(l3...)
+	if ok == false || d != d3 {
+		t.Error("failed d1")
+	}
+	var shouldMatches [][]*nfaStep
+	shouldMatches = [][]*nfaStep{{ns1, ns2}, {ns2, ns1}}
+	for i, should := range shouldMatches {
+		d, ok := mem.dfaForNfas(should...)
+		if ok == false || d != d12 {
+			t.Errorf("no match on %d", i)
+		}
+	}
+	shouldMatches = [][]*nfaStep{{ns1, ns3}, {ns3, ns1}}
+	for i, should := range shouldMatches {
+		d, ok := mem.dfaForNfas(should...)
+		if ok == false || d != d13 {
+			t.Errorf("no match on %d", i)
+		}
+	}
+	shouldMatches = [][]*nfaStep{{ns1, ns2, ns3}, {ns1, ns3, ns2}, {ns3, ns1, ns2}, {ns3, ns2, ns1}}
+	for i, should := range shouldMatches {
+		d, ok := mem.dfaForNfas(should...)
+		if ok == false || d != d123 {
+			t.Errorf("no match on %d", i)
+		}
+	}
+
+	noDfaFor := [][]*nfaStep{
+		{&nfaStep{}},
+		{ns2},
+		{ns3, ns2},
+		{ns1, ns2, &nfaStep{}},
+		{ns1, ns2, ns3, &nfaStep{}},
+	}
+
+	for i, no := range noDfaFor {
+		_, ok = mem.dfaForNfas(no...)
+		if ok {
+			t.Errorf("bogus match %d", i)
+		}
+	}
+}
+
 func TestListMaker(t *testing.T) {
 	steps := []*nfaStep{
 		{},

diff --git a/shell_style_test.go b/shell_style_test.go
@@ -30,54 +30,6 @@ func TestLongCase(t *testing.T) {
 	}
 }
 
-func newNfaWithStart(start *smallTable[*nfaStepList]) *valueMatcher {
-	vm := newValueMatcher()
-	state := &vmFields{startNfa: start}
-	vm.update(state)
-	return vm
-}
-
-func TestNfaMerging(t *testing.T) {
-	aMatches := []string{
-		`"Afoo"`,
-		`"ABA"`,
-	}
-	bMatches := []string{
-		`"BAB"`,
-		`"Bbar"`,
-	}
-	f1 := &fieldMatcher{}
-	f2 := &fieldMatcher{}
-	nfa1, _ := makeShellStyleAutomaton([]byte(`"A*"`), f1)
-	nfa2, _ := makeShellStyleAutomaton([]byte(`"B*"`), f2)
-
-	v1 := newNfaWithStart(nfa1)
-	v2 := newNfaWithStart(nfa2)
-
-	for _, aMatch := range aMatches {
-		t1 := v1.transitionOn([]byte(aMatch))
-		if len(t1) != 1 || t1[0] != f1 {
-			t.Error("mismatch on " + aMatch)
-		}
-	}
-	for _, bMatch := range bMatches {
-		t1 := v2.transitionOn([]byte(bMatch))
-		if len(t1) != 1 || t1[0] != f2 {
-			t.Error("mismatch on " + bMatch)
-		}
-	}
-
-	combo := mergeNfas(nfa1, nfa2)
-	v3 := newNfaWithStart(combo)
-	ab := append(aMatches, bMatches...)
-	for _, match := range ab {
-		t3 := v3.transitionOn([]byte(match))
-		if len(t3) != 1 {
-			t.Error("Fail on " + match)
-		}
-	}
-}
-
 func TestMakeShellStyleAutomaton(t *testing.T) {
 	patterns := []string{
 		`"*ST"`,
@@ -104,29 +56,59 @@ func TestMakeShellStyleAutomaton(t *testing.T) {
 		{`"ayybyyzxx"`},
 	}
 
+	// NOTE also testing nfa2Dfa
 	for i, pattern := range patterns {
 		myNext := newFieldMatcher()
 		a, wanted := makeShellStyleAutomaton([]byte(pattern), myNext)
 		if wanted != myNext {
 			t.Error("bad next on: " + pattern)
 		}
+		d := nfa2Dfa(a)
+		vm := newValueMatcher()
+		vmf := vmFields{startDfa: d}
+		vm.update(&vmf)
 		for _, should := range shouldsForPatterns[i] {
 			var transitions []*fieldMatcher
-			gotTrans := oneNfaStep(a, 0, []byte(should), transitions)
+			gotTrans := transitionDfa(d, []byte(should), transitions)
 			if len(gotTrans) != 1 || gotTrans[0] != wanted {
 				t.Errorf("Failure for %s on %s", pattern, should)
 			}
 		}
 		for _, shouldNot := range shouldNotForPatterns[i] {
 			var transitions []*fieldMatcher
-			gotTrans := oneNfaStep(a, 0, []byte(shouldNot), transitions)
+			gotTrans := transitionDfa(d, []byte(shouldNot), transitions)
 			if gotTrans != nil {
-				t.Errorf("bogus match for %s on %s", pattern, shouldNot)
+				t.Errorf("bogus DFA match for %s on %s", pattern, shouldNot)
 			}
 		}
 	}
 }
 
+/* To be used in profiling AddPattern for patterns which need NFAs
+func xTestShellStyleBuildTime(t *testing.T) {
+	words := readWWords(t)
+	starWords := make([]string, 0, len(words))
+	patterns := make([]string, 0, len(words))
+	for _, word := range words {
+		starAt := rand.Int31n(6)
+		starWord := string(word[:starAt]) + "*" + string(word[starAt:])
+		starWords = append(starWords, starWord)
+		pattern := fmt.Sprintf(`{"x": [ {"shellstyle": "%s" } ] }`, starWord)
+		patterns = append(patterns, pattern)
+	}
+	q, _ := New()
+	for i := 0; i < 32; i++ {
+		// fmt.Printf("i=%d w=%s: %s\n", i, starWords[i], matcherStats(q.matcher.(*coreMatcher)))
+		// fmt.Println(patterns[i])
+		err := q.AddPattern(starWords[i], patterns[i])
+		if err != nil {
+			t.Error("AddP: " + err.Error())
+		}
+	}
+	fmt.Println(matcherStats(q.matcher.(*coreMatcher)))
+}
+*/
+
 func TestMixedPatterns(t *testing.T) {
 	// let's mix up some prefix, infix, suffix, and exact-match searches
 	x := map[string]int{