diff --git a/README.md b/README.md
index 0465d72..0fd20ac 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,9 @@
 create an instance and add multiple **Patterns** to it,
 and then query data objects called **Events** to
 discover which of the Patterns match
-the fields in the Event.
+the fields in the Event. In typical cases, Quamina
+can match millions of Events per second, even with
+many Patterns added to the instance.
 
 Quamina has no run-time dependencies beyond built-in Go libraries.
 
@@ -292,33 +294,20 @@ Events through it as is practical.
 
 ### `AddPattern()` Performance
 
-In **most** cases, tens of thousands of Patterns per second can
+Tens of thousands of Patterns per second can
 be added to a Quamina instance; the in-memory data structure will
-become larger, but not unreasonably so. The amount of of
+become larger, but not unreasonably so. The amount of
 available memory is the only significant limit to the
 number of patterns an instance can carry.
 
-The exception is `shellstyle` Patterns. Adding many of these
-can rapidly lead to degradation in elapsed time and memory
-consumption, at a rate which is uneven but at worst
-O(2<sup>N</sup>) in the number of patterns. A fuzz test
-which adds random 5-letter words with a `*` at a random
-location slows to a crawl after 30 or so `AddPattern()`
-calls, with the Quamina instance having many millions of
-states. Note that such instances, once built, can still
-match Events at high speeds.
-
-This is after some optimization. It is possible there is a
-bug such that automaton-building is unduly wasteful but it
-may remain the case that adding this flavor of Pattern is
-simply not something that can be done at large scale.
-
 ### `MatchesForEvent()` Performance
 
 I used to say that the performance of
 `MatchesForEvent` was O(1) in the number of
 Patterns. That’s probably a reasonable way to think
-about it, because it’s *almost* right.
+about it, because it’s *almost* right, except in the
+case where a very large number of `shellstyle` patterns
+have been added; this is discussed in the next section.
 
 To be correct, the performance is a little worse than
 O(N) where N is the average number of unique fields in an
@@ -361,6 +350,23 @@ So, adding a new Pattern that only mentions fields which are
 already mentioned in previous Patterns is effectively free,
 i.e. O(1) in terms of run-time performance.
 
+### Quamina instances with large numbers of `shellstyle` Patterns
+
+A study of the theory of finite automata reveals that processing
+regular-expression constructs such as `*` increases the complexity of
+the automaton necessary to match them. It turns out that when
+a large number of such automata are compiled together, the merged
+output can contain a high degree of nondeterminism, which can result
+in a drastic slowdown.
+
+A fuzz test which adds a pattern for each of 12,959 5-letter words with
+one `*` embedded in each at a random offset slows matching speed down to
+below 10,000 matches/second, in stark contrast to most Quamina instances,
+which can achieve millions of matches/second.
+
+This slowdown is under active investigation and it is possible that the
+situation will improve.
+
 ### Further documentation
 
 There is a series of blog posts entitled
diff --git a/anything_but.go b/anything_but.go
index 3e70440..4dd8379 100644
--- a/anything_but.go
+++ b/anything_but.go
@@ -73,20 +73,19 @@ func readAnythingButSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []typ
 func makeMultiAnythingButFA(vals [][]byte) (*smallTable, *fieldMatcher) {
 	nextField := newFieldMatcher()
 	successStep := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{nextField}}
-	//DEBUG successStep.table.label = "(success)"
-	success := &faNext{steps: []*faState{successStep}}
+	success := &faNext{states: []*faState{successStep}}
 
-	ret, _ := oneMultiAnythingButStep(vals, 0, success), nextField
+	ret, _ := makeOneMultiAnythingButStep(vals, 0, success), nextField
 	return ret, nextField
 }
 
-// oneMultiAnythingButStep - spookeh. The idea is that there will be N smallTables in this FA, where N is
+// makeOneMultiAnythingButStep - spookeh. The idea is that there will be N smallTables in this FA, where N is
 // the longest among the vals. So for each value from 0 through N, we make a smallTable whose default is
 // success but transfers to the next step on whatever the current byte in each of the vals that have not
 // yet been exhausted. We notice when we get to the end of each val and put in a valueTerminator transition
 // to a step with no nextField entry, i.e. failure because we've exactly matched one of the anything-but
 // strings.
-func oneMultiAnythingButStep(vals [][]byte, index int, success *faNext) *smallTable {
+func makeOneMultiAnythingButStep(vals [][]byte, index int, success *faNext) *smallTable {
 	// this will be the default transition in all the anything-but tables.
 	var u unpackedTable
 	for i := range u {
@@ -115,18 +114,18 @@ func oneMultiAnythingButStep(vals [][]byte, index int, success *faNext) *smallTa
 
 	// for each val that still has bytes to process, recurse to process the next one
 	for utf8Byte, val := range valsWithBytesRemaining {
-		nextTable := oneMultiAnythingButStep(val, index+1, success)
+		nextTable := makeOneMultiAnythingButStep(val, index+1, success)
 		nextStep := &faState{table: nextTable}
-		u[utf8Byte] = &faNext{steps: []*faState{nextStep}}
+		u[utf8Byte] = &faNext{states: []*faState{nextStep}}
 	}
 
 	// for each val that ends at 'index', put a failure-transition for this anything-but
 	// if you hit the valueTerminator, success for everything else
 	for utf8Byte := range valsEndingHere {
 		failState := &faState{table: newSmallTable()} // note no transitions
-		lastStep := &faNext{steps: []*faState{failState}}
+		lastStep := &faNext{states: []*faState{failState}}
 		lastTable := makeSmallTable(success, []byte{valueTerminator}, []*faNext{lastStep})
-		u[utf8Byte] = &faNext{steps: []*faState{{table: lastTable}}}
+		u[utf8Byte] = &faNext{states: []*faState{{table: lastTable}}}
 	}
 
 	table := newSmallTable()
diff --git a/cl2_test.go b/cl2_test.go
index 8168943..94320c0 100644
--- a/cl2_test.go
+++ b/cl2_test.go
@@ -187,20 +187,20 @@ func TestRulerCl2(t *testing.T) {
 
 	// initial run to stabilize memory
 	bm := newBenchmarker()
-	bm.addRules(exactRules, exactMatches)
+	bm.addRules(exactRules, exactMatches, false)
 
 	bm.run(t, lines)
 
 	bm = newBenchmarker()
-	bm.addRules(exactRules, exactMatches)
+	bm.addRules(exactRules, exactMatches, true)
 	fmt.Printf("EXACT events/sec: %.1f\n", bm.run(t, lines))
 
 	bm = newBenchmarker()
-	bm.addRules(prefixRules, prefixMatches)
+	bm.addRules(prefixRules, prefixMatches, true)
 	fmt.Printf("PREFIX events/sec: %.1f\n", bm.run(t, lines))
 
 	bm = newBenchmarker()
-	bm.addRules(anythingButRules, anythingButMatches)
+	bm.addRules(anythingButRules, anythingButMatches, true)
 	fmt.Printf("ANYTHING-BUT events/sec: %.1f\n", bm.run(t, lines))
 }
 
@@ -214,13 +214,15 @@ func newBenchmarker() *benchmarker {
 	return &benchmarker{q: q, wanted: make(map[X]int)}
 }
 
-func (bm *benchmarker) addRules(rules []string, wanted []int) {
+func (bm *benchmarker) addRules(rules []string, wanted []int, report bool) {
 	for i, rule := range rules {
 		rname := fmt.Sprintf("r%d", i)
 		_ = bm.q.AddPattern(rname, rule)
 		bm.wanted[rname] = wanted[i]
 	}
-	fmt.Println(matcherStats(bm.q.matcher.(*coreMatcher)))
+	if report {
+		fmt.Println(matcherStats(bm.q.matcher.(*coreMatcher)))
+	}
 }
 
 func (bm *benchmarker) run(t *testing.T, events [][]byte) float64 {
diff --git a/core_matcher.go b/core_matcher.go
index 29741c3..d4ba267 100644
--- a/core_matcher.go
+++ b/core_matcher.go
@@ -129,7 +129,7 @@ func (m *coreMatcher) deletePatterns(_ X) error {
 // matchesForJSONEvent calls the flattener to pull the fields out of the event and
 // hands over to MatchesForFields
 // This is a leftover from previous times, is only used by tests, but it's used by a *lot*
-// so removing it would require a lot of tedious work
+// of them, and it's a convenient API for testing.
 func (m *coreMatcher) matchesForJSONEvent(event []byte) ([]X, error) {
 	fields, err := newJSONFlattener().Flatten(event, m.getSegmentsTreeTracker())
 	if err != nil {
@@ -178,12 +178,19 @@ func (m *coreMatcher) matchesForFields(fields []Field) ([]X, error) {
 	}
 	matches := newMatchSet()
 
+	// pre-allocate a pair of buffers that will be used several levels down the call stack for efficiently
+	// traversing NFAs
+	bufs := &bufpair{
+		buf1: make([]*faState, 0),
+		buf2: make([]*faState, 0),
+	}
+
 	// for each of the fields, we'll try to match the automaton start state to that field - the tryToMatch
 	// routine will, in the case that there's a match, call itself to see if subsequent fields after the
 	// first matched will transition through the machine and eventually achieve a match
 	s := m.fields()
 	for i := 0; i < len(fields); i++ {
-		tryToMatch(fields, i, s.state, matches)
+		tryToMatch(fields, i, s.state, matches, bufs)
 	}
 	return matches.matches(), nil
 }
@@ -191,7 +198,7 @@ func (m *coreMatcher) matchesForFields(fields []Field) ([]X, error) {
 // tryToMatch tries to match the field at fields[index] to the provided state. If it does match and generate
 // 1 or more transitions to other states, it calls itself recursively to see if any of the remaining fields
 // can continue the process by matching that state.
-func tryToMatch(fields []Field, index int, state *fieldMatcher, matches *matchSet) {
+func tryToMatch(fields []Field, index int, state *fieldMatcher, matches *matchSet, bufs *bufpair) {
 	stateFields := state.fields()
 
 	// transition on exists:true?
@@ -200,16 +207,16 @@ func tryToMatch(fields []Field, index int, state *fieldMatcher, matches *matchSe
 		matches = matches.addXSingleThreaded(existsTrans.fields().matches...)
 		for nextIndex := index + 1; nextIndex < len(fields); nextIndex++ {
 			if noArrayTrailConflict(fields[index].ArrayTrail, fields[nextIndex].ArrayTrail) {
-				tryToMatch(fields, nextIndex, existsTrans, matches)
+				tryToMatch(fields, nextIndex, existsTrans, matches, bufs)
 			}
 		}
 	}
 
 	// an exists:false transition is possible if there is no matching field in the event
-	checkExistsFalse(stateFields, fields, index, matches)
+	checkExistsFalse(stateFields, fields, index, matches, bufs)
 
 	// try to transition through the machine
-	nextStates := state.transitionOn(&fields[index])
+	nextStates := state.transitionOn(&fields[index], bufs)
 
 	// for each state in the possibly-empty list of transitions from this state on fields[index]
 	for _, nextState := range nextStates {
@@ -221,17 +228,17 @@ func tryToMatch(fields []Field, index int, state *fieldMatcher, matches *matchSe
 		//  of the same array
 		for nextIndex := index + 1; nextIndex < len(fields); nextIndex++ {
 			if noArrayTrailConflict(fields[index].ArrayTrail, fields[nextIndex].ArrayTrail) {
-				tryToMatch(fields, nextIndex, nextState, matches)
+				tryToMatch(fields, nextIndex, nextState, matches, bufs)
 			}
 		}
 		// now we've run out of fields to match this state against. But suppose it has an exists:false
 		// transition, and it so happens that the exists:false pattern field is lexically larger than the other
 		// fields and that in fact such a field does not exist. That state would be left hanging. So…
-		checkExistsFalse(nextStateFields, fields, index, matches)
+		checkExistsFalse(nextStateFields, fields, index, matches, bufs)
 	}
 }
 
-func checkExistsFalse(stateFields *fmFields, fields []Field, index int, matches *matchSet) {
+func checkExistsFalse(stateFields *fmFields, fields []Field, index int, matches *matchSet, bufs *bufpair) {
 	for existsFalsePath, existsFalseTrans := range stateFields.existsFalse {
 		// it seems like there ought to be a more state-machine-idiomatic way to do this, but
 		// I thought of a few and none of them worked.  Quite likely someone will figure it out eventually.
@@ -250,9 +257,9 @@ func checkExistsFalse(stateFields *fmFields, fields []Field, index int, matches
 		if i == len(fields) {
 			matches = matches.addXSingleThreaded(existsFalseTrans.fields().matches...)
 			if thisFieldIsAnExistsFalse {
-				tryToMatch(fields, index+1, existsFalseTrans, matches)
+				tryToMatch(fields, index+1, existsFalseTrans, matches, bufs)
 			} else {
-				tryToMatch(fields, index, existsFalseTrans, matches)
+				tryToMatch(fields, index, existsFalseTrans, matches, bufs)
 			}
 		}
 	}
diff --git a/field_matcher.go b/field_matcher.go
index 8dbfc8b..54d9716 100644
--- a/field_matcher.go
+++ b/field_matcher.go
@@ -6,7 +6,7 @@ import (
 
 // fieldMatcher represents a state in the matching automaton, which matches field names and dispatches to
 // valueMatcher to complete matching of field values.
-// the fields that hold state are segregated in updateable so they can be replaced atomically and make the coreMatcher
+// the fields that hold state are segregated in updateable, so they can be replaced atomically and make the coreMatcher
 // thread-safe.
 type fieldMatcher struct {
 	updateable atomic.Value // always holds an *fmFields
@@ -112,7 +112,7 @@ func (m *fieldMatcher) addTransition(field *patternField, printer printer) []*fi
 	}
 	freshStart.transitions[field.path] = vm
 
-	// suppose I'm adding the first pattern to a matcher and it has "x": [1, 2]. In principle the branches on
+	// suppose I'm adding the first pattern to a matcher, and it has "x": [1, 2]. In principle the branches on
 	//  "x": 1 and "x": 2 could go to tne same next state. But we have to make a unique next state for each of them
 	//  because some future other pattern might have "x": [2, 3] and thus we need a separate branch to potentially
 	//  match two patterns on "x": 2 but not "x": 1. If you were optimizing the automaton for size you might detect
@@ -144,12 +144,12 @@ func (m *fieldMatcher) addTransition(field *patternField, printer printer) []*fi
 // or nil if no transitions are possible.  An example of name/value that could produce multiple next states
 // would be if you had the pattern { "a": [ "foo" ] } and another pattern that matched any value with
 // a prefix of "f".
-func (m *fieldMatcher) transitionOn(field *Field) []*fieldMatcher {
+func (m *fieldMatcher) transitionOn(field *Field, bufs *bufpair) []*fieldMatcher {
 	// are there transitions on this field name?
 	valMatcher, ok := m.fields().transitions[string(field.Path)]
 	if !ok {
 		return nil
 	}
 
-	return valMatcher.transitionOn(field.Val)
+	return valMatcher.transitionOn(field.Val, bufs)
 }
diff --git a/flatten_json.go b/flatten_json.go
index 4587bb0..e8dbb7a 100644
--- a/flatten_json.go
+++ b/flatten_json.go
@@ -51,21 +51,21 @@ var errEarlyStop = errors.New("earlyStop")
 type fjState int
 
 const (
-	startState fjState = iota
-	inObjectState
-	seekingColonState
-	memberValueState
-	inArrayState
-	afterValueState
-	numberStartState
-	numberIntegralPartState
-	numberFracState
-	numberAfterEState
-	numberExpState
-	trailerState
-	startEscapeState
-	wantEscapeUState
-	readHexDigitState
+	fjStartState fjState = iota
+	fjInObjectState
+	fjSeekingColonState
+	fjMemberValueState
+	fjInArrayState
+	fjAfterValueState
+	fjNumberStartState
+	fjNumberIntegralPartState
+	fjNumberFracState
+	fjNumberAfterEState
+	fjNumberExpState
+	fjTrailerState
+	fjStartEscapeState
+	fjWantEscapeUState
+	fjReadHexDigitState
 )
 
 func newJSONFlattener() Flattener {
@@ -93,11 +93,11 @@ func (fj *flattenJSON) Flatten(event []byte, tracker SegmentsTreeTracker) ([]Fie
 	}
 	var err error
 	fj.event = event
-	state := startState
+	state := fjStartState
 	for {
 		ch := fj.ch()
 		switch state {
-		case startState:
+		case fjStartState:
 			switch {
 			// single top-level object
 			case ch == '{':
@@ -108,7 +108,7 @@ func (fj *flattenJSON) Flatten(event []byte, tracker SegmentsTreeTracker) ([]Fie
 					}
 					return nil, err
 				}
-				state = trailerState
+				state = fjTrailerState
 
 			case fj.isSpace[ch]:
 			// no-op
@@ -118,7 +118,7 @@ func (fj *flattenJSON) Flatten(event []byte, tracker SegmentsTreeTracker) ([]Fie
 			}
 
 		// eat trailing white space, if any
-		case trailerState:
+		case fjTrailerState:
 			if !fj.isSpace[ch] {
 				return nil, fj.error(fmt.Sprintf("garbage char '%c' after top-level object", ch))
 			}
@@ -138,7 +138,7 @@ func (fj *flattenJSON) Flatten(event []byte, tracker SegmentsTreeTracker) ([]Fie
 // minimize the cost of the Flatten call.
 func (fj *flattenJSON) readObject(pathNode SegmentsTreeTracker) error {
 	var err error
-	state := inObjectState
+	state := fjInObjectState
 
 	// eventIndex points at {
 	err = fj.step()
@@ -146,7 +146,7 @@ func (fj *flattenJSON) readObject(pathNode SegmentsTreeTracker) error {
 		return err
 	}
 
-	// how many leaf steps (fieldsCount) and chidStructures (nodesCount) have been mentioned in patterns?
+	// how many leaf states (fieldsCount) and childStructures (nodesCount) have been mentioned in patterns?
 	fieldsCount := pathNode.FieldsCount()
 	nodesCount := pathNode.NodesCount()
 
@@ -175,7 +175,7 @@ func (fj *flattenJSON) readObject(pathNode SegmentsTreeTracker) error {
 		ch := fj.ch()
 
 		switch state {
-		case inObjectState:
+		case fjInObjectState:
 			switch {
 			case fj.isSpace[ch]:
 				// no-op
@@ -187,22 +187,22 @@ func (fj *flattenJSON) readObject(pathNode SegmentsTreeTracker) error {
 
 				// we know the name of the next object member, use the pathNode to check if it's used
 				memberIsUsed = (fj.skipping == 0) && pathNode.IsSegmentUsed(memberName)
-				state = seekingColonState
+				state = fjSeekingColonState
 			case ch == '}':
 				return nil
 			default:
 				return fj.error(fmt.Sprintf("illegal character %c in JSON object", ch))
 			}
-		case seekingColonState:
+		case fjSeekingColonState:
 			switch {
 			case fj.isSpace[ch]:
 				// no-op
 			case ch == ':':
-				state = memberValueState
+				state = fjMemberValueState
 			default:
 				return fj.error(fmt.Sprintf("illegal character %c while looking for colon", ch))
 			}
-		case memberValueState:
+		case fjMemberValueState:
 			// bypass space between colon and value. A bit klunky but allows for immense simplification
 			// TODO: Investigate if there's a more efficient way to say this, or should just trust Go compiler
 			for fj.isSpace[ch] {
@@ -302,13 +302,13 @@ func (fj *flattenJSON) readObject(pathNode SegmentsTreeTracker) error {
 			if alt != nil {
 				alt = nil
 			}
-			state = afterValueState
-		case afterValueState:
+			state = fjAfterValueState
+		case fjAfterValueState:
 			switch {
 			case fj.isSpace[ch]:
 				// no-op
 			case ch == ',':
-				state = inObjectState
+				state = fjInObjectState
 			case ch == '}':
 				return nil
 			default:
@@ -337,14 +337,14 @@ func (fj *flattenJSON) readArray(pathName []byte, pathNode SegmentsTreeTracker)
 		defer fj.leaveArray()
 	}
 
-	state := inArrayState
+	state := fjInArrayState
 	isLeaf := false
 	for {
 		ch := fj.ch()
 		var val []byte // resets on each loop
 		var alt []byte
 		switch state {
-		case inArrayState:
+		case fjInArrayState:
 			// bypass space before element value. A bit klunky but allows for immense simplification
 			for fj.isSpace[ch] {
 				if fj.step() != nil {
@@ -406,15 +406,15 @@ func (fj *flattenJSON) readArray(pathName []byte, pathNode SegmentsTreeTracker)
 			if alt != nil {
 				alt = nil
 			}
-			state = afterValueState
-		case afterValueState:
+			state = fjAfterValueState
+		case fjAfterValueState:
 			switch {
 			case fj.isSpace[ch]:
 				// no-op
 			case ch == ']':
 				return nil
 			case ch == ',':
-				state = inArrayState
+				state = fjInArrayState
 			default:
 				return fj.error(fmt.Sprintf("illegal character %c in array", ch))
 			}
@@ -435,25 +435,25 @@ func (fj *flattenJSON) readArray(pathName []byte, pathNode SegmentsTreeTracker)
 func (fj *flattenJSON) readNumber() ([]byte, []byte, error) {
 	// points at the first character in the number
 	numStart := fj.eventIndex
-	state := numberStartState
+	state := fjNumberStartState
 	for {
 		ch := fj.ch()
 		switch state {
-		case numberStartState:
+		case fjNumberStartState:
 			switch ch {
 			case '-':
-				state = numberIntegralPartState
+				state = fjNumberIntegralPartState
 			case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
-				state = numberIntegralPartState
+				state = fjNumberIntegralPartState
 			}
-		case numberIntegralPartState:
+		case fjNumberIntegralPartState:
 			switch ch {
 			case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
 				// no-op
 			case '.':
-				state = numberFracState
+				state = fjNumberFracState
 			case 'e', 'E':
-				state = numberAfterEState
+				state = fjNumberAfterEState
 			case ',', ']', '}', ' ', '\t', '\n', '\r':
 				fj.eventIndex--
 				// TODO: Too expensive; make it possible for people to ask for this
@@ -467,7 +467,7 @@ func (fj *flattenJSON) readNumber() ([]byte, []byte, error) {
 			default:
 				return nil, nil, fj.error(fmt.Sprintf("illegal char '%c' in number", ch))
 			}
-		case numberFracState:
+		case fjNumberFracState:
 			switch ch {
 			case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
 				// no-op
@@ -482,20 +482,20 @@ func (fj *flattenJSON) readNumber() ([]byte, []byte, error) {
 				//}
 				return bytes, alt, nil
 			case 'e', 'E':
-				state = numberAfterEState
+				state = fjNumberAfterEState
 			default:
 				return nil, nil, fj.error(fmt.Sprintf("illegal char '%c' in number", ch))
 			}
-		case numberAfterEState:
+		case fjNumberAfterEState:
 			switch ch {
 			case '-', '1', '2', '3', '4', '5', '6', '7', '8', '9':
 				// no-op
 			default:
 				return nil, nil, fj.error(fmt.Sprintf("illegal char '%c' after 'e' in number", ch))
 			}
-			state = numberExpState
+			state = fjNumberExpState
 
-		case numberExpState:
+		case fjNumberExpState:
 			switch ch {
 			case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
 				// no-op
@@ -787,28 +787,28 @@ func (fj *flattenJSON) readHexUTF16(from int) ([]byte, int, error) {
 	var runes []rune
 	from-- // point at the \ before the u
 	var hexDigitCount int
-	state := startEscapeState
+	state := fjStartEscapeState
 	for {
 		ch := fj.event[from]
 		switch state {
-		case startEscapeState:
+		case fjStartEscapeState:
 			switch ch {
 			case '\\':
-				state = wantEscapeUState
+				state = fjWantEscapeUState
 			default:
 				runes = utf16.Decode(codepoints)
 				return []byte(string(runes)), from - 1, nil
 			}
-		case wantEscapeUState:
+		case fjWantEscapeUState:
 			switch ch {
 			case 'u':
-				state = readHexDigitState
+				state = fjReadHexDigitState
 				hexDigitCount = 0
 			default:
 				runes = utf16.Decode(codepoints)
 				return []byte(string(runes)), from - 1, nil
 			}
-		case readHexDigitState:
+		case fjReadHexDigitState:
 			switch ch {
 			case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'A', 'b', 'B', 'c', 'C', 'd', 'D', 'e', 'E', 'f', 'F':
 				hexDigitCount++
@@ -816,7 +816,7 @@ func (fj *flattenJSON) readHexUTF16(from int) ([]byte, int, error) {
 					hexString := string(fj.event[from-3 : from+1])
 					r, _ := strconv.ParseUint(hexString, 16, 16)
 					codepoints = append(codepoints, uint16(r))
-					state = startEscapeState
+					state = fjStartEscapeState
 				}
 			default:
 				fj.eventIndex = from
diff --git a/nfa.go b/nfa.go
index 1c6c91f..c9417a4 100644
--- a/nfa.go
+++ b/nfa.go
@@ -1,35 +1,81 @@
 package quamina
 
+import "fmt"
+
 // This groups the functions that traverse, merge, and debug Quamina's nondeterministic finite automata
 
-func traverseFA(table *smallTable, val []byte, transitions []*fieldMatcher) []*fieldMatcher {
-	return traverseOneFAStep(table, 0, val, transitions)
+// faState is used by the valueMatcher automaton - every step through the
+// automaton requires a smallTable and for some of them, taking the step means you've matched a value and can
+// transition to a new fieldMatcher, in which case the fieldTransitions slice will be non-nil
+type faState struct {
+	table            *smallTable
+	fieldTransitions []*fieldMatcher
+}
+
+// struct wrapper to make this comparable to help with pack/unpack
+type faNext struct {
+	states []*faState
+}
+
+type transmap struct {
+	set map[*fieldMatcher]bool
 }
 
-func traverseOneFAStep(table *smallTable, index int, val []byte, transitions []*fieldMatcher) []*fieldMatcher {
-	var utf8Byte byte
-	switch {
-	case index < len(val):
-		utf8Byte = val[index]
-	case index == len(val):
-		utf8Byte = valueTerminator
-	default:
-		return transitions
+func (tm *transmap) add(fms []*fieldMatcher) {
+	for _, fm := range fms {
+		tm.set[fm] = true
 	}
-	nextSteps := table.step(utf8Byte)
-	if nextSteps == nil {
-		return transitions
+}
+
+func (tm *transmap) all() []*fieldMatcher {
+	var all []*fieldMatcher
+	for fm := range tm.set {
+		all = append(all, fm)
 	}
-	index++
-	// 1. Note no effort to traverse multiple next-steps in parallel. The traversal compute is tiny and the
-	//    necessary concurrency apparatus would almost certainly outweigh it
-	// 2. TODO: It would probably be better to implement this iteratively rather than recursively.
-	//    The recursion will potentially go as deep as the val argument is long.
-	for _, nextStep := range nextSteps.steps {
-		transitions = append(transitions, nextStep.fieldTransitions...)
-		transitions = traverseOneFAStep(nextStep.table, index, val, transitions)
+	return all
+}
+
+func traverseFA(table *smallTable, val []byte, transitions []*fieldMatcher, bufs *bufpair) []*fieldMatcher {
+	currentStates := bufs.buf1
+	currentStates = append(currentStates, &faState{table: table})
+	nextStates := bufs.buf2
+
+	// a lot of the transitions stuff is going to be empty, but on the other hand
+	// a * entry with a transition could end up getting added a lot.
+	newTransitions := &transmap{set: make(map[*fieldMatcher]bool, len(transitions))}
+	newTransitions.add(transitions)
+	stepResult := &stepOut{}
+	for index := 0; len(currentStates) != 0 && index <= len(val); index++ {
+		var utf8Byte byte
+		if index < len(val) {
+			utf8Byte = val[index]
+		} else {
+			utf8Byte = valueTerminator
+		}
+		for _, state := range currentStates {
+			state.table.step(utf8Byte, stepResult)
+			for _, nextStep := range stepResult.steps {
+				newTransitions.add(nextStep.fieldTransitions)
+				nextStates = append(nextStates, nextStep)
+			}
+			for _, nextStep := range stepResult.epsilon {
+				newTransitions.add(nextStep.fieldTransitions)
+				nextStates = append(nextStates, nextStep)
+			}
+		}
+		// re-use these
+		swapStates := currentStates
+		currentStates = nextStates
+		nextStates = swapStates[:0]
 	}
-	return transitions
+	bufs.buf1 = currentStates[:0]
+	bufs.buf2 = nextStates[:0]
+	return newTransitions.all()
+}
+
+type faStepKey struct {
+	step1 *faState
+	step2 *faState
 }
 
 // mergeFAs compute the union of two valueMatch automata.  If you look up the textbook theory about this,
@@ -39,21 +85,13 @@ func traverseOneFAStep(table *smallTable, index int, val []byte, transitions []*
 // minimal or even avoids being wasteful.
 // INVARIANT: neither argument is nil
 // INVARIANT: To be thread-safe, no existing table can be updated except when we're building it
-
-type faStepKey struct {
-	step1 *faState
-	step2 *faState
-}
-
-func mergeFAs(table1, table2 *smallTable) *smallTable {
+func mergeFAs(table1, table2 *smallTable, printer printer) *smallTable {
 	state1 := &faState{table: table1}
 	state2 := &faState{table: table2}
-	return mergeFAStates(state1, state2, make(map[faStepKey]*faState)).table
+	return mergeFAStates(state1, state2, make(map[faStepKey]*faState), printer).table
 }
 
-// TODO: maybe memoize these based on the string of characters you matched to get here?
-// TODO: recursion seems way too deep
-func mergeFAStates(state1, state2 *faState, keyMemo map[faStepKey]*faState) *faState {
+func mergeFAStates(state1, state2 *faState, keyMemo map[faStepKey]*faState, printer printer) *faState {
 	var combined *faState
 	mKey := faStepKey{state1, state2}
 	combined, ok := keyMemo[mKey]
@@ -65,7 +103,13 @@ func mergeFAStates(state1, state2 *faState, keyMemo map[faStepKey]*faState) *faS
 
 	fieldTransitions := append(state1.fieldTransitions, state2.fieldTransitions...)
 	combined = &faState{table: newTable, fieldTransitions: fieldTransitions}
-	//DEBUG combined.table.label = fmt.Sprintf("(%s ∎ %s)", state1.table.label, state2.table.label)
+
+	pretty, ok := printer.(*prettyPrinter)
+	if ok {
+		printer.labelTable(combined.table, fmt.Sprintf("%d∎%d", pretty.tableSerial(state1.table),
+			pretty.tableSerial(state2.table)))
+	}
+
 	keyMemo[mKey] = combined
 	u1 := unpackTable(state1.table)
 	u2 := unpackTable(state2.table)
@@ -74,34 +118,28 @@ func mergeFAStates(state1, state2 *faState, keyMemo map[faStepKey]*faState) *faS
 	for i, next1 := range u1 {
 		next2 := u2[i]
 		switch {
-		case next1 == nil && next2 == nil:
-			uComb[i] = nil
+		case next1 == next2:
+			uComb[i] = next1
 		case next1 != nil && next2 == nil:
 			uComb[i] = u1[i]
 		case next1 == nil && next2 != nil:
 			uComb[i] = u2[i]
 		case next1 != nil && next2 != nil:
-			//fmt.Printf("MERGE %s & %s i=%d d=%d: ", next1, next2, i, depth)
-			if next1 == next2 {
-				//	fmt.Println("n1 == n2")
-				uComb[i] = next1
-			} else if i > 0 && next1 == u1[i-1] && next2 == u2[i-1] {
+			if i > 0 && next1 == u1[i-1] && next2 == u2[i-1] {
 				uComb[i] = uComb[i-1]
-				//	fmt.Printf("SEQ %s\n", uComb[i].steps[0].table.shortDump())
 			} else {
-				//	fmt.Println("RECURSE!")
 				var comboNext []*faState
-				for _, nextStep1 := range next1.steps {
-					for _, nextStep2 := range next2.steps {
-						comboNext = append(comboNext, mergeFAStates(nextStep1, nextStep2, keyMemo))
+				for _, nextStep1 := range next1.states {
+					for _, nextStep2 := range next2.states {
+						comboNext = append(comboNext, mergeFAStates(nextStep1, nextStep2, keyMemo, printer))
 					}
 				}
-				uComb[i] = &faNext{steps: comboNext}
-				//DEBUG uComb[i].serial = *serial
+				uComb[i] = &faNext{states: comboNext}
 			}
 		}
 	}
 	combined.table.pack(&uComb)
+	combined.table.epsilon = append(state1.table.epsilon, state2.table.epsilon...)
 
 	return combined
 }
diff --git a/nfa_test.go b/nfa_test.go
index c75d8af..9cfd540 100644
--- a/nfa_test.go
+++ b/nfa_test.go
@@ -58,7 +58,7 @@ func TestFocusedMerge(t *testing.T) {
 
 	for _, shellStyle := range shellStyles {
 		str := `"` + shellStyle + `"`
-		automaton, matcher := makeShellStyleAutomaton([]byte(str), &nullPrinter{})
+		automaton, matcher := makeShellStyleFA([]byte(str), &nullPrinter{})
 		automata = append(automata, automaton)
 		matchers = append(matchers, matcher)
 	}
@@ -71,7 +71,7 @@ func TestFocusedMerge(t *testing.T) {
 
 	merged := newSmallTable()
 	for _, automaton := range automata {
-		merged = mergeFAs(merged, automaton)
+		merged = mergeFAs(merged, automaton, sharedNullPrinter)
 
 		s := statsAccum{
 			fmVisited: make(map[*fieldMatcher]bool),
@@ -82,26 +82,3 @@ func TestFocusedMerge(t *testing.T) {
 		fmt.Println(s.stStats())
 	}
 }
-
-func TestNFABasics(t *testing.T) {
-	aFoo, fFoo := makeStringFA([]byte("foo"), nil)
-	var matches []*fieldMatcher
-
-	matches = traverseOneFAStep(aFoo, 0, []byte("foo"), nil)
-	if len(matches) != 1 || matches[0] != fFoo {
-		t.Error("ouch no foo")
-	}
-	matches = traverseOneFAStep(aFoo, 0, []byte("foot"), nil)
-	if len(matches) != 0 {
-		t.Error("ouch yes foot")
-	}
-
-	aNotFoot, fNotFoot := makeMultiAnythingButFA([][]byte{[]byte("foot")})
-	notFeet := []string{"foo", "footy", "afoot", "xyz"}
-	for _, notFoot := range notFeet {
-		matches = traverseOneFAStep(aNotFoot, 0, []byte(notFoot), nil)
-		if len(matches) != 1 || matches[0] != fNotFoot {
-			t.Error("!foot miss: " + notFoot)
-		}
-	}
-}
diff --git a/prettyprinter.go b/prettyprinter.go
index bc12097..211f3b4 100644
--- a/prettyprinter.go
+++ b/prettyprinter.go
@@ -74,7 +74,7 @@ func (pp *prettyPrinter) printNFAStep(fas *faState, indent int, already map[*sma
 	s := " " + pp.printTable(t) + trailer
 	for _, step := range t.steps {
 		if step != nil {
-			for _, state := range step.steps {
+			for _, state := range step.states {
 				_, ok := already[state.table]
 				if !ok {
 					already[state.table] = true
@@ -91,6 +91,7 @@ func (pp *prettyPrinter) printTable(t *smallTable) string {
 	// each line is going to be a range like
 	// 'c' .. 'e' => %X
 	// lines where the *faNext is nil are omitted
+	// TODO: Post-nfa-rationalization, I don't think the whole defTrans thing is necessary any more?
 	var rows []string
 	unpacked := unpackTable(t)
 
@@ -99,6 +100,18 @@ func (pp *prettyPrinter) printTable(t *smallTable) string {
 
 	defTrans := unpacked[0]
 
+	// TODO: Try to generate an NFA with a state with multiple epsilons
+	if len(t.epsilon) != 0 {
+		fas := ""
+		for i, eps := range t.epsilon {
+			ep := &faNext{states: []*faState{eps}}
+			if i != 0 {
+				fas += ", "
+			}
+			fas += pp.nextString(ep)
+		}
+		rows = append(rows, "ε → "+fas)
+	}
 	for {
 		for b < len(unpacked) && unpacked[b] == nil {
 			b++
@@ -126,35 +139,33 @@ func (pp *prettyPrinter) printTable(t *smallTable) string {
 	label := pp.tableLabel(t)
 	if defTrans != nil {
 		dtString := "★ → " + pp.nextString(defTrans)
-		return fmt.Sprintf("%d [%s] ", serial, label) + strings.Join(rows, " / ") + " / " + dtString
+		return fmt.Sprintf("%d[%s] ", serial, label) + strings.Join(rows, " / ") + " / " + dtString
 	} else {
-		return fmt.Sprintf("%d [%s] ", serial, label) + strings.Join(rows, " / ")
+		return fmt.Sprintf("%d[%s] ", serial, label) + strings.Join(rows, " / ")
 	}
 }
 
 func (pp *prettyPrinter) nextString(n *faNext) string {
 	var snames []string
-	for _, step := range n.steps {
-		snames = append(snames, fmt.Sprintf("%d %s",
+	for _, step := range n.states {
+		snames = append(snames, fmt.Sprintf("%d[%s]",
 			pp.tableSerial(step.table), pp.tableLabel(step.table)))
 	}
-	return "[" + strings.Join(snames, " · ") + "]"
+	return strings.Join(snames, " · ")
 }
 
 func branchChar(b byte) string {
 	switch b {
 	// TODO: Figure out how to test commented-out cases
-	// case 0:
-	// 	return "∅"
+	case 0:
+		return "∅"
 	case valueTerminator:
 		return "ℵ"
-	// case byte(byteCeiling):
-	// 	return "♾️"
 	default:
 		return fmt.Sprintf("%c", b)
 	}
 }
 
 func (pp *prettyPrinter) shortPrintNFA(table *smallTable) string {
-	return fmt.Sprintf("%d-%s", pp.tableSerials[table], pp.tableLabels[table])
+	return fmt.Sprintf("%d[%s]", pp.tableSerials[table], pp.tableLabels[table])
 }
diff --git a/prettyprinter_test.go b/prettyprinter_test.go
index 43ebe5b..f96d12f 100644
--- a/prettyprinter_test.go
+++ b/prettyprinter_test.go
@@ -6,21 +6,21 @@ import (
 
 func TestPP(t *testing.T) {
 	pp := newPrettyPrinter(1)
-	table, _ := makeShellStyleAutomaton([]byte(`"x*9"`), pp)
+	table, _ := makeShellStyleFA([]byte(`"x*9"`), pp)
 	pp.labelTable(table, "START HERE")
-	wanted := ` 758 [START HERE] '"' → [910 on " at 0]
- 910 [on " at 0] 'x' → [821 gS at 2]
- 821 [gS at 2] '9' → [551 gX on 9 at 3] / ★ → [821 gS at 2]
- 551 [gX on 9 at 3] '"' → [937 on " at 4] / '9' → [551 gX on 9 at 3] / ★ → [821 gS at 2]
- 937 [on " at 4] '9' → [551 gX on 9 at 3] / 'ℵ' → [820 last step at 5] / ★ → [821 gS at 2]
- 820 [last step at 5]  [1 transition(s)]
+	wanted := ` 758[START HERE] '"' → 910[on " at 0]
+ 910[on " at 0] 'x' → 821[gS at 2]
+ 821[gS at 2] ε → 821[gS at 2] / '9' → 551[gX on 9 at 3]
+ 551[gX on 9 at 3] '"' → 937[on " at 4]
+ 937[on " at 4] 'ℵ' → 820[last step at 5]
+ 820[last step at 5]  [1 transition(s)]
 `
 	s := pp.printNFA(table)
 	if s != wanted {
 		t.Errorf("LONG: wanted\n<%s>\ngot\n<%s>\n", wanted, s)
 	}
-	if pp.shortPrintNFA(table) != "758-START HERE" {
-		t.Errorf("SHORT: wanted <%s> got <%s>\n", "758-START HERE", pp.shortPrintNFA(table))
+	if pp.shortPrintNFA(table) != "758[START HERE]" {
+		t.Errorf("SHORT: wanted <%s> got <%s>\n", "758[START HERE]", pp.shortPrintNFA(table))
 	}
 }
 
diff --git a/shell_style.go b/shell_style.go
index f12b0d1..706fbdd 100644
--- a/shell_style.go
+++ b/shell_style.go
@@ -6,6 +6,8 @@ import (
 	"fmt"
 )
 
+// TODO: remove the limitation of only one "*" in the pattern
+
 // readShellStyleSpecial parses a shellStyle object in a Pattern
 func readShellStyleSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []typedVal, err error) {
 	t, err := pb.jd.Token()
@@ -50,74 +52,49 @@ func readShellStyleSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []type
 	return
 }
 
-// makeShellStyleAutomaton - recognize a "-delimited string containing one '*' glob.
-func makeShellStyleAutomaton(val []byte, printer printer) (start *smallTable, nextField *fieldMatcher) {
+// makeShellStyleFA builds the finite automaton for a shellstyle value: a "-delimited string containing a single '*' glob.
+// That is equivalent to a regex whose only operator is one ".*"; once regular expressions are implemented, they can more or less replace this
+func makeShellStyleFA(val []byte, printer printer) (start *smallTable, nextField *fieldMatcher) {
 	table := newSmallTable()
 	start = table
 	nextField = newFieldMatcher()
 
 	// for each byte in the pattern
-	var globStep *faState = nil
-	var globExitStep *faState = nil
-	var globExitByte byte
-	i := 0
-	for i < len(val) {
-		ch := val[i]
+	valIndex := 0
+	for valIndex < len(val) {
+		ch := val[valIndex]
 		if ch == '*' {
 			// special-case handling for string ending in '*"' - transition to field match on any character.
-			//  we know the trailing '"' will be there because of JSON syntax.
-			if i == len(val)-2 {
-				step := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{nextField}}
-				table.setDefault(&faNext{steps: []*faState{step}})
-				printer.labelTable(table, fmt.Sprintf("prefix escape at %d", i))
+			// we know the trailing '"' will be there because of JSON syntax.  We could use an epsilon self-loop, as
+			// for an interior '*', but then the matcher would process all the remaining bytes when it doesn't need to
+			if valIndex == len(val)-2 {
+				step := &faState{
+					table:            newSmallTable(),
+					fieldTransitions: []*fieldMatcher{nextField},
+				}
+				table.epsilon = []*faState{step}
+				printer.labelTable(table, fmt.Sprintf("prefix escape at %d", valIndex))
 				return
 			}
+			globStep := &faState{table: table}
+			printer.labelTable(table, fmt.Sprintf("gS at %d", valIndex))
+			table.epsilon = []*faState{globStep}
 
-			// loop back on everything
-			globStep = &faState{table: table}
-			printer.labelTable(table, fmt.Sprintf("gS at %d", i))
-			table.setDefault(&faNext{steps: []*faState{globStep}})
-
-			// escape the glob on the next char from the pattern - remember the byte and the state escaped to
-			i++
-			globExitByte = val[i]
-			globExitStep = &faState{table: newSmallTable()}
-			printer.labelTable(globExitStep.table, fmt.Sprintf("gX on %c at %d", val[i], i))
-			// escape the glob
-			table.addByteStep(globExitByte, &faNext{steps: []*faState{globExitStep}})
-			table = globExitStep.table
+			valIndex++
+			globNext := &faState{table: newSmallTable()}
+			printer.labelTable(globNext.table, fmt.Sprintf("gX on %c at %d", val[valIndex], valIndex))
+			table.addByteStep(val[valIndex], &faNext{states: []*faState{globNext}})
+			table = globNext.table
 		} else {
 			nextStep := &faState{table: newSmallTable()}
-			printer.labelTable(nextStep.table, fmt.Sprintf("on %c at %d", val[i], i))
-
-			// we're going to move forward on 'ch'.  On anything else, we leave it at nil or - if we've passed
-			//  a glob, loop back to the glob stae.  if 'ch' is also the glob exit byte, also put in a transfer
-			//  back to the glob exist state
-			if globExitStep != nil {
-				table.setDefault(&faNext{steps: []*faState{globStep}})
-				if ch == globExitByte {
-					table.addByteStep(ch, &faNext{steps: []*faState{globExitStep, nextStep}})
-				} else {
-					table.addByteStep(globExitByte, &faNext{steps: []*faState{globExitStep}})
-					table.addByteStep(ch, &faNext{steps: []*faState{nextStep}})
-				}
-			} else {
-				table.addByteStep(ch, &faNext{steps: []*faState{nextStep}})
-			}
+			printer.labelTable(nextStep.table, fmt.Sprintf("on %c at %d", val[valIndex], valIndex))
+			table.addByteStep(ch, &faNext{states: []*faState{nextStep}})
 			table = nextStep.table
 		}
-		i++
+		valIndex++
 	}
-
 	lastStep := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{nextField}}
-	printer.labelTable(lastStep.table, fmt.Sprintf("last step at %d", i))
-	if globExitStep != nil {
-		table.setDefault(&faNext{steps: []*faState{globStep}})
-		table.addByteStep(globExitByte, &faNext{steps: []*faState{globExitStep}})
-		table.addByteStep(valueTerminator, &faNext{steps: []*faState{lastStep}})
-	} else {
-		table.addByteStep(valueTerminator, &faNext{steps: []*faState{lastStep}})
-	}
-	// fmt.Printf("new for [%s]: %s\n", string(val), printer.printNFA(start))
+	printer.labelTable(lastStep.table, fmt.Sprintf("last step at %d", valIndex))
+	table.addByteStep(valueTerminator, &faNext{states: []*faState{lastStep}})
 	return
 }
diff --git a/shell_style_test.go b/shell_style_test.go
index 36df3d9..35f86c9 100644
--- a/shell_style_test.go
+++ b/shell_style_test.go
@@ -5,6 +5,7 @@ import (
 	"math/rand"
 	"strings"
 	"testing"
+	"time"
 )
 
 func TestLongCase(t *testing.T) {
@@ -30,8 +31,7 @@ func TestLongCase(t *testing.T) {
 		}
 	}
 }
-
-func TestMakeShellStyleAutomaton(t *testing.T) {
+func TestMakeShellStyleFA(t *testing.T) {
 	patterns := []string{
 		`"*ST"`,
 		`"foo*"`,
@@ -58,20 +58,25 @@ func TestMakeShellStyleAutomaton(t *testing.T) {
 	}
 
 	for i, pattern := range patterns {
-		a, wanted := makeShellStyleAutomaton([]byte(pattern), &nullPrinter{})
+		a, wanted := makeShellStyleFA([]byte(pattern), sharedNullPrinter)
 		vm := newValueMatcher()
 		vmf := vmFields{startTable: a}
 		vm.update(&vmf)
+		var bufs bufpair
 		for _, should := range shouldsForPatterns[i] {
+			fmt.Println("for: " + should)
 			var transitions []*fieldMatcher
-			gotTrans := traverseFA(a, []byte(should), transitions)
+			gotTrans := traverseFA(a, []byte(should), transitions, &bufs)
+			if len(gotTrans) != 0 {
+				fmt.Println("FOO")
+			}
 			if len(gotTrans) != 1 || gotTrans[0] != wanted {
 				t.Errorf("Failure for %s on %s", pattern, should)
 			}
 		}
 		for _, shouldNot := range shouldNotForPatterns[i] {
 			var transitions []*fieldMatcher
-			gotTrans := traverseFA(a, []byte(shouldNot), transitions)
+			gotTrans := traverseFA(a, []byte(shouldNot), transitions, &bufs)
 			if gotTrans != nil {
 				t.Errorf("bogus match for %s on %s", pattern, shouldNot)
 			}
@@ -81,6 +86,7 @@ func TestMakeShellStyleAutomaton(t *testing.T) {
 
 func TestShellStyleBuildTime(t *testing.T) {
 	words := readWWords(t)
+	fmt.Printf("WC %d\n", len(words))
 	starWords := make([]string, 0, len(words))
 	patterns := make([]string, 0, len(words))
 	source := rand.NewSource(293591)
@@ -93,13 +99,32 @@ func TestShellStyleBuildTime(t *testing.T) {
 		patterns = append(patterns, pattern)
 	}
 	q, _ := New()
-	for i := 0; i < 21; i++ {
+	for i := range words {
 		err := q.AddPattern(starWords[i], patterns[i])
 		if err != nil {
 			t.Error("AddP: " + err.Error())
 		}
 	}
 	fmt.Println(matcherStats(q.matcher.(*coreMatcher)))
+	// make sure that all the words actually are matched, timing the loop to report match throughput
+	before := time.Now()
+	for _, word := range words {
+		record := fmt.Sprintf(`{"x": "%s"}`, word)
+		matches, err := q.MatchesForEvent([]byte(record))
+		if err != nil {
+			t.Error("M4E on " + string(word))
+		}
+		if len(matches) == 0 {
+			t.Error("no matches for " + string(word))
+		}
+		if len(matches) > 1 {
+			fmt.Printf("%d matches for %s\n", len(matches), word)
+		}
+	}
+	// Seconds() keeps fractional precision; whole milliseconds would truncate and could yield a zero divisor
+	elapsed := time.Since(before).Seconds()
+	eps := float64(len(words)) / elapsed
+	fmt.Printf("Huge-machine events/sec: %.1f\n", eps)
 }
 
 func TestMixedPatterns(t *testing.T) {
 
 func TestMixedPatterns(t *testing.T) {
diff --git a/small_table.go b/small_table.go
index 2612ba8..9b097c6 100644
--- a/small_table.go
+++ b/small_table.go
@@ -1,19 +1,5 @@
 package quamina
 
-// faState is used by the valueMatcher automaton - every step through the
-// automaton requires a smallTable and for some of them, taking the step means you've matched a value and can
-// transition to a new fieldMatcher, in which case the fieldTransitions slice will be non-nil
-type faState struct {
-	table            *smallTable
-	fieldTransitions []*fieldMatcher
-}
-
-// struct wrapper to make this comparable to help with pack/unpack
-type faNext struct {
-	// serial int // very useful in debugging table construction
-	steps []*faState
-}
-
 // byteCeiling - the automaton runs on UTF-8 bytes, which map nicely to Go's byte, which is uint8. The values
 // 0xF5-0xFF can't appear in UTF-8 strings. We use 0xF5 as a value terminator, so characters F6 and higher
 // can't appear.
@@ -34,7 +20,7 @@ const valueTerminator byte = 0xf5
 // byte values 3 and 4 map to ss1 and byte 0x34 maps to ss2.  Then the smallTable would look like:
 //
 //	ceilings:--|3|----|5|-|0x34|--|x35|-|byteCeiling|
-//	steps:---|nil|-|&ss1|--|nil|-|&ss2|---------|nil|
+//	steps:---|nil|-|&ss1|--|nil|-|&ss2|---------|nil|
 //	invariant: The last element of ceilings is always byteCeiling
 //
 // The motivation is that we want to build a state machine on byte values to implement things like prefixes and
@@ -44,10 +30,13 @@ const valueTerminator byte = 0xf5
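-// small even in large automata, so skipping throgh the ceilings list is measurably about the same speed as a map
+// small even in large automata, so skipping through the ceilings list is measurably about the same speed as a map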
 // or array construct. One could imagine making step() smarter and do a binary search in the case where there are
 // more than some number of entries. But I'm dubious, the ceilings field is []byte and running through a single-digit
-// number of those has a good chance of minimizing memory fetches
+// number of those has a good chance of minimizing memory fetches.
+// Since this is used to support nondeterministic finite automata (NFAs), it is possible for a state
+// to have epsilon transitions, i.e. a transition that is always taken whatever the next input symbol is.
 type smallTable struct {
 	ceilings []byte
 	steps    []*faNext
+	epsilon  []*faState
 }
 
 // newSmallTable mostly exists to enforce the constraint that every smallTable has a byteCeiling entry at
@@ -59,11 +48,26 @@ func newSmallTable() *smallTable {
 	}
 }
 
-// step finds the member of steps in the smallTable that corresponds to the utf8Byte argument. It may return nil.
-func (t *smallTable) step(utf8Byte byte) *faNext {
+type stepOut struct {
+	steps   []*faState
+	epsilon []*faState
+}
+
+// step finds the list of states that result from a transition on the utf8Byte argument. The states can come
+// as a result of looking in the table structure, and also the "epsilon" transitions that occur on every
+// input byte.  Since this is the white-hot center of Quamina's runtime CPU, we don't want to be merging
+// the two lists. So to avoid any memory allocation, the caller passes in a structure with the two lists
+// and step fills them in.
+func (t *smallTable) step(utf8Byte byte, out *stepOut) {
+	out.epsilon = t.epsilon
 	for index, ceiling := range t.ceilings {
 		if utf8Byte < ceiling {
-			return t.steps[index]
+			if t.steps[index] == nil {
+				out.steps = nil
+			} else {
+				out.steps = t.steps[index].states
+			}
+			return
 		}
 	}
 	panic("Malformed smallTable")
@@ -96,7 +100,7 @@ func makeSmallTable(defaultStep *faNext, indices []byte, steps []*faNext) *small
 	return &t
 }
 
-// unpackedTable replicates the data in the smallTable ceilings and steps arrays.  It's quite hard to
+// unpackedTable replicates the data in the smallTable ceilings and steps arrays.  It's quite hard to
 // update the list structure in a smallTable, but trivial in an unpackedTable.  The idea is that to update
 // a smallTable you unpack it, update, then re-pack it.  Not gonna be the most efficient thing so at some future point…
 // TODO: Figure out how to update a smallTable in place
@@ -137,22 +141,3 @@ func (t *smallTable) addByteStep(utf8Byte byte, step *faNext) {
 	unpacked[utf8Byte] = step
 	t.pack(unpacked)
 }
-
-// setDefault sets all the values of the table to the provided faNext pointer
-// TODO: Do we need this at all? Maybe just a variant of newSmallTable?
-func (t *smallTable) setDefault(s *faNext) {
-	t.steps = []*faNext{s}
-	t.ceilings = []byte{byte(byteCeiling)}
-}
-
-// Debugging from here down
-/*
-// addRangeSteps not currently used but think it will be useful in future regex-y work
-func (t *smallTable) addRangeSteps(floor int, ceiling int, s *faNext) {
-	unpacked := unpackTable(t)
-	for i := floor; i < ceiling; i++ {
-		unpacked[i] = s
-	}
-	t.pack(unpacked)
-}
-*/
diff --git a/small_table_test.go b/small_table_test.go
index e4ce2a0..e9c2324 100644
--- a/small_table_test.go
+++ b/small_table_test.go
@@ -43,7 +43,7 @@ func TestUnpack(t *testing.T) {
 		table:            st1,
 		fieldTransitions: nil,
 	}
-	nextStep := faNext{steps: []*faState{&nextState}}
+	nextStep := faNext{states: []*faState{&nextState}}
 
 	st := smallTable{
 		ceilings: []uint8{2, 3, byte(byteCeiling)},
@@ -127,7 +127,7 @@ func fuzzPack(t *testing.T, seed int64) {
 		if c != packed.ceilings[i] {
 			t.Errorf("seed %d ceilings differ at %d wanted %d got %d", seed, i, c, packed.ceilings[i])
 		}
-		if packed.steps[i] != rePacked.steps[i] {
+		if packed.steps[i] != rePacked.steps[i] {
 			t.Errorf("seed %d ssteps differ at %d", seed, i)
 		}
 	}
diff --git a/stats.go b/stats.go
index cc9d3aa..c9a1275 100644
--- a/stats.go
+++ b/stats.go
@@ -15,6 +15,8 @@ type statsAccum struct {
 	stTblCount int
 	stEntries  int
 	stMax      int
+	stEpsilon  int
+	stEpMax    int
 	stVisited  map[*smallTable]bool
 	siCount    int
 }
@@ -38,12 +40,17 @@ func matcherStats(m *coreMatcher) string {
 	fmStats(m.fields().state, &s)
 	avgFmSize := fmt.Sprintf("%.3f", float64(s.fmEntries)/float64(s.fmTblCount))
 	avgStSize := "n/a"
+	avgEpSize := "n/a"
 	if s.stTblCount > 0 {
 		avgStSize = fmt.Sprintf("%.3f", float64(s.stEntries)/float64(s.stTblCount))
 	}
+	if s.stEpsilon > 0 {
+		avgEpSize = fmt.Sprintf("%.3f", float64(s.stEpsilon)/float64(s.stTblCount))
+	}
 	fmPart := fmt.Sprintf("Field matchers: %d (avg size %s, max %d)", s.fmCount, avgFmSize, s.fmMax)
 	vmPart := fmt.Sprintf("Value matchers: %d", s.vmCount)
-	stPart := fmt.Sprintf("SmallTables %d (unique %d, avg size %s, max %d), singletons %d", s.stCount, len(s.stVisited), avgStSize, s.stMax, s.siCount)
+	stPart := fmt.Sprintf("SmallTables %d (unique %d, avg %s, max %d, epsilon avg %s, max %d) singletons %d",
+		s.stCount, len(s.stVisited), avgStSize, s.stMax, avgEpSize, s.stEpMax, s.siCount)
 
 	return fmPart + "\n" + vmPart + "\n" + stPart
 }
@@ -74,7 +81,7 @@ func vmStats(m *valueMatcher, s *statsAccum) {
 	}
 	s.vmVisited[m] = true
 	s.vmCount++
-	state := m.getFields()
+	state := m.fields()
 	if state.singletonMatch != nil {
 		s.siCount++
 		fmStats(state.singletonTransition, s)
@@ -97,10 +104,14 @@ func faStats(t *smallTable, s *statsAccum) {
 		}
 		s.stTblCount++
 		s.stEntries += len(t.ceilings)
+		s.stEpsilon += len(t.epsilon)
+		if len(t.epsilon) > s.stEpMax {
+			s.stEpMax = len(t.epsilon)
+		}
 	}
 	for _, next := range t.steps {
 		if next != nil {
-			for _, step := range next.steps {
+			for _, step := range next.states {
 				if step.fieldTransitions != nil {
 					for _, m := range step.fieldTransitions {
 						fmStats(m, s)
diff --git a/value_matcher.go b/value_matcher.go
index 4c13179..bc1b1e8 100644
--- a/value_matcher.go
+++ b/value_matcher.go
@@ -5,6 +5,10 @@ import (
 	"sync/atomic"
 )
 
+type bufpair struct {
+	buf1, buf2 []*faState
+}
+
 // valueMatcher represents a byte-driven finite automaton (FA).  The table needs to be the
 // equivalent of a map[byte]nextState and is represented by smallTable.
 // In this implementation all the FAs are nondeterministic, which means each
@@ -27,7 +31,7 @@ type vmFields struct {
 	singletonTransition *fieldMatcher
 }
 
-func (m *valueMatcher) getFields() *vmFields {
+func (m *valueMatcher) fields() *vmFields {
 	return m.updateable.Load().(*vmFields)
 }
 
@@ -47,10 +51,10 @@ func newValueMatcher() *valueMatcher {
 	return &vm
 }
 
-func (m *valueMatcher) transitionOn(val []byte) []*fieldMatcher {
+func (m *valueMatcher) transitionOn(val []byte, bufs *bufpair) []*fieldMatcher {
 	var transitions []*fieldMatcher
 
-	fields := m.getFields()
+	fields := m.fields()
 
 	switch {
 	case fields.singletonMatch != nil:
@@ -64,7 +68,7 @@ func (m *valueMatcher) transitionOn(val []byte) []*fieldMatcher {
 		return transitions
 
 	case fields.startTable != nil:
-		return traverseFA(fields.startTable, val, transitions)
+		return traverseFA(fields.startTable, val, transitions, bufs)
 
 	default:
 		// no FA, no singleton, nothing to do, this probably can't happen because a flattener
@@ -87,13 +91,13 @@ func (m *valueMatcher) addTransition(val typedVal, printer printer) *fieldMatche
 		case anythingButType:
 			newFA, nextField = makeMultiAnythingButFA(val.list)
 		case shellStyleType:
-			newFA, nextField = makeShellStyleAutomaton(valBytes, printer)
+			newFA, nextField = makeShellStyleFA(valBytes, printer)
 		case prefixType:
-			newFA, nextField = makePrefixAutomaton(valBytes)
+			newFA, nextField = makePrefixFA(valBytes)
 		default:
 			panic("unknown value type")
 		}
-		fields.startTable = mergeFAs(fields.startTable, newFA)
+		fields.startTable = mergeFAs(fields.startTable, newFA, sharedNullPrinter)
 		m.update(fields)
 		return nextField
 	}
@@ -115,12 +119,12 @@ func (m *valueMatcher) addTransition(val typedVal, printer printer) *fieldMatche
 			m.update(fields)
 			return nextField
 		case shellStyleType:
-			newAutomaton, nextField := makeShellStyleAutomaton(valBytes, &nullPrinter{})
+			newAutomaton, nextField := makeShellStyleFA(valBytes, printer)
 			fields.startTable = newAutomaton
 			m.update(fields)
 			return nextField
 		case prefixType:
-			newFA, nextField := makePrefixAutomaton(valBytes)
+			newFA, nextField := makePrefixFA(valBytes)
 			fields.startTable = newFA
 			m.update(fields)
 			return nextField
@@ -147,28 +151,29 @@ func (m *valueMatcher) addTransition(val typedVal, printer printer) *fieldMatche
 	case anythingButType:
 		newFA, nextField = makeMultiAnythingButFA(val.list)
 	case shellStyleType:
-		newFA, nextField = makeShellStyleAutomaton(valBytes, &nullPrinter{})
+		newFA, nextField = makeShellStyleFA(valBytes, printer)
 	case prefixType:
-		newFA, nextField = makePrefixAutomaton(valBytes)
+		newFA, nextField = makePrefixFA(valBytes)
 	default:
 		panic("unknown value type")
 	}
 
 	// now table is ready for use, nuke singleton to signal threads to use it
-	fields.startTable = mergeFAs(singletonAutomaton, newFA)
-	// fmt.Println("Merged: " + fields.startTable.dump())
+	fields.startTable = mergeFAs(singletonAutomaton, newFA, sharedNullPrinter)
 	fields.singletonMatch = nil
 	fields.singletonTransition = nil
 	m.update(fields)
 	return nextField
 }
 
-func makePrefixAutomaton(val []byte) (*smallTable, *fieldMatcher) {
+// TODO: make these simple FA builders iterative not recursive, this will recurse as deep as the longest string match
+
+func makePrefixFA(val []byte) (*smallTable, *fieldMatcher) {
 	nextField := newFieldMatcher()
-	return onePrefixStep(val, 0, nextField), nextField
+	return makeOnePrefixFAStep(val, 0, nextField), nextField
 }
 
-func onePrefixStep(val []byte, index int, nextField *fieldMatcher) *smallTable {
+func makeOnePrefixFAStep(val []byte, index int, nextField *fieldMatcher) *smallTable {
 	var nextStep *faNext
 
 	// have to stop one short to skip the closing "
@@ -177,9 +182,9 @@ func onePrefixStep(val []byte, index int, nextField *fieldMatcher) *smallTable {
 	if index == len(val)-2 {
 		nextState = &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{nextField}}
 	} else {
-		nextState = &faState{table: onePrefixStep(val, index+1, nextField)}
+		nextState = &faState{table: makeOnePrefixFAStep(val, index+1, nextField)}
 	}
-	nextStep = &faNext{steps: []*faState{nextState}}
+	nextStep = &faNext{states: []*faState{nextState}}
 	return makeSmallTable(nil, []byte{val[index]}, []*faNext{nextStep})
 }
 
@@ -196,27 +201,26 @@ func makeStringFA(val []byte, useThisTransition *fieldMatcher) (*smallTable, *fi
 		nextField = newFieldMatcher()
 	}
 
-	return makeOneFAStep(val, 0, nextField), nextField
+	return makeOneStringFAStep(val, 0, nextField), nextField
 }
 
-func makeOneFAStep(val []byte, index int, nextField *fieldMatcher) *smallTable {
+func makeOneStringFAStep(val []byte, index int, nextField *fieldMatcher) *smallTable {
 	var nextStepList *faNext
 	if index == len(val)-1 {
 		lastStep := &faState{
 			table:            newSmallTable(),
 			fieldTransitions: []*fieldMatcher{nextField},
 		}
-		lastStepList := &faNext{steps: []*faState{lastStep}}
+		lastStepList := &faNext{states: []*faState{lastStep}}
 		nextStep := &faState{
 			table: makeSmallTable(nil, []byte{valueTerminator}, []*faNext{lastStepList}),
 		}
-		nextStepList = &faNext{steps: []*faState{nextStep}}
+		nextStepList = &faNext{states: []*faState{nextStep}}
 	} else {
-		nextStep := &faState{table: makeOneFAStep(val, index+1, nextField)}
-		nextStepList = &faNext{steps: []*faState{nextStep}}
+		nextStep := &faState{table: makeOneStringFAStep(val, index+1, nextField)}
+		nextStepList = &faNext{states: []*faState{nextStep}}
 	}
 	var u unpackedTable
 	u[val[index]] = nextStepList
-	// return stepper.buildTable(&u)
 	return makeSmallTable(nil, []byte{val[index]}, []*faNext{nextStepList})
 }
diff --git a/value_matcher_test.go b/value_matcher_test.go
index 30eff72..deab13d 100644
--- a/value_matcher_test.go
+++ b/value_matcher_test.go
@@ -41,7 +41,7 @@ func addInvalid(t *testing.T, before []typedVal) {
 
 func TestNoOpTransition(t *testing.T) {
 	vm := newValueMatcher()
-	tr := vm.transitionOn([]byte("foo"))
+	tr := vm.transitionOn([]byte("foo"), &bufpair{})
 	if len(tr) != 0 {
 		t.Error("matched on empty valuematcher")
 	}
@@ -57,7 +57,7 @@ func TestAddTransition(t *testing.T) {
 	if t1 == nil {
 		t.Error("nil addTrans")
 	}
-	t1x := m.transitionOn([]byte("one"))
+	t1x := m.transitionOn([]byte("one"), &bufpair{})
 	if len(t1x) != 1 || t1x[0] != t1 {
 		t.Error("Retrieve failed")
 	}
@@ -73,11 +73,11 @@ func TestAddTransition(t *testing.T) {
 	}
 	t2 := m.addTransition(v2, &nullPrinter{})
 
-	t2x := m.transitionOn([]byte("two"))
+	t2x := m.transitionOn([]byte("two"), &bufpair{})
 	if len(t2x) != 1 || t2x[0] != t2 {
 		t.Error("trans failed T2")
 	}
-	t1x = m.transitionOn([]byte("one"))
+	t1x = m.transitionOn([]byte("one"), &bufpair{})
 	if len(t1x) != 1 || t1x[0] != t1 {
 		t.Error("Retrieve failed")
 	}
@@ -86,15 +86,15 @@ func TestAddTransition(t *testing.T) {
 		val:   "three",
 	}
 	t3 := m.addTransition(v3, &nullPrinter{})
-	t3x := m.transitionOn([]byte("three"))
+	t3x := m.transitionOn([]byte("three"), &bufpair{})
 	if len(t3x) != 1 || t3x[0] != t3 {
 		t.Error("Match failed T3")
 	}
-	t2x = m.transitionOn([]byte("two"))
+	t2x = m.transitionOn([]byte("two"), &bufpair{})
 	if len(t2x) != 1 || t2x[0] != t2 {
 		t.Error("trans failed T2")
 	}
-	t1x = m.transitionOn([]byte("one"))
+	t1x = m.transitionOn([]byte("one"), &bufpair{})
 	if len(t1x) != 1 || t1x[0] != t1 {
 		t.Error("Retrieve failed")
 	}