Skip to content

Commit

Permalink
numbits+base128 8-byte full-precision numbers (#349)
Browse files Browse the repository at this point in the history
* numbits+base128 8-byte full-precision numbers

Signed-off-by: Tim Bray <[email protected]>

* fix up comments and README

Signed-off-by: Tim Bray <[email protected]>

* make UTF-8 version of numbits variable-length

Signed-off-by: Tim Bray <[email protected]>

* Address feedback from Arne, rewrite varwidth numbits

Signed-off-by: Tim Bray <[email protected]>

* fix lint, use latest numbits.go

Signed-off-by: Tim Bray <[email protected]>

---------

Signed-off-by: Tim Bray <[email protected]>
  • Loading branch information
timbray authored Aug 28, 2024
1 parent be1752d commit cd8d31a
Show file tree
Hide file tree
Showing 12 changed files with 756 additions and 703 deletions.
12 changes: 2 additions & 10 deletions PATTERNS.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,16 +58,8 @@ Thus, the following Pattern would match both JSON events above:

### Numeric Values

It would be convenient if Quamina knew, for matching purposes, that 35,
35.00, and 3.5e1 were all the same number.

In many cases, Quamina can manage this. Specifically, for numbers that:

* are between -5.0e9 and 5.0e9 inclusive.
* have five or fewer fractional digits.

Numbers which do not meet these criteria will be treated as strings, which
usually produces good results.
Quamina can match numeric values with precision and range exactly the same as that provided by
Go's `float64` data type, which is said to conform to IEEE 754 `binary64`.

## Extended Patterns
An **Extended Pattern** **MUST** be a JSON object containing
Expand Down
5 changes: 2 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,6 @@ The `"exists":true` and `"exists":false` patterns
have corner cases; details are covered in
[Patterns in Quamina](PATTERNS.md).

Quamina can match numeric values correctly, subject to
certain limits; details are in [Patterns in Quamina](PATTERNS.md).

## Flattening and Matching

The first step in finding matches for an Event is
Expand Down Expand Up @@ -386,3 +383,5 @@ colonies before slavery was abolished.
@embano1: CI/CD and project structure.

@yosiat: Flattening optimization.

@arnehormann: compact high-precision number representation.
942 changes: 471 additions & 471 deletions case_folding.go

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion core_matcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,13 @@ func (m *coreMatcher) deletePatterns(_ X) error {
// This is a leftover from previous times, is only used by tests, but it's used by a *lot*
// and it's a convenient API for testing.
func (m *coreMatcher) matchesForJSONEvent(event []byte) ([]X, error) {
fields, _ := newJSONFlattener().Flatten(event, m.getSegmentsTreeTracker())
return m.matchesForJSONWithFlattener(event, newJSONFlattener())
}

// matchesForJSONWithFlattener flattens event using the caller-supplied Flattener f and returns
// whatever matchesForFields reports for the resulting fields.
// If your test is a benchmark, call newJSONFlattener once and pass the result to this routine,
// because newJSONFlattener() is fairly heavyweight and you want it out of the benchmark loop.
func (m *coreMatcher) matchesForJSONWithFlattener(event []byte, f Flattener) ([]X, error) {
// NOTE: the error returned by Flatten is discarded; only the fields are passed on.
fields, _ := f.Flatten(event, m.getSegmentsTreeTracker())
return m.matchesForFields(fields)
}

Expand Down
67 changes: 27 additions & 40 deletions flatten_json.go
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ func (fj *flattenJSON) readObject(pathNode SegmentsTreeTracker) error {
}

var val []byte
isQNumber := false
isNumber := false
switch ch {
case '"':
if fj.skipping > 0 || !memberIsUsed {
Expand All @@ -233,7 +233,10 @@ func (fj *flattenJSON) readObject(pathNode SegmentsTreeTracker) error {
val, err = fj.readLiteral(nullBytes)
isLeaf = true
case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
val, isQNumber, err = fj.readNumber()
val, err = fj.readNumber()
if err == nil {
isNumber = true
}
isLeaf = true
case '[':
if !pathNode.IsSegmentUsed(memberName) {
Expand Down Expand Up @@ -296,7 +299,7 @@ func (fj *flattenJSON) readObject(pathNode SegmentsTreeTracker) error {
}
if val != nil {
if memberIsUsed {
fj.storeObjectMemberField(pathNode.PathForSegment(memberName), arrayTrail, val, isQNumber)
fj.storeObjectMemberField(pathNode.PathForSegment(memberName), arrayTrail, val, isNumber)
fieldsCount--
}
}
Expand Down Expand Up @@ -340,7 +343,7 @@ func (fj *flattenJSON) readArray(pathName []byte, pathNode SegmentsTreeTracker)
for {
ch := fj.ch()
var val []byte // resets on each loop
isQNumber := false
isNumber := false
switch state {
case fjInArrayState:
// bypass space before element value. A bit klunky but allows for immense simplification
Expand All @@ -365,7 +368,10 @@ func (fj *flattenJSON) readArray(pathName []byte, pathNode SegmentsTreeTracker)
val, err = fj.readLiteral(nullBytes)
isLeaf = true
case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
val, isQNumber, err = fj.readNumber()
val, err = fj.readNumber()
if err == nil {
isNumber = true
}
isLeaf = true
case '{':
if fj.skipping == 0 {
Expand Down Expand Up @@ -398,7 +404,7 @@ func (fj *flattenJSON) readArray(pathName []byte, pathNode SegmentsTreeTracker)
if val != nil {
if fj.skipping == 0 {
fj.stepOneArrayElement()
fj.storeArrayElementField(pathName, val, isQNumber)
fj.storeArrayElementField(pathName, val, isNumber)
}
}
state = fjAfterValueState
Expand Down Expand Up @@ -427,13 +433,10 @@ func (fj *flattenJSON) readArray(pathName []byte, pathNode SegmentsTreeTracker)
* these higher-level funcs are going to advance the pointer after each invocation
*/

func (fj *flattenJSON) readNumber() ([]byte, bool, error) {
func (fj *flattenJSON) readNumber() ([]byte, error) {
// points at the first character in the number
numStart := fj.eventIndex
state := fjNumberStartState
isQNumber := false
fracStart := 0
expStart := 0
for {
ch := fj.ch()
switch state {
Expand All @@ -450,38 +453,33 @@ func (fj *flattenJSON) readNumber() ([]byte, bool, error) {
// no-op
case '.':
state = fjNumberFracState
fracStart = fj.eventIndex + 1
case 'e', 'E':
state = fjNumberAfterEState
expStart = fj.eventIndex + 1
case ',', ']', '}', ' ', '\t', '\n', '\r':
fj.eventIndex--
return fj.event[numStart : fj.eventIndex+1], true, nil
return fj.event[numStart : fj.eventIndex+1], nil
default:
return nil, false, fj.error(fmt.Sprintf("illegal char '%c' in number", ch))
return nil, fj.error(fmt.Sprintf("illegal char '%c' in number", ch))
}
case fjNumberFracState:
switch ch {
case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
// no-op
case ',', ']', '}', ' ', '\t', '\n', '\r':
fractionalDigits := (expStart - 1) - fracStart
isQNumber = fractionalDigits <= MaxFractionalDigits
fj.eventIndex--
bytes := fj.event[numStart : fj.eventIndex+1]
return bytes, isQNumber, nil
return bytes, nil
case 'e', 'E':
state = fjNumberAfterEState
expStart = fj.eventIndex + 1
default:
return nil, false, fj.error(fmt.Sprintf("illegal char '%c' in number", ch))
return nil, fj.error(fmt.Sprintf("illegal char '%c' in number", ch))
}
case fjNumberAfterEState:
switch ch {
case '-', '1', '2', '3', '4', '5', '6', '7', '8', '9':
// no-op
default:
return nil, false, fj.error(fmt.Sprintf("illegal char '%c' after 'e' in number", ch))
return nil, fj.error(fmt.Sprintf("illegal char '%c' after 'e' in number", ch))
}
state = fjNumberExpState

Expand All @@ -490,27 +488,14 @@ func (fj *flattenJSON) readNumber() ([]byte, bool, error) {
case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
// no-op
case ',', ']', '}', ' ', '\t', '\n', '\r':
fractionalDigits := 0
if fracStart != 0 {
fractionalDigits = (expStart - 1) - fracStart
if fractionalDigits > MaxFractionalDigits {
if expStart != 0 {
exp, err := strconv.ParseInt(string(fj.event[expStart:fj.eventIndex]), 10, 32)
if err == nil {
fractionalDigits -= int(exp)
}
}
}
}
isQNumber = fractionalDigits <= MaxFractionalDigits
fj.eventIndex--
return fj.event[numStart : fj.eventIndex+1], isQNumber, nil
return fj.event[numStart : fj.eventIndex+1], nil
default:
return nil, false, fj.error(fmt.Sprintf("illegal char '%c' in exponent", ch))
return nil, fj.error(fmt.Sprintf("illegal char '%c' in exponent", ch))
}
}
if fj.step() != nil {
return nil, false, fj.error("event truncated in number")
return nil, fj.error("event truncated in number")
}
}
}
Expand Down Expand Up @@ -811,6 +796,8 @@ func (fj *flattenJSON) readHexUTF16(from int) ([]byte, int, error) {
if hexDigitCount == 4 {
hexString := string(fj.event[from-3 : from+1])
r, _ := strconv.ParseUint(hexString, 16, 16)
// parsing 4 hex digits can't overflow a uint16
//nolint:gosec
codepoints = append(codepoints, uint16(r))
state = fjStartEscapeState
}
Expand All @@ -831,14 +818,14 @@ func (fj *flattenJSON) readHexUTF16(from int) ([]byte, int, error) {
// its own snapshot of the array-trail data, because it'll be different for each array element
// NOTE: The profiler says this is the most expensive function in the whole matchesForJSONEvent universe, presumably
// because of the necessity to construct a new arrayTrail for each element.
func (fj *flattenJSON) storeArrayElementField(path []byte, val []byte, isQNumber bool) {
f := Field{Path: path, ArrayTrail: make([]ArrayPos, len(fj.arrayTrail)), Val: val, IsQNumber: isQNumber}
func (fj *flattenJSON) storeArrayElementField(path []byte, val []byte, isNumber bool) {
f := Field{Path: path, ArrayTrail: make([]ArrayPos, len(fj.arrayTrail)), Val: val, IsNumber: isNumber}
copy(f.ArrayTrail, fj.arrayTrail)
fj.fields = append(fj.fields, f)
}

func (fj *flattenJSON) storeObjectMemberField(path []byte, arrayTrail []ArrayPos, val []byte, isQNumber bool) {
fj.fields = append(fj.fields, Field{Path: path, ArrayTrail: arrayTrail, Val: val, IsQNumber: isQNumber})
func (fj *flattenJSON) storeObjectMemberField(path []byte, arrayTrail []ArrayPos, val []byte, isNumber bool) {
fj.fields = append(fj.fields, Field{Path: path, ArrayTrail: arrayTrail, Val: val, IsNumber: isNumber})
}

func (fj *flattenJSON) enterArray() {
Expand Down
2 changes: 1 addition & 1 deletion flattener.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,5 +58,5 @@ type Field struct {
Path []byte
Val []byte
ArrayTrail []ArrayPos
IsQNumber bool
IsNumber bool
}
113 changes: 23 additions & 90 deletions numbers.go
Original file line number Diff line number Diff line change
@@ -1,121 +1,54 @@
package quamina

import (
"encoding/binary"
"errors"
"fmt"
"strconv"
)

// You can't easily build automata to compare numbers based on either the decimal notation found
// in text data or the internal floating-point bits. Therefore, for a restricted subset of numbers,
// we define a 7-byte (14 hex digit) representation that facilitates building automata to support
// equality and ordering comparison.
//
// The representation supports 10**15 numbers. The first three are:
// decimal: -5_000_000_000, -4_999_999_999.99999, -4_999_999_999.99998, ...
// 14-byte: 00000000000000, 00000000000009, 00000000000014
// and the last three are
// decimal: .., 4_999_999_999.99998, 4_999_999_999.99999, 5_000_000_000
// 14-byte: 2386F26FC0FFEC, 2386F26FC0FFF6, 2386F26FC10000
//
// In English: all numbers that are between negative and positive 5 billion inclusive, with up to five
// digits after the decimal point.
// These numbers have fifteen decimal digits of precision, which is what double floats can offer.
// They include most numbers that are used in practice, including prices, occurrence counts, size
// measurements, and so on.
// Examples of numbers that do NOT meet these criteria include AWS account numbers, some telephone
// numbers, and cryptographic keys/signatures. For these, treatment as strings seems to produce
// satisfactory results for equality testing.
// in text data or the internal floating-point bits. Therefore, we map floating-point numbers
// (which is what JSON numbers basically are) to comparable slices of 7-bit bytes which preserve the
// numbers' ordering. Versions of Quamina up to 1.3 used a home-grown format which used 14 hex digits
// to represent a subset of numbers. This has now been replaced by Arne Hormann's "numbits"
// construct, see numbits.go. It uses up to 10 base128 bytes to represent the entire range of float64 numbers.
// Both this file and numbits.go are very short, but I'm keeping them separated because someone might
// figure out a still-better serialization of numbers and then this part wouldn't have to change.
// In Quamina these are called "Q numbers".
// How It's Done

// There is considerable effort to track, at the NFA level, which NFAs are built to match field values
// that are Q numbers; see vmFields.hasQNumbers. Similarly, the JSONFlattener, since it has to
// that are Q numbers; see vmFields.hasNumbers. Similarly, the JSONFlattener, since it has to
// look at all the digits in a number in order to parse it, can keep track of whether it can be made
// a Q number. The key benefit of this is in valueMatcher.transitionOn, which incurs the cost of
// making a Q number only if it is known that the valueMatcher's NFA can benefit from it and
// that the number in the incoming event can in fact be made a Q number.

const (
TenE6 = 1e6
FiveBillion = 5e9
Hexes = "0123456789ABCDEF"
MaxFractionalDigits = 5
)

type qNumber []byte

// qNumFromBytes works out whether a string representing a number falls within the
// limits imposed for Q numbers. It is heavily optimized and relies on the form
// of the number already having been validated, e.g. by flattenJSON().
func qNumFromBytes(bytes []byte) (qNumber, error) {
// shortcut: The shortest number with more than 5 fractional digits is like 0.123456
if len(bytes) < 8 {
numeric, err := strconv.ParseFloat(string(bytes), 64)
if err != nil {
return nil, errors.New("not a float") // should never happen, json parser upstream
}
return qNumFromFloat(numeric)
}
// compute number of fractional digits. The loop below relies on the fact that anything between '.' and either
// 'e' or the end of the string must be a digit, as must anything between 'e' and the end of the string.
// NOTE: This will be fooled by "35.000000"
fracStart := 0
expStart := 0
index := 0
var utf8Byte byte
fractionalDigits := 0
ForEachByte:
for index, utf8Byte = range bytes {
switch utf8Byte {
case '.':
fracStart = index + 1
case 'e', 'E':
expStart = index + 1
break ForEachByte
}
}
if fracStart != 0 {
fractionalDigits = index - fracStart
}
// if too many fractional digits, perhaps the exponent will push the '.' to the right
if fractionalDigits > MaxFractionalDigits {
if expStart != 0 {
exp, err := strconv.ParseInt(string(bytes[expStart:]), 10, 32)
if err == nil {
fractionalDigits -= int(exp)
}
}
}
if fractionalDigits > MaxFractionalDigits {
return nil, errors.New("more than 5 fractional digits")
}

numeric, err := strconv.ParseFloat(string(bytes), 64)
if err != nil {
return nil, errors.New("not a float") // shouldn't happen, upstream parser should prvent
return nil, errors.New("not a float") // should never happen, json parser upstream
}
return qNumFromFloat(numeric)
return qNumFromFloat(numeric), nil
}

func qNumFromFloat(f float64) (qNumber, error) {
if f < -FiveBillion || f > FiveBillion {
return nil, errors.New("value must be between -5e9 and +5e9 inclusive")
}
value := uint64(TenE6 * (FiveBillion + f))
return toHexStringSkippingFirstByte(value), nil
// qNumFromFloat converts f to a qNumber by way of numbitsFromFloat64 and toQNumber.
// It is here mostly to support testing.
func qNumFromFloat(f float64) qNumber {
return numbitsFromFloat64(f).toQNumber()
}

func toHexStringSkippingFirstByte(value uint64) []byte {
var buf [8]byte
binary.BigEndian.PutUint64(buf[:], value)
var outputChars [14]byte
for i, utf8Byte := range buf {
if i == 0 {
continue
// for debugging
func (q qNumber) String() string {
ret := ""
for i, b := range q {
if i != 0 {
ret += "-"
}
pos := (i - 1) * 2
outputChars[pos] = Hexes[utf8Byte>>4]
outputChars[pos+1] = Hexes[buf[i]&0xf]
ret += fmt.Sprintf("%02x", b)
}
return outputChars[:]
return ret
}
Loading

1 comment on commit cd8d31a

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Performance Alert ⚠️

Possible performance regression was detected for benchmark 'Go Benchmark'.
Benchmark result of this commit is worse than the previous benchmark result exceeding threshold 1.20.

Benchmark suite Current: cd8d31a Previous: 78e2ec8 Ratio
BenchmarkCityLots 11027 ns/op 1778 B/op 103 allocs/op 5592 ns/op 773 B/op 31 allocs/op 1.97
BenchmarkCityLots - ns/op 11027 ns/op 5592 ns/op 1.97
BenchmarkCityLots - B/op 1778 B/op 773 B/op 2.30
BenchmarkCityLots - allocs/op 103 allocs/op 31 allocs/op 3.32
Benchmark_JsonFlattner_Evaluate_ContextFields 1008 ns/op 72 B/op 8 allocs/op 726.2 ns/op 56 B/op 4 allocs/op 1.39
Benchmark_JsonFlattner_Evaluate_ContextFields - ns/op 1008 ns/op 726.2 ns/op 1.39
Benchmark_JsonFlattner_Evaluate_ContextFields - B/op 72 B/op 56 B/op 1.29
Benchmark_JsonFlattner_Evaluate_ContextFields - allocs/op 8 allocs/op 4 allocs/op 2

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.