Skip to content

Commit

Permalink
In JSON chunker, allow chunking in the middle of large strings, and a…
Browse files Browse the repository at this point in the history
…dd a new location type representing the inside of a string.
  • Loading branch information
nicktobey committed Jan 8, 2025
1 parent c93d583 commit 7dd4cd0
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 11 deletions.
26 changes: 23 additions & 3 deletions go/store/prolly/tree/json_location.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ package tree

import (
"bytes"
"cmp"
"fmt"
"slices"
"strconv"
Expand Down Expand Up @@ -69,8 +68,25 @@ const (
objectInitialElement
arrayInitialElement
endOfValue
middleOfString
)

// compareJsonPathTypes orders two scanner states and returns -1, 0 or 1.
// startOfValue sorts before every other state and endOfValue sorts after every
// other state. Any other pairing, including two identical states, compares as
// equal, so the remaining states are not ordered relative to one another.
func compareJsonPathTypes(left, right jsonPathType) int {
	switch {
	case left == right:
		// Identical states never differ, whatever they are.
		return 0
	case left == startOfValue, right == endOfValue:
		// left is the earliest possible state, or right is the latest.
		return -1
	case left == endOfValue, right == startOfValue:
		// left is the latest possible state, or right is the earliest.
		return 1
	default:
		// Two distinct in-between states are treated as equivalent.
		return 0
	}
}

// isInitialElement reports whether t is one of the two "initial element"
// states: objectInitialElement or arrayInitialElement.
func (t jsonPathType) isInitialElement() bool {
	switch t {
	case objectInitialElement, arrayInitialElement:
		return true
	default:
		return false
	}
}
Expand Down Expand Up @@ -170,7 +186,7 @@ func isUnsupportedJsonArrayIndex(index []byte) bool {
}

func errorIfNotSupportedLocation(key []byte) error {
if jsonPathType(key[0]) > endOfValue {
if jsonPathType(key[0]) > middleOfString {
return unknownLocationKeyError
}
return nil
Expand Down Expand Up @@ -336,6 +352,10 @@ func (p *jsonLocation) getScannerState() jsonPathType {
return jsonPathType(p.key[0])
}

// IsMiddleOfString reports whether the location's scanner state is
// middleOfString, the state used for a location inside a string value.
func (p jsonLocation) IsMiddleOfString() bool {
	state := p.getScannerState()
	return state == middleOfString
}

type jsonPathElement struct {
key []byte
isArrayIndex bool
Expand Down Expand Up @@ -429,7 +449,7 @@ func compareJsonLocations(left, right jsonLocation) int {
return -1
}
// left and right have the exact same key elements
return cmp.Compare(left.getScannerState(), right.getScannerState())
return compareJsonPathTypes(left.getScannerState(), right.getScannerState())

}

Expand Down
43 changes: 35 additions & 8 deletions go/store/prolly/tree/json_scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,14 @@ func (s *JsonScanner) AdvanceToNextLocation() error {
} else {
return s.acceptNextKeyValue()
}
case middleOfString:
_, finishedString, err := s.acceptRestOfString()
if finishedString {
s.currentPath.setScannerState(endOfValue)
} else {
s.currentPath.setScannerState(middleOfString)
}
return err
default:
return jsonParseError
}
Expand All @@ -127,11 +135,16 @@ func (s *JsonScanner) acceptValue() error {
current := s.current()
switch current {
case '"':
_, err := s.acceptString()
_, finishedString, err := s.acceptString()
if err != nil {
return err
}
s.currentPath.setScannerState(endOfValue)
if finishedString {
s.currentPath.setScannerState(endOfValue)
} else {
s.currentPath.setScannerState(middleOfString)
}

return nil
case '[':
s.valueOffset++
Expand Down Expand Up @@ -177,22 +190,33 @@ func (s *JsonScanner) accept(b byte) error {
return nil
}

func (s *JsonScanner) acceptString() ([]byte, error) {
err := s.accept('"')
func (s *JsonScanner) acceptString() (stringBytes []byte, finishedString bool, err error) {
err = s.accept('"')
if err != nil {
return nil, err
return nil, false, err
}
return s.acceptRestOfString()
}

func (s *JsonScanner) acceptRestOfString() (stringBytes []byte, finishedString bool, err error) {
stringStart := s.valueOffset
for s.current() != '"' {
stringLength := 0
for s.current() != '"' && stringLength < 1000 {
switch s.current() {
case '\\':
s.valueOffset++
}
s.valueOffset++
stringLength++
}
result := s.jsonBuffer[stringStart:s.valueOffset]
if stringLength == 1000 {
// Split the segment here, so that the chunk doesn't get too large.
return result, false, nil
}
// Advance past the ending quotes
s.valueOffset++
return result, nil
return result, true, nil
}

func (s *JsonScanner) acceptKeyValue() error {
Expand Down Expand Up @@ -228,7 +252,10 @@ func (s *JsonScanner) acceptNextKeyValue() error {
}

func (s *JsonScanner) acceptObjectKey() error {
objectKey, err := s.acceptString()
objectKey, finishedString, err := s.acceptString()
if !finishedString {
// a very long key that might not fit? How to handle this?
}
if err != nil {
return err
}
Expand Down

0 comments on commit 7dd4cd0

Please sign in to comment.