Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

If a JSON document contains strings that can't fit in a single chunk, use the naive Blob chunker instead of the smart JSON chunker. #8723

Merged
merged 15 commits into from
Jan 9, 2025
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions go/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ require (
github.com/google/btree v1.1.2
github.com/google/go-github/v57 v57.0.0
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510
github.com/hashicorp/go-uuid v1.0.1
github.com/hashicorp/golang-lru/v2 v2.0.2
github.com/jmoiron/sqlx v1.3.4
github.com/kch42/buzhash v0.0.0-20160816060738-9bdec3dec7c6
Expand Down
1 change: 1 addition & 0 deletions go/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,7 @@ github.com/hashicorp/go-sockaddr v1.0.0/go.mod h1:7Xibr9yA9JjQq1JpNB2Vw7kxv8xerX
github.com/hashicorp/go-syslog v1.0.0/go.mod h1:qPfqrKkXGihmCqbJM2mZgkZGvKG1dFdvsLplgctolz4=
github.com/hashicorp/go-uuid v0.0.0-20180228145832-27454136f036/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro=
github.com/hashicorp/go-uuid v1.0.0/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro=
github.com/hashicorp/go-uuid v1.0.1 h1:fv1ep09latC32wFoVwnqcnKJGnMSdBanPczbHAYm1BE=
github.com/hashicorp/go-uuid v1.0.1/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro=
github.com/hashicorp/go-version v1.2.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA=
github.com/hashicorp/go.net v0.0.1/go.mod h1:hjKkEWcCURg++eb33jQU7oqQcI9XDCnUzHA0oac0k90=
Expand Down
19 changes: 15 additions & 4 deletions go/store/prolly/tree/indexed_json_diff.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import (
)

type IndexedJsonDiffer struct {
differ Differ[jsonLocationKey, jsonLocationOrdering]
differ Differ[jsonLocationKey, *jsonLocationOrdering]
currentFromCursor, currentToCursor *JsonCursor
from, to IndexedJsonDocument
started bool
Expand All @@ -32,10 +32,14 @@ type IndexedJsonDiffer struct {
var _ IJsonDiffer = &IndexedJsonDiffer{}

func NewIndexedJsonDiffer(ctx context.Context, from, to IndexedJsonDocument) (*IndexedJsonDiffer, error) {
differ, err := DifferFromRoots[jsonLocationKey, jsonLocationOrdering](ctx, from.m.NodeStore, to.m.NodeStore, from.m.Root, to.m.Root, jsonLocationOrdering{}, false)
ordering := jsonLocationOrdering{}
differ, err := DifferFromRoots[jsonLocationKey, *jsonLocationOrdering](ctx, from.m.NodeStore, to.m.NodeStore, from.m.Root, to.m.Root, &ordering, false)
if err != nil {
return nil, err
}
if ordering.err != nil {
return nil, err
}
// We want to diff the prolly tree as if it was an address map pointing to the individual blob fragments, rather
// than diffing the blob fragments themselves. We can accomplish this by just replacing the cursors in the differ
// with their parents.
Expand Down Expand Up @@ -225,7 +229,11 @@ func (jd *IndexedJsonDiffer) Next(ctx context.Context) (diff JsonDiff, err error
// Neither cursor points to the start of a value.
// This should only be possible if they're at the same location.
// Do a sanity check, then continue.
if compareJsonLocations(fromCurrentLocation, toCurrentLocation) != 0 {
cmp, err := compareJsonLocations(fromCurrentLocation, toCurrentLocation)
if err != nil {
return JsonDiff{}, err
}
if cmp != 0 {
return JsonDiff{}, jsonParseError
}
err = advanceCursor(ctx, &jd.currentFromCursor)
Expand All @@ -240,7 +248,10 @@ func (jd *IndexedJsonDiffer) Next(ctx context.Context) (diff JsonDiff, err error
}

if fromScannerAtStartOfValue && toScannerAtStartOfValue {
cmp := compareJsonLocations(fromCurrentLocation, toCurrentLocation)
cmp, err := compareJsonLocations(fromCurrentLocation, toCurrentLocation)
if err != nil {
return JsonDiff{}, err
}
switch cmp {
case 0:
key := fromCurrentLocation.Clone().key
Expand Down
7 changes: 7 additions & 0 deletions go/store/prolly/tree/json_chunker.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,13 @@ func SerializeJsonToAddr(ctx context.Context, ns NodeStore, j sql.JSONWrapper) (
}
jsonChunker.appendJsonToBuffer(jsonBytes)
err = jsonChunker.processBuffer(ctx)
if largeJsonStringError.Is(err) {
// Due to current limits on chunk sizes, and an inability of older clients to read
// string keys and values split across multiple chunks, we can't use the JSON chunker for documents
// with extremely long strings.
node, _, err := serializeJsonToBlob(ctx, ns, j)
return node, err
}
if err != nil {
return Node{}, err
}
Expand Down
45 changes: 36 additions & 9 deletions go/store/prolly/tree/json_cursor.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,14 @@ func newJsonCursor(ctx context.Context, ns NodeStore, root Node, startKey jsonLo
}

func newJsonCursorAtStartOfChunk(ctx context.Context, ns NodeStore, root Node, startKey []byte) (jCur *JsonCursor, err error) {
cur, err := newCursorAtKey(ctx, ns, root, startKey, jsonLocationOrdering{})
ordering := jsonLocationOrdering{}
cur, err := newCursorAtKey(ctx, ns, root, startKey, &ordering)
if err != nil {
return nil, err
}
if ordering.err != nil {
return nil, err
}
return newJsonCursorFromCursor(ctx, cur)
}

Expand Down Expand Up @@ -125,7 +129,15 @@ func (j *JsonCursor) NextValue(ctx context.Context) (result []byte, err error) {
return
}

for compareJsonLocations(j.jsonScanner.currentPath, path) < 0 {
for {
var cmp int
cmp, err = compareJsonLocations(j.jsonScanner.currentPath, path)
if err != nil {
return
}
if cmp < 0 {
break
}
parseChunk()
if err != nil {
return
Expand All @@ -135,13 +147,14 @@ func (j *JsonCursor) NextValue(ctx context.Context) (result []byte, err error) {
return
}

func (j *JsonCursor) isKeyInChunk(path jsonLocation) bool {
func (j *JsonCursor) isKeyInChunk(path jsonLocation) (bool, error) {
if j.cur.parent == nil {
// This is the only chunk, so the path must refer to this chunk.
return true
return true, nil
}
nodeEndPosition := jsonPathFromKey(j.cur.parent.CurrentKey())
return compareJsonLocations(path, nodeEndPosition) <= 0
cmp, err := compareJsonLocations(path, nodeEndPosition)
return cmp <= 0, err
}

// AdvanceToLocation causes the cursor to advance to the specified position. This function returns a boolean indicating
Expand All @@ -151,12 +164,20 @@ func (j *JsonCursor) isKeyInChunk(path jsonLocation) bool {
// the cursor advances to the end of the previous value, prior to the object key. This allows the key to be removed along
// with the value.
func (j *JsonCursor) AdvanceToLocation(ctx context.Context, path jsonLocation, forRemoval bool) (found bool, err error) {
if !j.isKeyInChunk(path) {
isInChunk, err := j.isKeyInChunk(path)
if err != nil {
return false, err
}
if !isInChunk {
// Our destination is in another chunk, load it.
err := Seek(ctx, j.cur.parent, path.key, jsonLocationOrdering{})
ordering := jsonLocationOrdering{}
err := Seek(ctx, j.cur.parent, path.key, &ordering)
if err != nil {
return false, err
}
if ordering.err != nil {
return false, err
}
j.cur.nd, err = fetchChild(ctx, j.cur.nrw, j.cur.parent.currentRef())
if err != nil {
return false, err
Expand All @@ -169,7 +190,10 @@ func (j *JsonCursor) AdvanceToLocation(ctx context.Context, path jsonLocation, f
}

previousScanner := j.jsonScanner
cmp := compareJsonLocations(j.jsonScanner.currentPath, path)
cmp, err := compareJsonLocations(j.jsonScanner.currentPath, path)
if err != nil {
return false, err
}
for cmp < 0 {
previousScanner = j.jsonScanner.Clone()
err := j.jsonScanner.AdvanceToNextLocation()
Expand All @@ -180,7 +204,10 @@ func (j *JsonCursor) AdvanceToLocation(ctx context.Context, path jsonLocation, f
} else if err != nil {
return false, err
}
cmp = compareJsonLocations(j.jsonScanner.currentPath, path)
cmp, err = compareJsonLocations(j.jsonScanner.currentPath, path)
if err != nil {
return false, err
}
}
// If the supplied path doesn't exist in the document, then we want to stop the cursor at the start of the point
// were it would appear. This may mean that we've gone too far and need to rewind one location.
Expand Down
27 changes: 19 additions & 8 deletions go/store/prolly/tree/json_indexed_document.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ type jsonLocationKey = []byte

type address = []byte

type StaticJsonMap = StaticMap[jsonLocationKey, address, jsonLocationOrdering]
type StaticJsonMap = StaticMap[jsonLocationKey, address, *jsonLocationOrdering]

// IndexedJsonDocument is an implementation of sql.JSONWrapper that stores the document in a prolly tree.
// Every leaf node in the tree is a blob containing a substring of the original document. This allows the document
Expand All @@ -51,10 +51,10 @@ var _ fmt.Stringer = IndexedJsonDocument{}
var _ driver.Valuer = IndexedJsonDocument{}

func NewIndexedJsonDocument(ctx context.Context, root Node, ns NodeStore) IndexedJsonDocument {
m := StaticMap[jsonLocationKey, address, jsonLocationOrdering]{
m := StaticMap[jsonLocationKey, address, *jsonLocationOrdering]{
Root: root,
NodeStore: ns,
Order: jsonLocationOrdering{},
Order: &jsonLocationOrdering{},
}
return IndexedJsonDocument{
m: m,
Expand Down Expand Up @@ -114,8 +114,8 @@ func tryWithFallback(
tryFunc func() error,
fallbackFunc func(document types.JSONDocument) error) error {
err := tryFunc()
if err == unknownLocationKeyError || err == unsupportedPathError {
if err == unknownLocationKeyError {
if err == unknownLocationKeyError || err == unsupportedPathError || err == jsonParseError || largeJsonStringError.Is(err) {
if err != unsupportedPathError {
if sqlCtx, ok := ctx.(*sql.Context); ok {
sqlCtx.GetLogger().Warn(err)
}
Expand Down Expand Up @@ -282,7 +282,10 @@ func (i IndexedJsonDocument) insertIntoCursor(ctx context.Context, keyPath jsonL
// For example, attempting to insert into the path "$.a.b" in the document {"a": 1}
// We can detect this by checking to see if the insertion point in the original document comes before the inserted path.
// (For example, the insertion point occurs at $.a.START, which is before $.a.b)
cmp := compareJsonLocations(cursorPath, keyPath)
cmp, err := compareJsonLocations(cursorPath, keyPath)
if err != nil {
return IndexedJsonDocument{}, false, err
}
if cmp < 0 && cursorPath.getScannerState() == startOfValue {
// We just attempted to insert into a scalar.
return i, false, nil
Expand Down Expand Up @@ -444,7 +447,11 @@ func (i IndexedJsonDocument) setWithLocation(ctx context.Context, keyPath jsonLo
}

keyPath.pop()
found = compareJsonLocations(keyPath, jsonCursor.jsonScanner.currentPath) == 0
cmp, err := compareJsonLocations(keyPath, jsonCursor.jsonScanner.currentPath)
if err != nil {
return IndexedJsonDocument{}, false, err
}
found = cmp == 0
}

if found {
Expand Down Expand Up @@ -491,7 +498,11 @@ func (i IndexedJsonDocument) tryReplace(ctx context.Context, path string, val sq
}

keyPath.pop()
found = compareJsonLocations(keyPath, jsonCursor.jsonScanner.currentPath) == 0
cmp, err := compareJsonLocations(keyPath, jsonCursor.jsonScanner.currentPath)
if err != nil {
return nil, false, err
}
found = cmp == 0
}

if !found {
Expand Down
16 changes: 16 additions & 0 deletions go/store/prolly/tree/json_indexed_document_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -455,3 +455,19 @@ func TestJsonCompare(t *testing.T) {
})
})
}

// Test that we can write a JSON document with a multi-MB string value into storage and read it back.
func TestIndexedJsonDocument_CreateLargeStringValues(t *testing.T) {
ctx := sql.NewEmptyContext()
ns := NewTestNodeStore()
doc := make(map[string]interface{})
value := strings.Repeat("x", 2097152)
doc["key"] = value
indexedDoc := newIndexedJsonDocumentFromValue(t, ctx, ns, doc)

lookup, err := indexedDoc.Lookup(ctx, "$.key")
require.NoError(t, err)
extractedValue, _, err := types.LongText.Convert(lookup)
require.NoError(t, err)
require.Equal(t, value, extractedValue)
}
Loading
Loading