From d1e7752ed6567ef17044b93ac37b3c1861083fd3 Mon Sep 17 00:00:00 2001 From: Nick Tobey Date: Tue, 7 Jan 2025 18:38:11 -0800 Subject: [PATCH 01/15] Add tests for inserting and reading large string values in JSON documents. --- .../prolly/tree/json_indexed_document_test.go | 16 ++++++++++++++++ .../test_files/backward_compatible_versions.txt | 1 + .../test_files/bats/compatibility.bats | 9 +++++++++ .../compatibility/test_files/setup_repo.sh | 12 ++++++++++++ 4 files changed, 38 insertions(+) diff --git a/go/store/prolly/tree/json_indexed_document_test.go b/go/store/prolly/tree/json_indexed_document_test.go index 7a7adc5637b..9676043049e 100644 --- a/go/store/prolly/tree/json_indexed_document_test.go +++ b/go/store/prolly/tree/json_indexed_document_test.go @@ -455,3 +455,19 @@ func TestJsonCompare(t *testing.T) { }) }) } + +// Test that we can write a JSON document with a multi-MB string value into storage and read it back. +func TestIndexedJsonDocument_CreateLargeStringValues(t *testing.T) { + ctx := sql.NewEmptyContext() + ns := NewTestNodeStore() + doc := make(map[string]interface{}) + value := strings.Repeat("x", 2097152) + doc["key"] = value + indexedDoc := newIndexedJsonDocumentFromValue(t, ctx, ns, doc) + + lookup, err := indexedDoc.Lookup(ctx, "$.key") + require.NoError(t, err) + extractedValue, _, err := types.LongText.Convert(lookup) + require.NoError(t, err) + require.Equal(t, value, extractedValue) +} diff --git a/integration-tests/compatibility/test_files/backward_compatible_versions.txt b/integration-tests/compatibility/test_files/backward_compatible_versions.txt index 560ad74c455..7a4096aadcd 100644 --- a/integration-tests/compatibility/test_files/backward_compatible_versions.txt +++ b/integration-tests/compatibility/test_files/backward_compatible_versions.txt @@ -1,3 +1,4 @@ +v1.40.0 v1.7.0 v1.6.0 v1.5.0 diff --git a/integration-tests/compatibility/test_files/bats/compatibility.bats b/integration-tests/compatibility/test_files/bats/compatibility.bats index 0d8aacc5f42..fd4db0795ba 100755 --- a/integration-tests/compatibility/test_files/bats/compatibility.bats +++ b/integration-tests/compatibility/test_files/bats/compatibility.bats @@ -206,3 +206,12 @@ EOF dolt sql -q 'drop table abc2' } + +@test "dolt large JSON document" { + run dolt -sql "select j->>'$.a' from js;" + [ "$status" -eq 0 ] + [[ "$output" =~ "one" ]] || false + run dolt -sql "select j->>'$.c' from js;" + [ "$status" -eq 0 ] + [[ "$output" =~ "three" ]] || false +} diff --git a/integration-tests/compatibility/test_files/setup_repo.sh b/integration-tests/compatibility/test_files/setup_repo.sh index 6e4772708a1..dbee95e8db2 100755 --- a/integration-tests/compatibility/test_files/setup_repo.sh +++ b/integration-tests/compatibility/test_files/setup_repo.sh @@ -38,6 +38,18 @@ CREATE TABLE big ( ); SQL dolt sql < "../../test_files/big_table.sql" # inserts 1K rows to `big` + +dolt sql < Date: Tue, 7 Jan 2025 18:39:10 -0800 Subject: [PATCH 02/15] In JSON chunker, allow chunking in the middle of large strings, and add a new location type representing the inside of a string. --- go/store/prolly/tree/json_location.go | 26 ++++++++++++++-- go/store/prolly/tree/json_scanner.go | 43 ++++++++++++++++++++++----- 2 files changed, 58 insertions(+), 11 deletions(-) diff --git a/go/store/prolly/tree/json_location.go b/go/store/prolly/tree/json_location.go index c532f07ac9b..aa217e4b42a 100644 --- a/go/store/prolly/tree/json_location.go +++ b/go/store/prolly/tree/json_location.go @@ -16,7 +16,6 @@ package tree import ( "bytes" - "cmp" "fmt" "slices" "strconv" @@ -69,8 +68,25 @@ const ( objectInitialElement arrayInitialElement endOfValue + middleOfString ) +func compareJsonPathTypes(left, right jsonPathType) int { + if left == startOfValue && right != startOfValue { + return -1 + } + if left == endOfValue && right != endOfValue { + return 1 + } + if right == startOfValue && left != startOfValue { + return 1 + } + if right == endOfValue && left != endOfValue { + return -1 + } + return 0 +} + func (t jsonPathType) isInitialElement() bool { return t == objectInitialElement || t == arrayInitialElement } @@ -170,7 +186,7 @@ func isUnsupportedJsonArrayIndex(index []byte) bool { } func errorIfNotSupportedLocation(key []byte) error { - if jsonPathType(key[0]) > endOfValue { + if jsonPathType(key[0]) > middleOfString { return unknownLocationKeyError } return nil @@ -336,6 +352,10 @@ func (p *jsonLocation) getScannerState() jsonPathType { return jsonPathType(p.key[0]) } +func (p jsonLocation) IsMiddleOfString() bool { + return p.getScannerState() == middleOfString +} + type jsonPathElement struct { key []byte isArrayIndex bool @@ -429,7 +449,7 @@ func compareJsonLocations(left, right jsonLocation) int { return -1 } // left and right have the exact same key elements - return cmp.Compare(left.getScannerState(), right.getScannerState()) + return compareJsonPathTypes(left.getScannerState(), right.getScannerState()) } diff --git a/go/store/prolly/tree/json_scanner.go b/go/store/prolly/tree/json_scanner.go index 9478d93a378..1a0e541b756 100644 --- a/go/store/prolly/tree/json_scanner.go +++ b/go/store/prolly/tree/json_scanner.go @@ -118,6 +118,14 @@ func (s *JsonScanner) AdvanceToNextLocation() error { } else { return s.acceptNextKeyValue() } + case middleOfString: + _, finishedString, err := s.acceptRestOfString() + if finishedString { + s.currentPath.setScannerState(endOfValue) + } else { + s.currentPath.setScannerState(middleOfString) + } + return err default: return jsonParseError } @@ -127,11 +135,16 @@ func (s *JsonScanner) acceptValue() error { current := s.current() switch current { case '"': - _, err := s.acceptString() + _, finishedString, err := s.acceptString() if err != nil { return err } - s.currentPath.setScannerState(endOfValue) + if finishedString { + s.currentPath.setScannerState(endOfValue) + } else { + s.currentPath.setScannerState(middleOfString) + } + return nil case '[': s.valueOffset++ @@ -177,22 +190,33 @@ func (s *JsonScanner) accept(b byte) error { return nil } -func (s *JsonScanner) acceptString() ([]byte, error) { - err := s.accept('"') +func (s *JsonScanner) acceptString() (stringBytes []byte, finishedString bool, err error) { + err = s.accept('"') if err != nil { - return nil, err + return nil, false, err } + return s.acceptRestOfString() +} + +func (s *JsonScanner) acceptRestOfString() (stringBytes []byte, finishedString bool, err error) { stringStart := s.valueOffset - for s.current() != '"' { + stringLength := 0 + for s.current() != '"' && stringLength < 1000 { switch s.current() { case '\\': s.valueOffset++ } s.valueOffset++ + stringLength++ } result := s.jsonBuffer[stringStart:s.valueOffset] + if stringLength == 1000 { + // Split the segment here, so that the chunk doesn't get too large. + return result, false, nil + } + // Advance past the ending quotes s.valueOffset++ - return result, nil + return result, true, nil } func (s *JsonScanner) acceptKeyValue() error { @@ -228,7 +252,10 @@ func (s *JsonScanner) acceptNextKeyValue() error { } func (s *JsonScanner) acceptObjectKey() error { - objectKey, err := s.acceptString() + objectKey, finishedString, err := s.acceptString() + if !finishedString { + // a very long key that might not fit? How to handle this? + } if err != nil { return err } From 83d858955d7a10a45456b2df43a12ebd88610b99 Mon Sep 17 00:00:00 2001 From: Nick Tobey Date: Wed, 8 Jan 2025 12:07:37 -0800 Subject: [PATCH 03/15] Detect when a json key is too large to chunk and don't use smart JSON chunking. --- go/store/prolly/tree/json_indexed_document.go | 2 +- go/store/prolly/tree/json_location.go | 6 +- go/store/prolly/tree/json_scanner.go | 63 +++++++++++++------ 3 files changed, 48 insertions(+), 23 deletions(-) diff --git a/go/store/prolly/tree/json_indexed_document.go b/go/store/prolly/tree/json_indexed_document.go index b0ac82f43b1..31970cc3a26 100644 --- a/go/store/prolly/tree/json_indexed_document.go +++ b/go/store/prolly/tree/json_indexed_document.go @@ -114,7 +114,7 @@ func tryWithFallback( tryFunc func() error, fallbackFunc func(document types.JSONDocument) error) error { err := tryFunc() - if err == unknownLocationKeyError || err == unsupportedPathError { + if err == unknownLocationKeyError || err == unsupportedPathError || largeJsonKeyError.Is(err) { if err == unknownLocationKeyError { if sqlCtx, ok := ctx.(*sql.Context); ok { sqlCtx.GetLogger().Warn(err) diff --git a/go/store/prolly/tree/json_location.go b/go/store/prolly/tree/json_location.go index aa217e4b42a..30240ed8b27 100644 --- a/go/store/prolly/tree/json_location.go +++ b/go/store/prolly/tree/json_location.go @@ -68,7 +68,7 @@ const ( objectInitialElement arrayInitialElement endOfValue - middleOfString + middleOfStringValue ) func compareJsonPathTypes(left, right jsonPathType) int { @@ -186,7 +186,7 @@ func isUnsupportedJsonArrayIndex(index []byte) bool { } func errorIfNotSupportedLocation(key []byte) error { - if jsonPathType(key[0]) > middleOfString { + if jsonPathType(key[0]) > middleOfStringValue { return unknownLocationKeyError } return nil @@ -353,7 +353,7 @@ func (p *jsonLocation) getScannerState() jsonPathType { } func (p jsonLocation) IsMiddleOfString() bool { - return p.getScannerState() == middleOfString + return p.getScannerState() == middleOfStringValue } type jsonPathElement struct { diff --git a/go/store/prolly/tree/json_scanner.go b/go/store/prolly/tree/json_scanner.go index 1a0e541b756..3c1347da63b 100644 --- a/go/store/prolly/tree/json_scanner.go +++ b/go/store/prolly/tree/json_scanner.go @@ -16,6 +16,7 @@ package tree import ( "fmt" + errorkinds "gopkg.in/src-d/go-errors.v1" "io" ) @@ -38,7 +39,14 @@ type JsonScanner struct { valueOffset int } +// The JSONChunker can't draw a chunk boundary within an object key. +// This can lead to large chunks that may cause problems. +// Since boundaries are always drawn once a chunk exceeds maxChunkSize (16KB), +// this is the largest length that can be appended to a chunk without exceeding 1MB. +var maxJsonKeyLength = 1024*1024 - maxChunkSize + var jsonParseError = fmt.Errorf("encountered invalid JSON while reading JSON from the database, or while preparing to write JSON to the database. This is most likely a bug in JSON diffing") +var largeJsonKeyError = errorkinds.NewKind("encountered JSON key with length %s, larger than max allowed length %s") func (j JsonScanner) Clone() JsonScanner { return JsonScanner{ @@ -118,12 +126,12 @@ func (s *JsonScanner) AdvanceToNextLocation() error { } else { return s.acceptNextKeyValue() } - case middleOfString: - _, finishedString, err := s.acceptRestOfString() + case middleOfStringValue: + finishedString, err := s.acceptRestOfValueString() if finishedString { s.currentPath.setScannerState(endOfValue) } else { - s.currentPath.setScannerState(middleOfString) + s.currentPath.setScannerState(middleOfStringValue) } return err default: @@ -135,14 +143,14 @@ func (s *JsonScanner) acceptValue() error { current := s.current() switch current { case '"': - _, finishedString, err := s.acceptString() + finishedString, err := s.acceptValueString() if err != nil { return err } if finishedString { s.currentPath.setScannerState(endOfValue) } else { - s.currentPath.setScannerState(middleOfString) + s.currentPath.setScannerState(middleOfStringValue) } return nil @@ -190,18 +198,39 @@ func (s *JsonScanner) accept(b byte) error { return nil } -func (s *JsonScanner) acceptString() (stringBytes []byte, finishedString bool, err error) { +func (s *JsonScanner) acceptKeyString() (stringBytes []byte, err error) { + err = s.accept('"') + if err != nil { + return nil, err + } + stringStart := s.valueOffset + for s.current() != '"' { + switch s.current() { + case '\\': + s.valueOffset++ + } + s.valueOffset++ + } + if s.valueOffset-stringStart > maxJsonKeyLength { + return nil, largeJsonKeyError.New(s.valueOffset-stringStart, maxJsonKeyLength) + } + result := s.jsonBuffer[stringStart:s.valueOffset] + // Advance past the ending quotes + s.valueOffset++ + return result, nil +} + +func (s *JsonScanner) acceptValueString() (finishedString bool, err error) { err = s.accept('"') if err != nil { - return nil, false, err + return false, err } - return s.acceptRestOfString() + return s.acceptRestOfValueString() } -func (s *JsonScanner) acceptRestOfString() (stringBytes []byte, finishedString bool, err error) { - stringStart := s.valueOffset +func (s *JsonScanner) acceptRestOfValueString() (finishedString bool, err error) { stringLength := 0 - for s.current() != '"' && stringLength < 1000 { + for s.current() != '"' && stringLength < maxJsonKeyLength { switch s.current() { case '\\': s.valueOffset++ @@ -209,14 +238,13 @@ func (s *JsonScanner) acceptRestOfString() (stringBytes []byte, finishedString b s.valueOffset++ stringLength++ } - result := s.jsonBuffer[stringStart:s.valueOffset] - if stringLength == 1000 { + if stringLength == maxJsonKeyLength { // Split the segment here, so that the chunk doesn't get too large. - return result, false, nil + return false, nil } // Advance past the ending quotes s.valueOffset++ - return result, true, nil + return true, nil } func (s *JsonScanner) acceptKeyValue() error { @@ -252,10 +280,7 @@ func (s *JsonScanner) acceptNextKeyValue() error { } func (s *JsonScanner) acceptObjectKey() error { - objectKey, finishedString, err := s.acceptString() - if !finishedString { - // a very long key that might not fit? How to handle this? - } + objectKey, err := s.acceptKeyString() if err != nil { return err } From 04af9a689b056fbae2f6ae2ae2550f151ee19ae6 Mon Sep 17 00:00:00 2001 From: Nick Tobey Date: Wed, 8 Jan 2025 12:45:36 -0800 Subject: [PATCH 04/15] Update maxJsonKeyLength to keep chunks below 48KB --- go/store/prolly/tree/json_scanner.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/go/store/prolly/tree/json_scanner.go b/go/store/prolly/tree/json_scanner.go index 3c1347da63b..581a060fb7d 100644 --- a/go/store/prolly/tree/json_scanner.go +++ b/go/store/prolly/tree/json_scanner.go @@ -41,9 +41,10 @@ type JsonScanner struct { // The JSONChunker can't draw a chunk boundary within an object key. // This can lead to large chunks that may cause problems. +// We've observed chunks getting written incorrectly if they exceed 48KB. // Since boundaries are always drawn once a chunk exceeds maxChunkSize (16KB), -// this is the largest length that can be appended to a chunk without exceeding 1MB. -var maxJsonKeyLength = 1024*1024 - maxChunkSize +// this is the largest length that can be appended to a chunk without exceeding 48KB. +var maxJsonKeyLength = 48*1024 - maxChunkSize var jsonParseError = fmt.Errorf("encountered invalid JSON while reading JSON from the database, or while preparing to write JSON to the database. This is most likely a bug in JSON diffing") var largeJsonKeyError = errorkinds.NewKind("encountered JSON key with length %s, larger than max allowed length %s") From ad271f825256ada5bfdfd4b170b69b136078899c Mon Sep 17 00:00:00 2001 From: Nick Tobey Date: Wed, 8 Jan 2025 16:57:34 -0800 Subject: [PATCH 05/15] Detect and emit unknownLocationKeyError to more places. --- go/store/prolly/tree/indexed_json_diff.go | 19 ++++++-- go/store/prolly/tree/json_cursor.go | 45 +++++++++++++---- go/store/prolly/tree/json_indexed_document.go | 23 ++++++--- go/store/prolly/tree/json_location.go | 48 +++++++++++-------- 4 files changed, 97 insertions(+), 38 deletions(-) diff --git a/go/store/prolly/tree/indexed_json_diff.go b/go/store/prolly/tree/indexed_json_diff.go index 6773b3f679a..e23ead39ff7 100644 --- a/go/store/prolly/tree/indexed_json_diff.go +++ b/go/store/prolly/tree/indexed_json_diff.go @@ -23,7 +23,7 @@ import ( ) type IndexedJsonDiffer struct { - differ Differ[jsonLocationKey, jsonLocationOrdering] + differ Differ[jsonLocationKey, *jsonLocationOrdering] currentFromCursor, currentToCursor *JsonCursor from, to IndexedJsonDocument started bool @@ -32,10 +32,14 @@ type IndexedJsonDiffer struct { var _ IJsonDiffer = &IndexedJsonDiffer{} func NewIndexedJsonDiffer(ctx context.Context, from, to IndexedJsonDocument) (*IndexedJsonDiffer, error) { - differ, err := DifferFromRoots[jsonLocationKey, jsonLocationOrdering](ctx, from.m.NodeStore, to.m.NodeStore, from.m.Root, to.m.Root, jsonLocationOrdering{}, false) + ordering := jsonLocationOrdering{} + differ, err := DifferFromRoots[jsonLocationKey, *jsonLocationOrdering](ctx, from.m.NodeStore, to.m.NodeStore, from.m.Root, to.m.Root, &ordering, false) if err != nil { return nil, err } + if ordering.err != nil { + return nil, err + } // We want to diff the prolly tree as if it was an address map pointing to the individual blob fragments, rather // than diffing the blob fragments themselves. We can accomplish this by just replacing the cursors in the differ // with their parents. @@ -225,7 +229,11 @@ func (jd *IndexedJsonDiffer) Next(ctx context.Context) (diff JsonDiff, err error // Neither cursor points to the start of a value. // This should only be possible if they're at the same location. // Do a sanity check, then continue. - if compareJsonLocations(fromCurrentLocation, toCurrentLocation) != 0 { + cmp, err := compareJsonLocations(fromCurrentLocation, toCurrentLocation) + if err != nil { + return JsonDiff{}, err + } + if cmp != 0 { return JsonDiff{}, jsonParseError } err = advanceCursor(ctx, &jd.currentFromCursor) @@ -240,7 +248,10 @@ func (jd *IndexedJsonDiffer) Next(ctx context.Context) (diff JsonDiff, err error } if fromScannerAtStartOfValue && toScannerAtStartOfValue { - cmp := compareJsonLocations(fromCurrentLocation, toCurrentLocation) + cmp, err := compareJsonLocations(fromCurrentLocation, toCurrentLocation) + if err != nil { + return JsonDiff{}, err + } switch cmp { case 0: key := fromCurrentLocation.Clone().key diff --git a/go/store/prolly/tree/json_cursor.go b/go/store/prolly/tree/json_cursor.go index 61ef2995aec..49414b4ff8e 100644 --- a/go/store/prolly/tree/json_cursor.go +++ b/go/store/prolly/tree/json_cursor.go @@ -65,10 +65,14 @@ func newJsonCursor(ctx context.Context, ns NodeStore, root Node, startKey jsonLo } func newJsonCursorAtStartOfChunk(ctx context.Context, ns NodeStore, root Node, startKey []byte) (jCur *JsonCursor, err error) { - cur, err := newCursorAtKey(ctx, ns, root, startKey, jsonLocationOrdering{}) + ordering := jsonLocationOrdering{} + cur, err := newCursorAtKey(ctx, ns, root, startKey, &ordering) if err != nil { return nil, err } + if ordering.err != nil { + return nil, err + } return newJsonCursorFromCursor(ctx, cur) } @@ -125,7 +129,15 @@ func (j *JsonCursor) NextValue(ctx context.Context) (result []byte, err error) { return } - for compareJsonLocations(j.jsonScanner.currentPath, path) < 0 { + for { + var cmp int + cmp, err = compareJsonLocations(j.jsonScanner.currentPath, path) + if err != nil { + return + } + if cmp < 0 { + break + } parseChunk() if err != nil { return @@ -135,13 +147,14 @@ func (j *JsonCursor) NextValue(ctx context.Context) (result []byte, err error) { return } -func (j *JsonCursor) isKeyInChunk(path jsonLocation) bool { +func (j *JsonCursor) isKeyInChunk(path jsonLocation) (bool, error) { if j.cur.parent == nil { // This is the only chunk, so the path must refer to this chunk. - return true + return true, nil } nodeEndPosition := jsonPathFromKey(j.cur.parent.CurrentKey()) - return compareJsonLocations(path, nodeEndPosition) <= 0 + cmp, err := compareJsonLocations(path, nodeEndPosition) + return cmp <= 0, err } // AdvanceToLocation causes the cursor to advance to the specified position. This function returns a boolean indicating @@ -151,12 +164,20 @@ func (j *JsonCursor) isKeyInChunk(path jsonLocation) bool { // the cursor advances to the end of the previous value, prior to the object key. This allows the key to be removed along // with the value. func (j *JsonCursor) AdvanceToLocation(ctx context.Context, path jsonLocation, forRemoval bool) (found bool, err error) { - if !j.isKeyInChunk(path) { + isInChunk, err := j.isKeyInChunk(path) + if err != nil { + return false, err + } + if !isInChunk { // Our destination is in another chunk, load it. - err := Seek(ctx, j.cur.parent, path.key, jsonLocationOrdering{}) + ordering := jsonLocationOrdering{} + err := Seek(ctx, j.cur.parent, path.key, &ordering) if err != nil { return false, err } + if ordering.err != nil { + return false, err + } j.cur.nd, err = fetchChild(ctx, j.cur.nrw, j.cur.parent.currentRef()) if err != nil { return false, err @@ -169,7 +190,10 @@ func (j *JsonCursor) AdvanceToLocation(ctx context.Context, path jsonLocation, f } previousScanner := j.jsonScanner - cmp := compareJsonLocations(j.jsonScanner.currentPath, path) + cmp, err := compareJsonLocations(j.jsonScanner.currentPath, path) + if err != nil { + return false, err + } for cmp < 0 { previousScanner = j.jsonScanner.Clone() err := j.jsonScanner.AdvanceToNextLocation() @@ -180,7 +204,10 @@ func (j *JsonCursor) AdvanceToLocation(ctx context.Context, path jsonLocation, f } else if err != nil { return false, err } - cmp = compareJsonLocations(j.jsonScanner.currentPath, path) + cmp, err = compareJsonLocations(j.jsonScanner.currentPath, path) + if err != nil { + return false, err + } } // If the supplied path doesn't exist in the document, then we want to stop the cursor at the start of the point // were it would appear. This may mean that we've gone too far and need to rewind one location. diff --git a/go/store/prolly/tree/json_indexed_document.go b/go/store/prolly/tree/json_indexed_document.go index 31970cc3a26..b76e27db782 100644 --- a/go/store/prolly/tree/json_indexed_document.go +++ b/go/store/prolly/tree/json_indexed_document.go @@ -31,7 +31,7 @@ type jsonLocationKey = []byte type address = []byte -type StaticJsonMap = StaticMap[jsonLocationKey, address, jsonLocationOrdering] +type StaticJsonMap = StaticMap[jsonLocationKey, address, *jsonLocationOrdering] // IndexedJsonDocument is an implementation of sql.JSONWrapper that stores the document in a prolly tree. // Every leaf node in the tree is a blob containing a substring of the original document. This allows the document @@ -51,10 +51,10 @@ var _ fmt.Stringer = IndexedJsonDocument{} var _ driver.Valuer = IndexedJsonDocument{} func NewIndexedJsonDocument(ctx context.Context, root Node, ns NodeStore) IndexedJsonDocument { - m := StaticMap[jsonLocationKey, address, jsonLocationOrdering]{ + m := StaticMap[jsonLocationKey, address, *jsonLocationOrdering]{ Root: root, NodeStore: ns, - Order: jsonLocationOrdering{}, + Order: &jsonLocationOrdering{}, } return IndexedJsonDocument{ m: m, @@ -282,7 +282,10 @@ func (i IndexedJsonDocument) insertIntoCursor(ctx context.Context, keyPath jsonL // For example, attempting to insert into the path "$.a.b" in the document {"a": 1} // We can detect this by checking to see if the insertion point in the original document comes before the inserted path. // (For example, the insertion point occurs at $.a.START, which is before $.a.b) - cmp := compareJsonLocations(cursorPath, keyPath) + cmp, err := compareJsonLocations(cursorPath, keyPath) + if err != nil { + return IndexedJsonDocument{}, false, err + } if cmp < 0 && cursorPath.getScannerState() == startOfValue { // We just attempted to insert into a scalar. return i, false, nil @@ -444,7 +447,11 @@ func (i IndexedJsonDocument) setWithLocation(ctx context.Context, keyPath jsonLo } keyPath.pop() - found = compareJsonLocations(keyPath, jsonCursor.jsonScanner.currentPath) == 0 + cmp, err := compareJsonLocations(keyPath, jsonCursor.jsonScanner.currentPath) + if err != nil { + return IndexedJsonDocument{}, false, err + } + found = cmp == 0 } if found { @@ -491,7 +498,11 @@ func (i IndexedJsonDocument) tryReplace(ctx context.Context, path string, val sq } keyPath.pop() - found = compareJsonLocations(keyPath, jsonCursor.jsonScanner.currentPath) == 0 + cmp, err := compareJsonLocations(keyPath, jsonCursor.jsonScanner.currentPath) + if err != nil { + return nil, false, err + } + found = cmp == 0 } if !found { diff --git a/go/store/prolly/tree/json_location.go b/go/store/prolly/tree/json_location.go index 30240ed8b27..67577a1952e 100644 --- a/go/store/prolly/tree/json_location.go +++ b/go/store/prolly/tree/json_location.go @@ -71,20 +71,24 @@ const ( middleOfStringValue ) -func compareJsonPathTypes(left, right jsonPathType) int { +func compareJsonPathTypes(left, right jsonPathType) (int, error) { + if left >= jsonPathTypeNumElements || right >= jsonPathTypeNumElements { + // These paths were written by a future version of Dolt. We can't assume anything about them. + return 0, unknownLocationKeyError + } if left == startOfValue && right != startOfValue { - return -1 + return -1, nil } if left == endOfValue && right != endOfValue { - return 1 + return 1, nil } if right == startOfValue && left != startOfValue { - return 1 + return 1, nil } if right == endOfValue && left != endOfValue { - return -1 + return -1, nil } - return 0 + return 0, nil } func (t jsonPathType) isInitialElement() bool { @@ -398,17 +402,17 @@ func (p *jsonLocation) Clone() jsonLocation { // compareJsonLocations creates an ordering on locations by determining which one would come first in a normalized JSON // document where all keys are sorted lexographically. -func compareJsonLocations(left, right jsonLocation) int { +func compareJsonLocations(left, right jsonLocation) (int, error) { minLength := min(left.size(), right.size()) for i := 0; i < minLength; i++ { leftPathElement := left.getPathElement(i) rightPathElement := right.getPathElement(i) c := bytes.Compare(leftPathElement.key, rightPathElement.key) if c < 0 { - return -1 + return -1, nil } if c > 0 { - return 1 + return 1, nil } } if left.size() < right.size() { @@ -420,14 +424,14 @@ func compareJsonLocations(left, right jsonLocation) int { if left.getScannerState() == objectInitialElement { rightIsArray := right.getLastPathElement().isArrayIndex if rightIsArray { - return 1 + return 1, nil } } } if left.getScannerState() != endOfValue { - return -1 + return -1, nil } - return 1 + return 1, nil } if left.size() > right.size() { // right is a parent of left @@ -438,26 +442,28 @@ func compareJsonLocations(left, right jsonLocation) int { if right.getScannerState() == objectInitialElement { leftIsArray := left.getLastPathElement().isArrayIndex if leftIsArray { - return -1 + return -1, nil } } } if right.getScannerState() != endOfValue { - return 1 + return 1, nil } - return -1 + return -1, nil } // left and right have the exact same key elements return compareJsonPathTypes(left.getScannerState(), right.getScannerState()) } -type jsonLocationOrdering struct{} +type jsonLocationOrdering struct { + err error // store errors created during comparisons +} -var _ Ordering[[]byte] = jsonLocationOrdering{} +var _ Ordering[[]byte] = &jsonLocationOrdering{} -func (jsonLocationOrdering) Compare(left, right []byte) int { +func (o *jsonLocationOrdering) Compare(left, right []byte) int { // A JSON document that fits entirely in a single chunk has no keys, if len(left) == 0 && len(right) == 0 { return 0 @@ -468,5 +474,9 @@ func (jsonLocationOrdering) Compare(left, right []byte) int { } leftPath := jsonPathFromKey(left) rightPath := jsonPathFromKey(right) - return compareJsonLocations(leftPath, rightPath) + cmp, err := compareJsonLocations(leftPath, rightPath) + if err != nil { + o.err = err + } + return cmp } From 3267faafd2baeadae4854cb62b0e16d98bde0ddb Mon Sep 17 00:00:00 2001 From: Nick Tobey Date: Wed, 8 Jan 2025 16:58:48 -0800 Subject: [PATCH 06/15] Disable chunk boundaries inside JSON string values, and emit largeJsonStringError for both keys and values in JSON. --- go/store/prolly/tree/json_chunker.go | 7 +++++++ go/store/prolly/tree/json_scanner.go | 21 +++++++++++---------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/go/store/prolly/tree/json_chunker.go b/go/store/prolly/tree/json_chunker.go index d825a906c64..104be871fd9 100644 --- a/go/store/prolly/tree/json_chunker.go +++ b/go/store/prolly/tree/json_chunker.go @@ -56,6 +56,13 @@ func SerializeJsonToAddr(ctx context.Context, ns NodeStore, j sql.JSONWrapper) ( } jsonChunker.appendJsonToBuffer(jsonBytes) err = jsonChunker.processBuffer(ctx) + if largeJsonStringError.Is(err) { + // Due to current limits on chunk sizes, and an inability of older clients to read + // string keys and values split across multiple chunks, we can't use the JSON chunker for documents + // with extremely long strings. + node, _, err := serializeJsonToBlob(ctx, ns, j) + return node, err + } if err != nil { return Node{}, err } diff --git a/go/store/prolly/tree/json_scanner.go b/go/store/prolly/tree/json_scanner.go index 581a060fb7d..1480f24f0c8 100644 --- a/go/store/prolly/tree/json_scanner.go +++ b/go/store/prolly/tree/json_scanner.go @@ -44,10 +44,10 @@ type JsonScanner struct { // We've observed chunks getting written incorrectly if they exceed 48KB. // Since boundaries are always drawn once a chunk exceeds maxChunkSize (16KB), // this is the largest length that can be appended to a chunk without exceeding 48KB. -var maxJsonKeyLength = 48*1024 - maxChunkSize +var maxJsonStringLength = 48*1024 - maxChunkSize var jsonParseError = fmt.Errorf("encountered invalid JSON while reading JSON from the database, or while preparing to write JSON to the database. This is most likely a bug in JSON diffing") -var largeJsonKeyError = errorkinds.NewKind("encountered JSON key with length %s, larger than max allowed length %s") +var largeJsonStringError = errorkinds.NewKind("encountered JSON key with length %s, larger than max allowed length %s") func (j JsonScanner) Clone() JsonScanner { return JsonScanner{ @@ -212,8 +212,8 @@ func (s *JsonScanner) acceptKeyString() (stringBytes []byte, err error) { } s.valueOffset++ } - if s.valueOffset-stringStart > maxJsonKeyLength { - return nil, largeJsonKeyError.New(s.valueOffset-stringStart, maxJsonKeyLength) + if s.valueOffset-stringStart > maxJsonStringLength { + return nil, largeJsonStringError.New(s.valueOffset-stringStart, maxJsonStringLength) } result := s.jsonBuffer[stringStart:s.valueOffset] // Advance past the ending quotes @@ -230,18 +230,19 @@ func (s *JsonScanner) acceptValueString() (finishedString bool, err error) { } func (s *JsonScanner) acceptRestOfValueString() (finishedString bool, err error) { - stringLength := 0 - for s.current() != '"' && stringLength < maxJsonKeyLength { + stringStart := s.valueOffset + for s.current() != '"' { switch s.current() { case '\\': s.valueOffset++ } s.valueOffset++ - stringLength++ } - if stringLength == maxJsonKeyLength { - // Split the segment here, so that the chunk doesn't get too large. - return false, nil + // We don't currently split value strings across chunks because it causes issues being read by older clients. + // Instead, by returning largeJsonStringError, we trigger the fallback behavior where the JSON document + // gets treated as a non-indexed blob. + if s.valueOffset-stringStart > maxJsonStringLength { + return false, largeJsonStringError.New(s.valueOffset-stringStart, maxJsonStringLength) } // Advance past the ending quotes s.valueOffset++ From 64878f908f673b2599acedf0e0874c17a81750b6 Mon Sep 17 00:00:00 2001 From: Nick Tobey Date: Wed, 8 Jan 2025 16:59:44 -0800 Subject: [PATCH 07/15] When encountering largeJsonStringError, fall back on inserting JSON document as a non-indexed BLOB --- go/store/prolly/tree/json_indexed_document.go | 2 +- go/store/prolly/tree/json_location.go | 3 +- go/store/prolly/tree/prolly_fields.go | 33 +++++++++++-------- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/go/store/prolly/tree/json_indexed_document.go b/go/store/prolly/tree/json_indexed_document.go index b76e27db782..414ba30450c 100644 --- a/go/store/prolly/tree/json_indexed_document.go +++ b/go/store/prolly/tree/json_indexed_document.go @@ -114,7 +114,7 @@ func tryWithFallback( tryFunc func() error, fallbackFunc func(document types.JSONDocument) error) error { err := tryFunc() - if err == unknownLocationKeyError || err == unsupportedPathError || largeJsonKeyError.Is(err) { + if err == unknownLocationKeyError || err == unsupportedPathError || largeJsonStringError.Is(err) { if err == unknownLocationKeyError { if sqlCtx, ok := ctx.(*sql.Context); ok { sqlCtx.GetLogger().Warn(err) diff --git a/go/store/prolly/tree/json_location.go b/go/store/prolly/tree/json_location.go index 67577a1952e..2d270efad2c 100644 --- a/go/store/prolly/tree/json_location.go +++ b/go/store/prolly/tree/json_location.go @@ -69,6 +69,7 @@ const ( arrayInitialElement endOfValue middleOfStringValue + jsonPathTypeNumElements ) func compareJsonPathTypes(left, right jsonPathType) (int, error) { @@ -190,7 +191,7 @@ func isUnsupportedJsonArrayIndex(index []byte) bool { } func errorIfNotSupportedLocation(key []byte) error { - if jsonPathType(key[0]) > middleOfStringValue { + if jsonPathType(key[0]) > jsonPathTypeNumElements { return unknownLocationKeyError } return nil diff --git a/go/store/prolly/tree/prolly_fields.go b/go/store/prolly/tree/prolly_fields.go index dfc7c1c516f..9ebeddc896c 100644 --- a/go/store/prolly/tree/prolly_fields.go +++ b/go/store/prolly/tree/prolly_fields.go @@ -236,14 +236,14 @@ func PutField(ctx context.Context, ns NodeStore, tb *val.TupleBuilder, i int, v // TODO: eventually remove GeometryEnc, but in the meantime write them as GeomAddrEnc case val.GeometryEnc: geo := serializeGeometry(v) - h, err := SerializeBytesToAddr(ctx, ns, bytes.NewReader(geo), len(geo)) + _, h, err := SerializeBytesToAddr(ctx, ns, bytes.NewReader(geo), len(geo)) if err != nil { return err } tb.PutGeometryAddr(i, h) case val.GeomAddrEnc: geo := serializeGeometry(v) - h, err := SerializeBytesToAddr(ctx, ns, bytes.NewReader(geo), len(geo)) + _, h, err := SerializeBytesToAddr(ctx, ns, bytes.NewReader(geo), len(geo)) if err != nil { return err } @@ -255,14 +255,14 @@ func PutField(ctx context.Context, ns NodeStore, tb *val.TupleBuilder, i int, v } tb.PutJSONAddr(i, h) case val.BytesAddrEnc: - h, err := SerializeBytesToAddr(ctx, ns, bytes.NewReader(v.([]byte)), len(v.([]byte))) + _, h, err := SerializeBytesToAddr(ctx, ns, bytes.NewReader(v.([]byte)), len(v.([]byte))) if err != nil { return err } tb.PutBytesAddr(i, h) case val.StringAddrEnc: //todo: v will be []byte after daylon's changes - h, err := SerializeBytesToAddr(ctx, ns, bytes.NewReader([]byte(v.(string))), len(v.(string))) + _, h, err := SerializeBytesToAddr(ctx, ns, bytes.NewReader([]byte(v.(string))), len(v.(string))) if err != nil { return err } @@ -292,7 +292,7 @@ func PutField(ctx context.Context, ns NodeStore, tb *val.TupleBuilder, i int, v if err != nil { return err } - h, err := SerializeBytesToAddr(ctx, ns, bytes.NewReader(b), len(b)) + _, h, err := SerializeBytesToAddr(ctx, ns, bytes.NewReader(b), len(b)) if err != nil { return err } @@ -315,11 +315,8 @@ func getJSONAddrHash(ctx context.Context, ns NodeStore, v interface{}) (hash.Has return hash.Hash{}, err } if optimizeJson == int8(0) { - buf, err := types.MarshallJson(j) - if err != nil { - return hash.Hash{}, err - } - return SerializeBytesToAddr(ctx, ns, bytes.NewReader(buf), len(buf)) + _, h, err := serializeJsonToBlob(ctx, ns, j) + return h, err } } root, err := SerializeJsonToAddr(ctx, ns, j) @@ -329,6 +326,14 @@ func getJSONAddrHash(ctx context.Context, ns NodeStore, v interface{}) (hash.Has return root.HashOf(), nil } +func serializeJsonToBlob(ctx context.Context, ns NodeStore, j sql.JSONWrapper) (Node, hash.Hash, error) { + buf, err := types.MarshallJson(j) + if err != nil { + return Node{}, hash.Hash{}, err + } + return SerializeBytesToAddr(ctx, ns, bytes.NewReader(buf), len(buf)) +} + func convInt(v interface{}) int { switch i := v.(type) { case int: @@ -417,15 +422,15 @@ func serializeGeometry(v interface{}) []byte { } } -func SerializeBytesToAddr(ctx context.Context, ns NodeStore, r io.Reader, dataSize int) (hash.Hash, error) { +func SerializeBytesToAddr(ctx context.Context, ns NodeStore, r io.Reader, dataSize int) (Node, hash.Hash, error) { bb := ns.BlobBuilder() defer ns.PutBlobBuilder(bb) bb.Init(dataSize) - _, addr, err := bb.Chunk(ctx, r) + node, addr, err := bb.Chunk(ctx, r) if err != nil { - return hash.Hash{}, err + return Node{}, hash.Hash{}, err } - return addr, nil + return node, addr, nil } func convJson(v interface{}) (res sql.JSONWrapper, err error) { From 82916cdbe635720c904bc2762421c7e000d8d484 Mon Sep 17 00:00:00 2001 From: Nick Tobey Date: Wed, 8 Jan 2025 17:09:11 -0800 Subject: [PATCH 08/15] Remove backwards compatibility tests. If we're emitting blobs, we know that older versions can read them. --- integration-tests/bats/json.bats | 22 ++++++++++++++++++- .../backward_compatible_versions.txt | 1 - .../test_files/bats/compatibility.bats | 9 -------- .../compatibility/test_files/setup_repo.sh | 11 ---------- 4 files changed, 21 insertions(+), 22 deletions(-) diff --git a/integration-tests/bats/json.bats b/integration-tests/bats/json.bats index f8724bd6a6c..f7c619dbfb6 100644 --- a/integration-tests/bats/json.bats +++ b/integration-tests/bats/json.bats @@ -294,8 +294,28 @@ SQL insert into js values (1, "[[[[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]],[[[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]],[[[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]],[[[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]]]"); SQL # If the document isn't put into an IndexedJsonDocument, it will have the following hash. - dolt show WORKING run dolt show qivuleqpbin1eise78h5u8k1hqe4f07g [ "$status" -eq 0 ] [[ "$output" =~ "Blob" ]] || false +} + +# Older clients have a bug that prevents them from reading indexed JSON documents with strings split across chunks, +# And making one large chunk can causes issues with the journal. +# Thus, we expect that the document gets stored as a blob. +@test "json: document with large string value doesn't get indexed" { + run dolt sql <>'$.a' from js;" - [ "$status" -eq 0 ] - [[ "$output" =~ "one" ]] || false - run dolt -sql "select j->>'$.c' from js;" - [ "$status" -eq 0 ] - [[ "$output" =~ "three" ]] || false -} diff --git a/integration-tests/compatibility/test_files/setup_repo.sh b/integration-tests/compatibility/test_files/setup_repo.sh index dbee95e8db2..757cc845597 100755 --- a/integration-tests/compatibility/test_files/setup_repo.sh +++ b/integration-tests/compatibility/test_files/setup_repo.sh @@ -38,17 +38,6 @@ CREATE TABLE big ( ); SQL dolt sql < "../../test_files/big_table.sql" # inserts 1K rows to `big` - -dolt sql < Date: Thu, 9 Jan 2025 01:19:34 +0000 Subject: [PATCH 09/15] [ga-format-pr] Run go/utils/repofmt/format_repo.sh and go/Godeps/update.sh --- go/go.mod | 1 + go/go.sum | 1 + go/store/prolly/tree/json_scanner.go | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/go/go.mod b/go/go.mod index 22588da0edb..6b55810ec71 100644 --- a/go/go.mod +++ b/go/go.mod @@ -63,6 +63,7 @@ require ( github.com/google/btree v1.1.2 github.com/google/go-github/v57 v57.0.0 github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 + github.com/hashicorp/go-uuid v1.0.1 github.com/hashicorp/golang-lru/v2 v2.0.2 github.com/jmoiron/sqlx v1.3.4 github.com/kch42/buzhash v0.0.0-20160816060738-9bdec3dec7c6 diff --git a/go/go.sum b/go/go.sum index 55f9e7af7fa..878d843c29c 100644 --- a/go/go.sum +++ b/go/go.sum @@ -389,6 +389,7 @@ github.com/hashicorp/go-sockaddr v1.0.0/go.mod h1:7Xibr9yA9JjQq1JpNB2Vw7kxv8xerX github.com/hashicorp/go-syslog v1.0.0/go.mod h1:qPfqrKkXGihmCqbJM2mZgkZGvKG1dFdvsLplgctolz4= github.com/hashicorp/go-uuid v0.0.0-20180228145832-27454136f036/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= github.com/hashicorp/go-uuid v1.0.0/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= +github.com/hashicorp/go-uuid v1.0.1 h1:fv1ep09latC32wFoVwnqcnKJGnMSdBanPczbHAYm1BE= github.com/hashicorp/go-uuid v1.0.1/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= github.com/hashicorp/go-version v1.2.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA= github.com/hashicorp/go.net v0.0.1/go.mod h1:hjKkEWcCURg++eb33jQU7oqQcI9XDCnUzHA0oac0k90= diff --git a/go/store/prolly/tree/json_scanner.go b/go/store/prolly/tree/json_scanner.go index 1480f24f0c8..1fbd5b37e57 100644 --- a/go/store/prolly/tree/json_scanner.go +++ b/go/store/prolly/tree/json_scanner.go @@ -16,8 +16,9 @@ package tree import ( "fmt" - errorkinds "gopkg.in/src-d/go-errors.v1" "io" + + errorkinds "gopkg.in/src-d/go-errors.v1" ) // JsonScanner is a state machine that parses already-normalized JSON while keeping track of the path to the current value. From 077e4850a4120358e711aa21314db5808a1820bc Mon Sep 17 00:00:00 2001 From: Nick Tobey Date: Wed, 8 Jan 2025 17:24:27 -0800 Subject: [PATCH 10/15] Prevent an infinite loop when reading a JSON chunk that ends in the middle of a string. --- go/store/prolly/tree/json_scanner.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/go/store/prolly/tree/json_scanner.go b/go/store/prolly/tree/json_scanner.go index 1fbd5b37e57..f6f00cf7bc9 100644 --- a/go/store/prolly/tree/json_scanner.go +++ b/go/store/prolly/tree/json_scanner.go @@ -208,6 +208,8 @@ func (s *JsonScanner) acceptKeyString() (stringBytes []byte, err error) { stringStart := s.valueOffset for s.current() != '"' { switch s.current() { + case endOfFile: + return nil, jsonParseError case '\\': s.valueOffset++ } From bb3d2f341337e3c7f5503f86aeeecae04b0203ec Mon Sep 17 00:00:00 2001 From: Nick Tobey Date: Wed, 8 Jan 2025 17:25:42 -0800 Subject: [PATCH 11/15] When encountering a jsonParseError when working with JSON, it's most likely a bug in handling JSON metadata. Emit a warning, then continue, treating the document as a blob and ignoring JSON metadata. --- go/store/prolly/tree/json_indexed_document.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/go/store/prolly/tree/json_indexed_document.go b/go/store/prolly/tree/json_indexed_document.go index 414ba30450c..9dd1ab2ba4e 100644 --- a/go/store/prolly/tree/json_indexed_document.go +++ b/go/store/prolly/tree/json_indexed_document.go @@ -114,8 +114,8 @@ func tryWithFallback( tryFunc func() error, fallbackFunc func(document types.JSONDocument) error) error { err := tryFunc() - if err == unknownLocationKeyError || err == unsupportedPathError || largeJsonStringError.Is(err) { - if err == unknownLocationKeyError { + if err == unknownLocationKeyError || err == unsupportedPathError || err == jsonParseError || largeJsonStringError.Is(err) { + if err != unsupportedPathError { if sqlCtx, ok := ctx.(*sql.Context); ok { sqlCtx.GetLogger().Warn(err) } From c984466baaadeb8a6bb286ab94f5ee830e4b3038 Mon Sep 17 00:00:00 2001 From: Nick Tobey Date: Wed, 8 Jan 2025 17:35:44 -0800 Subject: [PATCH 12/15] Fix flipped sign in JsonCursor::NextValue --- go/store/prolly/tree/json_cursor.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/store/prolly/tree/json_cursor.go b/go/store/prolly/tree/json_cursor.go index 49414b4ff8e..ee83b761f86 100644 --- a/go/store/prolly/tree/json_cursor.go +++ b/go/store/prolly/tree/json_cursor.go @@ -135,7 +135,7 @@ func (j *JsonCursor) NextValue(ctx context.Context) (result []byte, err error) { if err != nil { return } - if cmp < 0 { + if cmp >= 0 { break } parseChunk() From fb310646728ea7ca58d93288525441b6607af02c Mon Sep 17 00:00:00 2001 From: Nick Tobey Date: Wed, 8 Jan 2025 19:36:24 -0800 Subject: [PATCH 13/15] Unskip dolt query test. --- go/libraries/doltcore/sqle/enginetest/dolt_queries.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/go/libraries/doltcore/sqle/enginetest/dolt_queries.go b/go/libraries/doltcore/sqle/enginetest/dolt_queries.go index a4e592401e2..7197c182af0 100644 --- a/go/libraries/doltcore/sqle/enginetest/dolt_queries.go +++ b/go/libraries/doltcore/sqle/enginetest/dolt_queries.go @@ -4786,11 +4786,8 @@ var LargeJsonObjectScriptTests = []queries.ScriptTest{ Expected: []sql.Row{{types.OkResult{RowsAffected: 1}}}, }, { - Skip: true, - // TODO: The JSON is coming back truncated for some reason and failing this test. - // When that's fixed, unskip this test, and fix the length value below. - Query: `SELECT pk, length(j1) from t;`, - Expected: []sql.Row{{1, 123}}, + Query: `SELECT pk, length(j1->>"$.large_value") from t;`, + Expected: []sql.Row{{1, 1024 * 1024 * 3}}, }, }, }, From 94bcde030e2b9b5a01bb537f0c10f1e8641e4148 Mon Sep 17 00:00:00 2001 From: Nick Tobey Date: Wed, 8 Jan 2025 20:01:34 -0800 Subject: [PATCH 14/15] Revert change to compatibility test. --- integration-tests/compatibility/test_files/setup_repo.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/integration-tests/compatibility/test_files/setup_repo.sh b/integration-tests/compatibility/test_files/setup_repo.sh index 757cc845597..6e4772708a1 100755 --- a/integration-tests/compatibility/test_files/setup_repo.sh +++ b/integration-tests/compatibility/test_files/setup_repo.sh @@ -38,7 +38,6 @@ CREATE TABLE big ( ); SQL dolt sql < "../../test_files/big_table.sql" # inserts 1K rows to `big` -SQL dolt add . dolt commit -m "initialized data" dolt branch init From 5429156f1d34bc05e118d60fb942eb046f1f43a0 Mon Sep 17 00:00:00 2001 From: Nick Tobey Date: Wed, 8 Jan 2025 20:13:04 -0800 Subject: [PATCH 15/15] Update indexed document test. --- go/store/prolly/tree/json_indexed_document_test.go | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/go/store/prolly/tree/json_indexed_document_test.go b/go/store/prolly/tree/json_indexed_document_test.go index 9676043049e..dade1fc917d 100644 --- a/go/store/prolly/tree/json_indexed_document_test.go +++ b/go/store/prolly/tree/json_indexed_document_test.go @@ -460,12 +460,15 @@ func TestJsonCompare(t *testing.T) { func TestIndexedJsonDocument_CreateLargeStringValues(t *testing.T) { ctx := sql.NewEmptyContext() ns := NewTestNodeStore() - doc := make(map[string]interface{}) + docMap := make(map[string]interface{}) value := strings.Repeat("x", 2097152) - doc["key"] = value - indexedDoc := newIndexedJsonDocumentFromValue(t, ctx, ns, doc) - - lookup, err := indexedDoc.Lookup(ctx, "$.key") + docMap["key"] = value + doc, _, err := types.JSON.Convert(docMap) + require.NoError(t, err) + root, err := SerializeJsonToAddr(ctx, ns, doc.(sql.JSONWrapper)) + require.NoError(t, err) + indexedDoc, err := NewJSONDoc(root.HashOf(), ns).ToIndexedJSONDocument(ctx) + lookup, err := types.LookupJSONValue(indexedDoc, "$.key") require.NoError(t, err) extractedValue, _, err := types.LongText.Convert(lookup) require.NoError(t, err)