diff --git a/go/store/prolly/tree/json_chunker.go b/go/store/prolly/tree/json_chunker.go index a5dd0d9eed..75b9f1591f 100644 --- a/go/store/prolly/tree/json_chunker.go +++ b/go/store/prolly/tree/json_chunker.go @@ -190,9 +190,13 @@ func (j *JsonChunker) appendJsonToBuffer(jsonBytes []byte) { // processBuffer reads all new additions added by appendJsonToBuffer, and determines any new chunk boundaries. // Do not call this method directly. It should only get called from within this file. -func (j *JsonChunker) processBuffer(ctx context.Context) error { +func (j *JsonChunker) processBuffer(ctx context.Context) (err error) { chunkStart := 0 - for j.jScanner.AdvanceToNextLocation() != io.EOF { + err = j.jScanner.AdvanceToNextLocation() + for err != io.EOF { + if err != nil { + return err + } key := j.jScanner.currentPath.key value := j.jScanner.jsonBuffer[chunkStart:j.jScanner.valueOffset] if crossesBoundary(key, value) { @@ -202,6 +206,7 @@ func (j *JsonChunker) processBuffer(ctx context.Context) error { } chunkStart = j.jScanner.valueOffset } + err = j.jScanner.AdvanceToNextLocation() } if chunkStart > 0 { newValueOffset := j.jScanner.valueOffset - chunkStart diff --git a/go/store/prolly/tree/json_indexed_document.go b/go/store/prolly/tree/json_indexed_document.go index 9a781be4ca..65295f3c41 100644 --- a/go/store/prolly/tree/json_indexed_document.go +++ b/go/store/prolly/tree/json_indexed_document.go @@ -19,7 +19,6 @@ import ( "database/sql/driver" "encoding/json" "fmt" - "strings" "sync" "github.com/dolthub/go-mysql-server/sql" @@ -107,29 +106,40 @@ func getBytesFromIndexedJsonMap(ctx context.Context, m StaticJsonMap) (bytes []b return bytes, err } -// Lookup implements types.SearchableJSON -func (i IndexedJsonDocument) Lookup(ctx context.Context, pathString string) (sql.JSONWrapper, error) { - if strings.Contains(pathString, "*") { - // Optimized lookup doesn't currently support wildcards. Fall back on an unoptimized approach. 
- // TODO: Optimized lookup on wildcards. - val, err := i.ToInterface() - if err != nil { - return nil, err - } - nonIndexedDoc := types.JSONDocument{Val: val} - return nonIndexedDoc.Lookup(ctx, pathString) - } - result, err := i.tryLookup(ctx, pathString) - if err == unknownLocationKeyError { - if sqlCtx, ok := ctx.(*sql.Context); ok { - sqlCtx.GetLogger().Warn(err) +func tryWithFallback( + ctx context.Context, + i IndexedJsonDocument, + tryFunc func() error, + fallbackFunc func(document types.JSONDocument) error) error { + err := tryFunc() + if err == unknownLocationKeyError || err == unsupportedPathError { + if err == unknownLocationKeyError { + if sqlCtx, ok := ctx.(*sql.Context); ok { + sqlCtx.GetLogger().Warn(err) + } } v, err := i.ToInterface() if err != nil { - return nil, err + return err } - return types.JSONDocument{Val: v}.Lookup(ctx, pathString) + return fallbackFunc(types.JSONDocument{Val: v}) } + return err +} + +// Lookup implements types.SearchableJSON +func (i IndexedJsonDocument) Lookup(ctx context.Context, pathString string) (result sql.JSONWrapper, err error) { + err = tryWithFallback( + ctx, + i, + func() error { + result, err = i.tryLookup(ctx, pathString) + return err + }, + func(jsonDocument types.JSONDocument) error { + result, err = jsonDocument.Lookup(ctx, pathString) + return err + }) return result, err } @@ -160,18 +170,18 @@ func (i IndexedJsonDocument) tryLookup(ctx context.Context, pathString string) ( } // Insert implements types.MutableJSON -func (i IndexedJsonDocument) Insert(ctx context.Context, path string, val sql.JSONWrapper) (types.MutableJSON, bool, error) { - result, changed, err := i.tryInsert(ctx, path, val) - if err == unknownLocationKeyError { - if sqlCtx, ok := ctx.(*sql.Context); ok { - sqlCtx.GetLogger().Warn(err) - } - v, err := i.ToInterface() - if err != nil { - return nil, false, err - } - return types.JSONDocument{Val: v}.Insert(ctx, path, val) - } +func (i IndexedJsonDocument) Insert(ctx context.Context, path 
string, val sql.JSONWrapper) (result types.MutableJSON, changed bool, err error) { + err = tryWithFallback( + ctx, + i, + func() error { + result, changed, err = i.tryInsert(ctx, path, val) + return err + }, + func(jsonDocument types.JSONDocument) error { + result, changed, err = jsonDocument.Insert(ctx, path, val) + return err + }) return result, changed, err } diff --git a/go/store/prolly/tree/json_indexed_document_test.go b/go/store/prolly/tree/json_indexed_document_test.go index 251efc7d71..f6b94aa5e1 100644 --- a/go/store/prolly/tree/json_indexed_document_test.go +++ b/go/store/prolly/tree/json_indexed_document_test.go @@ -16,15 +16,20 @@ package tree import ( "context" + "fmt" + "strings" "testing" + "github.com/dolthub/go-mysql-server/sql/expression" + "github.com/dolthub/go-mysql-server/sql/expression/function/json" + "github.com/dolthub/go-mysql-server/sql" "github.com/dolthub/go-mysql-server/sql/expression/function/json/jsontests" "github.com/dolthub/go-mysql-server/sql/types" "github.com/stretchr/testify/require" ) -// NewIndexedJsonDocumentFromBytes creates an IndexedJsonDocument from a byte sequence, which has already been validated and normalized +// newIndexedJsonDocumentFromValue creates an IndexedJsonDocument from a provided value. func newIndexedJsonDocumentFromValue(t *testing.T, ctx context.Context, ns NodeStore, v interface{}) IndexedJsonDocument { doc, _, err := types.JSON.Convert(v) require.NoError(t, err) @@ -33,8 +38,91 @@ func newIndexedJsonDocumentFromValue(t *testing.T, ctx context.Context, ns NodeS return NewIndexedJsonDocument(ctx, root, ns) } +// createLargeDocumentForTesting creates a JSON document large enough to be split across multiple chunks. +// This is useful for testing mutation operations in large documents. 
+// Every different possible jsonPathType appears on a chunk boundary, for better test coverage: +// chunk 0 key: $[6].children[2].children[0].number(endOfValue) +// chunk 2 key: $[7].children[5].children[4].children[2].children(arrayInitialElement) +// chunk 5 key: $[8].children[6].children[4].children[3].children[0](startOfValue) +// chunk 8 key: $[8].children[7].children[6].children[5].children[3].children[2].children[1](objectInitialElement) +func createLargeDocumentForTesting(t *testing.T, ctx *sql.Context, ns NodeStore) IndexedJsonDocument { + leafDoc := make(map[string]interface{}) + leafDoc["number"] = float64(1.0) + leafDoc["string"] = "dolt" + docExpression, err := json.NewJSONArray(expression.NewLiteral(newIndexedJsonDocumentFromValue(t, ctx, ns, leafDoc), types.JSON)) + require.NoError(t, err) + + for level := 0; level < 8; level++ { + childObjectExpression, err := json.NewJSONObject(expression.NewLiteral("children", types.Text), docExpression) + require.NoError(t, err) + docExpression, err = json.NewJSONArrayAppend(docExpression, expression.NewLiteral("$", types.Text), childObjectExpression) + require.NoError(t, err) + } + doc, err := docExpression.Eval(ctx, nil) + require.NoError(t, err) + return newIndexedJsonDocumentFromValue(t, ctx, ns, doc) +} + +var jsonPathTypeNames = []string{ + "startOfValue", + "objectInitialElement", + "arrayInitialElement", + "endOfValue", +} + +type chunkBoundary struct { + chunkId int + path string + pathType jsonPathType +} + +var largeDocumentChunkBoundaries = []chunkBoundary{ + { + chunkId: 0, + path: "$[6].children[2].children[0].number", + pathType: endOfValue, + }, + { + chunkId: 2, + path: "$[7].children[5].children[4].children[2].children", + pathType: arrayInitialElement, + }, + { + chunkId: 5, + path: "$[8].children[6].children[4].children[3].children[0]", + pathType: startOfValue, + }, + { + chunkId: 8, + path: "$[8].children[7].children[6].children[5].children[3].children[2].children[1]", + pathType: 
objectInitialElement, + }, +} + +func jsonLocationFromMySqlPath(t *testing.T, path string, pathType jsonPathType) jsonLocation { + result, err := jsonPathElementsFromMySQLJsonPath([]byte(path)) + require.NoError(t, err) + result.setScannerState(pathType) + return result +} + +// TestIndexedJsonDocument_ValidateChunks asserts that the values defined in largeDocumentChunkBoundaries are accurate, +// so they can be used in other tests. +func TestIndexedJsonDocument_ValidateChunks(t *testing.T) { + ctx := sql.NewEmptyContext() + ns := NewTestNodeStore() + largeDoc := createLargeDocumentForTesting(t, ctx, ns) + for _, boundary := range largeDocumentChunkBoundaries { + t.Run(fmt.Sprintf("validate %v at chunk %v", jsonPathTypeNames[boundary.pathType], boundary.chunkId), func(t *testing.T) { + expectedKey := jsonLocationFromMySqlPath(t, boundary.path, boundary.pathType) + actualKey := []byte(largeDoc.m.Root.GetKey(boundary.chunkId)) + require.Equal(t, expectedKey.key, actualKey) + }) + } +} + func TestIndexedJsonDocument_Insert(t *testing.T) { - ctx := context.Background() + ctx := sql.NewEmptyContext() ns := NewTestNodeStore() convertToIndexedJsonDocument := func(t *testing.T, s interface{}) interface{} { return newIndexedJsonDocumentFromValue(t, ctx, ns, s) @@ -42,6 +130,51 @@ func TestIndexedJsonDocument_Insert(t *testing.T) { testCases := jsontests.JsonInsertTestCases(t, convertToIndexedJsonDocument) jsontests.RunJsonTests(t, testCases) + + t.Run("large document inserts", func(t *testing.T) { + + largeDoc := createLargeDocumentForTesting(t, ctx, ns) + + // Generate a value large enough that, if it's inserted, it will guarantee a change in chunk boundaries. + valueToInsert, err := largeDoc.Lookup(ctx, "$[6]") + require.NoError(t, err) + + for _, chunkBoundary := range largeDocumentChunkBoundaries { + t.Run(jsonPathTypeNames[chunkBoundary.pathType], func(t *testing.T) { + // Compute a location right before the chunk boundary, and insert a large value into it. 
+ insertionPoint := chunkBoundary.path[:strings.LastIndex(chunkBoundary.path, ".")] + insertionPoint = fmt.Sprint(insertionPoint, ".a") + newDoc, changed, err := largeDoc.Insert(ctx, insertionPoint, valueToInsert) + require.NoError(t, err) + require.True(t, changed) + + // test that the chunk boundary was moved as a result of the insert. + newBoundary := []byte(largeDoc.m.Root.GetKey(chunkBoundary.chunkId)) + previousBoundary := jsonLocationFromMySqlPath(t, chunkBoundary.path, chunkBoundary.pathType) + require.NotEqual(t, newBoundary, previousBoundary) + + // test that new value is valid by converting it to interface{} + v, err := newDoc.ToInterface() + require.NoError(t, err) + newJsonDocument := types.JSONDocument{Val: v} + + // test that the JSONDocument compares equal to the IndexedJSONDocument + cmp, err := types.JSON.Compare(newDoc, newJsonDocument) + require.NoError(t, err) + require.Equal(t, cmp, 0) + + // extract the inserted value and confirm it's equal to the original inserted value. + result, err := newJsonDocument.Lookup(ctx, insertionPoint) + require.NoError(t, err) + require.NotNil(t, result) + + cmp, err = types.JSON.Compare(valueToInsert, result) + require.NoError(t, err) + require.Equal(t, cmp, 0) + }) + } + }) + } func TestIndexedJsonDocument_Extract(t *testing.T) { diff --git a/go/store/prolly/tree/json_location.go b/go/store/prolly/tree/json_location.go index 3fce44767a..48c5681a8f 100644 --- a/go/store/prolly/tree/json_location.go +++ b/go/store/prolly/tree/json_location.go @@ -72,6 +72,7 @@ const ( ) var unknownLocationKeyError = fmt.Errorf("A JSON document was written with a future version of Dolt, and the index metadata cannot be read. 
This will impact performance for large documents.") +var unsupportedPathError = fmt.Errorf("The supplied JSON path is not supported for optimized lookup, falling back on unoptimized implementation.") const ( beginObjectKey byte = 0xFF @@ -122,6 +123,7 @@ func jsonPathFromKey(pathKey []byte) (path jsonLocation) { i += 1 } else if pathKey[i] == beginArrayKey { ret.offsets = append(ret.offsets, i) + i += 1 i += varIntLength(pathKey[i]) } else { i += 1 @@ -143,14 +145,24 @@ func varIntLength(firstByte byte) int { return int(firstByte - 246) } -func isValidJsonPathKey(key []byte) bool { +func isUnsupportedJsonPathKey(key []byte) bool { if bytes.Equal(key, []byte("*")) { - return false + return true } if bytes.Equal(key, []byte("**")) { - return false + return true + } + return false +} + +func isUnsupportedJsonArrayIndex(index []byte) bool { + if bytes.Equal(index, []byte("*")) { + return true + } + if bytes.Equal(index, []byte("last")) { + return true } - return true + return false } func errorIfNotSupportedLocation(key []byte) error { @@ -199,11 +211,12 @@ func jsonPathElementsFromMySQLJsonPath(pathBytes []byte) (jsonLocation, error) { return jsonLocation{}, fmt.Errorf("Invalid JSON path expression. Expected field name after '.' at character %v of %s", i, string(pathBytes)) } case lexStateIdx: - if pathBytes[i] >= byte('0') && pathBytes[i] <= byte('9') { + if pathBytes[i] != byte(']') { i += 1 } else { - if pathBytes[i] != ']' { - return jsonLocation{}, fmt.Errorf("Invalid JSON path expression %s.", string(pathBytes)) + indexBytes := pathBytes[tok:i] + if isUnsupportedJsonArrayIndex(indexBytes) { + return jsonLocation{}, unsupportedPathError } conv, err := strconv.Atoi(string(pathBytes[tok:i])) if err != nil { @@ -219,10 +232,11 @@ func jsonPathElementsFromMySQLJsonPath(pathBytes []byte) (jsonLocation, error) { i += 1 } else if pathBytes[i] == '.' || pathBytes[i] == '[' { if tok == i { - return jsonLocation{}, fmt.Errorf("Invalid JSON path expression. 
Expected field name after '.' at character %v of %s", i, string(pathBytes)) + // This could be a .[*] expression. Let the original implementation take a stab at it. + return jsonLocation{}, unsupportedPathError } - if !isValidJsonPathKey(pathBytes[tok:i]) { - return jsonLocation{}, fmt.Errorf("Invalid JSON path expression. Expected field name after '.' at character %v of %s", tok, string(pathBytes)) + if isUnsupportedJsonPathKey(pathBytes[tok:i]) { + return jsonLocation{}, unsupportedPathError } location.appendObjectKey(pathBytes[tok:i]) state = lexStatePath @@ -235,8 +249,8 @@ func jsonPathElementsFromMySQLJsonPath(pathBytes []byte) (jsonLocation, error) { return jsonLocation{}, fmt.Errorf("Invalid JSON path expression. Expected field name after '.' at character %v of %s", i, string(pathBytes)) } pathKey := unescapeKey(pathBytes[tok+1 : i]) - if !isValidJsonPathKey(pathKey) { - return jsonLocation{}, fmt.Errorf("Invalid JSON path expression. Expected field name after '.' at character %v of %s", tok, string(pathBytes)) + if isUnsupportedJsonPathKey(pathKey) { + return jsonLocation{}, unsupportedPathError } location.appendObjectKey(pathKey) state = lexStatePath @@ -256,8 +270,8 @@ func jsonPathElementsFromMySQLJsonPath(pathBytes []byte) (jsonLocation, error) { if tok == i { return jsonLocation{}, fmt.Errorf("Invalid JSON path expression. Expected field name after '.' at character %v of %s", i, string(pathBytes)) } - if !isValidJsonPathKey(pathBytes[tok:i]) { - return jsonLocation{}, fmt.Errorf("Invalid JSON path expression. Expected field name after '.' at character %v of %s", tok, string(pathBytes)) + if isUnsupportedJsonPathKey(pathBytes[tok:i]) { + return jsonLocation{}, unsupportedPathError } location.appendObjectKey(pathBytes[tok:i]) state = lexStatePath