Skip to content

Commit

Permalink
Merge pull request #8088 from dolthub/nicktobey/json-tests
Browse files Browse the repository at this point in the history
[no-release-info] Add additional tests for manipulating large JSON documents and fix corner case bugs in JSON_LOOKUP and JSON_INSERT
  • Loading branch information
nicktobey authored Jul 3, 2024
2 parents 736eebc + 7f66c4b commit d01a38a
Show file tree
Hide file tree
Showing 4 changed files with 211 additions and 49 deletions.
9 changes: 7 additions & 2 deletions go/store/prolly/tree/json_chunker.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,9 +190,13 @@ func (j *JsonChunker) appendJsonToBuffer(jsonBytes []byte) {

// processBuffer reads all new additions added by appendJsonToBuffer, and determines any new chunk boundaries.
// Do not call this method directly. It should only get called from within this file.
func (j *JsonChunker) processBuffer(ctx context.Context) error {
func (j *JsonChunker) processBuffer(ctx context.Context) (err error) {
chunkStart := 0
for j.jScanner.AdvanceToNextLocation() != io.EOF {
err = j.jScanner.AdvanceToNextLocation()
for err != io.EOF {
if err != nil {
return err
}
key := j.jScanner.currentPath.key
value := j.jScanner.jsonBuffer[chunkStart:j.jScanner.valueOffset]
if crossesBoundary(key, value) {
Expand All @@ -202,6 +206,7 @@ func (j *JsonChunker) processBuffer(ctx context.Context) error {
}
chunkStart = j.jScanner.valueOffset
}
err = j.jScanner.AdvanceToNextLocation()
}
if chunkStart > 0 {
newValueOffset := j.jScanner.valueOffset - chunkStart
Expand Down
72 changes: 41 additions & 31 deletions go/store/prolly/tree/json_indexed_document.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ import (
"database/sql/driver"
"encoding/json"
"fmt"
"strings"
"sync"

"github.com/dolthub/go-mysql-server/sql"
Expand Down Expand Up @@ -107,29 +106,40 @@ func getBytesFromIndexedJsonMap(ctx context.Context, m StaticJsonMap) (bytes []b
return bytes, err
}

// Lookup implements types.SearchableJSON
func (i IndexedJsonDocument) Lookup(ctx context.Context, pathString string) (sql.JSONWrapper, error) {
if strings.Contains(pathString, "*") {
// Optimized lookup doesn't currently support wildcards. Fall back on an unoptimized approach.
// TODO: Optimized lookup on wildcards.
val, err := i.ToInterface()
if err != nil {
return nil, err
}
nonIndexedDoc := types.JSONDocument{Val: val}
return nonIndexedDoc.Lookup(ctx, pathString)
}
result, err := i.tryLookup(ctx, pathString)
if err == unknownLocationKeyError {
if sqlCtx, ok := ctx.(*sql.Context); ok {
sqlCtx.GetLogger().Warn(err)
// tryWithFallback runs tryFunc, an optimized operation on an IndexedJsonDocument.
// If tryFunc fails with unknownLocationKeyError (index metadata written by a newer
// Dolt version) or unsupportedPathError (path not handled by the optimized lookup),
// the document is materialized into an in-memory types.JSONDocument and
// fallbackFunc is run on it instead. Any other error is returned unchanged.
func tryWithFallback(
	ctx context.Context,
	i IndexedJsonDocument,
	tryFunc func() error,
	fallbackFunc func(document types.JSONDocument) error) error {
	err := tryFunc()
	if err == unknownLocationKeyError || err == unsupportedPathError {
		if err == unknownLocationKeyError {
			// Only the unknown-key case warrants a warning: it indicates a
			// forward-compatibility issue that will degrade performance.
			if sqlCtx, ok := ctx.(*sql.Context); ok {
				sqlCtx.GetLogger().Warn(err)
			}
		}
		v, err := i.ToInterface()
		if err != nil {
			return err
		}
		return fallbackFunc(types.JSONDocument{Val: v})
	}
	return err
}

// Lookup implements types.SearchableJSON. It first attempts the optimized
// indexed lookup; if that path is unsupported, it falls back to an
// unoptimized lookup on an in-memory document via tryWithFallback.
func (i IndexedJsonDocument) Lookup(ctx context.Context, pathString string) (result sql.JSONWrapper, err error) {
	fast := func() error {
		result, err = i.tryLookup(ctx, pathString)
		return err
	}
	slow := func(doc types.JSONDocument) error {
		result, err = doc.Lookup(ctx, pathString)
		return err
	}
	err = tryWithFallback(ctx, i, fast, slow)
	return result, err
}

Expand Down Expand Up @@ -160,18 +170,18 @@ func (i IndexedJsonDocument) tryLookup(ctx context.Context, pathString string) (
}

// Insert implements types.MutableJSON
func (i IndexedJsonDocument) Insert(ctx context.Context, path string, val sql.JSONWrapper) (types.MutableJSON, bool, error) {
result, changed, err := i.tryInsert(ctx, path, val)
if err == unknownLocationKeyError {
if sqlCtx, ok := ctx.(*sql.Context); ok {
sqlCtx.GetLogger().Warn(err)
}
v, err := i.ToInterface()
if err != nil {
return nil, false, err
}
return types.JSONDocument{Val: v}.Insert(ctx, path, val)
}
// Insert implements types.MutableJSON. It first attempts the optimized
// indexed insert; if that path is unsupported, it falls back to an
// unoptimized insert on an in-memory document via tryWithFallback.
func (i IndexedJsonDocument) Insert(ctx context.Context, path string, val sql.JSONWrapper) (result types.MutableJSON, changed bool, err error) {
	fast := func() error {
		result, changed, err = i.tryInsert(ctx, path, val)
		return err
	}
	slow := func(doc types.JSONDocument) error {
		result, changed, err = doc.Insert(ctx, path, val)
		return err
	}
	err = tryWithFallback(ctx, i, fast, slow)
	return result, changed, err
}

Expand Down
137 changes: 135 additions & 2 deletions go/store/prolly/tree/json_indexed_document_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,20 @@ package tree

import (
"context"
"fmt"
"strings"
"testing"

"github.com/dolthub/go-mysql-server/sql/expression"
"github.com/dolthub/go-mysql-server/sql/expression/function/json"

"github.com/dolthub/go-mysql-server/sql"
"github.com/dolthub/go-mysql-server/sql/expression/function/json/jsontests"
"github.com/dolthub/go-mysql-server/sql/types"
"github.com/stretchr/testify/require"
)

// NewIndexedJsonDocumentFromBytes creates an IndexedJsonDocument from a byte sequence, which has already been validated and normalized
// newIndexedJsonDocumentFromValue creates an IndexedJsonDocument from a provided value.
func newIndexedJsonDocumentFromValue(t *testing.T, ctx context.Context, ns NodeStore, v interface{}) IndexedJsonDocument {
doc, _, err := types.JSON.Convert(v)
require.NoError(t, err)
Expand All @@ -33,15 +38,143 @@ func newIndexedJsonDocumentFromValue(t *testing.T, ctx context.Context, ns NodeS
return NewIndexedJsonDocument(ctx, root, ns)
}

// createLargeDocumentForTesting creates a JSON document large enough to be split across multiple chunks.
// This is useful for testing mutation operations in large documents.
// Every different possible jsonPathType appears on a chunk boundary, for better test coverage:
// chunk 0 key: $[6].children[2].children[0].number(endOfValue)
// chunk 2 key: $[7].children[5].children[4].children[2].children(arrayInitialElement)
// chunk 5 key: $[8].children[6].children[4].children[3].children[0](startOfValue)
// chunk 8 key: $[8].children[7].children[6].children[5].children[3].children[2].children[1](objectInitialElement)
func createLargeDocumentForTesting(t *testing.T, ctx *sql.Context, ns NodeStore) IndexedJsonDocument {
	// Start from a small leaf object, then repeatedly wrap it to grow the document.
	leaf := map[string]interface{}{
		"number": float64(1.0),
		"string": "dolt",
	}
	arrayExpr, err := json.NewJSONArray(expression.NewLiteral(newIndexedJsonDocumentFromValue(t, ctx, ns, leaf), types.JSON))
	require.NoError(t, err)

	// Each iteration nests the current array inside a {"children": ...} object
	// and appends that object back onto the array, deepening the structure.
	for depth := 0; depth < 8; depth++ {
		wrapped, wrapErr := json.NewJSONObject(expression.NewLiteral("children", types.Text), arrayExpr)
		require.NoError(t, wrapErr)
		arrayExpr, err = json.NewJSONArrayAppend(arrayExpr, expression.NewLiteral("$", types.Text), wrapped)
		require.NoError(t, err)
	}
	doc, err := arrayExpr.Eval(ctx, nil)
	require.NoError(t, err)
	return newIndexedJsonDocumentFromValue(t, ctx, ns, doc)
}

// jsonPathTypeNames maps each jsonPathType value to a human-readable name,
// used to label subtests. Order must match the jsonPathType constant order.
var jsonPathTypeNames = []string{
	"startOfValue",
	"objectInitialElement",
	"arrayInitialElement",
	"endOfValue",
}

// chunkBoundary describes the location key expected at one chunk boundary of
// the document produced by createLargeDocumentForTesting.
type chunkBoundary struct {
	chunkId  int          // index of the chunk within the document's root node
	path     string       // MySQL-style JSON path at the boundary
	pathType jsonPathType // scanner state recorded at the boundary
}

// largeDocumentChunkBoundaries lists the expected chunk boundaries of the large
// test document, covering every jsonPathType. Validated by
// TestIndexedJsonDocument_ValidateChunks before being relied on elsewhere.
var largeDocumentChunkBoundaries = []chunkBoundary{
	{
		chunkId:  0,
		path:     "$[6].children[2].children[0].number",
		pathType: endOfValue,
	},
	{
		chunkId:  2,
		path:     "$[7].children[5].children[4].children[2].children",
		pathType: arrayInitialElement,
	},
	{
		chunkId:  5,
		path:     "$[8].children[6].children[4].children[3].children[0]",
		pathType: startOfValue,
	},
	{
		chunkId:  8,
		path:     "$[8].children[7].children[6].children[5].children[3].children[2].children[1]",
		pathType: objectInitialElement,
	},
}

// jsonLocationFromMySqlPath parses a MySQL-style JSON path into a jsonLocation
// with the supplied scanner state, failing the test on a parse error.
func jsonLocationFromMySqlPath(t *testing.T, path string, pathType jsonPathType) jsonLocation {
	loc, parseErr := jsonPathElementsFromMySQLJsonPath([]byte(path))
	require.NoError(t, parseErr)
	loc.setScannerState(pathType)
	return loc
}

// TestIndexedJsonDocument_ValidateChunks asserts that the values defined in
// largeDocumentChunkBoundaries are accurate, so they can be used in other tests.
func TestIndexedJsonDocument_ValidateChunks(t *testing.T) {
	ctx := sql.NewEmptyContext()
	ns := NewTestNodeStore()
	largeDoc := createLargeDocumentForTesting(t, ctx, ns)
	for _, boundary := range largeDocumentChunkBoundaries {
		name := fmt.Sprintf("validate %v at chunk %v", jsonPathTypeNames[boundary.pathType], boundary.chunkId)
		t.Run(name, func(t *testing.T) {
			wantKey := jsonLocationFromMySqlPath(t, boundary.path, boundary.pathType)
			gotKey := []byte(largeDoc.m.Root.GetKey(boundary.chunkId))
			require.Equal(t, wantKey.key, gotKey)
		})
	}
}

func TestIndexedJsonDocument_Insert(t *testing.T) {
ctx := context.Background()
ctx := sql.NewEmptyContext()
ns := NewTestNodeStore()
convertToIndexedJsonDocument := func(t *testing.T, s interface{}) interface{} {
return newIndexedJsonDocumentFromValue(t, ctx, ns, s)
}

testCases := jsontests.JsonInsertTestCases(t, convertToIndexedJsonDocument)
jsontests.RunJsonTests(t, testCases)

t.Run("large document inserts", func(t *testing.T) {

largeDoc := createLargeDocumentForTesting(t, ctx, ns)

// Generate a value large enough that, if it's inserted, will guarantee a change in chunk boundaries.
valueToInsert, err := largeDoc.Lookup(ctx, "$[6]")
require.NoError(t, err)

for _, chunkBoundary := range largeDocumentChunkBoundaries {
t.Run(jsonPathTypeNames[chunkBoundary.pathType], func(t *testing.T) {
// Compute a location right before the chunk boundary, and insert a large value into it.
insertionPoint := chunkBoundary.path[:strings.LastIndex(chunkBoundary.path, ".")]
insertionPoint = fmt.Sprint(insertionPoint, ".a")
newDoc, changed, err := largeDoc.Insert(ctx, insertionPoint, valueToInsert)
require.NoError(t, err)
require.True(t, changed)

// test that the chunk boundary was moved as a result of the insert.
newBoundary := []byte(largeDoc.m.Root.GetKey(chunkBoundary.chunkId))
previousBoundary := jsonLocationFromMySqlPath(t, chunkBoundary.path, chunkBoundary.pathType)
require.NotEqual(t, newBoundary, previousBoundary)

// test that new value is valid by converting it to interface{}
v, err := newDoc.ToInterface()
require.NoError(t, err)
newJsonDocument := types.JSONDocument{Val: v}

// test that the JSONDocument compares equal to the IndexedJSONDocument
cmp, err := types.JSON.Compare(newDoc, newJsonDocument)
require.NoError(t, err)
require.Equal(t, cmp, 0)

// extract the inserted value and confirm it's equal to the original inserted value.
result, err := newJsonDocument.Lookup(ctx, insertionPoint)
require.NoError(t, err)
require.NotNil(t, result)

cmp, err = types.JSON.Compare(valueToInsert, result)
require.NoError(t, err)
require.Equal(t, cmp, 0)
})
}
})

}

func TestIndexedJsonDocument_Extract(t *testing.T) {
Expand Down
42 changes: 28 additions & 14 deletions go/store/prolly/tree/json_location.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ const (
)

// unknownLocationKeyError signals that a stored index key was written by a newer
// Dolt version and cannot be decoded; callers fall back to an unoptimized path
// (and log a warning, since large-document performance will suffer).
var unknownLocationKeyError = fmt.Errorf("A JSON document was written with a future version of Dolt, and the index metadata cannot be read. This will impact performance for large documents.")

// unsupportedPathError signals that a JSON path contains constructs the optimized
// lookup does not handle (wildcards such as "*"/"**", or the "last" array index);
// callers fall back to the unoptimized implementation.
var unsupportedPathError = fmt.Errorf("The supplied JSON path is not supported for optimized lookup, falling back on unoptimized implementation.")

const (
beginObjectKey byte = 0xFF
Expand Down Expand Up @@ -122,6 +123,7 @@ func jsonPathFromKey(pathKey []byte) (path jsonLocation) {
i += 1
} else if pathKey[i] == beginArrayKey {
ret.offsets = append(ret.offsets, i)
i += 1
i += varIntLength(pathKey[i])
} else {
i += 1
Expand All @@ -143,14 +145,24 @@ func varIntLength(firstByte byte) int {
return int(firstByte - 246)
}

// isUnsupportedJsonPathKey reports whether the given object key is a wildcard
// ("*" or "**") that the optimized path lookup cannot handle; such paths fall
// back to the unoptimized implementation via unsupportedPathError.
func isUnsupportedJsonPathKey(key []byte) bool {
	return bytes.Equal(key, []byte("*")) || bytes.Equal(key, []byte("**"))
}

// isUnsupportedJsonArrayIndex reports whether the given array index expression
// is one ("*" or "last") that the optimized path lookup cannot handle; such
// paths fall back to the unoptimized implementation via unsupportedPathError.
func isUnsupportedJsonArrayIndex(index []byte) bool {
	return bytes.Equal(index, []byte("*")) || bytes.Equal(index, []byte("last"))
}

func errorIfNotSupportedLocation(key []byte) error {
Expand Down Expand Up @@ -199,11 +211,12 @@ func jsonPathElementsFromMySQLJsonPath(pathBytes []byte) (jsonLocation, error) {
return jsonLocation{}, fmt.Errorf("Invalid JSON path expression. Expected field name after '.' at character %v of %s", i, string(pathBytes))
}
case lexStateIdx:
if pathBytes[i] >= byte('0') && pathBytes[i] <= byte('9') {
if pathBytes[i] != byte(']') {
i += 1
} else {
if pathBytes[i] != ']' {
return jsonLocation{}, fmt.Errorf("Invalid JSON path expression %s.", string(pathBytes))
indexBytes := pathBytes[tok:i]
if isUnsupportedJsonArrayIndex(indexBytes) {
return jsonLocation{}, unsupportedPathError
}
conv, err := strconv.Atoi(string(pathBytes[tok:i]))
if err != nil {
Expand All @@ -219,10 +232,11 @@ func jsonPathElementsFromMySQLJsonPath(pathBytes []byte) (jsonLocation, error) {
i += 1
} else if pathBytes[i] == '.' || pathBytes[i] == '[' {
if tok == i {
return jsonLocation{}, fmt.Errorf("Invalid JSON path expression. Expected field name after '.' at character %v of %s", i, string(pathBytes))
// This could be a .[*] expression. Let the original implementation take a stab at it.
return jsonLocation{}, unsupportedPathError
}
if !isValidJsonPathKey(pathBytes[tok:i]) {
return jsonLocation{}, fmt.Errorf("Invalid JSON path expression. Expected field name after '.' at character %v of %s", tok, string(pathBytes))
if isUnsupportedJsonPathKey(pathBytes[tok:i]) {
return jsonLocation{}, unsupportedPathError
}
location.appendObjectKey(pathBytes[tok:i])
state = lexStatePath
Expand All @@ -235,8 +249,8 @@ func jsonPathElementsFromMySQLJsonPath(pathBytes []byte) (jsonLocation, error) {
return jsonLocation{}, fmt.Errorf("Invalid JSON path expression. Expected field name after '.' at character %v of %s", i, string(pathBytes))
}
pathKey := unescapeKey(pathBytes[tok+1 : i])
if !isValidJsonPathKey(pathKey) {
return jsonLocation{}, fmt.Errorf("Invalid JSON path expression. Expected field name after '.' at character %v of %s", tok, string(pathBytes))
if isUnsupportedJsonPathKey(pathKey) {
return jsonLocation{}, unsupportedPathError
}
location.appendObjectKey(pathKey)
state = lexStatePath
Expand All @@ -256,8 +270,8 @@ func jsonPathElementsFromMySQLJsonPath(pathBytes []byte) (jsonLocation, error) {
if tok == i {
return jsonLocation{}, fmt.Errorf("Invalid JSON path expression. Expected field name after '.' at character %v of %s", i, string(pathBytes))
}
if !isValidJsonPathKey(pathBytes[tok:i]) {
return jsonLocation{}, fmt.Errorf("Invalid JSON path expression. Expected field name after '.' at character %v of %s", tok, string(pathBytes))
if isUnsupportedJsonPathKey(pathBytes[tok:i]) {
return jsonLocation{}, unsupportedPathError
}
location.appendObjectKey(pathBytes[tok:i])
state = lexStatePath
Expand Down

0 comments on commit d01a38a

Please sign in to comment.