Skip to content

Commit

Permalink
sql: fix trigram span generation for similarity filters
Browse files Browse the repository at this point in the history
This commit fixes a bug in the generation of trigram inverted index
spans for similarity filters. The bug could cause rows to be incorrectly
filtered out of results.

Previously, we did not generate padded trigrams when building the
inverted spans for similarity filters. Now, we generate padded trigrams
to correct the bug.

For example, for a filter such as `col % 'aab'`, we would generate the
single trigram `'aab'` and the corresponding span `['aab'-'aab']`.
This span does not contain all indexed trigrams of values that are
similar to `'aab'`. As an example, it covers none of the trigrams of
`'aaaaaa'`, which are `{'  a',' aa','aa ','aaa'}`. Now, for the same
expression `col % 'aab'`, we generate the padded trigrams
`{'  a',' aa','aab','ab '}` and the corresponding spans
`['  a'-'  a'], [' aa'-' aa'], ['aab'-'aab'], ['ab '-'ab ']` which
contain some of the trigrams of `'aaaaaa'`.

Fixes #89609

Release note (bug fix): A bug has been fixed that caused incorrect
results for queries with string similar filters (e.g., `col % 'abc'`) on
tables with trigram indexes. This bug is only present in 22.2
pre-release versions up to and including v22.2.0-beta.3.
  • Loading branch information
mgartner committed Oct 13, 2022
1 parent 73d41ba commit afbd124
Show file tree
Hide file tree
Showing 5 changed files with 120 additions and 46 deletions.
34 changes: 34 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/trigram_indexes
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,11 @@ SELECT similarity(t, 'fooz'), * FROM a@a_t_idx WHERE t % 'fooz' ORDER BY a
0.4 1 foozoopa
0.5 2 Foo

query FIT
SELECT similarity(t, 'fo'), * FROM a@a_t_idx WHERE t % 'fo' ORDER BY a
----
0.4 2 Foo

statement ok
SET pg_trgm.similarity_threshold=.45

Expand All @@ -109,6 +114,15 @@ SELECT similarity(t, 'fooz'), * FROM a@a_t_idx WHERE t % 'fooz'
----
0.5 2 Foo

statement ok
SET pg_trgm.similarity_threshold=0.1

query FIT
SELECT similarity(t, 'f'), * FROM a@a_t_idx WHERE t % 'f' ORDER BY a
----
0.1 1 foozoopa
0.2 2 Foo

# Test the acceleration of the equality operator.
query IT
SELECT * FROM a@a_t_idx WHERE t = 'Foo'
Expand Down Expand Up @@ -251,3 +265,23 @@ query IT
SELECT * FROM t88558 WHERE 'aab':::STRING LIKE b;
----
1 %

# Regression test for #89609. Pad trigrams when building inverted spans for
# similarity filters.
statement ok
CREATE TABLE t89609 (
t TEXT,
INVERTED INDEX idx (t gin_trgm_ops)
);
INSERT INTO t89609 VALUES ('aaaaaa');
SET pg_trgm.similarity_threshold=.3

query T
SELECT t FROM t89609@primary WHERE t::STRING % 'aab';
----
aaaaaa

query T
SELECT t FROM t89609@idx WHERE t::STRING % 'aab';
----
aaaaaa
43 changes: 27 additions & 16 deletions pkg/sql/opt/exec/execbuilder/testdata/trigram_index
Original file line number Diff line number Diff line change
Expand Up @@ -149,10 +149,14 @@ vectorized: true
└── • index join
│ table: a@a_pkey
└── • scan
missing stats
table: a@a_b_idx
spans: 1 span
└── • inverted filter
│ inverted column: b_inverted_key
│ num spans: 4
└── • scan
missing stats
table: a@a_b_idx
spans: 4 spans

# Test that trigram indexes accelerate the % operator with an OR if the
# constant has more than one trigram.
Expand All @@ -170,15 +174,15 @@ vectorized: true
└── • inverted filter
│ inverted column: b_inverted_key
│ num spans: 2
│ num spans: 5
└── • scan
missing stats
table: a@a_b_idx
spans: 2 spans
spans: 5 spans

# Test that trigram indexes can't accelerate the % operator if there are fewer
# than 3 characters in the constant.
# Test that trigram indexes can accelerate the % operator if there are fewer
# than 3 characters in the constant by using padded trigrams.
query T
EXPLAIN SELECT * FROM a WHERE b % 'fo'
----
Expand All @@ -188,10 +192,17 @@ vectorized: true
• filter
│ filter: b % 'fo'
└── • scan
missing stats
table: a@a_pkey
spans: FULL SCAN
└── • index join
│ table: a@a_pkey
└── • inverted filter
│ inverted column: b_inverted_key
│ num spans: 3
└── • scan
missing stats
table: a@a_b_idx
spans: 3 spans

# Test that trigram indexes can accelerate the % operator in reverse order.
query T
Expand All @@ -208,12 +219,12 @@ vectorized: true
└── • inverted filter
│ inverted column: b_inverted_key
│ num spans: 2
│ num spans: 5
└── • scan
missing stats
table: a@a_b_idx
spans: 2 spans
spans: 5 spans

# Test that trigram indexes can't accelerate the % operator with no constant
# columns.
Expand Down Expand Up @@ -292,12 +303,12 @@ vectorized: true
└── • inverted filter
│ inverted column: a_inverted_key
│ num spans: 2
│ num spans: 5
└── • scan
missing stats
table: b@b_a_idx
spans: 2 spans
spans: 5 spans

# Regression test for #88925.
statement ok
Expand Down
7 changes: 3 additions & 4 deletions pkg/sql/opt/invertedidx/trigram_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,11 @@ func TestTryFilterTrigram(t *testing.T) {

// Similarity queries.
{filters: "s % 'lkjsdlkj'", ok: true, unique: false},
{filters: "s % 'lkj'", ok: true, unique: true},
// Can't generate trigrams from such a short constant.
{filters: "s % 'lj'", ok: false},
{filters: "s % 'lkj'", ok: true, unique: false},
{filters: "s % 'lj'", ok: true, unique: false},

// AND and OR for two similarity queries behave as expected.
{filters: "s % 'lkj' AND s % 'bla'", ok: true, unique: true},
{filters: "s % 'lkj' AND s % 'bla'", ok: true, unique: false},
{filters: "s % 'lkj' OR s % 'bla'", ok: true, unique: false},

// Can combine similarity and LIKE queries and still get inverted
Expand Down
4 changes: 2 additions & 2 deletions pkg/sql/rowenc/index_encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -913,7 +913,7 @@ func encodeOverlapsArrayInvertedIndexSpans(
// expression must match every trigram in the input. Otherwise, it will match
// any trigram in the input.
func EncodeTrigramSpans(s string, allMustMatch bool) (inverted.Expression, error) {
// We do not pad the trigrams when searching the index. To see why, observe
// We do not pad the trigrams when allMustMatch is true. To see why, observe
// the keys that we insert for a string "zfooz":
//
// " z", " zf", "zfo", "foo", "foz", "oz "
Expand All @@ -922,7 +922,7 @@ func EncodeTrigramSpans(s string, allMustMatch bool) (inverted.Expression, error
// keys as well, we'd be searching for the key " f", which doesn't exist
// in the index for zfooz, even though zfooz is like %foo%.
keys, err := encodeTrigramInvertedIndexTableKeys(s, nil, /* inKey */
descpb.LatestIndexDescriptorVersion, false /* pad */)
descpb.LatestIndexDescriptorVersion, !allMustMatch /* pad */)
if err != nil {
return nil, err
}
Expand Down
78 changes: 54 additions & 24 deletions pkg/sql/rowenc/index_encoding_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -912,8 +912,10 @@ func TestEncodeTrigramInvertedIndexSpans(t *testing.T) {
value string
// Whether we're using LIKE or % operator for the search.
searchType trigramSearchType
// Whether we expect that the spans should contain all of the keys produced
// by indexing the indexedValue.
// Whether we expect that the spans should contain the keys produced by
// indexing the indexedValue. If the searchType is similar, then the
// spans should contain at least one of the indexed keys, otherwise the
// spans should contain all the indexed keys.
containsKeys bool
// Whether we expect that the indexed value should evaluate as matching
// the LIKE or % expression that we're testing.
Expand All @@ -940,9 +942,10 @@ func TestEncodeTrigramInvertedIndexSpans(t *testing.T) {
// Similarity (%) queries.
{`staticcheck`, `staricheck`, similar, true, true, false},
{`staticcheck`, `blevicchlrk`, similar, true, false, false},
{`staticcheck`, `che`, similar, true, false, true},
{`staticcheck`, `xxx`, similar, false, false, true},
{`staticcheck`, `che`, similar, true, false, false},
{`staticcheck`, `xxx`, similar, false, false, false},
{`staticcheck`, `xxxyyy`, similar, false, false, false},
{`aaaaaa`, `aab`, similar, true, true, false},

// Equality queries.
{`staticcheck`, `staticcheck`, eq, true, true, false},
Expand Down Expand Up @@ -979,10 +982,23 @@ func TestEncodeTrigramInvertedIndexSpans(t *testing.T) {
}
require.Equal(t, expectUnique, spanExpr.Unique, "%s, %s: unexpected unique attribute", indexedValue, value)

// Check if the indexedValue is included by the spans.
containsKeys, err := spanExpr.ContainsKeys(keys)
require.NoError(t, err)

// Check if the indexedValue is included by the spans. If the search is
// a similarity search, the spans should contain at least one key.
// Otherwise, the spans should contain all the keys.
var containsKeys bool
if searchType == similar {
for i := range keys {
containsKey, err := spanExpr.ContainsKeys([][]byte{keys[i]})
require.NoError(t, err)
if containsKey {
containsKeys = true
break
}
}
} else {
containsKeys, err = spanExpr.ContainsKeys(keys)
require.NoError(t, err)
}
require.Equal(t, expectContainsKeys, containsKeys, "%s, %s: expected containsKeys", indexedValue, value)

// Since the spans are never tight, apply an additional filter to determine
Expand Down Expand Up @@ -1012,13 +1028,13 @@ func TestEncodeTrigramInvertedIndexSpans(t *testing.T) {

for _, searchType := range []trigramSearchType{like, eq, similar} {
expr := makeTrigramBinOp(t, left, right, searchType)
lTrigrams := trigram.MakeTrigrams(left, false /* pad */)
lTrigrams := trigram.MakeTrigrams(left, searchType == similar /* pad */)
// Check for intersection. We're looking for a non-zero intersection
// for similar, and complete containment of the right trigrams in the left
// for eq and like.
any := false
all := true
rTrigrams := trigram.MakeTrigrams(right, false /* pad */)
rTrigrams := trigram.MakeTrigrams(right, searchType == similar /* pad */)
for _, trigram := range rTrigrams {
idx := sort.Search(len(lTrigrams), func(i int) bool {
return lTrigrams[i] >= trigram
Expand All @@ -1036,15 +1052,17 @@ func TestEncodeTrigramInvertedIndexSpans(t *testing.T) {
expectedContainsKeys = all
}

t.Logf("left: %s\nright: %s\nlTrigrams: %v\nrTrigrams: %v\nany: %v\nall: %v\n", left, right, lTrigrams, rTrigrams, any, all)

d, err := eval.Expr(context.Background(), &evalCtx, expr)
require.NoError(t, err)
expected := bool(*d.(*tree.DBool))
trigrams := trigram.MakeTrigrams(right, false /* pad */)
trigrams := trigram.MakeTrigrams(right, searchType == similar /* pad */)
nTrigrams := len(trigrams)
valid := nTrigrams > 0
unique := nTrigrams == 1
if !valid {
_, err := EncodeTrigramSpans(right, true /* allMustMatch */)
_, err := EncodeTrigramSpans(right, searchType != similar /* allMustMatch */)
require.Error(t, err)
continue
}
Expand Down Expand Up @@ -1080,19 +1098,31 @@ func TestEncodeTrigramInvertedIndexSpansError(t *testing.T) {
// Make sure that any input with a chunk with fewer than 3 characters returns
// an error, since we can't produce trigrams from strings that don't meet a
// minimum of 3 characters.
inputs := []string{
"fo",
"a",
"",
testCases := []struct {
input string
allMustMatchErr bool
anyMustMatchErr bool
}{
{"fo", true, false},
{"a", true, false},
{"", true, true},
// Non-alpha characters don't count against the limit.
"fo ",
"%fo%",
"#$(*",
{"fo ", true, false},
{"%fo%", true, false},
{"#$(*)", true, true},
}
for _, input := range inputs {
_, err := EncodeTrigramSpans(input, true /* allMustMatch */)
require.Error(t, err)
_, err = EncodeTrigramSpans(input, false /* allMustMatch */)
require.Error(t, err)
for _, tc := range testCases {
_, err := EncodeTrigramSpans(tc.input, true /* allMustMatch */)
if tc.allMustMatchErr {
require.Error(t, err)
} else {
require.NoError(t, err)
}
_, err = EncodeTrigramSpans(tc.input, false /* allMustMatch */)
if tc.anyMustMatchErr {
require.Error(t, err)
} else {
require.NoError(t, err)
}
}
}

0 comments on commit afbd124

Please sign in to comment.