Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

topdown: Adding cap to caches for regex and glob built-in functions #6846

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion internal/compiler/wasm/opa/callgraph.csv

Large diffs are not rendered by default.

Binary file modified internal/compiler/wasm/opa/opa.wasm
Binary file not shown.
9 changes: 9 additions & 0 deletions topdown/glob.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ import (
"github.com/open-policy-agent/opa/topdown/builtins"
)

const globCacheMaxSize = 100

var globCacheLock = sync.Mutex{}
var globCache map[string]glob.Glob

Expand Down Expand Up @@ -64,6 +66,13 @@ func globCompileAndMatch(id, pattern, match string, delimiters []rune) (bool, er
if p, err = glob.Compile(pattern, delimiters...); err != nil {
return false, err
}
if len(globCache) >= globCacheMaxSize {
// Delete a (semi-)random key to make room for the new one.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why semi? I think it's pretty random 😅

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not explicitly random, as the randomness is an implementation detail of the map.
I also think that it'd be deterministic over multiple runs with the same map entries.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So we're checking len(globCache) >= globCacheMaxSize but we only remove one. This is the only code path where cached patterns are inserted, right? So it cannot happen that through some other code path, the cache would still continue to grow without bounds because we're only removing one element here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct, discounting tests, this is the only place we touch the cache.

for k := range globCache {
delete(globCache, k)
break
}
}
globCache[id] = p
}
out := p.Match(match)
Expand Down
100 changes: 100 additions & 0 deletions topdown/glob_bench_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// Copyright 2024 The OPA Authors. All rights reserved.
// Use of this source code is governed by an Apache2
// license that can be found in the LICENSE file.

package topdown

import (
"fmt"
"sync"
"testing"

"github.com/gobwas/glob"
"github.com/open-policy-agent/opa/ast"
)

func BenchmarkBuiltinGlobMatch(b *testing.B) {
iter := func(*ast.Term) error { return nil }
ctx := BuiltinContext{}

for _, reusePattern := range []bool{true, false} {
for _, patternCount := range []int{10, 100, 1000} {
b.Run(fmt.Sprintf("reuse-pattern=%v, pattern-count=%d", reusePattern, patternCount), func(b *testing.B) {
b.ResetTimer()
for n := 0; n < b.N; n++ {
// Clearing the cache
globCache = make(map[string]glob.Glob)

for i := 0; i < patternCount; i++ {
var operands []*ast.Term
if reusePattern {
operands = []*ast.Term{
ast.NewTerm(ast.String("foo/*")),
ast.NullTerm(),
ast.NewTerm(ast.String("foo/bar")),
}
} else {
operands = []*ast.Term{
ast.NewTerm(ast.String(fmt.Sprintf("foo/*/%d", i))),
ast.NullTerm(),
ast.NewTerm(ast.String(fmt.Sprintf("foo/bar/%d", i))),
}
}
if err := builtinGlobMatch(ctx, operands, iter); err != nil {
b.Fatal(err)
}
}
}
})
}
}
}

func BenchmarkBuiltinGlobMatchAsync(b *testing.B) {
iter := func(*ast.Term) error { return nil }
ctx := BuiltinContext{}

for _, reusePattern := range []bool{true, false} {
for _, clientCount := range []int{100, 200} {
for _, patternCount := range []int{10, 100, 1000} {
b.Run(fmt.Sprintf("reuse-pattern=%v, clients=%d, pattern-count=%d", reusePattern, clientCount, patternCount), func(b *testing.B) {
b.ResetTimer()
for n := 0; n < b.N; n++ {
// Clearing the cache
globCache = make(map[string]glob.Glob)

wg := sync.WaitGroup{}
for i := 0; i < clientCount; i++ {
clientID := i
wg.Add(1)
go func() {
for j := 0; j < patternCount; j++ {
var operands []*ast.Term
if reusePattern {
operands = []*ast.Term{
ast.NewTerm(ast.String("foo/*")),
ast.NullTerm(),
ast.NewTerm(ast.String("foo/bar")),
}
} else {
operands = []*ast.Term{
ast.NewTerm(ast.String(fmt.Sprintf("foo/*/%d/%d", clientID, j))),
ast.NullTerm(),
ast.NewTerm(ast.String(fmt.Sprintf("foo/bar/%d/%d", clientID, j))),
}
}
if err := builtinGlobMatch(ctx, operands, iter); err != nil {
b.Error(err)
return
}
}
wg.Done()
}()
}
wg.Wait()
}
})
}
}
}
}
71 changes: 71 additions & 0 deletions topdown/glob_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// Copyright 2024 The OPA Authors. All rights reserved.
// Use of this source code is governed by an Apache2
// license that can be found in the LICENSE file.

package topdown

import (
"fmt"
"testing"

"github.com/open-policy-agent/opa/ast"
)

func TestGlobBuiltinCache(t *testing.T) {
ctx := BuiltinContext{}
iter := func(*ast.Term) error { return nil }

// A novel glob pattern is cached.
glob1 := "foo/*"
operands := []*ast.Term{
ast.NewTerm(ast.String(glob1)),
ast.NullTerm(),
ast.NewTerm(ast.String("foo/bar")),
}
err := builtinGlobMatch(ctx, operands, iter)
if err != nil {
t.Fatalf("Unexpected error: %v", err)
}

// the glob id will have a trailing '-' rune.
if _, ok := globCache[fmt.Sprintf("%s-", glob1)]; !ok {
t.Fatalf("Expected glob to be cached: %v", glob1)
}

// Fill up the cache.
for i := 0; i < regexCacheMaxSize-1; i++ {
operands := []*ast.Term{
ast.NewTerm(ast.String(fmt.Sprintf("foo/%d/*", i))),
ast.NullTerm(),
ast.NewTerm(ast.String(fmt.Sprintf("foo/%d/bar", i))),
}
err := builtinGlobMatch(ctx, operands, iter)
if err != nil {
t.Fatalf("Unexpected error: %v", err)
}
}

if len(globCache) != regexCacheMaxSize {
t.Fatalf("Expected cache to be full")
}

// A new glob pattern is cached and a random pattern is evicted.
glob2 := "bar/*"
operands = []*ast.Term{
ast.NewTerm(ast.String(glob2)),
ast.NullTerm(),
ast.NewTerm(ast.String("bar/baz")),
}
err = builtinGlobMatch(ctx, operands, iter)
if err != nil {
t.Fatalf("Unexpected error: %v", err)
}

if len(globCache) != regexCacheMaxSize {
t.Fatalf("Expected cache be capped at %d, was %d", regexCacheMaxSize, len(globCache))
}

if _, ok := globCache[fmt.Sprintf("%s-", glob2)]; !ok {
t.Fatalf("Expected glob to be cached: %v", glob2)
}
}
9 changes: 9 additions & 0 deletions topdown/regex.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ import (
"github.com/open-policy-agent/opa/topdown/builtins"
)

const regexCacheMaxSize = 100

var regexpCacheLock = sync.Mutex{}
var regexpCache map[string]*regexp.Regexp

Expand Down Expand Up @@ -111,6 +113,13 @@ func getRegexp(pat string) (*regexp.Regexp, error) {
if err != nil {
return nil, err
}
if len(regexpCache) >= regexCacheMaxSize {
// Delete a (semi-)random key to make room for the new one.
for k := range regexpCache {
delete(regexpCache, k)
break
}
}
regexpCache[pat] = re
}
return re, nil
Expand Down
96 changes: 96 additions & 0 deletions topdown/regex_bench_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// Copyright 2024 The OPA Authors. All rights reserved.
// Use of this source code is governed by an Apache2
// license that can be found in the LICENSE file.

package topdown

import (
"fmt"
"regexp"
"sync"
"testing"

"github.com/open-policy-agent/opa/ast"
)

func BenchmarkBuiltinRegexMatch(b *testing.B) {
iter := func(*ast.Term) error { return nil }
ctx := BuiltinContext{}

for _, reusePattern := range []bool{true, false} {
for _, patternCount := range []int{10, 100, 1000} {
b.Run(fmt.Sprintf("reuse-pattern=%v, pattern-count=%d", reusePattern, patternCount), func(b *testing.B) {
b.ResetTimer()
for n := 0; n < b.N; n++ {
// Clearing the cache
regexpCache = make(map[string]*regexp.Regexp)

for i := 0; i < patternCount; i++ {
var operands []*ast.Term
if reusePattern {
operands = []*ast.Term{
ast.NewTerm(ast.String("foo.*")),
ast.NewTerm(ast.String("foobar")),
}
} else {
operands = []*ast.Term{
ast.NewTerm(ast.String(fmt.Sprintf("foo%d.*", i))),
ast.NewTerm(ast.String(fmt.Sprintf("foo%dbar", i))),
}
}
if err := builtinRegexMatch(ctx, operands, iter); err != nil {
b.Fatal(err)
}
}
}
})
}
}
}

func BenchmarkBuiltinRegexMatchAsync(b *testing.B) {
iter := func(*ast.Term) error { return nil }
ctx := BuiltinContext{}

for _, reusePattern := range []bool{true, false} {
for _, clientCount := range []int{100, 200} {
for _, patternCount := range []int{10, 100, 1000} {
b.Run(fmt.Sprintf("reuse-pattern=%v, clients=%d, pattern-count=%d", reusePattern, clientCount, patternCount), func(b *testing.B) {
b.ResetTimer()
for n := 0; n < b.N; n++ {
// Clearing the cache
regexpCache = make(map[string]*regexp.Regexp)

wg := sync.WaitGroup{}
for i := 0; i < clientCount; i++ {
clientID := i
wg.Add(1)
go func() {
for j := 0; j < patternCount; j++ {
var operands []*ast.Term
if reusePattern {
operands = []*ast.Term{
ast.NewTerm(ast.String("foo.*")),
ast.NewTerm(ast.String("foobar")),
}
} else {
operands = []*ast.Term{
ast.NewTerm(ast.String(fmt.Sprintf("foo%d_%d.*", clientID, j))),
ast.NewTerm(ast.String(fmt.Sprintf("foo%d_%dbar", clientID, j))),
}
}
if err := builtinRegexMatch(ctx, operands, iter); err != nil {
b.Error(err)
return
}
}
wg.Done()
}()
}
wg.Wait()
}
})
}
}
}
}
67 changes: 67 additions & 0 deletions topdown/regex_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Copyright 2024 The OPA Authors. All rights reserved.
// Use of this source code is governed by an Apache2
// license that can be found in the LICENSE file.

package topdown

import (
"fmt"
"testing"

"github.com/open-policy-agent/opa/ast"
)

func TestRegexBuiltinCache(t *testing.T) {
ctx := BuiltinContext{}
iter := func(*ast.Term) error { return nil }

// A novel regex pattern is cached.
regex1 := "foo.*"
operands := []*ast.Term{
ast.NewTerm(ast.String(regex1)),
ast.NewTerm(ast.String("foobar")),
}
err := builtinRegexMatch(ctx, operands, iter)
if err != nil {
t.Fatalf("Unexpected error: %v", err)
}

if _, ok := regexpCache[regex1]; !ok {
t.Fatalf("Expected regex to be cached: %v", regex1)
}

// Fill up the cache.
for i := 0; i < regexCacheMaxSize-1; i++ {
operands := []*ast.Term{
ast.NewTerm(ast.String(fmt.Sprintf("foo%d.*", i))),
ast.NewTerm(ast.String(fmt.Sprintf("foo%dbar", i))),
}
err := builtinRegexMatch(ctx, operands, iter)
if err != nil {
t.Fatalf("Unexpected error: %v", err)
}
}

if len(regexpCache) != regexCacheMaxSize {
t.Fatalf("Expected cache to be full")
}

// A new regex pattern is cached and a random pattern is evicted.
regex2 := "bar.*"
operands = []*ast.Term{
ast.NewTerm(ast.String(regex2)),
ast.NewTerm(ast.String("barbaz")),
}
err = builtinRegexMatch(ctx, operands, iter)
if err != nil {
t.Fatalf("Unexpected error: %v", err)
}

if len(regexpCache) != regexCacheMaxSize {
t.Fatalf("Expected cache be capped at %d, was %d", regexCacheMaxSize, len(regexpCache))
}

if _, ok := regexpCache[regex2]; !ok {
t.Fatalf("Expected regex to be cached: %v", regex2)
}
}
Loading