From 7c167661b0ddc5299d029acae5237b1362762c92 Mon Sep 17 00:00:00 2001 From: Dirk Avery Date: Wed, 6 Sep 2023 19:38:23 -0400 Subject: [PATCH] Standardize reg exprs before caching --- README.md | 3 +- regexache.go | 19 +++++++--- standardize.go | 28 ++++++++++++++ standardize_test.go | 92 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 135 insertions(+), 7 deletions(-) create mode 100644 standardize.go create mode 100644 standardize_test.go diff --git a/README.md b/README.md index 4157b7b..580b0aa 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,8 @@ func main() { | REGEXACHE_OFF | Any value will turn `regexache` completely off. Useful for testing with and without caching. When off, `regexache.MustCompile()` is equivalent to `regexp.MustCompile()`. By default, `regexache` caches entries. | | REGEXACHE_OUTPUT | File to output the cache contents to. Default: Empty (Don't output cache). | | REGEXACHE_OUTPUT_MIN | Minimum number of lookups entries need to include when listing cache entries. Default: 1. | -| REGEXACHE_OUTPUT_INTERVAL | If outputing the cache, output every X milliseconds. Default: 1000 (1 second). | +| REGEXACHE_OUTPUT_INTERVAL | If outputing the cache, output every X milliseconds. Default: 1000 (1 second). | +| REGEXACHE_STANDARDIZE_OFF | Do not standardize expressions before caching. Default: Empty (Standardize). 
| ## Tests diff --git a/regexache.go b/regexache.go index d1f70eb..6c16ecd 100644 --- a/regexache.go +++ b/regexache.go @@ -14,18 +14,15 @@ import ( ) const ( - maintenanceIntervalDefault = time.Duration(0) - expirationDefault = time.Millisecond * 10000 - minimumUsesDefault = int64(2) - cleanTimeDefault = time.Millisecond * 1000 - outputMinDefault = 1 - outputIntervalDefault = time.Millisecond * 1000 + outputMinDefault = 1 + outputIntervalDefault = time.Millisecond * 1000 REGEXACHE_OFF = "REGEXACHE_OFF" REGEXACHE_OUTPUT = "REGEXACHE_OUTPUT" REGEXACHE_OUTPUT_INTERVAL = "REGEXACHE_OUTPUT_INTERVAL" REGEXACHE_OUTPUT_MIN = "REGEXACHE_OUTPUT_MIN" REGEXACHE_PRELOAD_OFF = "REGEXACHE_PRELOAD_OFF" + REGEXACHE_STANDARDIZE_OFF = "REGEXACHE_STANDARDIZE_OFF" ) //go:embed preload.txt @@ -38,6 +35,7 @@ var ( outputMin int64 outputFile string outputInterval time.Duration + standardizing bool ) func init() { @@ -89,6 +87,11 @@ func init() { cache.Store(r, regexp.MustCompile(r)) } } + + standardizing = true + if v := os.Getenv(REGEXACHE_STANDARDIZE_OFF); v != "" { + standardizing = false + } } var cache sync.Map @@ -99,6 +102,10 @@ func MustCompile(str string) *regexp.Regexp { return regexp.MustCompile(str) } + if standardizing { + str = standardize(str) + } + if outputFile != "" { mutex.Lock() if _, ok := lookups[str]; !ok { diff --git a/standardize.go b/standardize.go new file mode 100644 index 0000000..942efbc --- /dev/null +++ b/standardize.go @@ -0,0 +1,28 @@ +package regexache + +import "regexp" + +var ( + numFront *regexp.Regexp + capFront *regexp.Regexp + lowFront *regexp.Regexp + undFront *regexp.Regexp + word *regexp.Regexp +) + +func init() { + undFront = regexp.MustCompile(`(\[)([^\]]*)(_)([^\]]*)(\])`) + lowFront = regexp.MustCompile(`(\[)([^\]]*)(a-[b-z])([^\]]*)(\])`) + capFront = regexp.MustCompile(`(\[)([^\]]*)(A-[B-Z])([^\]]*)(\])`) + numFront = regexp.MustCompile(`(\[)([^\]]*)(0-9)([^\]]*)(\])`) + word = regexp.MustCompile(`(\[)([^\]]*)(0-9A-Za-z_)([^\]]*)(\])`) +} 
// standardize rewrites the simple bracket character classes in expr into a
// canonical spelling so that equivalent expressions share one cache key.
//
// Within a rewritable class the items are emitted in this order: the range
// 0-9, ranges starting at A, ranges starting at a, the underscore, every other
// item in its original order, and finally a literal hyphen. When 0-9, A-Z, a-z
// and _ are all present they collapse to \w.
//
// Everything that is not a plain, non-negated class is copied through
// unchanged: escapes (so `\[` never opens a class), \Q...\E quoted text,
// negated classes, and classes containing a backslash or another '['. This
// keeps the result equivalent to the input; in particular a literal hyphen is
// always placed last so it can never form an accidental range such as `_-*`.
//
// The scanner is self-contained and does not use the package-level patterns
// (undFront, lowFront, capFront, numFront, word).
func standardize(expr string) string {
	out := make([]byte, 0, len(expr))
	for i := 0; i < len(expr); {
		switch expr[i] {
		case '\\':
			end := escapeEnd(expr, i)
			out = append(out, expr[i:end]...)
			i = end
		case '[':
			end := classEnd(expr, i)
			if end < 0 {
				// No closing bracket: nothing safe to rewrite, keep the tail as is.
				return string(append(out, expr[i:]...))
			}
			out = append(out, standardizeClass(expr[i:end+1])...)
			i = end + 1
		default:
			out = append(out, expr[i])
			i++
		}
	}
	return string(out)
}

// escapeEnd returns the index just past the escape that starts with the
// backslash at expr[i]. `\Q` extends through the matching `\E`, or to the end
// of expr when there is none; any other escape spans the backslash and the
// byte after it.
func escapeEnd(expr string, i int) int {
	if i+1 >= len(expr) {
		return len(expr)
	}
	if expr[i+1] != 'Q' {
		return i + 2
	}
	for j := i + 2; j+1 < len(expr); j++ {
		if expr[j] == '\\' && expr[j+1] == 'E' {
			return j + 2
		}
	}
	return len(expr)
}

// classEnd returns the index of the ']' that closes the class opened by the
// '[' at expr[open], or -1 if the class is not terminated. A ']' directly
// after the opening bracket (or after a leading '^') is treated as a literal,
// escapes are skipped, and a nested `[:name:]` is skipped as a unit.
func classEnd(expr string, open int) int {
	j := open + 1
	if j < len(expr) && expr[j] == '^' {
		j++
	}
	if j < len(expr) && expr[j] == ']' {
		j++
	}
	for j < len(expr) {
		switch expr[j] {
		case ']':
			return j
		case '\\':
			j += 2
		case '[':
			next := j + 1
			if next < len(expr) && expr[next] == ':' {
				for k := j + 2; k+1 < len(expr); k++ {
					if expr[k] == ':' && expr[k+1] == ']' {
						next = k + 2
						break
					}
				}
			}
			j = next
		default:
			j++
		}
	}
	return -1
}

// standardizeClass returns the canonical spelling of class, which must include
// its surrounding brackets. The class is returned unchanged whenever reordering
// could alter its meaning.
func standardizeClass(class string) string {
	body := class[1 : len(class)-1]
	if body == "" || body[0] == '^' || body[0] == ']' {
		return class
	}
	runes := []rune(body)
	if string(runes) != body {
		// Invalid UTF-8 would not survive the round trip; leave it alone.
		return class
	}
	for _, r := range runes {
		if r == '\\' || r == '[' {
			return class
		}
	}

	var digits, upper, lower, rest []string
	var fullUpper, fullLower, underscore, hyphen bool
	for k := 0; k < len(runes); {
		lo := runes[k]
		// "x-y" is a range only when something follows the hyphen.
		if k+2 < len(runes) && runes[k+1] == '-' {
			hi := runes[k+2]
			if lo == '-' || hi == '-' {
				// Ranges with a hyphen endpoint are too easy to corrupt by moving.
				return class
			}
			text := string(runes[k : k+3])
			switch {
			case lo == '0' && hi == '9':
				digits = append(digits, text)
			case lo == 'A' && hi >= 'B' && hi <= 'Z':
				upper = append(upper, text)
				fullUpper = fullUpper || hi == 'Z'
			case lo == 'a' && hi >= 'b' && hi <= 'z':
				lower = append(lower, text)
				fullLower = fullLower || hi == 'z'
			default:
				rest = append(rest, text)
			}
			k += 3
			continue
		}
		switch lo {
		case '-':
			hyphen = true
		case '_':
			underscore = true
		default:
			rest = append(rest, string(lo))
		}
		k++
	}

	items := make([]string, 0, len(runes))
	if len(digits) > 0 && fullUpper && fullLower && underscore {
		// Every collected A-? and a-? range is a subset of \w.
		items = append(items, `\w`)
	} else {
		items = append(items, digits...)
		items = append(items, upper...)
		items = append(items, lower...)
		if underscore {
			items = append(items, "_")
		}
	}
	items = append(items, rest...)
	if hyphen {
		// Last position is the one place a bare hyphen is always literal.
		items = append(items, "-")
	}
	if len(items) > 0 && items[0][0] == '^' {
		// Moving a leading hyphen away would turn '^' into a negation.
		return class
	}

	out := make([]byte, 0, len(class)+1)
	out = append(out, '[')
	for _, item := range items {
		out = append(out, item...)
	}
	out = append(out, ']')
	return string(out)
}

// TestStandardize checks the canonical form produced by standardize. It
// belongs in standardize_test.go (package regexache).
func TestStandardize(t *testing.T) {
	t.Parallel()

	testCases := []struct {
		TestName string
		Input    string
		Expected string
	}{
		{
			TestName: "empty",
			Input:    "",
			Expected: "",
		},
		{
			TestName: "basic",
			Input:    `.*\S.*`,
			Expected: `.*\S.*`,
		},
		{
			TestName: "numOrder",
			Input:    `^[a-z0-9-_]+$`,
			Expected: `^[0-9a-z_-]+$`,
		},
		{
			TestName: "multiClass",
			Input:    `^[a-z0-9-_]+[a-z0-9-_]+$`,
			Expected: `^[0-9a-z_-]+[0-9a-z_-]+$`,
		},
		{
			TestName: "everywhere",
			Input:    `^[A-Za-z0-9-*&_]+$`,
			Expected: `^[\w*&-]+$`,
		},
		{
			// The hyphen must end up last; `_-*` would be an invalid range.
			TestName: "hex",
			Input:    `^[A-Fa-f0-9-*&_]+$`,
			Expected: `^[0-9A-Fa-f_*&-]+$`,
		},
		{
			TestName: "hex2",
			Input:    `^#[A-F0-9]{6}$`,
			Expected: `^#[0-9A-F]{6}$`,
		},
		{
			TestName: "parenthesis",
			Input:    `(/)|(/(([^~])|(~[01]))+)`,
			Expected: `(/)|(/(([^~])|(~[01]))+)`,
		},
		{
			TestName: "ordering",
			Input:    `^[a-zA-Z0-9]+$`,
			Expected: `^[0-9A-Za-z]+$`,
		},
		{
			TestName: "ordering2",
			Input:    `^[-a-zA-Z0-9._]*$`,
			Expected: `^[\w.-]*$`,
		},
		{
			TestName: "ordering3",
			Input:    `^[a-z0-9-]+$`,
			Expected: `^[0-9a-z-]+$`,
		},
		{
			TestName: "ordering4",
			Input:    `^[a-zA-Z0-9._-]*$`,
			Expected: `^[\w.-]*$`,
		},
		{
			TestName: "ordering5",
			Input:    `^[0-9a-zA-Z._-]+`,
			Expected: `^[\w.-]+`,
		},
		{
			TestName: "negated",
			Input:    `^[^a-z0-9_]+$`,
			Expected: `^[^a-z0-9_]+$`,
		},
		{
			TestName: "escapedBracket",
			Input:    `\[a-z0-9]`,
			Expected: `\[a-z0-9]`,
		},
		{
			TestName: "quoted",
			Input:    `\Q[a-z0-9]\E[a-z0-9]`,
			Expected: `\Q[a-z0-9]\E[0-9a-z]`,
		},
		{
			TestName: "posixClass",
			Input:    `[[:alpha:]0-9]`,
			Expected: `[[:alpha:]0-9]`,
		},
		{
			TestName: "hyphenCaret",
			Input:    `[-^]`,
			Expected: `[-^]`,
		},
	}

	for _, testCase := range testCases {
		testCase := testCase
		t.Run(testCase.TestName, func(t *testing.T) {
			t.Parallel()

			got := standardize(testCase.Input)

			if got != testCase.Expected {
				t.Errorf("got %s, expected %s", got, testCase.Expected)
			}
		})
	}
}