Skip to content

Commit

Permalink
Merge pull request #19 from YakDriver/f-standardize-regexps
Browse files Browse the repository at this point in the history
Standardize reg exprs before caching
  • Loading branch information
YakDriver authored Sep 6, 2023
2 parents 1355c8c + 7c16766 commit 6d1b027
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 7 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ func main() {
| REGEXACHE_OFF | Any value will turn `regexache` completely off. Useful for testing with and without caching. When off, `regexache.MustCompile()` is equivalent to `regexp.MustCompile()`. By default, `regexache` caches entries. |
| REGEXACHE_OUTPUT | File to output the cache contents to. Default: Empty (Don't output cache). |
| REGEXACHE_OUTPUT_MIN | Minimum number of lookups entries need to include when listing cache entries. Default: 1. |
| REGEXACHE_OUTPUT_INTERVAL | If outputing the cache, output every X milliseconds. Default: 1000 (1 second). |
| REGEXACHE_OUTPUT_INTERVAL | If outputting the cache, output every X milliseconds. Default: 1000 (1 second). |
| REGEXACHE_STANDARDIZE_OFF | Do not standardize expressions before caching. Default: Empty (Standardize). |

## Tests

Expand Down
19 changes: 13 additions & 6 deletions regexache.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,15 @@ import (
)

const (
maintenanceIntervalDefault = time.Duration(0)
expirationDefault = time.Millisecond * 10000
minimumUsesDefault = int64(2)
cleanTimeDefault = time.Millisecond * 1000
outputMinDefault = 1
outputIntervalDefault = time.Millisecond * 1000
outputMinDefault = 1
outputIntervalDefault = time.Millisecond * 1000

REGEXACHE_OFF = "REGEXACHE_OFF"
REGEXACHE_OUTPUT = "REGEXACHE_OUTPUT"
REGEXACHE_OUTPUT_INTERVAL = "REGEXACHE_OUTPUT_INTERVAL"
REGEXACHE_OUTPUT_MIN = "REGEXACHE_OUTPUT_MIN"
REGEXACHE_PRELOAD_OFF = "REGEXACHE_PRELOAD_OFF"
REGEXACHE_STANDARDIZE_OFF = "REGEXACHE_STANDARDIZE_OFF"
)

//go:embed preload.txt
Expand All @@ -38,6 +35,7 @@ var (
outputMin int64
outputFile string
outputInterval time.Duration
standardizing bool
)

func init() {
Expand Down Expand Up @@ -89,6 +87,11 @@ func init() {
cache.Store(r, regexp.MustCompile(r))
}
}

standardizing = true
if v := os.Getenv(REGEXACHE_STANDARDIZE_OFF); v != "" {
standardizing = false
}
}

var cache sync.Map
Expand All @@ -99,6 +102,10 @@ func MustCompile(str string) *regexp.Regexp {
return regexp.MustCompile(str)
}

if standardizing {
str = standardize(str)
}

if outputFile != "" {
mutex.Lock()
if _, ok := lookups[str]; !ok {
Expand Down
28 changes: 28 additions & 0 deletions standardize.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package regexache

import "regexp"

// classOrEscape tokenizes an expression just far enough to locate character
// classes. At each position the alternatives are tried in this order:
//
//  1. a \Q...\E quoted run, whose content is literal text,
//  2. a single escaped character such as \[ or \S,
//  3. a complete bracket expression, including an optional leading ^, an
//     optional leading literal ], escapes, and POSIX classes like [:alpha:].
//
// Matching escapes before brackets guarantees that an escaped bracket is
// never mistaken for the start of a character class.
var classOrEscape = regexp.MustCompile(`(?s)\\Q.*?(?:\\E|\z)|\\.|\[\^?\]?(?:\\.|\[:\^?\w+:\]|[^\]\\])*\]`)

// standardize rewrites the character classes of expr into a canonical member
// order so that expressions differing only in that order share a cache key.
//
// Within each simple class the members are ordered as: 0-9, ranges starting
// at A, ranges starting at a, underscore, then everything else in its
// original order. A class containing 0-9, A-Z, a-z and _ has those four
// members folded into \w. Text outside character classes is returned as is.
//
// The rewrite never changes what the expression matches: negation is kept,
// ranges are moved as a unit, and a literal hyphen is never left where it
// would turn its neighbours into a range.
func standardize(expr string) string {
	return classOrEscape.ReplaceAllStringFunc(expr, func(m string) string {
		if m[0] != '[' {
			// Escape sequence or quoted literal: not a character class.
			return m
		}
		return standardizeClass(m)
	})
}

// standardizeClass canonicalizes one bracket expression, given with its
// surrounding brackets. Classes it cannot safely reorder (escapes, nested
// brackets, non-ASCII members, ranges with a hyphen endpoint, empty bodies)
// are returned unchanged.
func standardizeClass(class string) string {
	body := class[1 : len(class)-1]
	negated := len(body) > 0 && body[0] == '^'
	if negated {
		body = body[1:]
	}
	if body == "" {
		return class
	}
	for i := 0; i < len(body); i++ {
		if c := body[i]; c == '\\' || c == '[' || c == ']' || c >= 0x80 {
			return class
		}
	}

	// Split the body into members the same way the regexp parser does:
	// "x-y" is a range unless the hyphen is the final character of the class.
	var ranked [5][]string
	for i := 0; i < len(body); {
		n := 1
		if i+2 < len(body) && body[i+1] == '-' {
			if body[i] == '-' || body[i+2] == '-' {
				// A range bounded by a hyphen cannot be moved next to other
				// members without risking a different parse.
				return class
			}
			n = 3
		}
		item := body[i : i+n]
		r := classRank(item)
		ranked[r] = append(ranked[r], item)
		i += n
	}

	upper, hasUpper := dropItem(ranked[1], "A-Z")
	lower, hasLower := dropItem(ranked[2], "a-z")

	var items []string
	if len(ranked[0]) > 0 && hasUpper && hasLower && len(ranked[3]) > 0 {
		// 0-9, A-Z, a-z and _ together are exactly \w.
		items = append(items, `\w`)
		items = append(items, upper...)
		items = append(items, lower...)
	} else {
		for _, group := range ranked[:4] {
			items = append(items, group...)
		}
	}
	items = append(items, ranked[4]...)

	out := make([]byte, 0, len(class))
	out = append(out, '[')
	if negated {
		out = append(out, '^')
	}
	prevSingle := false    // last member written was a lone character
	hyphenPending := false // a literal hyphen still has to be written
	for i, item := range items {
		if item == "-" && prevSingle && i < len(items)-1 {
			// "x-y" would be parsed as a range; write the hyphen last instead,
			// where it is always literal.
			hyphenPending = true
			continue
		}
		out = append(out, item...)
		prevSingle = len(item) == 1
	}
	if hyphenPending && out[len(out)-1] != '-' {
		out = append(out, '-')
	}
	out = append(out, ']')
	return string(out)
}

// classRank returns the position of a class member in the canonical order:
// 0 for 0-9, 1 for a range starting at A, 2 for a range starting at a,
// 3 for underscore and 4 for everything else.
func classRank(item string) int {
	switch {
	case item == "0-9":
		return 0
	case len(item) == 3 && item[0] == 'A' && item[2] >= 'B' && item[2] <= 'Z':
		return 1
	case len(item) == 3 && item[0] == 'a' && item[2] >= 'b' && item[2] <= 'z':
		return 2
	case item == "_":
		return 3
	}
	return 4
}

// dropItem returns items without any element equal to drop, and reports
// whether at least one such element was present.
func dropItem(items []string, drop string) ([]string, bool) {
	kept := make([]string, 0, len(items))
	for _, it := range items {
		if it != drop {
			kept = append(kept, it)
		}
	}
	return kept, len(kept) != len(items)
}
92 changes: 92 additions & 0 deletions standardize_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package regexache

import "testing"

// TestStandardize checks that standardize produces the canonical form of a
// range of expressions and, for the regression cases, that it leaves
// expressions alone when reordering would change what they match.
func TestStandardize(t *testing.T) {
	t.Parallel()

	testCases := []struct {
		TestName string
		Input    string
		Expected string
	}{
		{
			TestName: "empty",
			Input:    "",
			Expected: "",
		},
		{
			TestName: "basic",
			Input:    `.*\S.*`,
			Expected: `.*\S.*`,
		},
		{
			TestName: "numOrder",
			Input:    `^[a-z0-9-_]+$`,
			Expected: `^[0-9a-z_-]+$`,
		},
		{
			TestName: "multiClass",
			Input:    `^[a-z0-9-_]+[a-z0-9-_]+$`,
			Expected: `^[0-9a-z_-]+[0-9a-z_-]+$`,
		},
		{
			TestName: "everywhere",
			Input:    `^[A-Za-z0-9-*&_]+$`,
			Expected: `^[\w-*&]+$`,
		},
		{
			// The hyphen must not end up between "_" and "*": "_-*" is a
			// descending range, which the regexp package rejects.
			TestName: "hex",
			Input:    `^[A-Fa-f0-9-*&_]+$`,
			Expected: `^[0-9A-Fa-f_*&-]+$`,
		},
		{
			TestName: "hex2",
			Input:    `^#[A-F0-9]{6}$`,
			Expected: `^#[0-9A-F]{6}$`,
		},
		{
			TestName: "parenthesis",
			Input:    `(/)|(/(([^~])|(~[01]))+)`,
			Expected: `(/)|(/(([^~])|(~[01]))+)`,
		},
		{
			TestName: "ordering",
			Input:    `^[a-zA-Z0-9]+$`,
			Expected: `^[0-9A-Za-z]+$`,
		},
		{
			TestName: "ordering2",
			Input:    `^[-a-zA-Z0-9._]*$`,
			Expected: `^[\w-.]*$`,
		},
		{
			TestName: "ordering3",
			Input:    `^[a-z0-9-]+$`,
			Expected: `^[0-9a-z-]+$`,
		},
		{
			TestName: "ordering4",
			Input:    `^[a-zA-Z0-9._-]*$`,
			Expected: `^[\w.-]*$`,
		},
		{
			TestName: "ordering5",
			Input:    `^[0-9a-zA-Z._-]+`,
			Expected: `^[\w.-]+`,
		},
		{
			// The negation marker has to remain the first character.
			TestName: "negated",
			Input:    `^[^a-z0-9_]+$`,
			Expected: `^[^0-9a-z_]+$`,
		},
		{
			// Escaped brackets are literals, not a character class.
			TestName: "escapedBracket",
			Input:    `\[a_\]`,
			Expected: `\[a_\]`,
		},
		{
			// Text between \Q and \E is literal.
			TestName: "quoted",
			Input:    `\Q[b_]\E`,
			Expected: `\Q[b_]\E`,
		},
		{
			// "_" is the end of a range here and must not be pulled out of it.
			TestName: "rangeToUnderscore",
			Input:    `[!-_]`,
			Expected: `[!-_]`,
		},
		{
			TestName: "posixClass",
			Input:    `[[:alpha:]_]`,
			Expected: `[[:alpha:]_]`,
		},
	}

	for _, testCase := range testCases {
		testCase := testCase // capture per iteration for the parallel subtest
		t.Run(testCase.TestName, func(t *testing.T) {
			t.Parallel()

			got := standardize(testCase.Input)

			if got != testCase.Expected {
				t.Errorf("got %s, expected %s", got, testCase.Expected)
			}
		})
	}
}

0 comments on commit 6d1b027

Please sign in to comment.