From b6e96f65d0593a1db0bb069c6bb2686e54008e2e Mon Sep 17 00:00:00 2001 From: Emil Guliyev Date: Thu, 23 Aug 2018 05:42:05 -0700 Subject: [PATCH] Hash based lookup implementation (GH-143) Added benchmarks, moved to a hash based implementation and improved performance to match that of golang.org/x/net library. See GH-133 --- cmd/gen/gen.go | 8 +- publicsuffix/publicsuffix.go | 149 +++++++++++++----------------- publicsuffix/publicsuffix_test.go | 14 +-- publicsuffix/rules.go | 4 +- 4 files changed, 82 insertions(+), 93 deletions(-) diff --git a/cmd/gen/gen.go b/cmd/gen/gen.go index 06da611..a51558e 100644 --- a/cmd/gen/gen.go +++ b/cmd/gen/gen.go @@ -11,6 +11,8 @@ import ( "context" "fmt" "go/format" + "io/ioutil" + "log" "net/http" "strings" "text/template" @@ -18,8 +20,6 @@ import ( "github.com/google/go-github/github" "github.com/weppos/publicsuffix-go/publicsuffix" - "io/ioutil" - "log" ) const ( @@ -39,7 +39,9 @@ func init() { { {{$r.Type}}, "{{$r.Value}}", {{$r.Length}}, {{$r.Private}} }, {{end}} } - DefaultList.rules = r[:] + for i := range r { + DefaultList.AddRule(&r[i]) + } } ` diff --git a/publicsuffix/publicsuffix.go b/publicsuffix/publicsuffix.go index 1efa48b..a98c904 100644 --- a/publicsuffix/publicsuffix.go +++ b/publicsuffix/publicsuffix.go @@ -11,7 +11,6 @@ import ( "io" "net/http/cookiejar" "os" - "regexp" "strings" "golang.org/x/net/idna" @@ -80,13 +79,14 @@ type FindOptions struct { // List represents a Public Suffix List. type List struct { // rules is kept private because you should not access rules directly - // for lookup optimization the list will not be guaranteed to be a simple slice forever - rules []Rule + rules map[string]*Rule } // NewList creates a new empty list. 
func NewList() *List { - return &List{} + return &List{ + rules: map[string]*Rule{}, + } } // NewListFromString parses a string that represents a Public Suffix source @@ -132,7 +132,7 @@ func (l *List) LoadFile(path string, options *ParserOption) ([]Rule, error) { // The list may be optimized internally for lookups, therefore the algorithm // will decide the best position for the new rule. func (l *List) AddRule(r *Rule) error { - l.rules = append(l.rules, *r) + l.rules[r.Value] = r return nil } @@ -195,43 +195,23 @@ Scanning: // Find and returns the most appropriate rule for the domain name. func (l *List) Find(name string, options *FindOptions) *Rule { - var bestRule *Rule - if options == nil { options = DefaultFindOptions } - for _, r := range l.selectRules(name, options) { - if r.Type == ExceptionType { - return &r - } - if bestRule == nil || bestRule.Length < r.Length { - bestRule = &r - } - } - - if bestRule != nil { - return bestRule - } - - return options.DefaultRule -} - -func (l *List) selectRules(name string, options *FindOptions) []Rule { - var found []Rule - - // In this phase the search is a simple sequential scan - for _, rule := range l.rules { - if !rule.Match(name) { - continue + for { + rule, ok := l.rules[name] + if ok && (!options.IgnorePrivate || !rule.Private) { + return rule } - if options.IgnorePrivate && rule.Private { - continue + i := strings.IndexRune(name, '.') + if i < 0 { + return options.DefaultRule } - found = append(found, rule) + name = name[i+1:] } - return found + return nil } // NewRule parses the rule content, creates and returns a Rule. @@ -309,36 +289,46 @@ func (r *Rule) Match(name string) bool { // Decompose takes a name as input and decomposes it into a tuple of <TRD+SLD, TLD>, // according to the rule definition and type.
-func (r *Rule) Decompose(name string) [2]string { - var parts []string - +func (r *Rule) Decompose(name string) (result [2]string) { + if r == DefaultRule { + i := strings.LastIndex(name, ".") + if i < 0 { + return + } + result[0], result[1] = name[:i], name[i+1:] + return + } switch r.Type { + case NormalType: + name = strings.TrimSuffix(name, r.Value) + if len(name) == 0 { + return + } + result[0], result[1] = name[:len(name)-1], r.Value case WildcardType: - parts = append([]string{`.*?`}, r.parts()...) - default: - parts = r.parts() - } - - suffix := strings.Join(parts, `\.`) - re := regexp.MustCompile(fmt.Sprintf(`^(.+)\.(%s)$`, suffix)) - - matches := re.FindStringSubmatch(name) - if len(matches) < 3 { - return [2]string{"", ""} - } - - return [2]string{matches[1], matches[2]} -} - -func (r *Rule) parts() []string { - labels := Labels(r.Value) - if r.Type == ExceptionType { - return labels[1:] - } - if r.Type == WildcardType && r.Value == "" { - return []string{} + name := strings.TrimSuffix(name, r.Value) + if len(name) == 0 { + return + } + name = name[:len(name)-1] + i := strings.LastIndex(name, ".") + if i < 0 { + return + } + result[0], result[1] = name[:i], name[i+1:]+"."+r.Value + case ExceptionType: + i := strings.IndexRune(r.Value, '.') + if i < 0 { + return + } + suffix := r.Value[i+1:] + name = strings.TrimSuffix(name, suffix) + if len(name) == 0 { + return + } + result[0], result[1] = name[:len(name)-1], suffix } - return labels + return } // Labels decomposes given domain name into labels, @@ -432,7 +422,6 @@ func DomainFromListWithOptions(l *List, name string, options *FindOptions) (stri if err != nil { return "", err } - return dn.SLD + "." 
+ dn.TLD, nil } @@ -458,12 +447,22 @@ func ParseFromListWithOptions(l *List, name string, options *FindOptions) (*Doma } r := l.Find(n, options) - if tld := r.Decompose(n)[1]; tld == "" { + parts := r.Decompose(n) + left, tld := parts[0], parts[1] + if tld == "" { return nil, fmt.Errorf("%s is a suffix", n) } - dn := &DomainName{Rule: r} - dn.TLD, dn.SLD, dn.TRD = decompose(r, n) + dn := &DomainName{ + Rule: r, + TLD: tld, + } + if i := strings.LastIndex(left, "."); i < 0 { + dn.SLD = left + } else { + dn.TRD = left[:i] + dn.SLD = left[i+1:] + } return dn, nil } @@ -471,31 +470,15 @@ func normalize(name string) (string, error) { ret := strings.ToLower(name) if ret == "" { - return "", fmt.Errorf("Name is blank") + return "", fmt.Errorf("name is blank") } if ret[0] == '.' { - return "", fmt.Errorf("Name %s starts with a dot", ret) + return "", fmt.Errorf("name %s starts with a dot", ret) } return ret, nil } -func decompose(r *Rule, name string) (tld, sld, trd string) { - parts := r.Decompose(name) - left, tld := parts[0], parts[1] - - dot := strings.LastIndex(left, ".") - if dot == -1 { - sld = left - trd = "" - } else { - sld = left[dot+1:] - trd = left[0:dot] - } - - return -} - // ToASCII is a wrapper for idna.ToASCII. 
// // This wrapper exists because idna.ToASCII backward-compatibility was broken twice in few months diff --git a/publicsuffix/publicsuffix_test.go b/publicsuffix/publicsuffix_test.go index 5dd3d9b..1683cae 100644 --- a/publicsuffix/publicsuffix_test.go +++ b/publicsuffix/publicsuffix_test.go @@ -44,7 +44,7 @@ blogspot.com testRules = []Rule{} for _, rule := range rules { if rule.Private == false { - testRules = append(testRules, rule) + testRules = append(testRules, *rule) } } if want, got := 2, len(testRules); want != got { @@ -55,7 +55,7 @@ blogspot.com testRules = []Rule{} for _, rule := range rules { if rule.Private == true { - testRules = append(testRules, rule) + testRules = append(testRules, *rule) } } if want, got := 1, len(testRules); want != got { @@ -143,7 +143,7 @@ func TestNewListFromFile(t *testing.T) { testRules = []Rule{} for _, rule := range rules { if rule.Private == false { - testRules = append(testRules, rule) + testRules = append(testRules, *rule) } } if want, got := 2, len(testRules); want != got { @@ -154,7 +154,7 @@ func TestNewListFromFile(t *testing.T) { testRules = []Rule{} for _, rule := range rules { if rule.Private == true { - testRules = append(testRules, rule) + testRules = append(testRules, *rule) } } if want, got := 1, len(testRules); want != got { @@ -175,8 +175,10 @@ func TestListAddRule(t *testing.T) { if list.Size() != 1 { t.Fatalf("List should have 1 rule, got %v", list.Size()) } - if got := &list.rules[0]; !reflect.DeepEqual(rule, got) { - t.Fatalf("List[0] expected to be %v, got %v", rule, got) + for _, got := range list.rules { + if !reflect.DeepEqual(rule, got) { + t.Fatalf("List[0] expected to be %v, got %v", rule, got) + } } } diff --git a/publicsuffix/rules.go b/publicsuffix/rules.go index 70fb8e9..d7002a8 100644 --- a/publicsuffix/rules.go +++ b/publicsuffix/rules.go @@ -8627,5 +8627,7 @@ func init() { {1, "now.sh", 2, true}, {1, "zone.id", 2, true}, } - DefaultList.rules = r[:] + for i := range r { + 
DefaultList.AddRule(&r[i]) + } }