Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hash/Tree hybrid performance increase #143

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 8 additions & 16 deletions lib/public_suffix.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
require_relative "public_suffix/domain"
require_relative "public_suffix/version"
require_relative "public_suffix/errors"
require_relative "public_suffix/rule"
require_relative "public_suffix/rules"
require_relative "public_suffix/list"

# PublicSuffix is a Ruby domain name parser based on the Public Suffix List.
Expand Down Expand Up @@ -64,19 +64,16 @@ module PublicSuffix
# If domain is not a valid domain.
# @raise [PublicSuffix::DomainNotAllowed]
# If a rule for +domain+ is found, but the rule doesn't allow +domain+.
def self.parse(name, list: List.default, default_rule: list.default_rule, ignore_private: false)
def self.parse(name, list: List.default, ignore_private: false)
what = normalize(name)
raise what if what.is_a?(DomainInvalid)

rule = list.find(what, default: default_rule, ignore_private: ignore_private)
rule = list.find(what, ignore_private: ignore_private)

# rubocop:disable Style/IfUnlessModifier
if rule.nil?
raise DomainInvalid, "`#{what}` is not a valid domain"
end
if rule.decompose(what).last.nil?
raise DomainNotAllowed, "`#{what}` is not allowed according to Registry policy"
end
# rubocop:enable Style/IfUnlessModifier

decompose(what, rule)
Expand Down Expand Up @@ -119,13 +116,8 @@ def self.parse(name, list: List.default, default_rule: list.default_rule, ignore
# @param [String, #to_s] name The domain name or fully qualified domain name to validate.
# @param [Boolean] ignore_private
# @return [Boolean]
def self.valid?(name, list: List.default, default_rule: list.default_rule, ignore_private: false)
what = normalize(name)
return false if what.is_a?(DomainInvalid)

rule = list.find(what, default: default_rule, ignore_private: ignore_private)

!rule.nil? && !rule.decompose(what).last.nil?
def self.valid?(name, list: List.default, ignore_private: false)
!normalize(name).is_a?(DomainInvalid)
end

# Attempt to parse the name and returns the domain, if valid.
Expand All @@ -146,13 +138,13 @@ def self.domain(name, **options)
# private

def self.decompose(name, rule)
left, right = rule.decompose(name)
rule_len = rule.split(DOT).length
parts = name.split(DOT)

parts = left.split(DOT)
# If we have 0 parts left, there is just a tld and no domain or subdomain
# If we have 1 part left, there is just a tld and a domain, but no subdomain
# If we have 2 parts left, the last part is the domain, the other parts (combined) are the subdomain
tld = right
tld = rule.empty? ? nil : parts.pop(rule_len).join(DOT)
sld = parts.empty? ? nil : parts.pop
trd = parts.empty? ? nil : parts.join(DOT)

Expand Down
8 changes: 4 additions & 4 deletions lib/public_suffix/domain.rb
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def name
#
# @return [String]
def domain
[@sld, @tld].join(DOT) if domain?
[@sld, @tld].compact.join(DOT) if domain?
end

# Returns a subdomain-like representation of this object
Expand Down Expand Up @@ -165,7 +165,7 @@ def domain
#
# @return [String]
def subdomain
[@trd, @sld, @tld].join(DOT) if subdomain?
[@trd, @sld, @tld].compact.join(DOT) if subdomain?
end

# Checks whether <tt>self</tt> looks like a domain.
Expand Down Expand Up @@ -196,7 +196,7 @@ def subdomain
#
# @return [Boolean]
def domain?
!(@tld.nil? || @sld.nil?)
!@sld.nil?
end

# Checks whether <tt>self</tt> looks like a subdomain.
Expand Down Expand Up @@ -227,7 +227,7 @@ def domain?
#
# @return [Boolean]
def subdomain?
!(@tld.nil? || @sld.nil? || @trd.nil?)
!(@sld.nil? || @trd.nil?)
end

end
Expand Down
112 changes: 19 additions & 93 deletions lib/public_suffix/list.rb
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def self.default=(value)
def self.parse(input, private_domains: true)
comment_token = "//".freeze
private_token = "===BEGIN PRIVATE DOMAINS===".freeze
space_re = /\p{Space}/
section = nil # 1 == ICANN, 2 == PRIVATE

new do |list|
Expand All @@ -90,7 +91,8 @@ def self.parse(input, private_domains: true)
next

else
list.add(Rule.factory(line, private: section == 2))
rule = line.split(space_re).first
list.add(rule, private: section == 2)

end
end
Expand All @@ -103,41 +105,23 @@ def self.parse(input, private_domains: true)
# @yield [self] Yields on self.
# @yieldparam [PublicSuffix::List] self The newly created instance.
def initialize
@rules = {}
@rules = Rules.new
add('*', private: false)
yield(self) if block_given?
end


# Checks whether two lists are equal.
#
# List <tt>one</tt> is equal to <tt>two</tt>, if <tt>two</tt> is an instance of
# {PublicSuffix::List} and each +PublicSuffix::Rule::*+
# in list <tt>one</tt> is available in list <tt>two</tt>, in the same order.
#
# @param other [PublicSuffix::List] the List to compare
# @return [Boolean]
def ==(other)
return false unless other.is_a?(List)
equal?(other) || @rules == other.rules
end
alias eql? ==

# Iterates each rule in the list.
def each(&block)
Enumerator.new do |y|
@rules.each do |key, node|
y << entry_to_rule(node, key)
end
end.each(&block)
end


# Adds the given object to the list and optionally refreshes the rule index.
#
# @param rule [PublicSuffix::Rule::*] the rule to add to the list
# @return [self]
def add(rule)
@rules[rule.value] = rule_to_entry(rule)
def add(rule, private: false)
exception = false
if rule[0] == BANG
exception = true
rule = rule[1..-1]
end
lbls = rule.split(DOT).reverse
@rules.add(lbls, exception, private)
self
end
alias << add
Expand All @@ -160,7 +144,7 @@ def empty?
#
# @return [self]
def clear
@rules.clear
@rules = Rules.new
self
end

Expand All @@ -169,77 +153,19 @@ def clear
# @param name [#to_s] the hostname
# @param default [PublicSuffix::Rule::*] the default rule to return in case no rule matches
# @return [PublicSuffix::Rule::*]
def find(name, default: default_rule, **options)
rule = select(name, **options).inject do |l, r|
return r if r.class == Rule::Exception
l.length > r.length ? l : r
end
rule || default
def find(name, ignore_private: false)
lbls = name.split(DOT).reverse
r = @rules.get_regdom(lbls, !ignore_private)
r.reverse[1..-1].join(DOT)
end

# Selects all the rules matching given hostame.
#
# If `ignore_private` is set to true, the algorithm will skip the rules that are flagged as
# private domain. Note that the rules will still be part of the loop.
# If you frequently need to access lists ignoring the private domains,
# you should create a list that doesn't include these domains setting the
# `private_domains: false` option when calling {.parse}.
#
# Note that this method is currently private, as you should not rely on it. Instead,
# the public interface is {#find}. The current internal algorithm allows to return all
# matching rules, but different data structures may not be able to do it, and instead would
# return only the match. For this reason, you should rely on {#find}.
#
# @param name [#to_s] the hostname
# @param ignore_private [Boolean]
# @return [Array<PublicSuffix::Rule::*>]
def select(name, ignore_private: false)
name = name.to_s

parts = name.split(DOT).reverse!
index = 0
query = parts[index]
rules = []

loop do
match = @rules[query]
if !match.nil? && (ignore_private == false || match.private == false)
rules << entry_to_rule(match, query)
end

index += 1
break if index >= parts.size
query = parts[index] + DOT + query
end

rules
end
private :select

# Gets the default rule.
#
# @see PublicSuffix::Rule.default_rule
#
# @return [PublicSuffix::Rule::*]
def default_rule
PublicSuffix::Rule.default
end


protected

attr_reader :rules


private

def entry_to_rule(entry, value)
entry.type.new(value: value, length: entry.length, private: entry.private)
end

def rule_to_entry(rule)
Rule::Entry.new(rule.class, rule.length, rule.private)
'*'
end

end
end
75 changes: 75 additions & 0 deletions lib/public_suffix/rules.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# = Public Suffix
#
# Domain name parser based on the Public Suffix List.
#
# Copyright (c) 2009-2017 Simone Carletti <[email protected]>

module PublicSuffix

# Rules is a trie (prefix tree) holding the rules of a Public Suffix List.
#
# Every node is itself a Rules instance. The path from the root to a node
# spells out a rule's labels from the rightmost label (the TLD) to the
# leftmost one, so the rule "co.uk" lives at root -> "uk" -> "co".
# A node that ends a rule is flagged as a terminus and remembers whether
# that rule is an exception rule and/or was added as private.
#
#   rules = PublicSuffix::Rules.new
#   rules.add(%w[uk co], false, false)
#   rules.get_regdom(%w[uk co example www])
#   # => ["uk", "co", "example"]
#
class Rules
  # Label that matches any single label during lookup.
  WILDCARD = '*'.freeze

  def initialize
    @children = {}
    @terminus = false
    @priv = false
    @exception = false
  end

  # Checks whether this node has neither children nor a rule ending on it.
  #
  # @return [Boolean]
  def empty?
    @children.empty? && !@terminus
  end

  # Counts the rules stored in this node and all of its descendants.
  #
  # @return [Integer]
  def size
    own = @terminus ? 1 : 0
    @children.each_value.inject(own) { |total, child| total + child.size }
  end

  # Stores a rule in the trie.
  #
  # The labels array is only read, never modified, so the caller can keep
  # using it after the call.
  #
  # @param x [Array<String>] the rule labels, rightmost label first
  # @param excpt [Boolean] whether the rule is an exception rule
  # @param priv [Boolean] whether the rule is flagged as private
  # @return [nil]
  # @raise [RuntimeError] if the same rule was already added
  def add(x, excpt, priv)
    node = x.inject(self) { |current, lbl| current.child_for(lbl) }
    node.terminate(excpt, priv)
    nil
  end

  # Looks up the labels of the registrable domain for a name.
  #
  # A literal child is tried before the wildcard child; if neither branch
  # yields a result, the rule ending on this node (if any) is used.
  #
  # @param lbls [Array<String>] the labels of the name, rightmost label first;
  #   the array is not modified
  # @param priv [Boolean] whether rules flagged as private may be used
  # @param matched_lbls [Array<String>] labels consumed so far (used by the
  #   recursion; callers normally leave the default)
  # @return [Array<String>, nil] the labels of the registrable domain,
  #   rightmost label first, or nil if no usable rule matches
  # @raise [DomainNotAllowed] if the name is exactly a non-exception rule
  def get_regdom(lbls, priv = true, matched_lbls = [])
    # Destructuring copies the tail, so the caller's array stays untouched.
    lbl, *rest = lbls
    usable = @terminus && (!@priv || priv)

    if lbl.nil?
      return nil unless usable
      return matched_lbls if @exception
      raise DomainNotAllowed, "#{matched_lbls.reverse.join(".")} is not allowed according to Registry policy"
    end

    path = matched_lbls + [lbl]
    [lbl, WILDCARD].each do |key|
      child = @children[key]
      next if child.nil?
      found = child.get_regdom(rest, priv, path)
      return found unless found.nil?
    end

    return nil unless usable
    # An exception rule is itself the registrable domain; any other rule
    # needs one more label on top of the matched suffix.
    @exception ? matched_lbls : path
  end

  protected

  # Returns the child node for a label, creating it on first use.
  def child_for(lbl)
    @children[lbl] ||= Rules.new
  end

  # Marks this node as the end of a rule.
  def terminate(excpt, priv)
    raise 'Duplicate rule' if @terminus
    @terminus = true
    @priv = priv
    @exception = excpt
  end
end
end