Skip to content

Commit

Permalink
Merge pull request #260 from odedfos/master
Browse files Browse the repository at this point in the history
#259 Fix performance issue of large dict validation
  • Loading branch information
tusharmakkar08 authored Jan 19, 2017
2 parents 9daf686 + 00551bf commit 639d794
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 6 deletions.
26 changes: 24 additions & 2 deletions voluptuous/schema_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import sys
from contextlib import contextmanager

import itertools

try:
import error as er
except ImportError:
Expand Down Expand Up @@ -110,6 +112,9 @@ def _isnamedtuple(obj):
return isinstance(obj, tuple) and hasattr(obj, '_fields')


# Exact ("primitive") key types eligible for the dict-lookup optimization in
# _compile_mapping: schema keys of these types match a data key by simple
# equality, so candidates can be bucketed by key value instead of being tried
# against every data key.
# NOTE(review): `unicode` is a Python 2 builtin — presumably aliased to `str`
# for Python 3 earlier in this module; confirm the compat shim exists.
primitive_types = (str, unicode, bool, int, float)


class Undefined(object):
def __nonzero__(self):
return False
Expand Down Expand Up @@ -263,6 +268,20 @@ def _compile_mapping(self, schema, invalid_msg=None):

candidates = list(_iterate_mapping_candidates(_compiled_schema))

# After we have the list of candidates in the correct order, we want to apply some optimization so that each
# key in the data being validated will be matched against the relevant schema keys only.
# No point in matching against different keys
additional_candidates = []
candidates_by_key = {}
for skey, (ckey, cvalue) in candidates:
if type(skey) in primitive_types:
candidates_by_key.setdefault(skey, []).append((skey, (ckey, cvalue)))
elif isinstance(skey, Marker) and type(skey.schema) in primitive_types:
candidates_by_key.setdefault(skey.schema, []).append((skey, (ckey, cvalue)))
else:
# These are wildcards such as 'int', 'str', 'Remove' and others which should be applied to all keys
additional_candidates.append((skey, (ckey, cvalue)))

def validate_mapping(path, iterable, out):
try:
from util import to_utf8_py2
Expand All @@ -278,9 +297,12 @@ def validate_mapping(path, iterable, out):
key_path = path + [key]
remove_key = False

# Optimization. Validate against the matching key first, then fallback to the rest
relevant_candidates = itertools.chain(candidates_by_key.get(key, []), additional_candidates)

# compare each given key/value against all compiled key/values
# schema key, (compiled key, compiled value)
for skey, (ckey, cvalue) in candidates:
for skey, (ckey, cvalue) in relevant_candidates:
try:
new_key = ckey(key_path, key)
except er.Invalid as e:
Expand Down Expand Up @@ -634,7 +656,7 @@ def key_literal(key):
# for each item in the extension schema, replace duplicates
# or add new keys
for key, value in iteritems(schema):

# if the key is already in the dictionary, we need to replace it
# transform key to literal before checking presence
if key_literal(key) in result_key_map:
Expand Down
44 changes: 40 additions & 4 deletions voluptuous/tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@
Url, MultipleInvalid, LiteralInvalid, NotIn, Match, Email,
Replace, Range, Coerce, All, Any, Length, FqdnUrl, ALLOW_EXTRA, PREVENT_EXTRA,
validate, ExactSequence, Equal, Unordered, Number, Maybe, Datetime, Date,
Contains
)
Contains, Marker)
from voluptuous.humanize import humanize_error
from voluptuous.util import to_utf8_py2, u

Expand Down Expand Up @@ -82,7 +81,7 @@ def test_contains():
schema({'color': ['blue', 'yellow']})
except Invalid as e:
assert_equal(str(e),
"value is not allowed for dictionary value @ data['color']")
"value is not allowed for dictionary value @ data['color']")


def test_remove():
Expand Down Expand Up @@ -788,10 +787,47 @@ def test_ordered_dict():
if not hasattr(collections, 'OrderedDict'):
# collections.OrderedDict was added in Python2.7; only run if present
return
schema = Schema({Number(): Number()}) # x, y pairs (for interpolation or something)
schema = Schema({Number(): Number()}) # x, y pairs (for interpolation or something)
data = collections.OrderedDict([(5.0, 3.7), (24.0, 8.7), (43.0, 1.5),
(62.0, 2.1), (71.5, 6.7), (90.5, 4.1),
(109.0, 3.9)])
out = schema(data)
assert isinstance(out, collections.OrderedDict), 'Collection is no longer ordered'
assert data.keys() == out.keys(), 'Order is not consistent'


def test_validation_performance():
    """Ensure dict validation evaluates each schema key a linear number of times.

    A counting ``Marker`` subclass records how many times its key is matched
    against candidate data keys.  If every data key were compared with every
    schema key the count would grow quadratically; with the lookup
    optimization it must stay bounded by the number of keys.
    Prior to https://github.com/alecthomas/voluptuous/issues/259 this was
    exponential.
    """
    num_of_keys = 1000
    evaluations = [0]  # mutable cell so the nested class can update it

    class CounterMarker(Marker):
        # Bump the shared counter every time this schema key is evaluated.
        def __call__(self, *args, **kwargs):
            evaluations[0] += 1
            return super(CounterMarker, self).__call__(*args, **kwargs)

    schema_dict = dict((CounterMarker(str(i)), str) for i in range(num_of_keys))
    data = dict((str(i), str(i)) for i in range(num_of_keys))
    # half of the keys are present, and half aren't
    data_extra_keys = dict((str(i * 2), str(i)) for i in range(num_of_keys))

    schema = Schema(schema_dict, extra=ALLOW_EXTRA)

    # Fully-matching data: each key should hit only its own schema key.
    schema(data)
    assert evaluations[0] <= num_of_keys, "Validation complexity is not linear! %s > %s" % (evaluations[0], num_of_keys)

    evaluations[0] = 0  # reset counter
    # Partially-matching data must not degrade to quadratic matching either.
    schema(data_extra_keys)
    assert evaluations[0] <= num_of_keys, "Validation complexity is not linear! %s > %s" % (evaluations[0], num_of_keys)

0 comments on commit 639d794

Please sign in to comment.