From 00551bfd75a24f98523938da6caec11e35eea535 Mon Sep 17 00:00:00 2001
From: odedfos
Date: Tue, 17 Jan 2017 16:43:20 +0200
Subject: [PATCH] Fix #259: performance issue of quadratic complexity where
 all dict keys are matched against all keys in the schema

---
 voluptuous/schema_builder.py | 26 +++++++++++++++++++--
 voluptuous/tests/tests.py    | 44 ++++++++++++++++++++++++++++++++----
 2 files changed, 64 insertions(+), 6 deletions(-)

diff --git a/voluptuous/schema_builder.py b/voluptuous/schema_builder.py
index 0bcd2c9..3edc8f2 100644
--- a/voluptuous/schema_builder.py
+++ b/voluptuous/schema_builder.py
@@ -5,6 +5,8 @@
 import sys
 from contextlib import contextmanager
 
+import itertools
+
 try:
     import error as er
 except ImportError:
@@ -110,6 +112,9 @@ def _isnamedtuple(obj):
     return isinstance(obj, tuple) and hasattr(obj, '_fields')
 
 
+primitive_types = (str, unicode, bool, int, float)
+
+
 class Undefined(object):
     def __nonzero__(self):
         return False
@@ -263,6 +268,20 @@ def _compile_mapping(self, schema, invalid_msg=None):
 
         candidates = list(_iterate_mapping_candidates(_compiled_schema))
 
+        # After we have the list of candidates in the correct order, bucket them by schema key
+        # so that each key in the data being validated is matched only against the schema keys
+        # that could actually match it, rather than against every key in the schema.
+        additional_candidates = []
+        candidates_by_key = {}
+        for skey, (ckey, cvalue) in candidates:
+            if type(skey) in primitive_types:
+                candidates_by_key.setdefault(skey, []).append((skey, (ckey, cvalue)))
+            elif isinstance(skey, Marker) and type(skey.schema) in primitive_types:
+                candidates_by_key.setdefault(skey.schema, []).append((skey, (ckey, cvalue)))
+            else:
+                # These are wildcards such as 'int', 'str' and 'Remove', which must be applied to all keys
+                additional_candidates.append((skey, (ckey, cvalue)))
+
         def validate_mapping(path, iterable, out):
             try:
                 from util import to_utf8_py2
@@ -278,9 +297,12 @@ def validate_mapping(path, iterable, out):
                 key_path = path + [key]
                 remove_key = False
 
+                # Optimization. Validate against the candidates matching this key first, then fall back to the rest
+                relevant_candidates = itertools.chain(candidates_by_key.get(key, []), additional_candidates)
+
                 # compare each given key/value against all compiled key/values
                 # schema key, (compiled key, compiled value)
-                for skey, (ckey, cvalue) in candidates:
+                for skey, (ckey, cvalue) in relevant_candidates:
                     try:
                         new_key = ckey(key_path, key)
                     except er.Invalid as e:
@@ -634,7 +656,7 @@ def key_literal(key):
         # for each item in the extension schema, replace duplicates
         # or add new keys
         for key, value in iteritems(schema):
-            
+
             # if the key is already in the dictionary, we need to replace it
             # transform key to literal before checking presence
             if key_literal(key) in result_key_map:
diff --git a/voluptuous/tests/tests.py b/voluptuous/tests/tests.py
index 1f9b7ff..86bb858 100644
--- a/voluptuous/tests/tests.py
+++ b/voluptuous/tests/tests.py
@@ -7,8 +7,7 @@
     Url, MultipleInvalid, LiteralInvalid, NotIn, Match, Email, Replace,
     Range, Coerce, All, Any, Length, FqdnUrl, ALLOW_EXTRA, PREVENT_EXTRA,
     validate, ExactSequence, Equal, Unordered, Number, Maybe, Datetime, Date,
-    Contains
-)
+    Contains, Marker)
 from voluptuous.humanize import humanize_error
 from voluptuous.util import to_utf8_py2, u
 
@@ -82,7 +81,7 @@ def test_contains():
         schema({'color': ['blue', 'yellow']})
     except Invalid as e:
         assert_equal(str(e),
-            "value is not allowed for dictionary value @ data['color']")
+                     "value is not allowed for dictionary value @ data['color']")
 
 
 def test_remove():
@@ -788,10 +787,47 @@ def test_ordered_dict():
     if not hasattr(collections, 'OrderedDict'):
         # collections.OrderedDict was added in Python2.7; only run if present
         return
-    schema = Schema({Number(): Number()}) # x, y pairs (for interpolation or something)
+    schema = Schema({Number(): Number()})  # x, y pairs (for interpolation or something)
     data = collections.OrderedDict([(5.0, 3.7), (24.0, 8.7), (43.0, 1.5),
                                     (62.0, 2.1), (71.5, 6.7), (90.5, 4.1),
                                     (109.0, 3.9)])
     out = schema(data)
     assert isinstance(out, collections.OrderedDict), 'Collection is no longer ordered'
     assert data.keys() == out.keys(), 'Order is not consistent'
+
+
+def test_validation_performance():
+    """
+    Ensure that dictionary validation runs in linear time with respect to the number of keys.
+    A custom marker that counts how many times it is evaluated is placed in the schema,
+    which lets us verify that each data key is only matched against the relevant schema keys.
+    Prior to https://github.com/alecthomas/voluptuous/issues/259 this was quadratic.
+    """
+    num_of_keys = 1000
+
+    schema_dict = {}
+    data = {}
+    data_extra_keys = {}
+
+    counter = [0]
+
+    class CounterMarker(Marker):
+        def __call__(self, *args, **kwargs):
+            counter[0] += 1
+            return super(CounterMarker, self).__call__(*args, **kwargs)
+
+    for i in range(num_of_keys):
+        schema_dict[CounterMarker(str(i))] = str
+        data[str(i)] = str(i)
+        data_extra_keys[str(i*2)] = str(i)  # half of these keys are in the schema, half are not
+
+    schema = Schema(schema_dict, extra=ALLOW_EXTRA)
+
+    schema(data)
+
+    assert counter[0] <= num_of_keys, "Validation complexity is not linear! %s > %s" % (counter[0], num_of_keys)
+
+    counter[0] = 0  # reset counter
+    schema(data_extra_keys)
+
+    assert counter[0] <= num_of_keys, "Validation complexity is not linear! %s > %s" % (counter[0], num_of_keys)
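
The core of the change is bucketing the compiled candidates by their literal key value, so that
at validation time each data key is checked only against its own bucket plus the wildcard
candidates. For reference, below is a minimal standalone sketch of that bucketing strategy; the
names bucket_candidates, candidates_for and PRIMITIVE_TYPES are illustrative only (not part of
voluptuous), and the Marker handling from the patch is omitted for brevity.

# Illustrative sketch only, assuming Python 3 (the patch itself also lists `unicode`
# in primitive_types for Python 2 compatibility).
import itertools

PRIMITIVE_TYPES = (str, bool, int, float)


def bucket_candidates(candidates):
    """Split (schema_key, validator) pairs into per-key buckets and a wildcard list."""
    by_key, wildcards = {}, []
    for skey, validator in candidates:
        if type(skey) in PRIMITIVE_TYPES:
            # Literal keys such as 'color' or 42 can only ever match an equal data key.
            by_key.setdefault(skey, []).append((skey, validator))
        else:
            # Non-literal keys (e.g. the types int or str) may match any data key.
            wildcards.append((skey, validator))
    return by_key, wildcards


def candidates_for(key, by_key, wildcards):
    """Return only the candidates that could possibly match `key`."""
    return itertools.chain(by_key.get(key, []), wildcards)


# With three literal keys and one wildcard, the data key 'b' is now checked against
# two candidates instead of four.
candidates = [('a', 'va'), ('b', 'vb'), ('c', 'vc'), (str, 'v_any_str')]
by_key, wildcards = bucket_candidates(candidates)
print(list(candidates_for('b', by_key, wildcards)))  # the 'b' bucket plus the wildcard

Keeping the wildcard list separate is what preserves correctness for non-literal schema keys:
they are still compared against every data key, exactly as before this change, so only the
unnecessary literal-key comparisons are skipped.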