From 00551bfd75a24f98523938da6caec11e35eea535 Mon Sep 17 00:00:00 2001
From: odedfos
Date: Tue, 17 Jan 2017 16:43:20 +0200
Subject: [PATCH] Fix #259: performance issue of quadratic complexity where
 all dict keys are matched against all keys in the schema

---
 voluptuous/schema_builder.py | 26 +++++++++++++++++++--
 voluptuous/tests/tests.py    | 44 ++++++++++++++++++++++++++++++++----
 2 files changed, 64 insertions(+), 6 deletions(-)

diff --git a/voluptuous/schema_builder.py b/voluptuous/schema_builder.py
index 0bcd2c9..3edc8f2 100644
--- a/voluptuous/schema_builder.py
+++ b/voluptuous/schema_builder.py
@@ -5,6 +5,8 @@
 import sys
 from contextlib import contextmanager
 
+import itertools
+
 try:
     import error as er
 except ImportError:
@@ -110,6 +112,9 @@ def _isnamedtuple(obj):
     return isinstance(obj, tuple) and hasattr(obj, '_fields')
 
 
+primitive_types = (str, unicode, bool, int, float)
+
+
 class Undefined(object):
     def __nonzero__(self):
         return False
@@ -263,6 +268,20 @@ def _compile_mapping(self, schema, invalid_msg=None):
 
         candidates = list(_iterate_mapping_candidates(_compiled_schema))
 
+        # After we have the list of candidates in the correct order, bucket them by schema key
+        # so that each key in the data being validated is matched only against the schema keys
+        # that could actually match it, rather than against every key in the schema.
+        additional_candidates = []
+        candidates_by_key = {}
+        for skey, (ckey, cvalue) in candidates:
+            if type(skey) in primitive_types:
+                candidates_by_key.setdefault(skey, []).append((skey, (ckey, cvalue)))
+            elif isinstance(skey, Marker) and type(skey.schema) in primitive_types:
+                candidates_by_key.setdefault(skey.schema, []).append((skey, (ckey, cvalue)))
+            else:
+                # These are wildcards such as 'int', 'str' and 'Remove', which must be applied to all keys
+                additional_candidates.append((skey, (ckey, cvalue)))
+
         def validate_mapping(path, iterable, out):
             try:
                 from util import to_utf8_py2
@@ -278,9 +297,12 @@ def validate_mapping(path, iterable, out):
                 key_path = path + [key]
                 remove_key = False
 
+                # Optimization. Validate against the candidates matching this key first, then fall back to the rest
+                relevant_candidates = itertools.chain(candidates_by_key.get(key, []), additional_candidates)
+
                 # compare each given key/value against all compiled key/values
                 # schema key, (compiled key, compiled value)
-                for skey, (ckey, cvalue) in candidates:
+                for skey, (ckey, cvalue) in relevant_candidates:
                     try:
                         new_key = ckey(key_path, key)
                     except er.Invalid as e:
@@ -634,7 +656,7 @@ def key_literal(key):
         # for each item in the extension schema, replace duplicates
         # or add new keys
         for key, value in iteritems(schema):
-            
+
             # if the key is already in the dictionary, we need to replace it
             # transform key to literal before checking presence
             if key_literal(key) in result_key_map:
diff --git a/voluptuous/tests/tests.py b/voluptuous/tests/tests.py
index 1f9b7ff..86bb858 100644
--- a/voluptuous/tests/tests.py
+++ b/voluptuous/tests/tests.py
@@ -7,8 +7,7 @@
     Url, MultipleInvalid, LiteralInvalid, NotIn, Match, Email, Replace,
     Range, Coerce, All, Any, Length, FqdnUrl, ALLOW_EXTRA, PREVENT_EXTRA,
     validate, ExactSequence, Equal, Unordered, Number, Maybe, Datetime, Date,
-    Contains
-)
+    Contains, Marker)
 from voluptuous.humanize import humanize_error
 from voluptuous.util import to_utf8_py2, u
 
@@ -82,7 +81,7 @@ def test_contains():
         schema({'color': ['blue', 'yellow']})
     except Invalid as e:
         assert_equal(str(e),
-            "value is not allowed for dictionary value @ data['color']")
+                     "value is not allowed for dictionary value @ data['color']")
 
 
 def test_remove():
@@ -788,10 +787,47 @@ def test_ordered_dict():
     if not hasattr(collections, 'OrderedDict'):
         # collections.OrderedDict was added in Python2.7; only run if present
         return
-    schema = Schema({Number(): Number()}) # x, y pairs (for interpolation or something)
+    schema = Schema({Number(): Number()})  # x, y pairs (for interpolation or something)
     data = collections.OrderedDict([(5.0, 3.7), (24.0, 8.7), (43.0, 1.5),
                                     (62.0, 2.1), (71.5, 6.7), (90.5, 4.1),
                                     (109.0, 3.9)])
     out = schema(data)
     assert isinstance(out, collections.OrderedDict), 'Collection is no longer ordered'
     assert data.keys() == out.keys(), 'Order is not consistent'
+
+
+def test_validation_performance():
+    """
+    Ensure that dictionary validation runs in linear time with respect to the number of keys.
+    A custom marker that counts how many times it is evaluated is placed in the schema,
+    which lets us verify that each data key is only matched against the relevant schema keys.
+    Prior to https://github.com/alecthomas/voluptuous/issues/259 this was quadratic.
+    """
+    num_of_keys = 1000
+
+    schema_dict = {}
+    data = {}
+    data_extra_keys = {}
+
+    counter = [0]
+
+    class CounterMarker(Marker):
+        def __call__(self, *args, **kwargs):
+            counter[0] += 1
+            return super(CounterMarker, self).__call__(*args, **kwargs)
+
+    for i in range(num_of_keys):
+        schema_dict[CounterMarker(str(i))] = str
+        data[str(i)] = str(i)
+        data_extra_keys[str(i*2)] = str(i)  # half of these keys are in the schema, half are not
+
+    schema = Schema(schema_dict, extra=ALLOW_EXTRA)
+
+    schema(data)
+
+    assert counter[0] <= num_of_keys, "Validation complexity is not linear! %s > %s" % (counter[0], num_of_keys)
+
+    counter[0] = 0  # reset counter
+    schema(data_extra_keys)
+
+    assert counter[0] <= num_of_keys, "Validation complexity is not linear! %s > %s" % (counter[0], num_of_keys)
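
The core of the change is bucketing the compiled candidates by their literal key value, so that
at validation time each data key is checked only against its own bucket plus the wildcard
candidates. For reference, below is a minimal standalone sketch of that bucketing strategy; the
names bucket_candidates, candidates_for and PRIMITIVE_TYPES are illustrative only (not part of
voluptuous), and the Marker handling from the patch is omitted for brevity.

# Illustrative sketch only, assuming Python 3 (the patch itself also lists `unicode`
# in primitive_types for Python 2 compatibility).
import itertools

PRIMITIVE_TYPES = (str, bool, int, float)


def bucket_candidates(candidates):
    """Split (schema_key, validator) pairs into per-key buckets and a wildcard list."""
    by_key, wildcards = {}, []
    for skey, validator in candidates:
        if type(skey) in PRIMITIVE_TYPES:
            # Literal keys such as 'color' or 42 can only ever match an equal data key.
            by_key.setdefault(skey, []).append((skey, validator))
        else:
            # Non-literal keys (e.g. the types int or str) may match any data key.
            wildcards.append((skey, validator))
    return by_key, wildcards


def candidates_for(key, by_key, wildcards):
    """Return only the candidates that could possibly match `key`."""
    return itertools.chain(by_key.get(key, []), wildcards)


# With three literal keys and one wildcard, the data key 'b' is now checked against
# two candidates instead of four.
candidates = [('a', 'va'), ('b', 'vb'), ('c', 'vc'), (str, 'v_any_str')]
by_key, wildcards = bucket_candidates(candidates)
print(list(candidates_for('b', by_key, wildcards)))  # the 'b' bucket plus the wildcard

Keeping the wildcard list separate is what preserves correctness for non-literal schema keys:
they are still compared against every data key, exactly as before this change, so only the
unnecessary literal-key comparisons are skipped.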