Further MultiValuedValue optimization (#469)

I noticed significant time being spent repeatedly constructing the set of known subvalues. On a profiled run on our maiin internal codebase, this makes pyanalyze ~30% faster.
quora · Feb 7, 2022 · a92fb16 · a92fb16
1 parent 0bf4700
commit a92fb16
Show file tree

Hide file tree

Showing 2 changed files with 34 additions and 26 deletions.
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -2,6 +2,7 @@
 
 ## Unreleased
 
+- Optimize type compatibility checks on large unions (#469)
 - Detect incorrect key types passed to `dict.__getitem__` (#468)
 - Pick up the signature of `open()` from typeshed correctly (#463)
 - Do not strip away generic parameters explicitly set to

diff --git a/pyanalyze/value.py b/pyanalyze/value.py
@@ -1323,16 +1323,43 @@ class MultiValuedValue(Value):
     raw_vals: InitVar[Iterable[Value]]
     vals: Tuple[Value, ...] = field(init=False)
     """The underlying values of the union."""
+    _known_subvals: Optional[Tuple[Set[Tuple[object, type]], Sequence[Value]]] = field(
+        init=False, repr=False, hash=False, compare=False
+    )
 
     def __post_init__(self, raw_vals: Iterable[Value]) -> None:
         object.__setattr__(
             self,
             "vals",
             tuple(chain.from_iterable(flatten_values(val) for val in raw_vals)),
         )
+        object.__setattr__(self, "_known_subvals", self._get_known_subvals())
+
+    def _get_known_subvals(
+        self,
+    ) -> Optional[Tuple[Set[Tuple[object, type]], Sequence[Value]]]:
+        # Not worth it for small unions
+        if len(self.vals) < 10:
+            return None
+        # Optimization for comparing Unions containing large unions of literals.
+        try:
+            # Include the type to avoid e.g. 1 and True matching
+            known_values = {
+                (subval.val, type(subval.val))
+                for subval in self.vals
+                if isinstance(subval, KnownValue)
+            }
+        except TypeError:
+            return None  # not hashable
+        else:
+            # Make remaining check not consider the KnownValues again
+            remaining_vals = [
+                subval for subval in self.vals if not isinstance(subval, KnownValue)
+            ]
+            return known_values, remaining_vals
 
     def substitute_typevars(self, typevars: TypeVarMap) -> Value:
-        if not self.vals:
+        if not self.vals or not typevars:
             return self
         return MultiValuedValue(
             [val.substitute_typevars(typevars) for val in self.vals]
@@ -1359,35 +1386,15 @@ def can_assign(self, other: Value, ctx: CanAssignContext) -> CanAssign:
             return {}
         else:
             my_vals = self.vals
-            # Optimization for large unions of literals. We could perhaps cache this set,
-            # but that's more complicated. Empirically this is already much faster.
-            # The number 20 is arbitrary. I noticed the bottleneck in production on a
-            # Union with nearly 500 values.
-            if isinstance(other, KnownValue) and len(my_vals) > 20:
+            if isinstance(other, KnownValue) and self._known_subvals is not None:
+                known_values, my_vals = self._known_subvals
                 try:
-                    # Include the type to avoid e.g. 1 and True matching
-                    known_values = {
-                        (subval.val, type(subval.val))
-                        for subval in my_vals
-                        if isinstance(subval, KnownValue)
-                    }
+                    is_present = (other.val, type(other.val)) in known_values
                 except TypeError:
                     pass  # not hashable
                 else:
-                    try:
-                        is_present = (other.val, type(other.val)) in known_values
-                    except TypeError:
-                        pass  # not hashable
-                    else:
-                        if is_present:
-                            return {}
-                        else:
-                            # Make remaining check not consider the KnownValues again
-                            my_vals = [
-                                subval
-                                for subval in my_vals
-                                if not isinstance(subval, KnownValue)
-                            ]
+                    if is_present:
+                        return {}
 
             bounds_maps = []
             errors = []