Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve shrink ordering of text #2482

Merged
merged 6 commits into from
Jul 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions hypothesis-python/RELEASE.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
RELEASE_TYPE: patch

This release improves the behaviour of the :func:`~hypothesis.strategies.characters` strategy
when shrinking, by changing which characters are considered smallest to prefer more "normal" ascii characters
where available.
115 changes: 115 additions & 0 deletions hypothesis-python/src/hypothesis/internal/conjecture/dfa/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Most of this work is copyright (C) 2013-2020 David R. MacIver
# ([email protected]), but it contains contributions by others. See
# CONTRIBUTING.rst for a full list of people who may hold copyright, and
# consult the git log if you need to determine who owns an individual
# contribution.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
#
# END HEADER

import threading
from collections import deque


class DFA:
"""Base class for implementations of deterministic finite
automata.

This is abstract to allow for the possibility of states
being calculated lazily as we traverse the DFA (which
we make heavy use of in our L* implementation - see
lstar.py for details).

States can be of any hashable type.
"""

def __init__(self):
self.__caches = threading.local()

@property
def start(self):
"""Returns the starting state."""
raise NotImplementedError()

def is_accepting(self, i):
"""Returns if state ``i`` is an accepting one."""
raise NotImplementedError()

def transition(self, i, c):
"""Returns the state that i transitions to on reading
character c from a string."""
raise NotImplementedError()

def transitions(self, i):
"""Iterates over all pairs (byte, state) of transitions
which do not lead to dead states."""
for c, j in self.__raw_transitions(i):
if not self.is_dead(j):
yield c, j

def matches(self, s):
"""Returns whether the string ``s`` is accepted
by this automaton."""
i = self.start
for c in s:
i = self.transition(i, c)
return self.is_accepting(i)

def is_dead(self, i):
"""Returns True if no strings can be accepted
when starting from state ``i``."""
if self.is_accepting(i):
return False

try:
cache = self.__caches.dead
except AttributeError:
cache = {}
self.__caches.dead = cache

try:
return cache[i]
except KeyError:
pass
seen = set()
pending = deque([i])
result = True
while pending:
j = pending.popleft()
if j in seen:
continue
seen.add(j)
if self.is_accepting(j):
result = False
break
else:
for _, k in self.__raw_transitions(j):
pending.append(k)
if result:
for j in seen:
cache[j] = True
else:
cache[i] = False
return result

def all_matching_strings(self):
"""Iterate over all strings matched by this automaton
in shortlex-ascending order."""
queue = deque([(self.start, b"")])
while queue:
i, path = queue.popleft()
if self.is_accepting(i):
yield path
for c, j in self.transitions(i):
queue.append((j, path + bytes([c])))

def __raw_transitions(self, i):
for c in range(256):
j = self.transition(i, c)
yield c, j
221 changes: 221 additions & 0 deletions hypothesis-python/src/hypothesis/internal/conjecture/dfa/lstar.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Most of this work is copyright (C) 2013-2020 David R. MacIver
# ([email protected]), but it contains contributions by others. See
# CONTRIBUTING.rst for a full list of people who may hold copyright, and
# consult the git log if you need to determine who owns an individual
# contribution.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
#
# END HEADER

from hypothesis.internal.conjecture.dfa import DFA
from hypothesis.internal.conjecture.junkdrawer import find_integer

"""
This module contains an implementation of the L* algorithm
for learning a deterministic finite automaton based on an
unknown membership function and a series of examples of
strings that may or may not satisfy it.

The two relevant papers for understanding this are:

* Angluin, Dana. "Learning regular sets from queries and counterexamples."
Information and computation 75.2 (1987): 87-106.
* Rivest, Ronald L., and Robert E. Schapire. "Inference of finite automata
using homing sequences." Information and Computation 103.2 (1993): 299-347.
DRMacIver marked this conversation as resolved.
Show resolved Hide resolved
Note that we only use the material from section 4.5 "Improving Angluin's L*
algorithm" (page 318), and all of the rest of the material on homing
sequences can be skipped.

The former explains the core algorithm, the latter a modification
we use (which we have further modified) which allows it to
be implemented more efficiently.

We have two major departures from the paper:

1. We learn the automaton lazily as we traverse it. This is particularly
valuable because if we make many corrections on the same string we only
have to learn the transitions that correspond to the string we are
correcting on.
2. We make use of our ``find_integer`` method rather than a binary search
as proposed in the Rivest and Schapire paper, as we expect that
usually most strings will be mispredicted near the beginning.

A note on performance: This code is not really fast enough for
us to ever want to run in production on large strings, and this
is somewhat intrinsic. We should only use it in testing or for
learning languages offline that we can record for later use.

"""


class LStar:
def __init__(self, member):
self.__experiments = []
self.__cache = {}
self.__member = member

self.__add_experiment(b"")

def member(self, s):
"""Check whether this string is a member of the language
to be learned."""
s = bytes(s)
try:
return self.__cache[s]
except KeyError:
return self.__cache.setdefault(s, self.__member(s))

@property
def generation(self):
"""Return an integer value that will be incremented
every time the DFA we predict changes."""
return len(self.__experiments)

@property
def dfa(self):
"""Returns our current model of a DFA for matching
the language we are learning."""
if self.__dfa is None:
self.__dfa = ExperimentDFA(self.member, self.__experiments)
return self.__dfa

def learn(self, s):
"""Learn to give the correct answer on this string.
That is, after this method completes we will have
``self.dfa.matches(s) == self.member(s)``.

Note that we do not guarantee that this will remain
true in the event that learn is called again with
a different string. It is in principle possible that
future learning will cause us to make a mistake on
this string. However, repeatedly calling learn on
each of a set of strings until the generation stops
changing is guaranteed to terminate.
"""
s = bytes(s)
correct_outcome = self.member(s)

# We don't want to check this inside the loop because it potentially
# causes us to evaluate more of the states than we actually need to,
# but if our model is mostly correct then this will be faster because
# we only need to evaluate strings that are of the form
# ``state + experiment``, which will generally be cached and/or needed
# later.
if self.dfa.matches(s) == correct_outcome:
return

# In the papers they assume that we only run this process
# once, but this is silly - often when you've got a messy
# string it will be wrong for many different reasons.
#
# Thus we iterate this to a fixed point where we repair
# the DFA by repeatedly adding experiments until the DFA
# agrees with the membership function on this string.
while True:
dfa = self.dfa

states = [dfa.start]

def seems_right(n):
"""After reading n characters from s, do we seem to be
in the right state?

We determine this by replacing the first n characters
of s with the label of the state we expect to be in.
If we are in the right state, that will replace a substring
with an equivalent one so must produce the same answer.
"""
if n > len(s):
return False

# Populate enough of the states list to know where we are.
while n >= len(states):
states.append(dfa.transition(states[-1], s[len(states) - 1]))

return self.member(dfa.label(states[n]) + s[n:]) == correct_outcome

n = find_integer(seems_right)

# We got to the end without ever finding ourself in a bad
# state, so we must correctly match this string.
if n == len(s):
assert dfa.matches(s) == correct_outcome
break

# Reading n characters does not put us in a bad state but
# reading n + 1 does. This means that the remainder of
# the string that we have not read yet is an experiment
# that allows us to distinguish the state that we ended
# up in from the state that we should have ended up in.
self.__add_experiment(s[n + 1 :])

def __add_experiment(self, e):
self.__experiments.append(e)
self.__dfa = None


class ExperimentDFA(DFA):
"""This implements a lazily calculated DFA where states
are labelled by some string that reaches them, and are
distinguished by a membership test and a set of experiments."""

def __init__(self, member, experiments):
DFA.__init__(self)
self.__experiments = tuple(experiments)
self.__member = member

self.__states = [b""]
self.__rows_to_states = {tuple(map(member, experiments)): 0}
self.__transition_cache = {}

def label(self, i):
return self.__states[i]

@property
def start(self):
return 0

def is_accepting(self, i):
return self.__member(self.__states[i])

def transition(self, i, c):
key = (i, c)
try:
return self.__transition_cache[key]
except KeyError:
pass
s = self.__states[i]

# t is either the string that labels our destination
# state or one equivalent to it.
t = s + bytes([c])

# A row is a tuple of booleans that corresponds to
# the information our experiments can reveal about
# this string. Two strings with different rows *must*
# correspond to different states in the DFA for our
# membership function, because the same path out
# of them (taken by one of the experiments) leads to
# different results.
row = tuple(self.__member(t + e) for e in self.__experiments)
try:
# If we have seen this row before, assume that this
# state is equivalent to that already discovered one.
# If it is not, we will have to find a new experiment
# to reveal that,
result = self.__rows_to_states[row]
except KeyError:
# This string is definitely not equivalent to any of
# those visited before, so it must be a new state and
# we add it to our list of states.
result = len(self.__states)
self.__states.append(t)
self.__rows_to_states[row] = result
self.__transition_cache[key] = result
return result
15 changes: 13 additions & 2 deletions hypothesis-python/src/hypothesis/internal/conjecture/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,14 @@ class RunIsComplete(Exception):


class ConjectureRunner:
def __init__(self, test_function, settings=None, random=None, database_key=None):
def __init__(
self,
test_function,
settings=None,
random=None,
database_key=None,
ignore_limits=False,
):
self._test_function = test_function
self.settings = settings or Settings()
self.shrinks = 0
Expand All @@ -96,6 +103,7 @@ def __init__(self, test_function, settings=None, random=None, database_key=None)
self.valid_examples = 0
self.random = random or Random(getrandbits(128))
self.database_key = database_key
self.ignore_limits = ignore_limits

# Global dict of per-phase statistics, and a list of per-call stats
# which transfer to the global dict at the end of each phase.
Expand Down Expand Up @@ -273,7 +281,8 @@ def test_function(self, data):
self.exit_with(ExitReason.max_shrinks)

if (
self.finish_shrinking_deadline is not None
not self.ignore_limits
and self.finish_shrinking_deadline is not None
and self.finish_shrinking_deadline < time.perf_counter()
):
# See https://github.com/HypothesisWorks/hypothesis/issues/2340
Expand Down Expand Up @@ -558,6 +567,8 @@ def reuse_existing_examples(self):
break

def exit_with(self, reason):
if self.ignore_limits:
return
self.statistics["stopped-because"] = reason.describe(self.settings)
if self.best_observed_targets:
self.statistics["targets"] = dict(self.best_observed_targets)
Expand Down
Loading