From 6941cd27604c0f9d096ac93950f7b81e75c43d2f Mon Sep 17 00:00:00 2001 From: Zac Hatfield-Dodds Date: Sat, 9 Dec 2023 16:11:01 -0800 Subject: [PATCH 1/2] Update CODEOWNERS --- CODEOWNERS | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index ac74a335b6..eefcfa5318 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,7 +1,7 @@ -# Engine changes need to be approved by DRMacIver, as per +# Engine changes need to be approved by Zac-HD, as per # https://github.com/HypothesisWorks/hypothesis/blob/master/guides/review.rst#engine-changes -/hypothesis-python/src/hypothesis/internal/conjecture/ @DRMacIver @Zac-HD +/hypothesis-python/src/hypothesis/internal/conjecture/ @Zac-HD -# Changes to the paper also need to be approved by DRMacIver -/paper.md @DRMacIver -/paper.bib @DRMacIver +# Changes to the paper also need to be approved by DRMacIver or Zac, as authors +/paper.md @DRMacIver @Zac-HD +/paper.bib @DRMacIver @Zac-HD From 239c83602da2b6d0da87b76594771087f0994f04 Mon Sep 17 00:00:00 2001 From: Zac Hatfield-Dodds Date: Sat, 9 Dec 2023 16:11:01 -0800 Subject: [PATCH 2/2] Observability mode - output jsonlines --- hypothesis-python/RELEASE.rst | 4 + hypothesis-python/docs/_static/.empty | 0 .../docs/_static/wrap-in-tables.css | 15 ++ hypothesis-python/docs/changes.rst | 2 +- hypothesis-python/docs/conf.py | 3 + hypothesis-python/docs/index.rst | 1 + hypothesis-python/docs/observability.rst | 76 +++++++ .../docs/schema_observations.json | 93 +++++++++ .../src/_hypothesis_pytestplugin.py | 10 +- hypothesis-python/src/hypothesis/core.py | 185 ++++++++++++++---- .../hypothesis/internal/conjecture/data.py | 1 + .../src/hypothesis/internal/observability.py | 92 +++++++++ .../src/hypothesis/internal/reflection.py | 6 +- .../src/hypothesis/internal/scrutineer.py | 62 +++++- .../hypothesis/strategies/_internal/core.py | 11 +- .../hypothesis/strategies/_internal/utils.py | 41 ++++ .../tests/cover/test_observability.py | 67 +++++++ 
.../tests/cover/test_searchstrategy.py | 5 + mypy.ini | 2 + requirements/tools.in | 1 + requirements/tools.txt | 10 +- 21 files changed, 645 insertions(+), 42 deletions(-) create mode 100644 hypothesis-python/RELEASE.rst delete mode 100644 hypothesis-python/docs/_static/.empty create mode 100644 hypothesis-python/docs/_static/wrap-in-tables.css create mode 100644 hypothesis-python/docs/observability.rst create mode 100644 hypothesis-python/docs/schema_observations.json create mode 100644 hypothesis-python/src/hypothesis/internal/observability.py create mode 100644 hypothesis-python/tests/cover/test_observability.py diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst new file mode 100644 index 0000000000..0f7eff12b2 --- /dev/null +++ b/hypothesis-python/RELEASE.rst @@ -0,0 +1,4 @@ +RELEASE_TYPE: minor + +This release adds an experimental :wikipedia:`observability ` +mode. :doc:`You can read the docs about it here `. diff --git a/hypothesis-python/docs/_static/.empty b/hypothesis-python/docs/_static/.empty deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/hypothesis-python/docs/_static/wrap-in-tables.css b/hypothesis-python/docs/_static/wrap-in-tables.css new file mode 100644 index 0000000000..3ff01dda29 --- /dev/null +++ b/hypothesis-python/docs/_static/wrap-in-tables.css @@ -0,0 +1,15 @@ +/* override table width restrictions */ +/* thanks to https://github.com/readthedocs/sphinx_rtd_theme/issues/117#issuecomment-153083280 */ +@media screen and (min-width: 767px) { + + .wy-table-responsive table td { + /* !important prevents the common CSS stylesheets from + overriding this as on RTD they are loaded after this stylesheet */ + white-space: normal !important; + } + + .wy-table-responsive { + overflow: visible !important; + } + +} diff --git a/hypothesis-python/docs/changes.rst b/hypothesis-python/docs/changes.rst index 97e4162400..e0f3c2fb65 100644 --- a/hypothesis-python/docs/changes.rst +++ 
b/hypothesis-python/docs/changes.rst @@ -2362,7 +2362,7 @@ Did you know that of the 2\ :superscript:`64` possible floating-point numbers, While nans *usually* have all zeros in the sign bit and mantissa, this `isn't always true `__, -and :wikipedia:`'signaling' nans might trap or error `. +and :wikipedia:`'signaling' nans might trap or error `. To help distinguish such errors in e.g. CI logs, Hypothesis now prints ``-nan`` for negative nans, and adds a comment like ``# Saw 3 signaling NaNs`` if applicable. diff --git a/hypothesis-python/docs/conf.py b/hypothesis-python/docs/conf.py index c4141e304e..8b7624f6fe 100644 --- a/hypothesis-python/docs/conf.py +++ b/hypothesis-python/docs/conf.py @@ -30,6 +30,7 @@ "hoverxref.extension", "sphinx_codeautolink", "sphinx_selective_exclude.eager_only", + "sphinx-jsonschema", ] templates_path = ["_templates"] @@ -147,6 +148,8 @@ def setup(app): html_static_path = ["_static"] +html_css_files = ["wrap-in-tables.css"] + htmlhelp_basename = "Hypothesisdoc" html_favicon = "../../brand/favicon.ico" diff --git a/hypothesis-python/docs/index.rst b/hypothesis-python/docs/index.rst index 41fd7abee7..9cbca1702d 100644 --- a/hypothesis-python/docs/index.rst +++ b/hypothesis-python/docs/index.rst @@ -80,3 +80,4 @@ check out some of the support packaging reproducing + observability diff --git a/hypothesis-python/docs/observability.rst b/hypothesis-python/docs/observability.rst new file mode 100644 index 0000000000..4d0493c20d --- /dev/null +++ b/hypothesis-python/docs/observability.rst @@ -0,0 +1,76 @@ +=================== +Observability tools +=================== + +.. warning:: + + This feature is experimental, and could have breaking changes or even be removed + without notice. Try it out, let us know what you think, but don't rely on it + just yet! 
+ + +Motivation +========== + +Understanding what your code is doing - for example, why your test failed - is often +a frustrating exercise in adding some more instrumentation or logging (or ``print()`` calls) +and running it again. The idea of :wikipedia:`observability ` +is to let you answer questions you didn't think of in advance. In slogan form, + + *Debugging should be a data analysis problem.* + +By default, Hypothesis only reports the minimal failing example... but sometimes you might +want to know something about *all* the examples. Printing them to the terminal with +:ref:`verbose output ` might be nice, but isn't always enough. +This feature gives you an analysis-ready dataframe with useful columns and one row +per test case, with columns from arguments to code coverage to pass/fail status. + +This is deliberately a much lighter-weight and task-specific system than e.g. +`OpenTelemetry `__. It's also less detailed than time-travel +debuggers such as `rr `__ or `pytrace `__, +because there's no good way to compare multiple traces from these tools and their +Python support is relatively immature. + + +Configuration +============= + +If you set the ``HYPOTHESIS_EXPERIMENTAL_OBSERVABILITY`` environment variable, +Hypothesis will log various observations to jsonlines files in the +``.hypothesis/observed/`` directory. You can load and explore these with e.g. +:func:`pd.read_json(".hypothesis/observed/*_testcases.jsonl", lines=True) `, +or by using the :pypi:`sqlite-utils` and :pypi:`datasette` libraries:: + + sqlite-utils insert testcases.db testcases .hypothesis/observed/*_testcases.jsonl --nl --flatten + datasette serve testcases.db + + +Collecting more information +--------------------------- + +If you want to record more information about your test cases than the arguments and +outcome - for example, was ``x`` a binary tree? what was the difference between the +expected and the actual value? how many queries did it take to find a solution? 
- +Hypothesis makes this easy. + +:func:`~hypothesis.event` accepts a string label, and optionally a string or int or +float observation associated with it. All events are collected and summarized in +:ref:`statistics`, as well as included on a per-test-case basis in our observations. + +:func:`~hypothesis.target` is a special case of numeric-valued events: as well as +recording them in observations, Hypothesis will try to maximize the targeted value. +Knowing that, you can use this to guide the search for failing inputs. + + +Data Format +=========== + +We dump observations in `json lines format `__, with each line +describing either a test case or an information message. The tables below are derived +from :download:`this machine-readable JSON schema `, to +provide both readable and verifiable specifications. + +.. jsonschema:: ./schema_observations.json#/oneOf/0 + :hide_key: /additionalProperties, /type +.. jsonschema:: ./schema_observations.json#/oneOf/1 + :hide_key: /additionalProperties, /type diff --git a/hypothesis-python/docs/schema_observations.json b/hypothesis-python/docs/schema_observations.json new file mode 100644 index 0000000000..48b44a797f --- /dev/null +++ b/hypothesis-python/docs/schema_observations.json @@ -0,0 +1,93 @@ +{ + "title": "PBT Observations", + "description": "PBT Observations define a standard way to communicate what happened when property-based tests were run. They describe test cases, or general notifications classified as info, alert, or error messages.", + "oneOf": [ + { + "title": "Test case", + "description": "Describes the inputs to and result of running some test function on a particular input. The test might have passed, failed, or been abandoned part way through (e.g. because we failed a ``.filter()`` condition).", + "type": "object", + "properties": { + "type": { + "const": "test_case", + "description": "A tag which labels this observation as data about a specific test case." 
+ }, + "status": { + "enum": ["passed", "failed", "gave_up"], + "description": "Whether the test passed, failed, or was aborted before completion (e.g. due to use of ``.filter()``). Note that if we gave_up partway, values such as arguments and features may be incomplete." + }, + "status_reason": { + "type": "string", + "description": "If non-empty, the reason for which the test failed or was abandoned. For Hypothesis, this is usually the exception type and location." + }, + "representation": { + "type": "string", + "description": "The string representation of the input." + }, + "arguments": { + "type": "object", + "description": "A structured json-encoded representation of the input. Hypothesis always provides a dictionary of argument names to json-ified values, including interactive draws from the :func:`~hypothesis.strategies.data` strategy. In other libraries this can be any object." + }, + "how_generated": { + "type": ["string", "null"], + "description": "How the input was generated, if known. In Hypothesis this might be an explicit example, generated during a particular phase with some backend, or by replaying the minimal failing example." + }, + "features": { + "type": "object", + "description": "Runtime observations which might help explain what this test case did. Hypothesis includes target() scores, tags from event(), time spent generating data and running user code, and so on." + }, + "coverage": { + "type": ["object", "null"], + "description": "Mapping of filename to list of covered line numbers, if coverage information is available, or None if not. Hypothesis deliberately omits stdlib and site-packages code.", + "additionalProperties": { + "type": "array", + "items": {"type": "integer", "minimum": 1}, + "uniqueItems": true + } + }, + "metadata": { + "type": "object", + "description": "Arbitrary metadata which might be of interest, but does not semantically fit in 'features'. For example, Hypothesis includes the traceback for failing tests here." 
+ }, + "property": { + "type": "string", + "description": "The name or representation of the test function we're running." + }, + "run_start": { + "type": "number", + "description": "unix timestamp at which we started running this test function, so that later analysis can group test cases by run." + } + }, + "required": ["type", "status", "status_reason", "representation", "arguments", "how_generated", "features", "coverage", "metadata", "property", "run_start"], + "additionalProperties": false + }, + { + "title": "Information message", + "description": "Info, alert, and error messages correspond to a group of test cases or the overall run, and are intended for humans rather than machine analysis.", + "type": "object", + "properties": { + "type": { + "enum": [ "info", "alert", "error"], + "description": "A tag which labels this observation as general information to show the user. Hypothesis uses info messages to report statistics; alert or error messages can be provided by plugins." + }, + "title": { + "type": "string", + "description": "The title of this message" + }, + "content": { + "type": "string", + "description": "The body of the message. May use markdown." + }, + "property": { + "type": "string", + "description": "The name or representation of the test function we're running. For Hypothesis, usually the Pytest nodeid." + }, + "run_start": { + "type": "number", + "description": "unix timestamp at which we started running this test function, so that later analysis can group test cases by run." 
+ } + }, + "required": [ "type", "title", "content", "property", "run_start"], + "additionalProperties": false + } + ] +} \ No newline at end of file diff --git a/hypothesis-python/src/_hypothesis_pytestplugin.py b/hypothesis-python/src/_hypothesis_pytestplugin.py index 3bb2535f3b..9875e067f5 100644 --- a/hypothesis-python/src/_hypothesis_pytestplugin.py +++ b/hypothesis-python/src/_hypothesis_pytestplugin.py @@ -373,6 +373,13 @@ def pytest_terminal_summary(terminalreporter): if fex: failing_examples.append(json.loads(fex)) + from hypothesis.internal.observability import _WROTE_TO + + if _WROTE_TO: + terminalreporter.section("Hypothesis") + for fname in sorted(_WROTE_TO): + terminalreporter.write_line(f"observations written to {fname}") + if failing_examples: # This must have been imported already to write the failing examples from hypothesis.extra._patching import gc_patches, make_patch, save_patch @@ -384,7 +391,8 @@ def pytest_terminal_summary(terminalreporter): except Exception: # fail gracefully if we hit any filesystem or permissions problems return - terminalreporter.section("Hypothesis") + if not _WROTE_TO: + terminalreporter.section("Hypothesis") terminalreporter.write_line( f"`git apply {fname}` to add failing examples to your code." 
) diff --git a/hypothesis-python/src/hypothesis/core.py b/hypothesis-python/src/hypothesis/core.py index b2298418c8..82e359fc2f 100644 --- a/hypothesis-python/src/hypothesis/core.py +++ b/hypothesis-python/src/hypothesis/core.py @@ -87,6 +87,11 @@ get_trimmed_traceback, ) from hypothesis.internal.healthcheck import fail_health_check +from hypothesis.internal.observability import ( + TESTCASE_CALLBACKS, + deliver_json_blob, + make_testcase, +) from hypothesis.internal.reflection import ( convert_positional_arguments, define_function_signature, @@ -99,7 +104,12 @@ proxies, repr_call, ) -from hypothesis.internal.scrutineer import Tracer, explanatory_lines +from hypothesis.internal.scrutineer import ( + Trace, + Tracer, + explanatory_lines, + tractable_coverage_report, +) from hypothesis.internal.validation import check_type from hypothesis.reporting import ( current_verbosity, @@ -107,13 +117,14 @@ verbose_report, with_reporter, ) -from hypothesis.statistics import describe_targets, note_statistics +from hypothesis.statistics import describe_statistics, describe_targets, note_statistics from hypothesis.strategies._internal.misc import NOTHING from hypothesis.strategies._internal.strategies import ( Ex, SearchStrategy, check_strategy, ) +from hypothesis.strategies._internal.utils import to_jsonable from hypothesis.vendor.pretty import RepresentationPrinter from hypothesis.version import __version__ @@ -484,13 +495,14 @@ def execute_explicit_examples(state, wrapped_test, arguments, kwargs, original_s with local_settings(state.settings): fragments_reported = [] + empty_data = ConjectureData.for_buffer(b"") try: bits = ", ".join(nicerepr(x) for x in arguments) + ", ".join( f"{k}={nicerepr(v)}" for k, v in example_kwargs.items() ) execute_example = partial( state.execute_once, - ConjectureData.for_buffer(b""), + empty_data, is_final=True, print_example=True, example_kwargs=example_kwargs, @@ -544,7 +556,8 @@ def execute_explicit_examples(state, wrapped_test, arguments, 
kwargs, original_s # development, this is rather useful to replay Hypothesis' part of # a saved failure when other arguments are supplied by e.g. pytest. # See https://github.com/HypothesisWorks/hypothesis/issues/2125 - pass + with contextlib.suppress(StopTest): + empty_data.conclude_test(Status.INVALID) except BaseException as err: # In order to support reporting of multiple failing examples, we yield # each of the (report text, error) pairs we find back to the top-level @@ -567,6 +580,8 @@ def execute_explicit_examples(state, wrapped_test, arguments, kwargs, original_s new.__cause__ = err err = new + with contextlib.suppress(StopTest): + empty_data.conclude_test(Status.INVALID) yield (fragments_reported, err) if ( state.settings.report_multiple_bugs @@ -583,6 +598,15 @@ def execute_explicit_examples(state, wrapped_test, arguments, kwargs, original_s "Falsifying example", "Falsifying explicit example", 1 ) + tc = make_testcase( + start_timestamp=state._start_timestamp, + test_name_or_nodeid=state.test_identifier, + data=empty_data, + how_generated="explicit example", + string_repr=state._string_repr, + ) + deliver_json_blob(tc) + if fragments_reported: verbose_report(fragments_reported[0].replace("Falsifying", "Trying", 1)) for f in fragments_reported[1:]: @@ -738,7 +762,6 @@ def __init__(self, stuff, test, settings, random, wrapped_test): self.last_exception = None self.falsifying_examples = () self.random = random - self.__test_runtime = None self.ever_executed = False self.is_find = getattr(wrapped_test, "_hypothesis_internal_is_find", False) @@ -756,6 +779,16 @@ def __init__(self, stuff, test, settings, random, wrapped_test): self.failed_due_to_deadline = False self.explain_traces = defaultdict(set) + self._start_timestamp = time.time() + self._string_repr = "" + self._jsonable_arguments = {} + self._timing_features = {} + + @property + def test_identifier(self): + return getattr( + current_pytest_item.value, "nodeid", None + ) or 
get_pretty_function_description(self.wrapped_test) def execute_once( self, @@ -780,6 +813,7 @@ def execute_once( self.ever_executed = True data.is_find = self.is_find + self._string_repr = "" text_repr = None if self.settings.deadline is None: test = self.test @@ -787,16 +821,23 @@ def execute_once( @proxies(self.test) def test(*args, **kwargs): - self.__test_runtime = None + arg_drawtime = sum(data.draw_times) initial_draws = len(data.draw_times) start = time.perf_counter() - result = self.test(*args, **kwargs) - finish = time.perf_counter() - internal_draw_time = sum(data.draw_times[initial_draws:]) - runtime = datetime.timedelta( - seconds=finish - start - internal_draw_time - ) - self.__test_runtime = runtime + try: + result = self.test(*args, **kwargs) + finally: + finish = time.perf_counter() + internal_draw_time = sum(data.draw_times[initial_draws:]) + runtime = datetime.timedelta( + seconds=finish - start - internal_draw_time + ) + self._timing_features = { + "time_running_test": finish - start - internal_draw_time, + "time_drawing_args": arg_drawtime, + "time_interactive_draws": internal_draw_time, + } + current_deadline = self.settings.deadline if not is_final: current_deadline = (current_deadline // 4) * 5 @@ -855,6 +896,26 @@ def run(data): ), ) report(printer.getvalue()) + + if TESTCASE_CALLBACKS: + printer = RepresentationPrinter(context=context) + printer.repr_call( + test.__name__, + args, + kwargs, + force_split=True, + arg_slices=argslices, + leading_comment=( + "# " + context.data.slice_comments[(0, 0)] + if (0, 0) in context.data.slice_comments + else None + ), + ) + self._string_repr = printer.getvalue() + self._jsonable_arguments = { + **dict(enumerate(map(to_jsonable, args))), + **{k: to_jsonable(v) for k, v in kwargs.items()}, + } return test(*args, **kwargs) # self.test_runner can include the execute_example method, or setup/teardown @@ -870,9 +931,8 @@ def run(data): # instead raise an appropriate diagnostic error. 
if expected_failure is not None: exception, traceback = expected_failure - if ( - isinstance(exception, DeadlineExceeded) - and self.__test_runtime is not None + if isinstance(exception, DeadlineExceeded) and ( + runtime_secs := self._timing_features.get("time_running_test") ): report( "Unreliable test timings! On an initial run, this " @@ -884,7 +944,7 @@ def run(data): % ( exception.runtime.total_seconds() * 1000, self.settings.deadline.total_seconds() * 1000, - self.__test_runtime.total_seconds() * 1000, + runtime_secs * 1000, ) ) else: @@ -895,7 +955,7 @@ def run(data): ) from exception return result - def _execute_once_for_engine(self, data): + def _execute_once_for_engine(self, data: ConjectureData) -> None: """Wrapper around ``execute_once`` that intercepts test failure exceptions and single-test control exceptions, and turns them into appropriate method calls to `data` instead. @@ -903,16 +963,18 @@ def _execute_once_for_engine(self, data): This allows the engine to assume that any exception other than ``StopTest`` must be a fatal error, and should stop the entire engine. """ + trace: Trace = set() try: - trace = frozenset() - if ( + _can_trace = ( + sys.gettrace() is None or sys.version_info[:2] >= (3, 12) + ) and not PYPY + _trace_obs = TESTCASE_CALLBACKS + _trace_failure = ( self.failed_normally and not self.failed_due_to_deadline - and Phase.shrink in self.settings.phases - and Phase.explain in self.settings.phases - and (sys.gettrace() is None or sys.version_info[:2] >= (3, 12)) - and not PYPY - ): # pragma: no cover + and {Phase.shrink, Phase.explain}.issubset(self.settings.phases) + ) + if _can_trace and (_trace_obs or _trace_failure): # pragma: no cover # This is in fact covered by our *non-coverage* tests, but due to the # settrace() contention *not* by our coverage tests. Ah well. 
with Tracer() as tracer: @@ -921,7 +983,7 @@ def _execute_once_for_engine(self, data): if data.status == Status.VALID: self.explain_traces[None].add(frozenset(tracer.branches)) finally: - trace = frozenset(tracer.branches) + trace = tracer.branches else: result = self.execute_once(data) if result is not None: @@ -964,20 +1026,40 @@ def _execute_once_for_engine(self, data): tb = get_trimmed_traceback() info = data.extra_information - info.__expected_traceback = format_exception(e, tb) - info.__expected_exception = e - verbose_report(info.__expected_traceback) + info._expected_traceback = format_exception(e, tb) # type: ignore + info._expected_exception = e # type: ignore + verbose_report(info._expected_traceback) # type: ignore self.failed_normally = True interesting_origin = InterestingOrigin.from_exception(e) if trace: # pragma: no cover # Trace collection is explicitly disabled under coverage. - self.explain_traces[interesting_origin].add(trace) + self.explain_traces[interesting_origin].add(frozenset(trace)) if interesting_origin[0] == DeadlineExceeded: self.failed_due_to_deadline = True self.explain_traces.clear() - data.mark_interesting(interesting_origin) + data.mark_interesting(interesting_origin) # type: ignore # mypy bug? + finally: + # Conditional here so we can save some time constructing the payload; in + # other cases (without coverage) it's cheap enough to do that regardless. 
+ if TESTCASE_CALLBACKS: + if self.failed_normally or self.failed_due_to_deadline: + phase = "shrink" + else: + phase = "unknown" + tc = make_testcase( + start_timestamp=self._start_timestamp, + test_name_or_nodeid=self.test_identifier, + data=data, + how_generated=f"generated during {phase} phase", + string_repr=self._string_repr, + arguments={**self._jsonable_arguments, **data._observability_args}, + metadata=self._timing_features, + coverage=tractable_coverage_report(trace) or None, + ) + deliver_json_blob(tc) + self._timing_features.clear() def run_engine(self): """Run the test function many times, on database input and generated @@ -1003,6 +1085,15 @@ def run_engine(self): # on different inputs. runner.run() note_statistics(runner.statistics) + deliver_json_blob( + { + "type": "info", + "run_start": self._start_timestamp, + "property": self.test_identifier, + "title": "Hypothesis Statistics", + "content": describe_statistics(runner.statistics), + } + ) if runner.call_count == 0: return @@ -1041,7 +1132,9 @@ def run_engine(self): falsifying_example.buffer ) ran_example.slice_comments = falsifying_example.slice_comments - assert info.__expected_exception is not None + tb = None + origin = None + assert info._expected_exception is not None try: with with_reporter(fragments.append): self.execute_once( @@ -1049,8 +1142,8 @@ def run_engine(self): print_example=not self.is_find, is_final=True, expected_failure=( - info.__expected_exception, - info.__expected_traceback, + info._expected_exception, + info._expected_traceback, ), ) except (UnsatisfiedAssumption, StopTest) as e: @@ -1066,10 +1159,34 @@ def run_engine(self): errors_to_report.append( (fragments, e.with_traceback(get_trimmed_traceback())) ) + tb = format_exception(e, get_trimmed_traceback(e)) + origin = InterestingOrigin.from_exception(e) else: # execute_once() will always raise either the expected error, or Flaky. 
raise NotImplementedError("This should be unreachable") finally: + # log our observability line for the final failing example + tc = { + "type": "test_case", + "run_start": self._start_timestamp, + "property": self.test_identifier, + "status": "passed" if sys.exc_info()[0] else "failed", + "status_reason": str(origin or "unexpected/flaky pass"), + "representation": self._string_repr, + "how_generated": "minimal failing example", + "features": { + **{ + k: v + for k, v in ran_example.target_observations.items() + if isinstance(k, str) + }, + **ran_example.events, + **self._timing_features, + }, + "coverage": None, # TODO: expose this? + "metadata": {"traceback": tb}, + } + deliver_json_blob(tc) # Whether or not replay actually raised the exception again, we want # to print the reproduce_failure decorator for the failing example. if self.settings.print_blob: diff --git a/hypothesis-python/src/hypothesis/internal/conjecture/data.py b/hypothesis-python/src/hypothesis/internal/conjecture/data.py index c7916d10a2..7a5542b0bd 100644 --- a/hypothesis-python/src/hypothesis/internal/conjecture/data.py +++ b/hypothesis-python/src/hypothesis/internal/conjecture/data.py @@ -1456,6 +1456,7 @@ def __init__( # try varying, to report if the minimal example always fails anyway. self.arg_slices: Set[Tuple[int, int]] = set() self.slice_comments: Dict[Tuple[int, int], str] = {} + self._observability_args: Dict[str, Any] = {} self.extra_information = ExtraInformation() diff --git a/hypothesis-python/src/hypothesis/internal/observability.py b/hypothesis-python/src/hypothesis/internal/observability.py new file mode 100644 index 0000000000..6752868737 --- /dev/null +++ b/hypothesis-python/src/hypothesis/internal/observability.py @@ -0,0 +1,92 @@ +# This file is part of Hypothesis, which may be found at +# https://github.com/HypothesisWorks/hypothesis/ +# +# Copyright the Hypothesis Authors. +# Individual contributors are listed in AUTHORS.rst and the git log. 
+# +# This Source Code Form is subject to the terms of the Mozilla Public License, +# v. 2.0. If a copy of the MPL was not distributed with this file, You can +# obtain one at https://mozilla.org/MPL/2.0/. + +"""Observability tools to spit out analysis-ready tables, one row per test case.""" + +import json +import os +from datetime import date, timedelta +from typing import Callable, Dict, List, Optional + +from hypothesis.configuration import storage_directory +from hypothesis.internal.conjecture.data import ConjectureData, Status + +TESTCASE_CALLBACKS: List[Callable[[dict], None]] = [] + + +def deliver_json_blob(value: dict) -> None: + for callback in TESTCASE_CALLBACKS: + callback(value) + + +def make_testcase( + *, + start_timestamp: float, + test_name_or_nodeid: str, + data: ConjectureData, + how_generated: str = "unknown", + string_repr: str = "", + arguments: Optional[dict] = None, + metadata: Optional[dict] = None, + coverage: Optional[Dict[str, List[int]]] = None, +) -> dict: + if data.interesting_origin: + status_reason = str(data.interesting_origin) + else: + status_reason = str(data.events.pop("invalid because", "")) + + return { + "type": "test_case", + "run_start": start_timestamp, + "property": test_name_or_nodeid, + "status": { + Status.OVERRUN: "gave_up", + Status.INVALID: "gave_up", + Status.VALID: "passed", + Status.INTERESTING: "failed", + }[data.status], + "status_reason": status_reason, + "representation": string_repr, + "arguments": arguments or {}, + "how_generated": how_generated, # iid, mutation, etc. 
+ "features": { + **{ + f"target:{k}".strip(":"): v for k, v in data.target_observations.items() + }, + **data.events, + }, + "metadata": { + **(metadata or {}), + "traceback": getattr(data.extra_information, "_expected_traceback", None), + }, + "coverage": coverage, + } + + +_WROTE_TO = set() + + +def _deliver_to_file(value): # pragma: no cover + kind = "testcases" if value["type"] == "test_case" else "info" + fname = storage_directory("observed", f"{date.today().isoformat()}_{kind}.jsonl") + fname.parent.mkdir(exist_ok=True) + _WROTE_TO.add(fname) + with fname.open(mode="a") as f: + f.write(json.dumps(value) + "\n") + + +if "HYPOTHESIS_EXPERIMENTAL_OBSERVABILITY" in os.environ: # pragma: no cover + TESTCASE_CALLBACKS.append(_deliver_to_file) + + # Remove files more than a week old, to cap the size on disk + max_age = (date.today() - timedelta(days=8)).isoformat() + for f in storage_directory("observed").glob("*.jsonl"): + if f.stem < max_age: # pragma: no branch + f.unlink(missing_ok=True) diff --git a/hypothesis-python/src/hypothesis/internal/reflection.py b/hypothesis-python/src/hypothesis/internal/reflection.py index 31123b61ec..2f0480c987 100644 --- a/hypothesis-python/src/hypothesis/internal/reflection.py +++ b/hypothesis-python/src/hypothesis/internal/reflection.py @@ -306,8 +306,12 @@ def extract_lambda_source(f): This is not a good function and I am sorry for it. Forgive me my sins, oh lord """ + # You might be wondering how a lambda can have a return-type annotation? + # The answer is that we add this at runtime, in new_given_signature(), + # and we do support strange choices as applying @given() to a lambda. 
sig = inspect.signature(f) - assert sig.return_annotation is inspect.Parameter.empty + assert sig.return_annotation in (inspect.Parameter.empty, None), sig + if sig.parameters: if_confused = f"lambda {str(sig)[1:-1]}: " else: diff --git a/hypothesis-python/src/hypothesis/internal/scrutineer.py b/hypothesis-python/src/hypothesis/internal/scrutineer.py index 5b372ffd65..39352844b4 100644 --- a/hypothesis-python/src/hypothesis/internal/scrutineer.py +++ b/hypothesis-python/src/hypothesis/internal/scrutineer.py @@ -8,16 +8,29 @@ # v. 2.0. If a copy of the MPL was not distributed with this file, You can # obtain one at https://mozilla.org/MPL/2.0/. +import functools +import os +import subprocess import sys import types from collections import defaultdict from functools import lru_cache, reduce from os import sep from pathlib import Path +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple from hypothesis._settings import Phase, Verbosity from hypothesis.internal.escalation import is_hypothesis_file +if TYPE_CHECKING: + from typing import TypeAlias +else: + TypeAlias = object + +Location: TypeAlias = Tuple[str, int] +Branch: TypeAlias = Tuple[Optional[Location], Location] +Trace: TypeAlias = Set[Branch] + @lru_cache(maxsize=None) def should_trace_file(fname): @@ -41,7 +54,7 @@ class Tracer: __slots__ = ("branches", "_previous_location") def __init__(self): - self.branches = set() + self.branches: Trace = set() self._previous_location = None def trace(self, frame, event, arg): @@ -179,3 +192,50 @@ def explanatory_lines(traces, settings): explanations = get_explaining_locations(traces) max_lines = 5 if settings.verbosity <= Verbosity.normal else float("inf") return make_report(explanations, cap_lines_at=max_lines) + + +# beware the code below; we're using some heuristics to make a nicer report... 
+ + +@functools.lru_cache +def _get_git_repo_root() -> Path: + try: + where = subprocess.run( + ["git", "rev-parse", "--show-toplevel"], + check=True, + timeout=10, + capture_output=True, + text=True, + encoding="utf-8", + ).stdout.strip() + except Exception: # pragma: no cover + return Path().absolute().parents[-1] + else: + return Path(where) + + +if sys.version_info[:2] <= (3, 8): + + def is_relative_to(self, other): + return other == self or other in self.parents + +else: + is_relative_to = Path.is_relative_to + + +def tractable_coverage_report(trace: Trace) -> Dict[str, List[int]]: + """Report a simple coverage map which is (probably most) of the user's code.""" + coverage: dict = {} + t = dict(trace) + for file, line in set(t.keys()).union(t.values()) - {None}: # type: ignore + # On Python <= 3.11, we can use coverage.py xor Hypothesis' tracer, + # so the trace will be empty and this line never run under coverage. + coverage.setdefault(file, set()).add(line) # pragma: no cover + stdlib_fragment = f"{os.sep}lib{os.sep}python3.{sys.version_info.minor}{os.sep}" + return { + k: sorted(v) + for k, v in coverage.items() + if stdlib_fragment not in k + and is_relative_to(p := Path(k), _get_git_repo_root()) + and "site-packages" not in p.parts + } diff --git a/hypothesis-python/src/hypothesis/strategies/_internal/core.py b/hypothesis-python/src/hypothesis/strategies/_internal/core.py index b8c5601587..a5a862635a 100644 --- a/hypothesis-python/src/hypothesis/strategies/_internal/core.py +++ b/hypothesis-python/src/hypothesis/strategies/_internal/core.py @@ -129,7 +129,11 @@ OneCharStringStrategy, TextStrategy, ) -from hypothesis.strategies._internal.utils import cacheable, defines_strategy +from hypothesis.strategies._internal.utils import ( + cacheable, + defines_strategy, + to_jsonable, +) from hypothesis.utils.conventions import not_set from hypothesis.vendor.pretty import RepresentationPrinter @@ -2098,8 +2102,9 @@ def draw(self, strategy: SearchStrategy[Ex], 
 label: Any = None) -> Ex:
         result = self.conjecture_data.draw(strategy)
         self.count += 1
         printer = RepresentationPrinter(context=current_build_context())
-        printer.text(f"Draw {self.count}")
-        printer.text(": " if label is None else f" ({label}): ")
+        desc = f"Draw {self.count}{'' if label is None else f' ({label})'}: "
+        self.conjecture_data._observability_args[desc] = to_jsonable(result)
+        printer.text(desc)
         printer.pretty(result)
         note(printer.getvalue())
         return result
diff --git a/hypothesis-python/src/hypothesis/strategies/_internal/utils.py b/hypothesis-python/src/hypothesis/strategies/_internal/utils.py
index b0e0746314..995b179b40 100644
--- a/hypothesis-python/src/hypothesis/strategies/_internal/utils.py
+++ b/hypothesis-python/src/hypothesis/strategies/_internal/utils.py
@@ -8,13 +8,17 @@
 # v. 2.0. If a copy of the MPL was not distributed with this file, You can
 # obtain one at https://mozilla.org/MPL/2.0/.
 
+import sys
 import threading
 from inspect import signature
 from typing import TYPE_CHECKING, Callable, Dict
 
+import attr
+
 from hypothesis.internal.cache import LRUReusedCache
 from hypothesis.internal.floats import float_to_int
 from hypothesis.internal.reflection import proxies
+from hypothesis.vendor.pretty import pretty
 
 if TYPE_CHECKING:
     from hypothesis.strategies._internal.strategies import SearchStrategy, T
@@ -144,3 +148,40 @@ def accept(*args, **kwargs):
 
         return accept
 
     return decorator
+
+
+def to_jsonable(obj: object) -> object:
+    """Recursively convert an object to json-encodable form.
+
+    This is not intended to round-trip, but rather provide an analysis-ready
+    format for observability. To avoid side effects, we pretty-print all but
+    known types.
+ """ + if isinstance(obj, (str, int, float, bool, type(None))): + if isinstance(obj, int) and abs(obj) >= 2**63: + return float(obj) + return obj + if isinstance(obj, (list, tuple, set, frozenset)): + if isinstance(obj, tuple) and hasattr(obj, "_asdict"): + return to_jsonable(obj._asdict()) # treat namedtuples as dicts + return [to_jsonable(x) for x in obj] + if isinstance(obj, dict): + return { + k if isinstance(k, str) else pretty(k): to_jsonable(v) + for k, v in obj.items() + } + + # Special handling for dataclasses, attrs, and pydantic classes + if ( + (dcs := sys.modules.get("dataclasses")) + and dcs.is_dataclass(obj) + and not isinstance(obj, type) + ): + return to_jsonable(dcs.asdict(obj)) + if attr.has(type(obj)): + return to_jsonable(attr.asdict(obj, recurse=False)) # type: ignore + if (pyd := sys.modules.get("pydantic")) and isinstance(obj, pyd.BaseModel): + return to_jsonable(obj.model_dump()) + + # If all else fails, we'll just pretty-print as a string. + return pretty(obj) diff --git a/hypothesis-python/tests/cover/test_observability.py b/hypothesis-python/tests/cover/test_observability.py new file mode 100644 index 0000000000..761f6c768d --- /dev/null +++ b/hypothesis-python/tests/cover/test_observability.py @@ -0,0 +1,67 @@ +# This file is part of Hypothesis, which may be found at +# https://github.com/HypothesisWorks/hypothesis/ +# +# Copyright the Hypothesis Authors. +# Individual contributors are listed in AUTHORS.rst and the git log. +# +# This Source Code Form is subject to the terms of the Mozilla Public License, +# v. 2.0. If a copy of the MPL was not distributed with this file, You can +# obtain one at https://mozilla.org/MPL/2.0/. 
+ +import contextlib + +import pytest + +from hypothesis import ( + assume, + event, + example, + given, + seed, + settings, + strategies as st, + target, +) +from hypothesis.database import InMemoryExampleDatabase +from hypothesis.internal.observability import TESTCASE_CALLBACKS + + +@contextlib.contextmanager +def capture_observations(): + ls = [] + TESTCASE_CALLBACKS.append(ls.append) + try: + yield ls + finally: + TESTCASE_CALLBACKS.remove(ls.append) + + +@seed("deterministic so we don't miss some combination of features") +@example(a=0, x=4, data=None) +@settings(database=InMemoryExampleDatabase()) +@given(st.integers(), st.integers(), st.data()) +def do_it_all(a, x, data): + event(f"{x%2=}") + target(x % 5, label="x%5") + assume(a % 9) + if data: + data.draw(st.text("abcdef", min_size=a % 3), label="interactive") + 1 / ((x or 1) % 7) + + +def test_observability(): + with capture_observations() as ls: + with pytest.raises(ZeroDivisionError): + do_it_all() + with pytest.raises(ZeroDivisionError): + do_it_all() + + infos = [t for t in ls if t["type"] == "info"] + assert len(infos) == 2 + assert {t["title"] for t in infos} == {"Hypothesis Statistics"} + + testcases = [t for t in ls if t["type"] == "test_case"] + assert len(testcases) > 50 + assert {t["property"] for t in testcases} == {do_it_all.__name__} + assert len({t["run_start"] for t in testcases}) == 2 + assert {t["status"] for t in testcases} == {"gave_up", "passed", "failed"} diff --git a/hypothesis-python/tests/cover/test_searchstrategy.py b/hypothesis-python/tests/cover/test_searchstrategy.py index ef93f0c47a..b6b6c498e9 100644 --- a/hypothesis-python/tests/cover/test_searchstrategy.py +++ b/hypothesis-python/tests/cover/test_searchstrategy.py @@ -16,6 +16,7 @@ from hypothesis.errors import InvalidArgument from hypothesis.internal.conjecture.data import ConjectureData from hypothesis.strategies import booleans, integers, just, none, tuples +from hypothesis.strategies._internal.utils import to_jsonable 
from tests.common.debug import assert_no_examples @@ -85,3 +86,7 @@ def test_can_flatmap_nameless(): def test_flatmap_with_invalid_expand(): with pytest.raises(InvalidArgument): just(100).flatmap(lambda n: "a").example() + + +def test_jsonable(): + assert isinstance(to_jsonable(object()), str) diff --git a/mypy.ini b/mypy.ini index ecb6e6ab37..e0063628c1 100644 --- a/mypy.ini +++ b/mypy.ini @@ -15,3 +15,5 @@ warn_no_return = True warn_unused_ignores = True warn_unused_configs = True warn_redundant_casts = True + +disable_error_code = annotation-unchecked diff --git a/requirements/tools.in b/requirements/tools.in index 919959ac37..c94870297a 100644 --- a/requirements/tools.in +++ b/requirements/tools.in @@ -17,6 +17,7 @@ shed sphinx sphinx-codeautolink sphinx-hoverxref +sphinx-jsonschema sphinx-rtd-theme sphinx-selective-exclude tox diff --git a/requirements/tools.txt b/requirements/tools.txt index 512b443392..741d0b3bf3 100644 --- a/requirements/tools.txt +++ b/requirements/tools.txt @@ -60,6 +60,7 @@ docutils==0.20.1 # readme-renderer # restructuredtext-lint # sphinx + # sphinx-jsonschema # sphinx-rtd-theme dpcontracts==0.6.0 # via -r requirements/tools.in @@ -98,6 +99,8 @@ jeepney==0.8.0 # secretstorage jinja2==3.1.2 # via sphinx +jsonpointer==2.4 + # via sphinx-jsonschema keyring==24.3.0 # via twine lark==1.1.8 @@ -183,7 +186,9 @@ python-dateutil==2.8.2 pyupgrade==3.15.0 # via shed pyyaml==6.0.1 - # via libcst + # via + # libcst + # sphinx-jsonschema readme-renderer==42.0 # via twine requests==2.31.0 @@ -191,6 +196,7 @@ requests==2.31.0 # -r requirements/tools.in # requests-toolbelt # sphinx + # sphinx-jsonschema # twine requests-toolbelt==1.0.0 # via twine @@ -232,6 +238,8 @@ sphinx-codeautolink==0.15.0 # via -r requirements/tools.in sphinx-hoverxref==1.3.0 # via -r requirements/tools.in +sphinx-jsonschema==1.19.1 + # via -r requirements/tools.in sphinx-rtd-theme==2.0.0 # via -r requirements/tools.in sphinx-selective-exclude==1.0.3