From 6941cd27604c0f9d096ac93950f7b81e75c43d2f Mon Sep 17 00:00:00 2001 From: Zac Hatfield-Dodds Date: Sat, 9 Dec 2023 16:11:01 -0800 Subject: [PATCH 1/2] Update CODEOWNERS --- CODEOWNERS | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index ac74a335b6..eefcfa5318 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,7 +1,7 @@ -# Engine changes need to be approved by DRMacIver, as per +# Engine changes need to be approved by Zac-HD, as per # https://github.com/HypothesisWorks/hypothesis/blob/master/guides/review.rst#engine-changes -/hypothesis-python/src/hypothesis/internal/conjecture/ @DRMacIver @Zac-HD +/hypothesis-python/src/hypothesis/internal/conjecture/ @Zac-HD -# Changes to the paper also need to be approved by DRMacIver -/paper.md @DRMacIver -/paper.bib @DRMacIver +# Changes to the paper also need to be approved by DRMacIver or Zac, as authors +/paper.md @DRMacIver @Zac-HD +/paper.bib @DRMacIver @Zac-HD From 239c83602da2b6d0da87b76594771087f0994f04 Mon Sep 17 00:00:00 2001 From: Zac Hatfield-Dodds Date: Sat, 9 Dec 2023 16:11:01 -0800 Subject: [PATCH 2/2] Observability mode - output jsonlines --- hypothesis-python/RELEASE.rst | 4 + hypothesis-python/docs/_static/.empty | 0 .../docs/_static/wrap-in-tables.css | 15 ++ hypothesis-python/docs/changes.rst | 2 +- hypothesis-python/docs/conf.py | 3 + hypothesis-python/docs/index.rst | 1 + hypothesis-python/docs/observability.rst | 76 +++++++ .../docs/schema_observations.json | 93 +++++++++ .../src/_hypothesis_pytestplugin.py | 10 +- hypothesis-python/src/hypothesis/core.py | 185 ++++++++++++++---- .../hypothesis/internal/conjecture/data.py | 1 + .../src/hypothesis/internal/observability.py | 92 +++++++++ .../src/hypothesis/internal/reflection.py | 6 +- .../src/hypothesis/internal/scrutineer.py | 62 +++++- .../hypothesis/strategies/_internal/core.py | 11 +- .../hypothesis/strategies/_internal/utils.py | 41 ++++ .../tests/cover/test_observability.py | 67 +++++++ 
.../tests/cover/test_searchstrategy.py | 5 + mypy.ini | 2 + requirements/tools.in | 1 + requirements/tools.txt | 10 +- 21 files changed, 645 insertions(+), 42 deletions(-) create mode 100644 hypothesis-python/RELEASE.rst delete mode 100644 hypothesis-python/docs/_static/.empty create mode 100644 hypothesis-python/docs/_static/wrap-in-tables.css create mode 100644 hypothesis-python/docs/observability.rst create mode 100644 hypothesis-python/docs/schema_observations.json create mode 100644 hypothesis-python/src/hypothesis/internal/observability.py create mode 100644 hypothesis-python/tests/cover/test_observability.py diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst new file mode 100644 index 0000000000..0f7eff12b2 --- /dev/null +++ b/hypothesis-python/RELEASE.rst @@ -0,0 +1,4 @@ +RELEASE_TYPE: minor + +This release adds an experimental :wikipedia:`observability ` +mode. :doc:`You can read the docs about it here `. diff --git a/hypothesis-python/docs/_static/.empty b/hypothesis-python/docs/_static/.empty deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/hypothesis-python/docs/_static/wrap-in-tables.css b/hypothesis-python/docs/_static/wrap-in-tables.css new file mode 100644 index 0000000000..3ff01dda29 --- /dev/null +++ b/hypothesis-python/docs/_static/wrap-in-tables.css @@ -0,0 +1,15 @@ +/* override table width restrictions */ +/* thanks to https://github.com/readthedocs/sphinx_rtd_theme/issues/117#issuecomment-153083280 */ +@media screen and (min-width: 767px) { + + .wy-table-responsive table td { + /* !important prevents the common CSS stylesheets from + overriding this as on RTD they are loaded after this stylesheet */ + white-space: normal !important; + } + + .wy-table-responsive { + overflow: visible !important; + } + +} diff --git a/hypothesis-python/docs/changes.rst b/hypothesis-python/docs/changes.rst index 97e4162400..e0f3c2fb65 100644 --- a/hypothesis-python/docs/changes.rst +++ 
b/hypothesis-python/docs/changes.rst @@ -2362,7 +2362,7 @@ Did you know that of the 2\ :superscript:`64` possible floating-point numbers, While nans *usually* have all zeros in the sign bit and mantissa, this `isn't always true `__, -and :wikipedia:`'signaling' nans might trap or error `. +and :wikipedia:`'signaling' nans might trap or error `. To help distinguish such errors in e.g. CI logs, Hypothesis now prints ``-nan`` for negative nans, and adds a comment like ``# Saw 3 signaling NaNs`` if applicable. diff --git a/hypothesis-python/docs/conf.py b/hypothesis-python/docs/conf.py index c4141e304e..8b7624f6fe 100644 --- a/hypothesis-python/docs/conf.py +++ b/hypothesis-python/docs/conf.py @@ -30,6 +30,7 @@ "hoverxref.extension", "sphinx_codeautolink", "sphinx_selective_exclude.eager_only", + "sphinx-jsonschema", ] templates_path = ["_templates"] @@ -147,6 +148,8 @@ def setup(app): html_static_path = ["_static"] +html_css_files = ["wrap-in-tables.css"] + htmlhelp_basename = "Hypothesisdoc" html_favicon = "../../brand/favicon.ico" diff --git a/hypothesis-python/docs/index.rst b/hypothesis-python/docs/index.rst index 41fd7abee7..9cbca1702d 100644 --- a/hypothesis-python/docs/index.rst +++ b/hypothesis-python/docs/index.rst @@ -80,3 +80,4 @@ check out some of the support packaging reproducing + observability diff --git a/hypothesis-python/docs/observability.rst b/hypothesis-python/docs/observability.rst new file mode 100644 index 0000000000..4d0493c20d --- /dev/null +++ b/hypothesis-python/docs/observability.rst @@ -0,0 +1,76 @@ +=================== +Observability tools +=================== + +.. warning:: + + This feature is experimental, and could have breaking changes or even be removed + without notice. Try it out, let us know what you think, but don't rely on it + just yet! 
+ + +Motivation +========== + +Understanding what your code is doing - for example, why your test failed - is often +a frustrating exercise in adding some more instrumentation or logging (or ``print()`` calls) +and running it again. The idea of :wikipedia:`observability ` +is to let you answer questions you didn't think of in advance. In slogan form, + + *Debugging should be a data analysis problem.* + +By default, Hypothesis only reports the minimal failing example... but sometimes you might +want to know something about *all* the examples. Printing them to the terminal with +:ref:`verbose output ` might be nice, but isn't always enough. +This feature gives you an analysis-ready dataframe with useful columns and one row +per test case, with columns from arguments to code coverage to pass/fail status. + +This is deliberately a much lighter-weight and task-specific system than e.g. +`OpenTelemetry `__. It's also less detailed than time-travel +debuggers such as `rr `__ or `pytrace `__, +because there's no good way to compare multiple traces from these tools and their +Python support is relatively immature. + + +Configuration +============= + +If you set the ``HYPOTHESIS_EXPERIMENTAL_OBSERVABILITY`` environment variable, +Hypothesis will log various observations to jsonlines files in the +``.hypothesis/observed/`` directory. You can load and explore these with e.g. +:func:`pd.read_json(".hypothesis/observed/*_testcases.jsonl", lines=True) `, +or by using the :pypi:`sqlite-utils` and :pypi:`datasette` libraries:: + + sqlite-utils insert testcases.db testcases .hypothesis/observed/*_testcases.jsonl --nl --flatten + datasette serve testcases.db + + +Collecting more information +--------------------------- + +If you want to record more information about your test cases than the arguments and +outcome - for example, was ``x`` a binary tree? what was the difference between the +expected and the actual value? how many queries did it take to find a solution? 
- +Hypothesis makes this easy. + +:func:`~hypothesis.event` accepts a string label, and optionally a string or int or +float observation associated with it. All events are collected and summarized in +:ref:`statistics`, as well as included on a per-test-case basis in our observations. + +:func:`~hypothesis.target` is a special case of numeric-valued events: as well as +recording them in observations, Hypothesis will try to maximize the targeted value. +Knowing that, you can use this to guide the search for failing inputs. + + +Data Format +=========== + +We dump observations in `json lines format `__, with each line +describing either a test case or an information message. The tables below are derived +from :download:`this machine-readable JSON schema `, to +provide both readable and verifiable specifications. + +.. jsonschema:: ./schema_observations.json#/oneOf/0 + :hide_key: /additionalProperties, /type +.. jsonschema:: ./schema_observations.json#/oneOf/1 + :hide_key: /additionalProperties, /type diff --git a/hypothesis-python/docs/schema_observations.json b/hypothesis-python/docs/schema_observations.json new file mode 100644 index 0000000000..48b44a797f --- /dev/null +++ b/hypothesis-python/docs/schema_observations.json @@ -0,0 +1,93 @@ +{ + "title": "PBT Observations", + "description": "PBT Observations define a standard way to communicate what happened when property-based tests were run. They describe test cases, or general notifications classified as info, alert, or error messages.", + "oneOf": [ + { + "title": "Test case", + "description": "Describes the inputs to and result of running some test function on a particular input. The test might have passed, failed, or been abandoned part way through (e.g. because we failed a ``.filter()`` condition).", + "type": "object", + "properties": { + "type": { + "const": "test_case", + "description": "A tag which labels this observation as data about a specific test case." 
+ }, + "status": { + "enum": ["passed", "failed", "gave_up"], + "description": "Whether the test passed, failed, or was aborted before completion (e.g. due to use of ``.filter()``). Note that if we gave_up partway, values such as arguments and features may be incomplete." + }, + "status_reason": { + "type": "string", + "description": "If non-empty, the reason for which the test failed or was abandoned. For Hypothesis, this is usually the exception type and location." + }, + "representation": { + "type": "string", + "description": "The string representation of the input." + }, + "arguments": { + "type": "object", + "description": "A structured json-encoded representation of the input. Hypothesis always provides a dictionary of argument names to json-ified values, including interactive draws from the :func:`~hypothesis.strategies.data` strategy. In other libraries this can be any object." + }, + "how_generated": { + "type": ["string", "null"], + "description": "How the input was generated, if known. In Hypothesis this might be an explicit example, generated during a particular phase with some backend, or by replaying the minimal failing example." + }, + "features": { + "type": "object", + "description": "Runtime observations which might help explain what this test case did. Hypothesis includes target() scores, tags from event(), time spent generating data and running user code, and so on." + }, + "coverage": { + "type": ["object", "null"], + "description": "Mapping of filename to list of covered line numbers, if coverage information is available, or None if not. Hypothesis deliberately omits stdlib and site-packages code.", + "additionalProperties": { + "type": "array", + "items": {"type": "integer", "minimum": 1}, + "uniqueItems": true + } + }, + "metadata": { + "type": "object", + "description": "Arbitrary metadata which might be of interest, but does not semantically fit in 'features'. For example, Hypothesis includes the traceback for failing tests here." 
+ }, + "property": { + "type": "string", + "description": "The name or representation of the test function we're running." + }, + "run_start": { + "type": "number", + "description": "unix timestamp at which we started running this test function, so that later analysis can group test cases by run." + } + }, + "required": ["type", "status", "status_reason", "representation", "arguments", "how_generated", "features", "coverage", "metadata", "property", "run_start"], + "additionalProperties": false + }, + { + "title": "Information message", + "description": "Info, alert, and error messages correspond to a group of test cases or the overall run, and are intended for humans rather than machine analysis.", + "type": "object", + "properties": { + "type": { + "enum": [ "info", "alert", "error"], + "description": "A tag which labels this observation as general information to show the user. Hypothesis uses info messages to report statistics; alert or error messages can be provided by plugins." + }, + "title": { + "type": "string", + "description": "The title of this message" + }, + "content": { + "type": "string", + "description": "The body of the message. May use markdown." + }, + "property": { + "type": "string", + "description": "The name or representation of the test function we're running. For Hypothesis, usually the Pytest nodeid." + }, + "run_start": { + "type": "number", + "description": "unix timestamp at which we started running this test function, so that later analysis can group test cases by run." 
+ } + }, + "required": [ "type", "title", "content", "property", "run_start"], + "additionalProperties": false + } + ] +} \ No newline at end of file diff --git a/hypothesis-python/src/_hypothesis_pytestplugin.py b/hypothesis-python/src/_hypothesis_pytestplugin.py index 3bb2535f3b..9875e067f5 100644 --- a/hypothesis-python/src/_hypothesis_pytestplugin.py +++ b/hypothesis-python/src/_hypothesis_pytestplugin.py @@ -373,6 +373,13 @@ def pytest_terminal_summary(terminalreporter): if fex: failing_examples.append(json.loads(fex)) + from hypothesis.internal.observability import _WROTE_TO + + if _WROTE_TO: + terminalreporter.section("Hypothesis") + for fname in sorted(_WROTE_TO): + terminalreporter.write_line(f"observations written to {fname}") + if failing_examples: # This must have been imported already to write the failing examples from hypothesis.extra._patching import gc_patches, make_patch, save_patch @@ -384,7 +391,8 @@ def pytest_terminal_summary(terminalreporter): except Exception: # fail gracefully if we hit any filesystem or permissions problems return - terminalreporter.section("Hypothesis") + if not _WROTE_TO: + terminalreporter.section("Hypothesis") terminalreporter.write_line( f"`git apply {fname}` to add failing examples to your code." 
) diff --git a/hypothesis-python/src/hypothesis/core.py b/hypothesis-python/src/hypothesis/core.py index b2298418c8..82e359fc2f 100644 --- a/hypothesis-python/src/hypothesis/core.py +++ b/hypothesis-python/src/hypothesis/core.py @@ -87,6 +87,11 @@ get_trimmed_traceback, ) from hypothesis.internal.healthcheck import fail_health_check +from hypothesis.internal.observability import ( + TESTCASE_CALLBACKS, + deliver_json_blob, + make_testcase, +) from hypothesis.internal.reflection import ( convert_positional_arguments, define_function_signature, @@ -99,7 +104,12 @@ proxies, repr_call, ) -from hypothesis.internal.scrutineer import Tracer, explanatory_lines +from hypothesis.internal.scrutineer import ( + Trace, + Tracer, + explanatory_lines, + tractable_coverage_report, +) from hypothesis.internal.validation import check_type from hypothesis.reporting import ( current_verbosity, @@ -107,13 +117,14 @@ verbose_report, with_reporter, ) -from hypothesis.statistics import describe_targets, note_statistics +from hypothesis.statistics import describe_statistics, describe_targets, note_statistics from hypothesis.strategies._internal.misc import NOTHING from hypothesis.strategies._internal.strategies import ( Ex, SearchStrategy, check_strategy, ) +from hypothesis.strategies._internal.utils import to_jsonable from hypothesis.vendor.pretty import RepresentationPrinter from hypothesis.version import __version__ @@ -484,13 +495,14 @@ def execute_explicit_examples(state, wrapped_test, arguments, kwargs, original_s with local_settings(state.settings): fragments_reported = [] + empty_data = ConjectureData.for_buffer(b"") try: bits = ", ".join(nicerepr(x) for x in arguments) + ", ".join( f"{k}={nicerepr(v)}" for k, v in example_kwargs.items() ) execute_example = partial( state.execute_once, - ConjectureData.for_buffer(b""), + empty_data, is_final=True, print_example=True, example_kwargs=example_kwargs, @@ -544,7 +556,8 @@ def execute_explicit_examples(state, wrapped_test, arguments, 
kwargs, original_s # development, this is rather useful to replay Hypothesis' part of # a saved failure when other arguments are supplied by e.g. pytest. # See https://github.com/HypothesisWorks/hypothesis/issues/2125 - pass + with contextlib.suppress(StopTest): + empty_data.conclude_test(Status.INVALID) except BaseException as err: # In order to support reporting of multiple failing examples, we yield # each of the (report text, error) pairs we find back to the top-level @@ -567,6 +580,8 @@ def execute_explicit_examples(state, wrapped_test, arguments, kwargs, original_s new.__cause__ = err err = new + with contextlib.suppress(StopTest): + empty_data.conclude_test(Status.INVALID) yield (fragments_reported, err) if ( state.settings.report_multiple_bugs @@ -583,6 +598,15 @@ def execute_explicit_examples(state, wrapped_test, arguments, kwargs, original_s "Falsifying example", "Falsifying explicit example", 1 ) + tc = make_testcase( + start_timestamp=state._start_timestamp, + test_name_or_nodeid=state.test_identifier, + data=empty_data, + how_generated="explicit example", + string_repr=state._string_repr, + ) + deliver_json_blob(tc) + if fragments_reported: verbose_report(fragments_reported[0].replace("Falsifying", "Trying", 1)) for f in fragments_reported[1:]: @@ -738,7 +762,6 @@ def __init__(self, stuff, test, settings, random, wrapped_test): self.last_exception = None self.falsifying_examples = () self.random = random - self.__test_runtime = None self.ever_executed = False self.is_find = getattr(wrapped_test, "_hypothesis_internal_is_find", False) @@ -756,6 +779,16 @@ def __init__(self, stuff, test, settings, random, wrapped_test): self.failed_due_to_deadline = False self.explain_traces = defaultdict(set) + self._start_timestamp = time.time() + self._string_repr = "" + self._jsonable_arguments = {} + self._timing_features = {} + + @property + def test_identifier(self): + return getattr( + current_pytest_item.value, "nodeid", None + ) or 
get_pretty_function_description(self.wrapped_test) def execute_once( self, @@ -780,6 +813,7 @@ def execute_once( self.ever_executed = True data.is_find = self.is_find + self._string_repr = "" text_repr = None if self.settings.deadline is None: test = self.test @@ -787,16 +821,23 @@ def execute_once( @proxies(self.test) def test(*args, **kwargs): - self.__test_runtime = None + arg_drawtime = sum(data.draw_times) initial_draws = len(data.draw_times) start = time.perf_counter() - result = self.test(*args, **kwargs) - finish = time.perf_counter() - internal_draw_time = sum(data.draw_times[initial_draws:]) - runtime = datetime.timedelta( - seconds=finish - start - internal_draw_time - ) - self.__test_runtime = runtime + try: + result = self.test(*args, **kwargs) + finally: + finish = time.perf_counter() + internal_draw_time = sum(data.draw_times[initial_draws:]) + runtime = datetime.timedelta( + seconds=finish - start - internal_draw_time + ) + self._timing_features = { + "time_running_test": finish - start - internal_draw_time, + "time_drawing_args": arg_drawtime, + "time_interactive_draws": internal_draw_time, + } + current_deadline = self.settings.deadline if not is_final: current_deadline = (current_deadline // 4) * 5 @@ -855,6 +896,26 @@ def run(data): ), ) report(printer.getvalue()) + + if TESTCASE_CALLBACKS: + printer = RepresentationPrinter(context=context) + printer.repr_call( + test.__name__, + args, + kwargs, + force_split=True, + arg_slices=argslices, + leading_comment=( + "# " + context.data.slice_comments[(0, 0)] + if (0, 0) in context.data.slice_comments + else None + ), + ) + self._string_repr = printer.getvalue() + self._jsonable_arguments = { + **dict(enumerate(map(to_jsonable, args))), + **{k: to_jsonable(v) for k, v in kwargs.items()}, + } return test(*args, **kwargs) # self.test_runner can include the execute_example method, or setup/teardown @@ -870,9 +931,8 @@ def run(data): # instead raise an appropriate diagnostic error. 
if expected_failure is not None: exception, traceback = expected_failure - if ( - isinstance(exception, DeadlineExceeded) - and self.__test_runtime is not None + if isinstance(exception, DeadlineExceeded) and ( + runtime_secs := self._timing_features.get("time_running_test") ): report( "Unreliable test timings! On an initial run, this " @@ -884,7 +944,7 @@ def run(data): % ( exception.runtime.total_seconds() * 1000, self.settings.deadline.total_seconds() * 1000, - self.__test_runtime.total_seconds() * 1000, + runtime_secs * 1000, ) ) else: @@ -895,7 +955,7 @@ def run(data): ) from exception return result - def _execute_once_for_engine(self, data): + def _execute_once_for_engine(self, data: ConjectureData) -> None: """Wrapper around ``execute_once`` that intercepts test failure exceptions and single-test control exceptions, and turns them into appropriate method calls to `data` instead. @@ -903,16 +963,18 @@ def _execute_once_for_engine(self, data): This allows the engine to assume that any exception other than ``StopTest`` must be a fatal error, and should stop the entire engine. """ + trace: Trace = set() try: - trace = frozenset() - if ( + _can_trace = ( + sys.gettrace() is None or sys.version_info[:2] >= (3, 12) + ) and not PYPY + _trace_obs = TESTCASE_CALLBACKS + _trace_failure = ( self.failed_normally and not self.failed_due_to_deadline - and Phase.shrink in self.settings.phases - and Phase.explain in self.settings.phases - and (sys.gettrace() is None or sys.version_info[:2] >= (3, 12)) - and not PYPY - ): # pragma: no cover + and {Phase.shrink, Phase.explain}.issubset(self.settings.phases) + ) + if _can_trace and (_trace_obs or _trace_failure): # pragma: no cover # This is in fact covered by our *non-coverage* tests, but due to the # settrace() contention *not* by our coverage tests. Ah well. 
with Tracer() as tracer: @@ -921,7 +983,7 @@ def _execute_once_for_engine(self, data): if data.status == Status.VALID: self.explain_traces[None].add(frozenset(tracer.branches)) finally: - trace = frozenset(tracer.branches) + trace = tracer.branches else: result = self.execute_once(data) if result is not None: @@ -964,20 +1026,40 @@ def _execute_once_for_engine(self, data): tb = get_trimmed_traceback() info = data.extra_information - info.__expected_traceback = format_exception(e, tb) - info.__expected_exception = e - verbose_report(info.__expected_traceback) + info._expected_traceback = format_exception(e, tb) # type: ignore + info._expected_exception = e # type: ignore + verbose_report(info._expected_traceback) # type: ignore self.failed_normally = True interesting_origin = InterestingOrigin.from_exception(e) if trace: # pragma: no cover # Trace collection is explicitly disabled under coverage. - self.explain_traces[interesting_origin].add(trace) + self.explain_traces[interesting_origin].add(frozenset(trace)) if interesting_origin[0] == DeadlineExceeded: self.failed_due_to_deadline = True self.explain_traces.clear() - data.mark_interesting(interesting_origin) + data.mark_interesting(interesting_origin) # type: ignore # mypy bug? + finally: + # Conditional here so we can save some time constructing the payload; in + # other cases (without coverage) it's cheap enough to do that regardless. 
+ if TESTCASE_CALLBACKS: + if self.failed_normally or self.failed_due_to_deadline: + phase = "shrink" + else: + phase = "unknown" + tc = make_testcase( + start_timestamp=self._start_timestamp, + test_name_or_nodeid=self.test_identifier, + data=data, + how_generated=f"generated during {phase} phase", + string_repr=self._string_repr, + arguments={**self._jsonable_arguments, **data._observability_args}, + metadata=self._timing_features, + coverage=tractable_coverage_report(trace) or None, + ) + deliver_json_blob(tc) + self._timing_features.clear() def run_engine(self): """Run the test function many times, on database input and generated @@ -1003,6 +1085,15 @@ def run_engine(self): # on different inputs. runner.run() note_statistics(runner.statistics) + deliver_json_blob( + { + "type": "info", + "run_start": self._start_timestamp, + "property": self.test_identifier, + "title": "Hypothesis Statistics", + "content": describe_statistics(runner.statistics), + } + ) if runner.call_count == 0: return @@ -1041,7 +1132,9 @@ def run_engine(self): falsifying_example.buffer ) ran_example.slice_comments = falsifying_example.slice_comments - assert info.__expected_exception is not None + tb = None + origin = None + assert info._expected_exception is not None try: with with_reporter(fragments.append): self.execute_once( @@ -1049,8 +1142,8 @@ def run_engine(self): print_example=not self.is_find, is_final=True, expected_failure=( - info.__expected_exception, - info.__expected_traceback, + info._expected_exception, + info._expected_traceback, ), ) except (UnsatisfiedAssumption, StopTest) as e: @@ -1066,10 +1159,34 @@ def run_engine(self): errors_to_report.append( (fragments, e.with_traceback(get_trimmed_traceback())) ) + tb = format_exception(e, get_trimmed_traceback(e)) + origin = InterestingOrigin.from_exception(e) else: # execute_once() will always raise either the expected error, or Flaky. 
raise NotImplementedError("This should be unreachable") finally: + # log our observability line for the final failing example + tc = { + "type": "test_case", + "run_start": self._start_timestamp, + "property": self.test_identifier, + "status": "passed" if sys.exc_info()[0] else "failed", + "status_reason": str(origin or "unexpected/flaky pass"), + "representation": self._string_repr, + "how_generated": "minimal failing example", + "features": { + **{ + k: v + for k, v in ran_example.target_observations.items() + if isinstance(k, str) + }, + **ran_example.events, + **self._timing_features, + }, + "coverage": None, # TODO: expose this? + "metadata": {"traceback": tb}, + } + deliver_json_blob(tc) # Whether or not replay actually raised the exception again, we want # to print the reproduce_failure decorator for the failing example. if self.settings.print_blob: diff --git a/hypothesis-python/src/hypothesis/internal/conjecture/data.py b/hypothesis-python/src/hypothesis/internal/conjecture/data.py index c7916d10a2..7a5542b0bd 100644 --- a/hypothesis-python/src/hypothesis/internal/conjecture/data.py +++ b/hypothesis-python/src/hypothesis/internal/conjecture/data.py @@ -1456,6 +1456,7 @@ def __init__( # try varying, to report if the minimal example always fails anyway. self.arg_slices: Set[Tuple[int, int]] = set() self.slice_comments: Dict[Tuple[int, int], str] = {} + self._observability_args: Dict[str, Any] = {} self.extra_information = ExtraInformation() diff --git a/hypothesis-python/src/hypothesis/internal/observability.py b/hypothesis-python/src/hypothesis/internal/observability.py new file mode 100644 index 0000000000..6752868737 --- /dev/null +++ b/hypothesis-python/src/hypothesis/internal/observability.py @@ -0,0 +1,92 @@ +# This file is part of Hypothesis, which may be found at +# https://github.com/HypothesisWorks/hypothesis/ +# +# Copyright the Hypothesis Authors. +# Individual contributors are listed in AUTHORS.rst and the git log. 
+# +# This Source Code Form is subject to the terms of the Mozilla Public License, +# v. 2.0. If a copy of the MPL was not distributed with this file, You can +# obtain one at https://mozilla.org/MPL/2.0/. + +"""Observability tools to spit out analysis-ready tables, one row per test case.""" + +import json +import os +from datetime import date, timedelta +from typing import Callable, Dict, List, Optional + +from hypothesis.configuration import storage_directory +from hypothesis.internal.conjecture.data import ConjectureData, Status + +TESTCASE_CALLBACKS: List[Callable[[dict], None]] = [] + + +def deliver_json_blob(value: dict) -> None: + for callback in TESTCASE_CALLBACKS: + callback(value) + + +def make_testcase( + *, + start_timestamp: float, + test_name_or_nodeid: str, + data: ConjectureData, + how_generated: str = "unknown", + string_repr: str = "", + arguments: Optional[dict] = None, + metadata: Optional[dict] = None, + coverage: Optional[Dict[str, List[int]]] = None, +) -> dict: + if data.interesting_origin: + status_reason = str(data.interesting_origin) + else: + status_reason = str(data.events.pop("invalid because", "")) + + return { + "type": "test_case", + "run_start": start_timestamp, + "property": test_name_or_nodeid, + "status": { + Status.OVERRUN: "gave_up", + Status.INVALID: "gave_up", + Status.VALID: "passed", + Status.INTERESTING: "failed", + }[data.status], + "status_reason": status_reason, + "representation": string_repr, + "arguments": arguments or {}, + "how_generated": how_generated, # iid, mutation, etc. 
+ "features": { + **{ + f"target:{k}".strip(":"): v for k, v in data.target_observations.items() + }, + **data.events, + }, + "metadata": { + **(metadata or {}), + "traceback": getattr(data.extra_information, "_expected_traceback", None), + }, + "coverage": coverage, + } + + +_WROTE_TO = set() + + +def _deliver_to_file(value): # pragma: no cover + kind = "testcases" if value["type"] == "test_case" else "info" + fname = storage_directory("observed", f"{date.today().isoformat()}_{kind}.jsonl") + fname.parent.mkdir(exist_ok=True) + _WROTE_TO.add(fname) + with fname.open(mode="a") as f: + f.write(json.dumps(value) + "\n") + + +if "HYPOTHESIS_EXPERIMENTAL_OBSERVABILITY" in os.environ: # pragma: no cover + TESTCASE_CALLBACKS.append(_deliver_to_file) + + # Remove files more than a week old, to cap the size on disk + max_age = (date.today() - timedelta(days=8)).isoformat() + for f in storage_directory("observed").glob("*.jsonl"): + if f.stem < max_age: # pragma: no branch + f.unlink(missing_ok=True) diff --git a/hypothesis-python/src/hypothesis/internal/reflection.py b/hypothesis-python/src/hypothesis/internal/reflection.py index 31123b61ec..2f0480c987 100644 --- a/hypothesis-python/src/hypothesis/internal/reflection.py +++ b/hypothesis-python/src/hypothesis/internal/reflection.py @@ -306,8 +306,12 @@ def extract_lambda_source(f): This is not a good function and I am sorry for it. Forgive me my sins, oh lord """ + # You might be wondering how a lambda can have a return-type annotation? + # The answer is that we add this at runtime, in new_given_signature(), + # and we do support strange choices as applying @given() to a lambda. 
sig = inspect.signature(f) - assert sig.return_annotation is inspect.Parameter.empty + assert sig.return_annotation in (inspect.Parameter.empty, None), sig + if sig.parameters: if_confused = f"lambda {str(sig)[1:-1]}: " else: diff --git a/hypothesis-python/src/hypothesis/internal/scrutineer.py b/hypothesis-python/src/hypothesis/internal/scrutineer.py index 5b372ffd65..39352844b4 100644 --- a/hypothesis-python/src/hypothesis/internal/scrutineer.py +++ b/hypothesis-python/src/hypothesis/internal/scrutineer.py @@ -8,16 +8,29 @@ # v. 2.0. If a copy of the MPL was not distributed with this file, You can # obtain one at https://mozilla.org/MPL/2.0/. +import functools +import os +import subprocess import sys import types from collections import defaultdict from functools import lru_cache, reduce from os import sep from pathlib import Path +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple from hypothesis._settings import Phase, Verbosity from hypothesis.internal.escalation import is_hypothesis_file +if TYPE_CHECKING: + from typing import TypeAlias +else: + TypeAlias = object + +Location: TypeAlias = Tuple[str, int] +Branch: TypeAlias = Tuple[Optional[Location], Location] +Trace: TypeAlias = Set[Branch] + @lru_cache(maxsize=None) def should_trace_file(fname): @@ -41,7 +54,7 @@ class Tracer: __slots__ = ("branches", "_previous_location") def __init__(self): - self.branches = set() + self.branches: Trace = set() self._previous_location = None def trace(self, frame, event, arg): @@ -179,3 +192,50 @@ def explanatory_lines(traces, settings): explanations = get_explaining_locations(traces) max_lines = 5 if settings.verbosity <= Verbosity.normal else float("inf") return make_report(explanations, cap_lines_at=max_lines) + + +# beware the code below; we're using some heuristics to make a nicer report... 
+ + +@functools.lru_cache +def _get_git_repo_root() -> Path: + try: + where = subprocess.run( + ["git", "rev-parse", "--show-toplevel"], + check=True, + timeout=10, + capture_output=True, + text=True, + encoding="utf-8", + ).stdout.strip() + except Exception: # pragma: no cover + return Path().absolute().parents[-1] + else: + return Path(where) + + +if sys.version_info[:2] <= (3, 8): + + def is_relative_to(self, other): + return other == self or other in self.parents + +else: + is_relative_to = Path.is_relative_to + + +def tractable_coverage_report(trace: Trace) -> Dict[str, List[int]]: + """Report a simple coverage map which is (probably most) of the user's code.""" + coverage: dict = {} + t = dict(trace) + for file, line in set(t.keys()).union(t.values()) - {None}: # type: ignore + # On Python <= 3.11, we can use coverage.py xor Hypothesis' tracer, + # so the trace will be empty and this line never run under coverage. + coverage.setdefault(file, set()).add(line) # pragma: no cover + stdlib_fragment = f"{os.sep}lib{os.sep}python3.{sys.version_info.minor}{os.sep}" + return { + k: sorted(v) + for k, v in coverage.items() + if stdlib_fragment not in k + and is_relative_to(p := Path(k), _get_git_repo_root()) + and "site-packages" not in p.parts + } diff --git a/hypothesis-python/src/hypothesis/strategies/_internal/core.py b/hypothesis-python/src/hypothesis/strategies/_internal/core.py index b8c5601587..a5a862635a 100644 --- a/hypothesis-python/src/hypothesis/strategies/_internal/core.py +++ b/hypothesis-python/src/hypothesis/strategies/_internal/core.py @@ -129,7 +129,11 @@ OneCharStringStrategy, TextStrategy, ) -from hypothesis.strategies._internal.utils import cacheable, defines_strategy +from hypothesis.strategies._internal.utils import ( + cacheable, + defines_strategy, + to_jsonable, +) from hypothesis.utils.conventions import not_set from hypothesis.vendor.pretty import RepresentationPrinter @@ -2098,8 +2102,9 @@ def draw(self, strategy: SearchStrategy[Ex], 
 label: Any = None) -> Ex:
         result = self.conjecture_data.draw(strategy)
         self.count += 1
         printer = RepresentationPrinter(context=current_build_context())
-        printer.text(f"Draw {self.count}")
-        printer.text(": " if label is None else f" ({label}): ")
+        desc = f"Draw {self.count}{'' if label is None else f' ({label})'}: "
+        self.conjecture_data._observability_args[desc] = to_jsonable(result)
+        printer.text(desc)
         printer.pretty(result)
         note(printer.getvalue())
         return result
diff --git a/hypothesis-python/src/hypothesis/strategies/_internal/utils.py b/hypothesis-python/src/hypothesis/strategies/_internal/utils.py
index b0e0746314..995b179b40 100644
--- a/hypothesis-python/src/hypothesis/strategies/_internal/utils.py
+++ b/hypothesis-python/src/hypothesis/strategies/_internal/utils.py
@@ -8,13 +8,17 @@
 # v. 2.0. If a copy of the MPL was not distributed with this file, You can
 # obtain one at https://mozilla.org/MPL/2.0/.
 
+import sys
 import threading
 from inspect import signature
 from typing import TYPE_CHECKING, Callable, Dict
 
+import attr
+
 from hypothesis.internal.cache import LRUReusedCache
 from hypothesis.internal.floats import float_to_int
 from hypothesis.internal.reflection import proxies
+from hypothesis.vendor.pretty import pretty
 
 if TYPE_CHECKING:
     from hypothesis.strategies._internal.strategies import SearchStrategy, T
@@ -144,3 +148,40 @@ def accept(*args, **kwargs):
 
         return accept
 
     return decorator
+
+
+def to_jsonable(obj: object) -> object:
+    """Recursively convert an object to json-encodable form.
+
+    This is not intended to round-trip, but rather provide an analysis-ready
+    format for observability. To avoid side effects, we pretty-print all but
+    known types.
+ """ + if isinstance(obj, (str, int, float, bool, type(None))): + if isinstance(obj, int) and abs(obj) >= 2**63: + return float(obj) + return obj + if isinstance(obj, (list, tuple, set, frozenset)): + if isinstance(obj, tuple) and hasattr(obj, "_asdict"): + return to_jsonable(obj._asdict()) # treat namedtuples as dicts + return [to_jsonable(x) for x in obj] + if isinstance(obj, dict): + return { + k if isinstance(k, str) else pretty(k): to_jsonable(v) + for k, v in obj.items() + } + + # Special handling for dataclasses, attrs, and pydantic classes + if ( + (dcs := sys.modules.get("dataclasses")) + and dcs.is_dataclass(obj) + and not isinstance(obj, type) + ): + return to_jsonable(dcs.asdict(obj)) + if attr.has(type(obj)): + return to_jsonable(attr.asdict(obj, recurse=False)) # type: ignore + if (pyd := sys.modules.get("pydantic")) and isinstance(obj, pyd.BaseModel): + return to_jsonable(obj.model_dump()) + + # If all else fails, we'll just pretty-print as a string. + return pretty(obj) diff --git a/hypothesis-python/tests/cover/test_observability.py b/hypothesis-python/tests/cover/test_observability.py new file mode 100644 index 0000000000..761f6c768d --- /dev/null +++ b/hypothesis-python/tests/cover/test_observability.py @@ -0,0 +1,67 @@ +# This file is part of Hypothesis, which may be found at +# https://github.com/HypothesisWorks/hypothesis/ +# +# Copyright the Hypothesis Authors. +# Individual contributors are listed in AUTHORS.rst and the git log. +# +# This Source Code Form is subject to the terms of the Mozilla Public License, +# v. 2.0. If a copy of the MPL was not distributed with this file, You can +# obtain one at https://mozilla.org/MPL/2.0/. 
+ +import contextlib + +import pytest + +from hypothesis import ( + assume, + event, + example, + given, + seed, + settings, + strategies as st, + target, +) +from hypothesis.database import InMemoryExampleDatabase +from hypothesis.internal.observability import TESTCASE_CALLBACKS + + +@contextlib.contextmanager +def capture_observations(): + ls = [] + TESTCASE_CALLBACKS.append(ls.append) + try: + yield ls + finally: + TESTCASE_CALLBACKS.remove(ls.append) + + +@seed("deterministic so we don't miss some combination of features") +@example(a=0, x=4, data=None) +@settings(database=InMemoryExampleDatabase()) +@given(st.integers(), st.integers(), st.data()) +def do_it_all(a, x, data): + event(f"{x%2=}") + target(x % 5, label="x%5") + assume(a % 9) + if data: + data.draw(st.text("abcdef", min_size=a % 3), label="interactive") + 1 / ((x or 1) % 7) + + +def test_observability(): + with capture_observations() as ls: + with pytest.raises(ZeroDivisionError): + do_it_all() + with pytest.raises(ZeroDivisionError): + do_it_all() + + infos = [t for t in ls if t["type"] == "info"] + assert len(infos) == 2 + assert {t["title"] for t in infos} == {"Hypothesis Statistics"} + + testcases = [t for t in ls if t["type"] == "test_case"] + assert len(testcases) > 50 + assert {t["property"] for t in testcases} == {do_it_all.__name__} + assert len({t["run_start"] for t in testcases}) == 2 + assert {t["status"] for t in testcases} == {"gave_up", "passed", "failed"} diff --git a/hypothesis-python/tests/cover/test_searchstrategy.py b/hypothesis-python/tests/cover/test_searchstrategy.py index ef93f0c47a..b6b6c498e9 100644 --- a/hypothesis-python/tests/cover/test_searchstrategy.py +++ b/hypothesis-python/tests/cover/test_searchstrategy.py @@ -16,6 +16,7 @@ from hypothesis.errors import InvalidArgument from hypothesis.internal.conjecture.data import ConjectureData from hypothesis.strategies import booleans, integers, just, none, tuples +from hypothesis.strategies._internal.utils import to_jsonable 
from tests.common.debug import assert_no_examples @@ -85,3 +86,7 @@ def test_can_flatmap_nameless(): def test_flatmap_with_invalid_expand(): with pytest.raises(InvalidArgument): just(100).flatmap(lambda n: "a").example() + + +def test_jsonable(): + assert isinstance(to_jsonable(object()), str) diff --git a/mypy.ini b/mypy.ini index ecb6e6ab37..e0063628c1 100644 --- a/mypy.ini +++ b/mypy.ini @@ -15,3 +15,5 @@ warn_no_return = True warn_unused_ignores = True warn_unused_configs = True warn_redundant_casts = True + +disable_error_code = annotation-unchecked diff --git a/requirements/tools.in b/requirements/tools.in index 919959ac37..c94870297a 100644 --- a/requirements/tools.in +++ b/requirements/tools.in @@ -17,6 +17,7 @@ shed sphinx sphinx-codeautolink sphinx-hoverxref +sphinx-jsonschema sphinx-rtd-theme sphinx-selective-exclude tox diff --git a/requirements/tools.txt b/requirements/tools.txt index 512b443392..741d0b3bf3 100644 --- a/requirements/tools.txt +++ b/requirements/tools.txt @@ -60,6 +60,7 @@ docutils==0.20.1 # readme-renderer # restructuredtext-lint # sphinx + # sphinx-jsonschema # sphinx-rtd-theme dpcontracts==0.6.0 # via -r requirements/tools.in @@ -98,6 +99,8 @@ jeepney==0.8.0 # secretstorage jinja2==3.1.2 # via sphinx +jsonpointer==2.4 + # via sphinx-jsonschema keyring==24.3.0 # via twine lark==1.1.8 @@ -183,7 +186,9 @@ python-dateutil==2.8.2 pyupgrade==3.15.0 # via shed pyyaml==6.0.1 - # via libcst + # via + # libcst + # sphinx-jsonschema readme-renderer==42.0 # via twine requests==2.31.0 @@ -191,6 +196,7 @@ requests==2.31.0 # -r requirements/tools.in # requests-toolbelt # sphinx + # sphinx-jsonschema # twine requests-toolbelt==1.0.0 # via twine @@ -232,6 +238,8 @@ sphinx-codeautolink==0.15.0 # via -r requirements/tools.in sphinx-hoverxref==1.3.0 # via -r requirements/tools.in +sphinx-jsonschema==1.19.1 + # via -r requirements/tools.in sphinx-rtd-theme==2.0.0 # via -r requirements/tools.in sphinx-selective-exclude==1.0.3