From ff8315d375b7c4dcf320996138cfb415c931ea9b Mon Sep 17 00:00:00 2001
From: Seth Foster <seth@opentrons.com>
Date: Thu, 21 Mar 2024 17:07:14 -0400
Subject: [PATCH] feat(robot-server): Use a BadRun when we cant load (#14711)

Up to now, if there's a run saved in the persistence layer that cannot
be loaded - which is typically because it contains data from a version
of the robot server or api package that whatever's currently running
can't handle - we error when trying to retrieve it. That includes both a
500 error when trying to access that particular run, which clients can
broadly handle, and a 500 error when trying to list all runs, which
clients cannot. Without being able to list out all runs, there's no way
for clients to find the problematic run - the IDs are UUIDs and cannot
be enumerated - and remove it. The only recourse is to delete all the
run storage.

A different way to handle this problem is to consider a "bad run", a run
whose run metadata or engine state summary cannot be loaded, as a first
class entity that can be returned from run access endpoints wherever a
run could be, without an HTTP level error. This is done everywhere for
consistency in this commit, though the argument could be made that it
should only be done in the list-all-runs access and other endpoints
should continue to error.

This bad run contains error information about the cause of the invalid
data using a new enumerated error. The bad run will carry all the
information that could be loaded - in effect, if the state summary is
bad then the run metadata will still be present, and the ID should
generally be accessible.

Closes EXEC-344
Co-authored-by: Max Marrone <max@opentrons.com>
---
 api-client/src/runs/types.ts                  |  17 ++-
 .../robot_server/runs/router/base_router.py   |  16 +--
 .../robot_server/runs/run_data_manager.py     | 124 ++++++++++++------
 robot-server/robot_server/runs/run_models.py  |  78 ++++++++++-
 robot-server/robot_server/runs/run_store.py   | 105 ++++++++++++---
 .../test_json_v6_protocol_run.tavern.yaml     |   3 +-
 .../test_json_v7_protocol_run.tavern.yaml     |   1 +
 .../runs/test_protocol_run.tavern.yaml        |   2 +
 ...t_run_queued_protocol_commands.tavern.yaml |   2 +
 .../tests/runs/test_run_auto_deleter.py       |   9 +-
 .../tests/runs/test_run_data_manager.py       |  16 ++-
 robot-server/tests/runs/test_run_store.py     |  17 ++-
 shared-data/errors/definitions/1/errors.json  |   4 +
 .../opentrons_shared_data/errors/codes.py     |   1 +
 .../errors/exceptions.py                      |  16 +++
 15 files changed, 334 insertions(+), 77 deletions(-)

diff --git a/api-client/src/runs/types.ts b/api-client/src/runs/types.ts
index db43c01852d..7709e580a5e 100644
--- a/api-client/src/runs/types.ts
+++ b/api-client/src/runs/types.ts
@@ -5,7 +5,7 @@ import type {
   ModuleModel,
   RunTimeCommand,
 } from '@opentrons/shared-data'
-import type { ResourceLink } from '../types'
+import type { ResourceLink, ErrorDetails } from '../types'
 export * from './commands/types'
 
 export const RUN_STATUS_IDLE = 'idle' as const
@@ -33,7 +33,7 @@ export type RunStatus =
   | typeof RUN_STATUS_BLOCKED_BY_OPEN_DOOR
   | typeof RUN_STATUS_AWAITING_RECOVERY
 
-export interface RunData {
+export interface LegacyGoodRunData {
   id: string
   createdAt: string
   completedAt?: string
@@ -49,6 +49,19 @@ export interface RunData {
   labwareOffsets?: LabwareOffset[]
 }
 
+export interface KnownGoodRunData extends LegacyGoodRunData {
+  ok: true
+}
+
+export interface KnownInvalidRunData extends LegacyGoodRunData {
+  ok: false
+  dataError: ErrorDetails
+}
+
+export type GoodRunData = KnownGoodRunData | LegacyGoodRunData
+
+export type RunData = GoodRunData | KnownInvalidRunData
+
 export interface VectorOffset {
   x: number
   y: number
diff --git a/robot-server/robot_server/runs/router/base_router.py b/robot-server/robot_server/runs/router/base_router.py
index c8638ae5043..fc7b3f223e3 100644
--- a/robot-server/robot_server/runs/router/base_router.py
+++ b/robot-server/robot_server/runs/router/base_router.py
@@ -36,7 +36,7 @@
 
 from ..run_models import RunNotFoundError
 from ..run_auto_deleter import RunAutoDeleter
-from ..run_models import Run, RunCreate, RunUpdate
+from ..run_models import Run, BadRun, RunCreate, RunUpdate
 from ..engine_store import EngineConflictError
 from ..run_data_manager import RunDataManager, RunNotCurrentError
 from ..dependencies import get_run_data_manager, get_run_auto_deleter
@@ -99,7 +99,7 @@ class AllRunsLinks(BaseModel):
 async def get_run_data_from_url(
     runId: str,
     run_data_manager: RunDataManager = Depends(get_run_data_manager),
-) -> Run:
+) -> Union[Run, BadRun]:
     """Get the data of a run.
 
     Args:
@@ -144,7 +144,7 @@ async def create_run(
     deck_configuration_store: DeckConfigurationStore = Depends(
         get_deck_configuration_store
     ),
-) -> PydanticResponse[SimpleBody[Run]]:
+) -> PydanticResponse[SimpleBody[Union[Run, BadRun]]]:
     """Create a new run.
 
     Arguments:
@@ -206,7 +206,7 @@ async def create_run(
         "Get a list of all active and inactive runs, in order from oldest to newest."
     ),
     responses={
-        status.HTTP_200_OK: {"model": MultiBody[Run, AllRunsLinks]},
+        status.HTTP_200_OK: {"model": MultiBody[Union[Run, BadRun], AllRunsLinks]},
     },
 )
 async def get_runs(
@@ -220,7 +220,7 @@ async def get_runs(
         ),
     ),
     run_data_manager: RunDataManager = Depends(get_run_data_manager),
-) -> PydanticResponse[MultiBody[Run, AllRunsLinks]]:
+) -> PydanticResponse[MultiBody[Union[Run, BadRun], AllRunsLinks]]:
     """Get all runs, in order from least-recently to most-recently created.
 
     Args:
@@ -248,13 +248,13 @@ async def get_runs(
     summary="Get a run",
     description="Get a specific run by its unique identifier.",
     responses={
-        status.HTTP_200_OK: {"model": SimpleBody[Run]},
+        status.HTTP_200_OK: {"model": SimpleBody[Union[Run, BadRun]]},
         status.HTTP_404_NOT_FOUND: {"model": ErrorBody[RunNotFound]},
     },
 )
 async def get_run(
     run_data: Run = Depends(get_run_data_from_url),
-) -> PydanticResponse[SimpleBody[Run]]:
+) -> PydanticResponse[SimpleBody[Union[Run, BadRun]]]:
     """Get a run by its ID.
 
     Args:
@@ -316,7 +316,7 @@ async def update_run(
     runId: str,
     request_body: RequestModel[RunUpdate],
     run_data_manager: RunDataManager = Depends(get_run_data_manager),
-) -> PydanticResponse[SimpleBody[Run]]:
+) -> PydanticResponse[SimpleBody[Union[Run, BadRun]]]:
     """Update a run by its ID.
 
     Args:
diff --git a/robot-server/robot_server/runs/run_data_manager.py b/robot-server/robot_server/runs/run_data_manager.py
index 0fc6ee2b731..0515ee3bc26 100644
--- a/robot-server/robot_server/runs/run_data_manager.py
+++ b/robot-server/robot_server/runs/run_data_manager.py
@@ -1,9 +1,9 @@
 """Manage current and historical run data."""
 from datetime import datetime
-from typing import List, Optional
+from typing import List, Optional, Union
 
 from opentrons_shared_data.labware.labware_definition import LabwareDefinition
-
+from opentrons_shared_data.errors.exceptions import InvalidStoredData, EnumeratedError
 from opentrons.protocol_engine import (
     EngineStatus,
     LabwareOffsetCreate,
@@ -18,43 +18,89 @@
 from robot_server.service.notifications import RunsPublisher
 
 from .engine_store import EngineStore
-from .run_store import RunResource, RunStore
-from .run_models import Run
+from .run_store import RunResource, RunStore, BadRunResource, BadStateSummary
+from .run_models import Run, BadRun, RunDataError
 
 from opentrons.protocol_engine.types import DeckConfigurationType
 
 
 def _build_run(
-    run_resource: RunResource,
-    state_summary: Optional[StateSummary],
+    run_resource: Union[RunResource, BadRunResource],
+    state_summary: Union[StateSummary, BadStateSummary],
     current: bool,
-) -> Run:
+) -> Union[Run, BadRun]:
     # TODO(mc, 2022-05-16): improve persistence strategy
     # such that this default summary object is not needed
-    state_summary = state_summary or StateSummary.construct(
-        status=EngineStatus.STOPPED,
-        errors=[],
-        labware=[],
-        labwareOffsets=[],
-        pipettes=[],
-        modules=[],
-        liquids=[],
-    )
-    return Run.construct(
+
+    if run_resource.ok and isinstance(state_summary, StateSummary):
+        return Run.construct(
+            id=run_resource.run_id,
+            protocolId=run_resource.protocol_id,
+            createdAt=run_resource.created_at,
+            actions=run_resource.actions,
+            status=state_summary.status,
+            errors=state_summary.errors,
+            labware=state_summary.labware,
+            labwareOffsets=state_summary.labwareOffsets,
+            pipettes=state_summary.pipettes,
+            modules=state_summary.modules,
+            current=current,
+            completedAt=state_summary.completedAt,
+            startedAt=state_summary.startedAt,
+            liquids=state_summary.liquids,
+        )
+
+    errors: List[EnumeratedError] = []
+    if isinstance(state_summary, BadStateSummary):
+        state = StateSummary.construct(
+            status=EngineStatus.STOPPED,
+            errors=[],
+            labware=[],
+            labwareOffsets=[],
+            pipettes=[],
+            modules=[],
+            liquids=[],
+        )
+        errors.append(state_summary.dataError)
+    else:
+        state = state_summary
+    if not run_resource.ok:
+        errors.append(run_resource.error)
+
+    if len(errors) > 1:
+        run_loading_error = RunDataError.from_exc(
+            InvalidStoredData(
+                message=(
+                    "Data on this run is not valid. The run may have been "
+                    "created on a future software version."
+                ),
+                wrapping=errors,
+            )
+        )
+    elif errors:
+        run_loading_error = RunDataError.from_exc(errors[0])
+    else:
+        # We should never get here
+        run_loading_error = RunDataError.from_exc(
+            AssertionError("Logic error in parsing invalid run.")
+        )
+
+    return BadRun.construct(
+        dataError=run_loading_error,
         id=run_resource.run_id,
         protocolId=run_resource.protocol_id,
         createdAt=run_resource.created_at,
         actions=run_resource.actions,
-        status=state_summary.status,
-        errors=state_summary.errors,
-        labware=state_summary.labware,
-        labwareOffsets=state_summary.labwareOffsets,
-        pipettes=state_summary.pipettes,
-        modules=state_summary.modules,
+        status=state.status,
+        errors=state.errors,
+        labware=state.labware,
+        labwareOffsets=state.labwareOffsets,
+        pipettes=state.pipettes,
+        modules=state.modules,
         current=current,
-        completedAt=state_summary.completedAt,
-        startedAt=state_summary.startedAt,
-        liquids=state_summary.liquids,
+        completedAt=state.completedAt,
+        startedAt=state.startedAt,
+        liquids=state.liquids,
     )
 
 
@@ -97,7 +143,7 @@ async def create(
         labware_offsets: List[LabwareOffsetCreate],
         deck_configuration: DeckConfigurationType,
         protocol: Optional[ProtocolResource],
-    ) -> Run:
+    ) -> Union[Run, BadRun]:
         """Create a new, current run.
 
         Args:
@@ -133,7 +179,7 @@ async def create(
         )
         await self._runs_publisher.begin_polling_engine_store(
             get_current_command=self.get_current_command,
-            get_state_summary=self._get_state_summary,
+            get_state_summary=self._get_good_state_summary,
             run_id=run_id,
         )
 
@@ -143,7 +189,7 @@ async def create(
             current=True,
         )
 
-    def get(self, run_id: str) -> Run:
+    def get(self, run_id: str) -> Union[Run, BadRun]:
         """Get a run resource.
 
         This method will pull from the current run or the historical runs,
@@ -192,7 +238,7 @@ def get_run_loaded_labware_definitions(
             self._engine_store.engine.state_view.labware.get_loaded_labware_definitions()
         )
 
-    def get_all(self, length: Optional[int]) -> List[Run]:
+    def get_all(self, length: Optional[int]) -> List[Union[Run, BadRun]]:
         """Get current and stored run resources.
 
         Results are ordered from oldest to newest.
@@ -226,7 +272,7 @@ async def delete(self, run_id: str) -> None:
 
         self._run_store.remove(run_id=run_id)
 
-    async def update(self, run_id: str, current: Optional[bool]) -> Run:
+    async def update(self, run_id: str, current: Optional[bool]) -> Union[Run, BadRun]:
         """Get and potentially archive a run.
 
         Args:
@@ -249,7 +295,9 @@ async def update(self, run_id: str, current: Optional[bool]) -> Run:
 
         if next_current is False:
             commands, state_summary = await self._engine_store.clear()
-            run_resource = self._run_store.update_run_state(
+            run_resource: Union[
+                RunResource, BadRunResource
+            ] = self._run_store.update_run_state(
                 run_id=run_id,
                 summary=state_summary,
                 commands=commands,
@@ -319,12 +367,12 @@ def get_command(self, run_id: str, command_id: str) -> Command:
 
         return self._run_store.get_command(run_id=run_id, command_id=command_id)
 
-    def _get_state_summary(self, run_id: str) -> Optional[StateSummary]:
-        result: Optional[StateSummary]
-
+    def _get_state_summary(self, run_id: str) -> Union[StateSummary, BadStateSummary]:
         if run_id == self._engine_store.current_run_id:
-            result = self._engine_store.engine.state_view.get_summary()
+            return self._engine_store.engine.state_view.get_summary()
         else:
-            result = self._run_store.get_state_summary(run_id=run_id)
+            return self._run_store.get_state_summary(run_id=run_id)
 
-        return result
+    def _get_good_state_summary(self, run_id: str) -> Optional[StateSummary]:
+        summary = self._get_state_summary(run_id)
+        return summary if isinstance(summary, StateSummary) else None
diff --git a/robot-server/robot_server/runs/run_models.py b/robot-server/robot_server/runs/run_models.py
index 85a1446b631..379feeaea0a 100644
--- a/robot-server/robot_server/runs/run_models.py
+++ b/robot-server/robot_server/runs/run_models.py
@@ -1,7 +1,7 @@
 """Request and response models for run resources."""
 from datetime import datetime
 from pydantic import BaseModel, Field
-from typing import List, Optional
+from typing import List, Optional, Literal
 
 from opentrons.protocol_engine import (
     CommandStatus,
@@ -20,9 +20,20 @@
 )
 from opentrons_shared_data.errors import GeneralError
 from robot_server.service.json_api import ResourceModel
+from robot_server.errors.error_responses import ErrorDetails
 from .action_models import RunAction
 
 
+class RunDataError(ErrorDetails):
+    """A model for an error loading a run."""
+
+    title: str = Field(
+        "Run Loading Error",
+        description="A short, human readable name for this type of error",
+    )
+    id: Literal["RunDataError"] = "RunDataError"
+
+
 # TODO(mc, 2022-02-01): since the `/runs/:run_id/commands` response is now paginated,
 # this summary model is a lot less useful. Remove and replace with full `Command`
 # models once problematically large objects like full labware and module definitions
@@ -66,6 +77,7 @@ class RunCommandSummary(ResourceModel):
 class Run(ResourceModel):
     """Run resource model."""
 
+    ok: Literal[True] = True
     id: str = Field(..., description="Unique run identifier.")
     createdAt: datetime = Field(..., description="When the run was created")
     status: RunStatus = Field(..., description="Execution status of the run")
@@ -125,6 +137,70 @@ class Run(ResourceModel):
     )
 
 
+class BadRun(ResourceModel):
+    """Resource model representation for a bad run that could not be loaded."""
+
+    ok: Literal[False] = False
+    dataError: RunDataError = Field(..., description="Error from loading the data.")
+    id: str = Field(..., description="Unique run identifier.")
+    createdAt: datetime = Field(..., description="When the run was created")
+    status: RunStatus = Field(..., description="Execution status of the run")
+    current: bool = Field(
+        ...,
+        description=(
+            "Whether this run is currently controlling the robot."
+            " There can be, at most, one current run."
+        ),
+    )
+    actions: List[RunAction] = Field(
+        ...,
+        description="Client-initiated run control actions, ordered oldest to newest. If these could not be loaded for this bad run, this will be null.",
+    )
+    errors: List[ErrorOccurrence] = Field(
+        ...,
+        description=(
+            "The run's fatal error, if there was one."
+            " For historical reasons, this is an array,"
+            " but it won't have more than one element."
+        ),
+    )
+    pipettes: List[LoadedPipette] = Field(
+        ...,
+        description="Pipettes that have been loaded into the run.",
+    )
+    modules: List[LoadedModule] = Field(
+        ...,
+        description="Modules that have been loaded into the run.",
+    )
+    labware: List[LoadedLabware] = Field(
+        ...,
+        description="Labware that has been loaded into the run.",
+    )
+    liquids: List[Liquid] = Field(
+        ...,
+        description="Liquids loaded to the run.",
+    )
+    labwareOffsets: List[LabwareOffset] = Field(
+        ...,
+        description="Labware offsets to apply as labware are loaded.",
+    )
+    protocolId: Optional[str] = Field(
+        None,
+        description=(
+            "Protocol resource being run, if any. If not present, the run may"
+            " still be used to execute protocol commands over HTTP."
+        ),
+    )
+    completedAt: Optional[datetime] = Field(
+        None,
+        description="Run completed at timestamp.",
+    )
+    startedAt: Optional[datetime] = Field(
+        None,
+        description="Run started at timestamp.",
+    )
+
+
 class RunCreate(BaseModel):
     """Create request data for a new run."""
 
diff --git a/robot-server/robot_server/runs/run_store.py b/robot-server/robot_server/runs/run_store.py
index a6da6942a11..6178e180470 100644
--- a/robot-server/robot_server/runs/run_store.py
+++ b/robot-server/robot_server/runs/run_store.py
@@ -4,7 +4,7 @@
 from dataclasses import dataclass
 from datetime import datetime
 from functools import lru_cache
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Literal, Union
 
 import sqlalchemy
 from pydantic import ValidationError
@@ -13,6 +13,12 @@
 from opentrons.protocol_engine import StateSummary, CommandSlice
 from opentrons.protocol_engine.commands import Command
 
+from opentrons_shared_data.errors.exceptions import (
+    EnumeratedError,
+    PythonException,
+    InvalidStoredData,
+)
+
 from robot_server.persistence.database import sqlite_rowid
 from robot_server.persistence.tables import (
     run_table,
@@ -39,12 +45,41 @@ class RunResource:
     location, such as a ProtocolEngine instance.
     """
 
+    ok: Literal[True]
     run_id: str
     protocol_id: Optional[str]
     created_at: datetime
     actions: List[RunAction]
 
 
+@dataclass(frozen=True)
+class BadRunResource:
+    """A representation for an action in the run store that cannot be loaded.
+
+    This will get created, for instance, when loading a run made in a future
+    version with an action that does not exist in the current version. This should
+    never happen in released versions, but it does sometimes during development,
+    and without handling like this it would cause any list-all request to fail.
+
+    The ok field is a union discriminator. Other elements will be filled in as they
+    can be with whatever data was recoverable and should not be relied upon.
+    """
+
+    ok: Literal[False]
+    run_id: str
+    protocol_id: Optional[str]
+    created_at: datetime
+    actions: List[RunAction]
+    error: EnumeratedError
+
+
+@dataclass(frozen=True)
+class BadStateSummary:
+    """A representation for a state summary that could not be loaded."""
+
+    dataError: EnumeratedError
+
+
 class CommandNotFoundError(ValueError):
     """Error raised when a given command ID is not found in the store."""
 
@@ -132,7 +167,10 @@ def update_run_state(
 
         self._clear_caches()
         self._runs_publisher.publish_runs_advise_refetch(run_id=run_id)
-        return _convert_row_to_run(row=run_row, action_rows=action_rows)
+        maybe_run_resource = _convert_row_to_run(row=run_row, action_rows=action_rows)
+        if not maybe_run_resource.ok:
+            raise maybe_run_resource.error
+        return maybe_run_resource
 
     def insert_action(self, run_id: str, action: RunAction) -> None:
         """Insert a run action into the store.
@@ -177,6 +215,7 @@ def insert(
                 found in the store.
         """
         run = RunResource(
+            ok=True,
             run_id=run_id,
             created_at=created_at,
             protocol_id=protocol_id,
@@ -206,7 +245,7 @@ def has(self, run_id: str) -> bool:
             return self._run_exists(run_id, transaction)
 
     @lru_cache(maxsize=_CACHE_ENTRIES)
-    def get(self, run_id: str) -> RunResource:
+    def get(self, run_id: str) -> Union[RunResource, BadRunResource]:
         """Get a specific run entry by its identifier.
 
         Args:
@@ -238,7 +277,9 @@ def get(self, run_id: str) -> RunResource:
         return _convert_row_to_run(run_row, action_rows)
 
     @lru_cache(maxsize=_CACHE_ENTRIES)
-    def get_all(self, length: Optional[int] = None) -> List[RunResource]:
+    def get_all(
+        self, length: Optional[int] = None
+    ) -> List[Union[RunResource, BadRunResource]]:
         """Get all known run resources.
 
         Results are ordered from oldest to newest.
@@ -278,7 +319,7 @@ def get_all(self, length: Optional[int] = None) -> List[RunResource]:
         ]
 
     @lru_cache(maxsize=_CACHE_ENTRIES)
-    def get_state_summary(self, run_id: str) -> Optional[StateSummary]:
+    def get_state_summary(self, run_id: str) -> Union[StateSummary, BadStateSummary]:
         """Get the archived run state summary.
 
         This is a summary of run's ProtocolEngine state,
@@ -296,11 +337,20 @@ def get_state_summary(self, run_id: str) -> Optional[StateSummary]:
             return (
                 json_to_pydantic(StateSummary, row.state_summary)
                 if row.state_summary is not None
-                else None
+                else BadStateSummary(
+                    dataError=InvalidStoredData(
+                        message="There was no engine state data for this run."
+                    )
+                )
             )
         except ValidationError as e:
-            log.warning(f"Error retrieving state summary for {run_id}: {e}")
-            return None
+            log.warning(f"Error retrieving state summary for {run_id}", exc_info=True)
+            return BadStateSummary(
+                dataError=InvalidStoredData(
+                    message="Could not load stored StateSummary",
+                    wrapping=[PythonException(e)],
+                )
+            )
 
     def get_commands_slice(
         self,
@@ -442,28 +492,49 @@ def _clear_caches(self) -> None:
 def _convert_row_to_run(
     row: sqlalchemy.engine.Row,
     action_rows: List[sqlalchemy.engine.Row],
-) -> RunResource:
+) -> Union[RunResource, BadRunResource]:
     run_id = row.id
     protocol_id = row.protocol_id
     created_at = row.created_at
-
+    # Checking the fundamental data types here are not covered by the error handling
+    # because if they fire, the only thing we can do to address the issue is immediately
+    # delete the row while we still have a handle on it from sql - we won't have any
+    # other way to delete it. It's also unclear how it could happen without the table schema
+    # changing out from under us.
     assert isinstance(run_id, str), f"Run ID {run_id} is not a string"
     assert protocol_id is None or isinstance(
         protocol_id, str
     ), f"Protocol ID {protocol_id} is not a string or None"
-
-    return RunResource(
-        run_id=run_id,
-        created_at=created_at,
-        protocol_id=protocol_id,
-        actions=[
+    try:
+        actions = [
             RunAction(
                 id=action_row.id,
                 createdAt=action_row.created_at,
                 actionType=RunActionType(action_row.action_type),
             )
             for action_row in action_rows
-        ],
+        ]
+    except Exception as be:
+        log.warning("Error reading actions for run ID {run_id}:", exc_info=True)
+        return BadRunResource(
+            ok=False,
+            run_id=run_id,
+            created_at=created_at,
+            protocol_id=protocol_id,
+            actions=[],
+            error=InvalidStoredData(
+                message="This run has invalid or unknown actions. It has likely been saved in a future version of software.",
+                detail={"kind": "bad-actions"},
+                wrapping=[PythonException(be)],
+            ),
+        )
+
+    return RunResource(
+        ok=True,
+        run_id=run_id,
+        created_at=created_at,
+        protocol_id=protocol_id,
+        actions=actions,
     )
 
 
diff --git a/robot-server/tests/integration/http_api/runs/test_json_v6_protocol_run.tavern.yaml b/robot-server/tests/integration/http_api/runs/test_json_v6_protocol_run.tavern.yaml
index 65929b5c9be..1118d9a8870 100644
--- a/robot-server/tests/integration/http_api/runs/test_json_v6_protocol_run.tavern.yaml
+++ b/robot-server/tests/integration/http_api/runs/test_json_v6_protocol_run.tavern.yaml
@@ -31,6 +31,7 @@ stages:
       json:
         data:
           id: !anystr
+          ok: True
           createdAt: !anystr
           status: idle
           current: True
@@ -612,4 +613,4 @@ stages:
               namespace: opentrons
               version: 1
               labwareId: tipRackId
-              displayName: Opentrons 96 Tip Rack 10 µL
\ No newline at end of file
+              displayName: Opentrons 96 Tip Rack 10 µL
diff --git a/robot-server/tests/integration/http_api/runs/test_json_v7_protocol_run.tavern.yaml b/robot-server/tests/integration/http_api/runs/test_json_v7_protocol_run.tavern.yaml
index 580feda6597..ace0c47cf64 100644
--- a/robot-server/tests/integration/http_api/runs/test_json_v7_protocol_run.tavern.yaml
+++ b/robot-server/tests/integration/http_api/runs/test_json_v7_protocol_run.tavern.yaml
@@ -31,6 +31,7 @@ stages:
       json:
         data:
           id: !anystr
+          ok: True
           createdAt: !anystr
           status: idle
           current: True
diff --git a/robot-server/tests/integration/http_api/runs/test_protocol_run.tavern.yaml b/robot-server/tests/integration/http_api/runs/test_protocol_run.tavern.yaml
index ddac99be771..5064dd28202 100644
--- a/robot-server/tests/integration/http_api/runs/test_protocol_run.tavern.yaml
+++ b/robot-server/tests/integration/http_api/runs/test_protocol_run.tavern.yaml
@@ -28,6 +28,7 @@ stages:
       json:
         data:
           id: !anystr
+          ok: True
           createdAt: !anystr
           status: idle
           current: True
@@ -229,6 +230,7 @@ stages:
         data:
           # Unchanged from when we originally POSTed the resource:
           id: '{run_id}'
+          ok: True
           createdAt: !re_search "^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+\\+\\d{2}:\\d{2}$"
           startedAt: !re_search "^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+\\+\\d{2}:\\d{2}$"
           liquids: []
diff --git a/robot-server/tests/integration/http_api/runs/test_run_queued_protocol_commands.tavern.yaml b/robot-server/tests/integration/http_api/runs/test_run_queued_protocol_commands.tavern.yaml
index 31de3799870..3d252ac5244 100644
--- a/robot-server/tests/integration/http_api/runs/test_run_queued_protocol_commands.tavern.yaml
+++ b/robot-server/tests/integration/http_api/runs/test_run_queued_protocol_commands.tavern.yaml
@@ -18,6 +18,7 @@ stages:
       json:
         data:
           id: !anystr
+          ok: True
           status: idle
           current: true
       save:
@@ -84,6 +85,7 @@ stages:
       status_code: 200
       json:
         data:
+          ok: True
           actions: []
           createdAt: !re_search "^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+\\+\\d{2}:\\d{2}$"
           current: True
diff --git a/robot-server/tests/runs/test_run_auto_deleter.py b/robot-server/tests/runs/test_run_auto_deleter.py
index 8e156268343..5cf67201c83 100644
--- a/robot-server/tests/runs/test_run_auto_deleter.py
+++ b/robot-server/tests/runs/test_run_auto_deleter.py
@@ -3,21 +3,20 @@
 
 from datetime import datetime
 import logging
+from typing import List, Union
 
 import pytest
 from decoy import Decoy
 
 from robot_server.deletion_planner import RunDeletionPlanner
 from robot_server.runs.run_auto_deleter import RunAutoDeleter
-from robot_server.runs.run_store import (
-    RunStore,
-    RunResource,
-)
+from robot_server.runs.run_store import RunStore, RunResource, BadRunResource
 
 
 def _make_dummy_run_resource(run_id: str) -> RunResource:
     """Return a RunResource with the given ID."""
     return RunResource(
+        ok=True,
         run_id=run_id,
         protocol_id=None,
         created_at=datetime.min,
@@ -35,7 +34,7 @@ def test_make_room_for_new_run(decoy: Decoy, caplog: pytest.LogCaptureFixture) -
         deletion_planner=mock_deletion_planner,
     )
 
-    run_resources = [
+    run_resources: List[Union[RunResource, BadRunResource]] = [
         _make_dummy_run_resource("run-id-1"),
         _make_dummy_run_resource("run-id-2"),
         _make_dummy_run_resource("run-id-3"),
diff --git a/robot-server/tests/runs/test_run_data_manager.py b/robot-server/tests/runs/test_run_data_manager.py
index cabaa09ae05..d4bc37ea3d0 100644
--- a/robot-server/tests/runs/test_run_data_manager.py
+++ b/robot-server/tests/runs/test_run_data_manager.py
@@ -24,11 +24,12 @@
 from robot_server.protocols.protocol_store import ProtocolResource
 from robot_server.runs.engine_store import EngineStore, EngineConflictError
 from robot_server.runs.run_data_manager import RunDataManager, RunNotCurrentError
-from robot_server.runs.run_models import Run, RunNotFoundError
+from robot_server.runs.run_models import Run, BadRun, RunNotFoundError, RunDataError
 from robot_server.runs.run_store import (
     RunStore,
     RunResource,
     CommandNotFoundError,
+    BadStateSummary,
 )
 from robot_server.service.task_runner import TaskRunner
 from robot_server.service.notifications import RunsPublisher
@@ -36,6 +37,7 @@
 from opentrons.protocol_engine import Liquid
 
 from opentrons_shared_data.labware.labware_definition import LabwareDefinition
+from opentrons_shared_data.errors.exceptions import InvalidStoredData
 
 
 @pytest.fixture
@@ -82,6 +84,7 @@ def engine_state_summary() -> StateSummary:
 def run_resource() -> RunResource:
     """Get a StateSummary value object."""
     return RunResource(
+        ok=True,
         run_id="hello from the other side",
         protocol_id=None,
         created_at=datetime(year=2022, month=2, day=2),
@@ -354,13 +357,18 @@ async def test_get_historical_run_no_data(
     """It should get a historical run from the store."""
     run_id = "hello world"
 
+    state_exc = InvalidStoredData("Oh no!")
+    run_error = RunDataError.from_exc(state_exc)
     decoy.when(mock_run_store.get(run_id=run_id)).then_return(run_resource)
-    decoy.when(mock_run_store.get_state_summary(run_id=run_id)).then_return(None)
+    decoy.when(mock_run_store.get_state_summary(run_id=run_id)).then_return(
+        BadStateSummary(dataError=state_exc)
+    )
     decoy.when(mock_engine_store.current_run_id).then_return("some other id")
 
     result = subject.get(run_id=run_id)
 
-    assert result == Run(
+    assert result == BadRun(
+        dataError=run_error,
         current=False,
         id=run_resource.run_id,
         protocolId=run_resource.protocol_id,
@@ -404,6 +412,7 @@ async def test_get_all_runs(
     )
 
     current_run_resource = RunResource(
+        ok=True,
         run_id="current-run",
         protocol_id=None,
         created_at=datetime(year=2022, month=2, day=2),
@@ -411,6 +420,7 @@ async def test_get_all_runs(
     )
 
     historical_run_resource = RunResource(
+        ok=True,
         run_id="historical-run",
         protocol_id=None,
         created_at=datetime(year=2023, month=3, day=3),
diff --git a/robot-server/tests/runs/test_run_store.py b/robot-server/tests/runs/test_run_store.py
index 8c696426c76..bb089d4b40a 100644
--- a/robot-server/tests/runs/test_run_store.py
+++ b/robot-server/tests/runs/test_run_store.py
@@ -8,12 +8,14 @@
 from unittest import mock
 
 from opentrons_shared_data.pipette.dev_types import PipetteNameType
+from opentrons_shared_data.errors.codes import ErrorCodes
 
 from robot_server.protocols.protocol_store import ProtocolNotFoundError
 from robot_server.runs.run_store import (
     RunStore,
     RunResource,
     CommandNotFoundError,
+    BadStateSummary,
 )
 from robot_server.runs.run_models import RunNotFoundError
 from robot_server.runs.action_models import RunAction, RunActionType
@@ -192,6 +194,7 @@ def test_update_run_state(
     )
 
     assert result == RunResource(
+        ok=True,
         run_id="run-id",
         protocol_id=None,
         created_at=datetime(year=2021, month=1, day=1, tzinfo=timezone.utc),
@@ -227,6 +230,7 @@ def test_add_run(subject: RunStore) -> None:
     )
 
     assert result == RunResource(
+        ok=True,
         run_id="run-id",
         protocol_id=None,
         created_at=datetime(year=2022, month=2, day=2, tzinfo=timezone.utc),
@@ -267,6 +271,7 @@ def test_get_run_no_actions(subject: RunStore) -> None:
     result = subject.get("run-id")
 
     assert result == RunResource(
+        ok=True,
         run_id="run-id",
         protocol_id=None,
         created_at=datetime(year=2021, month=1, day=1, tzinfo=timezone.utc),
@@ -293,6 +298,7 @@ def test_get_run(subject: RunStore) -> None:
     result = subject.get(run_id="run-id")
 
     assert result == RunResource(
+        ok=True,
         run_id="run-id",
         protocol_id=None,
         created_at=datetime(year=2021, month=1, day=1, tzinfo=timezone.utc),
@@ -314,6 +320,7 @@ def test_get_run_missing(subject: RunStore) -> None:
             1,
             [
                 RunResource(
+                    ok=True,
                     run_id="run-id-2",
                     protocol_id=None,
                     created_at=datetime(year=2022, month=2, day=2, tzinfo=timezone.utc),
@@ -325,12 +332,14 @@ def test_get_run_missing(subject: RunStore) -> None:
             20,
             [
                 RunResource(
+                    ok=True,
                     run_id="run-id-1",
                     protocol_id=None,
                     created_at=datetime(year=2021, month=1, day=1, tzinfo=timezone.utc),
                     actions=[],
                 ),
                 RunResource(
+                    ok=True,
                     run_id="run-id-2",
                     protocol_id=None,
                     created_at=datetime(year=2022, month=2, day=2, tzinfo=timezone.utc),
@@ -342,12 +351,14 @@ def test_get_run_missing(subject: RunStore) -> None:
             None,
             [
                 RunResource(
+                    ok=True,
                     run_id="run-id-1",
                     protocol_id=None,
                     created_at=datetime(year=2021, month=1, day=1, tzinfo=timezone.utc),
                     actions=[],
                 ),
                 RunResource(
+                    ok=True,
                     run_id="run-id-2",
                     protocol_id=None,
                     created_at=datetime(year=2022, month=2, day=2, tzinfo=timezone.utc),
@@ -447,7 +458,8 @@ def test_get_state_summary_failure(
         run_id="run-id", summary=invalid_state_summary, commands=[]
     )
     result = subject.get_state_summary(run_id="run-id")
-    assert result is None
+    assert isinstance(result, BadStateSummary)
+    assert result.dataError.code == ErrorCodes.INVALID_STORED_DATA
 
 
 def test_get_state_summary_none(subject: RunStore) -> None:
@@ -458,7 +470,8 @@ def test_get_state_summary_none(subject: RunStore) -> None:
         created_at=datetime(year=2021, month=1, day=1, tzinfo=timezone.utc),
     )
     result = subject.get_state_summary(run_id="run-id")
-    assert result is None
+    assert isinstance(result, BadStateSummary)
+    assert result.dataError.code == ErrorCodes.INVALID_STORED_DATA
 
 
 def test_has_run_id(subject: RunStore) -> None:
diff --git a/shared-data/errors/definitions/1/errors.json b/shared-data/errors/definitions/1/errors.json
index cf1dc313539..76467faa924 100644
--- a/shared-data/errors/definitions/1/errors.json
+++ b/shared-data/errors/definitions/1/errors.json
@@ -225,6 +225,10 @@
     "4007": {
       "detail": "API Command is misconfigured",
       "category": "generalError"
+    },
+    "4008": {
+      "detail": "Invalid stored data",
+      "category": "generalError"
     }
   }
 }
diff --git a/shared-data/python/opentrons_shared_data/errors/codes.py b/shared-data/python/opentrons_shared_data/errors/codes.py
index f763006ff82..954ac72c9a1 100644
--- a/shared-data/python/opentrons_shared_data/errors/codes.py
+++ b/shared-data/python/opentrons_shared_data/errors/codes.py
@@ -86,6 +86,7 @@ class ErrorCodes(Enum):
     COMMAND_PARAMETER_LIMIT_VIOLATED = _code_from_dict_entry("4005")
     INVALID_PROTOCOL_DATA = _code_from_dict_entry("4006")
     API_MISCONFIGURATION = _code_from_dict_entry("4007")
+    INVALID_STORED_DATA = _code_from_dict_entry("4008")
 
     @classmethod
     @lru_cache(25)
diff --git a/shared-data/python/opentrons_shared_data/errors/exceptions.py b/shared-data/python/opentrons_shared_data/errors/exceptions.py
index 0a5d59e3f48..2d8a6f74487 100644
--- a/shared-data/python/opentrons_shared_data/errors/exceptions.py
+++ b/shared-data/python/opentrons_shared_data/errors/exceptions.py
@@ -942,3 +942,19 @@ def __init__(
     ) -> None:
         """Build an InvalidProtocolData."""
         super().__init__(ErrorCodes.INVALID_PROTOCOL_DATA, message, detail, wrapping)
+
+
+class InvalidStoredData(GeneralError):
+    """An error indicating that some stored data is invalid.
+
+    This will usually be because it was saved by a future version of the software.
+    """
+
+    def __init__(
+        self,
+        message: Optional[str] = None,
+        detail: Optional[Dict[str, str]] = None,
+        wrapping: Optional[Sequence[EnumeratedError]] = None,
+    ) -> None:
+        """Build an InvalidStoredData."""
+        super().__init__(ErrorCodes.INVALID_STORED_DATA, message, detail, wrapping)