Enforce 1 session = 1 run #1329

Merged
merged 16 commits on Mar 14, 2022

Changes from 7 commits
6 changes: 2 additions & 4 deletions docs/source/deployment/aws_batch.md
@@ -165,7 +165,7 @@ class AWSBatchRunner(ThreadRunner):
return super()._get_required_workers_count(pipeline)

def _run( # pylint: disable=too-many-locals,useless-suppression
self, pipeline: Pipeline, catalog: DataCatalog, run_id: str = None
self, pipeline: Pipeline, catalog: DataCatalog
) -> None:
nodes = pipeline.nodes
node_dependencies = pipeline.node_dependencies
@@ -206,7 +206,6 @@ class AWSBatchRunner(ThreadRunner):
node,
node_to_job,
node_dependencies[node],
run_id,
)
futures.add(future)

@@ -232,11 +231,10 @@ def _submit_job(
node: Node,
node_to_job: Dict[Node, str],
node_dependencies: Set[Node],
run_id: str,
) -> Node:
self._logger.info("Submitting the job for node: %s", str(node))

job_name = f"kedro_{run_id}_{node.name}".replace(".", "-")
job_name = f"kedro_{node.name}".replace(".", "-")
depends_on = [{"jobId": node_to_job[dep]} for dep in node_dependencies]
command = ["kedro", "run", "--node", node.name]

9 changes: 2 additions & 7 deletions docs/source/deployment/dask.md
@@ -112,7 +112,6 @@ class DaskRunner(AbstractRunner):
node: Node,
catalog: DataCatalog,
is_async: bool = False,
run_id: str = None,
*dependencies: Node,
) -> Node:
"""Run a single `Node` with inputs from and outputs to the `catalog`.
@@ -126,18 +125,15 @@ class DaskRunner(AbstractRunner):
catalog: A ``DataCatalog`` containing the node's inputs and outputs.
is_async: If True, the node inputs and outputs are loaded and saved
asynchronously with threads. Defaults to False.
run_id: The id of the pipeline run.
dependencies: The upstream ``Node``s to allow Dask to handle
dependency tracking. Their values are not actually used.

Returns:
The node argument.
"""
return run_node(node, catalog, is_async, run_id)
return run_node(node, catalog, is_async)

def _run(
self, pipeline: Pipeline, catalog: DataCatalog, run_id: str = None
) -> None:
def _run(self, pipeline: Pipeline, catalog: DataCatalog) -> None:
nodes = pipeline.nodes
load_counts = Counter(chain.from_iterable(n.inputs for n in nodes))
node_dependencies = pipeline.node_dependencies
@@ -153,7 +149,6 @@ class DaskRunner(AbstractRunner):
node,
catalog,
self._is_async,
run_id,
*dependencies,
)

21 changes: 8 additions & 13 deletions docs/source/extend_kedro/hooks.md
@@ -68,7 +68,6 @@ def after_catalog_created(
conf_creds: Dict[str, Any],
save_version: str,
load_versions: Dict[str, str],
run_id: str,
) -> None:
pass
```
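For illustration, a minimal hook implementation against the updated `after_catalog_created` spec (i.e. without `run_id`) might look like the sketch below. The `CatalogLoggingHooks` class is hypothetical and declares only the spec arguments it actually uses, which pluggy permits:

```python
from typing import Dict

from kedro.framework.hooks import hook_impl
from kedro.io import DataCatalog


class CatalogLoggingHooks:
    """Hypothetical hook that logs datasets once the catalog is created."""

    @hook_impl
    def after_catalog_created(
        self,
        catalog: DataCatalog,
        save_version: str,
        load_versions: Dict[str, str],
    ) -> None:
        # A hook implementation may declare a subset of the spec's arguments;
        # note there is no run_id argument after this change.
        print(f"Catalog created with datasets {catalog.list()} (save_version={save_version})")
```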
@@ -379,24 +378,20 @@ class DataValidationHooks:
}

@hook_impl
def before_node_run(
self, catalog: DataCatalog, inputs: Dict[str, Any], run_id: str
) -> None:
def before_node_run(self, catalog: DataCatalog, inputs: Dict[str, Any]) -> None:
"""Validate inputs data to a node based on using great expectation
if an expectation suite is defined in ``DATASET_EXPECTATION_MAPPING``.
"""
self._run_validation(catalog, inputs, run_id)
self._run_validation(catalog, inputs)

@hook_impl
def after_node_run(
self, catalog: DataCatalog, outputs: Dict[str, Any], run_id: str
) -> None:
def after_node_run(self, catalog: DataCatalog, outputs: Dict[str, Any]) -> None:
"""Validate outputs data from a node based on using great expectation
if an expectation suite is defined in ``DATASET_EXPECTATION_MAPPING``.
"""
self._run_validation(catalog, outputs, run_id)
self._run_validation(catalog, outputs)

def _run_validation(self, catalog: DataCatalog, data: Dict[str, Any], run_id: str):
def _run_validation(self, catalog: DataCatalog, data: Dict[str, Any]):
for dataset_name, dataset_value in data.items():
if dataset_name not in self.DATASET_EXPECTATION_MAPPING:
continue
@@ -411,7 +406,7 @@ class DataValidationHooks:
expectation_suite,
)
expectation_context.run_validation_operator(
"action_list_operator", assets_to_validate=[batch], run_id=run_id
"action_list_operator", assets_to_validate=[batch]
Contributor:

I was wondering whether GE would still work without the run_id argument here. It appears to be an optional argument but I think it might be quite useful for users to keep track of validations done over multiple runs.

)
```
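Regarding the comment above: if grouping validations per run is still wanted, one option (a sketch, not part of this change) is to capture the `session_id` handed to `before_pipeline_run` and forward it as Great Expectations' optional `run_id`. The `_session_id` attribute below is hypothetical:

```python
from typing import Any, Dict, Optional

from kedro.framework.hooks import hook_impl
from kedro.io import DataCatalog


class DataValidationHooks:
    """Sketch: forward the Kedro session_id to Great Expectations as run_id."""

    _session_id: Optional[str] = None  # hypothetical attribute, set once per run

    @hook_impl
    def before_pipeline_run(self, run_params: Dict[str, Any]) -> None:
        # run_params exposes "session_id" after this change (previously "run_id").
        self._session_id = run_params["session_id"]

    def _run_validation(self, catalog: DataCatalog, data: Dict[str, Any]) -> None:
        # Build `expectation_context` and `batch` as in the docs example above,
        # then pass the session id through GE's optional run_id argument:
        #
        # expectation_context.run_validation_operator(
        #     "action_list_operator",
        #     assets_to_validate=[batch],
        #     run_id=self._session_id,
        # )
        pass
```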

@@ -499,9 +494,9 @@ class ModelTrackingHooks:
@hook_impl
def before_pipeline_run(self, run_params: Dict[str, Any]) -> None:
"""Hook implementation to start an MLflow run
with the same run_id as the Kedro pipeline run.
with the session_id of the Kedro pipeline run.
"""
mlflow.start_run(run_name=run_params["run_id"])
mlflow.start_run(run_name=run_params["session_id"])
mlflow.log_params(run_params)

@hook_impl
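For completeness, a companion `after_pipeline_run` implementation (a sketch, assuming the MLflow run should be closed when the pipeline finishes; it is not shown in this hunk) could be:

```python
import mlflow
from kedro.framework.hooks import hook_impl


class ModelTrackingHooks:
    @hook_impl
    def after_pipeline_run(self) -> None:
        # End the MLflow run opened in before_pipeline_run above.
        mlflow.end_run()
```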
5 changes: 1 addition & 4 deletions docs/source/nodes_and_pipelines/run_a_pipeline.md
@@ -81,9 +81,7 @@ class DryRunner(AbstractRunner):
"""
return MemoryDataSet()

def _run(
self, pipeline: Pipeline, catalog: DataCatalog, run_id: str = None
) -> None:
def _run(self, pipeline: Pipeline, catalog: DataCatalog) -> None:
"""The method implementing dry pipeline running.
Example logs output using this implementation:

@@ -95,7 +93,6 @@ class DryRunner(AbstractRunner):
Args:
pipeline: The ``Pipeline`` to run.
catalog: The ``DataCatalog`` from which to fetch data.
run_id: The id of the run.

"""
nodes = pipeline.nodes
17 changes: 0 additions & 17 deletions kedro/framework/context/context.py
@@ -290,7 +290,6 @@ def _get_catalog(
feed_dict=feed_dict,
save_version=save_version,
Contributor:
Can save_version ever be different from session_id now actually? 🤔 On second thoughts, maybe we don't need to keep this in the hook spec after all if it's just a duplicate of save_version?

Member Author:
At the moment they will always be the same, but this goes back to the question whether we should allow users to have a custom save_version, which requires user research. With the mindset of "don't add things that could potentially be used in the future", I'll remove it for now.

load_versions=load_versions,
run_id=self.run_id or save_version,
)
return catalog

@@ -335,22 +334,6 @@ def _get_config_credentials(self) -> Dict[str, Any]:
conf_creds = {}
return conf_creds

@property
def run_id(self) -> Union[None, str]:
"""Unique identifier for a run, defaults to None.
If `run_id` is None, `save_version` will be used instead.
"""
return self._get_run_id()

def _get_run_id( # pylint: disable=no-self-use
self, *args, **kwargs # pylint: disable=unused-argument
) -> Union[None, str]:
"""A hook for generating a unique identifier for a
run, defaults to None.
If None, `save_version` will be used instead.
"""
return None


class KedroContextError(Exception):
"""Error occurred when loading project and running context pipeline."""
16 changes: 4 additions & 12 deletions kedro/framework/hooks/specs.py
@@ -23,7 +23,6 @@ def after_catalog_created( # pylint: disable=too-many-arguments
feed_dict: Dict[str, Any],
save_version: str,
load_versions: Dict[str, str],
run_id: str,
) -> None:
"""Hooks to be invoked after a data catalog is created.
It receives the ``catalog`` as well as
@@ -38,7 +37,6 @@ def after_catalog_created( # pylint: disable=too-many-arguments
for all datasets in the catalog.
load_versions: The load_versions used in ``load`` operations
for each dataset in the catalog.
run_id: The id of the run for which the catalog is loaded.
"""
pass

@@ -47,13 +45,12 @@ class NodeSpecs:
"""Namespace that defines all specifications for a node's lifecycle hooks."""

@hook_spec
def before_node_run( # pylint: disable=too-many-arguments
def before_node_run(
self,
node: Node,
catalog: DataCatalog,
inputs: Dict[str, Any],
is_async: bool,
run_id: str,
) -> Optional[Dict[str, Any]]:
"""Hook to be invoked before a node runs.
The arguments received are the same as those used by ``kedro.runner.run_node``
@@ -65,7 +62,6 @@ def before_node_run( # pylint: disable=too-many-arguments
The keys are dataset names and the values are the actual loaded input data,
not the dataset instance.
is_async: Whether the node was run in ``async`` mode.
run_id: The id of the run.

Returns:
Either None or a dictionary mapping dataset name(s) to new value(s).
@@ -82,7 +78,6 @@ def after_node_run( # pylint: disable=too-many-arguments
inputs: Dict[str, Any],
outputs: Dict[str, Any],
is_async: bool,
run_id: str,
) -> None:
"""Hook to be invoked after a node runs.
The arguments received are the same as those used by ``kedro.runner.run_node``
@@ -98,7 +93,6 @@ def after_node_run( # pylint: disable=too-many-arguments
The keys are dataset names and the values are the actual computed output data,
not the dataset instance.
is_async: Whether the node was run in ``async`` mode.
run_id: The id of the run.
"""
pass

@@ -110,7 +104,6 @@ def on_node_error( # pylint: disable=too-many-arguments
catalog: DataCatalog,
inputs: Dict[str, Any],
is_async: bool,
run_id: str,
):
"""Hook to be invoked if a node run throws an uncaught error.
The signature of this error hook should match the signature of ``before_node_run``
@@ -124,7 +117,6 @@ def on_node_error( # pylint: disable=too-many-arguments
The keys are dataset names and the values are the actual loaded input data,
not the dataset instance.
is_async: Whether the node was run in ``async`` mode.
run_id: The id of the run.
"""
pass

@@ -143,7 +135,7 @@ def before_pipeline_run(
Should have the following schema::

{
"run_id": str
"session_id": str
"project_path": str,
"env": str,
"kedro_version": str,
@@ -178,7 +170,7 @@ def after_pipeline_run(
Should have the following schema::

{
"run_id": str
"session_id": str
"project_path": str,
"env": str,
"kedro_version": str,
@@ -217,7 +209,7 @@ def on_pipeline_error(
Should have the following schema::

{
"run_id": str
"session_id": str
"project_path": str,
"env": str,
"kedro_version": str,
28 changes: 24 additions & 4 deletions kedro/framework/session/session.py
@@ -75,6 +75,14 @@ def _jsonify_cli_context(ctx: click.core.Context) -> Dict[str, Any]:
}


class KedroSessionError(Exception):
"""``KedroSessionError`` raised by ``KedroSession``
in case of run failure as part of a session.
"""

pass


class KedroSession:
"""``KedroSession`` is the object that is responsible for managing the lifecycle
of a Kedro run.
@@ -94,18 +102,20 @@ class KedroSession:
>>>
"""

def __init__(
def __init__( # pylint: disable=too-many-arguments
self,
session_id: str,
package_name: str = None,
project_path: Union[Path, str] = None,
save_on_close: bool = False,
run_called: bool = False,
):
self._project_path = Path(project_path or Path.cwd()).resolve()
self.session_id = session_id
self.save_on_close = save_on_close
self._package_name = package_name
self._store = self._init_store()
self._run_called = run_called

hook_manager = _create_hook_manager()
_register_hooks(hook_manager, settings.HOOKS)
@@ -318,6 +328,8 @@ def run( # pylint: disable=too-many-arguments,too-many-locals
defined by `register_pipelines`.
Exception: Any uncaught exception during the run will be re-raised
after being passed to ``on_pipeline_error`` hook.
KedroSessionError: If more than one run is attempted to be executed during
a single session.
Returns:
Any node outputs that cannot be processed by the ``DataCatalog``.
These are returned in a dictionary, where the keys are defined
@@ -327,7 +339,14 @@ def run( # pylint: disable=too-many-arguments,too-many-locals
# Report project name
self._logger.info("** Kedro project %s", self._project_path.name)

save_version = run_id = self.store["session_id"]
if self._run_called:
raise KedroSessionError(
"A run has already been executed as part of the"
" active KedroSession. KedroSession has a 1-1 mapping with"
" runs, and thus only one run should be executed per session."
)

save_version = self.store["session_id"]
extra_params = self.store.get("extra_params") or {}
context = self.load_context()

@@ -352,7 +371,7 @@ def run( # pylint: disable=too-many-arguments,too-many-locals
)

record_data = {
"run_id": run_id,
"session_id": self.store["session_id"],
"project_path": self._project_path.as_posix(),
"env": context.env,
"kedro_version": kedro_version,
@@ -379,7 +398,8 @@ def run( # pylint: disable=too-many-arguments,too-many-locals
)

try:
run_result = runner.run(filtered_pipeline, catalog, hook_manager, run_id)
run_result = runner.run(filtered_pipeline, catalog, hook_manager)
self._run_called = True
Contributor:
So this is set only if the run is successful. If there's a problem with the pipeline halfway through, we can call session.run() again. Is that the intended course? Makes sense for debugging, if a little confusing for the user (but I've run it multiple times to failure, and it allowed me to do that).

Member Author:
Yes, my thinking was that if you're in an interactive workflow to experiment with or debug your pipeline, it would be unnecessary overhead to have to create a new session whenever the pipeline didn't run successfully. At the same time, this is a niche case just for those people who would actually do a run inside a notebook/IPython session.

except Exception as error:
hook_manager.hook.on_pipeline_error(
error=error,
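To illustrate the behaviour discussed in the thread above: a second `session.run()` only raises once a previous run has completed successfully, so a failed run can still be retried within the same session. A rough interactive sketch (the bootstrap call and paths are assumptions, not part of this diff):

```python
from pathlib import Path

from kedro.framework.session import KedroSession
from kedro.framework.startup import bootstrap_project

project_path = Path.cwd()        # assumes the working directory is a Kedro project
bootstrap_project(project_path)  # loads the project's settings and pipelines

# KedroSession.create arguments may differ slightly between Kedro versions.
with KedroSession.create(project_path=project_path) as session:
    session.run()  # first run: executes the pipeline and sets _run_called
    session.run()  # second run in the same session: raises KedroSessionError
```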