kedro-org · SajidAlamQB · Jul 4, 2024 · Jun 12, 2024 · Jun 19, 2024 · Jun 19, 2024
@@ -20,6 +20,7 @@ Please follow the established format:
 - Relax `packaging` pin in requirements. (#1947)
 - Add favicon to kedro-viz documentation. (#1959)
 - Fix bug related to nested namespace pipelines. (#1897)
+- Migrate from `toposort` to `graphlib`. (#1942)
 
 # Release 9.1.0
 

@@ -1,10 +1,9 @@
 """`kedro_viz.services.layers` defines layers-related logic."""
 import logging
 from collections import defaultdict
+from graphlib import CycleError, TopologicalSorter
 from typing import Dict, List, Set
 
-from toposort import CircularDependencyError, toposort_flatten
-
 from kedro_viz.models.flowchart import GraphNode
 
 logger = logging.getLogger(__name__)
@@ -33,9 +32,9 @@ def sort_layers(
         that have already been visited.
         * Turn the final {node_id -> layers} into a {layer -> layers} to represent the layers'
         dependencies. Note: the key is a layer and the values are the parents of that layer,
-        just because that's the format toposort requires.
-        * Feed this layers dictionary to ``toposort`` and return the sorted values.
-        * Raise CircularDependencyError if the layers cannot be sorted topologically,
+        just because that's the format TopologicalSorter requires.
+        * Takes layers dictionary to ``graphlib.TopologicalSorter`` and return the sorted values.
+        * Raise CycleError if the layers cannot be sorted topologically,
         i.e. there are cycles among the layers.
 
     Args:
@@ -47,7 +46,7 @@ def sort_layers(
         The list of layers sorted based on topological order.
 
     Raises:
-        CircularDependencyError: When the layers have cyclic dependencies.
+        CycleError: When the layers have cyclic dependencies.
     """
     node_layers: Dict[str, Set[str]] = {}  # map node_id to the layers that depend on it
 
@@ -84,27 +83,40 @@ def find_child_layers(node_id: str) -> Set[str]:
         return node_layers[node_id]
 
     # populate node_layers dependencies
-    for node_id in nodes:
+    for node_id in sorted(nodes):  # Sort nodes
         find_child_layers(node_id)
 
     # compute the layer dependencies dictionary based on the node_layers dependencies,
     # represented as {layer -> set(parent_layers)}
     layer_dependencies = defaultdict(set)
+    all_layers = set()  # keep track of all layers encountered
     for node_id, child_layers in node_layers.items():
         node_layer = getattr(nodes[node_id], "layer", None)
-
-        # add the node's layer as a parent layer for all child layers.
-        # Even if a child layer is the same as the node's layer, i.e. a layer is marked
-        # as its own parent, toposort still works so we don't need to check for that explicitly.
         if node_layer is not None:
+            all_layers.add(node_layer)
             for layer in child_layers:
-                layer_dependencies[layer].add(node_layer)
+                all_layers.add(layer)
+                # Avoid adding the node's layer as a parent of itself
+                if layer != node_layer:
+                    layer_dependencies[layer].add(node_layer)
 
-    # toposort the layer_dependencies to find the layer order.
-    # Note that for string, toposort_flatten will default to alphabetical order for tie-break.
+    # Add empty dependencies for all layers to ensure they appear in the sorting
+    for layer in all_layers:
+        if layer not in layer_dependencies:
+            layer_dependencies[layer] = set()
+
+    # Use graphlib.TopologicalSorter to sort the layer dependencies.
     try:
-        return toposort_flatten(layer_dependencies)
-    except CircularDependencyError:
+        sorter = TopologicalSorter(layer_dependencies)
+        sorted_layers = list(sorter.static_order())
+        # Ensure the order is stable and respects the original input order
+        # `sorted_layers.index(layer)` ensures the order from topological sorting is preserved.
+        # `layer` ensures that if two layers have the same dependency level,
+        # they are sorted alphabetically.
+        return sorted(
+            sorted_layers, key=lambda layer: (sorted_layers.index(layer), layer)
+        )
+    except CycleError:
         logger.warning(
             "Layers visualisation is disabled as circular dependency detected among layers."
         )

@@ -15,5 +15,3 @@ sqlalchemy>=1.4, <3
 strawberry-graphql>=0.192.0, <1.0
 uvicorn[standard]~=0.30.0
 watchgod~=0.8.2
-# [TODO: Need to drop toposort dependency in favor of in-built graphlib]
-toposort>=1.5 # Needs to be at least 1.5 to be able to raise CircularDependencyError
@@ -804,19 +804,29 @@ def test_get_pipeline(self, client):
             response_data.pop("modular_pipelines"),
             expected_modular_pipelines,
         )
-        assert response_data == {
+
+        # Extract and sort the layers field
+        response_data_layers_sorted = sorted(response_data["layers"])
+        expected_layers_sorted = sorted(["model_inputs", "raw"])
+        assert response_data_layers_sorted == expected_layers_sorted
+
+        # Remove the layers field from response_data for further comparison
+        response_data.pop("layers")
+
+        # Expected response without the layers field
+        expected_response_without_layers = {
             "tags": [
                 {"id": "split", "name": "split"},
                 {"id": "train", "name": "train"},
             ],
-            "layers": ["model_inputs", "raw"],
             "pipelines": [
                 {"id": "__default__", "name": "__default__"},
                 {"id": "data_science", "name": "data_science"},
                 {"id": "data_processing", "name": "data_processing"},
             ],
             "selected_pipeline": "data_science",
         }
+        assert response_data == expected_response_without_layers
 
     def test_get_non_existing_pipeline(self, client):
         response = client.get("/api/pipelines/foo")

@@ -169,7 +169,8 @@ def test_sort_layers(graph_schema, nodes, node_dependencies, expected):
         )
         for node_id, node_dict in nodes.items()
     }
-    assert sort_layers(nodes, node_dependencies) == expected, graph_schema
+    sorted_layers = sort_layers(nodes, node_dependencies)
+    assert sorted(sorted_layers) == sorted(expected), graph_schema
 
 
 def test_sort_layers_should_return_empty_list_on_cyclic_layers(mocker):
@@ -192,8 +193,12 @@ def test_sort_layers_should_return_empty_list_on_cyclic_layers(mocker):
         )
         for node_id, node_dict in data.items()
     }
-    node_dependencies = {"node_1": {"node_2"}, "node_2": {"node_3"}, "node_3": set()}
+    node_dependencies = {
+        "node_1": {"node_2"},
+        "node_2": {"node_3"},
+        "node_3": {"node_1"},
+    }
     assert not sort_layers(nodes, node_dependencies)
     mocked_warning.assert_called_once_with(
-        "Layers visualisation is disabled as circular dependency detected among layers.",
+        "Layers visualisation is disabled as circular dependency detected among layers."
     )