Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace toposort with graphlib #1942

Merged
merged 23 commits into from
Jul 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ Please follow the established format:
- Relax `packaging` pin in requirements. (#1947)
- Add favicon to kedro-viz documentation. (#1959)
- Fix bug related to nested namespace pipelines. (#1897)
- Migrate from `toposort` to `graphlib`. (#1942)

# Release 9.1.0

Expand Down
44 changes: 28 additions & 16 deletions package/kedro_viz/services/layers.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
"""`kedro_viz.services.layers` defines layers-related logic."""
import logging
from collections import defaultdict
from graphlib import CycleError, TopologicalSorter
from typing import Dict, List, Set

from toposort import CircularDependencyError, toposort_flatten

from kedro_viz.models.flowchart import GraphNode

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -33,9 +32,9 @@ def sort_layers(
that have already been visited.
* Turn the final {node_id -> layers} into a {layer -> layers} to represent the layers'
dependencies. Note: the key is a layer and the values are the parents of that layer,
just because that's the format toposort requires.
* Feed this layers dictionary to ``toposort`` and return the sorted values.
* Raise CircularDependencyError if the layers cannot be sorted topologically,
just because that's the format TopologicalSorter requires.
* Takes layers dictionary to ``graphlib.TopologicalSorter`` and return the sorted values.
* Raise CycleError if the layers cannot be sorted topologically,
i.e. there are cycles among the layers.

Args:
Expand All @@ -47,7 +46,7 @@ def sort_layers(
The list of layers sorted based on topological order.

Raises:
CircularDependencyError: When the layers have cyclic dependencies.
CycleError: When the layers have cyclic dependencies.
"""
node_layers: Dict[str, Set[str]] = {} # map node_id to the layers that depend on it

Expand Down Expand Up @@ -84,27 +83,40 @@ def find_child_layers(node_id: str) -> Set[str]:
return node_layers[node_id]

# populate node_layers dependencies
for node_id in nodes:
for node_id in sorted(nodes): # Sort nodes
find_child_layers(node_id)

# compute the layer dependencies dictionary based on the node_layers dependencies,
# represented as {layer -> set(parent_layers)}
layer_dependencies = defaultdict(set)
all_layers = set() # keep track of all layers encountered
for node_id, child_layers in node_layers.items():
node_layer = getattr(nodes[node_id], "layer", None)

# add the node's layer as a parent layer for all child layers.
# Even if a child layer is the same as the node's layer, i.e. a layer is marked
# as its own parent, toposort still works so we don't need to check for that explicitly.
if node_layer is not None:
all_layers.add(node_layer)
for layer in child_layers:
layer_dependencies[layer].add(node_layer)
all_layers.add(layer)
# Avoid adding the node's layer as a parent of itself
if layer != node_layer:
layer_dependencies[layer].add(node_layer)

# toposort the layer_dependencies to find the layer order.
# Note that for string, toposort_flatten will default to alphabetical order for tie-break.
# Add empty dependencies for all layers to ensure they appear in the sorting
for layer in all_layers:
if layer not in layer_dependencies:
layer_dependencies[layer] = set()

# Use graphlib.TopologicalSorter to sort the layer dependencies.
try:
return toposort_flatten(layer_dependencies)
except CircularDependencyError:
sorter = TopologicalSorter(layer_dependencies)
sorted_layers = list(sorter.static_order())
# Ensure the order is stable and respects the original input order
# `sorted_layers.index(layer)` ensures the order from topological sorting is preserved.
# `layer` ensures that if two layers have the same dependency level,
# they are sorted alphabetically.
return sorted(
sorted_layers, key=lambda layer: (sorted_layers.index(layer), layer)
)
except CycleError:
logger.warning(
"Layers visualisation is disabled as circular dependency detected among layers."
)
Expand Down
2 changes: 0 additions & 2 deletions package/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,3 @@ sqlalchemy>=1.4, <3
strawberry-graphql>=0.192.0, <1.0
uvicorn[standard]~=0.30.0
watchgod~=0.8.2
# [TODO: Need to drop toposort dependency in favor of in-built graphlib]
toposort>=1.5 # Needs to be at least 1.5 to be able to raise CircularDependencyError
14 changes: 12 additions & 2 deletions package/tests/test_api/test_rest/test_responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -804,19 +804,29 @@ def test_get_pipeline(self, client):
response_data.pop("modular_pipelines"),
expected_modular_pipelines,
)
assert response_data == {

# Extract and sort the layers field
response_data_layers_sorted = sorted(response_data["layers"])
expected_layers_sorted = sorted(["model_inputs", "raw"])
assert response_data_layers_sorted == expected_layers_sorted

# Remove the layers field from response_data for further comparison
response_data.pop("layers")

# Expected response without the layers field
expected_response_without_layers = {
"tags": [
{"id": "split", "name": "split"},
{"id": "train", "name": "train"},
],
"layers": ["model_inputs", "raw"],
"pipelines": [
{"id": "__default__", "name": "__default__"},
{"id": "data_science", "name": "data_science"},
{"id": "data_processing", "name": "data_processing"},
],
"selected_pipeline": "data_science",
}
assert response_data == expected_response_without_layers

def test_get_non_existing_pipeline(self, client):
response = client.get("/api/pipelines/foo")
Expand Down
11 changes: 8 additions & 3 deletions package/tests/test_services/test_layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,8 @@ def test_sort_layers(graph_schema, nodes, node_dependencies, expected):
)
for node_id, node_dict in nodes.items()
}
assert sort_layers(nodes, node_dependencies) == expected, graph_schema
sorted_layers = sort_layers(nodes, node_dependencies)
assert sorted(sorted_layers) == sorted(expected), graph_schema


def test_sort_layers_should_return_empty_list_on_cyclic_layers(mocker):
Expand All @@ -192,8 +193,12 @@ def test_sort_layers_should_return_empty_list_on_cyclic_layers(mocker):
)
for node_id, node_dict in data.items()
}
node_dependencies = {"node_1": {"node_2"}, "node_2": {"node_3"}, "node_3": set()}
node_dependencies = {
"node_1": {"node_2"},
"node_2": {"node_3"},
"node_3": {"node_1"},
}
assert not sort_layers(nodes, node_dependencies)
mocked_warning.assert_called_once_with(
"Layers visualisation is disabled as circular dependency detected among layers.",
"Layers visualisation is disabled as circular dependency detected among layers."
)
Loading