Skip to content

Commit

Permalink
Lightweight Kedro Viz Experimentation using AST (#1966)
Browse files Browse the repository at this point in the history
* merge main from remote

Signed-off-by: ravi-kumar-pilla <[email protected]>

* partially working parser - WIP

Signed-off-by: ravi-kumar-pilla <[email protected]>

* partial working commit

Signed-off-by: ravi-kumar-pilla <[email protected]>

* testing show code

Signed-off-by: ravi-kumar-pilla <[email protected]>

* adjust file permissions

Signed-off-by: ravi-kumar-pilla <[email protected]>

* update comments and rename parser file

Signed-off-by: ravi-kumar-pilla <[email protected]>

* remove gitignore

Signed-off-by: ravi-kumar-pilla <[email protected]>

* handle func lambda case

Signed-off-by: ravi-kumar-pilla <[email protected]>

* mocking working draft proposal

* reuse session with mock modules

* wip integration tests

* sporadically working; needs testing

* update sys modules with patch

* fix lint and pytests

* add dataset factories test

* add e2e test

* fix CI

* dataset factory pattern support in lite mode

* add doc strings

* add e2e test and clear unused func

* testing relative to absolute imports

* testing relative imports

* working draft for relative imports multi-level

* remove resolving relative dependencies

* test

* working draft

* modify test and standalone support for lite

* improve readability

* fix lint and pytest

* revert link redirect

* remove side effects

* pr suggestions addressed

* fix dict issue

* moved package check under dirs and add exception block

---------

Signed-off-by: ravi-kumar-pilla <[email protected]>
  • Loading branch information
ravi-kumar-pilla authored Sep 3, 2024
1 parent 8620181 commit 023a05b
Show file tree
Hide file tree
Showing 12 changed files with 1,071 additions and 46 deletions.
33 changes: 33 additions & 0 deletions package/features/steps/cli_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,16 @@ def exec_viz_command(context):
)


@when("I execute the kedro viz run command with lite option")
def exec_viz_lite_command(context):
"""Execute Kedro-Viz command."""
context.result = ChildTerminatingPopen(
[context.kedro, "viz", "run", "--lite", "--no-browser"],
env=context.env,
cwd=str(context.root_project_dir),
)


@then("kedro-viz should start successfully")
def check_kedroviz_up(context):
"""Check that Kedro-Viz is up and responding to requests."""
Expand All @@ -169,3 +179,26 @@ def check_kedroviz_up(context):
)
finally:
context.result.terminate()


@then("I store the response from main endpoint")
def get_main_api_response(context):
max_duration = 30 # 30 seconds
end_by = time() + max_duration

while time() < end_by:
try:
response = requests.get("http://localhost:4141/api/main")
context.response = response.json()
assert response.status_code == 200
except Exception:
sleep(2.0)
continue
else:
break


@then("I compare the responses in regular and lite mode")
def compare_main_api_responses(context):
regular_mode_response = requests.get("http://localhost:4141/api/main").json()
assert context.response == regular_mode_response
14 changes: 14 additions & 0 deletions package/features/viz.feature
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,17 @@ Feature: Viz plugin in new project
When I execute the kedro viz run command
Then kedro-viz should start successfully

Scenario: Execute viz lite with latest Kedro
Given I have installed kedro version "latest"
And I have run a non-interactive kedro new with spaceflights-pandas starter
When I execute the kedro viz run command with lite option
Then kedro-viz should start successfully

Scenario: Compare viz responses in regular and lite mode
Given I have installed kedro version "latest"
And I have run a non-interactive kedro new with spaceflights-pandas starter
When I execute the kedro viz run command with lite option
Then I store the response from main endpoint
Given I have installed the project's requirements
When I execute the kedro viz run command
Then I compare the responses in regular and lite mode
14 changes: 13 additions & 1 deletion package/kedro_viz/data_access/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

import networkx as nx
from kedro.io import DataCatalog
from kedro.io.core import DatasetError
from kedro.io.memory_dataset import MemoryDataset
from kedro.pipeline import Pipeline as KedroPipeline
from kedro.pipeline.node import Node as KedroNode
from sqlalchemy.orm import sessionmaker
Expand Down Expand Up @@ -316,7 +318,17 @@ def add_dataset(
Returns:
The GraphNode instance representing the dataset that was added to the NodesRepository.
"""
obj = self.catalog.get_dataset(dataset_name)
try:
obj = self.catalog.get_dataset(dataset_name)
except DatasetError:
# This is to handle dataset factory patterns when running
# Kedro Viz in lite mode. The `get_dataset` function
# of DataCatalog calls AbstractDataset.from_config
# which tries to create a Dataset instance from the pattern

# pylint: disable=abstract-class-instantiated
obj = MemoryDataset() # type: ignore[abstract]

layer = self.catalog.get_layer_for_dataset(dataset_name)
graph_node: Union[DataNode, TranscodedDataNode, ParametersNode]
(
Expand Down
76 changes: 76 additions & 0 deletions package/kedro_viz/integrations/kedro/data_catalog_lite.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
"""``DataCatalogLite`` is a custom implementation of Kedro's ``DataCatalog``
to provide a MemoryDataset instance when running Kedro-Viz in lite mode.
"""

import copy
from typing import Any, Optional

from kedro.io.core import AbstractDataset, DatasetError, generate_timestamp
from kedro.io.data_catalog import DataCatalog, _resolve_credentials
from kedro.io.memory_dataset import MemoryDataset


class DataCatalogLite(DataCatalog):
    """A lenient ``DataCatalog`` used when Kedro-Viz runs in lite mode.

    Overrides ``DataCatalog.from_config`` so that any catalog entry whose
    dataset cannot be instantiated (missing dependency, bad credentials key)
    is substituted with a ``MemoryDataset`` placeholder instead of raising.
    """

    @classmethod
    def from_config(
        cls,
        catalog: Optional[dict[str, dict[str, Any]]],
        credentials: Optional[dict[str, dict[str, Any]]] = None,
        load_versions: Optional[dict[str, str]] = None,
        save_version: Optional[str] = None,
    ) -> DataCatalog:
        """Build a catalog from config, downgrading failures to MemoryDataset.

        Mirrors the upstream ``DataCatalog.from_config`` flow: resolve
        credentials, split factory patterns from concrete datasets, sort the
        patterns and promote a trailing catch-all pattern to the default.
        """
        # Deep-copy the inputs so callers' dicts are never mutated.
        catalog = copy.deepcopy(catalog) or {}
        credentials = copy.deepcopy(credentials) or {}
        load_versions = copy.deepcopy(load_versions) or {}
        save_version = save_version or generate_timestamp()

        datasets = {}
        dataset_patterns = {}
        user_default = {}

        for name, config in catalog.items():
            if not isinstance(config, dict):
                raise DatasetError(
                    f"Catalog entry '{name}' is not a valid dataset configuration. "
                    "\nHint: If this catalog entry is intended for variable interpolation, "
                    "make sure that the key is preceded by an underscore."
                )

            try:
                config = _resolve_credentials(config, credentials)  # noqa: PLW2901
                if cls._is_pattern(name):
                    # Factory patterns are collected separately; they are
                    # resolved to datasets lazily, not instantiated here.
                    dataset_patterns[name] = config
                else:
                    try:
                        datasets[name] = AbstractDataset.from_config(
                            name, config, load_versions.get(name), save_version
                        )
                    except DatasetError:
                        # Instantiation failed (e.g. dependency not
                        # installed) - fall back to an in-memory stand-in.
                        # pylint: disable=abstract-class-instantiated
                        datasets[name] = MemoryDataset()  # type: ignore[abstract]
            except KeyError:
                # Credential (or similar) lookup failed - same fallback.
                # pylint: disable=abstract-class-instantiated
                datasets[name] = MemoryDataset()  # type: ignore[abstract]

        sorted_patterns = cls._sort_patterns(dataset_patterns)
        if sorted_patterns:
            # A trailing zero-specificity (catch-all) pattern becomes the
            # catalog's default pattern rather than an ordinary one.
            last_key = list(sorted_patterns)[-1]
            if cls._specificity(last_key) == 0:
                user_default = {last_key: sorted_patterns.pop(last_key)}

        return cls(
            datasets=datasets,
            dataset_patterns=sorted_patterns,
            load_versions=load_versions,
            save_version=save_version,
            default_pattern=user_default,
        )
99 changes: 84 additions & 15 deletions package/kedro_viz/integrations/kedro/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,22 @@

import json
import logging
import sys
from pathlib import Path
from typing import Any, Dict, Optional, Tuple
from typing import Any, Dict, Optional, Set, Tuple
from unittest.mock import patch

This comment has been minimized.

Copy link
@astrojuanlu

astrojuanlu Sep 16, 2024

Member

@rashidakanchwala Not a blocker for release, but I don't know how I feel about having non-test code make use of unittest.mock. Stack Overflow, Reddit, and ChatGPT all agree that it's not a good practice... do you think we should open an issue to explore alternative approaches? cc @ravi-kumar-pilla

This comment has been minimized.

Copy link
@ravi-kumar-pilla

ravi-kumar-pilla Sep 16, 2024

Author Contributor

Hi @astrojuanlu , Thank you for raising this. I felt the same initially but I was updating/reverting sys modules directly without having a mock context which might not be safe to do. After discussing with Nok, the idea of having a mock context made sense. At this moment I am not sure how we can mock a module other than to use a mock library. Happy to explore more if this is a concern. Thank you


from kedro import __version__
from kedro.framework.project import configure_project, pipelines
from kedro.framework.project import configure_project, pipelines, settings
from kedro.framework.session import KedroSession
from kedro.framework.session.store import BaseSessionStore
from kedro.framework.startup import bootstrap_project
from kedro.io import DataCatalog
from kedro.pipeline import Pipeline

from kedro_viz.constants import VIZ_METADATA_ARGS
from kedro_viz.integrations.kedro.data_catalog_lite import DataCatalogLite
from kedro_viz.integrations.kedro.lite_parser import LiteParser

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -69,33 +73,29 @@ def _get_dataset_stats(project_path: Path) -> Dict:
return {}


def load_data(
def _load_data_helper(
project_path: Path,
env: Optional[str] = None,
include_hooks: bool = False,
package_name: Optional[str] = None,
extra_params: Optional[Dict[str, Any]] = None,
) -> Tuple[DataCatalog, Dict[str, Pipeline], BaseSessionStore, Dict]:
"""Load data from a Kedro project.
is_lite: bool = False,
):
"""Helper to load data from a Kedro project.
Args:
project_path: the path where the Kedro project is located.
env: the Kedro environment to load the data. If not provided.
it will use Kedro default, which is local.
include_hooks: A flag to include all registered hooks in your Kedro Project.
package_name: The name of the current package
extra_params: Optional dictionary containing extra project parameters
for underlying KedroContext. If specified, will update (and therefore
take precedence over) the parameters retrieved from the project
configuration.
is_lite: A flag to run Kedro-Viz in lite mode.
Returns:
A tuple containing the data catalog and the pipeline dictionary
and the session store.
A tuple containing the data catalog, pipeline dictionary, session store
and dataset stats dictionary.
"""
if package_name:
configure_project(package_name)
else:
# bootstrap project when viz is run in dev mode
bootstrap_project(project_path)

with KedroSession.create(
project_path=project_path,
Expand All @@ -109,12 +109,81 @@ def load_data(

context = session.load_context()
session_store = session._store

# Update the DataCatalog class for a custom implementation
# to handle kedro.io.core.DatasetError from
# `settings.DATA_CATALOG_CLASS.from_config`
if is_lite:
settings.DATA_CATALOG_CLASS = DataCatalogLite

catalog = context.catalog

# Pipelines is a lazy dict-like object, so we force it to populate here
# in case user doesn't have an active session down the line when it's first accessed.
# Useful for users who have `get_current_session` in their `register_pipelines()`.
pipelines_dict = dict(pipelines)
stats_dict = _get_dataset_stats(project_path)

return catalog, pipelines_dict, session_store, stats_dict


def load_data(
    project_path: Path,
    env: Optional[str] = None,
    include_hooks: bool = False,
    package_name: Optional[str] = None,
    extra_params: Optional[Dict[str, Any]] = None,
    is_lite: bool = False,
) -> Tuple[DataCatalog, Dict[str, Pipeline], BaseSessionStore, Dict]:
    """Load data from a Kedro project.
    Args:
        project_path: the path where the Kedro project is located.
        env: the Kedro environment to load the data. If not provided.
        it will use Kedro default, which is local.
        include_hooks: A flag to include all registered hooks in your Kedro Project.
        package_name: The name of the current package
        extra_params: Optional dictionary containing extra project parameters
        for underlying KedroContext. If specified, will update (and therefore
        take precedence over) the parameters retrieved from the project
        configuration.
        is_lite: A flag to run Kedro-Viz in lite mode.
    Returns:
        A tuple containing the data catalog, pipeline dictionary, session store
        and dataset stats dictionary.
    """
    if package_name:
        configure_project(package_name)
    else:
        # bootstrap project when viz is run in dev mode
        bootstrap_project(project_path)

    if is_lite:
        lite_parser = LiteParser(package_name)
        unresolved_imports = lite_parser.parse(project_path)
        sys_modules_patch = sys.modules.copy()

        # An empty (or None) mapping means every project dependency
        # resolved, so nothing needs to be mocked.
        if unresolved_imports:
            # Flatten the per-file sets of unresolved module names into one set.
            modules_to_mock: Set[str] = set().union(*unresolved_imports.values())

            mocked_modules = lite_parser.create_mock_modules(modules_to_mock)
            sys_modules_patch.update(mocked_modules)

            logger.warning(
                "Kedro-Viz has mocked the following dependencies for lite-mode.\n"
                "%s \n"
                "In order to get a complete experience of Viz, "
                "please install the missing Kedro project dependencies\n",
                list(mocked_modules.keys()),
            )

        # Patch sys.modules so that project imports of missing packages
        # succeed while the Kedro session loads the project.
        with patch.dict("sys.modules", sys_modules_patch):
            return _load_data_helper(
                project_path, env, include_hooks, extra_params, is_lite
            )

    return _load_data_helper(project_path, env, include_hooks, extra_params, is_lite)
Loading

0 comments on commit 023a05b

Please sign in to comment.