diff --git a/package/features/steps/cli_steps.py b/package/features/steps/cli_steps.py
index c452217521..769cb08d64 100644
--- a/package/features/steps/cli_steps.py
+++ b/package/features/steps/cli_steps.py
@@ -147,6 +147,16 @@ def exec_viz_command(context):
     )
 
 
+@when("I execute the kedro viz run command with lite option")
+def exec_viz_lite_command(context):
+    """Execute Kedro-Viz command in lite mode."""
+    context.result = ChildTerminatingPopen(
+        [context.kedro, "viz", "run", "--lite", "--no-browser"],
+        env=context.env,
+        cwd=str(context.root_project_dir),
+    )
+
+
 @then("kedro-viz should start successfully")
 def check_kedroviz_up(context):
     """Check that Kedro-Viz is up and responding to requests."""
@@ -169,3 +179,26 @@ def check_kedroviz_up(context):
         )
     finally:
         context.result.terminate()
+
+
+@then("I store the response from main endpoint")
+def get_main_api_response(context):
+    max_duration = 30  # 30 seconds
+    end_by = time() + max_duration
+
+    while time() < end_by:
+        try:
+            response = requests.get("http://localhost:4141/api/main")
+            context.response = response.json()
+            assert response.status_code == 200
+        except Exception:
+            sleep(2.0)
+            continue
+        else:
+            break
+
+
+@then("I compare the responses in regular and lite mode")
+def compare_main_api_responses(context):
+    regular_mode_response = requests.get("http://localhost:4141/api/main").json()
+    assert context.response == regular_mode_response
diff --git a/package/features/viz.feature b/package/features/viz.feature
index 75c7b65fed..d3c01e2f7f 100644
--- a/package/features/viz.feature
+++ b/package/features/viz.feature
@@ -24,3 +24,17 @@ Feature: Viz plugin in new project
     When I execute the kedro viz run command
     Then kedro-viz should start successfully
 
+  Scenario: Execute viz lite with latest Kedro
+    Given I have installed kedro version "latest"
+    And I have run a non-interactive kedro new with spaceflights-pandas starter
+    When I execute the kedro viz run command with lite option
+    Then kedro-viz should start successfully
+
+  Scenario: Compare viz responses in regular and lite mode
+    Given I have installed kedro version "latest"
+    And I have run a non-interactive kedro new with spaceflights-pandas starter
+    When I execute the kedro viz run command with lite option
+    Then I store the response from main endpoint
+    Given I have installed the project's requirements
+    When I execute the kedro viz run command
+    Then I compare the responses in regular and lite mode
diff --git a/package/kedro_viz/data_access/managers.py b/package/kedro_viz/data_access/managers.py
index 4eb3e72130..afeba93586 100644
--- a/package/kedro_viz/data_access/managers.py
+++ b/package/kedro_viz/data_access/managers.py
@@ -7,6 +7,8 @@
 
 import networkx as nx
 from kedro.io import DataCatalog
+from kedro.io.core import DatasetError
+from kedro.io.memory_dataset import MemoryDataset
 from kedro.pipeline import Pipeline as KedroPipeline
 from kedro.pipeline.node import Node as KedroNode
 from sqlalchemy.orm import sessionmaker
@@ -316,7 +318,17 @@ def add_dataset(
         Returns:
             The GraphNode instance representing the dataset that was added to the NodesRepository.
         """
-        obj = self.catalog.get_dataset(dataset_name)
+        try:
+            obj = self.catalog.get_dataset(dataset_name)
+        except DatasetError:
+            # This handles dataset factory patterns when running
+            # Kedro-Viz in lite mode: `get_dataset` on the DataCatalog
+            # calls AbstractDataset.from_config, which tries to create
+            # a dataset instance from the unresolved pattern.
+
+            # pylint: disable=abstract-class-instantiated
+            obj = MemoryDataset()  # type: ignore[abstract]
+
         layer = self.catalog.get_layer_for_dataset(dataset_name)
         graph_node: Union[DataNode, TranscodedDataNode, ParametersNode]
         (
diff --git a/package/kedro_viz/integrations/kedro/data_catalog_lite.py b/package/kedro_viz/integrations/kedro/data_catalog_lite.py
new file mode 100755
index 0000000000..11c179cdb9
--- /dev/null
+++ b/package/kedro_viz/integrations/kedro/data_catalog_lite.py
@@ -0,0 +1,76 @@
+"""``DataCatalogLite`` is a custom implementation of Kedro's ``DataCatalog``
+to provide a MemoryDataset instance when running Kedro-Viz in lite mode.
+"""
+
+import copy
+from typing import Any, Dict, Optional
+
+from kedro.io.core import AbstractDataset, DatasetError, generate_timestamp
+from kedro.io.data_catalog import DataCatalog, _resolve_credentials
+from kedro.io.memory_dataset import MemoryDataset
+
+
+class DataCatalogLite(DataCatalog):
+    """``DataCatalogLite`` is a custom implementation of Kedro's ``DataCatalog``
+    to provide a MemoryDataset instance by overriding ``from_config`` of ``DataCatalog``
+    when running Kedro-Viz in lite mode.
+    """
+
+    @classmethod
+    def from_config(
+        cls,
+        catalog: Optional[Dict[str, Dict[str, Any]]],
+        credentials: Optional[Dict[str, Dict[str, Any]]] = None,
+        load_versions: Optional[Dict[str, str]] = None,
+        save_version: Optional[str] = None,
+    ) -> DataCatalog:
+        datasets = {}
+        dataset_patterns = {}
+        catalog = copy.deepcopy(catalog) or {}
+        credentials = copy.deepcopy(credentials) or {}
+        save_version = save_version or generate_timestamp()
+        load_versions = copy.deepcopy(load_versions) or {}
+        user_default = {}
+
+        for ds_name, ds_config in catalog.items():
+            if not isinstance(ds_config, dict):
+                raise DatasetError(
+                    f"Catalog entry '{ds_name}' is not a valid dataset configuration. "
+                    "\nHint: If this catalog entry is intended for variable interpolation, "
+                    "make sure that the key is preceded by an underscore."
+                )
+
+            try:
+                ds_config = _resolve_credentials(
+                    ds_config, credentials
+                )  # noqa: PLW2901
+                if cls._is_pattern(ds_name):
+                    # Add each factory to the dataset_patterns dict.
+                    dataset_patterns[ds_name] = ds_config
+
+                else:
+                    try:
+                        datasets[ds_name] = AbstractDataset.from_config(
+                            ds_name, ds_config, load_versions.get(ds_name), save_version
+                        )
+                    except DatasetError:
+                        # pylint: disable=abstract-class-instantiated
+                        datasets[ds_name] = MemoryDataset()  # type: ignore[abstract]
+            except KeyError:
+                # pylint: disable=abstract-class-instantiated
+                datasets[ds_name] = MemoryDataset()  # type: ignore[abstract]
+
+        sorted_patterns = cls._sort_patterns(dataset_patterns)
+        if sorted_patterns:
+            # If the last pattern is a catch-all pattern, pop it and set it as the default
+            if cls._specificity(list(sorted_patterns.keys())[-1]) == 0:
+                last_pattern = sorted_patterns.popitem()
+                user_default = {last_pattern[0]: last_pattern[1]}
+
+        return cls(
+            datasets=datasets,
+            dataset_patterns=sorted_patterns,
+            load_versions=load_versions,
+            save_version=save_version,
+            default_pattern=user_default,
+        )
diff --git a/package/kedro_viz/integrations/kedro/data_loader.py b/package/kedro_viz/integrations/kedro/data_loader.py
index 1ac1521e61..e83c377e95 100644
--- a/package/kedro_viz/integrations/kedro/data_loader.py
+++ b/package/kedro_viz/integrations/kedro/data_loader.py
@@ -7,11 +7,13 @@
 
 import json
 import logging
+import sys
 from pathlib import Path
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, Optional, Set, Tuple
+from unittest.mock import patch
 
 from kedro import __version__
-from kedro.framework.project import configure_project, pipelines
+from kedro.framework.project import configure_project, pipelines, settings
 from kedro.framework.session import KedroSession
 from kedro.framework.session.store import BaseSessionStore
 from kedro.framework.startup import bootstrap_project
@@ -19,6 +21,8 @@
 from kedro.pipeline import Pipeline
 
 from kedro_viz.constants import VIZ_METADATA_ARGS
+from kedro_viz.integrations.kedro.data_catalog_lite import DataCatalogLite
+from kedro_viz.integrations.kedro.lite_parser import LiteParser
 
 logger = logging.getLogger(__name__)
 
@@ -69,33 +73,29 @@ def _get_dataset_stats(project_path: Path) -> Dict:
         return {}
 
 
-def load_data(
+def _load_data_helper(
     project_path: Path,
     env: Optional[str] = None,
     include_hooks: bool = False,
-    package_name: Optional[str] = None,
     extra_params: Optional[Dict[str, Any]] = None,
-) -> Tuple[DataCatalog, Dict[str, Pipeline], BaseSessionStore, Dict]:
-    """Load data from a Kedro project.
+    is_lite: bool = False,
+):
+    """Helper to load data from a Kedro project.
+
     Args:
         project_path: the path where the Kedro project is located.
         env: the Kedro environment to load the data. If not provided.
             it will use Kedro default, which is local.
         include_hooks: A flag to include all registered hooks in your Kedro Project.
-        package_name: The name of the current package
         extra_params: Optional dictionary containing extra project parameters
             for underlying KedroContext. If specified, will update (and therefore
            take precedence over) the parameters retrieved from the project
            configuration.
+        is_lite: A flag to run Kedro-Viz in lite mode.
     Returns:
-        A tuple containing the data catalog and the pipeline dictionary
-        and the session store.
+        A tuple containing the data catalog, pipeline dictionary, session store
+        and dataset stats dictionary.
""" - if package_name: - configure_project(package_name) - else: - # bootstrap project when viz is run in dev mode - bootstrap_project(project_path) with KedroSession.create( project_path=project_path, @@ -109,6 +109,13 @@ def load_data( context = session.load_context() session_store = session._store + + # Update the DataCatalog class for a custom implementation + # to handle kedro.io.core.DatasetError from + # `settings.DATA_CATALOG_CLASS.from_config` + if is_lite: + settings.DATA_CATALOG_CLASS = DataCatalogLite + catalog = context.catalog # Pipelines is a lazy dict-like object, so we force it to populate here @@ -116,5 +123,67 @@ def load_data( # Useful for users who have `get_current_session` in their `register_pipelines()`. pipelines_dict = dict(pipelines) stats_dict = _get_dataset_stats(project_path) - return catalog, pipelines_dict, session_store, stats_dict + + +def load_data( + project_path: Path, + env: Optional[str] = None, + include_hooks: bool = False, + package_name: Optional[str] = None, + extra_params: Optional[Dict[str, Any]] = None, + is_lite: bool = False, +) -> Tuple[DataCatalog, Dict[str, Pipeline], BaseSessionStore, Dict]: + """Load data from a Kedro project. + Args: + project_path: the path where the Kedro project is located. + env: the Kedro environment to load the data. If not provided. + it will use Kedro default, which is local. + include_hooks: A flag to include all registered hooks in your Kedro Project. + package_name: The name of the current package + extra_params: Optional dictionary containing extra project parameters + for underlying KedroContext. If specified, will update (and therefore + take precedence over) the parameters retrieved from the project + configuration. + is_lite: A flag to run Kedro-Viz in lite mode. + Returns: + A tuple containing the data catalog, pipeline dictionary, session store + and dataset stats dictionary. 
+ """ + if package_name: + configure_project(package_name) + else: + # bootstrap project when viz is run in dev mode + bootstrap_project(project_path) + + if is_lite: + lite_parser = LiteParser(package_name) + unresolved_imports = lite_parser.parse(project_path) + sys_modules_patch = sys.modules.copy() + + if unresolved_imports and len(unresolved_imports) > 0: + modules_to_mock: Set[str] = set() + + for unresolved_module_set in unresolved_imports.values(): + modules_to_mock = modules_to_mock.union(unresolved_module_set) + + mocked_modules = lite_parser.create_mock_modules(modules_to_mock) + sys_modules_patch.update(mocked_modules) + + logger.warning( + "Kedro-Viz has mocked the following dependencies for lite-mode.\n" + "%s \n" + "In order to get a complete experience of Viz, " + "please install the missing Kedro project dependencies\n", + list(mocked_modules.keys()), + ) + + # Patch actual sys modules + with patch.dict("sys.modules", sys_modules_patch): + return _load_data_helper( + project_path, env, include_hooks, extra_params, is_lite + ) + else: + return _load_data_helper( + project_path, env, include_hooks, extra_params, is_lite + ) diff --git a/package/kedro_viz/integrations/kedro/lite_parser.py b/package/kedro_viz/integrations/kedro/lite_parser.py new file mode 100755 index 0000000000..bd9fcb6824 --- /dev/null +++ b/package/kedro_viz/integrations/kedro/lite_parser.py @@ -0,0 +1,273 @@ +"""`kedro_viz.integrations.kedro.lite_parser` defines a Kedro parser using AST.""" + +import ast +import importlib.util +import logging +from pathlib import Path +from typing import Dict, List, Set, Union +from unittest.mock import MagicMock + +logger = logging.getLogger(__name__) + + +class LiteParser: + """Represents a Kedro Parser which uses AST + + Args: + package_name (Union[str, None]): The name of the current package + """ + + def __init__(self, package_name: Union[str, None] = None) -> None: + self._package_name = package_name + + @staticmethod + def _is_module_importable(module_name: str) -> bool: + """Checks if a module is importable + + Args: + module_name (str): The name of the module to check + importability + Returns: + Whether the module can be imported + """ + try: + # Check if the module can be importable + # In case of submodule (contains a dot, e.g: sklearn.linear_model), + # find_spec imports the parent module + if importlib.util.find_spec(module_name) is None: + return False + return True + except ModuleNotFoundError as mnf_exc: + logger.debug( + "ModuleNotFoundError in resolving %s : %s", module_name, mnf_exc + ) + return False + except ImportError as imp_exc: + logger.debug("ImportError in resolving %s : %s", module_name, imp_exc) + return False + except ValueError as val_exc: + logger.debug("ValueError in resolving %s : %s", module_name, val_exc) + return False + # pylint: disable=broad-except + except Exception as exc: # pragma: no cover + logger.debug( + "An exception occurred while resolving %s : %s", module_name, exc + ) + return False + + @staticmethod + def _get_module_parts(module_name: str) -> List[str]: + """Creates a list of module parts to check for importability + + Args: + module_name (str): The module name to split + + Returns: + A list of module parts + + Example: + >>> LiteParser._get_module_parts("kedro.framework.project") + ["kedro", "kedro.framework", "kedro.framework.project"] + + """ + module_split = module_name.split(".") + full_module_name = "" + module_parts = [] + + for idx, sub_module_name in enumerate(module_split): + full_module_name = ( + 
+                sub_module_name if idx == 0 else f"{full_module_name}.{sub_module_name}"
+            )
+            module_parts.append(full_module_name)
+
+        return module_parts
+
+    def _is_relative_import(self, module_name: str, project_file_paths: Set[Path]):
+        """Checks if a module is a relative import. This is needed
+        in dev or standalone mode when the package_name is None and
+        internal package files have unresolved external dependencies
+
+        Args:
+            module_name (str): The name of the module to check
+            project_file_paths (Set[Path]): A set of project file paths
+
+        Returns:
+            Whether the module is a relative import starting
+                from the root package dir
+
+        Example:
+            >>> lite_parser_obj = LiteParser()
+            >>> module_name = "kedro_project_package.pipelines.reporting.nodes"
+            >>> project_file_paths = set([Path("/path/to/relative/file")])
+            >>> lite_parser_obj._is_relative_import(module_name, project_file_paths)
+            True
+        """
+        relative_module_path = module_name.replace(".", "/")
+
+        # Check if the relative_module_path
+        # is a substring of current project file path
+        is_relative_import_path = any(
+            relative_module_path in str(project_file_path)
+            for project_file_path in project_file_paths
+        )
+
+        return is_relative_import_path
+
+    def _populate_missing_dependencies(
+        self, module_name: str, missing_dependencies: Set[str]
+    ) -> None:
+        """Helper to populate missing dependencies
+
+        Args:
+            module_name (str): The module name to check if it is importable
+            missing_dependencies (Set[str]): A set of missing dependencies
+
+        """
+        module_name_parts = self._get_module_parts(module_name)
+        for module_name_part in module_name_parts:
+            if (
+                not self._is_module_importable(module_name_part)
+                and module_name_part not in missing_dependencies
+            ):
+                missing_dependencies.add(module_name_part)
+
+    def _get_unresolved_imports(
+        self, file_path: Path, project_file_paths: Union[Set[Path], None] = None
+    ) -> Set[str]:
+        """Parse the file using AST and return any missing dependencies
+        in the current file
+
+        Args:
+            file_path (Path): The file path to parse
+            project_file_paths (Union[Set[Path], None]): A set of project file paths
+
+        Returns:
+            A set of missing dependencies
+        """
+
+        missing_dependencies: Set[str] = set()
+
+        # Read the file
+        with open(file_path, "r", encoding="utf-8") as file:
+            file_content = file.read()
+
+        # parse file content using ast
+        parsed_content_ast_node: ast.Module = ast.parse(file_content)
+        file_path = file_path.resolve()
+
+        # Explore each node in the AST tree
+        for node in ast.walk(parsed_content_ast_node):
+            # Handling dependencies that start with "import "
+            # Example: import logging
+            # Corresponding AST node will be:
+            # Import(names=[alias(name='logging')])
+            if isinstance(node, ast.Import):
+                for alias in node.names:
+                    module_name = alias.name
+                    self._populate_missing_dependencies(
+                        module_name, missing_dependencies
+                    )
+
+            # Handling dependencies that start with "from "
+            # Example: from typing import Dict, Union
+            # Corresponding AST node will be:
+            # ImportFrom(module='typing', names=[alias(name='Dict'),
+            #            alias(name='Union')],
+            #            level=0)
+            elif isinstance(node, ast.ImportFrom):
+                module_name = node.module if node.module else ""
+                level = node.level
+
+                # Ignore relative imports like "from . import a"
import a" + if not module_name: + continue + + # Ignore relative imports within the package + # Examples: + # "from demo_project.pipelines.reporting import test", + # "from ..nodes import func_test" + if (self._package_name and self._package_name in module_name) or ( + # dev or standalone mode + not self._package_name + and project_file_paths + and self._is_relative_import(module_name, project_file_paths) + ): + continue + + # absolute modules in the env + # Examples: + # from typing import Dict, Union + # from sklearn.linear_model import LinearRegression + if level == 0: + self._populate_missing_dependencies( + module_name, missing_dependencies + ) + + return missing_dependencies + + def create_mock_modules(self, unresolved_imports: Set[str]) -> Dict[str, MagicMock]: + """Creates mock modules for unresolved imports + + Args: + unresolved_imports (Set[str]): A set of unresolved imports + + Returns: + A dictionary of mocked modules for the unresolved imports + """ + mocked_modules: Dict[str, MagicMock] = {} + + for unresolved_import in unresolved_imports: + mocked_modules[unresolved_import] = MagicMock() + + return mocked_modules + + def parse(self, target_path: Path) -> Union[Dict[str, Set[str]], None]: + """Parses the file(s) in the target path and returns + any unresolved imports for all the dependency errors + as a dictionary of file(s) in the target path and a set of module names + + Args: + target_path (Path): The path to parse file(s) + + Returns: + A dictionary of file(s) in the target path and a set of module names + """ + + if not target_path.exists(): + logger.warning("Path `%s` does not exist", str(target_path)) + return None + + unresolved_imports: Dict[str, Set[str]] = {} + + if target_path.is_file(): + missing_dependencies = self._get_unresolved_imports(target_path) + if len(missing_dependencies) > 0: + unresolved_imports[str(target_path)] = missing_dependencies + return unresolved_imports + + # handling directories + _project_file_paths = set(target_path.rglob("*.py")) + + for file_path in _project_file_paths: + try: + # Ensure the package name is in the file path + if self._package_name and self._package_name not in file_path.parts: + # we are only mocking the dependencies + # inside the package + continue + + missing_dependencies = self._get_unresolved_imports( + file_path, _project_file_paths + ) + if len(missing_dependencies) > 0: + unresolved_imports[str(file_path)] = missing_dependencies + # pylint: disable=broad-except + except Exception as exc: # pragma: no cover + logger.error( + "An error occurred in LiteParser while mocking dependencies : %s", + exc, + ) + continue + + return unresolved_imports diff --git a/package/kedro_viz/launchers/cli/run.py b/package/kedro_viz/launchers/cli/run.py index 54b179b4b8..87b2722330 100644 --- a/package/kedro_viz/launchers/cli/run.py +++ b/package/kedro_viz/launchers/cli/run.py @@ -78,6 +78,11 @@ help=PARAMS_ARG_HELP, callback=_split_params, ) +@click.option( + "--lite", + is_flag=True, + help="A flag to load an experimental light-weight Kedro Viz", +) # pylint: disable=import-outside-toplevel, too-many-locals def run( host, @@ -90,6 +95,7 @@ def run( autoreload, include_hooks, params, + lite, ): """Launch local Kedro Viz instance""" # Deferring Imports @@ -155,6 +161,7 @@ def run( "include_hooks": include_hooks, "package_name": PACKAGE_NAME, "extra_params": params, + "is_lite": lite, } if autoreload: from watchgod import RegExpWatcher, run_process diff --git a/package/kedro_viz/server.py b/package/kedro_viz/server.py index 
index b950cc8b77..76026ddbbf 100644
--- a/package/kedro_viz/server.py
+++ b/package/kedro_viz/server.py
@@ -51,16 +51,13 @@ def load_and_populate_data(
     package_name: Optional[str] = None,
     pipeline_name: Optional[str] = None,
     extra_params: Optional[Dict[str, Any]] = None,
+    is_lite: bool = False,
 ):
     """Loads underlying Kedro project data and populates Kedro Viz Repositories"""
 
     # Loads data from underlying Kedro Project
     catalog, pipelines, session_store, stats_dict = kedro_data_loader.load_data(
-        path,
-        env,
-        include_hooks,
-        package_name,
-        extra_params,
+        path, env, include_hooks, package_name, extra_params, is_lite
     )
 
     pipelines = (
@@ -86,6 +83,7 @@ def run_server(
     include_hooks: bool = False,
     package_name: Optional[str] = None,
     extra_params: Optional[Dict[str, Any]] = None,
+    is_lite: bool = False,
 ):  # pylint: disable=redefined-outer-name
     """Run a uvicorn server with a FastAPI app that either launches API
     response data from a file or from reading data from a real Kedro project.
@@ -108,6 +106,7 @@ def run_server(
             for underlying KedroContext. If specified, will update (and therefore
             take precedence over) the parameters retrieved from the project
             configuration.
+        is_lite: A flag to run Kedro-Viz in lite mode.
     """
     # Importing below dependencies inside `run_server` to avoid ImportError
     # when calling `load_and_populate_data` from VSCode
@@ -121,12 +120,7 @@ def run_server(
 
     if load_file is None:
         load_and_populate_data(
-            path,
-            env,
-            include_hooks,
-            package_name,
-            pipeline_name,
-            extra_params,
+            path, env, include_hooks, package_name, pipeline_name, extra_params, is_lite
         )
     # [TODO: As we can do this with `kedro viz build`,
     # we need to shift this feature outside of kedro viz run]
diff --git a/package/tests/test_data_access/test_managers.py b/package/tests/test_data_access/test_managers.py
index af94785cb9..d1c43162bc 100644
--- a/package/tests/test_data_access/test_managers.py
+++ b/package/tests/test_data_access/test_managers.py
@@ -3,6 +3,7 @@
 import networkx as nx
 import pytest
 from kedro.io import DataCatalog, MemoryDataset
+from kedro.io.core import DatasetError
 from kedro.pipeline import Pipeline, node
 from kedro.pipeline.modular_pipeline import pipeline
 from kedro_datasets.pandas import CSVDataset
@@ -27,25 +28,6 @@ def identity(x):
     return x
 
 
-def assert_expected_modular_pipeline_values_for_edge_cases(
-    expected_modular_pipeline_tree_obj,
-    modular_pipeline_node_id,
-    data_access_manager,
-    modular_pipeline_tree_values,
-    expected_key,
-):
-    """This asserts an `expected_key` value present in modular_pipeline_tree
-    that is constructed in the edge cases with the expected_modular_pipeline_tree"""
-    assert sorted(
-        list(expected_modular_pipeline_tree_obj[modular_pipeline_node_id][expected_key])
-    ) == sorted(
-        list(
-            data_access_manager.nodes.get_node_by_id(node_id).name
-            for node_id in modular_pipeline_tree_values
-        )
-    )
-
-
 class TestAddCatalog:
     def test_add_catalog(
         self,
@@ -378,6 +360,29 @@ def test_add_dataset_with_modular_pipeline(
             "uk.data_science",
         }
 
+    def test_add_dataset_with_unresolved_pattern(
+        self,
+        data_access_manager: DataAccessManager,
+        example_pipelines: Dict[str, Pipeline],
+        example_modular_pipelines_repo_obj,
+        mocker,
+    ):
+        dataset = CSVDataset(filepath="dataset.csv")
+        dataset_name = "companies#csv"
+        catalog = DataCatalog(datasets={dataset_name: dataset})
+        data_access_manager.add_catalog(catalog, example_pipelines)
+
+        with mocker.patch.object(
+            data_access_manager.catalog,
+            "get_dataset",
+            side_effect=DatasetError("Dataset not found"),
+        ):
+            dataset_obj = data_access_manager.add_dataset(
+                "my_pipeline", dataset_name, example_modular_pipelines_repo_obj
+            )
+
+        assert isinstance(dataset_obj.kedro_obj, MemoryDataset)
+
     def test_add_all_parameters(
         self,
         data_access_manager: DataAccessManager,
diff --git a/package/tests/test_integrations/test_data_catalog_lite.py b/package/tests/test_integrations/test_data_catalog_lite.py
new file mode 100644
index 0000000000..e68bc1bd6c
--- /dev/null
+++ b/package/tests/test_integrations/test_data_catalog_lite.py
@@ -0,0 +1,323 @@
+import logging
+import re
+import sys
+from copy import deepcopy
+
+import pandas as pd
+import pytest
+from kedro.io import DatasetError
+from kedro_datasets.pandas import CSVDataset
+from pandas.testing import assert_frame_equal
+
+from kedro_viz.integrations.kedro.data_catalog_lite import DataCatalogLite
+
+
+@pytest.fixture
+def filepath(tmp_path):
+    return (tmp_path / "some" / "dir" / "test.csv").as_posix()
+
+
+@pytest.fixture
+def dummy_dataframe():
+    return pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]})
+
+
+@pytest.fixture
+def sane_config(filepath):
+    return {
+        "catalog": {
+            "boats": {"type": "pandas.CSVDataset", "filepath": filepath},
+            "cars": {
+                "type": "pandas.CSVDataset",
+                "filepath": "s3://test_bucket/test_file.csv",
+                "credentials": "s3_credentials",
+            },
+        },
+        "credentials": {
+            "s3_credentials": {"key": "FAKE_ACCESS_KEY", "secret": "FAKE_SECRET_KEY"}
+        },
+    }
+
+
+@pytest.fixture
+def sane_config_with_nested_creds(sane_config):
+    sane_config["catalog"]["cars"]["credentials"] = {
+        "client_kwargs": {"credentials": "other_credentials"},
+        "key": "secret",
+    }
+    sane_config["credentials"]["other_credentials"] = {
+        "client_kwargs": {
+            "aws_access_key_id": "OTHER_FAKE_ACCESS_KEY",
+            "aws_secret_access_key": "OTHER_FAKE_SECRET_KEY",
+        }
+    }
+    return sane_config
+
+
+@pytest.fixture
+def config_with_dataset_factories():
+    return {
+        "catalog": {
+            "{brand}_cars": {
+                "type": "pandas.CSVDataset",
+                "filepath": "data/01_raw/{brand}_cars.csv",
+            },
+            "audi_cars": {
+                "type": "pandas.ParquetDataset",
+                "filepath": "data/01_raw/audi_cars.pq",
+            },
+            "{type}_boats": {
+                "type": "pandas.CSVDataset",
+                "filepath": "data/01_raw/{type}_boats.csv",
+            },
+            "{default1}": {
+                "type": "pandas.CSVDataset",
+                "filepath": "data/01_raw/{default1}.csv",
+            },
+        },
+    }
+
+
+@pytest.fixture
+def bad_config(filepath):
+    return {
+        "bad": {"type": "tests.io.test_data_catalog.BadDataset", "filepath": filepath}
+    }
+
+
+@pytest.fixture
+def data_catalog_lite_from_config(sane_config):
+    return DataCatalogLite.from_config(**sane_config)
+
+
+class TestDataCatalogLiteFromConfig:
+    def test_from_sane_config(self, data_catalog_lite_from_config, dummy_dataframe):
+        """Test populating the data catalog from config"""
+        data_catalog_lite_from_config.save("boats", dummy_dataframe)
+        reloaded_df = data_catalog_lite_from_config.load("boats")
+        assert_frame_equal(reloaded_df, dummy_dataframe)
+
+    def test_config_missing_type(self, sane_config):
+        """Check for no error if type attribute is missing for some data set(s)
+        in the config"""
+        del sane_config["catalog"]["boats"]["type"]
+        try:
+            # DataCatalogLite should not raise DatasetError
+            DataCatalogLite.from_config(**sane_config)
+        except DatasetError:
+            pytest.fail("DataCatalogLite.from_config raised DatasetError unexpectedly")
+
+    def test_config_invalid_module(self, sane_config):
+        """Check for no error if the type points to nonexistent module"""
+
+        sane_config["catalog"]["boats"][
+            "type"
"kedro.invalid_module_name.io.CSVDataset" + + try: + # DataCatalogLite should not raise DatasetError + DataCatalogLite.from_config(**sane_config) + except DatasetError: + pytest.fail("DataCatalogLite.from_config raised DatasetError unexpectedly") + + def test_config_relative_import(self, sane_config): + """Check for no error if the type points to a relative import""" + sane_config["catalog"]["boats"]["type"] = ".CSVDatasetInvalid" + + try: + # DataCatalogLite should not raise DatasetError + DataCatalogLite.from_config(**sane_config) + except DatasetError: + pytest.fail("DataCatalogLite.from_config raised DatasetError unexpectedly") + + def test_config_missing_class(self, sane_config): + """Check for no error if the type points to nonexistent class""" + sane_config["catalog"]["boats"]["type"] = "kedro.io.CSVDatasetInvalid" + + try: + # DataCatalogLite should not raise DatasetError + DataCatalogLite.from_config(**sane_config) + except DatasetError: + pytest.fail("DataCatalogLite.from_config raised DatasetError unexpectedly") + + @pytest.mark.skipif( + sys.version_info < (3, 9), + reason="for python 3.8 kedro-datasets version 1.8 is used which has the old spelling", + ) + def test_config_incorrect_spelling(self, sane_config): + """Check hint if the type uses the old DataSet spelling""" + sane_config["catalog"]["boats"]["type"] = "pandas.CSVDataSet" + + try: + # DataCatalogLite should not raise DatasetError + DataCatalogLite.from_config(**sane_config) + except DatasetError: + pytest.fail("DataCatalogLite.from_config raised DatasetError unexpectedly") + + def test_config_invalid_dataset(self, sane_config): + """Check for no error if the type points to invalid class""" + sane_config["catalog"]["boats"]["type"] = "DataCatalogLite" + + try: + # DataCatalogLite should not raise DatasetError + DataCatalogLite.from_config(**sane_config) + except DatasetError: + pytest.fail("DataCatalogLite.from_config raised DatasetError unexpectedly") + + def test_config_invalid_arguments(self, sane_config): + """Check for no error if the data set config contains invalid arguments""" + sane_config["catalog"]["boats"]["save_and_load_args"] = False + + try: + # DataCatalogLite should not raise DatasetError + DataCatalogLite.from_config(**sane_config) + except DatasetError: + pytest.fail("DataCatalogLite.from_config raised DatasetError unexpectedly") + + def test_config_invalid_dataset_config(self, sane_config): + """Check for valid config""" + sane_config["catalog"]["invalid_entry"] = "some string" + pattern = ( + "Catalog entry 'invalid_entry' is not a valid dataset configuration. " + "\nHint: If this catalog entry is intended for variable interpolation, " + "make sure that the key is preceded by an underscore." 
+        )
+        with pytest.raises(DatasetError, match=pattern):
+            DataCatalogLite.from_config(**sane_config)
+
+    def test_empty_config(self):
+        """Test empty config"""
+        assert DataCatalogLite.from_config(None)
+
+    def test_missing_credentials(self, sane_config):
+        """Check for no error if credentials can't be located"""
+        sane_config["catalog"]["cars"]["credentials"] = "missing"
+
+        try:
+            # DataCatalogLite should not raise KeyError
+            DataCatalogLite.from_config(**sane_config)
+        except KeyError:
+            pytest.fail("DataCatalogLite.from_config raised KeyError unexpectedly")
+
+    def test_link_credentials(self, sane_config, mocker):
+        """Test credentials being linked to the relevant data set"""
+        mock_client = mocker.patch("kedro_datasets.pandas.csv_dataset.fsspec")
+        config = deepcopy(sane_config)
+        del config["catalog"]["boats"]
+
+        DataCatalogLite.from_config(**config)
+
+        expected_client_kwargs = sane_config["credentials"]["s3_credentials"]
+        mock_client.filesystem.assert_called_with("s3", **expected_client_kwargs)
+
+    def test_nested_credentials(self, sane_config_with_nested_creds, mocker):
+        mock_client = mocker.patch("kedro_datasets.pandas.csv_dataset.fsspec")
+        config = deepcopy(sane_config_with_nested_creds)
+        del config["catalog"]["boats"]
+        DataCatalogLite.from_config(**config)
+
+        expected_client_kwargs = {
+            "client_kwargs": {
+                "credentials": {
+                    "client_kwargs": {
+                        "aws_access_key_id": "OTHER_FAKE_ACCESS_KEY",
+                        "aws_secret_access_key": "OTHER_FAKE_SECRET_KEY",
+                    }
+                }
+            },
+            "key": "secret",
+        }
+        mock_client.filesystem.assert_called_once_with("s3", **expected_client_kwargs)
+
+    def test_missing_nested_credentials(self, sane_config_with_nested_creds):
+        """Check for no error if credentials are missing from nested credentials"""
+        del sane_config_with_nested_creds["credentials"]["other_credentials"]
+
+        try:
+            # DataCatalogLite should not raise KeyError
+            DataCatalogLite.from_config(**sane_config_with_nested_creds)
+        except KeyError:
+            pytest.fail("DataCatalogLite.from_config raised KeyError unexpectedly")
+
+    def test_missing_dependency(self, sane_config, mocker):
+        """Test that no error is thrown when a dependency is missing."""
+        pattern = "dependency issue"
+
+        def dummy_load(obj_path, *args, **kwargs):
+            if obj_path == "kedro_datasets.pandas.CSVDataset":
+                raise AttributeError(pattern)
+            if obj_path == "kedro_datasets.pandas.__all__":
+                return ["CSVDataset"]
+            return None
+
+        mocker.patch("kedro.io.core.load_obj", side_effect=dummy_load)
+
+        try:
+            # DataCatalogLite should not raise DatasetError
+            DataCatalogLite.from_config(**sane_config)
+        except DatasetError:
+            pytest.fail("DataCatalogLite.from_config raised DatasetError unexpectedly")
+
+    def test_idempotent_catalog(self, sane_config):
+        """Test that data catalog instantiations are idempotent"""
+        _ = DataCatalogLite.from_config(**sane_config)
+        catalog = DataCatalogLite.from_config(**sane_config)
+        assert catalog
+
+    def test_error_dataset_init(self, bad_config):
+        """Check for no error when trying to instantiate erroneous data set"""
+        try:
+            # DataCatalogLite should not raise DatasetError
+            DataCatalogLite.from_config(bad_config)
+        except DatasetError:
+            pytest.fail("DataCatalogLite.from_config raised DatasetError unexpectedly")
+
+    def test_confirm(self, tmp_path, caplog, mocker):
+        """Confirm the dataset"""
+        with caplog.at_level(logging.INFO):
+            mock_confirm = mocker.patch(
+                "kedro_datasets.partitions.incremental_dataset.IncrementalDataset.confirm"
+            )
+            catalog = {
+                "ds_to_confirm": {
"kedro_datasets.partitions.incremental_dataset.IncrementalDataset", + "dataset": "pandas.CSVDataset", + "path": str(tmp_path), + } + } + data_catalog = DataCatalogLite.from_config(catalog=catalog) + data_catalog.confirm("ds_to_confirm") + assert caplog.record_tuples == [ + ( + "kedro.io.data_catalog", + logging.INFO, + "Confirming dataset 'ds_to_confirm'", + ) + ] + mock_confirm.assert_called_once_with() + + @pytest.mark.parametrize( + "dataset_name,pattern", + [ + ("missing", "Dataset 'missing' not found in the catalog"), + ("boats", "Dataset 'boats' does not have 'confirm' method"), + ], + ) + def test_bad_confirm(self, sane_config, dataset_name, pattern): + """Test confirming non existent dataset or the one that + does not have `confirm` method""" + data_catalog_lite = DataCatalogLite.from_config(**sane_config) + + with pytest.raises(DatasetError, match=re.escape(pattern)): + data_catalog_lite.confirm(dataset_name) + + def test_match_added_to_datasets_on_get(self, config_with_dataset_factories): + """Check that the datasets that match patterns are only added when fetched""" + catalog = DataCatalogLite.from_config(**config_with_dataset_factories) + assert "{brand}_cars" not in catalog._datasets + assert "tesla_cars" not in catalog._datasets + assert "{brand}_cars" in catalog._dataset_patterns + + tesla_cars = catalog._get_dataset("tesla_cars") + assert isinstance(tesla_cars, CSVDataset) + assert "tesla_cars" in catalog._datasets diff --git a/package/tests/test_integrations/test_lite_parser.py b/package/tests/test_integrations/test_lite_parser.py new file mode 100644 index 0000000000..b3ae1eedea --- /dev/null +++ b/package/tests/test_integrations/test_lite_parser.py @@ -0,0 +1,195 @@ +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from kedro_viz.integrations.kedro.lite_parser import LiteParser + + +@pytest.fixture +def sample_project_path(tmp_path): + # Create a sample directory structure + package_dir = tmp_path / "mock_spaceflights" + package_dir.mkdir() + (package_dir / "__init__.py").touch() + (package_dir / "__init__.py").write_text( + "from mock_spaceflights import data_processing\n" + "from mock_spaceflights.data_processing import create_metrics" + ) + (package_dir / "data_processing.py").write_text( + "import os\n" + "import nonexistentmodule\n" + "from . 
+        "from . import test\n"
+        "from typing import Dict"
+    )
+    return tmp_path
+
+
+@pytest.fixture
+def lite_parser():
+    return LiteParser(package_name="mock_spaceflights")
+
+
+class TestLiteParser:
+    def test_is_module_importable_existing_module(self, lite_parser):
+        assert lite_parser._is_module_importable("os") is True
+
+    def test_is_module_importable_nonexistent_module(self, lite_parser):
+        assert lite_parser._is_module_importable("nonexistentmodule") is False
+
+    def test_is_module_importable_importerror(self, lite_parser):
+        with patch("importlib.util.find_spec", side_effect=ImportError):
+            assert lite_parser._is_module_importable("nonexistentmodule") is False
+
+    def test_is_module_importable_modulenotfounderror(self, lite_parser):
+        with patch("importlib.util.find_spec", side_effect=ModuleNotFoundError):
+            assert lite_parser._is_module_importable("nonexistentmodule") is False
+
+    def test_is_module_importable_valueerror(self, lite_parser):
+        with patch("importlib.util.find_spec", side_effect=ValueError):
+            assert lite_parser._is_module_importable("nonexistentmodule") is False
+
+    @pytest.mark.parametrize(
+        "module_name, expected_module_parts",
+        [
+            ("sklearn", ["sklearn"]),
+            (
+                "demo_project.pipelines.ingestion",
+                [
+                    "demo_project",
+                    "demo_project.pipelines",
+                    "demo_project.pipelines.ingestion",
+                ],
+            ),
+        ],
+    )
+    def test_get_module_parts(self, lite_parser, module_name, expected_module_parts):
+        assert lite_parser._get_module_parts(module_name) == expected_module_parts
+
+    def test_is_relative_import_found(self, lite_parser):
+        module_name = "kedro_project_package.pipelines.reporting.nodes"
+        project_file_paths = {
+            Path("/path/to/kedro_project_package/pipelines/reporting/nodes.py")
+        }
+        assert lite_parser._is_relative_import(module_name, project_file_paths)
+
+    def test_relative_import_not_found(self, lite_parser):
+        module_name = "kedro_project_package.pipelines.reporting.nodes"
+        project_file_paths = {
+            Path("/path/to/another_project/pipelines/reporting/nodes.py")
+        }
+        assert not lite_parser._is_relative_import(module_name, project_file_paths)
+
+    def test_relative_import_partial_match(self, lite_parser):
+        module_name = "kedro_project_package.pipelines"
+        project_file_paths = {
+            Path("/path/to/kedro_project_package/pipelines/reporting/nodes.py"),
+            Path("/path/to/kedro_project_package/pipelines/something_else.py"),
+        }
+        assert lite_parser._is_relative_import(module_name, project_file_paths)
+
+    def test_relative_import_empty_file_paths(self, lite_parser):
+        module_name = "kedro_project_package.pipelines.reporting.nodes"
+        project_file_paths = set()
+        assert not lite_parser._is_relative_import(module_name, project_file_paths)
+
+    def test_populate_missing_dependencies(self, lite_parser):
+        module_name = "non_importable.module.part"
+        missing_dependencies = set()
+
+        lite_parser._populate_missing_dependencies(module_name, missing_dependencies)
+
+        # The test expects the missing dependencies to
+        # include each part of the module name
+        expected_missing = {
+            "non_importable",
+            "non_importable.module",
+            "non_importable.module.part",
+        }
+        assert missing_dependencies == expected_missing
+
+    def test_no_missing_dependencies(self, lite_parser, mocker):
+        module_name = "importable_module"
+        missing_dependencies = set()
+        mocker.patch(
+            "kedro_viz.integrations.kedro.lite_parser.LiteParser._is_module_importable",
+            return_value=True,
+        )
+
+        lite_parser._populate_missing_dependencies(module_name, missing_dependencies)
+
+        # Since the module is importable,
+        # the set should remain empty
+        assert not missing_dependencies
+
+    def test_partial_importability(self, lite_parser, mocker):
+        module_name = "importable_module.non_importable_part"
+        missing_dependencies = set()
+        mocker.patch(
+            "kedro_viz.integrations.kedro.lite_parser.LiteParser._is_module_importable",
+            side_effect=lambda part: part == "importable_module",
+        )
+
+        lite_parser._populate_missing_dependencies(module_name, missing_dependencies)
+
+        # Only the non-importable part
+        # should be added to the set
+        expected_missing = {"importable_module.non_importable_part"}
+        assert missing_dependencies == expected_missing
+
+    def test_get_unresolved_imports(self, lite_parser, sample_project_path, mocker):
+        file_path = Path(sample_project_path / "mock_spaceflights/data_processing.py")
+        mock_populate = mocker.patch(
+            "kedro_viz.integrations.kedro.lite_parser.LiteParser._populate_missing_dependencies"
+        )
+
+        lite_parser._get_unresolved_imports(file_path)
+
+        # Ensure _populate_missing_dependencies was called
+        # with correct module names
+        mock_populate.assert_any_call("os", set())
+        mock_populate.assert_any_call("nonexistentmodule", set())
+
+    def test_get_unresolved_relative_imports(self, sample_project_path, mocker):
+        lite_parser_obj = LiteParser()
+        file_path = Path(sample_project_path / "mock_spaceflights/__init__.py")
+
+        unresolvable_imports = lite_parser_obj._get_unresolved_imports(
+            file_path, set(sample_project_path.rglob("*.py"))
+        )
+
+        assert len(unresolvable_imports) == 0
+
+    def test_create_mock_modules(self, lite_parser):
+        unresolved_imports = {"sklearn", "pyspark.pandas"}
+        mocked_modules = lite_parser.create_mock_modules(unresolved_imports)
+
+        assert len(mocked_modules) == len(unresolved_imports)
+        assert "sklearn" in mocked_modules
+        assert "pyspark.pandas" in mocked_modules
+        assert isinstance(mocked_modules["sklearn"], MagicMock)
+
+    def test_parse_non_existent_path(self, lite_parser):
+        assert not lite_parser.parse(Path("non/existent/path"))
+        assert not lite_parser.parse(Path("non/existent/path/file.py"))
+
+    def test_file_parse(self, lite_parser, sample_project_path):
+        file_path = Path(sample_project_path / "mock_spaceflights/data_processing.py")
+        unresolved_imports = lite_parser.parse(file_path)
+
+        assert unresolved_imports == {str(file_path): {"nonexistentmodule"}}
+
+    def test_directory_parse(self, lite_parser, sample_project_path):
+        unresolved_imports = lite_parser.parse(sample_project_path)
+        expected_file_path = Path(
+            sample_project_path / "mock_spaceflights/data_processing.py"
+        )
+        assert unresolved_imports == {str(expected_file_path): {"nonexistentmodule"}}
+
+    def test_directory_parse_non_package_path(self, sample_project_path):
+        lite_parser_obj = LiteParser("mock_pyspark")
+        unresolvable_imports = lite_parser_obj.parse(sample_project_path)
+
+        # ignore files in other packages if
+        # LiteParser is instantiated with a package_name
+        assert len(unresolvable_imports) == 0
diff --git a/package/tests/test_launchers/test_cli/test_run.py b/package/tests/test_launchers/test_cli/test_run.py
index bd5db7bfcf..b2d5c59b39 100644
--- a/package/tests/test_launchers/test_cli/test_run.py
+++ b/package/tests/test_launchers/test_cli/test_run.py
@@ -53,6 +53,7 @@ class TestCliRunViz:
                 "include_hooks": False,
                 "package_name": None,
                 "extra_params": {},
+                "is_lite": False,
             },
         ),
         (
@@ -69,6 +70,7 @@ class TestCliRunViz:
                 "include_hooks": False,
                 "package_name": None,
                 "extra_params": {},
+                "is_lite": False,
             },
         ),
         (
@@ -90,6 +92,7 @@ class TestCliRunViz:
                 "include_hooks": False,
"package_name": None, "extra_params": {}, + "is_lite": False, }, ), ( @@ -122,6 +125,7 @@ class TestCliRunViz: "include_hooks": False, "package_name": None, "extra_params": {"extra_param": "param"}, + "is_lite": False, }, ), ( @@ -154,6 +158,7 @@ class TestCliRunViz: "include_hooks": False, "package_name": None, "extra_params": {"extra_param": "param"}, + "is_lite": False, }, ), ( @@ -170,6 +175,24 @@ class TestCliRunViz: "include_hooks": True, "package_name": None, "extra_params": {}, + "is_lite": False, + }, + ), + ( + ["viz", "run", "--lite"], + { + "host": "127.0.0.1", + "port": 4141, + "load_file": None, + "save_file": None, + "pipeline_name": None, + "env": None, + "project_path": "testPath", + "autoreload": False, + "include_hooks": False, + "package_name": None, + "extra_params": {}, + "is_lite": True, }, ), ], @@ -349,6 +372,7 @@ def test_kedro_viz_command_with_autoreload( "include_hooks": False, "package_name": None, "extra_params": {}, + "is_lite": False, }, "watcher_cls": RegExpWatcher, "watcher_kwargs": {"re_files": "^.*(\\.yml|\\.yaml|\\.py|\\.json)$"},