From 97f72760cd8cb5e305179c36d3528484a0dd4791 Mon Sep 17 00:00:00 2001 From: rashidakanchwala <37628668+rashidakanchwala@users.noreply.github.com> Date: Fri, 8 Oct 2021 11:54:06 +0100 Subject: [PATCH] Fix: Metrics Tree doesn't pick up the latest kedro run metrics data This issue occurs when you run kedro viz and kedro run simultaneously on two terminals. After your last kedro run, if you refresh the browser running kedro viz, you will notice that, while the plot gets updated with the latest data, the JSON tree with metrics data does not. Initially, we would get metrics data using the get_load_path function from kedro-core, which would load the most recent version -- but this does not happen in the above setting because that function is cached. So we replicate the functionality of kedro-core on the kedro-viz backend so the latest metrics are loaded. --- package/features/environment.py | 2 +- package/kedro_viz/models/graph.py | 37 ++++++++- .../test_graph/test_graph_nodes.py | 79 ++++++++++++++++--- 3 files changed, 102 insertions(+), 16 deletions(-) diff --git a/package/features/environment.py b/package/features/environment.py index 1966c27da1..ad507fb5cb 100644 --- a/package/features/environment.py +++ b/package/features/environment.py @@ -109,7 +109,7 @@ def _setup_context_with_venv(context, venv_dir): "botocore", "dynaconf==3.1.5", "PyYAML>=4.2, <6.0", - "click<8.0" + "click<8.0", ], env=context.env, ) diff --git a/package/kedro_viz/models/graph.py b/package/kedro_viz/models/graph.py index a6e4cd07a5..1b8c2678b9 100644 --- a/package/kedro_viz/models/graph.py +++ b/package/kedro_viz/models/graph.py @@ -38,7 +38,7 @@ from enum import Enum from pathlib import Path from types import FunctionType -from typing import Any, Dict, List, Optional, Set, Union, cast +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Union, cast import pandas as pd import plotly.express as px @@ -49,6 +49,12 @@ from kedro.pipeline.pipeline import TRANSCODING_SEPARATOR, _strip_transcoding 
from pandas.core.frame import DataFrame +# only import MetricsDataSet at top-level for type-checking +# so it doesn't blow up if user doesn't have the dataset dependencies installed. +if TYPE_CHECKING: # pragma: no cover + from kedro.extras.datasets.tracking.metrics_dataset import MetricsDataSet + + logger = logging.getLogger(__name__) @@ -514,9 +520,10 @@ def __post_init__(self, data_node: DataNode): dataset = cast(MetricsDataSet, dataset) if not dataset._exists() or self.filepath is None: return - load_path = get_filepath_str(dataset._get_load_path(), dataset._protocol) - with dataset._fs.open(load_path, **dataset._fs_open_args_load) as fs_file: - self.metrics = json.load(fs_file) + metrics = self.load_latest_metrics_data(dataset) + if not metrics: + return + self.metrics = metrics metrics_data = self.load_metrics_versioned_data(self.filepath) if not metrics_data: return @@ -528,6 +535,28 @@ def __post_init__(self, data_node: DataNode): if not data_node.is_free_input: self.run_command = f'kedro run --to-outputs="{data_node.full_name}"' + @staticmethod + def load_latest_metrics_data( + dataset: "MetricsDataSet", + ) -> Optional[Dict[str, float]]: + """Load data for latest versions of the metrics dataset. 
+ The operation below also exists in kedro.io.core -> _fetch_latest_load_version() + However, it is a cached function and hence cannot be relied upon + Args: + dataset: the latest version of the metrics dataset + Returns: + A dictionary containing json data for the latest version + """ + pattern = str(dataset._get_versioned_path("*")) + version_paths = sorted(dataset._glob_function(pattern), reverse=True) + most_recent = next( + (path for path in version_paths if dataset._exists_function(path)), None + ) + if not most_recent: + return None + with dataset._fs.open(most_recent, **dataset._fs_open_args_load) as fs_file: + return json.load(fs_file) + @staticmethod + def load_metrics_versioned_data( + filepath: str, num_versions: int = 10 diff --git a/package/tests/test_models/test_graph/test_graph_nodes.py b/package/tests/test_models/test_graph/test_graph_nodes.py index b6922d920c..db59848583 100644 --- a/package/tests/test_models/test_graph/test_graph_nodes.py +++ b/package/tests/test_models/test_graph/test_graph_nodes.py @@ -25,8 +25,10 @@ # # See the License for the specific language governing permissions and # limitations under the License. 
+# pylint: disable=too-many-public-methods import datetime import json +import shutil from pathlib import Path from textwrap import dedent from unittest.mock import MagicMock, call, patch @@ -35,6 +37,7 @@ import pytest from kedro.extras.datasets.pandas import CSVDataSet, ParquetDataSet from kedro.extras.datasets.spark import SparkDataSet +from kedro.extras.datasets.tracking.metrics_dataset import MetricsDataSet from kedro.io import MemoryDataSet, PartitionedDataSet from kedro.pipeline.node import node @@ -397,14 +400,14 @@ def test_plotly_data_node_dataset_not_exist(self, patched_import): plotly_node_metadata = DataNodeMetadata(data_node=plotly_data_node) assert not hasattr(plotly_node_metadata, "plot") - @patch("json.load") @patch("kedro_viz.models.graph.DataNodeMetadata.load_metrics_versioned_data") + @patch("kedro_viz.models.graph.DataNodeMetadata.load_latest_metrics_data") @patch("kedro_viz.models.graph.DataNodeMetadata.create_metrics_plot") def test_metrics_data_node_metadata( self, patched_metrics_plot, + patched_latest_metrics, patched_data_loader, - patched_json_load, ): mock_metrics_data = { "recommendations": 0.0009277445547700936, @@ -432,8 +435,8 @@ def test_metrics_data_node_metadata( } ] } - patched_json_load.return_value = mock_metrics_data patched_data_loader.return_value = mock_version_data + patched_latest_metrics.return_value = mock_metrics_data patched_metrics_plot.return_value = mock_plot_data metrics_data_node = MagicMock() metrics_data_node.is_plot_node.return_value = False @@ -448,22 +451,35 @@ def test_metrics_data_node_metadata_dataset_not_exist(self): metrics_data_node.is_metric_node.return_value = True metrics_data_node.kedro_obj._exists.return_value = False metrics_node_metadata = DataNodeMetadata(data_node=metrics_data_node) - assert not hasattr(metrics_node_metadata, "metric") + assert not hasattr(metrics_node_metadata, "metrics") assert not hasattr(metrics_node_metadata, "plot") - @patch("json.load") + 
@patch("kedro_viz.models.graph.DataNodeMetadata.load_latest_metrics_data") + def test_metrics_data_node_metadata_latest_metrics_not_exist( + self, + patched_latest_metrics, + ): + patched_latest_metrics.return_value = None + metrics_data_node = MagicMock() + metrics_data_node.is_plot_node.return_value = False + metrics_data_node.is_metric_node.return_value = True + metrics_node_metadata = DataNodeMetadata(data_node=metrics_data_node) + assert not hasattr(metrics_node_metadata, "metrics") + assert not hasattr(metrics_node_metadata, "plot") + + @patch("kedro_viz.models.graph.DataNodeMetadata.load_latest_metrics_data") @patch("kedro_viz.models.graph.DataNodeMetadata.load_metrics_versioned_data") def test_metrics_data_node_metadata_versioned_dataset_not_exist( self, patched_data_loader, - patched_json_load, + patched_latest_metrics, ): mock_metrics_data = { "recommendations": 0.0009277445547700936, "recommended_controls": 0.001159680693462617, "projected_optimization": 0.0013916168321551402, } - patched_json_load.return_value = mock_metrics_data + patched_latest_metrics.return_value = mock_metrics_data patched_data_loader.return_value = {} metrics_data_node = MagicMock() metrics_data_node.is_plot_node.return_value = False @@ -488,7 +504,7 @@ def test_data_node_metadata_create_metrics_plot(self): assert "layout" in test_plot @pytest.fixture - def metrics_filepath(self, tmpdir): + def metrics_filepath(self, tmp_path): dir_name = ["2021-09-10T09.02.44.245Z", "2021-09-10T09.03.23.733Z"] filename = "metrics.json" json_content = [ @@ -503,7 +519,7 @@ def metrics_filepath(self, tmpdir): "projected_optimization": 0.30057499608184196, }, ] - source_dir = Path(tmpdir / filename) + source_dir = Path(tmp_path / filename) for index, directory in enumerate(dir_name): filepath = Path(source_dir / directory / filename) filepath.parent.mkdir(parents=True, exist_ok=True) @@ -511,7 +527,30 @@ def metrics_filepath(self, tmpdir): return source_dir @pytest.fixture - def 
metrics_filepath_invalid_timestamp(self, tmpdir): + def metrics_filepath_reload(self, tmp_path): + dir_name = ["2021-09-10T09.03.55.245Z", "2021-09-10T09.03.56.733Z"] + filename = "metrics.json" + json_content = [ + { + "recommendations": 0.4, + "recommended_controls": 0.5, + "projected_optimization": 0.6, + }, + { + "recommendations": 0.7, + "recommended_controls": 0.8, + "projected_optimization": 0.9, + }, + ] + source_dir = Path(tmp_path / filename) + for index, directory in enumerate(dir_name): + filepath = Path(source_dir / directory / filename) + filepath.parent.mkdir(parents=True, exist_ok=True) + filepath.write_text(json.dumps(json_content[index])) + return source_dir + + @pytest.fixture + def metrics_filepath_invalid_timestamp(self, tmp_path): dir_name = ["2021", "2021"] filename = "metrics.json" json_content = [ @@ -526,13 +565,31 @@ def metrics_filepath_invalid_timestamp(self, tmpdir): "projected_optimization": 0.30057499608184196, }, ] - source_dir = Path(tmpdir / filename) + source_dir = Path(tmp_path / filename) for index, directory in enumerate(dir_name): filepath = Path(source_dir / directory / filename) filepath.parent.mkdir(parents=True, exist_ok=True) filepath.write_text(json.dumps(json_content[index])) return source_dir + def test_load_latest_metrics(self, tmp_path): + # Note - filepath is assigned temp.json as temp solution instead of metrics_filepath + # as it fails on windows build. This will be cleaned up in the future. 
+ filename = "temp.json" + dataset = MetricsDataSet(filepath=filename) + data = {"col1": 1, "col2": 0.23, "col3": 0.002} + dataset.save(data) + assert DataNodeMetadata.load_latest_metrics_data(dataset) == data + new_data = {"col1": 3, "col2": 3.23, "col3": 3.002} + dataset.save(new_data) + assert DataNodeMetadata.load_latest_metrics_data(dataset) == new_data + shutil.rmtree(filename) + + def test_load_latest_metrics_fail(self, mocker, metrics_filepath): + dataset = MetricsDataSet(filepath=f"{metrics_filepath}") + mocker.patch.object(dataset, "_exists_function", return_value=False) + assert DataNodeMetadata.load_latest_metrics_data(dataset) is None + def test_load_metrics_versioned_data(self, metrics_filepath): mock_metrics_json = { datetime.datetime(2021, 9, 10, 9, 2, 44, 245000): {