diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index c582655c..84c15d87 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -19,7 +19,7 @@ jobs: - name: 🔨 Setup poetry uses: abatilo/actions-poetry@v2.0.0 with: - poetry-version: "1.4.0" + poetry-version: "1.7.1" - name: 🔨Install dependencies run: | poetry config virtualenvs.create false diff --git a/.github/workflows/pytest.yaml b/.github/workflows/pytest.yaml index 3b6236b9..327a91fc 100644 --- a/.github/workflows/pytest.yaml +++ b/.github/workflows/pytest.yaml @@ -3,8 +3,6 @@ name: 🧪 pytest on: pull_request: types: [ opened, reopened, synchronize ] - pull_request_target: - types: [ opened, reopened, synchronize ] workflow_call: jobs: @@ -13,7 +11,7 @@ jobs: fail-fast: false matrix: python-version: [ "3.9","3.10","3.11"] - os: [ ubuntu-latest, macos-latest] + os: [ ubuntu-latest, macos-12] runs-on: ${{ matrix.os }} timeout-minutes: 30 name: 🧪 Run pytests @@ -27,11 +25,13 @@ jobs: - name: 🔨 Setup poetry uses: abatilo/actions-poetry@v2.0.0 with: - poetry-version: "1.4.0" + poetry-version: "1.7.1" - name: 🔨Install dependencies run: | poetry config virtualenvs.create false + pip install --no-build-isolation pendulum==2.1.2 poetry install --no-interaction -E visu -E tensorflow --with dev --no-ansi + - run: mkdir results && touch results/test-results-${{ matrix.python-version }}-${{matrix.os}}-summary.md - name: 🧪 Run tests uses: nuhrberg/pytest-summary@main diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 7823b7c8..47c3fc90 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -29,7 +29,7 @@ jobs: - name: 🔨 Setup poetry uses: abatilo/actions-poetry@v2.0.0 with: - poetry-version: "1.4.0" + poetry-version: "1.7.1" - name: 🔨Install dependencies run: | poetry config virtualenvs.create false diff --git a/CHANGELOG.md b/CHANGELOG.md index 667f7cf3..6e0c95d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,15 @@ +## v0.14.1 (2024-04-16) + +### Fix + +* Test pipeline pendulum dependency ([`299426a`](https://github.com/codecentric-oss/niceml/commit/299426a27cfaf4f46e79958f600cdc1a8ad05466)) +* Update poetry in Github pipelines to 1.7.1 ([`2d0dd2e`](https://github.com/codecentric-oss/niceml/commit/2d0dd2e2f250fe7517c12b68f7d36802ddd73964)) +* Load non-parq files with DFLoader load_df ([`b693710`](https://github.com/codecentric-oss/niceml/commit/b693710fdf03962a543579342ba95b359b634974)) +* Add original model id to eval experiment info ([`494c101`](https://github.com/codecentric-oss/niceml/commit/494c101fcd5993971cdb1b39895494cf6de82293)) + ## v0.14.0 (2024-02-08) ### Feature diff --git a/README.md b/README.md index adfc6980..693622d9 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ It provides pipelines for a variety of ML tasks, like All you have to do is configure your pipeline, and you're ready to go! You can also add your own components to the build-in dashboard, -where you can compair the results and performance of your ML models. +where you can compare the results and performance of your ML models. Further documentation is available at [niceML.io](https://niceml.io). diff --git a/configs/jobs/job_eval/job_eval_objdet/job_eval_objdet_number.yaml b/configs/jobs/job_eval/job_eval_objdet/job_eval_objdet_number.yaml index 3aaba352..1670d963 100644 --- a/configs/jobs/job_eval/job_eval_objdet/job_eval_objdet_number.yaml +++ b/configs/jobs/job_eval/job_eval_objdet/job_eval_objdet_number.yaml @@ -12,7 +12,7 @@ defaults: - ops/exptests@ops.exptests.config.tests: exptests_default.yaml # experiment locations - shared/locations@globals: exp_locations.yaml - # ressources + # resources - resources/mlflow@resources.mlflow.config: res_mlflow_base.yaml - _self_ diff --git a/configs/jobs/job_eval/job_eval_reg/job_eval_reg_number.yaml b/configs/jobs/job_eval/job_eval_reg/job_eval_reg_number.yaml index c78d13b9..406178df 100644 --- a/configs/jobs/job_eval/job_eval_reg/job_eval_reg_number.yaml +++ b/configs/jobs/job_eval/job_eval_reg/job_eval_reg_number.yaml @@ -12,7 +12,7 @@ defaults: - ops/exptests@ops.exptests.config.tests: exptests_default.yaml # experiment locations - shared/locations@globals: exp_locations.yaml - # ressources + # resources - resources/mlflow@resources.mlflow.config: res_mlflow_base.yaml - _self_ diff --git a/docs/index.md b/docs/index.md index 50441ed9..7afdcc96 100644 --- a/docs/index.md +++ b/docs/index.md @@ -12,7 +12,7 @@ It provides pipelines for a variety of ML tasks, like All you have to do is configure your pipeline, and you're ready to go! You can also add your own components to the built-in dashboard, -where you can compair the results and performance of your ML models. +where you can compare the results and performance of your ML models. The documentation of **niceML** is separated into four paths: diff --git a/niceml/__init__.py b/niceml/__init__.py index 9e78220f..f075dd36 100644 --- a/niceml/__init__.py +++ b/niceml/__init__.py @@ -1 +1 @@ -__version__ = "0.14.0" +__version__ = "0.14.1" diff --git a/niceml/config/envconfig.py b/niceml/config/envconfig.py index 11c5b1a4..cb1bb66a 100644 --- a/niceml/config/envconfig.py +++ b/niceml/config/envconfig.py @@ -13,6 +13,7 @@ DESCRIPTION_KEY = "DESCRIPTION" LOCAL_EXP_CACHE_PATH_KEY = "LOCAL_EXP_CACHE_PATH" LAST_MODIFIED_KEY = "LAST_MODIFIED" +PRETRAINED_MODEL_KEY = "PRETRAINED_MODEL" def replace_id_keys(input_str: str, short_id: str, run_id: str) -> str: diff --git a/niceml/dagster/ops/evalcopyexp.py b/niceml/dagster/ops/evalcopyexp.py index d5694739..716c9c53 100644 --- a/niceml/dagster/ops/evalcopyexp.py +++ b/niceml/dagster/ops/evalcopyexp.py @@ -7,7 +7,13 @@ from fsspec import AbstractFileSystem from tqdm import tqdm -from niceml.config.envconfig import DESCRIPTION_KEY, RUN_ID_KEY, SHORT_ID_KEY +from niceml.config.envconfig import ( + DESCRIPTION_KEY, + RUN_ID_KEY, + SHORT_ID_KEY, + PRETRAINED_MODEL_KEY, + ENVIRONMENT_KEY, +) from niceml.dagster.ops.experiment import create_exp_settings from niceml.experiments.experimentcontext import ExperimentContext from niceml.experiments.expfilenames import ExperimentFilenames, ExpEvalCopyNames @@ -84,6 +90,7 @@ def change_ids_from_expinfo( with file_system.open(exp_info_path, "r") as cur_file: data = yaml.load(cur_file, Loader=yaml.SafeLoader) + data[ENVIRONMENT_KEY][PRETRAINED_MODEL_KEY] = data[SHORT_ID_KEY] data[RUN_ID_KEY] = run_id data[SHORT_ID_KEY] = short_id data[DESCRIPTION_KEY] = ( diff --git a/niceml/dagster/ops/filelockops.py b/niceml/dagster/ops/filelockops.py index 7c53fc30..5e7f5fcd 100644 --- a/niceml/dagster/ops/filelockops.py +++ b/niceml/dagster/ops/filelockops.py @@ -24,6 +24,7 @@ def release_locks(_: OpExecutionContext, filelock_dict: dict): """op for releasing locks""" for filelock in filelock_dict.values(): filelock.release() + return filelock_dict @op(config_schema=dict(filelock_dict=HydraMapField(FileLock))) @@ -34,3 +35,4 @@ def clear_locks(context: OpExecutionContext): filelock_dict = instantiated_op_config["filelock_dict"] for filelock in filelock_dict.values(): filelock.force_delete() + return filelock_dict diff --git a/niceml/dashboard/components/expviscomponent.py b/niceml/dashboard/components/expviscomponent.py index 9f11d726..eb85c85e 100644 --- a/niceml/dashboard/components/expviscomponent.py +++ b/niceml/dashboard/components/expviscomponent.py @@ -25,8 +25,8 @@ def __init__( target_value_list: Optional[List[Any]] = None, assert_on_error: bool = False, ): - # Create empty list for chart images self.component_name: Optional[str] = component_name + # Create empty list for chart images self.chart_images_list: List[Image.Image] = [] self.meta_function = meta_function self.target_value_list = [] if target_value_list is None else target_value_list diff --git a/niceml/dashboard/components/prefixviscomponent.py b/niceml/dashboard/components/prefixviscomponent.py index 16cbb63c..9f370752 100644 --- a/niceml/dashboard/components/prefixviscomponent.py +++ b/niceml/dashboard/components/prefixviscomponent.py @@ -47,6 +47,7 @@ def _render( subset_name: Optional[str]: Render the experiment data to a subset """ + # select components for prefix exp_data_list: List[ExperimentData] = [ exp_manager.get_exp_by_id(exp_id) for exp_id in exp_ids ] @@ -74,11 +75,14 @@ def _render( comp.get_component_name() or f"Component {comp_index}" ) + # arrange tabs comp_index = 0 if self.use_tabs: st_comp_list = list(st.tabs(comp_names)) else: st_comp_list = [st.expander(label) for label in comp_names] + + # render components for comp_key, cur_comps in self.components.items(): if comp_key in exp_dict: for comp in cur_comps: diff --git a/niceml/data/dataloaders/dfloaders.py b/niceml/data/dataloaders/dfloaders.py index ef88e152..6a4a5e40 100644 --- a/niceml/data/dataloaders/dfloaders.py +++ b/niceml/data/dataloaders/dfloaders.py @@ -8,12 +8,12 @@ from niceml.data.dataloaders.interfaces.dfloader import DfLoader from niceml.data.storages.localstorage import LocalStorage from niceml.data.storages.storageinterface import StorageInterface -from niceml.experiments.loaddatafunctions import LoadParquetFile -from niceml.utilities.ioutils import read_parquet, write_parquet +from niceml.experiments.loaddatafunctions import LoadParquetFile, LoadCsvFile +from niceml.utilities.ioutils import read_parquet, write_parquet, read_csv, write_csv class SimpleDfLoader(DfLoader): # pylint: disable=too-few-public-methods - """SimpleLoader for parquet files""" + """SimpleLoader for parquet or csv files""" def __init__( self, @@ -25,14 +25,21 @@ def __init__( self.storage = storage or LocalStorage() self.working_dir = working_dir - def load_df(self, df_path: str) -> pd.DataFrame: - """Loads and returns a dataframe from a given parquet file path""" - target_path = join(self.working_dir, df_path) if self.working_dir else df_path - return LoadParquetFile().load_data(target_path, self.storage) + def load_df(self, df_path: str, **kwargs) -> pd.DataFrame: + """Loads and returns a dataframe from a given parquet or csv file path""" + target_path = ( + self.storage.join_paths(self.working_dir, df_path) + if self.working_dir + else df_path + ) + if ".parq" in target_path: + return LoadParquetFile().load_data(target_path, self.storage) + else: + return LoadCsvFile().load_data(target_path, self.storage, **kwargs) class SimpleDfLoaderFactory(DfLoaderFactory): # pylint: disable=too-few-public-methods - """SimpleLoader for parquet files""" + """SimpleLoader for parquet or csv files""" def create_df_loader(self, storage: StorageInterface, working_dir: str) -> DfLoader: """Returns SimpleDfLoader""" @@ -40,7 +47,7 @@ def create_df_loader(self, storage: StorageInterface, working_dir: str) -> DfLoa class RemoteDiskCachedDfLoader(DfLoader): # pylint: disable=too-few-public-methods - """SimpleLoader for parquet files from cache or remote storage""" + """SimpleLoader for parquet or csv files from cache or remote storage""" def __init__( self, @@ -53,7 +60,7 @@ def __init__( self.cache_path = cache_dir self.working_dir = working_dir - def load_df(self, df_path: str) -> pd.DataFrame: + def load_df(self, df_path: str, **kwargs) -> pd.DataFrame: """Loads and returns dataframe from cache""" target_path = ( self.storage.join_paths(self.working_dir, df_path) @@ -62,14 +69,20 @@ def load_df(self, df_path: str) -> pd.DataFrame: ) cached_filepath = join(self.cache_path, target_path) if isfile(cached_filepath): - dataframe = read_parquet(cached_filepath) - else: + if ".parq" in target_path: + dataframe = read_parquet(cached_filepath) + else: + dataframe = read_csv(cached_filepath, **kwargs) + elif ".parq" in target_path: dataframe = LoadParquetFile().load_data(target_path, self.storage) write_parquet(dataframe, cached_filepath) + else: + dataframe = LoadCsvFile().load_data(target_path, self.storage, **kwargs) + write_csv(dataframe, cached_filepath, **kwargs) return dataframe -class RemoteDiskCachedDfLoaderFactory( # QUEST: still used? +class RemoteDiskCachedDfLoaderFactory( DfLoaderFactory ): # pylint: disable=too-few-public-methods """Factory of RemoteDiskCachedDfLoader""" diff --git a/niceml/data/dataloaders/interfaces/dfloader.py b/niceml/data/dataloaders/interfaces/dfloader.py index 26554e34..942a034a 100644 --- a/niceml/data/dataloaders/interfaces/dfloader.py +++ b/niceml/data/dataloaders/interfaces/dfloader.py @@ -8,5 +8,5 @@ class DfLoader(ABC): # pylint: disable=too-few-public-methods """Abstract class DfLoader (Dataframe Loader)""" @abstractmethod - def load_df(self, df_path: str) -> pd.DataFrame: + def load_df(self, df_path: str, **kwargs) -> pd.DataFrame: """Loads and returns the dataframe""" diff --git a/niceml/dlframeworks/keras/models/clsmodelfactory.py b/niceml/dlframeworks/keras/models/clsmodelfactory.py index 9f6db04f..7a51468d 100644 --- a/niceml/dlframeworks/keras/models/clsmodelfactory.py +++ b/niceml/dlframeworks/keras/models/clsmodelfactory.py @@ -1,3 +1,4 @@ +"""Module for ClsModelFactory""" from typing import List, Optional from tensorflow.keras import layers @@ -15,7 +16,10 @@ class ClsModelFactory(ModelFactory): - def __init__( + """Model factory for classification models. Used to create the model + before training""" + + def __init__( # noqa:PLR0913 self, model: Model, dense_layer_list: List[int], @@ -25,7 +29,9 @@ def __init__( allow_preconvolution: bool = False, dropout_prob_list: Optional[List[float]] = None, additional_conv_layers: Optional[List[int]] = None, + trainable_base_model: Optional[bool] = True, ): + """Initialize the CLSModelFactory""" self.model_params = model self.dense_layer_list = dense_layer_list self.use_scale_lambda = use_scale_lambda @@ -36,15 +42,20 @@ def __init__( ) self.dense_activation = dense_activation self.additional_conv_layers = additional_conv_layers + self.trainable_base_model = trainable_base_model def create_model(self, data_desc: DataDescription): + """Creates a model for training according to the data_description""" input_dd: InputImageDataDescription = check_instance( data_desc, InputImageDataDescription ) output_dd: OutputVectorDataDescription = check_instance( data_desc, OutputVectorDataDescription ) - if not self.allow_preconvolution and input_dd.get_input_channel_count() != 3: + if ( + not self.allow_preconvolution + and input_dd.get_input_channel_count() != 3 # noqa:PLR2004 + ): raise Exception( f"Input channels must have the size of 3! Instead " f"{input_dd.get_input_channel_count()}" @@ -53,6 +64,7 @@ def create_model(self, data_desc: DataDescription): input_shape = input_size.to_numpy_shape() + (3,) in_layer = layers.Input(shape=input_shape, name="image") actual_layer = in_layer + self.model_params.trainable = self.trainable_base_model model: Model = self.model_params actual_layer = model(actual_layer) @@ -87,6 +99,7 @@ def create_dense_layers( dense_activation, dropout_prob_list: Optional[List[float]] = None, ): + """Creates dense layers for model""" if dropout_prob_list is None: dropout_prob_list = [] actual_layer = layers.Flatten()(actual_layer) diff --git a/niceml/experiments/loaddatafunctions.py b/niceml/experiments/loaddatafunctions.py index 67306f76..bd7bd41a 100644 --- a/niceml/experiments/loaddatafunctions.py +++ b/niceml/experiments/loaddatafunctions.py @@ -25,6 +25,7 @@ class LoadYamlFile(LoadDataFunc): # pylint: disable=too-few-public-methods """Loads yaml data from a cloud storage""" def load_data(self, file_path: str, storage: StorageInterface): + """Loads yaml file from cloud storage""" data = storage.download_as_str(file_path) return yaml.load(data, Loader=yaml.SafeLoader) @@ -32,9 +33,12 @@ def load_data(self, file_path: str, storage: StorageInterface): class LoadCsvFile(LoadDataFunc): # pylint: disable=too-few-public-methods """Loads csv data from a cloud storage""" - def load_data(self, file_path: str, storage: StorageInterface): + def load_data( + self, file_path: str, storage: StorageInterface, **kwargs + ) -> pd.DataFrame: + """Loads csv file from cloud storage""" data = storage.download_as_str(file_path) - data_frame = pd.read_csv(io.BytesIO(data)) + data_frame = pd.read_csv(io.BytesIO(data), **kwargs) return data_frame @@ -42,6 +46,7 @@ class LoadParquetFile(LoadDataFunc): # pylint: disable=too-few-public-methods """Loads parquet data from a cloud storage""" def load_data(self, file_path: str, storage: StorageInterface): + """Loads parquet file from cloud storage""" data = storage.download_as_str(file_path) if data == b"": raise FileNotFoundError("File empty") @@ -54,10 +59,12 @@ class LoadImageFile(LoadDataFunc): # pylint: disable=too-few-public-methods """Loads image data from a cloud storage""" def __init__(self, target_size: ImageSize, output_dtype=np.uint8): + """Initialize LoadImageFile object""" self.target_size = target_size self.output_dtype = output_dtype def load_data(self, file_path: str, storage: StorageInterface): + """Loads image file from cloud storage""" data = storage.download_as_str(file_path) image: Image.Image = Image.open(io.BytesIO(data)) if self.target_size is not None: diff --git a/pyproject.toml b/pyproject.toml index 57d034f9..4f5007be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "niceml" -version = "0.14.0" +version = "0.14.1" description = "Welcome to niceML 🍦, a Python-based MLOps framework that uses TensorFlow and Dagster. This framework streamlines the development, and maintenance of machine learning models, providing an end-to-end solution for building efficient and scalable pipelines." authors = [ "Denis Stalz-John ", diff --git a/tests/unit/niceml/data/dataloaders/test_dfloaders.py b/tests/unit/niceml/data/dataloaders/test_dfloaders.py index bb897cac..b0f2313c 100644 --- a/tests/unit/niceml/data/dataloaders/test_dfloaders.py +++ b/tests/unit/niceml/data/dataloaders/test_dfloaders.py @@ -7,7 +7,7 @@ from niceml.data.dataloaders.dfloaders import RemoteDiskCachedDfLoader, SimpleDfLoader from niceml.data.storages.localstorage import LocalStorage -from niceml.utilities.ioutils import write_parquet +from niceml.utilities.ioutils import write_parquet, write_csv @pytest.fixture() @@ -34,14 +34,14 @@ def tmp_cache_dir() -> str: yield tmpdir -def test_simple_df_loader(tmp_folder_with_parquet: str, example_df: pd.DataFrame): +def test_simple_df_loader_parq(tmp_folder_with_parquet: str, example_df: pd.DataFrame): df_loader = SimpleDfLoader() df_test = df_loader.load_df(join(tmp_folder_with_parquet, "test.parquet")) assert isinstance(df_test, pd.DataFrame) assert df_test.equals(example_df) -def test_remote_disk_cached_df_loader( +def test_remote_disk_cached_df_loader_parq( tmp_folder_with_parquet: str, example_df: pd.DataFrame, tmp_cache_dir: str ): storage = LocalStorage(tmp_folder_with_parquet) @@ -57,3 +57,35 @@ def test_remote_disk_cached_df_loader( df_test = df_loader.load_df("test.parquet") assert isinstance(df_test, pd.DataFrame) assert df_test.equals(example_df) + + +@pytest.fixture() +def tmp_folder_with_csv(example_df): + with TemporaryDirectory() as tmpdir: + write_csv(example_df, join(tmpdir, "test.csv"), sep=";") + yield tmpdir + + +def test_simple_df_loader_csv(tmp_folder_with_csv: str, example_df: pd.DataFrame): + df_loader = SimpleDfLoader() + df_test = df_loader.load_df(join(tmp_folder_with_csv, "test.csv"), sep=";") + assert isinstance(df_test, pd.DataFrame) + assert df_test.equals(example_df) + + +def test_remote_disk_cached_df_loader_csv( + tmp_folder_with_csv: str, example_df: pd.DataFrame, tmp_cache_dir: str +): + storage = LocalStorage(tmp_folder_with_csv) + df_loader = RemoteDiskCachedDfLoader(storage, tmp_cache_dir) + df_test = df_loader.load_df("test.csv", sep=";") + assert isinstance(df_test, pd.DataFrame) + assert df_test.equals(example_df) + assert isfile(join(tmp_cache_dir, "test.csv")) + + # remove file from orig folder to test if it is loaded from cache + os.remove(join(tmp_folder_with_csv, "test.csv")) + + df_test = df_loader.load_df("test.csv", sep=";") + assert isinstance(df_test, pd.DataFrame) + assert df_test.equals(example_df) diff --git a/tests/unit/niceml/utilities/test_readwritelock.py b/tests/unit/niceml/utilities/test_readwritelock.py index 83ea7b1f..607f62ee 100644 --- a/tests/unit/niceml/utilities/test_readwritelock.py +++ b/tests/unit/niceml/utilities/test_readwritelock.py @@ -9,7 +9,6 @@ LocationConfig, join_fs_path, open_location, - join_location_w_path, ) from niceml.utilities.readwritelock import ( ReadLock,