Fix errors for the remainder of amlb module. (#575)
* Update type hints

* Disable incremental as there is a bug with ruamel

* Update type hints

* Add type stub requirements for xmltodict and boto

* Remove unused variables from format arguments

* Remove dead code, update type hints

* Update type hints

* Add `# type: ignore` to imports of libraries without type stubs (illustrated in the sketch after this list)

* Update type hints
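
For readers unfamiliar with these changes, here is a minimal, illustrative sketch (not code from the repository) of the two recurring patterns: PEP 604 union syntax made usable on older Python versions via `from __future__ import annotations`, and a `# type: ignore` comment on the import of a library that ships no type stubs (`some_untyped_lib` is a placeholder name):

from __future__ import annotations  # makes `str | list[str] | None` valid in annotations before Python 3.10

# import some_untyped_lib  # type: ignore  # pattern used for a third-party library that ships no type stubs


def run(tasks: str | list[str] | None = None, folds: int | list[int] | None = None) -> None:
    """Accept a single name, a list of names, or None (meaning 'use everything')."""


run()
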
PGijsbers authored Feb 24, 2024
1 parent fe33eab commit cdd9ed5
Showing 14 changed files with 104 additions and 108 deletions.
5 changes: 4 additions & 1 deletion amlb/benchmark.py
@@ -7,6 +7,8 @@
- run the jobs.
- collect and save results.
"""
from __future__ import annotations

import time
from copy import copy
from enum import Enum
@@ -66,6 +68,7 @@ def __init__(self, framework_name: str, benchmark_name: str, constraint_name: st
self.job_runner = None

if rconfig().run_mode == 'script':
# Used for recovery script
self.framework_def, self.framework_name, self.framework_module = None, None, None
self.benchmark_def, self.benchmark_name, self.benchmark_path = None, None, None
self.constraint_def, self.constraint_name = None, None
@@ -197,7 +200,7 @@ def cleanup(self):
# anything to do?
pass

def run(self, tasks: Union[str, List[str]] = None, folds: Union[int, List[int]] = None):
def run(self, tasks: str | list[str] | None = None, folds: int | list[int] | None = None):
"""
:param tasks: a single task name [str] or a list of task names to run. If None, then the whole benchmark will be used.
:param folds: a fold [int] or a list of folds to run. If None, then all folds from each task definition will be used.
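
As an aside, a hedged sketch (not repository code) of how a `str | list[str] | None` parameter such as `tasks` or `folds` is typically normalized before iteration:

from __future__ import annotations


def as_list(value: str | list[str] | None) -> list[str]:
    """Normalize a single name, a list of names, or None into a plain list."""
    if value is None:
        return []
    return [value] if isinstance(value, str) else list(value)


print(as_list("iris"))             # ['iris']
print(as_list(["iris", "adult"]))  # ['iris', 'adult']
print(as_list(None))               # []
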
11 changes: 7 additions & 4 deletions amlb/benchmarks/openml.py
@@ -1,7 +1,10 @@
from __future__ import annotations

import logging
from typing import List, Tuple, Optional
from typing import cast

import openml
import pandas as pd

from amlb.utils import Namespace, str_sanitize

@@ -20,7 +23,7 @@ def is_openml_benchmark(benchmark: str) -> bool:
return False


def load_oml_benchmark(benchmark: str) -> Tuple[str, Optional[str], List[Namespace]]:
def load_oml_benchmark(benchmark: str) -> tuple[str, str | None, list[Namespace]]:
""" Loads benchmark defined by openml suite or task, from openml/s/X or openml/t/Y. """
domain, oml_type, oml_id = benchmark.split('/')
path = None # benchmark file does not exist on disk
@@ -50,9 +53,9 @@ def load_oml_benchmark(benchmark: str) -> Tuple[str, Optional[str], List[Namespa

# Here we know the (task, dataset) pairs, so downloading only the dataset meta-data is sufficient
tasks = []
datasets = openml.datasets.list_datasets(data_id=suite.data, output_format='dataframe')
datasets = cast(pd.DataFrame, openml.datasets.list_datasets(data_id=suite.data, output_format='dataframe'))
datasets.set_index('did', inplace=True)
for tid, did in zip(suite.tasks, suite.data):
for tid, did in zip(cast(list[int], suite.tasks), cast(list[int], suite.data)):
tasks.append(Namespace(name=str_sanitize(datasets.loc[did]['name']),
description=f"{openml.config.server.replace('/api/v1/xml', '')}/d/{did}",
openml_task_id=tid,
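
The `cast` calls above follow the standard `typing.cast` pattern; a minimal sketch, assuming the caller already knows the concrete type (as here, where `output_format='dataframe'` guarantees a DataFrame):

from typing import cast

import pandas as pd


def as_dataframe(obj: object) -> pd.DataFrame:
    # cast() performs no conversion or runtime check; it only narrows the static type for mypy.
    df = cast(pd.DataFrame, obj)
    assert isinstance(df, pd.DataFrame)  # optional runtime guard for the assumption
    return df


print(as_dataframe(pd.DataFrame({"did": [1, 2], "name": ["iris", "adult"]})))
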
29 changes: 25 additions & 4 deletions amlb/datasets/file.py
@@ -1,6 +1,7 @@
from __future__ import annotations

from abc import abstractmethod
import logging
import math
import os
import re
import tempfile
@@ -136,7 +137,26 @@ def __repr__(self):
class FileDataset(Dataset):

def __init__(self, train: Datasplit, test: Datasplit,
target: Union[int, str] = None, features: List[Union[ns, str]] = None, type: str = None):
target: int | str | None = None, features: list[ns | str] | None = None, type: str | None = None):
"""
Parameters
----------
train: Datasplit
test: Datasplit
target: int or str, optional
If int, specifies the column index of the target feature.
If str, specifies the column name of the target feature.
If None, defaults to a feature with name "class" or "target", or the last
feature otherwise.
features: list[ns | str]
#TODO: DEADCODE?
I don't see this accessed anywhere, and `features` property is retrieved
from split metadata, which also do not reference this.
type: str, optional
A valid DatasetType. If not specified, it is inferred by the properties of the
target column.
"""
super().__init__()
self._train = train
self._test = test
@@ -213,9 +233,10 @@ def _get_data(self, fmt):

def _find_target_feature(self, features: List[Feature]):
target = self.dataset._target
default_target = next((f for f in features if f.name.lower() in ['target', 'class']), features[-1])
return (features[target] if isinstance(target, int)
else next(f for f in features if f.name == target) if isinstance(target, str)
else next((f for f in features if f.name.lower() in ['target', 'class']), None) or features[-1])
else default_target)

def _set_feature_as_target(self, target: Feature):
# for classification problems, ensure that the target appears as categorical
@@ -470,7 +491,7 @@ def _unique_values(self, col_name: str):


class FileConverter:
format = None
format: str | None = None

def __init__(self) -> None:
super().__init__()
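
The docstring and `_find_target_feature` above describe the target-resolution rules; the following is an illustrative restatement over plain column names (not the repository's `Feature` objects):

from __future__ import annotations


def find_target(columns: list[str], target: int | str | None = None) -> str:
    """Resolve the target column: by index, by name, or by the 'class'/'target'/last-column default."""
    if isinstance(target, int):
        return columns[target]
    if isinstance(target, str):
        return next(c for c in columns if c == target)
    return next((c for c in columns if c.lower() in ("target", "class")), columns[-1])


print(find_target(["age", "income", "class"]))    # 'class'
print(find_target(["f1", "f2", "f3"]))            # 'f3' (fallback to the last column)
print(find_target(["f1", "f2", "f3"], target=1))  # 'f2'
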
11 changes: 6 additions & 5 deletions amlb/datasets/openml.py
@@ -11,9 +11,10 @@
import logging
import os
import re
from typing import Generic, Tuple, TypeVar, List
from typing import Generic, Tuple, TypeVar, Hashable

import arff
import numpy as np
import pandas as pd
import pandas.api.types as pat
import openml as oml
@@ -77,7 +78,7 @@ def __init__(self, oml_task: oml.OpenMLTask, oml_dataset: oml.OpenMLDataset, fol
self.fold = fold
self._train = None
self._test = None
self._nrows = None
self._nrows: int | None = None


@property
@@ -147,7 +148,7 @@ def _inference_subsample(self, fmt: str, n: int, seed: int = 0, with_labels: boo
If `keep_empty_features` is true, columns with all nan values will be imputed as 0.
If false, they get removed instead.
"""
def get_non_empty_columns(data: DF) -> List[str]:
def get_non_empty_columns(data: DF) -> list[Hashable]:
return [
c
for c, is_empty in data.isnull().all(axis=0).items()
@@ -256,7 +257,7 @@ class OpenmlDatasplit(Datasplit):

def __init__(self, dataset: OpenmlDataset):
super().__init__(dataset, 'arff') # TODO: fix format
self._data = {}
self._data: dict[str, AM | DF | str] = {}

def data_path(self, format):
if format not in __supported_file_formats__:
@@ -343,7 +344,7 @@ def _save_split(self, df, path, name):
with open(path, 'w') as file:
description = f"Split dataset file generated by automlbenchmark from OpenML dataset openml.org/d/{self.ds._oml_dataset.dataset_id}"

def determine_arff_type(column_name: str, dtype: 'dtype') -> str | list[str]:
def determine_arff_type(column_name: str, dtype: np.dtype | pd.core.dtypes.base.ExtensionDtype) -> str | list[str]:
if pat.is_integer_dtype(dtype):
return "INTEGER"
if pat.is_float_dtype(dtype):
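
The `list[Hashable]` hint above reflects that pandas column labels are only guaranteed to be Hashable, not str. A small self-contained sketch of the same "ignore all-NaN columns" idea (illustrative, not the repository code):

from __future__ import annotations

from collections.abc import Hashable

import numpy as np
import pandas as pd


def non_empty_columns(data: pd.DataFrame) -> list[Hashable]:
    """Return the labels of columns that are not entirely NaN."""
    return [c for c, is_empty in data.isnull().all(axis=0).items() if not is_empty]


df = pd.DataFrame({"a": [1.0, 2.0], "b": [np.nan, np.nan]})
print(non_empty_columns(df))  # ['a']
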
19 changes: 11 additions & 8 deletions amlb/job.py
@@ -6,6 +6,9 @@
- SimpleJobRunner runs the jobs sequentially.
- ParallelJobRunner queues the jobs and runs them in a dedicated thread
"""
from __future__ import annotations

import concurrent.futures
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from enum import Enum, auto
import logging
@@ -60,7 +63,7 @@ class Job:
]

@classmethod
def is_state_transition_ok(cls, old_state: State, new_state: State):
def is_state_transition_ok(cls, old_state: State | None, new_state: State | None):
allowed = next((head for tail, head in cls.state_machine if tail == old_state), None)
return allowed and new_state in allowed

@@ -82,7 +85,7 @@ def __init__(self, name: str = "",
self.name = name
self.timeout = timeout_secs
self.priority = priority
self.state = None
self.state: State | None = None
self.thread_id = None
self.raise_on_failure = raise_on_failure
self.set_state(State.created)
@@ -203,14 +206,14 @@ class JobRunner:
END_Q = object()

@classmethod
def is_state_transition_ok(cls, old_state: State, new_state: State):
def is_state_transition_ok(cls, old_state: State | None, new_state: State | None):
allowed = next((head for tail, head in cls.state_machine if tail == old_state), None)
return allowed and new_state in allowed

def __init__(self, jobs: List, on_new_result: Optional[Callable] = None):
self.jobs = jobs
self.results = []
self.state = None
self.results: list[Namespace] = []
self.state: State | None = None
self._queue = None
self._last_priority = 0
self._on_new_result = on_new_result
@@ -336,7 +339,7 @@ def _on_state(self, state: State):

class MultiThreadingJobRunner(JobRunner):

class QueueingStrategy:
class QueueingStrategy(Enum):
keep_queue_full = 0
enforce_job_priority = 1

@@ -354,8 +357,8 @@ def __init__(self, jobs: List,
self._daemons = use_daemons
self._queueing_strategy = queueing_strategy
self._interrupt = threading.Event()
self._exec = None
self.futures = []
self._exec: ThreadPoolExecutor | None = None
self.futures: list[concurrent.futures.Future] = []

def _safe_call_from_exec(self, fn):
if self._exec:
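
The `is_state_transition_ok` methods above implement a small transition table of `(tail, heads)` pairs; here is a sketch of that pattern with invented states (the real Job/JobRunner state sets differ):

from __future__ import annotations

from enum import Enum, auto


class State(Enum):  # illustrative states only
    created = auto()
    starting = auto()
    running = auto()
    stopped = auto()


# From `tail`, only the states listed in `heads` may follow.
state_machine = [
    (None, [State.created]),
    (State.created, [State.starting]),
    (State.starting, [State.running, State.stopped]),
    (State.running, [State.stopped]),
]


def is_state_transition_ok(old_state: State | None, new_state: State | None) -> bool:
    allowed = next((heads for tail, heads in state_machine if tail == old_state), None)
    return bool(allowed and new_state in allowed)


print(is_state_transition_ok(None, State.created))           # True
print(is_state_transition_ok(State.created, State.running))  # False
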
2 changes: 1 addition & 1 deletion amlb/logger.py
@@ -17,7 +17,7 @@

class MillisFormatter(logging.Formatter):

converter = dt.datetime.fromtimestamp
converter = dt.datetime.fromtimestamp # type: ignore

def formatTime(self, record, datefmt=None):
ct = self.converter(record.created)
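
The `# type: ignore` above is presumably needed because the base `logging.Formatter.converter` is declared as returning `time.struct_time`, while `datetime.fromtimestamp` returns a `datetime`. A sketch of the familiar millisecond-formatter recipe this class appears to follow (the `formatTime` body here is an assumption; only its first line is visible in the diff):

from __future__ import annotations

import datetime as dt
import logging


class MillisFormatter(logging.Formatter):
    converter = dt.datetime.fromtimestamp  # type: ignore  # differs from the base class attribute's signature

    def formatTime(self, record: logging.LogRecord, datefmt: str | None = None) -> str:
        ct = self.converter(record.created)
        if datefmt:
            return ct.strftime(datefmt)
        return ct.strftime("%H:%M:%S") + f".{int(record.msecs):03d}"


logging.basicConfig(level=logging.INFO)
for h in logging.getLogger().handlers:
    h.setFormatter(MillisFormatter("%(asctime)s %(levelname)s %(message)s"))
logging.info("hello")
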
7 changes: 6 additions & 1 deletion amlb/resources.py
@@ -2,6 +2,8 @@
**resources** module exposes a singleton ``Resources`` instance providing easy access to app configuration properties,
as well as handy methods to access other resources like *automl frameworks* and *benchmark definitions*
"""
from __future__ import annotations

import copy
import logging
import os
@@ -254,7 +256,7 @@ def _validate_task(self, task, lenient=False):
log.debug("Config `{config}` not set for task {name}, using default `{value}`.".format(config=conf, name=task.name, value=task[conf]))


__INSTANCE__: Resources = None
__INSTANCE__: Resources | None = None


def from_config(config: Namespace):
@@ -273,6 +275,9 @@ def from_configs(*configs: Namespace):


def get() -> Resources:
if __INSTANCE__ is None:
# TODO: Instead why not do normal lazy loading pattern?
raise RuntimeError("No configuration has been loaded yet.")
return __INSTANCE__


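
A minimal sketch of the "normal lazy loading pattern" the TODO above alludes to, using a stand-in `Resources` class (the real one is built from `Namespace` configs via `from_config`):

from __future__ import annotations


class Resources:  # stand-in for amlb.resources.Resources
    def __init__(self, config: dict | None = None) -> None:
        self.config = config or {}


__INSTANCE__: Resources | None = None


def get() -> Resources:
    """Build a default instance on first access instead of raising."""
    global __INSTANCE__
    if __INSTANCE__ is None:
        __INSTANCE__ = Resources()  # the real module would load its default configuration here
    return __INSTANCE__


print(get() is get())  # True: the same instance is returned on every call
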
7 changes: 5 additions & 2 deletions amlb/results.py
@@ -2,6 +2,9 @@
**results** module provides the logic to format, save and read predictions generated by the *automl frameworks* (cf. ``TaskResult``),
as well as logic to compute, format, save, read and merge scores obtained from those predictions (cf. ``Result`` and ``Scoreboard``).
"""
from __future__ import annotations

from functools import partial
import collections
import io
import logging
@@ -331,7 +334,7 @@ def save_predictions(dataset: Dataset, output_file: str,
df = df.assign(truth=truth)

if optional_columns is not None:
df = pd.concat([df, optional_columns], axis=1)
df = pd.concat([df, optional_columns], axis=1) # type: ignore # int not seen as valid Axis

if preview:
log.info("Predictions preview:\n %s\n", df.head(20).to_string())
@@ -407,7 +410,7 @@ def score_from_predictions_file(cls, path):
task_result = cls(task, fold, constraint, predictions_dir=path)
return task_result.compute_score()

def __init__(self, task_def, fold: int, constraint: str, predictions_dir: str = None, metadata: Namespace = None):
def __init__(self, task_def, fold: int, constraint: str, predictions_dir: str | None = None, metadata: Namespace = None):
self.task = task_def
self.fold = fold
self.constraint = constraint
(diffs for the remaining changed files were not loaded and are omitted)
