Fix errors for the remainder of amlb module. (#575)
* Update type hints

* Disable incremental as there is a bug with ruamel

* Update type hints

* Add type stub requirements for xmltodict and boto

* Remove unused variables from format arguments

* Remove dead code, update type hints

* Update type hints

* Add `# type: ignore` to imports of libraries without type stubs (illustrated in the sketch after this list)

* Update type hints
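
For readers unfamiliar with these changes, here is a minimal, illustrative sketch (not code from the repository) of the two recurring patterns: PEP 604 union syntax made usable on older Python versions via `from __future__ import annotations`, and a `# type: ignore` comment on the import of a library that ships no type stubs (`some_untyped_lib` is a placeholder name):

from __future__ import annotations  # makes `str | list[str] | None` valid in annotations before Python 3.10

# import some_untyped_lib  # type: ignore  # pattern used for a third-party library that ships no type stubs


def run(tasks: str | list[str] | None = None, folds: int | list[int] | None = None) -> None:
    """Accept a single name, a list of names, or None (meaning 'use everything')."""


run()
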
PGijsbers authored Feb 24, 2024
1 parent fe33eab commit cdd9ed5
Showing 14 changed files with 104 additions and 108 deletions.
5 changes: 4 additions & 1 deletion amlb/benchmark.py
@@ -7,6 +7,8 @@
- run the jobs.
- collect and save results.
"""
from __future__ import annotations

import time
from copy import copy
from enum import Enum
@@ -66,6 +68,7 @@ def __init__(self, framework_name: str, benchmark_name: str, constraint_name: st
self.job_runner = None

if rconfig().run_mode == 'script':
# Used for recovery script
self.framework_def, self.framework_name, self.framework_module = None, None, None
self.benchmark_def, self.benchmark_name, self.benchmark_path = None, None, None
self.constraint_def, self.constraint_name = None, None
@@ -197,7 +200,7 @@ def cleanup(self):
# anything to do?
pass

def run(self, tasks: Union[str, List[str]] = None, folds: Union[int, List[int]] = None):
def run(self, tasks: str | list[str] | None = None, folds: int | list[int] | None = None):
"""
:param tasks: a single task name [str] or a list of task names to run. If None, then the whole benchmark will be used.
:param folds: a fold [int] or a list of folds to run. If None, then all folds from each task definition will be used.
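
As an aside, a hedged sketch (not repository code) of how a `str | list[str] | None` parameter such as `tasks` or `folds` is typically normalized before iteration:

from __future__ import annotations


def as_list(value: str | list[str] | None) -> list[str]:
    """Normalize a single name, a list of names, or None into a plain list."""
    if value is None:
        return []
    return [value] if isinstance(value, str) else list(value)


print(as_list("iris"))             # ['iris']
print(as_list(["iris", "adult"]))  # ['iris', 'adult']
print(as_list(None))               # []
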
11 changes: 7 additions & 4 deletions amlb/benchmarks/openml.py
@@ -1,7 +1,10 @@
from __future__ import annotations

import logging
from typing import List, Tuple, Optional
from typing import cast

import openml
import pandas as pd

from amlb.utils import Namespace, str_sanitize

@@ -20,7 +23,7 @@ def is_openml_benchmark(benchmark: str) -> bool:
return False


def load_oml_benchmark(benchmark: str) -> Tuple[str, Optional[str], List[Namespace]]:
def load_oml_benchmark(benchmark: str) -> tuple[str, str | None, list[Namespace]]:
""" Loads benchmark defined by openml suite or task, from openml/s/X or openml/t/Y. """
domain, oml_type, oml_id = benchmark.split('/')
path = None # benchmark file does not exist on disk
@@ -50,9 +53,9 @@ def load_oml_benchmark(benchmark: str) -> Tuple[str, Optional[str], List[Namespa

# Here we know the (task, dataset) pairs, so downloading only the dataset meta-data is sufficient
tasks = []
datasets = openml.datasets.list_datasets(data_id=suite.data, output_format='dataframe')
datasets = cast(pd.DataFrame, openml.datasets.list_datasets(data_id=suite.data, output_format='dataframe'))
datasets.set_index('did', inplace=True)
for tid, did in zip(suite.tasks, suite.data):
for tid, did in zip(cast(list[int], suite.tasks), cast(list[int], suite.data)):
tasks.append(Namespace(name=str_sanitize(datasets.loc[did]['name']),
description=f"{openml.config.server.replace('/api/v1/xml', '')}/d/{did}",
openml_task_id=tid,
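
The `cast` calls above follow the standard `typing.cast` pattern; a minimal sketch, assuming the caller already knows the concrete type (as here, where `output_format='dataframe'` guarantees a DataFrame):

from typing import cast

import pandas as pd


def as_dataframe(obj: object) -> pd.DataFrame:
    # cast() performs no conversion or runtime check; it only narrows the static type for mypy.
    df = cast(pd.DataFrame, obj)
    assert isinstance(df, pd.DataFrame)  # optional runtime guard for the assumption
    return df


print(as_dataframe(pd.DataFrame({"did": [1, 2], "name": ["iris", "adult"]})))
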
29 changes: 25 additions & 4 deletions amlb/datasets/file.py
@@ -1,6 +1,7 @@
from __future__ import annotations

from abc import abstractmethod
import logging
import math
import os
import re
import tempfile
@@ -136,7 +137,26 @@ def __repr__(self):
class FileDataset(Dataset):

def __init__(self, train: Datasplit, test: Datasplit,
target: Union[int, str] = None, features: List[Union[ns, str]] = None, type: str = None):
target: int | str | None = None, features: list[ns | str] | None = None, type: str | None = None):
"""
Parameters
----------
train: Datasplit
test: Datasplit
target: int or str, optional
If int, specifies the column index of the target feature.
If str, specifies the column name of the target feature.
If None, defaults to a feature with name "class" or "target", or the last
feature otherwise.
features: list[ns | str]
#TODO: DEADCODE?
I don't see this accessed anywhere, and `features` property is retrieved
from split metadata, which also do not reference this.
type: str, optional
A valid DatasetType. If not specified, it is inferred by the properties of the
target column.
"""
super().__init__()
self._train = train
self._test = test
@@ -213,9 +233,10 @@ def _get_data(self, fmt):

def _find_target_feature(self, features: List[Feature]):
target = self.dataset._target
default_target = next((f for f in features if f.name.lower() in ['target', 'class']), features[-1])
return (features[target] if isinstance(target, int)
else next(f for f in features if f.name == target) if isinstance(target, str)
else next((f for f in features if f.name.lower() in ['target', 'class']), None) or features[-1])
else default_target)

def _set_feature_as_target(self, target: Feature):
# for classification problems, ensure that the target appears as categorical
@@ -470,7 +491,7 @@ def _unique_values(self, col_name: str):


class FileConverter:
format = None
format: str | None = None

def __init__(self) -> None:
super().__init__()
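
The docstring and `_find_target_feature` above describe the target-resolution rules; the following is an illustrative restatement over plain column names (not the repository's `Feature` objects):

from __future__ import annotations


def find_target(columns: list[str], target: int | str | None = None) -> str:
    """Resolve the target column: by index, by name, or by the 'class'/'target'/last-column default."""
    if isinstance(target, int):
        return columns[target]
    if isinstance(target, str):
        return next(c for c in columns if c == target)
    return next((c for c in columns if c.lower() in ("target", "class")), columns[-1])


print(find_target(["age", "income", "class"]))    # 'class'
print(find_target(["f1", "f2", "f3"]))            # 'f3' (fallback to the last column)
print(find_target(["f1", "f2", "f3"], target=1))  # 'f2'
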
11 changes: 6 additions & 5 deletions amlb/datasets/openml.py
@@ -11,9 +11,10 @@
import logging
import os
import re
from typing import Generic, Tuple, TypeVar, List
from typing import Generic, Tuple, TypeVar, Hashable

import arff
import numpy as np
import pandas as pd
import pandas.api.types as pat
import openml as oml
@@ -77,7 +78,7 @@ def __init__(self, oml_task: oml.OpenMLTask, oml_dataset: oml.OpenMLDataset, fol
self.fold = fold
self._train = None
self._test = None
self._nrows = None
self._nrows: int | None = None


@property
@@ -147,7 +148,7 @@ def _inference_subsample(self, fmt: str, n: int, seed: int = 0, with_labels: boo
If `keep_empty_features` is true, columns with all nan values will be imputed as 0.
If false, they get removed instead.
"""
def get_non_empty_columns(data: DF) -> List[str]:
def get_non_empty_columns(data: DF) -> list[Hashable]:
return [
c
for c, is_empty in data.isnull().all(axis=0).items()
@@ -256,7 +257,7 @@ class OpenmlDatasplit(Datasplit):

def __init__(self, dataset: OpenmlDataset):
super().__init__(dataset, 'arff') # TODO: fix format
self._data = {}
self._data: dict[str, AM | DF | str] = {}

def data_path(self, format):
if format not in __supported_file_formats__:
@@ -343,7 +344,7 @@ def _save_split(self, df, path, name):
with open(path, 'w') as file:
description = f"Split dataset file generated by automlbenchmark from OpenML dataset openml.org/d/{self.ds._oml_dataset.dataset_id}"

def determine_arff_type(column_name: str, dtype: 'dtype') -> str | list[str]:
def determine_arff_type(column_name: str, dtype: np.dtype | pd.core.dtypes.base.ExtensionDtype) -> str | list[str]:
if pat.is_integer_dtype(dtype):
return "INTEGER"
if pat.is_float_dtype(dtype):
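
The `list[Hashable]` hint above reflects that pandas column labels are only guaranteed to be Hashable, not str. A small self-contained sketch of the same "ignore all-NaN columns" idea (illustrative, not the repository code):

from __future__ import annotations

from collections.abc import Hashable

import numpy as np
import pandas as pd


def non_empty_columns(data: pd.DataFrame) -> list[Hashable]:
    """Return the labels of columns that are not entirely NaN."""
    return [c for c, is_empty in data.isnull().all(axis=0).items() if not is_empty]


df = pd.DataFrame({"a": [1.0, 2.0], "b": [np.nan, np.nan]})
print(non_empty_columns(df))  # ['a']
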
19 changes: 11 additions & 8 deletions amlb/job.py
@@ -6,6 +6,9 @@
- SimpleJobRunner runs the jobs sequentially.
- ParallelJobRunner queues the jobs and runs them in a dedicated thread
"""
from __future__ import annotations

import concurrent.futures
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from enum import Enum, auto
import logging
@@ -60,7 +63,7 @@ class Job:
]

@classmethod
def is_state_transition_ok(cls, old_state: State, new_state: State):
def is_state_transition_ok(cls, old_state: State | None, new_state: State | None):
allowed = next((head for tail, head in cls.state_machine if tail == old_state), None)
return allowed and new_state in allowed

@@ -82,7 +85,7 @@ def __init__(self, name: str = "",
self.name = name
self.timeout = timeout_secs
self.priority = priority
self.state = None
self.state: State | None = None
self.thread_id = None
self.raise_on_failure = raise_on_failure
self.set_state(State.created)
@@ -203,14 +206,14 @@ class JobRunner:
END_Q = object()

@classmethod
def is_state_transition_ok(cls, old_state: State, new_state: State):
def is_state_transition_ok(cls, old_state: State | None, new_state: State | None):
allowed = next((head for tail, head in cls.state_machine if tail == old_state), None)
return allowed and new_state in allowed

def __init__(self, jobs: List, on_new_result: Optional[Callable] = None):
self.jobs = jobs
self.results = []
self.state = None
self.results: list[Namespace] = []
self.state: State | None = None
self._queue = None
self._last_priority = 0
self._on_new_result = on_new_result
@@ -336,7 +339,7 @@ def _on_state(self, state: State):

class MultiThreadingJobRunner(JobRunner):

class QueueingStrategy:
class QueueingStrategy(Enum):
keep_queue_full = 0
enforce_job_priority = 1

@@ -354,8 +357,8 @@ def __init__(self, jobs: List,
self._daemons = use_daemons
self._queueing_strategy = queueing_strategy
self._interrupt = threading.Event()
self._exec = None
self.futures = []
self._exec: ThreadPoolExecutor | None = None
self.futures: list[concurrent.futures.Future] = []

def _safe_call_from_exec(self, fn):
if self._exec:
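
The `is_state_transition_ok` methods above implement a small transition table of `(tail, heads)` pairs; here is a sketch of that pattern with invented states (the real Job/JobRunner state sets differ):

from __future__ import annotations

from enum import Enum, auto


class State(Enum):  # illustrative states only
    created = auto()
    starting = auto()
    running = auto()
    stopped = auto()


# From `tail`, only the states listed in `heads` may follow.
state_machine = [
    (None, [State.created]),
    (State.created, [State.starting]),
    (State.starting, [State.running, State.stopped]),
    (State.running, [State.stopped]),
]


def is_state_transition_ok(old_state: State | None, new_state: State | None) -> bool:
    allowed = next((heads for tail, heads in state_machine if tail == old_state), None)
    return bool(allowed and new_state in allowed)


print(is_state_transition_ok(None, State.created))           # True
print(is_state_transition_ok(State.created, State.running))  # False
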
2 changes: 1 addition & 1 deletion amlb/logger.py
@@ -17,7 +17,7 @@

class MillisFormatter(logging.Formatter):

converter = dt.datetime.fromtimestamp
converter = dt.datetime.fromtimestamp # type: ignore

def formatTime(self, record, datefmt=None):
ct = self.converter(record.created)
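
The `# type: ignore` above is presumably needed because the base `logging.Formatter.converter` is declared as returning `time.struct_time`, while `datetime.fromtimestamp` returns a `datetime`. A sketch of the familiar millisecond-formatter recipe this class appears to follow (the `formatTime` body here is an assumption; only its first line is visible in the diff):

from __future__ import annotations

import datetime as dt
import logging


class MillisFormatter(logging.Formatter):
    converter = dt.datetime.fromtimestamp  # type: ignore  # differs from the base class attribute's signature

    def formatTime(self, record: logging.LogRecord, datefmt: str | None = None) -> str:
        ct = self.converter(record.created)
        if datefmt:
            return ct.strftime(datefmt)
        return ct.strftime("%H:%M:%S") + f".{int(record.msecs):03d}"


logging.basicConfig(level=logging.INFO)
for h in logging.getLogger().handlers:
    h.setFormatter(MillisFormatter("%(asctime)s %(levelname)s %(message)s"))
logging.info("hello")
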
7 changes: 6 additions & 1 deletion amlb/resources.py
@@ -2,6 +2,8 @@
**resources** module exposes a singleton ``Resources`` instance providing easy access to app configuration properties,
as well as handy methods to access other resources like *automl frameworks* and *benchmark definitions*
"""
from __future__ import annotations

import copy
import logging
import os
@@ -254,7 +256,7 @@ def _validate_task(self, task, lenient=False):
log.debug("Config `{config}` not set for task {name}, using default `{value}`.".format(config=conf, name=task.name, value=task[conf]))


__INSTANCE__: Resources = None
__INSTANCE__: Resources | None = None


def from_config(config: Namespace):
@@ -273,6 +275,9 @@ def from_configs(*configs: Namespace):


def get() -> Resources:
if __INSTANCE__ is None:
# TODO: Instead why not do normal lazy loading pattern?
raise RuntimeError("No configuration has been loaded yet.")
return __INSTANCE__


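
A minimal sketch of the "normal lazy loading pattern" the TODO above alludes to, using a stand-in `Resources` class (the real one is built from `Namespace` configs via `from_config`):

from __future__ import annotations


class Resources:  # stand-in for amlb.resources.Resources
    def __init__(self, config: dict | None = None) -> None:
        self.config = config or {}


__INSTANCE__: Resources | None = None


def get() -> Resources:
    """Build a default instance on first access instead of raising."""
    global __INSTANCE__
    if __INSTANCE__ is None:
        __INSTANCE__ = Resources()  # the real module would load its default configuration here
    return __INSTANCE__


print(get() is get())  # True: the same instance is returned on every call
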
7 changes: 5 additions & 2 deletions amlb/results.py
@@ -2,6 +2,9 @@
**results** module provides the logic to format, save and read predictions generated by the *automl frameworks* (cf. ``TaskResult``),
as well as logic to compute, format, save, read and merge scores obtained from those predictions (cf. ``Result`` and ``Scoreboard``).
"""
from __future__ import annotations

from functools import partial
import collections
import io
import logging
@@ -331,7 +334,7 @@ def save_predictions(dataset: Dataset, output_file: str,
df = df.assign(truth=truth)

if optional_columns is not None:
df = pd.concat([df, optional_columns], axis=1)
df = pd.concat([df, optional_columns], axis=1) # type: ignore # int not seen as valid Axis

if preview:
log.info("Predictions preview:\n %s\n", df.head(20).to_string())
@@ -407,7 +410,7 @@ def score_from_predictions_file(cls, path):
task_result = cls(task, fold, constraint, predictions_dir=path)
return task_result.compute_score()

def __init__(self, task_def, fold: int, constraint: str, predictions_dir: str = None, metadata: Namespace = None):
def __init__(self, task_def, fold: int, constraint: str, predictions_dir: str | None = None, metadata: Namespace = None):
self.task = task_def
self.fold = fold
self.constraint = constraint
(diffs for the remaining changed files were not loaded and are omitted)
