Merge branch 'master' into dev/mypy
PGijsbers authored Feb 24, 2024
2 parents 0f3fc50 + fe33eab commit c326888
Showing 30 changed files with 491 additions and 50 deletions.
6 changes: 4 additions & 2 deletions amlb/datasets/file.py
@@ -364,6 +364,8 @@ def __init__(self, path, fold, target, features, cache_dir, config):
self.id_column = config['id_column']
self.timestamp_column = config['timestamp_column']

# Ensure that id_column is parsed as string to avoid incorrect sorting
full_data[self.id_column] = full_data[self.id_column].astype(str)
full_data[self.timestamp_column] = pd.to_datetime(full_data[self.timestamp_column])
if config['name'] is not None:
file_name = config['name']
@@ -374,11 +376,11 @@ def __init__(self, path, fold, target, features, cache_dir, config):

self._train = CsvDatasplit(self, train_path, timestamp_column=self.timestamp_column)
self._test = CsvDatasplit(self, test_path, timestamp_column=self.timestamp_column)
self._dtypes = None
self._dtypes = full_data.dtypes

# Store repeated item_id & in-sample seasonal error for each time step in the forecast horizon - needed later for metrics like MASE.
# We need to store this information here because Result object has no access to past time series values.
self.repeated_item_id = self.test.data[self.id_column].cat.codes.to_numpy()
self.repeated_item_id = self.test.data[self.id_column].astype("category").cat.codes.to_numpy()
self.repeated_abs_seasonal_error = self.compute_seasonal_error()

def save_train_and_test_splits(self, full_data, fold, save_dir):
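A minimal sketch (pandas only, not part of the commit) of why the extra `astype("category")` step above is needed: once the id column has been cast to `str`, it is a plain object column, and the `.cat` accessor only exists on categorical dtypes.

```python
# Illustrative sketch, assuming only pandas; mirrors the change above.
import pandas as pd

item_id = pd.Series(["10", "10", "2", "2"], dtype=str)  # plain object dtype after astype(str)

# item_id.cat.codes  # would raise AttributeError: .cat only works on categorical columns
codes = item_id.astype("category").cat.codes.to_numpy()
print(codes)  # [0 0 1 1] -- each repeated item_id maps to one integer code
```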
8 changes: 7 additions & 1 deletion amlb/datasets/openml.py
@@ -26,6 +26,12 @@
from ..utils import as_list, lazy_property, path_from_split, profile, split_path, unsparsify


# https://github.com/openml/automlbenchmark/pull/574#issuecomment-1646179921
try:
set_openml_cache = oml.config.set_cache_directory
except AttributeError:
set_openml_cache = oml.config.set_root_cache_directory

log = logging.getLogger(__name__)

# hack (only adding a ? to the regexp pattern) to ensure that '?' values remain quoted when we save datasplits in arff format.
@@ -40,7 +46,7 @@ class OpenmlLoader:
def __init__(self, api_key, cache_dir=None):
oml.config.apikey = api_key
if cache_dir:
oml.config.set_cache_directory(cache_dir)
set_openml_cache(cache_dir)

if oml.config.retry_policy != "robot":
log.debug("Setting openml retry_policy from '%s' to 'robot'." % oml.config.retry_policy)
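A hedged usage sketch of the shim above (assuming the `openml` package is installed): whichever setter the installed version exposes, it is called the same way.

```python
# Sketch only: resolve the cache setter once, then call it with a directory path.
import openml as oml

try:
    set_openml_cache = oml.config.set_cache_directory        # older openml releases
except AttributeError:
    set_openml_cache = oml.config.set_root_cache_directory   # newer openml releases

set_openml_cache("/tmp/openml-cache")  # example path, purely illustrative
```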
8 changes: 7 additions & 1 deletion amlb/defaults.py
@@ -1,9 +1,15 @@
import pathlib

from openml.config import cache_directory
import openml

from amlb.utils import Namespace as ns

# https://github.com/openml/automlbenchmark/pull/574#issuecomment-1646179921
try:
cache_directory = openml.config.cache_directory
except AttributeError:
cache_directory = openml.config.get_cache_directory()

default_dirs = ns(
input_dir=cache_directory,
output_dir=str(pathlib.Path(__file__).parent.parent / "results"),
3 changes: 3 additions & 0 deletions amlb/results.py
@@ -18,6 +18,7 @@
from numpy import nan, sort
import pandas as pd
import scipy as sci
import scipy.sparse

from .data import Dataset, DatasetType, Feature
from .datautils import accuracy_score, auc, average_precision_score, balanced_accuracy_score, confusion_matrix, fbeta_score, log_loss, \
@@ -295,6 +296,8 @@ def save_predictions(dataset: Dataset, output_file: str,
predictions = predictions.squeeze()
if isinstance(predictions, S):
predictions = predictions.values
if scipy.sparse.issparse(truth) and truth.shape[1] == 1:
truth = pd.DataFrame(truth.todense())
if isinstance(truth, DF):
truth = truth.squeeze()
if isinstance(truth, S):
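A small sketch (scipy + pandas, illustrative only) of what the new guard handles: a single-column sparse `truth` cannot be squeezed like a DataFrame, so it is densified first.

```python
# Illustrative sketch of the sparse single-column case handled above.
import numpy as np
import pandas as pd
import scipy.sparse

truth = scipy.sparse.csr_matrix(np.array([[1], [0], [1]]))  # shape (3, 1)

if scipy.sparse.issparse(truth) and truth.shape[1] == 1:
    truth = pd.DataFrame(truth.todense())

truth = truth.squeeze()   # now a regular pandas Series of length 3
print(truth.tolist())     # [1, 0, 1]
```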
130 changes: 130 additions & 0 deletions docs/CONTRIBUTING.md
@@ -0,0 +1,130 @@
# Contributing to the AutoML Benchmark
We appreciate that you are considering contributing to the AutoML Benchmark.
Remote collaboration can sometimes be hard, so this document provides guidelines
to make the experience as smooth as possible.

This document contains information on:

- [Reporting a Bug](#reporting-a-bug)
- [Suggesting a Feature](#features)
- [Suggesting a Dataset](#datasets)
- [Suggesting Ideas on Benchmark Design](#ideas)
- [Contributing Code or Documentation Changes](#contributing-changes)

## Reporting a Bug
If you find a bug with the software, please first search our [issue tracker](https://github.com/openml/automlbenchmark/issues) to see if it has been reported before.
If it has been, please see if there is relevant information missing that may help reproduce the issue and add it if necessary.
If there is nothing to add, simply leave a 👍 on the issue. This lets us know more people are affected by it.

### Creating a Bug Report
After confirming your bug isn't reported on our issue tracker, please open a new issue to make a bug report.
A good bug report should describe the error and also provide:

* A minimal script (and/or configuration) to reproduce the issue.
* The _observed_ behavior, for example a stack trace with the error.
* The _expected_ behavior. What did you expect to happen?
* Any additional information you may have.
* Information on your installed versions. If applicable, please provide both information about the `runbenchmark` environment and the `framework` environment (typically in `frameworks/FRAMEWORK/venv`).

Observing these guidelines greatly improves the chance that we are able to help you.
It also allows us to address the issue more quickly, which means we can help more people.

## Features
If you want to suggest a new feature for the benchmark software, please [open an issue](https://github.com/openml/automlbenchmark/issues/new).
Please motivate why we should consider adding the feature and how the user is expected to use it.

## Datasets
If you have a suggestion for a new dataset to include in the benchmark,
please [open a discussion on the datasets board](https://github.com/openml/automlbenchmark/discussions/new?category=datasets).
Please motivate why the dataset is a good inclusion for the benchmark.
Examples of good motivations may include:

* Evidence that it produces interesting results, for example by reporting a small-scale benchmark on the dataset.
* Evidence that it represents a very relevant problem, e.g., because it is frequently used in the scientific literature.

Additionally, please provide a link to the data, preferably on [OpenML](https://openml.org), and indicate its license (if known).
Please note that the benchmark currently supports a limited set of data types.
Suggestions for datasets with data types that are not yet supported are still welcome,
as they may help us create a better benchmark later, once support is added.

## Ideas
If you have other suggestions about benchmark design, [please open a suggestion on the general board](https://github.com/openml/automlbenchmark/discussions/new?category=general).
Please motivate why we should consider changing (or adding to) the benchmark design.


## Contributing Changes
We welcome all contributions from the community. To contribute changes to the
code or documentation, we follow a standard git workflow, which is outlined below.

!!! note "For text changes"

If you only want to contribute minor text changes, it is possible to do so
directly on Github. Click the pencil icon on the relevant file(s) to edit them,
and Github should let you commit the changes to your own fork automatically.
After that, open a pull request as described below under 'Open a Pull Request'.

### Volunteering an Issue
To avoid multiple people doing the same work, the first thing to do is to make sure
we (and other contributors) know you are working on a particular issue or feature.
Please ensure that a related issue is open on the issue board (open one if necessary), and ask to be assigned to it.
This signals to all collaborators that they should not pick up that issue, so we avoid double work.
It also gives us a chance to indicate whether we are (still) interested in the proposed changes.
If it is unclear how to add the feature, or which fix to apply for a bug, please discuss this in the issue.

### Setting up the Development Environment
Fork the repository by clicking on the `fork` button on the top right of our [Github](https://github.com/openml/automlbenchmark) page.
This should create a repository named `automlbenchmark` under your Github account.
Clone this repository (replace `GITHUB_USERNAME`):

```text
git clone https://github.com/GITHUB_USERNAME/automlbenchmark.git
```

!!! warning "Use Python 3.9"

The AutoML benchmark currently only officially supports Python 3.9.
We advise you to use that version when developing locally.

Then set up your local virtual environment:

```text
cd automlbenchmark
python -m venv venv
source venv/bin/activate
python -m pip install -r requirements.txt
python -m pip install -r requirements-dev.txt
```

This should install the minimum requirements for running the benchmark and our developer tools.
The following commands should now all run successfully:

```text
python runbenchmark.py constantpredictor -f 0
python -m pytest
python -m mkdocs serve
```

When `python -m mkdocs serve` is running, you should be able to navigate to the
local documentation server (by default at `127.0.0.1:8000`) and see the documentation.

### Make Code Changes
Please make sure that:

* All added code has annotated type hints and functions have docstrings.
* Changed or added code is covered by unit tests.
* The pull request does not add/change more than it has to in order to fix the bug/add the feature and meet the above criteria.
* The tests and `runbenchmark.py` script still work the same as above.

In case the PR is a bug fix, please try to convert the minimal reproducing example of
the original issue to a unit test and include it in the test suite to help avoid future regressions.
Finally, commit the changes with a meaningful commit message about what was changed and why.
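
A minimal sketch of what such a regression test could look like; the file name and the function under test below are made-up placeholders, not real benchmark code:

```python
# test_regression_example.py -- illustrative only; the helper is a stand-in,
# not an actual function from the benchmark.
import pytest


def seasonal_error(values, season_length):
    """Stand-in helper for the kind of code a bug fix might touch."""
    if season_length <= 0:
        raise ValueError("season_length must be positive")
    diffs = [abs(a - b) for a, b in zip(values[season_length:], values)]
    return sum(diffs) / len(diffs)


def test_seasonal_error_rejects_non_positive_season_length():
    # Derived from the (hypothetical) reproducing example in the issue.
    with pytest.raises(ValueError):
        seasonal_error([1.0, 2.0, 3.0], 0)
```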

### Make Documentation Changes
The software documentation pages are written for `mkdocs` using [`mkdocs-material`](https://squidfunk.github.io/mkdocs-material/getting-started/).
When editing these pages, you can see live updates while the `python -m mkdocs serve` command is running.
The main landing page with information about the project is written in plain `html` and `css`.

### Open a Pull Request
When opening a pull request, reference the issue that it closes.
Please also provide any additional context that helps with reviewing the pull request and that may not have been appropriate as code comments.

30 changes: 30 additions & 0 deletions docs/readme.md
@@ -0,0 +1,30 @@
# AutoML Benchmark
The OpenML AutoML Benchmark provides a framework for evaluating and comparing open-source AutoML systems.
The system is *extensible* because you can [add your own](https://openml.github.io/automlbenchmark/docs/extending/)
AutoML frameworks and datasets. For a thorough explanation of the benchmark, and evaluation of results,
you can read our [paper](https://arxiv.org/abs/2207.12560).

Automatic Machine Learning (AutoML) systems automatically build machine learning pipelines
or neural architectures in a data-driven, objective, and automatic way. They automate a lot
of drudge work in designing machine learning systems, so that better systems can be developed,
faster. However, AutoML research is also slowed down by two factors:

* We currently lack standardized, easily-accessible benchmarking suites of tasks (datasets) that are curated to reflect important problem domains, practical to use, and sufficiently challenging to support a rigorous analysis of performance results.

* Subtle differences in the problem definition, such as the design of the hyperparameter search space or the way time budgets are defined, can drastically alter a task’s difficulty. This issue makes it difficult to reproduce published research and compare results from different papers.

This toolkit aims to address these problems by setting up standardized environments for in-depth experimentation with a wide range of AutoML systems.

Website: <https://openml.github.io/automlbenchmark/index.html>

Documentation: <https://openml.github.io/automlbenchmark/docs/index.html>

Installation: <https://openml.github.io/automlbenchmark/docs/getting_started/>

### Features:

* Curated suites of benchmarking datasets from [OpenML](https://www.openml.org) ([regression](https://www.openml.org/s/269), [classification](https://www.openml.org/s/271)).
* Includes code to benchmark a number of [popular AutoML systems](https://openml.github.io/automlbenchmark/frameworks.html) on regression and classification tasks.
* [New AutoML systems can be added](https://openml.github.io/automlbenchmark/docs/extending/framework/)
* Experiments can be run in Docker or Singularity containers
* Execute experiments locally or on AWS
15 changes: 15 additions & 0 deletions docs/website/data.html
@@ -0,0 +1,15 @@
<html>
<head>
<title>AMLB</title>
<meta charset="UTF-8" />
<meta http-equiv="refresh" content="0; URL=https://test.openml.org/amlb/"/>
</head>
<body>
<p>
This is a redirect page to make sure we can always redirect you to our data,
even if we move it after publication! You should be taken to
<a href="https://test.openml.org/amlb/">https://test.openml.org/amlb/</a>
automatically.
</p>
</body>
</html>
19 changes: 19 additions & 0 deletions docs/website/visualization.html
@@ -0,0 +1,19 @@
<html>
<head>
<title>AMLB</title>
<meta charset="UTF-8" />
<meta http-equiv="refresh" content="0; URL=https://compstat-lmu.shinyapps.io/AutoML-Benchmark-Analysis/"/>
</head>
<body>
<p>
This is a redirect page to make sure we can always redirect you to our best
source for visualizing results from the AutoML benchmark, even if we change
where that is after publication!
This page will take you to
<a href="https://compstat-lmu.shinyapps.io/AutoML-Benchmark-Analysis/">
https://compstat-lmu.shinyapps.io/AutoML-Benchmark-Analysis/
</a>
.
</p>
</body>
</html>
21 changes: 21 additions & 0 deletions docs/website/welcome.html
@@ -0,0 +1,21 @@
<html>
<head>
<title>AMLB</title>
<meta charset="UTF-8" />
<!--meta http-equiv="refresh" content="0; URL=https://test.openml.org/amlb/"/-->
</head>
<body>
<p>
This will be a redirect page to make sure we can always redirect you to
the best place to get started with contributions to the AutoML benchmark,
even if we change its location after publication!
For now, if you have questions you can visit our
<a href="https://github.com/openml/automlbenchmark/discussions/">
Github discussions
</a>. If you want to learn more about how to use the software,
please visit our <a href="https://openml.github.io/automlbenchmark/">
documentation
</a>.
</p>
</body>
</html>
15 changes: 8 additions & 7 deletions examples/custom/extensions/Stacking/exec.py
@@ -30,18 +30,17 @@ def run(dataset, config):

training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
n_jobs = config.framework_params.get('_n_jobs', config.cores) # useful to disable multicore, regardless of the dataset config
estimators_params = {e: config.framework_params.get(f'_{e}_params', {}) for e in ['rf', 'gbm', 'linear', 'svc', 'final']}
estimators_params = {e: config.framework_params.get(f'_{e}_params', {}) for e in ['rf', 'gbm', 'sgdclassifier', 'sgdregressor', 'svc', 'final']}

log.info("Running Sklearn Stacking Ensemble with a maximum time of {}s on {} cores.".format(config.max_runtime_seconds, n_jobs))
log.warning("We completely ignore the requirement to stay within the time limit.")
log.warning("We completely ignore the advice to optimize towards metric: {}.".format(config.metric))


if is_classification:
estimator = StackingClassifier(
estimators=[('rf', RandomForestClassifier(n_jobs=n_jobs, random_state=config.seed, **estimators_params['rf'])),
('gbm', GradientBoostingClassifier(random_state=config.seed, **estimators_params['gbm'])),
('linear', SGDClassifier(n_jobs=n_jobs, random_state=config.seed, **estimators_params['linear'])),
('linear', SGDClassifier(n_jobs=n_jobs, random_state=config.seed, **estimators_params['sgdclassifier'])),
# ('svc', LinearSVC(random_state=config.seed, **estimators_params['svc']))
],
# final_estimator=SGDClassifier(n_jobs=n_jobs, random_state=config.seed, **estimators_params['final']),
@@ -54,19 +53,20 @@ def run(dataset, config):
estimator = StackingRegressor(
estimators=[('rf', RandomForestRegressor(n_jobs=n_jobs, random_state=config.seed, **estimators_params['rf'])),
('gbm', GradientBoostingRegressor(random_state=config.seed, **estimators_params['gbm'])),
('linear', SGDRegressor(random_state=config.seed, **estimators_params['linear'])),
('linear', SGDRegressor(random_state=config.seed, **estimators_params['sgdregressor'])),
('svc', LinearSVR(random_state=config.seed, **estimators_params['svc']))
],
# final_estimator=SGDRegressor(random_state=config.seed, **estimators_params['final']),
final_estimator=LinearRegression(n_jobs=n_jobs, random_state=config.seed, **estimators_params['final']),
final_estimator=LinearRegression(n_jobs=n_jobs),
n_jobs=n_jobs,
**training_params
)

with Timer() as training:
estimator.fit(X_train, y_train)

predictions = estimator.predict(X_test)
with Timer() as predict:
predictions = estimator.predict(X_test)
probabilities = estimator.predict_proba(X_test) if is_classification else None

return result(output_file=config.output_predictions_file,
Expand All @@ -75,7 +75,8 @@ def run(dataset, config):
probabilities=probabilities,
target_is_encoded=is_classification,
models_count=len(estimator.estimators_) + 1,
training_duration=training.duration)
training_duration=training.duration,
predict_duration=predict.duration)


if __name__ == '__main__':
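A small sketch of the timing pattern the diff introduces; `Timer` here is a local stand-in with the same `with ... as t: ... t.duration` usage, since the benchmark ships its own timer utility.

```python
# Illustrative stand-in Timer; the real one comes from the benchmark's utilities.
import time


class Timer:
    def __enter__(self):
        self._start = time.monotonic()
        return self

    def __exit__(self, *exc):
        self.duration = time.monotonic() - self._start
        return False


with Timer() as predict:
    time.sleep(0.1)  # stands in for estimator.predict(X_test)

print(f"predict took {predict.duration:.3f}s")
```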
2 changes: 1 addition & 1 deletion examples/custom/extensions/Stacking/requirements.txt
@@ -1 +1 @@
scikit-learn==0.22.1
scikit-learn==1.3.1
2 changes: 1 addition & 1 deletion examples/custom/extensions/Stacking/setup.sh
@@ -2,7 +2,7 @@
shopt -s expand_aliases
HERE=$(dirname "$0")

. "$HERE/.setup_env"
. "$HERE/.setup/setup_env"
. "$AMLB_ROOT/frameworks/shared/setup.sh" "$HERE" true
PIP install -r "$HERE/requirements.txt"

6 changes: 3 additions & 3 deletions examples/custom/frameworks.yaml
@@ -9,15 +9,15 @@ GradientBoosting:

Stacking:
module: extensions.Stacking
version: '0.22.1'
version: '1.3.1'
project: https://scikit-learn.org/stable/modules/ensemble.html#stacking
params:
_rf_params: {n_estimators: 200}
_gbm_params: {n_estimators: 200}
_linear_params: {penalty: elasticnet, loss: log}
_sgdclassifier_params: {penalty: elasticnet, loss: log_loss}
_sgdregressor_params: {penalty: elasticnet}
# _svc_params: {tol: 1e-3, max_iter: 1e5}
# _final_params: {penalty: elasticnet, loss: log} # sgd linear
_final_params: {max_iter: 1000} # logistic/linear

H2OAutoML_nightly:
module: frameworks.H2OAutoML
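A hedged side note on the `_sgdclassifier_params` change above: recent scikit-learn releases renamed the logistic loss of `SGDClassifier` from `loss='log'` to `loss='log_loss'`, which is what the bump to scikit-learn 1.3.1 requires. A minimal check:

```python
# Quick sketch: the renamed loss keyword accepted by scikit-learn >= 1.3.
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(loss="log_loss", penalty="elasticnet", random_state=0)
clf.fit([[0.0], [1.0], [2.0], [3.0]], [0, 0, 1, 1])
print(clf.classes_)  # [0 1]
```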