diff --git a/amlb/datasets/file.py b/amlb/datasets/file.py index 68d8c6162..abc51fc82 100644 --- a/amlb/datasets/file.py +++ b/amlb/datasets/file.py @@ -364,6 +364,8 @@ def __init__(self, path, fold, target, features, cache_dir, config): self.id_column = config['id_column'] self.timestamp_column = config['timestamp_column'] + # Ensure that id_column is parsed as string to avoid incorrect sorting + full_data[self.id_column] = full_data[self.id_column].astype(str) full_data[self.timestamp_column] = pd.to_datetime(full_data[self.timestamp_column]) if config['name'] is not None: file_name = config['name'] @@ -374,11 +376,11 @@ def __init__(self, path, fold, target, features, cache_dir, config): self._train = CsvDatasplit(self, train_path, timestamp_column=self.timestamp_column) self._test = CsvDatasplit(self, test_path, timestamp_column=self.timestamp_column) - self._dtypes = None + self._dtypes = full_data.dtypes # Store repeated item_id & in-sample seasonal error for each time step in the forecast horizon - needed later for metrics like MASE. # We need to store this information here because Result object has no access to past time series values. - self.repeated_item_id = self.test.data[self.id_column].cat.codes.to_numpy() + self.repeated_item_id = self.test.data[self.id_column].astype("category").cat.codes.to_numpy() self.repeated_abs_seasonal_error = self.compute_seasonal_error() def save_train_and_test_splits(self, full_data, fold, save_dir): diff --git a/amlb/datasets/openml.py b/amlb/datasets/openml.py index 803815275..88da77d80 100644 --- a/amlb/datasets/openml.py +++ b/amlb/datasets/openml.py @@ -26,6 +26,12 @@ from ..utils import as_list, lazy_property, path_from_split, profile, split_path, unsparsify +# https://github.com/openml/automlbenchmark/pull/574#issuecomment-1646179921 +try: + set_openml_cache = oml.config.set_cache_directory +except AttributeError: + set_openml_cache = oml.config.set_root_cache_directory + log = logging.getLogger(__name__) # hack (only adding a ? to the regexp pattern) to ensure that '?' values remain quoted when we save dataplits in arff format. @@ -40,7 +46,7 @@ class OpenmlLoader: def __init__(self, api_key, cache_dir=None): oml.config.apikey = api_key if cache_dir: - oml.config.set_cache_directory(cache_dir) + set_openml_cache(cache_dir) if oml.config.retry_policy != "robot": log.debug("Setting openml retry_policy from '%s' to 'robot'." 
% oml.config.retry_policy) diff --git a/amlb/defaults.py b/amlb/defaults.py index 6d0bf35c5..3031be71b 100644 --- a/amlb/defaults.py +++ b/amlb/defaults.py @@ -1,9 +1,15 @@ import pathlib -from openml.config import cache_directory +import openml from amlb.utils import Namespace as ns +# https://github.com/openml/automlbenchmark/pull/574#issuecomment-1646179921 +try: + cache_directory = openml.config.cache_directory +except AttributeError: + cache_directory = openml.config.get_cache_directory() + default_dirs = ns( input_dir=cache_directory, output_dir=str(pathlib.Path(__file__).parent.parent / "results"), diff --git a/amlb/results.py b/amlb/results.py index 07bb48b25..4ce41a1a9 100644 --- a/amlb/results.py +++ b/amlb/results.py @@ -18,6 +18,7 @@ from numpy import nan, sort import pandas as pd import scipy as sci +import scipy.sparse from .data import Dataset, DatasetType, Feature from .datautils import accuracy_score, auc, average_precision_score, balanced_accuracy_score, confusion_matrix, fbeta_score, log_loss, \ @@ -295,6 +296,8 @@ def save_predictions(dataset: Dataset, output_file: str, predictions = predictions.squeeze() if isinstance(predictions, S): predictions = predictions.values + if scipy.sparse.issparse(truth) and truth.shape[1] == 1: + truth = pd.DataFrame(truth.todense()) if isinstance(truth, DF): truth = truth.squeeze() if isinstance(truth, S): diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md new file mode 100644 index 000000000..a1e52f517 --- /dev/null +++ b/docs/CONTRIBUTING.md @@ -0,0 +1,130 @@ +# Contributing to the AutoML Benchmark +We appreciate that you are considering contributing to the AutoML Benchmark. +Remote collaboration can sometimes be hard, so we provide guidelines in this document +to make the experience as smooth as possible. + +This document contains information on: + + - [Reporting a Bug](#reporting-a-bug) + - [Suggesting a Feature](#features) + - [Suggesting a Dataset](#datasets) + - [Suggesting Ideas on Benchmark Design](#ideas) + - [Contributing Code or Documentation Changes](#contributing-changes) + +## Reporting a Bug +If you find a bug in the software, please first search our [issue tracker](https://github.com/openml/automlbenchmark/issues) to see if it has been reported before. +If it has, please check whether any relevant information that may help reproduce the issue is missing, and add it if necessary. +If there is nothing to add, simply leave a 👍 on the issue. This lets us know more people are affected by it. + +### Creating a Bug Report +After confirming your bug isn't reported on our issue tracker, please open a new issue to make a bug report. +A good bug report should describe the error and also provide: + + * A minimal script (and/or configuration) to reproduce the issue. + * The _observed_ behavior, for example a stack trace with the error. + * The _expected_ behavior. What did you expect to happen? + * Any additional information you may have. + * Information on your installed versions. If applicable, please provide information about both the `runbenchmark` environment and the `framework` environment (typically in `frameworks/FRAMEWORK/venv`). + +Following these guidelines greatly improves the chance that we can help you. +It also allows us to address the issue more quickly, which means we can help more people. + +## Features +If you want to suggest a new feature for the benchmark software, please [open an issue](https://github.com/openml/automlbenchmark/issues/new).
+Please motivate why we should consider adding the feature and how the user is expected to use it. + +## Datasets +If you have a suggestion for a new dataset to include in the benchmark, +please [open a discussion on the datasets board](https://github.com/openml/automlbenchmark/discussions/new?category=datasets). +Please motivate why the dataset is a good inclusion for the benchmark. +Examples of good motivations may include: + + * Evidence that it produces interesting results, for example by reporting a small-scale benchmark on the dataset. + * Evidence that it represents a very relevant problem, e.g., because it is frequently used in the scientific literature. + +Additionally, please provide a link to the data, preferably on [OpenML](https://www.openml.org), and indicate its license (if known). +Please note that the benchmark currently supports limited data types. +Suggestions for datasets with data types which are not yet supported are still welcome, +as they may help us create a better benchmark later when support is added. + +## Ideas +If you have other suggestions about benchmark design, [please open a suggestion on the general board](https://github.com/openml/automlbenchmark/discussions/new?category=general). +Please motivate why we should consider changing (or adding to) the benchmark design. + + +## Contributing Changes +We welcome all contributions from the community. To contribute changes to the +code or documentation, we follow a standard git workflow, which is outlined below. + +!!! note "For text changes" + + If you only want to contribute minor text changes, it is possible to do so + directly on Github. Click the pencil icon on the relevant file(s) to edit the documents, + and Github should allow you to automatically commit to your own fork. + After that, set up a pull request as described below under 'Open a Pull Request'. + +### Volunteering an Issue +To avoid multiple people doing the same work, the first thing +to do is to make sure we (and other contributors) know you are working on a particular issue or feature. +Please ensure that a related issue is open on the issue board (or open one if necessary), and ask to be assigned to that issue. +This lets all collaborators know that they should not work on that issue, so we avoid duplicate work. +It also gives us a chance to indicate whether we are (still) interested in the proposed changes. +If it is unclear how to add the feature, or if you are unsure which fix to apply to remove a bug, please discuss this in the issue. + +### Setting up the Development Environment +Fork the repository by clicking the `fork` button on the top right of our [Github](https://github.com/openml/automlbenchmark) page. +This should create a repository named `automlbenchmark` under your Github account. +Clone this repository (replace `GITHUB_USERNAME`): + +```text +git clone https://github.com/GITHUB_USERNAME/automlbenchmark.git +``` + +!!! warning "Use Python 3.9" + + The AutoML benchmark currently only officially supports Python 3.9. + We advise you to use that version when developing locally. + +then set up your local virtual environment: + +```text +cd automlbenchmark +python -m venv venv +source venv/bin/activate +python -m pip install -r requirements.txt +python -m pip install -r requirements-dev.txt +``` + +this should set up the minimum requirements for running the benchmark and running our developer tools.
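+!!! note "Developing on Windows"
+
+    The activation command above assumes a Unix-like shell. If you develop on Windows, the
+    standard `venv` layout places the activation script under `Scripts` rather than `bin`,
+    so the equivalent step would typically be `venv\Scripts\activate` (Command Prompt) or
+    `venv\Scripts\Activate.ps1` (PowerShell); the remaining commands are unchanged.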
+The following commands should now all run successfully: + +```text +python runbenchmark.py constantpredictor -f 0 +python -m pytest +python -m mkdocs serve +``` + +When `python -m mkdocs serve` is running, you should be able to navigate to the +local documentation server (by default at `127.0.0.1:8000`) and see the documentation. + +### Make Code Changes +Please make sure that: + + * All added code has annotated type hints and functions have docstrings. + * Changed or added code is covered by unit tests. + * The pull request does not add/change more than it has to in order to fix the bug/add the feature and meet the above criteria. + * The tests and `runbenchmark.py` script still work the same as above. + +In case the PR is a bug fix, please try to convert the minimal reproducing example of +the original issue to a unit test and include it in the test suite to help avoid future regressions. +Finally, commit the changes with a meaningful commit message about what was changed and why. + +### Make Documentation Changes +The software documentation pages are written with `mkdocs` using [`mkdocs-material`](https://squidfunk.github.io/mkdocs-material/getting-started/). +When editing these pages, you can see live updates while the `python -m mkdocs serve` command is running. +The main landing page with information about the project is written in pure `html` and `css`. + +### Open a Pull Request +When opening a pull request, reference the issue that it closes. +Please also provide any additional context that helps with reviewing the pull request but that was not appropriate to include as code comments. + diff --git a/docs/readme.md b/docs/readme.md new file mode 100644 index 000000000..735022066 --- /dev/null +++ b/docs/readme.md @@ -0,0 +1,30 @@ +# AutoML Benchmark +The OpenML AutoML Benchmark provides a framework for evaluating and comparing open-source AutoML systems. +The system is *extensible* because you can [add your own](https://openml.github.io/automlbenchmark/docs/extending/) +AutoML frameworks and datasets. For a thorough explanation of the benchmark and an evaluation of results, +you can read our [paper](https://arxiv.org/abs/2207.12560). + +Automatic Machine Learning (AutoML) systems automatically build machine learning pipelines +or neural architectures in a data-driven, objective, and automatic way. They automate a lot +of drudge work in designing machine learning systems, so that better systems can be developed, +faster. However, AutoML research is also slowed down by two factors: + +* We currently lack standardized, easily-accessible benchmarking suites of tasks (datasets) that are curated to reflect important problem domains, practical to use, and sufficiently challenging to support a rigorous analysis of performance results. + +* Subtle differences in the problem definition, such as the design of the hyperparameter search space or the way time budgets are defined, can drastically alter a task’s difficulty. This issue makes it difficult to reproduce published research and compare results from different papers. + +This toolkit aims to address these problems by setting up standardized environments for in-depth experimentation with a wide range of AutoML systems. + +Website: + +Documentation: + +Installation: + +### Features: + +* Curated suites of benchmarking datasets from [OpenML](https://www.openml.org) ([regression](https://www.openml.org/s/269), [classification](https://www.openml.org/s/271)).
+* Includes code to benchmark a number of [popular AutoML systems](https://openml.github.io/automlbenchmark/frameworks.html) on regression and classification tasks. +* [New AutoML systems can be added](https://openml.github.io/automlbenchmark/docs/extending/framework/) +* Experiments can be run in Docker or Singularity containers +* Execute experiments locally or on AWS diff --git a/docs/website/data.html b/docs/website/data.html new file mode 100644 index 000000000..985a87c2e --- /dev/null +++ b/docs/website/data.html @@ -0,0 +1,15 @@ + + + AMLB + + + + +

+ This is a redirect page to make sure we can always redirect you to our data, + even if we move it after publication! You should be taken to + https://test.openml.org/amlb/ + automatically. +

+ + diff --git a/docs/website/visualization.html b/docs/website/visualization.html new file mode 100644 index 000000000..f25baa8ef --- /dev/null +++ b/docs/website/visualization.html @@ -0,0 +1,19 @@ + + + AMLB + + + + +

+ This is a redirect page to make sure we can always redirect you to our best + source for visualizing results from the AutoML benchmark, even if we change + where that is after publication! + This page will take you to + + https://compstat-lmu.shinyapps.io/AutoML-Benchmark-Analysis/ + + . +

+ + diff --git a/docs/website/welcome.html b/docs/website/welcome.html new file mode 100644 index 000000000..bc3c5cd0d --- /dev/null +++ b/docs/website/welcome.html @@ -0,0 +1,21 @@ + + + AMLB + + + + +

+ This will be a redirect page to make sure we can always redirect you to + the best place to get started with contributions to the AutoML benchmark, + even if we change its location after publication! + For now, if you have questions, you can visit our + + Github discussions + . If you want to learn more about how to use the software, + please visit our + documentation + .

+ + diff --git a/examples/custom/extensions/Stacking/exec.py b/examples/custom/extensions/Stacking/exec.py index d8c80879d..47321feb6 100644 --- a/examples/custom/extensions/Stacking/exec.py +++ b/examples/custom/extensions/Stacking/exec.py @@ -30,18 +30,17 @@ def run(dataset, config): training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')} n_jobs = config.framework_params.get('_n_jobs', config.cores) # useful to disable multicore, regardless of the dataset config - estimators_params = {e: config.framework_params.get(f'_{e}_params', {}) for e in ['rf', 'gbm', 'linear', 'svc', 'final']} + estimators_params = {e: config.framework_params.get(f'_{e}_params', {}) for e in ['rf', 'gbm', 'sgdclassifier', 'sgdregressor', 'svc', 'final']} log.info("Running Sklearn Stacking Ensemble with a maximum time of {}s on {} cores.".format(config.max_runtime_seconds, n_jobs)) log.warning("We completely ignore the requirement to stay within the time limit.") log.warning("We completely ignore the advice to optimize towards metric: {}.".format(config.metric)) - if is_classification: estimator = StackingClassifier( estimators=[('rf', RandomForestClassifier(n_jobs=n_jobs, random_state=config.seed, **estimators_params['rf'])), ('gbm', GradientBoostingClassifier(random_state=config.seed, **estimators_params['gbm'])), - ('linear', SGDClassifier(n_jobs=n_jobs, random_state=config.seed, **estimators_params['linear'])), + ('linear', SGDClassifier(n_jobs=n_jobs, random_state=config.seed, **estimators_params['sgdclassifier'])), # ('svc', LinearSVC(random_state=config.seed, **estimators_params['svc'])) ], # final_estimator=SGDClassifier(n_jobs=n_jobs, random_state=config.seed, **estimators_params['final']), @@ -54,11 +53,11 @@ def run(dataset, config): estimator = StackingRegressor( estimators=[('rf', RandomForestRegressor(n_jobs=n_jobs, random_state=config.seed, **estimators_params['rf'])), ('gbm', GradientBoostingRegressor(random_state=config.seed, **estimators_params['gbm'])), - ('linear', SGDRegressor(random_state=config.seed, **estimators_params['linear'])), + ('linear', SGDRegressor(random_state=config.seed, **estimators_params['sgdregressor'])), ('svc', LinearSVR(random_state=config.seed, **estimators_params['svc'])) ], # final_estimator=SGDRegressor(random_state=config.seed, **estimators_params['final']), - final_estimator=LinearRegression(n_jobs=n_jobs, random_state=config.seed, **estimators_params['final']), + final_estimator=LinearRegression(n_jobs=n_jobs), n_jobs=n_jobs, **training_params ) @@ -66,7 +65,8 @@ def run(dataset, config): with Timer() as training: estimator.fit(X_train, y_train) - predictions = estimator.predict(X_test) + with Timer() as predict: + predictions = estimator.predict(X_test) probabilities = estimator.predict_proba(X_test) if is_classification else None return result(output_file=config.output_predictions_file, @@ -75,7 +75,8 @@ def run(dataset, config): probabilities=probabilities, target_is_encoded=is_classification, models_count=len(estimator.estimators_) + 1, - training_duration=training.duration) + training_duration=training.duration, + predict_duration=predict.duration) if __name__ == '__main__': diff --git a/examples/custom/extensions/Stacking/requirements.txt b/examples/custom/extensions/Stacking/requirements.txt index d2afe9e80..73f0cadcd 100644 --- a/examples/custom/extensions/Stacking/requirements.txt +++ b/examples/custom/extensions/Stacking/requirements.txt @@ -1 +1 @@ -scikit-learn==0.22.1 +scikit-learn==1.3.1 diff --git 
a/examples/custom/extensions/Stacking/setup.sh b/examples/custom/extensions/Stacking/setup.sh index 352f776ba..0f489d07e 100755 --- a/examples/custom/extensions/Stacking/setup.sh +++ b/examples/custom/extensions/Stacking/setup.sh @@ -2,7 +2,7 @@ shopt -s expand_aliases HERE=$(dirname "$0") -. "$HERE/.setup_env" +. "$HERE/.setup/setup_env" . "$AMLB_ROOT/frameworks/shared/setup.sh" "$HERE" true PIP install -r "$HERE/requirements.txt" diff --git a/examples/custom/frameworks.yaml b/examples/custom/frameworks.yaml index a68884811..e711bcbbb 100644 --- a/examples/custom/frameworks.yaml +++ b/examples/custom/frameworks.yaml @@ -9,15 +9,15 @@ GradientBoosting: Stacking: module: extensions.Stacking - version: '0.22.1' + version: '1.3.1' project: https://scikit-learn.org/stable/modules/ensemble.html#stacking params: _rf_params: {n_estimators: 200} _gbm_params: {n_estimators: 200} - _linear_params: {penalty: elasticnet, loss: log} + _sgdclassifier_params: {penalty: elasticnet, loss: log_loss} + _sgdregressor_params: {penalty: elasticnet} # _svc_params: {tol: 1e-3, max_iter: 1e5} # _final_params: {penalty: elasticnet, loss: log} # sgd linear - _final_params: {max_iter: 1000} # logistic/linear H2OAutoML_nightly: module: frameworks.H2OAutoML diff --git a/frameworks/AutoGluon/README.md b/frameworks/AutoGluon/README.md index 51286533e..1b5c2dc65 100644 --- a/frameworks/AutoGluon/README.md +++ b/frameworks/AutoGluon/README.md @@ -1,16 +1,5 @@ # AutoGluon -To run v0.5.2: ```python3 ../automlbenchmark/runbenchmark.py autogluon ...``` +To run v0.8.2: ```python3 ../automlbenchmark/runbenchmark.py autogluon ...``` -To run mainline: ```python3 ../automlbenchmark/runbenchmark.py autogluonts:latest ...``` - - -# AutoGluonTS - -AutoGluonTS stands for autogluon.timeseries. This framework handles time series problems. 
- -## Run Steps - -To run v0.5.2: ```python3 ../automlbenchmark/runbenchmark.py autogluonts timeseries ...``` - -To run mainline: ```python3 ../automlbenchmark/runbenchmark.py autogluonts:latest timeseries ...``` +To run mainline: ```python3 ../automlbenchmark/runbenchmark.py autogluon:latest ...``` diff --git a/frameworks/AutoGluon/exec.py b/frameworks/AutoGluon/exec.py index 89c3372b4..4b670c4fd 100644 --- a/frameworks/AutoGluon/exec.py +++ b/frameworks/AutoGluon/exec.py @@ -48,9 +48,10 @@ def run(dataset, config): is_classification = config.type == 'classification' training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')} + time_limit = config.max_runtime_seconds presets = training_params.get("presets", []) presets = presets if isinstance(presets, list) else [presets] - if preset_with_refit_full := (set(presets) & {"good_quality", "high_quality"}): + if (preset_with_refit_full := (set(presets) & {"good_quality", "high_quality"})) and (time_limit is not None): preserve = 0.9 preset = next(iter(preset_with_refit_full)) msg = ( @@ -61,7 +62,7 @@ def run(dataset, config): "See https://auto.gluon.ai/stable/api/autogluon.tabular.TabularPredictor.refit_full.html" ) log.info(msg) - config.max_runtime_seconds = preserve * config.max_runtime_seconds + time_limit = preserve * config.max_runtime_seconds train_path, test_path = dataset.train.path, dataset.test.path label = dataset.target.name @@ -77,15 +78,17 @@ def run(dataset, config): problem_type=problem_type, ).fit( train_data=train_path, - time_limit=config.max_runtime_seconds, + time_limit=time_limit, **training_params ) log.info(f"Finished fit in {training.duration}s.") # Persist model in memory that is going to be predicting to get correct inference latency - # max_memory=0.4 will be future default: https://github.com/autogluon/autogluon/pull/3338 - predictor.persist_models('best', max_memory=0.4) + if hasattr(predictor, 'persist'): # autogluon>=1.0 + predictor.persist('best') + else: + predictor.persist_models('best') def inference_time_classification(data: Union[str, pd.DataFrame]): return None, predictor.predict_proba(data, as_multiclass=True) @@ -108,14 +111,17 @@ def inference_time_regression(data: Union[str, pd.DataFrame]): with Timer() as predict: predictions, probabilities = infer(test_data) if is_classification: - predictions = probabilities.idxmax(axis=1).to_numpy() + if hasattr(predictor, 'predict_from_proba'): # autogluon>=1.0 + predictions = predictor.predict_from_proba(probabilities).to_numpy() + else: + predictions = probabilities.idxmax(axis=1).to_numpy() prob_labels = probabilities.columns.values.astype(str).tolist() if probabilities is not None else None log.info(f"Finished predict in {predict.duration}s.") _leaderboard_extra_info = config.framework_params.get('_leaderboard_extra_info', False) # whether to get extra model info (very verbose) _leaderboard_test = config.framework_params.get('_leaderboard_test', False) # whether to compute test scores in leaderboard (expensive) - leaderboard_kwargs = dict(silent=True, extra_info=_leaderboard_extra_info) + leaderboard_kwargs = dict(extra_info=_leaderboard_extra_info) # Disabled leaderboard test data input by default to avoid long running computation, remove 7200s timeout limitation to re-enable if _leaderboard_test: leaderboard_kwargs['data'] = test_data diff --git a/frameworks/AutoGluon/exec_ts.py b/frameworks/AutoGluon/exec_ts.py index 32fd34072..864946d22 100644 --- a/frameworks/AutoGluon/exec_ts.py +++ b/frameworks/AutoGluon/exec_ts.py 
@@ -17,7 +17,7 @@ from joblib.externals.loky import get_reusable_executor from frameworks.shared.callee import call_run, result, output_subdir -from frameworks.shared.utils import Timer, zip_path +from frameworks.shared.utils import Timer, zip_path, load_timeseries_dataset log = logging.getLogger(__name__) @@ -25,9 +25,16 @@ def run(dataset, config): log.info(f"\n**** AutoGluon TimeSeries [v{__version__}] ****\n") prediction_length = dataset.forecast_horizon_in_steps + train_df, test_df = load_timeseries_dataset(dataset) - train_data = TimeSeriesDataFrame.from_path( - dataset.train_path, + train_data = TimeSeriesDataFrame.from_data_frame( + train_df, + id_column=dataset.id_column, + timestamp_column=dataset.timestamp_column, + ) + + test_data = TimeSeriesDataFrame.from_data_frame( + test_df, id_column=dataset.id_column, timestamp_column=dataset.timestamp_column, ) @@ -45,6 +52,7 @@ def run(dataset, config): predictor.fit( train_data=train_data, time_limit=config.max_runtime_seconds, + random_seed=config.seed, **{k: v for k, v in config.framework_params.items() if not k.startswith('_')}, ) @@ -52,7 +60,6 @@ def run(dataset, config): predictions = pd.DataFrame(predictor.predict(train_data)) # Add columns necessary for the metric computation + quantile forecast to `optional_columns` - test_data_future = pd.read_csv(dataset.test_path, parse_dates=[dataset.timestamp_column]) optional_columns = dict( repeated_item_id=np.load(dataset.repeated_item_id), repeated_abs_seasonal_error=np.load(dataset.repeated_abs_seasonal_error), @@ -61,13 +68,12 @@ def run(dataset, config): optional_columns[str(q)] = predictions[str(q)].values predictions_only = get_point_forecast(predictions, config.metric) - truth_only = test_data_future[dataset.target].values + truth_only = test_df[dataset.target].values # Sanity check - make sure predictions are ordered correctly - future_index = pd.MultiIndex.from_frame(test_data_future[[dataset.id_column, dataset.timestamp_column]]) - assert predictions.index.equals(future_index), "Predictions and test data index do not match" + assert predictions.index.equals(test_data.index), "Predictions and test data index do not match" - test_data_full = pd.concat([train_data, test_data_future.set_index([dataset.id_column, dataset.timestamp_column])]) + test_data_full = pd.concat([train_data, test_data]) leaderboard = predictor.leaderboard(test_data_full, silent=True) with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000): diff --git a/frameworks/AutoGluon/setup.sh b/frameworks/AutoGluon/setup.sh index c6a61a8c6..6980b6353 100755 --- a/frameworks/AutoGluon/setup.sh +++ b/frameworks/AutoGluon/setup.sh @@ -1,8 +1,11 @@ #!/usr/bin/env bash +# exit when any command fails +set -e + HERE=$(dirname "$0") VERSION=${1:-"stable"} -REPO=${2:-"https://github.com/awslabs/autogluon.git"} +REPO=${2:-"https://github.com/autogluon/autogluon.git"} PKG=${3:-"autogluon"} if [[ "$VERSION" == "latest" ]]; then VERSION="master" diff --git a/frameworks/FEDOT/__init__.py b/frameworks/FEDOT/__init__.py new file mode 100644 index 000000000..86e68de98 --- /dev/null +++ b/frameworks/FEDOT/__init__.py @@ -0,0 +1,25 @@ +from amlb.benchmark import TaskConfig +from amlb.data import Dataset +from amlb.utils import call_script_in_same_dir + + +def setup(*args, **kwargs): + call_script_in_same_dir(__file__, "setup.sh", *args, **kwargs) + + +def run(dataset: Dataset, config: TaskConfig): + from frameworks.shared.caller import run_in_venv + + data = dict( + train=dict( + 
X=dataset.train.X, + y=dataset.train.y + ), + test=dict( + X=dataset.test.X, + y=dataset.test.y + ) + ) + + return run_in_venv(__file__, "exec.py", + input_data=data, dataset=dataset, config=config) diff --git a/frameworks/FEDOT/exec.py b/frameworks/FEDOT/exec.py new file mode 100644 index 000000000..b57448949 --- /dev/null +++ b/frameworks/FEDOT/exec.py @@ -0,0 +1,99 @@ +import logging +import os +from pathlib import Path + +from fedot.api.main import Fedot + +from frameworks.shared.callee import call_run, result, output_subdir +from frameworks.shared.utils import Timer + +log = logging.getLogger(__name__) + + +def run(dataset, config): + log.info("\n**** FEDOT ****\n") + + is_classification = config.type == 'classification' + # Mapping of benchmark metrics to FEDOT metrics + metrics_mapping = dict( + acc='acc', + auc='roc_auc', + f1='f1', + logloss='logloss', + mae='mae', + mse='mse', + msle='msle', + r2='r2', + rmse='rmse' + ) + scoring_metric = metrics_mapping.get(config.metric, None) + + if scoring_metric is None: + log.warning("Performance metric %s not supported.", config.metric) + + training_params = {"preset": "best_quality", "n_jobs": config.cores} + training_params |= {k: v for k, v in config.framework_params.items() if not k.startswith('_')} + n_jobs = training_params["n_jobs"] + + log.info('Running FEDOT with a maximum time of %ss on %s cores, optimizing %s.', + config.max_runtime_seconds, n_jobs, scoring_metric) + runtime_min = config.max_runtime_seconds / 60 + + fedot = Fedot(problem=config.type, timeout=runtime_min, metric=scoring_metric, seed=config.seed, + max_pipeline_fit_time=runtime_min / 10, **training_params) + + with Timer() as training: + fedot.fit(features=dataset.train.X, target=dataset.train.y) + + log.info('Predicting on the test set.') + with Timer() as predict: + predictions = fedot.predict(features=dataset.test.X) + probabilities = None + if is_classification: + probabilities = fedot.predict_proba(features=dataset.test.X, probs_for_all_classes=True) + + save_artifacts(fedot, config) + + return result(output_file=config.output_predictions_file, + predictions=predictions, + truth=dataset.test.y, + probabilities=probabilities, + target_is_encoded=False, + models_count=fedot.current_pipeline.length, + training_duration=training.duration, + predict_duration=predict.duration) + + +def save_artifacts(automl, config): + + artifacts = config.framework_params.get('_save_artifacts', []) + if 'models' in artifacts: + try: + models_dir = output_subdir('models', config) + models_file = os.path.join(models_dir, 'model.json') + automl.current_pipeline.save(models_file) + except Exception as e: + log.info(f"Error when saving 'models': {e}.", exc_info=True) + + if 'info' in artifacts: + try: + info_dir = output_subdir("info", config) + if automl.history: + automl.history.save(os.path.join(info_dir, 'history.json')) + else: + log.info(f"There is no optimization history info to save.") + except Exception as e: + log.info(f"Error when saving info about optimisation history: {e}.", exc_info=True) + + if 'leaderboard' in artifacts: + try: + leaderboard_dir = output_subdir("leaderboard", config) + if automl.history: + lb = automl.history.get_leaderboard() + Path(os.path.join(leaderboard_dir, "leaderboard.csv")).write_text(lb) + except Exception as e: + log.info(f"Error when saving 'leaderboard': {e}.", exc_info=True) + + +if __name__ == '__main__': + call_run(run) diff --git a/frameworks/FEDOT/setup.sh b/frameworks/FEDOT/setup.sh new file mode 100644 index 000000000..a89781583 --- 
/dev/null +++ b/frameworks/FEDOT/setup.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +HERE=$(dirname "$0") +VERSION=${1:-"stable"} +REPO=${2:-"https://github.com/aimclub/FEDOT.git"} +PKG=${3:-"fedot"} +if [[ "$VERSION" == "latest" ]]; then + VERSION="master" +fi + +# creating local venv +. ${HERE}/../shared/setup.sh ${HERE} true + +RAWREPO=$(echo ${REPO} | sed "s/github\.com/raw\.githubusercontent\.com/") +if [[ "$VERSION" == "stable" ]]; then + PIP install --no-cache-dir -U ${PKG} + echo GET_VERSION_STABLE + VERSION=$(PY -c "${GET_VERSION_STABLE}") +elif [[ "$VERSION" =~ ^[0-9] ]]; then + PIP install --no-cache-dir -U ${PKG}==${VERSION} +else + TARGET_DIR="${HERE}/lib/${PKG}" + rm -Rf ${TARGET_DIR} + + if [[ "$VERSION" =~ ^# ]]; then + COMMIT="${VERSION:1}" + else + # find the latest commit to the VERSION branch + COMMIT=$(git ls-remote "${REPO}" | grep "refs/heads/${VERSION}" | cut -f 1) + DEPTH="--depth 1 --branch ${VERSION}" + fi + + git clone --recurse-submodules --shallow-submodules ${DEPTH} ${REPO} ${TARGET_DIR} + cd ${TARGET_DIR} + git checkout "${COMMIT}" + git submodule update --init --recursive + cd ${HERE} + PIP install -U -e ${TARGET_DIR} +fi + +installed="${HERE}/.setup/installed" +PY -c "from fedot import __version__; print(__version__)" >> "$installed" +if [[ -n $COMMIT ]]; then + truncate -s-1 "$installed" + echo "#${COMMIT}" >> "$installed" +fi diff --git a/frameworks/shared/utils.py b/frameworks/shared/utils.py index abcff3717..26a39f96e 100644 --- a/frameworks/shared/utils.py +++ b/frameworks/shared/utils.py @@ -2,6 +2,7 @@ import importlib.util import logging import os +import pandas as pd import sys @@ -42,6 +43,13 @@ def load_amlb_module(mod, amlb_path=None): return import_module(mod) +def load_timeseries_dataset(dataset): + # Ensure that id_column is loaded as string to avoid incorrect sorting + train_data = pd.read_csv(dataset.train_path, dtype={dataset.id_column: str}, parse_dates=[dataset.timestamp_column]) + test_data = pd.read_csv(dataset.test_path, dtype={dataset.id_column: str}, parse_dates=[dataset.timestamp_column]) + return train_data, test_data + + utils = load_amlb_module("amlb.utils") # unorthodox for it's only now that we can safely import those functions from amlb.utils import * diff --git a/mkdocs.yml b/mkdocs.yml index 5f2dd0f6c..831f69efe 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -27,6 +27,7 @@ nav: - extending/constraint.md - Frameworks: extending/framework.md - FAQ: faq.md + - Contributing: CONTRIBUTING.md extra_css: - stylesheets/extra.css diff --git a/requirements-dev.txt b/requirements-dev.txt index 5939f4af5..600256292 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -7,3 +7,6 @@ types-xmltodict pandas-stubs boto3-stubs mypy + +# documentation +mkdocs-material \ No newline at end of file diff --git a/requirements.in b/requirements.in index 19adef7d2..02f64a594 100644 --- a/requirements.in +++ b/requirements.in @@ -11,3 +11,7 @@ scikit-learn>=1.0,<2.0 pyarrow>=11.0 # tables>=3.6 + +# Allow loading datasets from S3 +fsspec +s3fs diff --git a/requirements.txt b/requirements.txt index 52d6e87a6..2cee4f1c1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ boto3==1.26.98 botocore==1.29.98 # via # boto3 + # s3fs # s3transfer certifi==2022.12.7 # via @@ -18,6 +19,10 @@ charset-normalizer==3.1.0 # via requests filelock==3.12.0 # via -r requirements.in +fsspec==2023.6.0 + # via + # -r requirements.in + # s3fs idna==3.4 # via requests jmespath==1.0.1 @@ -65,6 +70,8 @@ ruamel-yaml==0.17.21 # via -r requirements.in 
ruamel-yaml-clib==0.2.7 # via ruamel-yaml +s3fs==0.4.2 # via -r requirements.in s3transfer==0.6.0 # via boto3 scikit-learn==1.2.2 diff --git a/resources/frameworks.yaml b/resources/frameworks.yaml index de3e4aaa6..da2881ce0 100644 --- a/resources/frameworks.yaml +++ b/resources/frameworks.yaml @@ -203,6 +203,16 @@ TPOT: # population_size: 25 # verbosity: 2 +FEDOT: + version: 'master' + description: | + FEDOT is an AutoML tool that optimizes composite machine learning pipelines using evolutionary optimization. + project: https://github.com/aimclub/FEDOT + refs: + - https://doi.org/10.1016/j.future.2021.08.022 +# params: +# _save_artifacts: ['leaderboard', 'models', 'info'] + ####################################### ### Non AutoML reference frameworks ### ####################################### diff --git a/resources/frameworks_2023Q2.yaml b/resources/frameworks_2023Q2.yaml index af4e46848..1ac098b6f 100644 --- a/resources/frameworks_2023Q2.yaml +++ b/resources/frameworks_2023Q2.yaml @@ -97,8 +97,7 @@ mlr3automl: project: https://github.com/a-hanf/mlr3automl NaiveAutoML: - repo: https://github.com/pgijsbers/naiveautoml - version: '#182f5148e9d360ad92254fe47c12fc35d9fabd62' + version: '0.0.27' TPOT: version: '0.12.0' diff --git a/resources/frameworks_latest.yaml b/resources/frameworks_latest.yaml index d56b14dac..44f924232 100644 --- a/resources/frameworks_latest.yaml +++ b/resources/frameworks_latest.yaml @@ -86,6 +86,9 @@ oboe: TPOT: version: 'latest' +FEDOT: + version: 'latest' + ####################################### ### Non AutoML reference frameworks ### ####################################### diff --git a/resources/frameworks_stable.yaml b/resources/frameworks_stable.yaml index 3de7da369..d6b5a1ce0 100644 --- a/resources/frameworks_stable.yaml +++ b/resources/frameworks_stable.yaml @@ -91,8 +91,8 @@ oboe: TPOT: version: 'stable' - - +FEDOT: + version: 'stable' ####################################### ### Non AutoML reference frameworks ### diff --git a/tests/unit/amlb/datasets/file/test_file_dataloader.py b/tests/unit/amlb/datasets/file/test_file_dataloader.py index 778cccdf7..b46379724 100644 --- a/tests/unit/amlb/datasets/file/test_file_dataloader.py +++ b/tests/unit/amlb/datasets/file/test_file_dataloader.py @@ -292,7 +292,7 @@ def test_load_timeseries_task_csv(file_loader): assert len(ds.repeated_abs_seasonal_error) == len(ds.test.data) assert len(ds.repeated_item_id) == len(ds.test.data) - assert pat.is_categorical_dtype(ds._dtypes[ds.id_column]) + assert pat.is_string_dtype(ds._dtypes[ds.id_column]) assert pat.is_datetime64_dtype(ds._dtypes[ds.timestamp_column]) assert pat.is_float_dtype(ds._dtypes[ds.target.name])
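Why the id-column dtype matters across these changes: the `astype(str)` cast in `FileDataset`, the `dtype={dataset.id_column: str}` argument in the new `load_timeseries_dataset` helper, and the switch from `is_categorical_dtype` to `is_string_dtype` in the test all guard against the same pitfall, namely that numeric-looking item ids sort (and therefore encode) differently depending on their dtype. A minimal standalone sketch of that effect, using plain pandas and hypothetical ids rather than any benchmark dataset:

```python
import pandas as pd

# Numeric ids sort numerically ...
ids_numeric = pd.Series([10, 2, 1])
print(ids_numeric.sort_values().tolist())                 # [1, 2, 10]

# ... while the same ids read back as strings sort lexicographically.
ids_string = ids_numeric.astype(str)
print(ids_string.sort_values().tolist())                  # ['1', '10', '2']

# The derived category codes differ as well, so an array like repeated_item_id
# would no longer line up with the test rows if the splits were written with
# one dtype and read back with the other.
print(ids_numeric.astype("category").cat.codes.tolist())  # [2, 1, 0]
print(ids_string.astype("category").cat.codes.tolist())   # [1, 2, 0]
```

Forcing the id column to `str` on both the dataset side and in the framework helpers keeps the two orderings consistent.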