From 36df9257b0131ef3dad2a82944a727b49e3e238e Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Thu, 7 Nov 2024 12:48:46 +0100 Subject: [PATCH] add chapter on validation and internal tuning (#829) --- book/_quarto.yml | 1 + book/chapters/appendices/errata.qmd | 35 +- book/chapters/appendices/solutions.qmd | 199 +++++++- .../solutions_large-scale_benchmarking.qmd | 4 +- .../chapter1/introduction_and_overview.qmd | 2 + .../advanced_technical_aspects_of_mlr3.qmd | 18 +- .../chapter11/large-scale_benchmarking.qmd | 6 +- .../chapter12/model_interpretation.qmd | 4 +- .../chapter15/predsets_valid_inttune.qmd | 481 ++++++++++++++++++ .../chapter3/evaluation_and_benchmarking.qmd | 2 +- ...ing_methods_and_black_box_optimization.qmd | 14 +- .../non-sequential_pipelines_and_tuning.qmd | 3 +- book/common/chap_auths.csv | 3 +- book/renv.lock | 134 ++--- 14 files changed, 764 insertions(+), 142 deletions(-) create mode 100644 book/chapters/chapter15/predsets_valid_inttune.qmd diff --git a/book/_quarto.yml b/book/_quarto.yml index e9868b338..30fd4afcb 100644 --- a/book/_quarto.yml +++ b/book/_quarto.yml @@ -44,6 +44,7 @@ book: - chapters/chapter12/model_interpretation.qmd - chapters/chapter13/beyond_regression_and_classification.qmd - chapters/chapter14/algorithmic_fairness.qmd + - chapters/chapter15/predsets_valid_inttune.qmd - chapters/references.qmd appendices: - chapters/appendices/solutions.qmd # online only diff --git a/book/chapters/appendices/errata.qmd b/book/chapters/appendices/errata.qmd index 2af94b36f..898f886a9 100644 --- a/book/chapters/appendices/errata.qmd +++ b/book/chapters/appendices/errata.qmd @@ -11,19 +11,27 @@ aliases: This appendix lists changes to the online version of this book to chapters included in the first edition. +## 1. Introduction and Overview -## Data and Basic Modeling +* Add + + +## 2. Data and Basic Modeling * Replaced reference to `Param` with `Domain`. -## Hyperparameter Optimization +## 3. Evaluation and Benchmarking + +* Use `$encapsulate()` method instead of the `$encapsulate` and `$fallback` fields. + +## 4. Hyperparameter Optimization * Renamed `TuningInstanceSingleCrit` to `TuningInstanceBatchSingleCrit`. * Renamed `TuningInstanceMultiCrit` to `TuningInstanceBatchMultiCrit`. * Renamed `Tuner` to `TunerBatch`. * Replaced reference to `Param` with `Domain`. -## Advanced Tuning Methods and Black Box Optimization +## 5. Advanced Tuning Methods and Black Box Optimization * Renamed `TuningInstanceSingleCrit` to `TuningInstanceBatchSingleCrit`. * Renamed `TuningInstanceMultiCrit` to `TuningInstanceBatchMultiCrit`. @@ -33,10 +41,29 @@ This appendix lists changes to the online version of this book to chapters inclu * Renamed `Optimizer` to `OptimizerBatch`. * Replaced `OptimInstanceSingleCrit$new()` with `oi()`. * Add `oi()` to the table about important functions. +* Use `$encapsulate()` method instead of the `$encapsulate` and `$fallback` fields. -## Feature Selection +## 6. Feature Selection * Renamed `FSelectInstanceSingleCrit` to `FSelectInstanceBatchSingleCrit`. * Renamed `FSelectInstanceMultiCrit` to `FSelectInstanceBatchMultiCrit`. * Renamed `FeatureSelector` to `FeatureSelectorBatch`. * Add `fsi()` to the table about important functions. + +## 8. Non-sequential Pipelines and Tuning + +* Use `$encapsulate()` method instead of the `$encapsulate` and `$fallback` fields. + +## 10. Advanced Technical Aspects of mlr3 + +* Use `$encapsulate()` method instead of the `$encapsulate` and `$fallback` fields. + +## 11. 
Large-Scale Benchmarking + +* Use `$encapsulate()` method instead of the `$encapsulate` and `$fallback` fields. + +## 12. Model Interpretation + +* Subset task to row 127 instead of 35 for the local surrogate model. +* Add `as.data.frame()` to "Correctly Interpreting Shapley Values" section. + diff --git a/book/chapters/appendices/solutions.qmd b/book/chapters/appendices/solutions.qmd index 97edfa3d0..c2067c570 100644 --- a/book/chapters/appendices/solutions.qmd +++ b/book/chapters/appendices/solutions.qmd @@ -1711,9 +1711,9 @@ First, we create the learner that we want to tune, mark the relevant parameter f ```{r} lrn_debug = lrn("classif.debug", - error_train = to_tune(0, 1), - fallback = lrn("classif.rpart") + error_train = to_tune(0, 1) ) +lrn_debug$encapsulate("evaluate", fallback = lrn("classif.rpart")) lrn_debug ``` @@ -2171,4 +2171,199 @@ prediction$score(msr_3, adult_subset) We can see, that between women there is an even bigger discrepancy compared to men. * The bias mitigation strategies we employed do not optimize for the *false omission rate* metric, but other metrics instead. It might therefore be better to try to achieve fairness via other strategies, using different or more powerful models or tuning hyperparameters. + +## Solutions to @sec-predsets-valid-inttune + +1. Manually `$train()` a LightGBM classifier from `r ref_pkg("mlr3extralearners")` on the pima task using $1/3$ of the training data for validation. + As the pima task has missing values, select a method from `r ref_pkg("mlr3pipelines")` to impute them. + Explicitly set the evaluation metric to logloss (`"binary_logloss"`), the maximum number of boosting iterations to 1000, the patience parameter to 10, and the step size to 0.01. + After training the learner, inspect the final validation scores as well as the early stopped number of iterations. + +We start by loading the packages and creating the task. + +```{r} +library(mlr3) +library(mlr3extralearners) +library(mlr3pipelines) + +tsk_pima = tsk("pima") +tsk_pima +``` + +Below, we see that the task has five features with missing values. + +```{r} +tsk_pima$missings() +``` + +Next, we create the LightGBM classifier, but don't specify the validation data yet. +We handle the missing values using a simple median imputation. + +```{r} +lrn_lgbm = lrn("classif.lightgbm", + num_iterations = 1000, + early_stopping_rounds = 10, + learning_rate = 0.01, + eval = "binary_logloss" +) + +glrn = as_learner(po("imputemedian") %>>% lrn_lgbm) +glrn$id = "lgbm" +``` + +After constructing the graphlearner, we now configure the validation data using `r ref("set_validate()")`. +The call below sets the `$validate` field of the LightGBM pipeop to `"predefined"` and of the graphlearner to `0.3`. +Recall that only the graphlearner itself can specify *how* the validation data is generated. +The individual pipeops can either use it (`"predefined"`) or not (`NULL`). + +```{r} +set_validate(glrn, validate = 0.3, ids = "classif.lightgbm") +glrn$validate +glrn$graph$pipeops$classif.lightgbm$validate +``` + +Finally, we train the learner and inspect the validation scores and internally tuned parameters. + +```{r} +glrn$train(tsk_pima) + +glrn$internal_tuned_values +glrn$internal_valid_scores +``` + +2. Wrap the learner from exercise 1) in an `AutoTuner` using a three-fold CV for the tuning. + Also change the rule for aggregating the different boosting iterations from averaging to taking the maximum across the folds. 
+ Don't tune any parameters other than `nrounds`, which can be done using `tnr("internal")`. + Use the internal validation metric as the tuning measure. + Compare this learner with a `lrn("classif.rpart")` using a 10-fold outer cross-validation with respect to classification accuracy. + +We start by setting the number of boosting iterations to an internal tune token where the maximum number of boosting iterations is 1000 and the aggregation function the maximum. +Note that the input to the aggregation function is a list of integer values (the early stopped values for the different resampling iterations), so we need to `unlist()` it first before taking the maximum. + +```{r} +library(mlr3tuning) + +glrn$param_set$set_values( + classif.lightgbm.num_iterations = to_tune( + upper = 1000, internal = TRUE, aggr = function(x) max(unlist(x)) + ) +) +``` + +Now, we change the validation data from `0.3` to `"test"`, where we can omit the `ids` specification as LightGBM is the base learner. + +```{r} +set_validate(glrn, validate = "test") +``` + +Next, we create the autotuner using the configuration given in the instructions. +As the internal validation measures are calculated by `lightgbm` and not `mlr3`, we need to specify whether the metric should be minimized. + +```{r} +at_lgbm = auto_tuner( + learner = glrn, + tuner = tnr("internal"), + resampling = rsmp("cv", folds = 3), + measure = msr("internal_valid_score", + select = "classif.lightgbm.binary_logloss", minimize = TRUE) +) +at_lgbm$id = "at_lgbm" +``` + +Finally, we set up the benchmark design, run it, and evaluate the learners in terms of their classification accuracy. + +```{r} +design = benchmark_grid( + task = tsk_pima, + learners = list(at_lgbm, lrn("classif.rpart")), + resamplings = rsmp("cv", folds = 10) +) + +bmr = benchmark(design) + +bmr$aggregate(msr("classif.acc")) +``` + +3. Consider the code below: + + ```{r} + branch_lrn = as_learner( + ppl("branch", list( + lrn("classif.ranger"), + lrn("classif.xgboost", + early_stopping_rounds = 10, + eval_metric = "error", + eta = to_tune(0.001, 0.1, logscale = TRUE), + nrounds = to_tune(upper = 1000, internal = TRUE))))) + + set_validate(branch_lrn, validate = "test", ids = "classif.xgboost") + branch_lrn$param_set$set_values(branch.selection = to_tune()) + + at = auto_tuner( + tuner = tnr("grid_search"), + learner = branch_lrn, + resampling = rsmp("holdout", ratio = 0.8), + # cannot use internal validation score because ranger does not have one + measure = msr("classif.ce"), + term_evals = 10L, + store_models = TRUE + ) + + tsk_sonar = tsk("sonar")$filter(1:100) + + rr = resample( + tsk_sonar, at, rsmp("holdout", ratio = 0.8), store_models = TRUE + ) + ``` + + Answer the following questions (ideally without running the code): + + 3.1 During the hyperparameter optimization, how many observations are used to train the XGBoost algorithm (excluding validation data) and how many for the random forest? + Hint: learners that cannot make use of validation data ignore it. + +The outer resampling already removes 20 observations from the data (the outer test set), leaving only 80 data points (the outer train set) for the inner resampling. +Then 16 (0.2 * 80; the test set of the inner holdout resampling) observations are used to evaluate the hyperparameter configurations. +This leaves 64 (80 - 16) observations for training. +For XGBoost, the 16 observations that make up the inner test set are also used for validation, so no more observations from the 64 training points are removed. 
+Because the random forest does not support validation, the 16 observations from the inner test set will only be used for evaluating the hyperparameter configuration, but not simultaneously for internal validation.
+Therefore, both the random forest and XGBoost models use 64 observations for training.
+
+  3.2 How many observations would be used to train the final model if XGBoost was selected? What if the random forest was chosen?
+
+In both cases, all 80 observations (the train set from the outer resampling) would be used.
+This is because during the final model fit no validation data is generated.
+
+  3.3 How would the answers to the last two questions change if we had set the `$validate` field of the graphlearner to `0.25` instead of `"test"`?
+
+In this case, the validation data is no longer identical to the inner resampling test set.
+Instead, it is split from the 64 observations that make up the inner training set.
+Because this happens before the task enters the graphlearner, both the XGBoost model *and* the random forest only have access to 48 ((1 - 0.25) * 64) observations, and the remaining 16 are used to create the validation data.
+Note that the random forest will again ignore the validation data as it does not have the 'validation' property and therefore cannot use it.
+Also, the autotuner would now use a different set for tuning the step size and boosting iterations (which coincidentally both have size 16).
+Therefore, the answer to question 3.1 would be 48 instead of 64.
+
+However, this does not change the answer to 3.2, as, again, no validation is performed during the final model fit.
+
+Note that we would normally recommend setting the validation data to `"test"` when tuning, so this should be thought of as an illustrative example.
+
+
+4. Look at the (failing) code below:
+
+   ```{r, error = TRUE}
+   tsk_sonar = tsk("sonar")
+   glrn = as_learner(
+     po("pca") %>>% lrn("classif.xgboost", validate = 0.3)
+   )
+   ```
+
+   Can you explain *why* the code fails?
+   Hint: Should the data that xgboost uses for validation be preprocessed according to the *train* or *predict* logic?
+
+If we set the `$validate` field of the XGBoost classifier to `0.3`, the validation data would be generated from the output task of `PipeOpPCA`.
+However, this task has been exclusively preprocessed using the train logic, because the `PipeOpPCA` does not 'know' that the XGBoost classifier wants to do validation.
+Because validation performance is intended to measure how well a model would perform during prediction, the validation data should be preprocessed according to the predict logic.
+For this reason, splitting off 30% of the output from `PipeOpPCA` to use as validation data in the XGBoost classifier would be invalid.
+Therefore, it is not possible to set the `$validate` field of `PipeOps` to values other than `"predefined"` or `NULL`.
+Only the `GraphLearner` itself can dictate *how* the validation data is created *before* it enters the `Graph`, so the validation data is then preprocessed according to the predict logic.
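+
+As a closing illustration, here is a minimal sketch (not evaluated) of the working alternative: construct the `GraphLearner` without setting `validate` on the XGBoost learner and configure validation afterwards with `set_validate()`, so that the validation data is split off *before* it enters the `Graph`.
+
+```{r, eval = FALSE}
+glrn = as_learner(po("pca") %>>% lrn("classif.xgboost"))
+# sets the $validate field of the GraphLearner to 0.3 and marks its base
+# learner, the XGBoost PipeOp, to use the predefined validation data
+set_validate(glrn, validate = 0.3)
+```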
+ ::: diff --git a/book/chapters/appendices/solutions_large-scale_benchmarking.qmd b/book/chapters/appendices/solutions_large-scale_benchmarking.qmd index 0284a3afb..51d344f0c 100644 --- a/book/chapters/appendices/solutions_large-scale_benchmarking.qmd +++ b/book/chapters/appendices/solutions_large-scale_benchmarking.qmd @@ -104,14 +104,14 @@ lrn_ranger = as_learner( po("learner", lrn("regr.ranger")) ) lrn_ranger$id = "ranger" -lrn_ranger$fallback = lrn("regr.featureless") +lrn_ranger$encapsulate("evaluate", fallback = lrn("regr.featureless")) lrn_rpart = as_learner( ppl("robustify", learner = lrn("regr.rpart")) %>>% po("learner", lrn("regr.rpart")) ) lrn_rpart$id = "rpart" -lrn_rpart$fallback = lrn("regr.featureless") +lrn_rpart$encapsulate("evaluate", fallback = lrn("regr.featureless")) learners = list(lrn_ranger, lrn_rpart) ``` diff --git a/book/chapters/chapter1/introduction_and_overview.qmd b/book/chapters/chapter1/introduction_and_overview.qmd index 70ef27363..8d7b45026 100644 --- a/book/chapters/chapter1/introduction_and_overview.qmd +++ b/book/chapters/chapter1/introduction_and_overview.qmd @@ -27,6 +27,8 @@ Before we can show you the full power of `mlr3`, we recommend installing the `r install.packages("mlr3verse") ``` +Chapters that were added after the release of the printed version of this book are marked with a '+'. + ## Installation Guidelines {#installguide} There are many packages in the `mlr3` ecosystem that you may want to use as you work through this book. diff --git a/book/chapters/chapter10/advanced_technical_aspects_of_mlr3.qmd b/book/chapters/chapter10/advanced_technical_aspects_of_mlr3.qmd index 6ddac2bf6..0fbece769 100644 --- a/book/chapters/chapter10/advanced_technical_aspects_of_mlr3.qmd +++ b/book/chapters/chapter10/advanced_technical_aspects_of_mlr3.qmd @@ -530,7 +530,7 @@ This means that models can be used for fitting and predicting and any conditions However, the result of the experiment will be a missing model and/or predictions, depending on where the error occurs. In @sec-fallback, we will discuss fallback learners to replace missing models and/or predictions. -Each `r ref("Learner")` contains the field `r index("$encapsulate", parent = "Learner", aside = TRUE, code = TRUE)` to control how the train or predict steps are wrapped. +Each `r ref("Learner")` has the method `r index("$encapsulate()", parent = "Learner", aside = TRUE, code = TRUE)` to control how the train or predict steps are wrapped. The first way to encapsulate the execution is provided by the package `r ref_pkg("evaluate")`, which evaluates R expressions and captures and tracks conditions (outputs, messages, warnings or errors) without letting them stop the process (see documentation of `r ref("mlr3misc::encapsulate()")` for full details): ```{r technical-017} @@ -538,13 +538,10 @@ The first way to encapsulate the execution is provided by the package `r ref_pkg lrn_debug = lrn("classif.debug", warning_train = 1, error_train = 1) # enable encapsulation for train() and predict() -lrn_debug$encapsulate = c(train = "evaluate", predict = "evaluate") +lrn_debug$encapsulate("evaluate", fallback = lrn("classif.featureless")) lrn_debug$train(tsk_penguins) ``` -Note how we passed `"evaluate"` to `train` and `predict` to enable encapsulation in both training and predicting. -However, we could have only set encapsulation for one of these stages by instead passing `c(train = "evaluate", predict = "none")` or `c(train = "none", predict = "evaluate")`. 
- Note that encapsulation captures all output written to the standard output (stdout) and standard error (stderr) streams and stores them in the learner's log. However, in some computational setups, the calling process needs to operate on the log output, such as the `r ref_pkg("batchtools")` package in @sec-large-benchmarking. In this case, use the encapsulation method `"try"` instead, which catches signaled conditions but does not suppress the output. @@ -563,7 +560,7 @@ This guards the calling session against segmentation faults which otherwise woul On the downside, starting new processes comes with comparably more computational overhead. ```{r technical-019} -lrn_debug$encapsulate = c(train = "callr", predict = "callr") +lrn_debug$encapsulate("callr", fallback = lrn("classif.featureless")) # set segfault_train and remove warning_train and error_train lrn_debug$param_set$values = list(segfault_train = 1) lrn_debug$train(task = tsk_penguins)$errors @@ -613,13 +610,12 @@ Say an error has occurred when training a model in one or more iterations during We strongly recommend the final option, which is statistically sound and can be easily used in any practical experiment. `mlr3` includes two baseline learners: `lrn("classif.featureless")`, which, in its default configuration, always predicts the majority class, and `lrn("regr.featureless")`, which predicts the average response by default. -To make this procedure convenient during resampling and benchmarking, we support fitting a baseline (though in theory you could use any `Learner`) as a `r index('fallback learner')` by passing a `r ref("Learner")` to `r index('$fallback', parent = "Learner", aside = TRUE, code = TRUE)`. +To make this procedure convenient during resampling and benchmarking, we support fitting a baseline (though in theory you could use any `Learner`) as a `r index('fallback learner')` by passing a `r ref("Learner")` to `r index('$encapsulate()', parent = "Learner", aside = TRUE, code = TRUE)`. In the next example, we add a classification baseline to our debug learner, so that when the debug learner errors, `mlr3` falls back to the predictions of the featureless learner internally. -Note that while encapsulation is not enabled explicitly, it is automatically enabled and set to `"evaluate"` if a fallback learner is added. 
```{r technical-022} lrn_debug = lrn("classif.debug", error_train = 1) -lrn_debug$fallback = lrn("classif.featureless") +lrn_debug$encapsulate("evaluate", fallback = lrn("classif.featureless")) lrn_debug$train(tsk_penguins) lrn_debug @@ -639,7 +635,7 @@ We re-parametrize the debug learner to fail in roughly 50% of the resampling ite ```{r technical-024} lrn_debug = lrn("classif.debug", error_train = 0.5) -lrn_debug$fallback = lrn("classif.featureless") +lrn_debug$encapsulate("evaluate", fallback = lrn("classif.featureless")) aggr = benchmark(benchmark_grid( tsk_penguins, @@ -970,7 +966,7 @@ For an overview of available DBMS in R, see the CRAN task view on databases at ` | - | `r ref("future::plan()")` | - | | - | `r ref("set_threads()")` | - | | - | `r ref("future::tweak()")` | - | -| `Learner` | `lrn()` | `$encapsulate`; `$fallback`; `$timeout`; `$parallel_predict`; `$log` | +| `Learner` | `lrn()` | `$encapsulate()`; `$timeout`; `$parallel_predict`; `$log` | | `r ref("lgr::Logger")` | `r ref("lgr::get_logger")` | `$set_threshold()` | | `r ref("mlr3db::DataBackendDplyr")` | `r ref("mlr3::as_data_backend")` | - | | `r ref("mlr3db::DataBackendDuckDB")` | `r ref("as_duckdb_backend")` | - | diff --git a/book/chapters/chapter11/large-scale_benchmarking.qmd b/book/chapters/chapter11/large-scale_benchmarking.qmd index 1743cc4a6..812f05935 100644 --- a/book/chapters/chapter11/large-scale_benchmarking.qmd +++ b/book/chapters/chapter11/large-scale_benchmarking.qmd @@ -49,15 +49,13 @@ lrn_baseline = lrn("classif.featureless", id = "featureless") lrn_lr = lrn("classif.log_reg") lrn_lr = as_learner(ppl("robustify", learner = lrn_lr) %>>% lrn_lr) lrn_lr$id = "logreg" -lrn_lr$fallback = lrn_baseline -lrn_lr$encapsulate = c(train = "try", predict = "try") +lrn_lr$encapsulate("try", fallback = lrn_baseline) # random forest pipeline lrn_rf = lrn("classif.ranger") lrn_rf = as_learner(ppl("robustify", learner = lrn_rf) %>>% lrn_rf) lrn_rf$id = "ranger" -lrn_rf$fallback = lrn_baseline -lrn_rf$encapsulate = c(train = "try", predict = "try") +lrn_rf$encapsulate("try", fallback = lrn_baseline) learners = list(lrn_lr, lrn_rf, lrn_baseline) ``` diff --git a/book/chapters/chapter12/model_interpretation.qmd b/book/chapters/chapter12/model_interpretation.qmd index 078595c7b..e6f343d9c 100644 --- a/book/chapters/chapter12/model_interpretation.qmd +++ b/book/chapters/chapter12/model_interpretation.qmd @@ -239,7 +239,7 @@ To illustrate this, we will select a random data point to explain. As we are dealing with people, we will name our observation "Charlie" and first look at the black box predictions: ```{r Charlie, asis='results'} -Charlie = credit_x[35, ] +Charlie = tsk_german$data(rows = 127L, cols = tsk_german$feature_names) gbm_predict = predictor$predict(Charlie) gbm_predict ``` @@ -315,7 +315,7 @@ The `sample.size` argument (default is `sample.size = 100`) can be increased to #| fig-cap: Shapley values for Charlie. The actual prediction (0.63) displays the prediction of the model for the observation we are interested in, the average prediction (0.71) displays the average prediction over the given test dataset. Each horizontal bar is the Shapley value (phi) for the given feature. #| fig-alt: 10 bar plots of Shapley values, one for each feature. x-axis says 'phi' and ranges from -0.1 to 0.05. The strongest positive contributions are from the `duration`, `purpose` and `property` variables. The strongest negative contributions are `status`, `amount`, and `savings`. 
#| label: fig-iml-shapley -shapley = Shapley$new(predictor, x.interest = Charlie, +shapley = Shapley$new(predictor, x.interest = as.data.frame(Charlie), sample.size = 1000) shapley$plot() ``` diff --git a/book/chapters/chapter15/predsets_valid_inttune.qmd b/book/chapters/chapter15/predsets_valid_inttune.qmd new file mode 100644 index 000000000..b63724971 --- /dev/null +++ b/book/chapters/chapter15/predsets_valid_inttune.qmd @@ -0,0 +1,481 @@ +--- +aliases: + - "/predsets_valid_inttune.html" +--- + +# Predict Sets, Validation and Internal Tuning (+) {#sec-predsets-valid-inttune} + +{{< include ../../common/_setup.qmd >}} + +`r chapter = "Predict Sets, Validation and Internal Tuning (+)"` `r authors(chapter)` + +## Predict Sets and Training Error Estimation {#sec-predict-sets} + +In @sec-performance we have already studied in detail how to train, predict and evaluate many different learners. Evaluating a fully trained model usually requires making predictions on unseen test observations. When we predict directly with a trained learner, we can explicitly control which observations are used: + +```{r} +tsk_sonar = tsk("sonar") +lrn_rf = lrn("classif.ranger") +lrn_rf$train(tsk_sonar, row_ids = 4:208) +pred1 = lrn_rf$predict(tsk_sonar, row_ids = 1:3) +pred2 = lrn_rf$predict_newdata(tsk_sonar$data(1:3)) +``` + +But when using `resample()` or `benchmark()`, the default behavior is to predict on the *test* set of the resampling. It is also possible to make predictions on other dedicated subsets of the task and data, i.e. the *train* and *internal_valid* data, by configuring the `$predict_sets` of a learner. +We will discuss the more complex *internal_valid* option in the next sections. +We will now look at how to predict on *train* sets. +This is sometimes be of interest for further analysis or to study overfitting. Or maybe we are simply curious. +Let's configure our learner to simultaneously predict on *train* and *test*: + +```{r} +lrn_rf$predict_sets = c("train", "test") +rr = resample(tsk_sonar, lrn_rf, rsmp("cv", folds = 3)) +``` + +The learner, during resampling, will now after having been trained for the current iteration, produce predictions on all requested sets. To access them, we can either ask for a list of 3 prediction objects, one per CV fold, or we can ask for a combined prediction object for the whole CV -- which in this case contains as many prediction rows as observations in the task. + +```{r} +str(rr$predictions("test")) # or str(rr$predictions("train")) +rr$prediction("test") # or rr$prediction("train") +``` + +We can also apply performance measures to specific sets of the resample result: + +```{r} +rr$aggregate(list( + msr("classif.ce", predict_sets = "train", id = "ce_train"), + msr("classif.ce", predict_sets = "test", id = "ce_test") +)) +``` + +The default predict set for a measure is usually the test set. But we can request other sets here. If multiple predict sets are requested for the measure, their predictions are joined before they are passed into the measure, which then usually calculates an aggregated score over all predicted rows of the set. In our case, unsurprisingly, the train error is lower than the test error. + +If we only want to access information that is computed during training, we can even configure the learner not to make any predictions at all. This is useful, for example, for learners that already (in their underlying implementation) produce an estimate of their generalization error during training, e.g. 
using out-of-bag error estimates or validation scores. The former, which is only available to learners with the 'oob_error' property, can be accessed via `r ref("MeasureOOBError")`. The latter is available to learners with the 'validation' property and is implemented as `r ref("MeasureInternalValidScore")`. Below we evaluate a random forest using its out-of-bag error. Since we do not need any predict sets, we can use `r ref("ResamplingInsample")`, which will use the entire dataset for training. + +```{r} +lrn_rf$predict_sets = NULL +rsmp_in = rsmp("insample") +rr = resample(tsk_sonar, lrn_rf, rsmp_in, store_models = TRUE) +msr_oob = msr("oob_error") +rr$aggregate(msr_oob) +``` + +All this works in exactly the same way for benchmarking, tuning, nested resampling, and any other procedure where resampling is internally involved and we either generate predictions or apply performance measures on them. Below we illustrate this by tuning the `mtry.ratio` parameter of a random forest (with a simple grid search). +Instead of explicitly making predictions on some test data and evaluating them, we use OOB error to evaluate `mtry.ratio`. +This can speed up the tuning process considerably, as in this case only one RF is fitted (it is simply trained) and we can access the OOB from this single model, instead of fitting multiple models. As the OOB observations are untouched during the training of each tree in the ensemble, this still produces a valid performance estimate. + +```{r} +lrn_rf$param_set$set_values( + mtry.ratio = to_tune(0.1, 1) +) + +ti = tune( + task = tsk_sonar, + tuner = tnr("grid_search"), + learner = lrn_rf, + resampling = rsmp_in, + measure = msr_oob, + term_evals = 10, + store_models = TRUE +) +``` + +## Validation {#sec-validation} + +For iterative training (which many learners use) it can be interesting to track performance *during* training on *validation* data. One can use this for simple logging or posthoc analysis, but the major use case is early stopping. If the model’s performance on the training data keeps improving but the performance on the validation data plateaus or degrades, this indicates overfitting and we should stop iterative training. Handling this in an online fashion during training is much more efficient than configuring the number of iterations from the outside via traditional, offline hyperparameter tuning, where we would fit the model again and again with different iteration numbers (and would not exploit any information regarding sequential progress). + +In `mlr3`, learners can have the 'validation' and 'internal_tuning' properties to indicate whether they can make use of a validation set and whether they can internally optimize hyperparameters, for example by stopping early. To check if a given learner supports this, we can simply access its `$properties` field. Examples of such learners are boosting algorithms like XGBoost, LightGBM, or CatBoost, as well as deep learning models from `r ref_pkg("mlr3torch")`. In this section we will train XGBoost on sonar and keep track of its performance on a validation set. + +```{r} +tsk_sonar = tsk("sonar") +lrn_xgb = lrn("classif.xgboost") +lrn_xgb +``` + +To enable validation, we need to configure how the validation data is constructed. For XGBoost there is a special `watchlist` parameter, but `mlr3` also provides a standardized -- and as we will see later, more powerful -- interface via the learner's `$validate` field. 
This field can be set to: + +- `NULL` to use no validation data (default), +- a ratio indicating the proportion of training data to be used as the validation set, +- `"predefined"` to use the validation data specified in the task (we will see shortly how to configure this), and +- `"test"` to use the test set as validation data, which only works in combination with resampling and tuning. + +::: callout-note +## Test Data Leakage + +If a learner's `$validate` field is set to 'test', we will leak the resampling test set during training. This will lead to biased performance estimates if the validation scores are used for early stopping. Whether this is desireable depends on the context: if the test set is used to evaluate parameter configurations during HPO (i.e. it acts as a validation set), then this is usually OK; However, if the purpose of the test set is to provide an unbiased estimate of performance, e.g. to compare different learners, then this is not OK. +::: + +Below, we configure the XGBoost learner to use $1/3$ of its training data for validation: + +```{r} +lrn_xgb$validate = 1/3 +``` + +Next, we set the number of iterations (`nrounds`) and which metric to track (`eval_metric`) and train the learner. Here, $1/3$ of the observations from the training task will be solely used for validation and the remaining $2/3$ for training. If stratification or grouping is enabled in the task, this will also be respected. +For further details on this see @sec-performance. + +```{r} +lrn_xgb$param_set$set_values( + nrounds = 100, + eval_metric = "logloss" +) +lrn_xgb$train(tsk_sonar) +``` + +Because the XGBoost learner kept a log of the validation performance, we can now access this through the `$model` slot. Where exactly in the model this information is stored, depends on the specific learning algorithm. For XGBoost, the history is stored in `$evaluation_log`: + +```{r} +tail(lrn_xgb$model$evaluation_log) +``` + +The validation loss over time is visualized in the figure below, with the iterations on the x-axis and the validation logloss on the y-axis: + +```{r, out.width = "70%", echo = FALSE, warning = FALSE} +library(ggplot2) +set.seed(1) +ggplot(lrn_xgb$model$evaluation_log, aes(x = iter, y = test_logloss)) + + geom_line() + + labs( + x = "Boosting Iteration", + y = "Validation Logloss" + ) + theme_minimal() +``` + +`mlr3` also provides a standardized acccessor for the final validation performance. We can access this via the `$internal_valid_scores` field, which is a named list containing possibly more than one validation metric. + +```{r} +lrn_xgb$internal_valid_scores +``` + +In some cases one might want to have more control over the construction of the validation data. This can be useful, for example, if there is a predefined validation split to be used with a task. Such fine-grained control over the validation data is possible by setting the `validate` field to `"predefined"`. + +```{r} +lrn_xgb$validate = "predefined" +``` + +This allows us to use the `$internal_valid_task` defined in the training task. Below, we set the validation task to use 60 randomly sampled ids and remove them from the primary task. + +```{r} +valid_ids = sample(tsk_sonar$nrow, 60) +tsk_valid = tsk_sonar$clone(deep = TRUE) +tsk_valid$filter(valid_ids) +tsk_sonar$filter(setdiff(tsk_sonar$row_ids, valid_ids)) +tsk_sonar$internal_valid_task = tsk_valid +``` + +Note that we could have achieved the same by simply setting `tsk_valid$internal_valid_task = valid_ids`, but showed the explicit way for completeness sake. 
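+
+As a quick sketch (not evaluated here, since the task has already been split above), the shortcut mentioned in the previous sentence is a single assignment on the primary task:
+
+```{r, eval = FALSE}
+# assigning the row ids directly achieves the same split as the explicit
+# clone/filter steps shown above
+tsk_sonar$internal_valid_task = valid_ids
+```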
+The associated validation task now has 60 observations and the primary task 148: + +```{r} +c(tsk_sonar$internal_valid_task$nrow, tsk_sonar$nrow) +``` + +When we now train, the learner will validate itself on the specified additional task. +Note that the `$internal_valid_task` slot is always used internally, even if you set a ratio value in `learner$validate`, it is simply automatically auto-constructed (and then passed down). + +```{r} +lrn_xgb$train(tsk_sonar) +``` + +In many cases, however, one does not only train an individual learner, but combines it with other (preprocessing) steps in a `r ref("GraphLearner")`, see @sec-preprocessing. Validation in a `GraphLearner` is still possible, because preprocessing `PipeOp`s also handle the validation task. While the *train* logic of the `PipeOp`s is applied to the primary task, the *predict* logic is applied to the validation data. This ensures that there is no data leakage when the XGBoost learner evaluates its performance on the validation data. Below, we construct a `PipeOpPCA` and apply it to the sonar task with a validation task. + +```{r} +po_pca = po("pca") +taskout = po_pca$train(list(tsk_sonar))[[1]] +taskout$internal_valid_task +``` + +The preprocessing that is applied to the `$internal_valid_task` during `$train()` is equivalent to predicting on it: + +```{r} +po_pca$predict(list(tsk_sonar$internal_valid_task))[[1L]] +``` + +This means that tracking validation performance works even in complex graph learners, which would not be possible when simply setting the `watchlist` parameter of XGBoost. Below, we chain the PCA operator to XGBoost and convert it to a learner. + +```{r} +glrn = as_learner(po_pca %>>% lrn_xgb) +``` + +While this almost 'just works', we now need to specify the `$validate` field on two levels: + +1. For the `GraphLearner` itself, i.e. how the validation data is created before the `Task` enters the graph. +2. Which `PipeOp`s that have the property `"validation"` should actually use it. + +This configuration can be simplified by using `set_validate()`. When applied to a `GraphLearner`, we can specify the arguments `validate` which determines *how* to create the validation data and optionally the argument `ids` which specifies *which* `PipeOp`s should use it. By default, the latter is set to the `$base_learner()` of the `Graph`, which is the last learner. This means that both calls below are equivalent: + +```{r} +set_validate(glrn, validate = "predefined") +set_validate(glrn, validate = "predefined", ids = "classif.xgboost") +``` + +We can now train the graph learner just as before and inspect the final validation metric, which is now prefixed with the ID of the corresponding `PipeOp`. + +```{r} +glrn$validate = "predefined" +glrn$train(tsk_sonar) +glrn$internal_valid_scores +``` + +::: callout-note +## Field `$validate` for `PipeOp`s + +Since individual `PipeOp`s cannot control how the validation data is generated, only whether to use it, their `$validate` field can only be set to `NULL` or `"predefined"`. This is why we get an error when running `as_pipeop(lrn("classif.xgboost", validate = 0.3))`. When using validation in a GraphLearner, it is best to first construct the learner without specifying the validation data and then use `set_validate()`. +::: + +## Internal Tuning {#sec-internal-tuning} + +Not only can XGBoost log its validation performance, it can also monitor it to *early stop* its training, i.e. perform internal tuning of the `nrounds` hyperparameter during training. 
This is marked by the `"internal_tuning"` property: + +```{r} +"internal_tuning" %in% lrn_xgb$properties +``` + +Early stopping for XGBoost can be enabled by specifying the `early_stopping_rounds` parameter. This is also known as *patience* and specifies for how many iterations the validation loss must not improve for the training to terminate. The metric that is used for early stopping is the first value that we passed to `eval_metric`, which was the logloss. + +```{r} +lrn_xgb$param_set$set_values( + early_stopping_rounds = 10, + nrounds = 100 +) +``` + +When we now train the learner, we can access the internally optimized `nrounds` through the `$internal_tuned_values` field. + +```{r} +lrn_xgb$train(tsk_sonar) +lrn_xgb$internal_tuned_values +``` + +By using early stopping, we were able to already terminate training after `r lrn_xgb$internal_tuned_values$nrounds + lrn_xgb$param_set$values$early_stopping_rounds` iterations. Below, we visualize the validation loss over time and the optimal nrounds is marked red. We can see that the logloss plateaus after `r lrn_xgb$internal_tuned_values$nrounds` rounds, but training continues for a while afterwards due to the patience setting. + +```{r, echo = FALSE, out.width = "70%"} +theme_set(theme_minimal()) +data = lrn_xgb$model$evaluation_log +ggplot(data, aes(x = iter, y = test_logloss)) + + geom_line() + + geom_point(data = data.table(x = lrn_xgb$internal_tuned_values$nrounds, + y = lrn_xgb$internal_valid_scores$logloss), aes(x = x, y = y, color = "red"), show.legend = FALSE) + + labs( + x = "Iteration", y = "Validation Logloss" + ) +``` + +So far we have only used the early stopping implementation of XGBoost to optimize `nrounds`, but have not tuned any other hyperparameters. This is where `r mlr3` comes in, as it allows us to combine the internal tuning of a learner with (non-internal) hyperparameter tuning via `r ref_pkg("mlr3tuning")`. To do this, we set both parameters to `to_tune()`, but mark `nrounds` to be tuned internally. + +```{r} +lrn_xgb$param_set$set_values( + eta = to_tune(0.001, 0.1, logscale = TRUE), + nrounds = to_tune(upper = 500, internal = TRUE) +) +``` + +In such scenarios, one might often want to use the same validation data to optimize `eta` and `nrounds`. This is possible by specifying the `"test"` option of the `validate` field. This means that in each resampling iteration the validation data will be set to the test set, i.e. the same data that will also be used to evaluate the parameter configuration (to tune `eta`). + +```{r} +lrn_xgb$validate = "test" +``` + +We will now continue to tune XGBoost with a simple grid search with 10 evaluations and a 3-fold CV for inner resampling. +Internally, this will train XGBoost with 10 different values of `eta` and the `nrounds` parameter fixed at 500, i.e. the upper bound from above. For each value of `eta` a 3-fold CV with early stopping will be performed, yielding 3 (possibly different) early stopped values for `nrounds` for each value of `eta`. These are combined into a single value according to an aggregation rule, which by default is set to averaging, but which can be overridden when creating the internal tune token, see `r ref("to_tune()")` for more information. + +When combining internal tuning with hyperparameter optimization via `r ref_pkg("mlr3tuning")` we need to specify two performance metrics: one for the internal tuning and one for the `Tuner`. For this reason, `mlr3` requires the internal tuning metric to be set explicitly, even if a default value exists. 
There are two ways to use the same metric for both types of hyperparameter optimization: + +1. Use `msr("internal_valid_scores", select = )`, i.e. the final validation score, as the tuning measure. As a learner can have multiple internal valid scores, the measure allows us to select one by specifying the `select` argument. We also need to specify whether the measure should be minimized. +2. Set both, the `eval_metric` and the tuning measure to the same metric, e.g. `eval_metric = "error"` and `measure = msr("classif.ce")`. Some learners even allow to set the validation metric to an `mlr3::Measure`. You can find out which ones support this feature by checking their corresponding documentation. One example for this is XGBoost. + +The advantage of using the first option is that the predict step can be skipped because the internal validation scores are already computed during training. +In a certain sense, this is similar to the evaluation of the random forest with the OOB error in @sec-predict-sets. + +```{r} +tsk_sonar = tsk("sonar") +lrn_xgb$predict_sets = NULL + +ti = tune( + tuner = tnr("grid_search"), + learner = lrn_xgb, + task = tsk_sonar, + resampling = rsmp("cv", folds = 3), + measure = msr("internal_valid_score", + select = "logloss", minimize = TRUE), + term_evals = 10L +) +``` + +The tuning result contains the best found configuration for both `eta` and `nrounds`. + +```{r} +ti$result_learner_param_vals[c("eta", "nrounds")] +``` + +We now show how to extract the different parameter configurations from the tuning archive. All internally tuned parameters are accessible via the `$internal_tuned_values`. This is a list column, because it is possible to tune more than one parameter internally, e.g. in a `GraphLearner`. Below we extract the values for `eta` (transformed back from its log scale), `nrounds` (internally tuned) and the logloss. The latter was evaluated on the internal validation tasks, which corresponded to the `Resampling`'s test sets as we specified `validate = "test"`. By visualizing the results we can see an inverse relationship between the two tuning parameters: a smaller step size (eta) requires more boosting iterations (nrounds). + +```{r, out.width = "70%"} +d = ti$archive$data + +d = data.table( + eta = exp(d$eta), + nrounds = unlist(d$internal_tuned_values), + logloss = d$logloss +) + +ggplot(data = d, aes(x = eta, y = nrounds, color = logloss)) + + geom_point() + theme_minimal() +``` + +This also works with an `r ref("AutoTuner")`, which will use the internally optimized `nrounds`, as well as the offline tuned `eta` for the final model fit. +This means that there is no validation or early stopping when training the final model, and we use all available data. + +```{r} +at = auto_tuner( + tuner = tnr("grid_search"), + learner = lrn_xgb, + resampling = rsmp("cv", folds = 3), + measure = msr("internal_valid_score", + select = "logloss", minimize = TRUE), + term_evals = 10L +) +at$train(tsk_sonar) +``` + +If we were to resample the `AutoTuner` from above, we would still get valid performance estimates. +This is because the test set of the outer resampling is *never* used as validation data, since the final model fit does not perform any validation. +The validation data generated during the hyperparameter tuning uses the test set of the inner resampling, which is a subset of the training set of the outer resampling. + +However, care must be taken when using the test set of a resampling for validation. 
Whether this is OK depends on the context and purpose of the resampling. +If the purpose of resampling is to get an unbiased performance estimate of algorithms, some of which stop early and some of which don't, this is not OK. +In such a situation, the former would have an unfair advantage over the latter. The example below illustrates such a case where this would not be a fair comparison between the two learners. + +```{r} +lrn_xgb$param_set$set_values( + eta = 0.1, nrounds = 500, early_stopping_rounds = 10 +) +lrn_xgb$predict_sets = "test" + +design = benchmark_grid( + tsk_sonar, list(lrn_xgb, lrn("classif.rpart")), rsmp("cv", folds = 3) +) +bmr = benchmark(design) +bmr$aggregate(msr("classif.ce")) +``` + +At last, we will cover how to enable internal tuning when manually specifying a search space with the `ps()` function instead of the `to_tune()`-mechanism. +While the latter is more convenient and therefore usually recommended, manually defining a search space gives you for more flexibility with respect to parameter transformations, see e.g. @sec-tune-trafo. +There are two ways to do this: +1. We can include the internally tuned parameters in the primary `search_space`, specify an aggregation function and tag them with `"internal_tuning"`. +2. We define a seperate `internal_search_space` where we also define the aggregation function but don't include the `"internal_tuning"` tag. + +We start by illustrating the first approach, which is more compact as the search space is defined as a single object. + +```{r} +search_space = ps( + eta = p_dbl(0.001, 0.1, logscale = TRUE), + nrounds = p_int(upper = 500, tags = "internal_tuning", + aggr = function(x) as.integer(mean(unlist(x)))) +) +``` + +This search space can be passed to the `AutoTuner` and the optimization will then proceed as before. + +```{r} +at = auto_tuner( + tuner = tnr("grid_search"), + learner = lrn_xgb, + resampling = rsmp("cv", folds = 3), + measure = msr("internal_valid_score", + select = "logloss", minimize = TRUE), + search_space = search_space, + term_evals = 10L +) +at$train(tsk_sonar) +``` + +One reason to use the second approach is when a global parameter transformation (`.extra_trafo`) is applied to the primary search space. +Univariate transformation, such as logarithmic scaling, are no problem. +Below, we demonstrate this using an illustrative example where we sample set the `max_depth` of the trees depending on the value of `eta`: + +```{r} +search_space = ps(eta = p_dbl(0.001, 0.1, logscale = TRUE), + .extra_trafo = function(x, param_set) { + x$max_depth = if (exp(x$eta) < 0.01) 10 else 20 + x + }) + +internal_search_space = ps( + nrounds = p_dbl(upper = 500, tags = "internal_tuning", + aggr = function(x) as.integer(mean(unlist(x)))) +) + +at = auto_tuner( + tuner = tnr("grid_search"), + learner = lrn_xgb, + resampling = rsmp("cv", folds = 3), + measure = msr("internal_valid_score", + select = "logloss", minimize = TRUE), + search_space = search_space, + internal_search_space = internal_search_space, + term_evals = 10L +) +at$train(tsk_sonar) +``` + +## Conclusion + +In this chapter we first learned how to evaluate machine learning methods on different prediction sets, namely *train*, *internal_valid* and *test*. Then we learned how to track the performance of an iterative learning procedure on a validation set. This technique also works seamlessly in a graphlearner, the only difference being that you have to specify not only how to create the validation data, but also which PipeOps should use it. 
Furthermore, mlr3's *internal tuning* mechanism allows you to combine hyperparameter tuning via `r mlr3tuning` with internal tuning of the learning algorithm, such as early stopping of XGBoost. + +## Exercises + +1. Manually `$train()` a LightGBM classifier from `r ref_pkg("mlr3extralearners")` on the pima task using $1/3$ of the training data for validation. As the pima task has missing values, select a method from `r ref_pkg("mlr3pipelines")` to impute them. Explicitly set the evaluation metric to logloss (`"binary_logloss"`), the maximum number of boosting iterations to 1000, the patience parameter to 10, and the step size to 0.01. After training the learner, inspect the final validation scores as well as the early stopped number of iterations. + +2. Wrap the learner from exercise 1) in an `AutoTuner` using a three-fold CV for the tuning. Also change the rule for aggregating the different boosting iterations from averaging to taking the maximum across the folds. Don't tune any parameters other than `nrounds`, which can be done using `tnr("internal")`. Use the internal validation metric as the tuning measure. Compare this learner with a `lrn("classif.rpart")` using a 10-fold outer cross-validation with respect to classification accuracy. + +3. Consider the code below: + + ```{r} + branch_lrn = as_learner( + ppl("branch", list( + lrn("classif.ranger"), + lrn("classif.xgboost", + early_stopping_rounds = 10, + eval_metric = "error", + eta = to_tune(0.001, 0.1, logscale = TRUE), + nrounds = to_tune(upper = 1000, internal = TRUE))))) + + set_validate(branch_lrn, validate = "test", ids = "classif.xgboost") + branch_lrn$param_set$set_values(branch.selection = to_tune()) + + at = auto_tuner( + tuner = tnr("grid_search"), + learner = branch_lrn, + resampling = rsmp("holdout", ratio = 0.8), + # cannot use internal validation score because ranger does not have one + measure = msr("classif.ce"), + term_evals = 10L, + store_models = TRUE + ) + + tsk_sonar = tsk("sonar")$filter(1:100) + + rr = resample( + tsk_sonar, at, rsmp("holdout", ratio = 0.8), store_models = TRUE + ) + ``` + + Answer the following questions (ideally without running the code): + + 3.1 During the hyperparameter optimization, how many observations are used to train the XGBoost algorithm (excluding validation data) and how many for the random forest? Hint: learners that cannot make use of validation data ignore it. 3.2 How many observations would be used to train the final model if XGBoost was selected? What if the random forest was chosen? 3.3 How would the answers to the last two questions change if we had set the `$validate` field of the graphlearner to `0.25` instead of `"test"`? + +4. Look at the (failing) code below: + + ```{r, eval = FALSE} + tsk_sonar = tsk("sonar") + glrn = as_learner( + po("pca") %>>% lrn("classif.xgboost", validate = 0.3) + ) + ``` + + Can you explain *why* the code fails? Hint: Should the data that xgboost uses for validation be preprocessed according to the *train* or *predict* logic? 
+ +::: {.content-visible when-format="html"} +`r citeas(chapter)` +::: diff --git a/book/chapters/chapter3/evaluation_and_benchmarking.qmd b/book/chapters/chapter3/evaluation_and_benchmarking.qmd index cb2c1ceb6..d41c4907e 100644 --- a/book/chapters/chapter3/evaluation_and_benchmarking.qmd +++ b/book/chapters/chapter3/evaluation_and_benchmarking.qmd @@ -204,7 +204,7 @@ acc[, .(iteration, classif.ce)] ::: {.callout-tip} ## Evaluating Train Sets -By default, `$score()` evaluates the performance in the *test* sets in each iteration, however, you could evaluate the *train* set performance with `$score(predict_sets = "train")`. +By default, `$score()` evaluates the performance in the *test* sets in each iteration, however, you could evaluate the *train* set performance, see @sec-valid-tuning. ::: While `$score()` returns the performance in each evaluation, `r index('$aggregate()', parent = "Learner", aside = TRUE, code = TRUE)`, returns the aggregated score across all resampling iterations. diff --git a/book/chapters/chapter5/advanced_tuning_methods_and_black_box_optimization.qmd b/book/chapters/chapter5/advanced_tuning_methods_and_black_box_optimization.qmd index 587c40e5d..81751346a 100644 --- a/book/chapters/chapter5/advanced_tuning_methods_and_black_box_optimization.qmd +++ b/book/chapters/chapter5/advanced_tuning_methods_and_black_box_optimization.qmd @@ -52,13 +52,13 @@ In the above example, we can see the tuning process breaks and we lose all infor This is even worse in nested resampling or benchmarking when errors could cause us to lose all progress across multiple configurations or even learners and tasks. `r index('Encapsulation')` (@sec-encapsulation) allows errors to be isolated and handled, without disrupting the tuning process. -We can tell a learner to encapsulate an error by setting the `$encapsulate` field as follows: +We can tell a learner to encapsulate an error using the `$encapsulate()` method as follows: ```{r optimization-035} -learner$encapsulate = c(train = "evaluate", predict = "evaluate") +learner$encapsulate(method = "evaluate", fallback = lrn("classif.featureless")) ``` -Note by passing `"evaluate"` to both `train` and `predict`, we are telling the learner to set up encapsulation in both the training and prediction stages (see @sec-error-handling for other encapsulation options). +Note by passing `"evaluate"`, we are telling the learner to set up encapsulation in both the training and prediction stages (see @sec-error-handling for other encapsulation options). Another common issue that cannot be easily solved during HPO is learners not converging and the process running indefinitely. We can prevent this from happening by setting the `timeout` field in a learner, which signals the learner to stop if it has been running for that much time (in seconds), again this can be set for training and prediction individually: @@ -72,11 +72,7 @@ When this happens, our hyperparameter optimization experiment will fail as we ca Therefore it is essential to select a `r index('fallback learner')` (@sec-fallback), which is a learner that will be fitted if the learner of interest fails. A common approach is to use a featureless baseline\index{baselines} (`lrn("regr.featureless")` or `lrn("classif.featureless")`). -Below we set `lrn("classif.featureless")`, which always predicts the majority class, by passing this learner to the `$fallback` field. 
- -```{r optimization-037} -learner$fallback = lrn("classif.featureless") -``` +We use `lrn("classif.featureless")`, which always predicts the majority class. We can now run our experiment and see errors that occurred during tuning in the archive. @@ -1095,7 +1091,7 @@ In the next chapter we will look at feature selection and see how `r mlr3filters | Class | Constructor/Function | Fields/Methods | |-------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------|-----------------------------| -| `r ref("Learner")` | `r ref("lrn")` | `$encapsulate`; `$fallback` | +| `r ref("Learner")` | `r ref("lrn")` | `$encapsulate()`; | | `r ref("TuningInstanceBatchMultiCrit")` | `r ref("ti()")`/`r ref("tune()")` | `$result`; `$archive` | | `r ref("mlr3hyperband::TunerHyperband")` | `tnr("hyperband")` | - | | `r ref("bbotk::Objective")` | - | | diff --git a/book/chapters/chapter8/non-sequential_pipelines_and_tuning.qmd b/book/chapters/chapter8/non-sequential_pipelines_and_tuning.qmd index 21bb70f16..27e5625cf 100644 --- a/book/chapters/chapter8/non-sequential_pipelines_and_tuning.qmd +++ b/book/chapters/chapter8/non-sequential_pipelines_and_tuning.qmd @@ -750,9 +750,8 @@ graph_learner = as_learner( As good practice, we encapsulate our learner and add a fallback to prevent fatal errors (@sec-tuning-errors). ```{r} -graph_learner$encapsulate = c(train = "evaluate", predict = "evaluate") +graph_learner$encapsulate("evaluate", lrn("classif.featureless")) graph_learner$timeout = c(train = 30, predict = 30) -graph_learner$fallback = lrn("classif.featureless") ``` Now we can tune our SVM by tuning our `GraphLearner` as normal, below we set `eta = 3` for Hyperband. diff --git a/book/common/chap_auths.csv b/book/common/chap_auths.csv index e1a814ec6..3d95b93d7 100644 --- a/book/common/chap_auths.csv +++ b/book/common/chap_auths.csv @@ -12,4 +12,5 @@ Chapter Number,Title,Authors 11,Large-Scale Benchmarking,"Sebastian Fischer, Michel Lang, Marc Becker" 12,Model Interpretation,"Susanne Dandl, Przemysław Biecek, Giuseppe Casalicchio, Marvin N. 
Wright" 13,Beyond Regression and Classification,"Raphael Sonabend, Patrick Schratz, Damir Pulatov" -14,Algorithmic Fairness,Florian Pfisterer \ No newline at end of file +14,Algorithmic Fairness,Florian Pfisterer +15,"Predict Sets, Validation and Internal Tuning (+)", Sebastian Fischer diff --git a/book/renv.lock b/book/renv.lock index cb12b6486..ae9e48405 100644 --- a/book/renv.lock +++ b/book/renv.lock @@ -834,7 +834,7 @@ }, "bbotk": { "Package": "bbotk", - "Version": "1.0.1", + "Version": "1.2.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -3146,6 +3146,11 @@ ], "Hash": "b8552d117e1b808b09a832f589b79035" }, + "lightgbm": { + "Package": "lightgbm", + "Version": "4.5.0", + "Source": "Repository" + }, "linprog": { "Package": "linprog", "Version": "0.9-4", @@ -3486,28 +3491,8 @@ }, "mlr3": { "Package": "mlr3", - "Version": "0.20.2", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "R6", - "backports", - "checkmate", - "data.table", - "evaluate", - "future", - "future.apply", - "lgr", - "mlbench", - "mlr3measures", - "mlr3misc", - "palmerpenguins", - "paradox", - "parallelly", - "uuid" - ], - "Hash": "2e5225a210cef7155a2899a4d16f1cf2" + "Version": "0.21.1", + "Source": "Repository" }, "mlr3batchmark": { "Package": "mlr3batchmark", @@ -3643,27 +3628,14 @@ }, "mlr3extralearners": { "Package": "mlr3extralearners", - "Version": "0.8.0-9000", + "Version": "0.9.0-9000", "Source": "GitHub", - "Remotes": "binderh/CoxBoost, catboost/catboost/catboost/R-package, mlr-org/mlr3proba, RaphaelS1/survivalmodels, PlantedML/randomPlantedForest, xoopR/distr6, xoopR/param6, xoopR/set6, ropensci/aorsf", "RemoteType": "github", "RemoteHost": "api.github.com", - "RemoteRepo": "mlr3extralearners", "RemoteUsername": "mlr-org", - "RemotePkgRef": "mlr-org/mlr3extralearners", - "RemoteRef": "HEAD", - "RemoteSha": "177791078dfdb168e715f6bd36c8aa7f0724d4f1", - "Requirements": [ - "R", - "R6", - "checkmate", - "data.table", - "methods", - "mlr3", - "mlr3misc", - "paradox" - ], - "Hash": "f7c35090f956152200eefb718e5cbae2" + "RemoteRepo": "mlr3extralearners", + "RemoteRef": "main", + "RemoteSha": "5a3f664df5b3c55a80d1f2c76bc9d4d6cb8d30ef" }, "mlr3fairness": { "Package": "mlr3fairness", @@ -3711,7 +3683,7 @@ }, "mlr3fselect": { "Package": "mlr3fselect", - "Version": "1.0.0", + "Version": "1.2.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -3765,7 +3737,7 @@ }, "mlr3mbo": { "Package": "mlr3mbo", - "Version": "0.2.4", + "Version": "0.2.6", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -3785,15 +3757,8 @@ }, "mlr3measures": { "Package": "mlr3measures", - "Version": "0.6.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "PRROC", - "R", - "checkmate" - ], - "Hash": "b029a9f31f60ee3895291ec49ce5fe03" + "Version": "1.0.0", + "Source": "Repository" }, "mlr3misc": { "Package": "mlr3misc", @@ -3838,51 +3803,19 @@ }, "mlr3pipelines": { "Package": "mlr3pipelines", - "Version": "0.6.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "R6", - "backports", - "checkmate", - "data.table", - "digest", - "lgr", - "mlr3", - "mlr3misc", - "paradox", - "withr" - ], - "Hash": "4377f534ff568943e8ab56af58c0e65a" + "Version": "0.7.0", + "Source": "Repository" }, "mlr3proba": { "Package": "mlr3proba", - "Version": "0.6.7", + "Version": "0.7.1", "Source": "GitHub", - "Remotes": "xoopR/distr6, xoopR/param6, xoopR/set6", "RemoteType": "github", "RemoteHost": "api.github.com", - "RemoteRepo": 
"mlr3proba", "RemoteUsername": "mlr-org", - "RemotePkgRef": "mlr-org/mlr3proba", - "RemoteRef": "HEAD", - "RemoteSha": "737498ef4ec4a95f0322baaee242c47995975cf3", - "Requirements": [ - "R", - "R6", - "Rcpp", - "checkmate", - "data.table", - "distr6", - "ggplot2", - "mlr3", - "mlr3misc", - "mlr3viz", - "paradox", - "survival" - ], - "Hash": "8b541bafcadd93623961cbe8d3ed8def" + "RemoteRepo": "mlr3proba", + "RemoteRef": "main", + "RemoteSha": "de18eb589c6a1c675b9e8895fe1a5c3bd7d1a6c5" }, "mlr3spatial": { "Package": "mlr3spatial", @@ -3924,21 +3857,14 @@ }, "mlr3tuning": { "Package": "mlr3tuning", - "Version": "1.0.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "R6", - "bbotk", - "checkmate", - "data.table", - "lgr", - "mlr3", - "mlr3misc", - "paradox" - ], - "Hash": "1e725cff61c253b0e109c0ca2cf527b4" + "Version": "1.1.0.9000", + "Source": "GitHub", + "RemoteType": "github", + "RemoteHost": "api.github.com", + "RemoteUsername": "mlr-org", + "RemoteRepo": "mlr3tuning", + "RemoteRef": "main", + "RemoteSha": "8c4a4a2f322dd4ec62b0a8ce3c524e14a2aefd8b" }, "mlr3tuningspaces": { "Package": "mlr3tuningspaces",