diff --git a/NEWS.md b/NEWS.md
index dba77a500..867ed08a4 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,8 +1,9 @@
 # mlr3pipelines 0.5.0-9000
 
-* Feature: The `$add_pipeop()` method got an argument `clone` (old behaviour `TRUE` by default)
-* Bugfix: `PipeOpFeatureUnion` in some rare cases dropped variables called `"x"`
-* Compatibility with upcoming paradox release
+* `pipeline_bagging()` gets the `replace` argument (old behaviour `FALSE` by default).
+* Feature: The `$add_pipeop()` method got an argument `clone` (old behaviour `TRUE` by default).
+* Bugfix: `PipeOpFeatureUnion` in some rare cases dropped variables called `"x"`.
+* Compatibility with upcoming paradox release.
 
 # mlr3pipelines 0.5.0-2
 
diff --git a/R/pipeline_bagging.R b/R/pipeline_bagging.R
index afcf0c7f9..31b743d32 100644
--- a/R/pipeline_bagging.R
+++ b/R/pipeline_bagging.R
@@ -28,6 +28,9 @@
 #' predictions respectively.
 #' If `NULL` (default), no averager is added to the end of the graph.
 #' Note that setting `collect_multipliciy = TRUE` during construction of the averager is required.
+#' @param replace `logical(1)` \cr
+#' Whether to sample with replacement.
+#' Default `FALSE`.
 #' @return [`Graph`]
 #' @export
 #' @examples
@@ -36,9 +39,14 @@
 #' lrn_po = po("learner", lrn("regr.rpart"))
 #' task = mlr_tasks$get("boston_housing")
 #' gr = pipeline_bagging(lrn_po, 3, averager = po("regravg", collect_multiplicity = TRUE))
-#' resample(task, GraphLearner$new(gr), rsmp("holdout"))
+#' resample(task, GraphLearner$new(gr), rsmp("holdout"))$aggregate()
+#'
+#' # The original bagging method uses bootstrapping, i.e. sampling with replacement.
+#' gr = ppl("bagging", lrn_po, frac = 1, replace = TRUE,
+#'   averager = po("regravg", collect_multiplicity = TRUE))
+#' resample(task, GraphLearner$new(gr), rsmp("holdout"))$aggregate()
 #' }
-pipeline_bagging = function(graph, iterations = 10, frac = 0.7, averager = NULL) {
+pipeline_bagging = function(graph, iterations = 10, frac = 0.7, averager = NULL, replace = FALSE) {
   g = as_graph(graph)
   assert_count(iterations)
   assert_number(frac, lower = 0, upper = 1)
@@ -50,7 +58,7 @@ pipeline_bagging = function(graph, iterations = 10, frac = 0.7, averager = NULL) {
   }
 
   po("replicate", param_vals = list(reps = iterations)) %>>!%
-    po("subsample", param_vals = list(frac = frac)) %>>!%
+    po("subsample", param_vals = list(frac = frac, replace = replace)) %>>!%
     g %>>!%
     averager
 }
diff --git a/man/mlr_graphs_bagging.Rd b/man/mlr_graphs_bagging.Rd
index 42828ef94..58d5e1e83 100644
--- a/man/mlr_graphs_bagging.Rd
+++ b/man/mlr_graphs_bagging.Rd
@@ -5,7 +5,13 @@
 \alias{pipeline_bagging}
 \title{Create a bagging learner}
 \usage{
-pipeline_bagging(graph, iterations = 10, frac = 0.7, averager = NULL)
+pipeline_bagging(
+  graph,
+  iterations = 10,
+  frac = 0.7,
+  averager = NULL,
+  replace = FALSE
+)
 }
 \arguments{
 \item{graph}{\code{\link{PipeOp}} | \code{\link{Graph}} \cr
@@ -27,6 +33,10 @@ in order to perform simple averaging of classification and regression
 predictions respectively.
 If \code{NULL} (default), no averager is added to the end of the graph.
 Note that setting \code{collect_multipliciy = TRUE} during construction of the averager is required.}
+
+\item{replace}{\code{logical(1)} \cr
+Whether to sample with replacement.
+Default \code{FALSE}.}
 }
 \value{
 \code{\link{Graph}}
@@ -49,6 +59,11 @@ library(mlr3)
 lrn_po = po("learner", lrn("regr.rpart"))
 task = mlr_tasks$get("boston_housing")
 gr = pipeline_bagging(lrn_po, 3, averager = po("regravg", collect_multiplicity = TRUE))
-resample(task, GraphLearner$new(gr), rsmp("holdout"))
+resample(task, GraphLearner$new(gr), rsmp("holdout"))$aggregate()
+
+# The original bagging method uses bootstrapping, i.e. sampling with replacement.
+gr = ppl("bagging", lrn_po, frac = 1, replace = TRUE,
+  averager = po("regravg", collect_multiplicity = TRUE))
+resample(task, GraphLearner$new(gr), rsmp("holdout"))$aggregate()
 }
 }
diff --git a/tests/testthat/test_mlr_graphs_bagging.R b/tests/testthat/test_mlr_graphs_bagging.R
index a5dc1067c..15a70fb0c 100644
--- a/tests/testthat/test_mlr_graphs_bagging.R
+++ b/tests/testthat/test_mlr_graphs_bagging.R
@@ -39,3 +39,36 @@ test_that("Bagging Pipeline", {
   expect_true(all(map_lgl(predict_out, function(x) "PredictionClassif" %in% class(x))))
 })
 
+test_that("Bagging with replacement", {
+  tsk = tsk("iris")
+  lrn = lrn("classif.rpart")
+  p = ppl("bagging", graph = po(lrn), replace = TRUE, averager = po("classifavg", collect_multiplicity = TRUE))
+  expect_graph(p)
+  res = resample(tsk, GraphLearner$new(p), rsmp("holdout"))
+  expect_resample_result(res)
+
+  tsk$filter(1:140)
+  expect_equal(anyDuplicated(tsk$data()), 0)  # make sure there are no duplicate rows to begin with
+
+  p = ppl("bagging", iterations = 2, frac = 1,
+    graph = lrn("classif.debug", save_tasks = TRUE),
+    replace = TRUE, averager = po("classifavg", collect_multiplicity = TRUE)
+  )
+  p$train(tsk)
+
+  expect_true(anyDuplicated(p$pipeops$classif.debug$state[[1]]$model$task_train$data()) != 0)
+
+  getOrigId = function(data) {
+    tsk$data()[, origline := .I][data, on = colnames(tsk$data()), origline]
+  }
+  orig_id_1 = getOrigId(p$pipeops$classif.debug$state[[1]]$model$task_train$data())
+  orig_id_2 = getOrigId(p$pipeops$classif.debug$state[[2]]$model$task_train$data())
+
+  expect_equal(length(orig_id_1), 140)
+  expect_equal(length(orig_id_2), 140)
+  # if both iterations had sampled exactly the same rows, all.equal() would return TRUE instead of a string
+  expect_string(all.equal(orig_id_1, orig_id_2))
+
+  expect_true(length(unique(orig_id_1)) < 140)
+  expect_true(length(unique(orig_id_2)) < 140)
+})
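Usage note (not part of the patch): a minimal sketch of classic bootstrap bagging with the new `replace` argument, assuming mlr3, mlr3pipelines and rpart are installed; it mirrors the classifavg setup used in the new test above, and the choice of task, resampling and number of iterations is illustrative only.

library(mlr3)
library(mlr3pipelines)

# Each of the 10 replicates draws nrow(task) rows with replacement
# (frac = 1, replace = TRUE) before fitting its own rpart model.
gr = ppl("bagging",
  graph = po("learner", lrn("classif.rpart")),
  iterations = 10, frac = 1, replace = TRUE,
  averager = po("classifavg", collect_multiplicity = TRUE)
)

# Wrap the graph as a learner and estimate performance via cross-validation.
resample(tsk("iris"), GraphLearner$new(gr), rsmp("cv", folds = 3))$aggregate()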