Skip to content

Commit

Permalink
Merge pull request #866 from mlr-org/handle_multi_role_cols
Browse files Browse the repository at this point in the history
PipeOpTaskPreproc does not drop additional roles of feature cols during train or predict
  • Loading branch information
advieser authored Jan 21, 2025
2 parents f8e8bc0 + c78a58d commit 2f8c355
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 12 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
* New parameter `new_role_direct` for `PipeOpColRoles` / `po("colroles")` to change column roles by role instead of by column.
* Dictionary sugar functions `po()` / `pos()` / `ppl()` / `ppls()` now make suggestions for entries in both `mlr_pipeops` as well as `mlr_graphs` when an object by the given name could not be found in the respective dictionary.
* New PipeOp `PipeOpDecode` / `po("decode")` to reverse one-hot or treatment encoding.
* Fix: Columns that are `feature` and something else no longer lose the other column role during training or predicting of `PipeOp`s inheriting from `PipeOpTaskPreproc`.
* Fix: Made tests for `PipeOpBLSmote` deterministic.

# mlr3pipelines 0.7.1
Expand Down
13 changes: 4 additions & 9 deletions R/PipeOpTaskPreproc.R
Original file line number Diff line number Diff line change
Expand Up @@ -208,9 +208,8 @@ PipeOpTaskPreproc = R6Class("PipeOpTaskPreproc",
if (do_subset) {
affected_cols = self$param_set$values$affect_columns(intask)
assert_subset(affected_cols, intask$feature_names, empty.ok = TRUE)
# FIXME: this fails when something is both a feature and something else
remove_cols = setdiff(intask$feature_names, affected_cols)
intask$col_roles = map(intask$col_roles, .f = setdiff, y = remove_cols)
intask$col_roles$feature = affected_cols
}
intasklayout = copy(intask$feature_types)

Expand All @@ -228,8 +227,7 @@ PipeOpTaskPreproc = R6Class("PipeOpTaskPreproc",
}

if (do_subset) {
# FIXME: this fails if .train_task added a column with the same name
intask$col_roles$feature = union(intask$col_roles$feature, y = remove_cols)
intask$col_roles$feature = union(intask$feature_names, y = remove_cols)
}

list(intask)
Expand All @@ -238,11 +236,9 @@ PipeOpTaskPreproc = R6Class("PipeOpTaskPreproc",
.predict = function(inputs) {
intask = inputs[[1]]$clone(deep = TRUE)
do_subset = !is.null(self$param_set$values$affect_columns)

if (do_subset) {
# FIXME: see train fixme: this fails when something is both a feature and something else
remove_cols = setdiff(intask$feature_names, self$state$affected_cols)
intask$col_roles = map(intask$col_roles, .f = setdiff, y = remove_cols)
intask$col_roles$feature = setdiff(intask$feature_names, remove_cols)
}
if (!isTRUE(all.equal(self$state$intasklayout, intask$feature_types, ignore.row.order = TRUE))) {
stopf("Input task during prediction of %s does not match input task during training.", self$id)
Expand All @@ -262,8 +258,7 @@ PipeOpTaskPreproc = R6Class("PipeOpTaskPreproc",
stopf("Processed output task during prediction of %s does not match output task during training.", self$id)
}
if (do_subset) {
# FIXME: see train fixme: this fails if .train_task added a column with the same name
intask$col_roles$feature = union(intask$col_roles$feature, y = remove_cols)
intask$col_roles$feature = union(intask$feature_names, y = remove_cols)
}
list(intask)
},
Expand Down
27 changes: 24 additions & 3 deletions tests/testthat/test_pipeop_task_preproc.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ test_that("PipeOpTaskPreprocSimple - basic properties", {
test_that("Wrong affect_columns errors", {
POPP = R6Class("POPP",
inherit = PipeOpTaskPreproc,
public = list(
train_dt = function(dt, levels, target) dt,
predict_dt = function(dt, levels) dt
private = list(
.train_dt = function(dt, levels, target) dt,
.predict_dt = function(dt, levels) dt
)
)
tsk = tsk("boston_housing_classic")
Expand All @@ -35,3 +35,24 @@ test_that("Wrong affect_columns errors", {
po = POPP$new("foo", param_vals = list(affect_columns = function(x) x$target_names))
expect_error(po$train(list(tsk)), "affected_cols")
})

test_that("PipeOpTaskPreproc - fix for #864 works", {
# Fixes #864: A column that is a feature and something else does not loose the other role during training or prediction
POPP = R6Class("POPP",
inherit = PipeOpTaskPreproc,
private = list(
.train_dt = function(dt, levels, target) dt,
.predict_dt = function(dt, levels) dt
)
)
po = POPP$new("test", param_vals = list(affect_columns = selector_name("Petal.Length")))
expect_pipeop(po)
task = mlr_tasks$get("iris")
task$col_roles$order = "Petal.Width"

train_out = po$train(list(task$clone(deep = TRUE)))[[1L]]
expect_equal(train_out$col_roles, task$col_roles)

predict_out = po$predict(list(task$clone(deep = TRUE)))[[1L]]
expect_equal(predict_out$col_roles, task$col_roles)
})

0 comments on commit 2f8c355

Please sign in to comment.