Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PipeOpTaskPreproc should work with features that also have other roles #866

Merged
merged 8 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
* New parameter `new_role_direct` for `PipeOpColRoles` / `po("colroles")` to change column roles by role instead of by column.
* Dictionary sugar functions `po()` / `pos()` / `ppl()` / `ppls()` now make suggestions for entries in both `mlr_pipeops` as well as `mlr_graphs` when an object by the given name could not be found in the respective dictionary.
* New PipeOp `PipeOpDecode` / `po("decode")` to reverse one-hot or treatment encoding.
* Fix: Columns that are `feature` and something else no longer lose the other column role during training or prediction.

# mlr3pipelines 0.7.1

Expand Down
13 changes: 4 additions & 9 deletions R/PipeOpTaskPreproc.R
Original file line number Diff line number Diff line change
Expand Up @@ -208,9 +208,8 @@ PipeOpTaskPreproc = R6Class("PipeOpTaskPreproc",
if (do_subset) {
affected_cols = self$param_set$values$affect_columns(intask)
assert_subset(affected_cols, intask$feature_names, empty.ok = TRUE)
# FIXME: this fails when something is both a feature and something else
remove_cols = setdiff(intask$feature_names, affected_cols)
intask$col_roles = map(intask$col_roles, .f = setdiff, y = remove_cols)
intask$col_roles$feature = affected_cols
}
intasklayout = copy(intask$feature_types)

Expand All @@ -228,8 +227,7 @@ PipeOpTaskPreproc = R6Class("PipeOpTaskPreproc",
}

if (do_subset) {
# FIXME: this fails if .train_task added a column with the same name
intask$col_roles$feature = union(intask$col_roles$feature, y = remove_cols)
intask$col_roles$feature = union(intask$feature_names, y = remove_cols)
}

list(intask)
Expand All @@ -238,11 +236,9 @@ PipeOpTaskPreproc = R6Class("PipeOpTaskPreproc",
.predict = function(inputs) {
intask = inputs[[1]]$clone(deep = TRUE)
do_subset = !is.null(self$param_set$values$affect_columns)

if (do_subset) {
# FIXME: see train fixme: this fails when something is both a feature and something else
remove_cols = setdiff(intask$feature_names, self$state$affected_cols)
intask$col_roles = map(intask$col_roles, .f = setdiff, y = remove_cols)
intask$col_roles$feature = setdiff(intask$feature_names, remove_cols)
}
if (!isTRUE(all.equal(self$state$intasklayout, intask$feature_types, ignore.row.order = TRUE))) {
stopf("Input task during prediction of %s does not match input task during training.", self$id)
Expand All @@ -262,8 +258,7 @@ PipeOpTaskPreproc = R6Class("PipeOpTaskPreproc",
stopf("Processed output task during prediction of %s does not match output task during training.", self$id)
}
if (do_subset) {
# FIXME: see train fixme: this fails if .train_task added a column with the same name
intask$col_roles$feature = union(intask$col_roles$feature, y = remove_cols)
intask$col_roles$feature = union(intask$feature_names, y = remove_cols)
}
list(intask)
},
Expand Down
27 changes: 24 additions & 3 deletions tests/testthat/test_pipeop_task_preproc.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ test_that("PipeOpTaskPreprocSimple - basic properties", {
test_that("Wrong affect_columns errors", {
POPP = R6Class("POPP",
inherit = PipeOpTaskPreproc,
public = list(
train_dt = function(dt, levels, target) dt,
predict_dt = function(dt, levels) dt
private = list(
.train_dt = function(dt, levels, target) dt,
.predict_dt = function(dt, levels) dt
)
)
tsk = tsk("boston_housing_classic")
Expand All @@ -35,3 +35,24 @@ test_that("Wrong affect_columns errors", {
po = POPP$new("foo", param_vals = list(affect_columns = function(x) x$target_names))
expect_error(po$train(list(tsk)), "affected_cols")
})

test_that("PipeOpTaskPreproc - fix for #864 works", {
# Fixes #864: A column that is a feature and something else does not loose the other role during training or prediction
POPP = R6Class("POPP",
inherit = PipeOpTaskPreproc,
private = list(
.train_dt = function(dt, levels, target) dt,
.predict_dt = function(dt, levels) dt
)
)
po = POPP$new("test", param_vals = list(affect_columns = selector_name("Petal.Length")))
expect_pipeop(po)
task = mlr_tasks$get("iris")
task$col_roles$order = "Petal.Width"

train_out = po$train(list(task))[[1L]]
mb706 marked this conversation as resolved.
Show resolved Hide resolved
expect_equal(train_out$col_roles, task$col_roles)

predict_out = po$predict(list(task))[[1L]]
mb706 marked this conversation as resolved.
Show resolved Hide resolved
expect_equal(predict_out$col_roles, task$col_roles)
})