Skip to content

Commit

Permalink
Merge pull request #840 from mlr-org/po_collapsefcts_prev
Browse files Browse the repository at this point in the history
New param `no_collapse_above_absolute` for PipeOpCollapseFactors
  • Loading branch information
mb706 authored Nov 19, 2024
2 parents 1566ae6 + c87253e commit f04a3c2
Show file tree
Hide file tree
Showing 4 changed files with 175 additions and 14 deletions.
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# mlr3pipelines 0.7.1-9000

* New parameter `no_collapse_above_absolute` in `PipeOpCollapseFactors` / `po("collapse_factors")`.
* Fix: `PipeOpCollapseFactors` now correctly collapses levels of ordered factors.

# mlr3pipelines 0.7.1

* Compatibility fix for upcoming `mlr3`
Expand Down
62 changes: 52 additions & 10 deletions R/PipeOpCollapseFactors.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,13 @@
#' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
#'
#' @description
#' Collapses factors of type `factor`, `ordered`: Collapses the rarest factors in the
#' training samples, until `target_level_count` levels remain. Levels that have prevalence above `no_collapse_above_prevalence`
#' are retained, however. For `factor` variables, these are collapsed to the next larger level, for `ordered` variables,
#' rare variables are collapsed to the neighbouring class, whichever has fewer samples.
#' Collapses factors of type `factor`, `ordered`: Collapses the rarest factors in the training samples, until `target_level_count`
#' levels remain. Levels that have prevalence strictly above `no_collapse_above_prevalence` or absolute count strictly above `no_collapse_above_absolute`
#' are retained, however. For `factor` variables, these are collapsed to the next larger level, for `ordered` variables, rare variables
#' are collapsed to the neighbouring class, whichever has fewer samples.
#' In case both `no_collapse_above_prevalence` and `no_collapse_above_absolute` are given, the less strict threshold of the two will be used, i.e. if
#' `no_collapse_above_prevalence` is 1 and `no_collapse_above_absolute` is 10 for a task with 100 samples, levels that are seen more than 10 times
#' will not be collapsed.
#'
#' Levels not seen during training are not touched during prediction; Therefore it is useful to combine this with the
#' [`PipeOpFixFactors`].
Expand Down Expand Up @@ -39,6 +42,9 @@
#' * `no_collapse_above_prevalence` :: `numeric(1)` \cr
#' Fraction of samples below which factor levels get collapsed. Default is 1, which causes all levels
#' to be collapsed until `target_level_count` remain.
#' * `no_collapse_above_absolute` :: `integer(1)` \cr
#' Number of samples below which factor levels get collapsed. Default is `Inf`, which causes all levels
#' to be collapsed until `target_level_count` remain.
#' * `target_level_count` :: `integer(1)` \cr
#' Number of levels to retain. Default is 2.
#'
Expand All @@ -55,15 +61,41 @@
#' @export
#' @examples
#' library("mlr3")
#' op = PipeOpCollapseFactors$new()
#'
#' # Create example training task
#' df = data.frame(
#' target = runif(100),
#' fct = factor(rep(LETTERS[1:6], times = c(25, 30, 5, 15, 5, 20))),
#' ord = factor(rep(1:6, times = c(20, 25, 30, 5, 5, 15)), ordered = TRUE)
#' )
#' task = TaskRegr$new(df, target = "target", id = "example_train")
#'
#' # Training
#' train_task_collapsed = op$train(list(task))[[1]]
#' train_task_collapsed$levels(c("fct", "ord"))
#'
#' # Create example prediction task
#' df_pred = data.frame(
#' target = runif(7),
#' fct = factor(LETTERS[1:7]),
#' ord = factor(1:7, ordered = TRUE)
#' )
#' pred_task = TaskRegr$new(df_pred, target = "target", id = "example_pred")
#'
#' # Prediction
#' pred_task_collapsed = op$predict(list(pred_task))[[1]]
#' pred_task_collapsed$levels(c("fct", "ord"))
PipeOpCollapseFactors = R6Class("PipeOpCollapseFactors",
inherit = PipeOpTaskPreprocSimple,
public = list(
initialize = function(id = "collapsefactors", param_vals = list()) {
ps = ps(
no_collapse_above_prevalence = p_dbl(0, 1, tags = c("train", "predict")),
no_collapse_above_absolute = p_int(0, special_vals = list(Inf), tags = c("train", "predict")),
target_level_count = p_int(2, tags = c("train", "predict"))
)
ps$values = list(no_collapse_above_prevalence = 1, target_level_count = 2)
ps$values = list(no_collapse_above_prevalence = 1, no_collapse_above_absolute = Inf, target_level_count = 2)
super$initialize(id, param_set = ps, param_vals = param_vals, feature_types = c("factor", "ordered"))
}
),
Expand All @@ -74,6 +106,7 @@ PipeOpCollapseFactors = R6Class("PipeOpCollapseFactors",
dt = task$data(cols = private$.select_cols(task))

keep_fraction = self$param_set$values$no_collapse_above_prevalence
keep_absolute = self$param_set$values$no_collapse_above_absolute
target_count = self$param_set$values$target_level_count

collapse_map = sapply(dt, function(d) {
Expand All @@ -83,22 +116,30 @@ PipeOpCollapseFactors = R6Class("PipeOpCollapseFactors",
if (length(levels(d)) <= target_count) {
return(NULL)
}

dtable = table(d)
fractions = sort(dtable, decreasing = TRUE) / sum(!is.na(d))
keep_fraction = names(fractions)[fractions >= keep_fraction]

absolutes = sort(dtable, decreasing = TRUE)
keep_absolute = names(absolutes)[absolutes > keep_absolute]

fractions = absolutes / sum(!is.na(d))
keep_fraction = names(fractions)[fractions > keep_fraction]

keep_count = names(fractions)[seq_len(target_count)] # at this point we know there are more levels than target_count
keep = union(keep_fraction, keep_count)

keep = union(keep_fraction, union(keep_count, keep_absolute))
dont_keep = setdiff(levels(d), keep)

if (is.ordered(d)) {
cmap = stats::setNames(as.list(levels(d)), levels(d))
for (eliminating in dont_keep) {
position = match(eliminating, names(cmap))
if (position == 1) {
cmap[[2]] = c(cmap[[2]], eliminating)
} else if (position == length(cmap) || dtable[position - 1] < dtable[position + 1]) {
cmap[[position - 1]] = c(cmap[[position - 1]], eliminating)
cmap[[position - 1]] = c(cmap[[position - 1]], cmap[[eliminating]])
} else {
cmap[[position + 1]] = c(cmap[[position + 1]], eliminating)
cmap[[position + 1]] = c(cmap[[position + 1]], cmap[[eliminating]])
}
dtable = dtable[-position]
cmap[[position]] = NULL
Expand All @@ -108,6 +149,7 @@ PipeOpCollapseFactors = R6Class("PipeOpCollapseFactors",
lowest_kept = keep[length(keep)]
cmap[[lowest_kept]] = c(lowest_kept, dont_keep)
}

cmap
}, simplify = FALSE)

Expand Down
39 changes: 35 additions & 4 deletions man/mlr_pipeops_collapsefactors.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

83 changes: 83 additions & 0 deletions tests/testthat/test_pipeop_collapsefactors.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
context("PipeOpCollapseFactors")

test_that("PipeOpCollapseFactors - basic properties", {
task = mlr_tasks$get("penguins")

expect_datapreproc_pipeop_class(PipeOpCollapseFactors, task = task)
})

test_that("PipeOpCollapseFactors - train and predict work", {
op = PipeOpCollapseFactors$new()
df = data.frame(
target = runif(100),
fct = factor(rep(LETTERS[1:6], times = c(25, 30, 5, 15, 5, 20))),
ord = factor(rep(1:6, times = c(20, 25, 30, 5, 5, 15)), ordered = TRUE)
)
task = TaskRegr$new(df, target = "target", id = "test")

# test (default): levels are reduced to target_count, correct levels are chosen for this
train_out = op$train(list(task))[[1]]
expect_equal(train_out$data(cols = c("fct"))[[1]], factor(rep(c("A", "B", "A"), times = c(25, 30, 45))))
expect_equal(train_out$data(cols = c("ord"))[[1]], factor(rep(c("2", "3"), times = c(45, 55)), ordered = TRUE))

predict_out = op$predict(list(task))[[1]]
expect_equal(predict_out$data(cols = c("fct"))[[1]], factor(rep(c("A", "B", "A"), times = c(25, 30, 45))))
expect_equal(predict_out$data(cols = c("ord"))[[1]], factor(rep(c("2", "3"), times = c(45, 55)), ordered = TRUE))

# test: target_count works
op$param_set$values$target_level_count = 4
train_out = op$train(list(task))[[1]]
expect_equal(train_out$data(cols = c("fct"))[[1]], factor(rep(c("A", "B", "D", "F"), times = c(25, 30, 25, 20))))
expect_equal(train_out$data(cols = c("ord"))[[1]], factor(rep(c("1", "2", "3", "6"), times = c(20, 25, 30, 25)), ordered = TRUE))

predict_out = op$predict(list(task))[[1]]
expect_equal(predict_out$data(cols = c("fct"))[[1]], factor(rep(c("A", "B", "D", "F"), times = c(25, 30, 25, 20))))
expect_equal(predict_out$data(cols = c("ord"))[[1]], factor(rep(c("1", "2", "3", "6"), times = c(20, 25, 30, 25)), ordered = TRUE))
op$param_set$values$target_level_count = 2

# test: absolute works
op$param_set$values$no_collapse_above_absolute = 15
train_out = op$train(list(task))[[1]]
expect_equal(train_out$data(cols = c("fct"))[[1]], factor(rep(c("A", "B", "F"), times = c(25, 30, 45))))
expect_equal(train_out$data(cols = c("ord"))[[1]], factor(rep(c("1", "2", "3"), times = c(20, 25, 55)), ordered = TRUE))

predict_out = op$predict(list(task))[[1]]
expect_equal(predict_out$data(cols = c("fct"))[[1]], factor(rep(c("A", "B", "F"), times = c(25, 30, 45))))
expect_equal(predict_out$data(cols = c("ord"))[[1]], factor(rep(c("1", "2", "3"), times = c(20, 25, 55)), ordered = TRUE))
op$param_set$values$no_collapse_above_absolute = Inf

# test: prevalence works
op$param_set$values$no_collapse_above_prevalence = 0.15
train_out = op$train(list(task))[[1]]
expect_equal(train_out$data(cols = c("fct"))[[1]], factor(rep(c("A", "B", "F"), times = c(25, 30, 45))))
expect_equal(train_out$data(cols = c("ord"))[[1]], factor(rep(c("1", "2", "3"), times = c(20, 25, 55)), ordered = TRUE))

predict_out = op$predict(list(task))[[1]]
expect_equal(predict_out$data(cols = c("fct"))[[1]], factor(rep(c("A", "B", "F"), times = c(25, 30, 45))))
expect_equal(predict_out$data(cols = c("ord"))[[1]], factor(rep(c("1", "2", "3"), times = c(20, 25, 55)), ordered = TRUE))

# test: if given both, does as documented (i.e. lower one is used since we are using union)
op$param_set$values$no_collapse_above_absolute = 10
train_out = op$train(list(task))[[1]]
expect_equal(train_out$data(cols = c("fct"))[[1]], factor(rep(c("A", "B", "D", "F"), times = c(25, 30, 25, 20))))
expect_equal(train_out$data(cols = c("ord"))[[1]], factor(rep(c("1", "2", "3", "6"), times = c(20, 25, 30, 25)), ordered = TRUE))

predict_out = op$predict(list(task))[[1]]
expect_equal(predict_out$data(cols = c("fct"))[[1]], factor(rep(c("A", "B", "D", "F"), times = c(25, 30, 25, 20))))
expect_equal(predict_out$data(cols = c("ord"))[[1]], factor(rep(c("1", "2", "3", "6"), times = c(20, 25, 30, 25)), ordered = TRUE))

# test: unseen levels are not touched in predict
op$param_set$values$no_collapse_above_absolute = Inf
op$param_set$values$no_collapse_above_prevalence = 1
df_pred = data.frame(
target = runif(7),
fct = factor(LETTERS[1:7]),
ord = factor(1:7, ordered = TRUE)
)
pred_task = TaskRegr$new(df_pred, target = "target", id = "test_pred")
op$train(list(task))
predict_out = op$predict(list(pred_task))[[1]]

expect_equal(predict_out$data(cols = c("fct"))[[1]], factor(c("A", "B", "A", "A", "A", "A", "G")))
expect_equal(predict_out$data(cols = c("ord"))[[1]], factor(c("2", "2", "3", "3", "3", "3", "7"), ordered = TRUE))
})

0 comments on commit f04a3c2

Please sign in to comment.