Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New PipeOpEncodePL for Piecewise Linear Encoding #861

Draft
wants to merge 11 commits into
base: master
Choose a base branch
from
2 changes: 1 addition & 1 deletion R/PipeOpEncode.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
#' @section Input and Output Channels:
#' Input and output channels are inherited from [`PipeOpTaskPreproc`].
#'
#' The output is the input [`Task`][mlr3::Task] with all affected `factor` and `ordered` parameters encoded according to the `method`
#' The output is the input [`Task`][mlr3::Task] with all affected `factor` and `ordered` columns encoded according to the `method`
#' parameter.
#'
#' @section State:
Expand Down
151 changes: 151 additions & 0 deletions R/PipeOpEncodePL.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
#' @title Piecewise Linear Encoding
#'
#' @usage NULL
#' @name mlr_pipeops_encodepl
#' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
#'
#' @description
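#' Encodes columns of type `numeric` and `integer` using piecewise linear encoding.
#'
#' Every affected column is replaced by one new column per bin, named `<column>.bin<k>`. For a given value, the column of a bin is
#' `0` if the value lies below the lower boundary of the bin, `1` if it lies at or above the upper boundary of the bin, and
#' `(value - lower) / (upper - lower)` otherwise. The bin boundaries are determined during training, either from quantiles of the
#' column or from the split points of a decision tree trained on the column, depending on the `method` parameter.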
#'
#' Use the [`PipeOpTaskPreproc`] `$affect_columns` functionality to only encode a subset of columns, or only encode columns of a certain type.
#'
#' @section Construction:
#' ```
#' PipeOpEncodePL$new(task_type, id = "encodepl", param_vals = list())
#' ```
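#' * `task_type` :: `character(1)`\cr
#' Class name of the [`Task`][mlr3::Task] type this `PipeOp` operates on. Only `"TaskRegr"` and `"TaskClassif"` are supported;
#' the value determines which decision tree learner is used for `method = "tree"`.
#' * `id` :: `character(1)`\cr
#' Identifier of resulting object, default `"encodepl"`.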
#' * `param_vals` :: named `list`\cr
#' List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default `list()`.
#'
#' @section Input and Output Channels:
#' Input and output channels are inherited from [`PipeOpTaskPreproc`].
#'
#' The output is the input [`Task`][mlr3::Task] with all affected `numeric` and `integer` columns replaced by their piecewise linear
#' encoding, i.e. by one `numeric` column per bin.
#'
#' @section State:
#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as:
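#' * `bins` :: named `list`\cr
#' Bin boundaries found during training: one `numeric` vector of boundaries per affected column, named by that column.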
#'
#' @section Parameters:
#' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as:
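#' * `method` :: `character(1)` \cr
#' Method used to determine the bin boundaries. Initialized to `"quantiles"`. One of:
#'   * `"quantiles"`: the boundaries are the minimum, the inner quantiles given by `quantiles_numsplits`, and the maximum of the column.
#'   * `"tree"`: the boundaries are the minimum, the split points of a decision tree trained on the column, and the maximum of the column.
#' * `quantiles_numsplits` :: `integer(1)` \cr
#' Number of quantile intervals the column is divided into; must be at least `2`. Only used if `method` is `"quantiles"`. Default is `2`.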
#'
#' @section Methods:
#' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
#'
#' @references
#' `r format_bib("gorishniy_2022")`
#'
#' @family PipeOps
#' @template seealso_pipeopslist
#' @include PipeOpTaskPreproc.R
#' @export
#' @examples
#' library("mlr3")
#'
PipeOpEncodePL = R6Class("PipeOpEncodePL",
  inherit = PipeOpTaskPreprocSimple,
  public = list(
    # Constructor.
    # * task_type: class name of the supported task type; selects the rpart learner that is
    #   used to find bin boundaries when `method = "tree"`.
    # * id: identifier of the PipeOp.
    # * param_vals: hyperparameter values that overwrite the values set during construction.
    # Raises an error if `task_type` is neither "TaskRegr" nor "TaskClassif".
    initialize = function(task_type, id = "encodepl", param_vals = list()) {
      # NOTE: Might use different name, change assert, and conditions
      assert_choice(task_type, mlr_reflections$task_types$task)
      if (task_type == "TaskRegr") {
        private$.tree_learner = LearnerRegrRpart$new()
      } else if (task_type == "TaskClassif") {
        private$.tree_learner = LearnerClassifRpart$new()
      } else {
        stopf("Task type %s not supported", task_type)
      }

      # Hyperparameters owned by this PipeOp itself; the tree learner contributes its own set.
      private$.encodepl_param_set = ps(
        method = p_fct(levels = c("quantiles", "tree"), tags = c("train", "predict", "required")),
        quantiles_numsplits = p_int(lower = 2, default = 2, tags = c("train", "predict"), depends = quote(method == "quantiles"))
      )
      private$.encodepl_param_set$values = list(method = "quantiles")

      # NOTE(review): the name `encodepl` in this alist may end up as an id prefix of the
      # combined parameter set -- confirm that this matches the documented parameter names.
      super$initialize(id, param_set = alist(encodepl = private$.encodepl_param_set, private$.tree_learner$param_set),
        param_vals = param_vals, packages = c("stats", private$.tree_learner$packages),
        task_type = task_type, tags = "encode", feature_types = c("numeric", "integer"))
    }
  ),
  private = list(

    # rpart learner matching `task_type`; only trained when `method = "tree"`.
    .tree_learner = NULL,
    # ParamSet holding `method` and `quantiles_numsplits`.
    .encodepl_param_set = NULL,

    # Computes the bin boundaries of every selected column.
    # Returns a list with the single element `bins`: a named list with one numeric vector of
    # boundaries per column. The list is empty if no column is selected.
    .get_state = function(task) {
      cols = private$.select_cols(task)
      if (!length(cols)) {
        # The state has to be a list (not the task); an empty `bins` makes .transform() a no-op.
        return(list(bins = list()))
      }

      pv = private$.encodepl_param_set$values
      numsplits = pv$quantiles_numsplits %??% 2

      if (pv$method == "quantiles") {
        # TODO: check that min / max is correct here (according to paper / implementation)
        probs = seq_len(numsplits - 1) / numsplits
        bins = lapply(task$data(cols = cols), function(d) {
          # na.rm = TRUE for min / max as well, so that missing values do not turn the outer
          # boundaries into NA while the quantiles ignore them.
          unique(c(min(d, na.rm = TRUE), stats::quantile(d, probs, na.rm = TRUE), max(d, na.rm = TRUE)))
        })
      } else {
        learner = private$.tree_learner

        bins = list()
        for (col in cols) {
          # Train the tree on a copy of the task that contains only this feature.
          t = task$clone(deep = TRUE)$select(col)
          splits = learner$train(t)$model$splits
          # The "index" column of the splits matrix holds the split points.
          # `splits` is NULL if the tree was not split at all; then there are no inner boundaries.
          boundaries = if (is.null(splits)) numeric(0) else unname(sort(splits[, "index"]))

          d = task$data(cols = col)[[1]]
          bins[[col]] = c(min(d, na.rm = TRUE), boundaries, max(d, na.rm = TRUE))
        }
      }

      list(bins = bins)
    },

    # Replaces every column that has bin boundaries in the state by its piecewise linear encoding.
    # Returns the modified task; the task is returned unchanged if the state holds no bins.
    .transform = function(task) {
      bins = self$state$bins
      cols = names(bins)
      if (!length(cols)) {
        return(task)  # early exit
      }

      dt = task$data(cols = cols)
      encoded = imap(dt, function(d, col) encode_piecewise_linear(d, col, bins[[col]]))
      # unname() so that the column names come solely from encode_piecewise_linear()
      # (`<col>.bin<k>`) and are not prefixed a second time with the names of the list.
      res = as.data.table(unname(encoded))

      task$select(setdiff(task$feature_names, cols))$cbind(res)
    }
  )
)

# Register the class in the `mlr_pipeops` dictionary under the key "encodepl".
# NOTE(review): the list presumably supplies the constructor argument `task_type`, which has
# no default, for cases where the dictionary has to instantiate the class itself -- confirm.
mlr_pipeops$add("encodepl", PipeOpEncodePL, list(task_type = "TaskRegr"))

# Helper function to implement piecewise linear encoding.
# * column: numeric vector
# * colname: name of `column`
# * bins as numeric vector of boundaries; at least two boundaries are required
# Returns a data.table with `length(bins) - 1` columns named `<colname>.bin<k>` and one row per
# element of `column`. The entry for bin k is
# * 0 if the value is below the lower boundary of the bin,
# * 1 if the value is at or above the upper boundary of the bin,
# * (value - lower) / (upper - lower) otherwise.
# Missing values in `column` are missing in every bin column.
encode_piecewise_linear = function(column, colname, bins) {
  n_bins = length(bins) - 1
  if (n_bins < 1) {
    stopf("Piecewise linear encoding of column '%s' needs at least two bin boundaries, but got %i.", colname, length(bins))
  }

  encoded = lapply(seq_len(n_bins), function(t) {
    lower = bins[[t]]
    upper = bins[[t + 1]]

    # Relative position inside the bin; values outside of the bin are overwritten below.
    value = (column - lower) / (upper - lower)
    # which() drops NA comparisons, so missing values stay NA instead of breaking the assignment.
    value[which(column < lower)] = 0
    value[which(column >= upper)] = 1
    value
  })
  names(encoded) = paste0(colname, ".bin", seq_len(n_bins))

  as.data.table(encoded)
}
2 changes: 1 addition & 1 deletion R/PipeOpQuantileBin.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ PipeOpQuantileBin = R6Class("PipeOpQuantileBin",
initialize = function(id = "quantilebin", param_vals = list()) {
ps = ps(
numsplits = p_int(lower = 2, special_vals = list(NULL), tags = "train")
)
)
ps$values = list(numsplits = 2L)
super$initialize(id, param_set = ps, param_vals = param_vals, packages = "stats", feature_types = c("numeric", "integer"))
}
Expand Down
29 changes: 21 additions & 8 deletions R/bibentries.R
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,8 @@ bibentries = c(

han_2005 = bibentry("InProceedings",
doi = "10.1007/11538059_91",
author = "Han, Hui and Wang, Wen-Yuan and Mao, Bing-Huan",
editor = "Huang, De-Shuang and Zhang, Xiao-Ping and Huang, Guang-Bin",
author = "Hui Han and Wen-Yuan Wang and Bing-Huan Mao",
editor = "De-Shuang Huang and Xiao-Ping Zhang and Guang-Bin Huang",
title = "Borderline-SMOTE: A New Over-Sampling Method in Imbalanced Data Sets Learning",
booktitle = "Advances in Intelligent Computing",
year = "2005",
Expand All @@ -107,11 +107,24 @@ bibentries = c(
),

freeman_1979 = bibentry("InCollection",
author = "Freeman III, A Myrick",
title = "The Hedonic Price Approach to Measuring Demand for Neighborhood Characteristics",
booktitle = "The Economics of Neighborhood",
year = "1979",
publisher = "Elsevier",
pages = "191--217"
doi = "10.1016/B978-0-12-636250-3.50015-5",
author = "A Myrick Freeman III",
title = "The Hedonic Price Approach to Measuring Demand for Neighborhood Characteristics",
booktitle = "The Economics of Neighborhood",
year = "1979",
publisher = "Elsevier",
pages = "191--217"
),


gorishniy_2022 = bibentry("InProceedings",
title = "On Embeddings for Numerical Features in Tabular Deep Learning",
volume = "35",
url = "https://proceedings.neurips.cc/paper_files/paper/2022/hash/9e9f0ffc3d836836ca96cbf8fe14b105-Abstract-Conference.html",
booktitle = "Advances in Neural Information Processing Systems",
author = "Yury Gorishniy and Ivan Rubachev and Artem Babenko",
year = "2022",
pages = "24991--25004"
)

)
16 changes: 16 additions & 0 deletions tests/testthat/test_pipeop_encodepl.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
context("PipeOpEncodePL")

test_that("PipeOpEncodePL - basic properties", {
  # The constructor has no default for `task_type`, so it always has to be passed via `constargs`.
  task = mlr_tasks$get("mtcars")
  expect_datapreproc_pipeop_class(PipeOpEncodePL, constargs = list(task_type = "TaskRegr"), task = task)

  task = mlr_tasks$get("iris")
  expect_datapreproc_pipeop_class(PipeOpEncodePL, constargs = list(task_type = "TaskClassif"), task = task)
})

# Tests:
# - different methods
# - with params (not all for regtree, hopefully)
# - test on tasks with simple data that behaviour is as expected (compare dts)
# - TODO: decide how to handle NAs in feature columns and test that
Loading