From 0517572512df33e23d3dc4bfb085f1e6e01757d7 Mon Sep 17 00:00:00 2001
From: MLopez-Ibanez <2620021+MLopez-Ibanez@users.noreply.github.com>
Date: Fri, 1 Mar 2024 00:28:00 +0000
Subject: [PATCH] Implement Sobol initialization.

---
 DESCRIPTION                                |   1 +
 NAMESPACE                                  |   1 +
 NEWS.md                                    |   4 +-
 R/configurations.R                         |   1 -
 R/generation.R                             | 142 +++++++++++++++++++--
 R/irace-package.R                          |   1 +
 R/irace.R                                  |   4 +-
 R/utils.R                                  |   8 ++
 tests/testthat/test-targetRunnerParallel.R |   2 +-
 9 files changed, 145 insertions(+), 19 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 23d43cdc..819ed7c9 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -27,6 +27,7 @@ Imports:
         data.table (>= 1.15.0),
         matrixStats,
         R6,
+        spacefillr,
         withr
 Suggests:
     Rmpi (>= 0.6.0),
diff --git a/NAMESPACE b/NAMESPACE
index 7b1174cf..a29415fb 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -64,3 +64,4 @@ importFrom(graphics,par)
 importFrom(graphics,plot)
 importFrom(graphics,points)
 importFrom(graphics,strwidth)
+importFrom(spacefillr,generate_sobol_set)
diff --git a/NEWS.md b/NEWS.md
index 816099aa..0644f4d1 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -62,9 +62,11 @@
 ## New features and improvements
 
  * `sampleUniform()` and `sampleModel()` are significantly faster thanks to using [`data.table`](https://r-datatable.com).
+
+ * Initial configurations are sampled using Sobol low-discrepancy sequences using `spacefillr::generate_sobol_set()`. This should provide a better initial distribution of parameter values.
  
  * Ablation will report configurations that produced the same results, which
-   points to parameter values that have the same effect on the target algorithm,
+   indicates parameter values that have the same effect on the target algorithm,
    possibly indicating a bug in the target algorithm.
 
  * New option `instancesFile` of `ablation()` for using either the training
diff --git a/R/configurations.R b/R/configurations.R
index 4ebae742..d5448531 100644
--- a/R/configurations.R
+++ b/R/configurations.R
@@ -2,7 +2,6 @@
 configurations_alloc <- function(colnames, nrow, parameters)
 {
   parameter_type <- function(type) {
-    stopifnot(type %in% c("i","r","o","c"))
     switch(type,
            i = NA_integer_,
            r = NA_real_,
diff --git a/R/generation.R b/R/generation.R
index 7536053a..9fbf564e 100644
--- a/R/generation.R
+++ b/R/generation.R
@@ -24,6 +24,17 @@ get_fixed_value <- function(parameters, param)
   parameters$domain[[param]][[1L]]
 }
 
+repairConfigurations <- function(x, parameters, repair)
+{
+  if (!is.null(repair)) {
+    # FIXME: Pass the whole newConfigurations to repair and let it handle each row.
+    j <- colnames(x)
+    for (i in seq_nrow(x))
+      set(x, i, j = j, value = repair(as.data.frame(x[i]), parameters))
+  }
+  x
+}
+
 ## Calculates the parameter bounds when parameters domain is dependent
 getDependentBound <- function(parameters, param, configuration)
 {
@@ -48,7 +59,6 @@ getDependentBound <- function(parameters, param, configuration)
                    "). This is NOT a bug in irace. Check the definition of these parameters.")
     }
   }
-
   values
 }
 
@@ -81,6 +91,119 @@ get_dependent_domain <- function(parameters, param, configuration)
   domain
 }
 
+param_qunif_c <- function(x, domain, transf, digits = NULL)
+{
+  n <- length(domain)
+  z <- integer_round(as.numeric(x) * n + 1L, 1L, n)
+  domain[z]
+}
+
+param_qunif_i <- function(x, domain, transf, digits = NULL)
+{
+  upper <- domain[2L]
+  lower <- domain[1L]
+  x <- as.numeric(x)
+  if (transf == "log") {
+    # +1 for correct rounding before floor()
+    x <- transform_from_log(x, transf, lower, upper + 1L)
+  } else {
+    x <- x * (upper + 1L - lower) + lower
+  }
+  integer_round(x, lower, upper)
+}
+
+param_qunif_r <- function(x, domain, transf, digits)
+{
+  upper <- domain[2L]
+  lower <- domain[1L]
+  x <- as.numeric(x)
+  if (transf == "log") {
+    return(round(transform_from_log(x, transf, lower, upper), digits))
+  }
+  x <- x * (upper - lower) + lower
+  clamp(round(x, digits), lower, upper)
+}
+
+.param_qunif <- c(i = param_qunif_i, r = param_qunif_r, c = param_qunif_c,
+  o = param_qunif_c)
+
+generate_sobol <- function(parameters, n, repair = NULL)
+{
+  # Do not use .Machine$integer.max to minimize differences between machines.
+  seed <- sample.int(2147483647L, size = 1L)
+  confs <- spacefillr::generate_sobol_set(n, dim = parameters$nbVariable, seed = seed)
+  confs <- data.table(confs)
+  setnames(confs, parameters$names[!parameters$isFixed])
+  nodep_names <- parameters$names[!parameters$isDependent & !parameters$isFixed]
+  # FIXME: How to do this faster using data.table?
+  for (x in nodep_names) {
+    this_qunif <- .param_qunif[[parameters$types[x]]]
+    set(confs, j = x, value = this_qunif(confs[[x]], parameters$domain[[x]], parameters$transform[[x]],
+      digits = parameters$digits[x]))
+  }
+  for (x in parameters$names[parameters$isFixed]) {
+    set(confs, j = x, value = parameters$domain[[x]][[1L]])
+  }
+  setcolorder(confs, parameters$names)
+  
+  hierarchy <- parameters$hierarchy
+  max_level <- max(hierarchy)
+  if (max_level > 1L) {
+    .NEWVALUE <- .DOMAIN <- NULL # To silence CRAN warnings.
+    for (k in seq_len(max_level - 1L)) {
+      prev_names <- names(hierarchy)[hierarchy <= k]
+      dep_names <- names(hierarchy)[hierarchy == k+1L]
+      for (p in dep_names) {
+        idx_satisfied <- which_satisfied(confs, parameters$conditions[[p]])
+        if (parameters$isDependent[[p]] && length(idx_satisfied) > 0L) {
+          this_qunif <- .param_qunif[[parameters$types[p]]]
+          confs[idx_satisfied, let(.DOMAIN = list(get_dependent_domain(parameters, p, .SD))), by=.I, .SDcols=prev_names]
+          confs[idx_satisfied, .NEWVALUE := this_qunif(.SD, unlist(.DOMAIN), parameters$transform[[p]], digits = parameters$digits[p]), by=.I, .SDcols=p]
+          confs[, (p):=.NEWVALUE]
+          confs[, let(.NEWVALUE=NULL, .DOMAIN=NULL)]
+        } else if (length(idx_satisfied) < n) {
+          idx_not_satisfied <- if (length(idx_satisfied))
+                                 seq_len(n)[-idx_satisfied] else NULL
+          na_value <- switch(parameters$types[p],
+            i = NA_integer_,
+            r = NA_real_,
+            c = NA_character_,
+            o = NA_character_,
+            irace.internal.error("Unknown type '", parameters$types[p], "'"))
+          set(confs, i = idx_not_satisfied, j = p, value = na_value)
+        }
+      }
+    }
+  }
+  repairConfigurations(confs, parameters, repair)
+  set(confs, j = ".PARENT.", value = NA_integer_)
+  confs
+}
+
+sampleSobol <- function(parameters, n, repair = NULL)
+{
+  newConfigurations <- generate_sobol(parameters, n, repair)
+  newConfigurations <- unique(newConfigurations)
+  forbidden <- parameters$forbidden  
+  newConfigurations <- filter_forbidden(newConfigurations, forbidden)
+  have <- nrow(newConfigurations)
+  if (have == n) {
+    setDF(newConfigurations)
+    return(newConfigurations)
+  }
+  needed <- max(ceiling(n + (n - have) * (2 - have / n)), min(parameters$nbVariable * 5L, 100L))
+  newConfigurations <- generate_sobol(parameters, needed, repair)
+  newConfigurations <- unique(newConfigurations)
+  newConfigurations <- filter_forbidden(newConfigurations, forbidden)
+  if (nrow(newConfigurations) == 0L) {
+    irace.error("irace tried to sample a configuration not forbidden without success, perhaps your constraints are too strict?")
+  }
+  newConfigurations <- truncate_rows(newConfigurations, n)
+  setDF(newConfigurations)
+  newConfigurations
+}
+
+
 generate_uniform <- function(parameters, nbConfigurations, repair = NULL)
 {
   newConfigurations <- configurations_alloc(parameters$names, nrow = nbConfigurations, parameters = parameters)
@@ -128,16 +251,12 @@ generate_uniform <- function(parameters, nbConfigurations, repair = NULL)
     }
     set(newConfigurations, i = idx, j = currentParameter, value = newVals)
   }
-  if (!is.null(repair)) {
-    # FIXME: Pass the whole newConfigurations to repair and let it handle each row.
-    j <- colnames(newConfigurations)
-    for (i in seq_len(nbConfigurations))
-      set(newConfigurations, i, j = j, value = repair(as.data.frame(newConfigurations[i]), parameters))
-  }
+  repairConfigurations(newConfigurations, parameters, repair)
   set(newConfigurations, j = ".PARENT.", value = NA_integer_)
   newConfigurations
 }
 
+
 ### Uniform sampling for the initial generation
 sampleUniform <- function(parameters, nbConfigurations, repair = NULL)
 {
@@ -253,12 +372,7 @@ sample_from_model <- function(parameters, eliteConfigurations, model,
     # .BY is a list, so take the first argument.
     newConfigurations[idx_satisfied, c(currentParameter) := list(sample_values(.N, .BY[[1L]])), by = idx_elites[idx_satisfied]]
   }
-  if (!is.null(repair)) {
-    # FIXME: Pass the whole newConfigurations to repair and let it handle each row.
-    j <- colnames(newConfigurations)
-    for (i in seq_len(nbNewConfigurations))
-      set(newConfigurations, i, j = j, value = repair(as.data.frame(newConfigurations[i]), parameters))
-  }
+  repairConfigurations(newConfigurations, parameters, repair)
   set(newConfigurations, j = ".PARENT.", value = ids_elites[idx_elites])
   newConfigurations
 }
@@ -357,7 +471,7 @@ integer_round <- function(x, lower, upper)
   # The probability of this happening is very small, but it happens.
   x <- pmin.int(upper, x)
   irace.assert(all(x >= lower))
-  x
+  as.integer(x)
 }
 
 sample_numeric_unif <- function(n, lower, upper, transf)
diff --git a/R/irace-package.R b/R/irace-package.R
index 96ef1a0d..d694bcc3 100644
--- a/R/irace-package.R
+++ b/R/irace-package.R
@@ -7,6 +7,7 @@
 #' @importFrom R6 R6Class
 #' @importFrom grDevices dev.off pdf
 #' @importFrom graphics abline axis boxplot par plot points strwidth bxp grid
+#' @importFrom spacefillr generate_sobol_set
 #' 
 #' @details  License: GPL (>= 2)
 #' 
diff --git a/R/irace.R b/R/irace.R
index f6473104..27708b94 100644
--- a/R/irace.R
+++ b/R/irace.R
@@ -825,7 +825,7 @@ irace_run <- function(scenario, parameters)
       repeat {
         # Sample new configurations if needed
         if (nrow(allConfigurations) < nconfigurations) {
-          newConfigurations <- sampleUniform(parameters,
+          newConfigurations <- sampleSobol(parameters,
                                              nconfigurations - nrow(allConfigurations),
                                              repair = scenario$repairConfiguration)
           newConfigurations <- cbind (.ID. = max(0L, allConfigurations[[".ID."]]) + seq_nrow(newConfigurations),
@@ -1116,7 +1116,7 @@ irace_run <- function(scenario, parameters)
           catInfo("Sample ", nbNewConfigurations,
                   " configurations from uniform distribution", verbose = FALSE)
         }
-        newConfigurations <- sampleUniform(parameters, nbNewConfigurations,
+        newConfigurations <- sampleSobol(parameters, nbNewConfigurations,
                                            repair = scenario$repairConfiguration)
         newConfigurations <-
           cbind (.ID. = max(0L, allConfigurations[[".ID."]]) + seq_nrow(newConfigurations),
diff --git a/R/utils.R b/R/utils.R
index 581de0df..8a26708f 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -716,3 +716,11 @@ seq_nrow <- function(x) seq_len(nrow(x))
 
 clamp <- function(x, lower, upper) pmax.int(lower, pmin.int(x, upper))
 
+truncate_rows <- function(x, n)
+{
+  nx <- nrow(x)
+  if (nx <= n) return(x)
+  x[-seq.int(n + 1L, nx), ]
+}
+
+
diff --git a/tests/testthat/test-targetRunnerParallel.R b/tests/testthat/test-targetRunnerParallel.R
index 0565e2d2..fe28c4d7 100644
--- a/tests/testthat/test-targetRunnerParallel.R
+++ b/tests/testthat/test-targetRunnerParallel.R
@@ -49,7 +49,7 @@ x "x" r (1,2)
 ')
   expect_output(
     irace(scenario = list(targetRunnerParallel = targetRunnerParallel,
-                          instances = lapply(1:5, function(i) 10),
+                          instances = replicate(5, list(10)),
                           targetRunnerData = list(a=1, b=2),
                           maxExperiments = 42L),
           parameters = parameters),