openml · rgmantovani · Jan 18, 2019 · Jan 18, 2019 · Jan 23, 2019 · Jan 23, 2019
diff --git a/CC18 - Benchmark Analysis in R.ipynb b/CC18 - Benchmark Analysis in R.ipynb
diff --git a/R/getAvgPerformance.R → R/averagePerformance.R b/R/getAvgPerformance.R → R/averagePerformance.R
@@ -1,13 +1,13 @@
 #--------------------------------------------------------------------------------------------------
 #--------------------------------------------------------------------------------------------------
 
-getAvgPerformance = function(data, measure) {
+averagePerformance = function(data, measure) {
 
-  temp = na.omit(data[, c("flow.name", measure)])
-  algos = unique(temp$flow.name)
+  temp = na.omit(data[, c("learner.name", measure)])
+  algos = unique(temp$learner.name)
 
   aux = lapply(algos, function(alg) {
-    d = temp[which(temp$flow.name == alg),]
+    d = temp[which(temp$learner.name == alg),]
     ret = mean(d[,2])
     return(ret)
   })

diff --git a/R/checkMeasure.R b/R/checkMeasure.R
@@ -1,15 +1,15 @@
 #--------------------------------------------------------------------------------------------------
 #--------------------------------------------------------------------------------------------------
 
-checkMeasure = function(measure){
+# Validate the name of an evaluation measure.
+#
+# @param measure a single character string naming the measure to check.
+# @return invisibly TRUE when `measure` is one of the allowed measures;
+#   otherwise an error listing the allowed measures is raised.
+checkMeasure = function(measure) {
+
+  #TODO: replace with checkmate commands
+  allowed.measures = c("f.measure", "kappa", "precision", "recall",
+    "usercpu.time.millis", "area.under.roc.curve", "predictive.accuracy")
 
-  allowed.measures = c("f.measure", "kappa", "mean.absolute.error", "precision", "recall", 
-    "usercpu.time.millis", "area.under.roc.curve", "predictive.accuracy", "root.mean.squared.error")
-  if (!( measure %in% allowed.measures)) {
+  # isTRUE() makes NULL, NA or multi-element input fail with the message below
+  # instead of R's own "argument is of length zero" / "condition has length > 1".
+  if (!isTRUE(measure %in% allowed.measures)) {
-    stop(paste0(" - Please, choose one of the following measures: ", 
-      paste(allowed.measures, collapse=', '), " \n"))  
-  } else {
-    return(TRUE)
+    stop(paste0(" - Please, choose one of the following measures: ",
+      paste(allowed.measures, collapse=', '), " \n"))
   }
+  # Keep the success value of the previous implementation for callers that test it.
+  invisible(TRUE)
 }
 

diff --git a/R/checkPackages.R b/R/checkPackages.R
@@ -4,23 +4,24 @@
 checkPackages = function(pkgs) {
+  # Install every package named in `pkgs` that is not installed yet.
+  #
+  # pkgs: character vector of package names.
+  # Called for its side effects (package installation and console messages).
 
   obj = installed.packages()
+  not.installed = which(!pkgs %in% rownames(obj))
 
-  for(pk in pkgs) {
-
-    if(pk %in% rownames(obj)) {
-      cat(paste0(" - Package: ", pk, " \t... is already installed\n"))
-    } else {
-      cat(paste0(" - Installing: ", pk, "\n"))  
-      if (pk == "farff") {
-        devtools::install_github("mlr-org/farff")
-      } else if(pk == "OpenML") {
-        devtools::install_github("openml/r", ref = "05b8b97cc5ce6ea1b3f586818cfcf157b16a3cd4")
-      } else {
-        install.packages(pkgs = pk)   
+  # The comparison belongs outside length(): test how many packages are missing.
+  if(length(not.installed) > 0) {
+    need = pkgs[not.installed]
+    cat(paste0(" @ Missing packages: ", paste(need, collapse = ", "), "\n"))
+    cran = "http://cran.uni-muenster.de/"
+
+    # `not.installed` holds positions, so the name has to be looked up in `need`.
+    if("scmamp" %in% need) {
+      if (!requireNamespace("BiocManager", quietly = TRUE)) {
+        install.packages("BiocManager", repos = cran)
       }
+      # dependencies are not in CRAN: install them before scmamp itself, and
+      # regardless of whether BiocManager had to be installed just now
+      BiocManager::install("Rgraphviz", version = "3.8")
+      BiocManager::install("graph", version = "3.8")
     }
+
+    install.packages(pkgs = need, repos = cran)
   }
+  cat(" @ All required packages installed.\n")
 }
 
 #--------------------------------------------------------------------------------------------------
-#--------------------------------------------------------------------------------------------------
+#--------------------------------------------------------------------------------------------------
diff --git a/R/criticalDifferencePlot.R b/R/criticalDifferencePlot.R
@@ -0,0 +1,29 @@
+#--------------------------------------------------------------------------------------------------
+#--------------------------------------------------------------------------------------------------
+
+# Draw a critical-difference diagram comparing learners across tasks.
+#
+# @param data data.frame with the columns `task.id`, `learner.name` and the
+#   column named by `measure`.
+# @param measure name (character string) of the performance column to compare.
+# @param alpha significance level forwarded to scmamp::plotCD.
+# @return the value returned by scmamp::plotCD.
+criticalDifferencePlot = function(data, measure = "predictive.accuracy", alpha = 0.05) {
+
+  # Select the measure column by its name (as a string), so a column that is
+  # literally called "measure" can never be picked up by mistake.
+  sub.df = data[, c("task.id", "learner.name", measure)]
+  colnames(sub.df) = c("taskId", "learnerName", "value")
+  tasks = unique(sub.df$taskId)
+  algos = unique(sub.df$learnerName)
+
+  # One vector per task holding, for every learner, the mean of the measure
+  # over the rows of that (task, learner) pair; NA values are ignored.
+  aux.task = lapply(tasks, function(task) {
+    vapply(algos, function(algo) {
+      sel = which(sub.df$taskId == task & sub.df$learnerName == algo)
+      mean(sub.df$value[sel], na.rm = TRUE)
+    }, numeric(1), USE.NAMES = FALSE)
+  })
+
+  mat = do.call("rbind", aux.task)
+  # A (task, learner) pair without any usable value has mean NaN; it is replaced
+  # by -Inf before the matrix is handed to plotCD.
+  mat[is.nan(mat)] = -Inf
+  rownames(mat) = tasks
+  colnames(mat) = algos
+
+  g = scmamp::plotCD(results.matrix = mat, alpha = alpha)
+  return(g)
+}
+
+#--------------------------------------------------------------------------------------------------
+#--------------------------------------------------------------------------------------------------
diff --git a/R/getRanking.R → R/generateRanking.R b/R/getRanking.R → R/generateRanking.R
@@ -2,16 +2,15 @@
 #--------------------------------------------------------------------------------------------------
 
 # Note: NA values are not ranked; only the non-NA entries of each row receive a rank
+generateRanking = function(mat, descending = FALSE) {
 
-getRanking = function(mat, descending = FALSE) {
-
   temp = mat
   for(i in 1:nrow(mat)) {
     ids =  which(!is.na(mat[i,]))
     if(descending){
       temp[i, ids] = rank(-mat[i,ids])
     } else {
-      temp[i, ids] = rank( mat[i,ids])  
+      temp[i, ids] = rank( mat[i,ids])
     }
   }
 

diff --git a/R/getAlgoCoverage.R b/R/getAlgoCoverage.R
diff --git a/R/getAlgosAvgPlot.R b/R/getAlgosAvgPlot.R
diff --git a/R/getExperimentsData.R b/R/getExperimentsData.R
diff --git a/R/config.R → R/getExperimentsResults.R b/R/config.R → R/getExperimentsResults.R
@@ -1,15 +1,19 @@
 #--------------------------------------------------------------------------------------------------
 #--------------------------------------------------------------------------------------------------
 
-# OpenML version that works
-# devtools::install_github("openml/r", ref = "05b8b97cc5ce6ea1b3f586818cfcf157b16a3cd4")
-
-library('ggplot2')
-library('reshape2')
-library('gridExtra')  
-library('mlr')
-library('OpenML')
-library('dplyr')
+# Collect the run evaluations of every task into a single data.frame.
+#
+# @param tasks object whose `task.id` element lists the task ids to query
+#   (presumably a data.frame of OpenML tasks - confirm against the caller).
+# @param limit maximum number of evaluations requested per task; the default
+#   keeps the previously hard-coded value of 5000.
+# @return the per-task results combined with plyr::rbind.fill, every row tagged
+#   with the `task.id` it was requested for.
+getExperimentsResults = function(tasks, limit = 5000) {
+
+  cat(" @ Getting experiment results\n")
+  aux = lapply(tasks$task.id, function(id) {
+    # cat(" - loading results from task:", id, "\n")
+    res = OpenML::listOMLRunEvaluations(task.id = id, limit = limit, offset = 0)
+    # rep() keeps the assignment valid when a task has no evaluations: assigning
+    # a length-one value to a zero-row data.frame is an error.
+    # NOTE(review): assumes `res` is always a data.frame - confirm.
+    res$task.id = rep(id, nrow(res))
+    return(res)
+  })
+
+  df = plyr::rbind.fill(aux)
+  return(df)
+}
 
 #--------------------------------------------------------------------------------------------------
 #--------------------------------------------------------------------------------------------------
diff --git a/R/getROCurve.R b/R/getROCurve.R
diff --git a/R/getRuntimeData.R b/R/getRuntimeData.R
@@ -1,21 +1,21 @@
 #--------------------------------------------------------------------------------------------------
 #--------------------------------------------------------------------------------------------------
 
-getAvgRuntimeData = function(data) {
+getRuntimeData = function(data) {  # mean of the CPU-time columns per learner; returns one row per learner
 
-  temp = dplyr::select(.data = data, task.id, flow.name, usercpu.time.millis.training, 
-    usercpu.time.millis.testing, usercpu.time.millis)
+  temp = dplyr::select(.data = data, task.id, learner.name,  # keep the two id columns ...
+    usercpu.time.millis.training, usercpu.time.millis.testing, usercpu.time.millis)  # ... and the three CPU-time columns
 
-  algos = unique(temp$flow.name)
+  algos = unique(temp$learner.name)  # one result row per distinct learner
   aux = lapply(algos, function(alg) {
     # TO DO: how to handle missing data here?
-    d = na.omit(temp[which(temp$flow.name == alg),])
+    d = na.omit(temp[which(temp$learner.name == alg),])  # drops every row with an NA in any selected column
     return(colMeans(d[,3:ncol(d)]))
   })
 
-  temp = data.frame(do.call("rbind", aux))
-  temp$alg = algos
-  return(temp)
+  ret = data.frame(do.call("rbind", aux))  # stack the per-learner mean vectors row-wise
+  ret$alg = algos  # label each row with its learner name (same order as `aux`)
+  return(ret)
 }