fix!: breaking changes to generate_outlier_subsets()

- switch arg to `data` from `ManyEcoEvo`
- expose the `n_min` and `n_max` arguments to the user
- pass an explicit `by` argument to the joins
- ensure `diversity_indices` is filtered for each reduced subset
- ensure `diversity_data` is filtered on `effects_analysis` and NOT `data`

---

This step filters datasets into an `effects_analysis` column, and therefore any filtering of the `diversity_*` data must be based on `effects_analysis`.

refs: #102 #97
egouldo · Aug 26, 2024 · e8f354b · e8f354b
1 parent 29cfbdf
commit e8f354b
Showing 1 changed file with 45 additions and 27 deletions.
diff --git a/R/generate_outlier_subsets.R b/R/generate_outlier_subsets.R
@@ -1,73 +1,91 @@
 #' Generate Outlier Subsets for ManyEcoEvo datasets
 #' @description Removes top outlier for `yi` datasets and top 2 and bottom 2 outliers for `Zr` datasets
 #'
-#' @param ManyEcoEvo a ManyEcoEvo dataframe containing formatted raw `data`, formatted `diversity_data`, the `estimate_type`, and `dataset`
+#' @param data a ManyEcoEvo dataframe containing formatted raw `data`, formatted `diversity_data`, the `estimate_type`, and `dataset`
 #'
 #' @return A ManyEcoEvo dataframe with added column `exclusion_set` with new subsets of `data` and `diversity_data`
 #' @export
 #' @family Multi-dataset Wrapper Functions
 #' @family targets-pipeline functions
-generate_outlier_subsets <- function(ManyEcoEvo) {
+generate_outlier_subsets <- function(data, n_min = numeric(1L), n_max = numeric(1L)) {
   # NOTE: should be run *after* computing Zr with compute_MA_inputs()
   # because the function expects the column 'Zr' to exist in
   # TODO: will nolonger work on Zr dataset, because this doesn't contain an estimate_type col?
   # TODO: Don't run with the reduced publishability subset.... some of these already only have 10 data points!!
   # apply conditional behaviour to trigger both
   # TODO: do not run for collinearity_removed datasets
-  if (str_detect(ManyEcoEvo$estimate_type, "Zr") %>% any(na.rm = TRUE)) {
-    ManyEcoEvo_Zr <- ManyEcoEvo %>%
+  if (str_detect(data$estimate_type, "Zr") %>% any(na.rm = TRUE)) {
+    data_Zr <- data %>%
       filter(estimate_type == "Zr") %>%
       bind_rows(., {
-        ManyEcoEvo %>%
+        data %>%
           filter(estimate_type == "Zr", collinearity_subset != "collinearity_removed") %>%
-          mutate(effects_analysis = map(
-            effects_analysis,
-            ~ slice_max(.x, Zr, n = -2) %>%
-              slice_min(Zr, n = -2)
-          )) %>%
+          mutate(effects_analysis = 
+                   map(
+                     effects_analysis,
+                     ~ slice_max(.x, Zr, n = n_max) %>%
+                       slice_min(Zr, n = n_min)
+                   )) %>%
           mutate(
             exclusion_set = paste0(exclusion_set, "-rm_outliers"),
             diversity_data =
               map2(
                 .x = diversity_data,
-                .y = data, # TODO should this be effects analysis?? Yes, but no shared variables...
-                .f = ~ semi_join(.x, .y) %>% distinct()
+                .y = effects_analysis,
+                .f = ~ semi_join(.x, .y, by = join_by(id_col == study_id)) %>% 
+                  distinct()
+              ),
+            diversity_indices =
+              map2(
+                .x = diversity_indices,
+                .y = effects_analysis,
+                .f = ~ semi_join(.x, .y, by = join_by(id_col == study_id)) %>% 
+                  distinct()
               )
           )
       }) # TODO duplicates in diversity_data....??
   }
-
-  if (str_detect(ManyEcoEvo$estimate_type, "y") %>% any(na.rm = TRUE)) {
-    ManyEcoEvo_yi <- ManyEcoEvo %>%
+
+  if (str_detect(data$estimate_type, "y") %>% 
+      any(na.rm = TRUE)) {
+    data_yi <- data %>%
       filter(str_detect(estimate_type, "y")) %>%
       bind_rows(., {
-        ManyEcoEvo %>%
+        data %>%
           filter(str_detect(estimate_type, "y")) %>%
           mutate(data = map(
             data, # TODO check list-column is still called this!
-            ~ slice_max(.x, Z, n = -1)
+            ~ slice_max(.x, Z, n = n_max)
           )) %>% # TODO check that downstream functions call on data and not effects analysis!!!
           mutate(
             exclusion_set = paste0(exclusion_set, "-rm_outliers"),
             diversity_data =
               map2(
                 .x = diversity_data,
-                .y = data, # TODO should this be effects analysis???
-                .f = ~ semi_join(.x, .y) %>% distinct()
+                .y = effects_analysis, 
+                .f = ~ semi_join(.x, .y, by = join_by(id_col == study_id)) %>% 
+                  distinct()
+              ),
+            diversity_indices =
+              map2(
+                .x = diversity_indices,
+                .y = effects_analysis,
+                .f = ~ semi_join(.x, .y, by = join_by(id_col == study_id)) %>% 
+                  distinct()
               )
           )
       })
   }
-
-  out <- if (exists(x = "ManyEcoEvo_Zr") & exists(x = "ManyEcoEvo_yi")) {
-    bind_rows(ManyEcoEvo_Zr, ManyEcoEvo_yi)
-  } else if (exists(x = "ManyEcoEvo_Zr") & !exists(x = "ManyEcoEvo_yi")) {
-    ManyEcoEvo_Zr
-  } else if (!exists(x = "ManyEcoEvo_Zr") & exists(x = "ManyEcoEvo_yi")) {
-    ManyEcoEvo_yi
+  
+  out <- if (exists(x = "data_Zr") & exists(x = "data_yi")) {
+    bind_rows(data_Zr, data_yi)
+  } else if (exists(x = "data_Zr") & !exists(x = "data_yi")) {
+    data_Zr
+  } else if (!exists(x = "data_Zr") & exists(x = "data_yi")) {
+    data_yi
   } else {
     NULL
   }
-
+  
   return(out)
 }