Skip to content

Commit

Permalink
fix!: breaking changes to generate_outlier_subsets()
Browse files Browse the repository at this point in the history
- switch arg to `data` from `ManyEcoEvo`
- expose arguments `n_min` and `n_max` to the user
- explicit join by argument in join
- ensure `diversity_indices` is filtered for each reduced subset
- ensure `diversity_data` is filtered on `effects_analysis` and NOT `data`

---

This step filters datasets into an `'effects_analysis'` column; therefore, any filtering of the `diversity_*` data must be based on `effects_analysis`.

refs: #102 #97
  • Loading branch information
egouldo committed Aug 26, 2024
1 parent 29cfbdf commit e8f354b
Showing 1 changed file with 45 additions and 27 deletions.
72 changes: 45 additions & 27 deletions R/generate_outlier_subsets.R
Original file line number Diff line number Diff line change
@@ -1,73 +1,91 @@
#' Generate Outlier Subsets for ManyEcoEvo datasets
#' @description Removes top outlier for `yi` datasets and top 2 and bottom 2 outliers for `Zr` datasets
#'
#' @param ManyEcoEvo a ManyEcoEvo dataframe containing formatted raw `data`, formatted `diversity_data`, the `estimate_type`, and `dataset`
#' @param data a ManyEcoEvo dataframe containing formatted raw `data`, formatted `diversity_data`, the `estimate_type`, and `dataset`
#' @param n_min passed as `n` to `slice_min()` on `Zr` for the `Zr` datasets. The default, `numeric(1L)`, is `0`.
#' @param n_max passed as `n` to `slice_max()` on `Zr` for the `Zr` datasets and on `Z` for the `y*` datasets. The default, `numeric(1L)`, is `0`.
#'
#' @return A ManyEcoEvo dataframe with added column `exclusion_set` with new subsets of `data` and `diversity_data`
#' @export
#' @family Multi-dataset Wrapper Functions
#' @family targets-pipeline functions
generate_outlier_subsets <- function(ManyEcoEvo) {
generate_outlier_subsets <- function(data, n_min = numeric(1L), n_max = numeric(1L)) {
# NOTE(review): the defaults for `n_min` / `n_max` are 0. The previous hard-coded
# values were negative (n = -2 / n = -1), i.e. "drop this many rows"; with n = 0
# slice_max() / slice_min() appear to keep zero rows -- confirm callers always
# supply negative values, or that the defaults are intentional.
# NOTE: should be run *after* computing Zr with compute_MA_inputs()
# because the function expects the column 'Zr' to exist in each
# `effects_analysis` data frame
# TODO: will no longer work on Zr dataset, because this doesn't contain an estimate_type col?
# TODO: Don't run with the reduced publishability subset.... some of these already only have 10 data points!!
# apply conditional behaviour to trigger both
# TODO: do not run for collinearity_removed datasets
# Zr branch: keep the original Zr rows and append an outlier-trimmed copy of
# them (excluding the "collinearity_removed" subset) tagged "-rm_outliers".
if (str_detect(ManyEcoEvo$estimate_type, "Zr") %>% any(na.rm = TRUE)) {
ManyEcoEvo_Zr <- ManyEcoEvo %>%
if (str_detect(data$estimate_type, "Zr") %>% any(na.rm = TRUE)) {
data_Zr <- data %>%
filter(estimate_type == "Zr") %>%
bind_rows(., {
ManyEcoEvo %>%
data %>%
filter(estimate_type == "Zr", collinearity_subset != "collinearity_removed") %>%
mutate(effects_analysis = map(
effects_analysis,
~ slice_max(.x, Zr, n = -2) %>%
slice_min(Zr, n = -2)
)) %>%
mutate(effects_analysis =
map(
effects_analysis,
~ slice_max(.x, Zr, n = n_max) %>%
slice_min(Zr, n = n_min)
)) %>%
mutate(
exclusion_set = paste0(exclusion_set, "-rm_outliers"),
# keep only diversity rows whose `id_col` matches a `study_id` that is
# still present in the trimmed `effects_analysis`
diversity_data =
map2(
.x = diversity_data,
.y = data, # TODO should this be effects analysis?? Yes, but no shared variables...
.f = ~ semi_join(.x, .y) %>% distinct()
.y = effects_analysis,
.f = ~ semi_join(.x, .y, by = join_by(id_col == study_id)) %>%
distinct()
),
diversity_indices =
map2(
.x = diversity_indices,
.y = effects_analysis,
.f = ~ semi_join(.x, .y, by = join_by(id_col == study_id)) %>%
distinct()
)
)
}) # TODO duplicates in diversity_data....??
}

# y* branch: same pattern as the Zr branch, but rows are trimmed from the
# `data` list-column on `Z`.
if (str_detect(ManyEcoEvo$estimate_type, "y") %>% any(na.rm = TRUE)) {
ManyEcoEvo_yi <- ManyEcoEvo %>%

if (str_detect(data$estimate_type, "y") %>%
any(na.rm = TRUE)) {
data_yi <- data %>%
filter(str_detect(estimate_type, "y")) %>%
bind_rows(., {
ManyEcoEvo %>%
data %>%
filter(str_detect(estimate_type, "y")) %>%
mutate(data = map(
data, # TODO check list-column is still called this!
~ slice_max(.x, Z, n = -1)
~ slice_max(.x, Z, n = n_max)
)) %>% # TODO check that downstream functions call on data and not effects analysis!!!
# NOTE(review): the trimming above is applied to the `data` list-column, while
# the semi_joins below filter against `effects_analysis`, which this branch
# does not modify -- confirm the diversity subsets end up matching the
# trimmed rows.
mutate(
exclusion_set = paste0(exclusion_set, "-rm_outliers"),
diversity_data =
map2(
.x = diversity_data,
.y = data, # TODO should this be effects analysis???
.f = ~ semi_join(.x, .y) %>% distinct()
.y = effects_analysis,
.f = ~ semi_join(.x, .y, by = join_by(id_col == study_id)) %>%
distinct()
),
diversity_indices =
map2(
.x = diversity_indices,
.y = effects_analysis,
.f = ~ semi_join(.x, .y, by = join_by(id_col == study_id)) %>%
distinct()
)
)
})
}

# Combine whichever branch results were created above; NULL when neither
# estimate type was detected.
out <- if (exists(x = "ManyEcoEvo_Zr") & exists(x = "ManyEcoEvo_yi")) {
bind_rows(ManyEcoEvo_Zr, ManyEcoEvo_yi)
} else if (exists(x = "ManyEcoEvo_Zr") & !exists(x = "ManyEcoEvo_yi")) {
ManyEcoEvo_Zr
} else if (!exists(x = "ManyEcoEvo_Zr") & exists(x = "ManyEcoEvo_yi")) {
ManyEcoEvo_yi
out <- if (exists(x = "data_Zr") & exists(x = "data_yi")) {
bind_rows(data_Zr, data_yi)
} else if (exists(x = "data_Zr") & !exists(x = "data_yi")) {
data_Zr
} else if (!exists(x = "data_Zr") & exists(x = "data_yi")) {
data_yi
} else {
NULL
}

return(out)
}

0 comments on commit e8f354b

Please sign in to comment.