-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix!: breaking changes to
generate_outlier_subsets()
- switch arg to `data` from `ManyEcoEvo`
- expose the `n_min` and `n_max` arguments to the user
- use an explicit `by` argument in joins
- ensure `diversity_indices` is filtered for each reduced subset
- ensure `diversity_data` is filtered on `effects_analysis` and NOT `data`
---
This step filters datasets into an `effects_analysis` column, and therefore any filtering of the `diversity_*` data must be based on `effects_analysis`.
refs: #102 #97
- Loading branch information
Showing
1 changed file
with
45 additions
and
27 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,73 +1,91 @@ | ||
#' Generate Outlier Subsets for ManyEcoEvo datasets | ||
#' @description Removes top outlier for `yi` datasets and top 2 and bottom 2 outliers for `Zr` datasets | ||
#' | ||
#' @param ManyEcoEvo a ManyEcoEvo dataframe containing formatted raw `data`, formatted `diversity_data`, the `estimate_type`, and `dataset` | ||
#' @param data a ManyEcoEvo dataframe containing formatted raw `data`, formatted `diversity_data`, the `estimate_type`, and `dataset` | ||
#' | ||
#' @return A ManyEcoEvo dataframe with added column `exclusion_set` with new subsets of `data` and `diversity_data` | ||
#' @export | ||
#' @family Multi-dataset Wrapper Functions | ||
#' @family targets-pipeline functions | ||
generate_outlier_subsets <- function(ManyEcoEvo) { | ||
generate_outlier_subsets <- function(data, n_min = numeric(1L), n_max = numeric(1L)) { | ||
# NOTE: should be run *after* computing Zr with compute_MA_inputs() | ||
# because the function expects the column 'Zr' to exist in `effects_analysis` | ||
# TODO: will no longer work on Zr dataset, because this doesn't contain an estimate_type col? | ||
# TODO: Don't run with the reduced publishability subset.... some of these already only have 10 data points!! | ||
# apply conditional behaviour to trigger both | ||
# TODO: do not run for collinearity_removed datasets | ||
if (str_detect(ManyEcoEvo$estimate_type, "Zr") %>% any(na.rm = TRUE)) { | ||
ManyEcoEvo_Zr <- ManyEcoEvo %>% | ||
if (str_detect(data$estimate_type, "Zr") %>% any(na.rm = TRUE)) { | ||
data_Zr <- data %>% | ||
filter(estimate_type == "Zr") %>% | ||
bind_rows(., { | ||
ManyEcoEvo %>% | ||
data %>% | ||
filter(estimate_type == "Zr", collinearity_subset != "collinearity_removed") %>% | ||
mutate(effects_analysis = map( | ||
effects_analysis, | ||
~ slice_max(.x, Zr, n = -2) %>% | ||
slice_min(Zr, n = -2) | ||
)) %>% | ||
mutate(effects_analysis = | ||
map( | ||
effects_analysis, | ||
~ slice_max(.x, Zr, n = n_max) %>% | ||
slice_min(Zr, n = n_min) | ||
)) %>% | ||
mutate( | ||
exclusion_set = paste0(exclusion_set, "-rm_outliers"), | ||
diversity_data = | ||
map2( | ||
.x = diversity_data, | ||
.y = data, # TODO should this be effects analysis?? Yes, but no shared variables... | ||
.f = ~ semi_join(.x, .y) %>% distinct() | ||
.y = effects_analysis, | ||
.f = ~ semi_join(.x, .y, by = join_by(id_col == study_id)) %>% | ||
distinct() | ||
), | ||
diversity_indices = | ||
map2( | ||
.x = diversity_indices, | ||
.y = effects_analysis, | ||
.f = ~ semi_join(.x, .y, by = join_by(id_col == study_id)) %>% | ||
distinct() | ||
) | ||
) | ||
}) # TODO duplicates in diversity_data....?? | ||
} | ||
|
||
if (str_detect(ManyEcoEvo$estimate_type, "y") %>% any(na.rm = TRUE)) { | ||
ManyEcoEvo_yi <- ManyEcoEvo %>% | ||
|
||
if (str_detect(data$estimate_type, "y") %>% | ||
any(na.rm = TRUE)) { | ||
data_yi <- data %>% | ||
filter(str_detect(estimate_type, "y")) %>% | ||
bind_rows(., { | ||
ManyEcoEvo %>% | ||
data %>% | ||
filter(str_detect(estimate_type, "y")) %>% | ||
mutate(data = map( | ||
data, # TODO check list-column is still called this! | ||
~ slice_max(.x, Z, n = -1) | ||
~ slice_max(.x, Z, n = n_max) | ||
)) %>% # TODO check that downstream functions call on data and not effects analysis!!! | ||
mutate( | ||
exclusion_set = paste0(exclusion_set, "-rm_outliers"), | ||
diversity_data = | ||
map2( | ||
.x = diversity_data, | ||
.y = data, # TODO should this be effects analysis??? | ||
.f = ~ semi_join(.x, .y) %>% distinct() | ||
.y = effects_analysis, | ||
.f = ~ semi_join(.x, .y, by = join_by(id_col == study_id)) %>% | ||
distinct() | ||
), | ||
diversity_indices = | ||
map2( | ||
.x = diversity_indices, | ||
.y = effects_analysis, | ||
.f = ~ semi_join(.x, .y, by = join_by(id_col == study_id)) %>% | ||
distinct() | ||
) | ||
) | ||
}) | ||
} | ||
|
||
out <- if (exists(x = "ManyEcoEvo_Zr") & exists(x = "ManyEcoEvo_yi")) { | ||
bind_rows(ManyEcoEvo_Zr, ManyEcoEvo_yi) | ||
} else if (exists(x = "ManyEcoEvo_Zr") & !exists(x = "ManyEcoEvo_yi")) { | ||
ManyEcoEvo_Zr | ||
} else if (!exists(x = "ManyEcoEvo_Zr") & exists(x = "ManyEcoEvo_yi")) { | ||
ManyEcoEvo_yi | ||
out <- if (exists(x = "data_Zr") & exists(x = "data_yi")) { | ||
bind_rows(data_Zr, data_yi) | ||
} else if (exists(x = "data_Zr") & !exists(x = "data_yi")) { | ||
data_Zr | ||
} else if (!exists(x = "data_Zr") & exists(x = "data_yi")) { | ||
data_yi | ||
} else { | ||
NULL | ||
} | ||
|
||
return(out) | ||
} |