From 7be8fce7c857290da03c82c1cf15d0dee8005b19 Mon Sep 17 00:00:00 2001 From: Isa Stallworthy <31548151+istallworthy@users.noreply.github.com> Date: Wed, 20 Sep 2023 13:21:48 -0400 Subject: [PATCH] remove helpers --- NAMESPACE | 15 -- R/formatLongData.R | 155 -------------------- R/imputeData.R | 171 --------------------- R/inspectData.R | 334 ------------------------------------------ man/formatLongData.Rd | 81 ---------- man/imputeData.Rd | 77 ---------- man/inspectData.Rd | 113 -------------- 7 files changed, 946 deletions(-) delete mode 100644 R/formatLongData.R delete mode 100644 R/imputeData.R delete mode 100644 R/inspectData.R delete mode 100644 man/formatLongData.Rd delete mode 100644 man/imputeData.Rd delete mode 100644 man/inspectData.Rd diff --git a/NAMESPACE b/NAMESPACE index 7865e4a5..8804b5c6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,24 +11,17 @@ export(create_custom_comparisons) export(create_custom_contrasts) export(eval_hist) export(fitModel) -export(formatLongData) export(getModel) export(get_comparison_values) -export(imputeData) -export(inspectData) export(make_love_plot) export(perform_multiple_comparison_correction) export(trimWeights) importFrom(WeightIt,weightitMSM) -importFrom(doRNG,"%dorng%") importFrom(dplyr,"%>%") -importFrom(dplyr,arrange) importFrom(dplyr,bind_rows) importFrom(dplyr,filter) importFrom(dplyr,mutate) importFrom(dplyr,select) -importFrom(foreach,getDoParName) -importFrom(foreach,getDoParWorkers) importFrom(ggplot2,aes) importFrom(ggplot2,element_blank) importFrom(ggplot2,element_rect) @@ -50,17 +43,9 @@ importFrom(jtools,export_summs) importFrom(knitr,kable) importFrom(marginaleffects,avg_predictions) importFrom(marginaleffects,hypotheses) -importFrom(mice,complete) -importFrom(mice,ibind) -importFrom(mice,mice) importFrom(mice,pool) -importFrom(missMethods,delete_MAR_1_to_x) -importFrom(parallel,detectCores) -importFrom(purrr,map_dfr) importFrom(stargazer,stargazer) importFrom(stats,p.adjust) importFrom(stringr,str_count) importFrom(survey,svydesign) importFrom(survey,svyglm) -importFrom(tibble,tibble) -importFrom(tidyr,complete) diff --git a/R/formatLongData.R b/R/formatLongData.R deleted file mode 100644 index 2277ede2..00000000 --- a/R/formatLongData.R +++ /dev/null @@ -1,155 +0,0 @@ - -#' Formats long data -#' -#' @param home_dir path to home directory -#' @param data dataframe in long format -#' @param exposure name of exposure variable -#' @param exposure_time_pts list of integers at which weights will be -#' created/assessed that correspond to time points when exposure was measured -#' @param outcome name of outcome variable with ".timepoint" suffix -#' @param tv_confounders list of time-varying confounders with ".timepoint" -#' suffix -#' @param time_var (optional) variable name in original dataset demarcating time -#' @param id_var (optional) variable name in original dataset demarcating ID -#' @param missing (optional) indicator for missing data in original dataset -#' @param factor_confounders (optional) list of variable names that are factors -#' (default is numeric) -#' @param save.out (optional) TRUE or FALSE indicator to save output and -#' intermediary output locally (default is TRUE) -#' @return formatted long dataset -#' @export -#' -#' @examples -#' test <- data.frame(ID = 1:50, -#' A.1 = rnorm(n = 50), -#' A.2 = rnorm(n = 50), -#' A.3 = rnorm(n = 50), -#' B.1 = rnorm(n = 50), -#' B.2 = rnorm(n = 50), -#' B.3 = rnorm(n = 50), -#' C = rnorm(n = 50), -#' D.3 = rnorm(n = 50)) -#' test[, c("A.1", "A.2", "A.3")] <- lapply(test[, c("A.1", "A.2", "A.3")], as.numeric) -#' -#' test_long <- stats::reshape(data = test, -#' idvar = "ID", #'list ID variable -#' varying = c("A.1", "A.2", "A.3", "B.1", "B.2", "B.3"), -#' direction = "long") -#' -#' test_long_format <- formatLongData(data = test_long, -#' exposure = "A", -#' exposure_time_pts = c(1, 2, 3), -#' outcome = "D.3", -#' tv_confounders = c("A.1", "A.2", "A.3", "B.1", "B.2", "B.3"), -#' time_var = "time", -#' id_var = NA, -#' missing = NA, -#' factor_confounders = "C", -#' save.out = FALSE) - - -formatLongData <- function(home_dir, data, exposure, exposure_time_pts, outcome, tv_confounders, time_var = NA, id_var = NA, missing = NA, - factor_confounders = NULL, save.out = TRUE){ - - if(save.out){ - if (missing(home_dir)){ - stop("Please supply a home directory.", call. = FALSE) - } - else if (!dir.exists(home_dir)) { - stop('Please provide a valid home directory.', call. = FALSE) - } - } - if (missing(data)){ - stop("Please supply data as either a dataframe with no missing data or imputed data in the form of a mids object or path to folder with imputed csv datasets.", - call. = FALSE) - } - if (missing(exposure)){ - stop("Please supply a single exposure.", call. = FALSE) - } - if (missing(outcome)){ - stop("Please supply a single outcome.", call. = FALSE) - } - if (missing(exposure_time_pts)){ - stop("Please supply the exposure time points at which you wish to create weights.", call. = FALSE) - } - if (missing(tv_confounders)){ - stop("Please supply a list of time-varying confounders.", call. = FALSE) - } - - time_varying_covariates <- tv_confounders - options(readr.num_columns = 0) - - - - # Reading and formatting LONG dataset - if (!is.na(time_var)){ - colnames(data)[colnames(data) == time_var] <- "WAVE" # Assigning time variable - } - - if(!is.na(id_var)){ - colnames(data)[colnames(data) == id_var] <- "ID" # Assigning time variable - } - - if(!is.na(missing)){ - # data[data == missing] <- NA # Makes NA the missingness indicator - is.na(data[data == missing]) <- TRUE - } - - if (which(colnames(data) == "ID") != 1){ - data <- data[,which(colnames(data) == "ID"):ncol(data)] - } - - # Exposure summary - exposure_summary <- data %>% - dplyr::filter(WAVE %in% exposure_time_pts) %>% - dplyr::group_by(WAVE) %>% - dplyr::summarize_at(dplyr::vars(all_of(exposure)), - list(mean = mean, sd = sd, min = min, max = max), na.rm = TRUE) - - cat(knitr::kable(exposure_summary, caption = paste0("Summary of ", exposure, - " Exposure Information"), format = 'pipe'), sep = "\n") - - if(save.out){ - knitr::kable(exposure_summary, caption = paste0("Summary of ", exposure, " Exposure Information"), format = 'html') %>% - kableExtra::kable_styling() %>% - kableExtra::save_kable(file = file.path(home_dir, paste0(exposure, "_exposure_info.html"))) - - cat(paste0(exposure, " exposure descriptive statistics have now been saved in the home directory"), "\n") - cat("\n") - } - - - # Outcome summary - outcome_summary <- data[, !colnames(data) %in% "ID"] - outcome_summary <- outcome_summary %>% select(contains(sapply(strsplit(outcome, "\\."), - "[", 1))) - outcome_summary <- psych::describe(outcome_summary, fast = TRUE) - - cat(knitr::kable(outcome_summary, caption = paste0("Summary of Outcome ", outcome, " Information"), - format = 'pipe'), sep = "\n") - - if(save.out){ - knitr::kable(outcome_summary, caption = paste0("Summary of Outcome ", outcome, " Information"), format = 'html') %>% - kableExtra::kable_styling() %>% - kableExtra::save_kable(file = file.path(home_dir, paste0(outcome, "_outcome_info.html"))) - - cat(paste0(outcome, " outcome descriptive statistics have now been saved in the home directory"), "\n") - } - - - data$ID <- as.factor(data$ID) - - if(!is.null(factor_confounders)){ - if (sum(factor_confounders %in% colnames(data)) < length(factor_confounders)) { - stop('Please provide factor covariates that correspond to columns in your data when creating the msm object', - call. = FALSE) - } - # Formatting factor covariates - data[, factor_confounders] <- lapply(data[, factor_confounders], as.factor) - # Formatting numeric covariates - numeric_vars <- colnames(data)[!colnames(data) %in% c(factor_confounders, "ID")] - data[, numeric_vars] <- lapply(data[, numeric_vars], as.numeric) - } - - as.data.frame(data) -} diff --git a/R/imputeData.R b/R/imputeData.R deleted file mode 100644 index 50903fbf..00000000 --- a/R/imputeData.R +++ /dev/null @@ -1,171 +0,0 @@ -#' Imputes dataset so there is no missing at each time point using parallel -#' processing to speed up -#' -#' @export -#' @importFrom mice mice -#' @importFrom mice ibind -#' @importFrom mice complete -#' @importFrom tibble tibble -#' @importFrom dplyr arrange -#' @importFrom dplyr filter -#' @importFrom dplyr mutate -#' @importFrom tidyr complete -#' @importFrom knitr kable -#' @importFrom parallel detectCores -#' @importFrom doRNG %dorng% -#' @importFrom purrr map_dfr -#' @importFrom foreach getDoParWorkers -#' @importFrom foreach getDoParName -#' @importFrom missMethods delete_MAR_1_to_x -#' @seealso {[mice::mice()], -#' } -#' @param data data in wide format -#' @param m (optional) integer number of imputed datasets (default is 5) -#' @param method (optional) character string of imputation method from mice() -#' (default is random forest "rf") -#' @param home_dir path to home directory -#' @param exposure name of exposure variable -#' @param outcome name of outcome variable with ".timepoint" suffix -#' @param para_proc (optional) TRUE/FALSE whether to do parallel processing -#' using multiple cores to speed up process (default = TRUE) -#' @param save.out (optional) TRUE or FALSE indicator to save output and -#' intermediary output locally (default is TRUE) -#' @param read_imps_from_file (optional) "yes" or "no" indicatorto read in weights -#' that have been previously run and saved locally (default is "no") -#' @return mice object of m imputed datasets -#' @examples -#' test <- data.frame(ID = 1:50, -#' A.1 = rnorm(n = 50), -#' A.2 = rnorm(n = 50), -#' A.3 = rnorm(n = 50), -#' B.1 = rnorm(n = 50), -#' B.2 = rnorm(n = 50), -#' B.3 = rnorm(n = 50), -#' C = rnorm(n = 50), -#' D.3 = rnorm(n = 50)) -#' test[, c("A.1", "A.2", "A.3")] <- lapply(test[, c("A.1", "A.2", "A.3")], as.numeric) -#' -#' test_miss <- missMethods::delete_MAR_1_to_x(as.data.frame(test), p = 0.20, -#' cols_mis = c("A.1", "B.2", "C"), -#' cols_ctrl = c("B.1", "B.1", "B.1"), 3) -#' test_i <- imputeData(data = test_miss, -#' m = 3, -#' method = "rf", -#' exposure = "A", -#' outcome = "D.3", -#' para_proc = TRUE, -#' read_imps_from_file = "no", -#' save.out = FALSE) - - -imputeData <- function(data, m = 5, method = "rf", home_dir = NA, exposure, outcome, para_proc = TRUE, - read_imps_from_file = "no", save.out = TRUE) { - - if (save.out | read_imps_from_file == "yes"){ - if (missing(home_dir)){ - stop("Please supply a home directory.", call. = FALSE) - } - else if (!dir.exists(home_dir)) { - stop('Please provide a valid home directory.', call. = FALSE) - } - } - if (missing(data)){ - stop("Please supply data as either a dataframe with no missing data or imputed data in the form of a mids object or path to folder with imputed csv datasets.", - call. = FALSE) - } - if (missing(exposure)){ - stop("Please supply a single exposure.", call. = FALSE) - } - if (missing(outcome)){ - stop("Please supply a single outcome.", call. = FALSE) - } - - - if(!is.character(method)){ - stop("Please provide as a character a valid imputation method abbreviation.", call. = FALSE) - } - if(!is.numeric(m)){ - stop("Please provide an integer value number of imputations.", call. = FALSE) - } - - if (save.out | read_imps_from_file == "yes"){ - imp_dir <- file.path(home_dir, "imputations") - if (!dir.exists(imp_dir)) { - dir.create(imp_dir) - } - } - - - if (read_imps_from_file == "yes") { - imputed_datasets <- list() - - if (!file.exists(glue::glue("{home_dir}/imputations/{exposure}-{outcome}_all_imp.rds"))) { - stop("Imputations have not been created and saved locally. Please set 'read_imps_from_file' == 'no' and re-run.", call. = FALSE) - } - - imp <- readRDS(glue::glue("{home_dir}/imputations/{exposure}-{outcome}_all_imp.rds")) - imputed_datasets <- imp - - cat("\n") - cat(glue::glue("Reading in {imputed_datasets$m} imputations from the local folder.")) - cat("\n") - return(imputed_datasets) - - } - else { - - if (sum(duplicated(data$"ID")) > 0){ - stop("Please provide a wide dataset with a single row per ID.", call. = FALSE) - } - - imp_method <- method - data_to_impute <- tibble::tibble(data) - - cat(glue::glue("Creating {m} imputed datasets using the {imp_method} imputation method in mice. This may take some time to run.")) - cat("\n") - - if (para_proc){ - # Configure parallelization - nCores <- min(parallel::detectCores(), 8) - options(mc.cores = nCores) - options(cores = nCores) - doParallel::registerDoParallel(cores = nCores) - - cat("### Using", foreach::getDoParWorkers(), "cores\n") - cat("### Using", foreach::getDoParName(), "as the backend\n") - - # Conducts imputations using parallelized execution cycling through m - imputed_datasets <- foreach::foreach(i = seq_len(m), .combine = mice::ibind) %dorng% { - cat("### Started iteration", i, "\n") - miceout <- mice::mice(data_to_impute, m = 1, method = imp_method, maxit = 0, #change maxit to default 5 after testing!!! - print = F) - cat("### Completed iteration", i, "\n") - miceout - } - } - else{ - imputed_datasets <- mice::mice(data_to_impute, m = m, method = imp_method, maxit = 0, #change maxit to default 5 after testing!!! - print = F) - } - - if(save.out){ - saveRDS(imputed_datasets, glue::glue("{home_dir}/imputations/{exposure}-{outcome}_all_imp.rds")) - } - - # Print warnings - cat("USER ALERT: Please view any logged events from the imputation below:", "\n") - cat(knitr::kable(imputed_datasets$loggedEvents, caption = "Logged Events from mice", format = 'pipe'), sep = "\n") - cat("\n") - - if(save.out){ - # Save out individual imputed datasets - for (k in seq_len(m)) { - write.csv(mice::complete(imputed_datasets, k), - file = glue::glue("{home_dir}/imputations/{exposure}-{outcome}_imp{k}.csv")) - } - cat("See the 'imputations/' folder for a .csv file of each imputed dataset and an .rds file of all imputed datasets", "\n") - } - - imputed_datasets - } -} diff --git a/R/inspectData.R b/R/inspectData.R deleted file mode 100644 index f33cc960..00000000 --- a/R/inspectData.R +++ /dev/null @@ -1,334 +0,0 @@ - -#' Inspect long/wide/imputed data -#' -#' @param data data in wide format as: a data frame, list of imputed data -#' frames, or mids object -#' @param home_dir (optional) path to home directory (required if save.out = TRUE) -#' @param exposure name of exposure variable -#' @param exposure_time_pts list of integers at which weights will be -#' created/assessed that correspond to time points when exposure was measured -#' @param outcome name of outcome variable with ".timepoint" suffix -#' @param tv_confounders list of time-varying confounders with ".timepoint" -#' suffix -#' @param ti_confounders list of time invariant confounders -#' @param epochs (optional) data frame of exposure epoch labels and values -#' @param hi_lo_cut (optional) list of two numbers indicating quantile values -#' that reflect high and low values, respectively, for continuous exposure -#' (default is median split) -#' @param reference (optional)string of "-"-separated "l" and "h" values -#' indicative of a reference exposure history to which to compare comparison, -#' required if comparison is specified -#' @param comparison (optional)list of one or more strings of "-"-separated "l" -#' and "h" values indicative of comparison history/histories to compare to -#' reference, required if reference is specified -#' @param verbose (optional) TRUE or FALSE indicator for user output (default is -#' TRUE) -#' @param save.out (optional) TRUE or FALSE indicator to save output and -#' intermediary output locally (default is TRUE) -#' @return none -#' @export -#' @examples -#' test <- data.frame(ID = 1:50, -#' A.1 = rnorm(n = 50), -#' A.2 = rnorm(n = 50), -#' A.3 = rnorm(n = 50), -#' B.1 = rnorm(n = 50), -#' B.2 = rnorm(n = 50), -#' B.3 = rnorm(n = 50), -#' C = rnorm(n = 50), -#' D.3 = rnorm(n = 50)) -#' test[, c("A.1", "A.2", "A.3")] <- lapply(test[, c("A.1", "A.2", "A.3")], as.numeric) -#' -#' inspectData(data = test, -#' exposure = "A", -#' exposure_time_pts = c(1, 2, 3), -#' outcome = "D.3", -#' tv_confounders = c("A.1", "A.2", "A.3", "B.1", "B.2", "B.3"), -#' ti_confounders = "C", -#' save.out = FALSE) -#' inspectData(data = test, -#' exposure = "A", -#' exposure_time_pts = c(1, 2, 3), -#' outcome = "D.3", -#' tv_confounders = c("A.1", "A.2", "A.3", "B.1", "B.2", "B.3"), -#' ti_confounders = "C", -#' hi_lo_cut = c(0.8, 0.2), -#' save.out = FALSE) -#' inspectData(data = test, -#' exposure = "A", -#' exposure_time_pts = c(1, 2, 3), -#' outcome = "D.3", -#' tv_confounders = c("A.1", "A.2", "A.3", "B.1", "B.2", "B.3"), -#' ti_confounders = "C", -#' hi_lo_cut = c(0.8, 0.2), -#' reference = "l-l-l", -#' comparison = "h-h-h", -#' save.out = FALSE) -#' inspectData(data = test, -#' exposure = "A", -#' exposure_time_pts = c(1, 2, 3), -#' outcome = "D.3", -#' tv_confounders = c("A.1", "A.2", "A.3", "B.1", "B.2", "B.3"), -#' ti_confounders = "C", -#' epochs = data.frame(epochs = c("Infancy", "Toddlerhood"), -#' values = I(list(c(1, 2), c(3)))), -#' save.out = FALSE) - -inspectData <- function(data, home_dir, exposure, exposure_time_pts, outcome, tv_confounders, ti_confounders, epochs = NULL, - hi_lo_cut = NULL, reference = NA, comparison = NULL, verbose = TRUE, save.out = TRUE){ - - if (save.out) { - if (missing(home_dir)) { - stop("Please supply a home directory.", call. = FALSE) - } - else if(!dir.exists(home_dir)) { - stop("Please provide a valid home directory path if you wish to save output locally.", call. = FALSE) - } - } - if (missing(data)){ - stop("Please supply data as either a dataframe with no missing data or imputed data in the form of a mids object or path to folder with imputed csv datasets.", - call. = FALSE) - } - if (missing(exposure)){ - stop("Please supply a single exposure.", call. = FALSE) - } - if (missing(outcome)){ - stop("Please supply a single outcome.", call. = FALSE) - } - if (missing(exposure_time_pts)){ - stop("Please supply the exposure time points at which you wish to create weights.", call. = FALSE) - } - if (missing(tv_confounders)){ - stop("Please supply a list of time-varying confounders.", call. = FALSE) - } - if (missing(ti_confounders)){ - stop("Please supply a list of time invariant confounders.", call. = FALSE) - } - - if (!mice::is.mids(data) & !is.data.frame(data) & !inherits(data, "list")) { - stop("Please provide either a 'mids' object, a data frame, or a list of imputed csv files in the 'data' field.", call. = FALSE) - } - - - - ID <- "ID" - time_invar_covars <- ti_confounders - time_var_covars <- tv_confounders - time_pts <- as.numeric(sapply(strsplit(tv_confounders[grepl(exposure, tv_confounders)] , "\\."), "[",2)) - - if (mice::is.mids(data)){ - data <-as.data.frame(mice::complete(data,1)) - } - - else if (inherits(data, "list")) { #just inspects frist imputed dataset - data <- data[[1]] - - } - - # long format to wide - if("WAVE" %in% colnames(data)){ - v <- sapply(strsplit(tv_confounders, "\\."), "[", 1) - v <- v[!duplicated(v)] - data_wide <- stats::reshape(data = data_long, idvar = "ID", v.names = v, timevar = "WAVE", - direction = "wide") - - #removing all NA cols (i.e., when data were not collected) - data_wide <- data_wide[,colSums(is.na(data_wide)) < nrow(data_wide)] - data <- data_wide - } - - - if(!inherits(data, "data.frame")){ - warning(paste0("Your data is a ", class(data), ". Convert to data frame before running devMSMs."), - call. = FALSE) - } - - - exposure_type <- ifelse(inherits(data[, paste0(exposure, '.', exposure_time_pts[1])], - "numeric"), "continuous", "binary") - - - # Confounder summary - potential_covariates <- colnames(data)[!(colnames(data) %in% c(ID))] - - if (sum(tv_confounders %in% potential_covariates) != length(tv_confounders)){ - stop(paste(tv_confounders[!tv_confounders %in% potential_covariates]), - " time-varying confounders are not present in the dataset.", call. = FALSE) - } - - if (sum(ti_confounders %in% potential_covariates) != length(ti_confounders)){ - stop(paste(ti_confounders[!ti_confounders %in% potential_covariates]), - " time invariant confounders are not present in the dataset.", call. = FALSE) - } - - all_potential_covariates <- c(time_invar_covars, time_var_covars) - all_potential_covariates <- all_potential_covariates[order(all_potential_covariates)] - - # Format for table output to visualize available covariates by time point - covar_table <- data.frame(variable = sapply(strsplit(all_potential_covariates, "\\."), "[", 1), - time_pt = sapply(strsplit(all_potential_covariates, "\\."), "[", 2)) %>% - dplyr::arrange(time_pt, variable) %>% - dplyr::group_by(time_pt) %>% - dplyr::summarize(variable = toString(variable)) - - if(save.out){ - write.csv(covar_table, glue::glue("{home_dir}/{exposure}-{outcome}_covariates_considered_by_time_pt.csv"), - row.names = FALSE) - } - - unique_vars <- length(unique(c(time_invar_covars, sapply(strsplit(all_potential_covariates, "\\."), "[", 1)))) - - test <- data.frame(matrix(nrow = length(time_pts), ncol = unique_vars)) - colnames(test) <- unique(c(time_invar_covars, sapply(strsplit(all_potential_covariates, "\\."), - "[", 1)))[order(unique(c(time_invar_covars, - sapply(strsplit(all_potential_covariates, - "\\."), "[", 1))))] - rownames(test) <- time_pts - - for (l in seq_len(nrow(test))) { - z = c(sapply(strsplit(all_potential_covariates[grepl(paste0(".", rownames(test)[l]), - all_potential_covariates)], "\\."), "[", 1), time_invar_covars) - z = z[!duplicated(z)] - test[l, z ] <- 1 - } - - test <- test[, colnames(test)[!(colnames(test) %in% c(ID))]] - NumTimePts <- data.frame(NumTimePts = colSums(test, na.rm = TRUE)) - test <- rbind(test, t(NumTimePts)) - NumVars <- data.frame(NumVars = rowSums(test, na.rm = TRUE)) - test[seq_len(nrow(test)), ncol(test) + 1] <- NumVars - - if(save.out){ - write.csv(test, glue::glue("{home_dir}/{exposure}-{outcome}_matrix_of_covariates_considered_by_time_pt.csv"), - row.names = TRUE) - - if(verbose){ - print(glue::glue("See the home directory for a table and matrix displaying all covariates confounders considered at each exposure time point for {exposure} and {outcome}."), "\n") - - #-2 to exclude ID and WAVE - print(glue::glue("USER ALERT: Below are the {as.character(length(all_potential_covariates) - 2)} variables spanning {unique_vars - 2} unique domains that will be treated as confounding variables for the relation between {exposure} and {outcome}."), - "Please inspect this list carefully. It should include all time-varying covariates, time invariant covariates, as well as lagged levels of exposure and outcome variables if they were collected at time points earlier than the outcome time point.", "\n") - print(all_potential_covariates[!(all_potential_covariates %in% c(ID))]) - } - } - - - # Data type - if(verbose){ - cat("\n") - cat("The following variables are designated as numeric:", "\n") - print(paste(colnames(data)[sapply(data, class) == "numeric"], sep = ",", collapse = ", ")) - cat("\n") - - cat("The following variables are designated as factors:", "\n") - print(paste(colnames(data)[sapply(data, class) == "factor"], sep = ",", collapse = ", ")) - cat("\n") - - #temporary warning re: factor levels - cat("*temp: please inspect the levels of your factors below. at present, excluding ID, the code can only accept 2-level factors. set rest to numeric", "\n") - print(sapply(data[,colnames(data)[sapply(data, class) == "factor"]], nlevels)) - - oth <- data.frame(variable = names(sapply(data, class)) [!sapply(data, class) %in% c("numeric", "factor")], - type = sapply(data, class) [!sapply(data, class) %in% c("numeric", "factor")]) - if(nrow(oth) > 0 ){ - cat(knitr::kable(oth, caption = "Other variable types", - format = 'pipe'), sep = "\n") - cat("\n") - } - - if(sum(sapply(data, is.character)) > 0){ - warning(paste0(paste(names(data)[sapply(data, is.character)], sep = ", ", collapse = ", "), - " are of class character.", " The package cannot accept character variables."), call. = FALSE) - } - } - #covariate correlations - covariates_to_include <- all_potential_covariates - - # Creates final dataset with only relevant variables - covariates_to_include <- covariates_to_include[order(covariates_to_include)] - variables_to_include <- unique(c(ID, outcome, covariates_to_include, time_var_covars)) - data2 <- data %>% - select(all_of(variables_to_include)) - - # Makes correlation table - corr_matrix <- cor(as.data.frame(lapply(data2[, colnames(data2) != ID], - as.numeric)), use = "pairwise.complete.obs") - - if(save.out){ - ggcorrplot::ggcorrplot(corr_matrix, type = "lower")+ - ggplot2::theme(axis.text.x = element_text(size = 5, margin = ggplot2::margin(-2, 0, 0, 0)), # Order: top, right, bottom, left - axis.text.y = element_text(size = 5, margin = ggplot2::margin(0, -2, 0, 0))) + - ggplot2::geom_vline(xintercept = seq_len(ncol(mtcars)) - 0.5, colour="white", size = 2) + - ggplot2::geom_hline(yintercept = seq_len(ncol(mtcars)) - 0.5, colour="white", size = 2) - - # Save correlation plot - pdf(file = paste0(home_dir, "/", exposure, "-", outcome, "_all_vars_corr_plot.pdf")) - print(ggplot2::last_plot()) - dev.off() - - if(verbose){ - cat("\n") - cat("A correlation plot of all variables in the dataset has been saved in the home directory", "\n") - cat("\n") - } - } - - - # Exposure summary - exposure_summary <- data %>% - dplyr:: select(colnames(data)[grepl(exposure, colnames(data))]) - exposure_summary <- sapply(exposure_summary, as.numeric) - exposure_summary <- psych::describe(exposure_summary, fast = TRUE) - - - if (save.out){ - knitr::kable(exposure_summary, caption = paste0("Summary of ", exposure, " Exposure Information"), - format = 'html') %>% - kableExtra::kable_styling() %>% - kableExtra::save_kable(file = file.path(home_dir, paste0("/", exposure, "_exposure_info.html"))) - if(verbose){ - cat(knitr::kable(exposure_summary, caption = paste0("Summary of ", exposure, " Exposure Information"), - format = 'pipe'), sep = "\n") - cat(paste0(exposure, " exposure descriptive statistics have now been saved in the home directory"), "\n") - cat("\n") - } - } - - eval_hist(data = data2, exposure, tv_confounders, epochs, - exposure_time_pts, hi_lo_cut, ref = reference, comps = comparison, verbose) - - # Exposure history summary - if( is.null(epochs)){ #making epochs time pts if not specified by user - epochs <- data.frame(epochs = as.character(time_pts), - values = time_pts) - } - else{ - if( !is.data.frame(epochs) | ncol(epochs) != 2 | sum(colnames(epochs) == c("epochs", "values")) != ncol(epochs)){ - stop("If you supply epochs, please provide a dataframe with two columns of epochs and values.", - call. = FALSE) - } - if(sum(is.na(epochs$values)) > 0){ - stop("Please provide one or a list of several values for each epoch.", call. = FALSE) - } - } - - # Outcome summary - outcome_summary <- data[, grepl(sapply(strsplit(outcome, "\\."), - "[", 1), colnames(data))] - outcome_summary <- psych::describe(outcome_summary, fast = TRUE) - - if(save.out){ - knitr::kable(outcome_summary, caption = paste0("Summary of Outcome ", - sapply(strsplit(outcome, "\\."), "[", 1), " Information"), format = 'html') %>% - kableExtra::kable_styling() %>% - kableExtra::save_kable(file = file.path(home_dir, paste0("/", sapply(strsplit(outcome, "\\."), "[", 1), "_outcome_info.html"))) - - if (verbose){ - cat(knitr::kable(outcome_summary, caption = paste0("Summary of Outcome ", - sapply(strsplit(outcome, "\\."), "[", 1), " Information"), - format = 'pipe'), sep = "\n") - - cat(paste0(sapply(strsplit(outcome, "\\."), "[", 1), " outcome descriptive statistics have now been saved in the home directory"), "\n") - } - } -} diff --git a/man/formatLongData.Rd b/man/formatLongData.Rd deleted file mode 100644 index be3c4545..00000000 --- a/man/formatLongData.Rd +++ /dev/null @@ -1,81 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/formatLongData.R -\name{formatLongData} -\alias{formatLongData} -\title{Formats long data} -\usage{ -formatLongData( - home_dir, - data, - exposure, - exposure_time_pts, - outcome, - tv_confounders, - time_var = NA, - id_var = NA, - missing = NA, - factor_confounders = NULL, - save.out = TRUE -) -} -\arguments{ -\item{home_dir}{path to home directory} - -\item{data}{dataframe in long format} - -\item{exposure}{name of exposure variable} - -\item{exposure_time_pts}{list of integers at which weights will be -created/assessed that correspond to time points when exposure was measured} - -\item{outcome}{name of outcome variable with ".timepoint" suffix} - -\item{tv_confounders}{list of time-varying confounders with ".timepoint" -suffix} - -\item{time_var}{(optional) variable name in original dataset demarcating time} - -\item{id_var}{(optional) variable name in original dataset demarcating ID} - -\item{missing}{(optional) indicator for missing data in original dataset} - -\item{factor_confounders}{(optional) list of variable names that are factors -(default is numeric)} - -\item{save.out}{(optional) TRUE or FALSE indicator to save output and -intermediary output locally (default is TRUE)} -} -\value{ -formatted long dataset -} -\description{ -Formats long data -} -\examples{ -test <- data.frame(ID = 1:50, - A.1 = rnorm(n = 50), - A.2 = rnorm(n = 50), - A.3 = rnorm(n = 50), - B.1 = rnorm(n = 50), - B.2 = rnorm(n = 50), - B.3 = rnorm(n = 50), - C = rnorm(n = 50), - D.3 = rnorm(n = 50)) -test[, c("A.1", "A.2", "A.3")] <- lapply(test[, c("A.1", "A.2", "A.3")], as.numeric) - -test_long <- stats::reshape(data = test, - idvar = "ID", #'list ID variable - varying = c("A.1", "A.2", "A.3", "B.1", "B.2", "B.3"), - direction = "long") - -test_long_format <- formatLongData(data = test_long, - exposure = "A", - exposure_time_pts = c(1, 2, 3), - outcome = "D.3", - tv_confounders = c("A.1", "A.2", "A.3", "B.1", "B.2", "B.3"), - time_var = "time", - id_var = NA, - missing = NA, - factor_confounders = "C", - save.out = FALSE) -} diff --git a/man/imputeData.Rd b/man/imputeData.Rd deleted file mode 100644 index 6124efcd..00000000 --- a/man/imputeData.Rd +++ /dev/null @@ -1,77 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/imputeData.R -\name{imputeData} -\alias{imputeData} -\title{Imputes dataset so there is no missing at each time point using parallel -processing to speed up} -\usage{ -imputeData( - data, - m = 5, - method = "rf", - home_dir = NA, - exposure, - outcome, - para_proc = TRUE, - read_imps_from_file = "no", - save.out = TRUE -) -} -\arguments{ -\item{data}{data in wide format} - -\item{m}{(optional) integer number of imputed datasets (default is 5)} - -\item{method}{(optional) character string of imputation method from mice() -(default is random forest "rf")} - -\item{home_dir}{path to home directory} - -\item{exposure}{name of exposure variable} - -\item{outcome}{name of outcome variable with ".timepoint" suffix} - -\item{para_proc}{(optional) TRUE/FALSE whether to do parallel processing -using multiple cores to speed up process (default = TRUE)} - -\item{read_imps_from_file}{(optional) "yes" or "no" indicatorto read in weights -that have been previously run and saved locally (default is "no")} - -\item{save.out}{(optional) TRUE or FALSE indicator to save output and -intermediary output locally (default is TRUE)} -} -\value{ -mice object of m imputed datasets -} -\description{ -Imputes dataset so there is no missing at each time point using parallel -processing to speed up -} -\examples{ -test <- data.frame(ID = 1:50, - A.1 = rnorm(n = 50), - A.2 = rnorm(n = 50), - A.3 = rnorm(n = 50), - B.1 = rnorm(n = 50), - B.2 = rnorm(n = 50), - B.3 = rnorm(n = 50), - C = rnorm(n = 50), - D.3 = rnorm(n = 50)) -test[, c("A.1", "A.2", "A.3")] <- lapply(test[, c("A.1", "A.2", "A.3")], as.numeric) - -test_miss <- missMethods::delete_MAR_1_to_x(as.data.frame(test), p = 0.20, - cols_mis = c("A.1", "B.2", "C"), - cols_ctrl = c("B.1", "B.1", "B.1"), 3) -test_i <- imputeData(data = test_miss, - m = 3, - method = "rf", - exposure = "A", - outcome = "D.3", - para_proc = TRUE, - read_imps_from_file = "no", - save.out = FALSE) -} -\seealso{ -{\code{\link[mice:mice]{mice::mice()}}, -\url{https://cran.r-project.org/web/packages/mice/index.html}} -} diff --git a/man/inspectData.Rd b/man/inspectData.Rd deleted file mode 100644 index 4cbff80e..00000000 --- a/man/inspectData.Rd +++ /dev/null @@ -1,113 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/inspectData.R -\name{inspectData} -\alias{inspectData} -\title{Inspect long/wide/imputed data} -\usage{ -inspectData( - data, - home_dir, - exposure, - exposure_time_pts, - outcome, - tv_confounders, - ti_confounders, - epochs = NULL, - hi_lo_cut = NULL, - reference = NA, - comparison = NULL, - verbose = TRUE, - save.out = TRUE -) -} -\arguments{ -\item{data}{data in wide format as: a data frame, list of imputed data -frames, or mids object} - -\item{home_dir}{(optional) path to home directory (required if save.out = TRUE)} - -\item{exposure}{name of exposure variable} - -\item{exposure_time_pts}{list of integers at which weights will be -created/assessed that correspond to time points when exposure was measured} - -\item{outcome}{name of outcome variable with ".timepoint" suffix} - -\item{tv_confounders}{list of time-varying confounders with ".timepoint" -suffix} - -\item{ti_confounders}{list of time invariant confounders} - -\item{epochs}{(optional) data frame of exposure epoch labels and values} - -\item{hi_lo_cut}{(optional) list of two numbers indicating quantile values -that reflect high and low values, respectively, for continuous exposure -(default is median split)} - -\item{reference}{(optional)string of "-"-separated "l" and "h" values -indicative of a reference exposure history to which to compare comparison, -required if comparison is specified} - -\item{comparison}{(optional)list of one or more strings of "-"-separated "l" -and "h" values indicative of comparison history/histories to compare to -reference, required if reference is specified} - -\item{verbose}{(optional) TRUE or FALSE indicator for user output (default is -TRUE)} - -\item{save.out}{(optional) TRUE or FALSE indicator to save output and -intermediary output locally (default is TRUE)} -} -\value{ -none -} -\description{ -Inspect long/wide/imputed data -} -\examples{ -test <- data.frame(ID = 1:50, - A.1 = rnorm(n = 50), - A.2 = rnorm(n = 50), - A.3 = rnorm(n = 50), - B.1 = rnorm(n = 50), - B.2 = rnorm(n = 50), - B.3 = rnorm(n = 50), - C = rnorm(n = 50), - D.3 = rnorm(n = 50)) -test[, c("A.1", "A.2", "A.3")] <- lapply(test[, c("A.1", "A.2", "A.3")], as.numeric) - -inspectData(data = test, - exposure = "A", - exposure_time_pts = c(1, 2, 3), - outcome = "D.3", - tv_confounders = c("A.1", "A.2", "A.3", "B.1", "B.2", "B.3"), - ti_confounders = "C", - save.out = FALSE) -inspectData(data = test, - exposure = "A", - exposure_time_pts = c(1, 2, 3), - outcome = "D.3", - tv_confounders = c("A.1", "A.2", "A.3", "B.1", "B.2", "B.3"), - ti_confounders = "C", - hi_lo_cut = c(0.8, 0.2), - save.out = FALSE) -inspectData(data = test, - exposure = "A", - exposure_time_pts = c(1, 2, 3), - outcome = "D.3", - tv_confounders = c("A.1", "A.2", "A.3", "B.1", "B.2", "B.3"), - ti_confounders = "C", - hi_lo_cut = c(0.8, 0.2), - reference = "l-l-l", - comparison = "h-h-h", - save.out = FALSE) -inspectData(data = test, - exposure = "A", - exposure_time_pts = c(1, 2, 3), - outcome = "D.3", - tv_confounders = c("A.1", "A.2", "A.3", "B.1", "B.2", "B.3"), - ti_confounders = "C", - epochs = data.frame(epochs = c("Infancy", "Toddlerhood"), - values = I(list(c(1, 2), c(3)))), - save.out = FALSE) -}