From 0671e6175af4a71196d8a3ef53baf7191a30e6c8 Mon Sep 17 00:00:00 2001 From: Noah Greifer Date: Fri, 29 Sep 2023 13:21:40 -0400 Subject: [PATCH 1/3] Added mitml --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 6334271b..fd3d3a3b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,6 +33,7 @@ Imports: survey, jtools, mice, + mitml, stargazer, gtools, cobalt, From 17bd4dd36f716bc756b6e9b27562544eefaaed06 Mon Sep 17 00:00:00 2001 From: Noah Greifer Date: Fri, 29 Sep 2023 13:22:09 -0400 Subject: [PATCH 2/3] Code cleaning --- R/assessBalance.R | 112 +++++++++++++-------------- R/calcBalStats.R | 184 ++++++++++++++++++++++----------------------- R/createFormulas.R | 16 ++-- R/fitModel.R | 1 + 4 files changed, 155 insertions(+), 158 deletions(-) diff --git a/R/assessBalance.R b/R/assessBalance.R index 66511732..d8dcc361 100644 --- a/R/assessBalance.R +++ b/R/assessBalance.R @@ -119,7 +119,7 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, type, formulas, weights = NULL, balance_thresh = 0.1, - imp_conf = NULL, verbose = TRUE, save.out = TRUE){ + imp_conf = NULL, verbose = TRUE, save.out = TRUE) { if (!is.logical(save.out)) { stop("`save.out` must be a flag (TRUE or FALSE)", call. = FALSE) @@ -128,10 +128,10 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, if (missing(home_dir)) { stop("Please supply a home directory.", call. = FALSE) } - else if(!is.character(home_dir)){ + else if (!is.character(home_dir)) { stop("Please provide a valid home directory path as a string if you wish to save output locally.", call. = FALSE) } - else if(!dir.exists(home_dir)) { + else if (!dir.exists(home_dir)) { stop("Please provide a valid home directory path if you wish to save output locally.", call. = FALSE) } } @@ -145,54 +145,54 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, !(is.list(data) && all(vapply(data, is.data.frame, logical(1L))))) { stop("Please provide either a 'mids' object, a data frame, or a list of imputed csv files in the 'data' field.", call. = FALSE) } - else if(is.list(data) && !is.data.frame(data)){ - if (sum(sapply(data, is.data.frame)) != length(data)){ + else if (is.list(data) && !is.data.frame(data)) { + if (sum(sapply(data, is.data.frame)) != length(data)) { stop("Please supply a list of data frames that have been imputed.", call. = FALSE) } } - if (missing(exposure)){ + if (missing(exposure)) { stop("Please supply a single exposure.", call. = FALSE) } - else if(!is.character(exposure) | length(exposure) != 1){ + else if (!is.character(exposure) || length(exposure) != 1) { stop("Please supply a single exposure as a character.", call. = FALSE) } - if (missing(outcome)){ + if (missing(outcome)) { stop("Please supply a single outcome.", call. = FALSE) } - else if(!is.character(outcome) | length(outcome) != 1){ + else if (!is.character(outcome) || length(outcome) != 1) { stop("Please supply a single outcome as a character.", call. = FALSE) } - if (missing(exposure_time_pts)){ + if (missing(exposure_time_pts)) { stop("Please supply the exposure time points at which you wish to create weights.", call. = FALSE) } - else if(!is.numeric(exposure_time_pts)){ + else if (!is.numeric(exposure_time_pts)) { stop("Please supply a list of exposure time points as integers.", call. = FALSE) } - if (missing(formulas)){ + if (missing(formulas)) { stop("Please supply a list of balancing formulas.", call. = FALSE) } - else if(!is.list(formulas) | is.data.frame(formulas)){ + else if (!is.list(formulas) || is.data.frame(formulas)) { stop("Please provide a list of formulas for each exposure time point", call. = FALSE) } - else if(length(formulas) != length(exposure_time_pts)){ + else if (length(formulas) != length(exposure_time_pts)) { stop("Please provide a list of formulas for each exposure time point", call. = FALSE) } - else if(is.list(formulas) && !is.data.frame(formulas)){ + else if (is.list(formulas) && !is.data.frame(formulas)) { if (sum(sapply(formulas, function(x) { - inherits(x, "formula")})) != length(formulas)){ + inherits(x, "formula")})) != length(formulas)) { stop("Please supply a list of formulas for each exposure time point.", call. = FALSE) } } - if (missing(type)){ + if (missing(type)) { stop("Please supply a 'weighted', 'prebalance' type", call. = FALSE) } - if (!inherits(type, "character") || length(type) != 1 ){ + if (!inherits(type, "character") || length(type) != 1 ) { stop("Please provide a single type as a character string from the following list: 'prebalance', 'weighted'", call. = FALSE) } else if (!type %in% c("prebalance", "weighted")) { @@ -210,12 +210,12 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, stop("Please provide either a 'mids' object, a data frame, or a list of imputed csv files in the 'data' field.", call. = FALSE) } - if (!is.null(weights) && (!is.list(weights) || is.data.frame(weights))){ + if (!is.null(weights) && (!is.list(weights) || is.data.frame(weights))) { stop("Please supply a list of weights output from the createWeights function.", call. = FALSE) } - else if(is.list(weights) && !is.data.frame(weights)){ + else if (is.list(weights) && !is.data.frame(weights)) { if (sum(sapply(weights, function(x) { - inherits(x, "weightitMSM")})) != length(weights)){ + inherits(x, "weightitMSM")})) != length(weights)) { stop("Please supply a list of weights output from the createWeights function.", call. = FALSE) } } @@ -224,28 +224,28 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, stop("Please provide one or two balance thresholds as numbers from 0-1.") } - if (length(balance_thresh) == 2 && is.null(imp_conf)){ + if (length(balance_thresh) == 2 && is.null(imp_conf)) { stop("If you wish to provide different balance threshold for important and less important confounders, please provide a list of important confounders in the 'imp_conf' field.", call. = FALSE) } - if (!is.null(imp_conf) && length(balance_thresh) == 1){ + if (!is.null(imp_conf) && length(balance_thresh) == 1) { stop("If you provide a list of important confounders, please provide a list of two balance thresholds for important and less important confounders, respectively", call. = FALSE) } - else if(!is.null(imp_conf) && !is.character(imp_conf)){ + else if (!is.null(imp_conf) && !is.character(imp_conf)) { stop("Please provide a list variable names as characters that are important confounders.", call. = FALSE) } - if(!is.logical(verbose)){ + if (!is.logical(verbose)) { stop("Please set verbose to either TRUE or FALSE.", call. = FALSE) } - else if(length(verbose) != 1){ + else if (length(verbose) != 1) { stop("Please provide a single TRUE or FALSE value to verbose.", call. = FALSE) } - if(!is.logical(save.out)){ + if (!is.logical(save.out)) { stop("Please set save.out to either TRUE or FALSE.", call. = FALSE) } - else if(length(save.out) != 1){ + else if (length(save.out) != 1) { stop("Please provide a single TRUE or FALSE value to save.out.", call. = FALSE) } @@ -254,7 +254,7 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, folder <- switch(type, "prebalance" = "prebalance/", "weighted/") - if (save.out){ + if (save.out) { balance_dir <- file.path(home_dir, "balance") if (!dir.exists(balance_dir)) { dir.create(balance_dir) @@ -293,14 +293,14 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, bal_stats <- lapply(seq_len(m), function(k) { d <- as.data.frame(mice::complete(data, k)) - if (anyNA(d)){ + if (anyNA(d)) { stop("This code requires complete data. Consider imputation if missingness < 20% and is reasonably Missing at Random (MAR).", call. = FALSE) } exposure_type <- if (is.numeric(d[, paste0(exposure, '.', exposure_time_pts[1])])) "continuous" else "binary" - if (sum(duplicated(d$"ID")) > 0){ + if (sum(duplicated(d$"ID")) > 0) { stop("Please provide wide imputed datasets with a single row per ID.", call. = FALSE) } @@ -310,19 +310,19 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, } else { - m = length(data) + m <- length(data) bal_stats <- lapply(seq_len(m), function(k) { d <- data[[k]] - if (anyNA(d)){ + if (anyNA(d)) { stop("This code requires complete data. Consider imputation if missingness < 20% and is reasonably Missing at Random (MAR).", call. = FALSE) } - exposure_type <- if (is.numeric(d[, paste0(exposure, '.', exposure_time_pts[1])])) "continuous" else "binary" + exposure_type <- if (is.numeric(d[[paste0(exposure, '.', exposure_time_pts[1])]])) "continuous" else "binary" - if (sum(duplicated(d[["ID"]])) > 0){ + if (sum(duplicated(d[["ID"]])) > 0) { stop("Please provide wide imputed datasets with a single row per ID.", call. = FALSE) } @@ -336,10 +336,10 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, bal_stats_all_imp <- bal_stats_all_imp[order(bal_stats_all_imp$covariate), ] - if (save.out){ + if (save.out) { write.csv(bal_stats_all_imp, sprintf("%s/balance/%s/%s_all_imps_balance_stat_summary.csv", home_dir, type, exposure)) - if (verbose ){ + if (verbose ) { cat("Pre balance statistics for each imputed dataset have now been saved in the 'balance/prebalance/' folder", "\n") } @@ -357,7 +357,7 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, #adds custom bal thresh info - if (!is.null(imp_conf)){ + if (!is.null(imp_conf)) { all_bal_stats$bal_thresh <- ifelse(all_bal_stats$covariate %in% imp_conf, balance_thresh[1], balance_thresh[2]) @@ -373,7 +373,7 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, } - if (verbose){ + if (verbose) { cat("\n") cat(paste0("*** Averaging Across All Imputations ***"), "\n") } @@ -383,7 +383,7 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, else { - if (sum(duplicated(data[["ID"]])) > 0){ + if (sum(duplicated(data[["ID"]])) > 0) { stop("Please provide wide dataset with a single row per ID.", call. = FALSE) } @@ -392,7 +392,7 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, call. = FALSE) } - exposure_type <- if (is.numeric(data[, paste0(exposure, '.', exposure_time_pts[1])])) "continuous" else "binary" + exposure_type <- if (is.numeric(data[[paste0(exposure, '.', exposure_time_pts[1])]])) "continuous" else "binary" bal_stats <- calcBalStats(home_dir, data, formulas, exposure, exposure_time_pts, outcome, balance_thresh, k = 0, weights = NULL, imp_conf, verbose, save.out) @@ -429,11 +429,11 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, bal_stats <- lapply(seq_len(m), function(k) { d <- as.data.frame(mice::complete(data, k)) - if (anyNA(d)){ + if (anyNA(d)) { stop("This code requires complete data. Consider imputation if missingness < 20% and is reasonably Missing at Random (MAR).", call. = FALSE) } - if (sum(duplicated(d[["ID"]])) > 0){ + if (sum(duplicated(d[["ID"]])) > 0) { stop("Please provide wide imputed datasets with a single row per ID.", call. = FALSE) } @@ -449,11 +449,11 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, bal_stats <- lapply(seq_len(m), function(k) { d <- data[[k]] - if (anyNA(d)){ + if (anyNA(d)) { stop("This code requires complete data. Consider imputation if missingness < 20% and is reasonably Missing at Random (MAR).", call. = FALSE) } - if (sum(duplicated(d[["ID"]])) > 0){ + if (sum(duplicated(d[["ID"]])) > 0) { stop("Please provide wide imputed datasets with a single row per ID.", call. = FALSE) } @@ -469,10 +469,10 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, bal_stats_all_imp <- do.call(dplyr::bind_rows, bal_stats) bal_stats_all_imp <- bal_stats_all_imp[order(bal_stats_all_imp$covariate), ] - if (save.out){ + if (save.out) { write.csv(bal_stats_all_imp, sprintf("%s/balance/%s/%s_all_imps_balance_stat_summary.csv", home_dir, type, exposure)) - if (verbose){ + if (verbose) { cat("Weighted balance statistics for each imputed dataset have now been saved in the 'balance/weighted/' folder", "\n") } } @@ -487,9 +487,9 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, avg_bal = rowMeans(do.call(cbind, lapply(bal_stats, `[[`, "std_bal_stats")))) # #adds custom bal thresh info - if (!is.null(imp_conf)){ + if (!is.null(imp_conf)) { - all_bal_stas$bal_thresh <-ifelse(all_bal_stats$covariate %in% imp_conf, + all_bal_stats$bal_thresh <-ifelse(all_bal_stats$covariate %in% imp_conf, balance_thresh[1], balance_thresh[2]) all_bal_stats$balanced <- ifelse(abs(all_bal_stats$avg_bal) < @@ -511,7 +511,7 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, } else { - if (sum(duplicated(data$"ID")) > 0){ + if (sum(duplicated(data$"ID")) > 0) { stop("Please provide wide dataset with a single row per ID.", call. = FALSE) } @@ -563,7 +563,7 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, data_type, balance_thresh, weights_method, imp_conf, verbose, save.out) }) - if (save.out){ + if (save.out) { outfile <- sprintf("%s/balance/%s/%s-%s_all_%s_%s_associations.html", home_dir, type, exposure, outcome, type, weights_method) @@ -574,8 +574,8 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, sink() - if (verbose){ - if (mi){ + if (verbose) { + if (mi) { cat(sprintf("Summary plots for %s %s averaged across all imputations have been saved out for each time point in the 'balance/%s/plots/' folder.\n", form_name, exposure, type)) } @@ -593,7 +593,7 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, row.names = FALSE) - if (verbose){ + if (verbose) { if (mi) { cat(sprintf("Check 'balance/%s/' folder for a table of all %s correlations or standardized mean differences averaged across imputed datasets.\n", @@ -616,7 +616,7 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, "[", 1)[!duplicated(sapply(strsplit(unbalanced_covars$covariate, "\\."), "[", 1))] - if (verbose){ + if (verbose) { if (mi) { cat(sprintf("USER ALERT: Averaging across all imputed datasets for exposure %s using the %s formulas and %s :\n", exposure, form_name, weights_method)) @@ -656,7 +656,7 @@ assessBalance <- function(home_dir, data, exposure, exposure_time_pts, outcome, round(min(all_bal_stats$avg_ba), 2), round(max(all_bal_stats$avg_ba), 2))) - if (nrow(unbalanced_covars) > 0){ + if (nrow(unbalanced_covars) > 0) { # cat(paste0("As shown below, the following ", nrow(unbalanced_covars), " covariates across time points out of ", # length(tot_covars), " total (", round(nrow(unbalanced_covars) / length(tot_covars) * 100, 2), "%) spanning ", # length(unbalanced_constructs), " domains out of ", length(tot_cons), " (", round(length(unbalanced_constructs) / length(tot_cons) * 100, 2), diff --git a/R/calcBalStats.R b/R/calcBalStats.R index cb84433e..2b3c4a36 100644 --- a/R/calcBalStats.R +++ b/R/calcBalStats.R @@ -76,48 +76,48 @@ #' weights = w[[1]]) calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_pts, outcome, balance_thresh, k = 0, weights = NULL, - imp_conf = NULL, verbose = TRUE, save.out = TRUE){ + imp_conf = NULL, verbose = TRUE, save.out = TRUE) { - if(!is.list(formulas) | is.data.frame(formulas)){ + if (!is.list(formulas) || is.data.frame(formulas)) { stop("Please provide a list of formulas for each exposure time point", call. = FALSE) } - if (!is.null(weights) && !inherits(weights, "weightitMSM")){ + if (!is.null(weights) && !inherits(weights, "weightitMSM")) { stop("Please supply a list of weights output from the createWeights function (via WeightIt::WeightItMSM).", call. = FALSE) } form_name <- sapply(strsplit(names(formulas[1]), "_form"), "[", 1) - exposure_type <- if(inherits(data[, paste0(exposure, '.', exposure_time_pts[1])], "numeric")) "continuous" else "binary" - weighted <- if(!is.null(weights)) 1 else 0 + exposure_name1 <- paste0(exposure, ".", exposure_time_pts[1]) - factor_covariates <- colnames(data)[sapply(data, is.factor)] + exposure_type <- if (is.numeric(data[[exposure_name1]])) "continuous" else "binary" + weighted <- !is.null(weights) + + factor_covariates <- names(data)[sapply(data, is.factor)] factor_covariates <- setdiff(factor_covariates, "ID") if (weighted) { weights_method <- weights$method - w <- weights$weights #IPTW weights - data$weights <- as.numeric(w) + data$weights <- as.numeric(weights$weights) #IPTW weights } - else{ + else { weights_method <- "no weights" } - folder <- if (weighted) "weighted/" else "prebalance/" data_type <- if (k == 0) "single" else "imputed" - if (data_type == "imputed" && verbose){ + if (data_type == "imputed" && verbose) { cat(paste0("**Imputation ", k, "**"), "\n") } #split factors - if(length(names(data)[sapply(data, class ) == "factor"]) > 0){ - data$"ID" = as.numeric(data$"ID") - data2 <- cobalt::splitfactor(data, names(data)[sapply(data, class ) == "factor"], drop.first = FALSE ) + if (length(factor_covariates) > 0) { + # data$"ID" = as.numeric(data$"ID") + data2 <- cobalt::splitfactor(data, factor_covariates, drop.first = "if2") } - else{ + else { data2 <- data } @@ -136,18 +136,19 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ # GETS COVARIATES FROM FORM FOR ASSESSING BALANCE full_form <- formulas[[names(formulas)[as.numeric(sapply(strsplit(names(formulas), "-"), "[", 2)) == exposure_time_pt]]] - covars <- paste(deparse(full_form[[3]], width.cutoff = 500), collapse = "") # gets covariates + covars <- deparse1(full_form[[3]], collapse = "") # gets covariates covar_time <- sapply(strsplit(unlist(strsplit(as.character(covars), "\\+")), "\\."), "[", 2) covars <- as.character(unlist(strsplit(covars, "\\+"))) covars <- gsub(" ", "", covars) + exposure_name <- paste0(exposure, ".", exposure_time_pt) - if(length(names(data)[sapply(data, class ) == "factor"]) > 0){ + if (length(factor_covariates) > 0) { #making factor covars separate variables - data_cov <- data[, covars] - data_cov <- cobalt::splitfactor(data_cov, names(data_cov)[sapply(data_cov, class ) == "factor"], drop.first = FALSE ) - covars <- colnames(data_cov) + data_cov <- data[covars] + data_cov <- cobalt::splitfactor(data_cov, intersect(factor_covariates, covars), drop.first = FALSE) + covars <- names(data_cov) } @@ -158,10 +159,10 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ # Unweighted pre-balance checking if (!weighted) { if (exposure_type == "continuous") { - bal_stats <- cobalt::col_w_cov(temp[, c(covars)], temp[, paste0(exposure, ".", exposure_time_pt)], std = TRUE) # finding correlation + bal_stats <- cobalt::col_w_cov(temp[covars], temp[[exposure_name]], std = TRUE) # finding correlation } else if (exposure_type == "binary") { - bal_stats <- cobalt::col_w_smd(temp[, c(covars)], temp[, paste0(exposure, ".", exposure_time_pt)], std = TRUE) # finding smd + bal_stats <- cobalt::col_w_smd(temp[covars], temp[[exposure_name]], std = TRUE) # finding smd } } @@ -169,27 +170,26 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ else { if (exposure_type == "continuous") { # finding cor - bal_stats <- cobalt::col_w_cov(temp[covars], temp[[paste0(exposure, ".", exposure_time_pt)]], std = TRUE, + bal_stats <- cobalt::col_w_cov(temp[covars], temp[[exposure_name]], std = TRUE, weights = temp[["weights"]]) #IPTW weights } else if (exposure_type == "binary") { # finding smd - bal_stats <- cobalt::col_w_smd(temp[covars], temp[[paste0(exposure, ".", exposure_time_pt)]], std = TRUE, + bal_stats <- cobalt::col_w_smd(temp[covars], temp[[exposure_name]], std = TRUE, weights = temp[["weights"]]) #IPTW weights } } bal_stats <- as.data.frame(bal_stats) - colnames(bal_stats) <- "std_bal_stats" + names(bal_stats) <- "std_bal_stats" } #ends lag=0 # ASSIGNING HISTORIES FOR EXP TIME POINTS T>1 - if (length(lagged_time_pts) > 0) { + else { # creating proportion weights based on proportion of individuals in a given exposure history - prop_weights <- data.frame(id = data[["ID"]], + prop_weights <- data.frame(ID = data[["ID"]], exposure = exposure, exp_time = exposure_time_pt, history = NA) - colnames(prop_weights)[colnames(prop_weights) == "id"] <- "ID" # finding histories up until exp time point T histories <- apply(gtools::permutations(2, length(lagged_time_pts), c(1, 0), repeats.allowed = TRUE), @@ -217,17 +217,17 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ if (exp == 0) { # low levels/absent if (exposure_type == "continuous") { - data$flag <- ifelse(data[, exps_time[t]] <= median(data[, paste0(exposure, ".", exposure_time_pt)]) + data$flag <- ifelse(data[[exps_time[t]]] <= median(data[[exposure_name]]) & data$flag == flag, t, NA) # finding those w/ vals <= median exp @ time pt & flagged at prev t's } else { # for binary exp - data$flag <- ifelse(data[, exps_time[t]] == 0 & data$flag == flag, t , NA) # if exposure is absent & flagged at prev t's + data$flag <- ifelse(data[[exps_time[t]]] == 0 & data$flag == flag, t, NA) # if exposure is absent & flagged at prev t's } } else { # hi levels/present if (exposure_type == "continuous") { - data$flag <- ifelse(data[[exps_time[t]]] > median(data[[paste0(exposure, ".", exposure_time_pt)]]) + data$flag <- ifelse(data[[exps_time[t]]] > median(data[[exposure_name]]) & data$flag == flag, t, NA) # finding those w/ vals > median exp @ time pt & flagged at prev t's } else { # binary exp @@ -240,7 +240,7 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ } # ends history's constituent time pts time points (e.g., 6, 15, 24) # finding ids who met criteria for that history - ids <- data[["ID"]][data$flag == t] + ids <- data[["ID"]][!is.na(data$flag) & data$flag == t] # labels those ids w/ that history prop_weights[["history"]][prop_weights[["ID"]] %in% ids] <- paste(his, collapse = ",") @@ -248,7 +248,7 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ } # ends history loop (e.g., "l-l-l") prop_sum <- aggregate(exposure ~ as.factor(history), data = prop_weights, - FUN = function(x) n = length(x)) + FUN = length) # GET BALANCE STATISTICS FOR T>1 (when there is a history to weight on) if (length(lagged_time_pts) > 0) { @@ -258,11 +258,11 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ # Removing any histories that only have 1 or 0 person contributing (cannot calc bal stats) - if (sum(prop_sum$n == 1) > 0 || sum(prop_sum$n == 0) > 0) { + if (any(prop_sum$n == 1) || any(prop_sum$n == 0)) { - ommitted_histories <- as.character(as.data.frame(prop_sum)[prop_sum$n == 1 | prop_sum$n == 0, 1]) + omitted_histories <- as.character(as.data.frame(prop_sum)[[1]][prop_sum$n == 1 | prop_sum$n == 0]) - if (data_type == "imputed"){ + if (data_type == "imputed") { # cat(paste0("USER ALERT: the following history/histories, ", ommitted_histories, # ", has/have been omitted from balance checking for exposure ", exposure, # ", imputation ", k, ", at time point ", exposure_time_pt)) @@ -270,7 +270,7 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ balance checking for exposure %s imputation %s at time point %s:", omitted_histories, exposure, k, exposure_time_pt)) } - else{ + else { # cat(paste0("USER ALERT: the following history/histories, ", ommitted_histories, # ", has/have been omitted from balance checking for exposure ", exposure, # " at time point ", exposure_time_pt)) @@ -279,17 +279,18 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ omitted_histories, exposure, exposure_time_pt)) } - temp <- temp[!temp$history %in% ommitted_histories, ] + temp <- temp[!temp$history %in% omitted_histories, , drop = FALSE] } #ends hist exc # Unweighted pre-balance checking if (!weighted) { # no IPTW weighting but weighting on history + if (exposure_type == "continuous") { bal_stats <- sapply(sort(unique(temp$history)), function(i) { # finding balance by history - temp2 <- temp[temp$history == i, , drop = FALSE ] + temp2 <- temp[temp$history == i, , drop = FALSE] - cobalt::col_w_cov(temp2[covars], temp2[[paste0(exposure, ".", exposure_time_pt)]], std = FALSE) + cobalt::col_w_cov(temp2[covars], temp2[[exposure_name]], std = FALSE) # #should be same length as covars (already have factors split up) # cobalt::col_w_cov(temp2[, c(covars)], temp2[, paste0(exposure, ".", exposure_time_pt)], std = FALSE, # finding covariance @@ -305,9 +306,10 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ bal_stats$std_bal_stats <- weighted_bal_stats / - (sapply(seq(nrow(bal_stats)), function(x) { #issue: looking in data for unweighted vals but factors have additional vars - sd(as.numeric(data2[, rownames(bal_stats)[x]]), na.rm = TRUE) }) *# unweighted covar sd - sd(data[, paste0(exposure, ".", exposure_time_pt)], na.rm = TRUE)) # exposure SD at that time pt + (sapply(rownames(bal_stats), function(x) { #issue: looking in data for unweighted vals but factors have additional vars + sd(as.numeric(data2[[x]]), na.rm = TRUE) + }) *# unweighted covar sd + sd(data[[exposure_name]], na.rm = TRUE)) # exposure SD at that time pt #temp error warning re: factor w/ multiple levels creating different numbers of variables per history --makes bal_stats a list and breaks std code if (!is.list(bal_stats)) { @@ -316,7 +318,7 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ } - bal_stats <- subset(bal_stats, select = grepl("std", colnames(bal_stats) )) + bal_stats <- bal_stats[startsWith(names(bal_stats), "std")] } #ends continuous @@ -325,7 +327,7 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ temp2 <- temp[temp$history == i, , drop = FALSE ] - cobalt::col_w_smd(temp2[, c(covars)], temp2[, paste0(exposure, ".", exposure_time_pt)], std = FALSE) # finding mean difference + cobalt::col_w_smd(temp2[covars], temp2[[exposure_name]], std = FALSE) # finding mean difference # cobalt::col_w_smd(temp2[, c(covars)], temp2[, paste0(exposure, ".", exposure_time_pt)], std = FALSE, # finding mean difference # subset = temp2$history[temp2$history == i] == i) # subsetting by that history @@ -340,17 +342,15 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ # standardizing balance statistics after finding weighted balance stats bal_stats$std_bal_stats <- weighted_bal_stats/ - sapply(seq(ncol(data[, covars])), function(x){ + sapply(covars, function(x) { sqrt(mean( #dividing by pool SD estimate (unadjusted) - var(as.numeric(data[data[, (colnames(data) %in% paste0(exposure, ".", - exposure_time_pts[1]))] == 1 , colnames(data) %in% covars[x]])), #treated var - var(as.numeric(data[data[, (colnames(data) == paste0(exposure, ".", - exposure_time_pts[1]))] == 0 , colnames(data) %in% covars[x]])) #untreated var + var(as.numeric(data[[x]][data[[exposure_name1]] == 1])), #treated var + var(as.numeric(data[[x]][data[[exposure_name1]] == 0])) #untreated var )) }) #temp error warning re: factor w/ multiple levels creating different numbers of variables per history --makes bal_stats a list and breaks std code - if(inherits(bal_stats, "list")){ + if (!is.data.frame(bal_stats)) { stop("There are unequal numbers of variables across histories, likely due to an ordinal variable with several levels denoted as a factor.", call. = FALSE) } @@ -358,14 +358,13 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ # For a weighted_bal_stat of 0, make std stat also 0 so as not to throw an error bal_stats$std_bal_stats[is.nan(bal_stats$std_bal_stats)] <- 0 - bal_stats <- subset(bal_stats, select = grepl("std", colnames(bal_stats) )) + bal_stats <- bal_stats[startsWith(names(bal_stats), "std")] } #ends binary } #ends weighted=0 - # Weighted balance checking - else if (weighted) { # if weighted, use IPTW weights from weightitmsm and weight by history + else { # if weighted, use IPTW weights from weightitmsm and weight by history if (exposure_type == "continuous") { # finds balance for each covariate clustered/subset by history @@ -373,13 +372,13 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ temp2 <- temp[temp$history == i,, drop = FALSE] - cobalt::col_w_cov(temp2[, c(covars)], temp2[, paste0(exposure, ".", exposure_time_pt)], std = FALSE, # finding covariance + cobalt::col_w_cov(temp2[covars], temp2[[exposure_name]], std = FALSE, # finding covariance # subset = temp2$history[temp2$history == i] == i, # subsetting by that history - weights = temp2[, "weights"]) # adding IPTW weights + weights = temp2[["weights"]]) # adding IPTW weights }) #temp error warning re: factor w/ multiple levels creating different numbers of variables per history --makes bal_stats a list and breaks std code - if(inherits(bal_stats, "list")){ + if (!is.data.frame(bal_stats)) { stop("There are unequal numbers of variables across histories, likely due to an ordinal variable with several levels denoted as a factor.", call. = FALSE) } @@ -394,9 +393,9 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ # standardizing balance statistics after weighting by history # bal_stats <- bal_stats %>% bal_stats$std_bal_stats <- weighted_bal_stats / - (sapply(seq(nrow(bal_stats)), function(x) { #issue: looking in data for unweighted vals but factors have additional vars - sd(as.numeric(data2[, rownames(bal_stats)[x]]), na.rm = TRUE) }) *# unweighted covar sd - sd(data[, paste0(exposure, ".", exposure_time_pt)], na.rm = TRUE)) # exposure SD at that time pt + (sapply(rownames(bal_stats), function(x) { #issue: looking in data for unweighted vals but factors have additional vars + sd(as.numeric(data2[[x]]), na.rm = TRUE) }) *# unweighted covar sd + sd(data[[exposure_name]], na.rm = TRUE)) # exposure SD at that time pt # For a weighted_bal_stat of 0, make std stat also 0 so as not to throw an error @@ -404,7 +403,7 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ # bal_stats <- bal_stats %>% # dplyr::select(contains(c("std"))) - bal_stats <- subset(bal_stats, select = grepl("std", colnames(bal_stats) )) + bal_stats <- bal_stats[startsWith(names(bal_stats), "std")] } #ends continuous @@ -414,13 +413,13 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ temp2 <- temp[temp$history == i,, drop = FALSE] - cobalt::col_w_smd(temp2[, c(covars)], temp2[, paste0(exposure, ".", exposure_time_pt)], std = FALSE, # finding mean difference + cobalt::col_w_smd(temp2[covars], temp2[[exposure_name]], std = FALSE, # finding mean difference # subset = temp2$history[temp2$history == i] == i, # subsetting by that history - weights = temp2[, "weights"]) # adding IPTW weights + weights = temp2[["weights"]]) # adding IPTW weights }) #temp error warning re: factor w/ multiple levels creating different numbers of variables per history --makes bal_stats a list and breaks std code - if(inherits(bal_stats, "list")){ + if (!is.data.frame(bal_stats)) { stop("There are unequal numbers of variables across histories, likely due to an ordinal variable with several levels denoted as a factor.", call. = FALSE) } @@ -434,22 +433,17 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ # standardizing balance statistics after finding weighted balance stats bal_stats$std_bal_stats <- weighted_bal_stats/ - sapply(seq(ncol(data[, covars])), function(x){ + sapply(covars, function(x) { sqrt(mean( #dividing by pool SD estimate (unadjusted) - var(as.numeric(data[data[, (colnames(data) %in% paste0(exposure, ".", - exposure_time_pts[1]))] == 1 , colnames(data) %in% covars[x]])), #treated var - var(as.numeric(data[data[, (colnames(data) == paste0(exposure, ".", - exposure_time_pts[1]))] == 0 , colnames(data) %in% covars[x]])) #untreated var + var(as.numeric(data[[x]][data[[exposure_name1]] == 1])), #treated var + var(as.numeric(data[[x]][data[[exposure_name1]] == 0])) #untreated var )) }) # For a weighted_bal_stat of 0, make std stat also 0 so as not to throw an error bal_stats$std_bal_stats[is.nan(bal_stats$std_bal_stats)] <- 0 - # For a weighted_bal_stat of 0, make std stat also 0 so as not to throw an error - bal_stats$std_bal_stats[is.nan(bal_stats$std_bal_stats)] <- 0 - - bal_stats <- subset(bal_stats, select = grepl("std", colnames(bal_stats) )) + bal_stats <- bal_stats[startsWith(names(bal_stats), "std")] } #ends binary } #ends weighted @@ -467,11 +461,11 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ data$ID <- as.numeric(data$ID) f_vars <- colnames(data)[sapply(data, is.factor)] - if(length(f_vars) > 0){ + if (length(f_vars) > 0) { f_stats <- bal_stats[rownames(bal_stats)[sapply(strsplit(rownames(bal_stats), "_"), "[", 1) %in% f_vars], ] f_stats$name <- sapply(strsplit(rownames(f_stats), "_"), "[", 1) test <- aggregate(std_bal_stats ~ name, data = f_stats, - FUN = function(x) new = mean(x)) + FUN = mean) colnames(test) <- c("covariate", "std_bal_stats") new <- data.frame(std_bal_stats = test$std_bal_stats, @@ -482,11 +476,11 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ } #adds custom bal thresh info - if (!is.null(imp_conf)){ + if (!is.null(imp_conf)) { bal_stats$bal_thresh <- ifelse(bal_stats$covariate %in% imp_conf, balance_thresh[1], balance_thresh[2]) bal_stats$balanced <- ifelse(abs(bal_stats$std_bal_stats) < bal_stats$bal_thresh, 1, 0) } - else{ + else { bal_stats$bal_thresh <- balance_thresh bal_stats$balanced <- ifelse(abs(bal_stats$std_bal_stats) < bal_stats$bal_thresh, 1, 0) @@ -507,7 +501,7 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ if (verbose & save.out) { - if (data_type == "imputed"){ + if (data_type == "imputed") { cat(paste0("For each time point and imputation, %s summary plots for %s formulas weighting method %s have now been saved in the %s plots/' folder.\n", @@ -526,13 +520,13 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ # Summarizing balance bal_summary_exp <- as.data.frame(aggregate(balanced ~ exp_time, data = all_bal_stats, FUN = function(x) c(balanced_n = sum(x == 1), - imbalanced_n = sum(x == 0), - n = length(x)) )) - bal_summary_exp <- do.call(data.frame, bal_summary_exp) - colnames(bal_summary_exp) <- c("exp_time", "balanced_n", "imbalanced_n", "n") + imbalanced_n = sum(x == 0), + n = length(x)))) + bal_summary_exp <- do.call(data.frame, bal_summary_exp) #? + names(bal_summary_exp) <- c("exp_time", "balanced_n", "imbalanced_n", "n") - if (save.out){ + if (save.out) { write.csv(bal_summary_exp, sprintf("%s/balance/%s%s_%s_%s_%s_balance_stat_summary.csv", home_dir,folder, form_name, exposure, k, weights_method)) @@ -541,9 +535,9 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ sprintf("%s/balance/%s/%s_form_%s_%s_%s_history_sample_weight.csv", home_dir, folder, form_name, exposure, k, weights_method)) - if (verbose){ + if (verbose) { cat("\n") - if (data_type == "imputed"){ + if (data_type == "imputed") { cat(sprintf("Balance statistics using %s formulas for %s imputation %s, using %s have been saved in the 'balance/%s' folder. \n", @@ -552,7 +546,7 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ cat(sprintf("Sampling weights using the %s for %s imputation %s have been saved in the 'balance/%s' folder., \n", form_name, exposure, k, folder)) } - else{ + else { cat(sprintf("Balance statistics using %s formulas for %s using %s have been saved in the 'balance/%s' folder. \n", @@ -567,7 +561,7 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ # tallies total possible COVARIATES FROM FORM FOR ASSESSING BALANCE all_form <- as.data.frame(do.call(rbind, formulas)) - tot_covars <- deparse(all_form[, 3], width.cutoff = 300) + tot_covars <- deparse1(all_form[, 3]) tot_covars <- as.character(unlist(strsplit(tot_covars, "\\+")))[ !grepl("form", as.character(unlist(strsplit(tot_covars, "\\+"))))] tot_covars <- gsub(" ", "", tot_covars) @@ -578,7 +572,7 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ total_covars <- sum(bal_summary_exp$n, na.rm = TRUE) total_domains <- length(tot_covars) - if (imbalanced_covars > 0){ + if (imbalanced_covars > 0) { percentage_imbalanced <- round((imbalanced_covars / total_covars) * 100, 0) remaining_imbalanced_domains <- length(sapply(strsplit(all_bal_stats[all_bal_stats$balanced == 0, "covariate"], "\\."), @@ -604,7 +598,7 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ round(max(all_bal_stats$std_bal_stats), 2))) cat("\n") - if (imbalanced_covars > 0){ + if (imbalanced_covars > 0) { cat(sprintf("As shown below, %s out of %s ( %s%%) covariates across time points, corresponding to %sout of %s domains, remain imbalanced with a remaining median absolute value correlation/std mean difference of %s (range= %s):\n", imbalanced_covars, @@ -621,7 +615,7 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ k, weights_method, form_name), format = 'pipe'), sep = "\n") } - else{ + else { cat(sprintf("No covariates remain imbalanced for imputation %s using %s and %s formulas. \n", k, weights_method, form_name)) } @@ -629,9 +623,10 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ cat("\n") cat("\n") - } else { + } + else { - if (imbalanced_covars > 0){ + if (imbalanced_covars > 0) { cat(sprintf("As shown below, %s out of %s ( %s%%) covariates across time points, corresponding to %s out of %s domains, remain imbalanced with a remaining median absolute value correlation/std mean difference of %s (range= %s):\n", imbalanced_covars, @@ -647,12 +642,11 @@ calcBalStats <- function(home_dir = NA, data, formulas, exposure, exposure_time_ caption = sprintf("Imbalanced covariates using %s and %s formulas", weights_method, form_name), format = 'pipe'), sep = "\n") } - else{ + else { cat(sprintf("No covariates remain imbalanced using %s and %s formulas. \n", weights_method, form_name)) } - cat("\n") - cat("\n") + cat("\n\n") } } diff --git a/R/createFormulas.R b/R/createFormulas.R index 33e45648..2f59b3e6 100644 --- a/R/createFormulas.R +++ b/R/createFormulas.R @@ -196,8 +196,8 @@ createFormulas <- function(home_dir, exposure, exposure_time_pts, outcome, type, } } - factor_covariates <- colnames(data)[which(sapply(data, class) == "factor")] - factor_covariates <- factor_covariates[!factor_covariates %in% "ID"] + factor_covariates <- colnames(data)[sapply(data, is.factor)] + factor_covariates <- factor_covariates[factor_covariates != "ID"] forms_csv <- character() forms <- list() @@ -304,7 +304,7 @@ createFormulas <- function(home_dir, exposure, exposure_time_pts, outcome, type, #adding in any user-specified confounders to retain in all formulas if(!is.null(keep_conf)){ - if(!inherits(keep_conf, "character")){ + if(!is.character(keep_conf)){ stop("Please provide as a character string a list of confounders to include in all formulas.", call. = FALSE) } if(sum(keep_conf %in% tv_confounders) != length(keep_conf)){ @@ -332,11 +332,13 @@ createFormulas <- function(home_dir, exposure, exposure_time_pts, outcome, type, } # Appends the form string to forms_csv - forms_csv <- c(forms_csv, - sprintf("%s formula for %s-%s at %s time point %s:", - type, exposure, outcome, exposure, as.character(time))) + if (save.out) { + forms_csv <- c(forms_csv, + sprintf("%s formula for %s-%s at %s time point %s:", + type, exposure, outcome, exposure, as.character(time))) - forms_csv <- c(forms_csv, paste(exposure, "~", paste0(vars_to_include[order(vars_to_include)], sep = "", collapse = " + "))) + forms_csv <- c(forms_csv, paste(exposure, "~", paste0(vars_to_include[order(vars_to_include)], sep = "", collapse = " + "))) + } # Assigns the form to forms list forms[[paste(type, "_form", "-", time, sep = "")]] <- f diff --git a/R/fitModel.R b/R/fitModel.R index bf22d5d2..e4facc38 100644 --- a/R/fitModel.R +++ b/R/fitModel.R @@ -171,6 +171,7 @@ fitModel <- function(home_dir, data, weights, exposure, exposure_time_pts, outco if (!is.character(model)){ stop('Please provide as a character string a valid model "m" from 0-3 (e.g., "m1")', call. = FALSE) + } else if (!is.character(model) || length(model) != 1){ stop('Please provide a single outcome model selection "m" from 0-3 (e.g., "m1")', call. = FALSE) } From 03b247729e99e8e45fba2208313ed3898a8f7df4 Mon Sep 17 00:00:00 2001 From: Noah Greifer Date: Fri, 29 Sep 2023 13:22:42 -0400 Subject: [PATCH 3/3] Doc updates --- NAMESPACE | 3 +++ 1 file changed, 3 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index e5760d91..0089642b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,4 +17,7 @@ export(get_reference_values) export(make_love_plot) export(perform_multiple_comparison_correction) export(trimWeights) +import(ggplot2) +import(stats) +import(utils) importFrom(survey,svyglm)