From eb6e6c2a4f4009d644d634eb2fb595b8ae98c693 Mon Sep 17 00:00:00 2001 From: "Isabella Stallworthy, PhD" Date: Tue, 28 Nov 2023 11:10:00 -0500 Subject: [PATCH] data documentation --- R/createFormulas.R | 2 +- R/data.R | 146 +++++++++++++++++++++ R/getModel.R | 141 ++++++++++---------- examplePipelineRevised.Rmd | 23 ++-- man/sim_data_imp_list.rda.Rd | 21 +++ man/sim_data_long_miss.rda.Rd | 20 +++ man/sim_data_long_miss_bin.rda.Rd | 20 +++ man/sim_data_mice.rda.Rd | 20 +++ man/sim_data_wide.rda.Rd | 63 +++++++++ man/sim_data_wide_bin.rda.Rd | 23 ++++ man/sim_data_wide_miss.rda.Rd | 20 +++ man/sim_data_wide_miss_bin.rda.Rd | 20 +++ vignettes/Preliminary_Steps.Rmd | 23 ++-- vignettes/Workflow_Continuous_Exposure.Rmd | 16 ++- 14 files changed, 456 insertions(+), 102 deletions(-) create mode 100644 R/data.R create mode 100644 man/sim_data_imp_list.rda.Rd create mode 100644 man/sim_data_long_miss.rda.Rd create mode 100644 man/sim_data_long_miss_bin.rda.Rd create mode 100644 man/sim_data_mice.rda.Rd create mode 100644 man/sim_data_wide.rda.Rd create mode 100644 man/sim_data_wide_bin.rda.Rd create mode 100644 man/sim_data_wide_miss.rda.Rd create mode 100644 man/sim_data_wide_miss_bin.rda.Rd diff --git a/R/createFormulas.R b/R/createFormulas.R index 076ce29e..7f559332 100644 --- a/R/createFormulas.R +++ b/R/createFormulas.R @@ -413,7 +413,7 @@ createFormulas <- function(exposure, exposure_time_pts, outcome, type, ti_confou if (verbose) { message("The user-supplied custom balancing formula for each exposure time point are below: ") - lapply(formulas, print) + lapply(forms, print) } } diff --git a/R/data.R b/R/data.R new file mode 100644 index 00000000..8545e019 --- /dev/null +++ b/R/data.R @@ -0,0 +1,146 @@ +#' Wide complete data (continuous exposure) +#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families +#' representative of two geographic areas (three counties in North Carolina and three counties in 
Pennsylvania) with high rural +#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). These data contain economic strain (ESETA1) as a continuously +#' distributed variable. +#' +#' @name sim_data_wide.rda +#' @docType data +#' @format A wide data frame of 1,292 observations +#' There are 69 measured variables. +#' \itemize{ +#' \item "ESETA1" is the continuous exposure of economic strain +#' \item "StrDif_Tot.58" is the continuous outcome of behavioral problems +#' \item "InRatioCor" is the income-to-needs ratio +#' \item "PmEd2" is the parent's education level +#' \item "state" is the family's state of residence +#' \item "TcBlac2" is the family's race (1 = x, 0 = y) +#' \item "bioDadInHH2" is whether the biological father lives with the family (insert coding) +#' \item "HomeOwnd" indicator of whether family owns home (insert coding) +#' \item "KFASTScr" +#' \item "PmBlac2" primary caregiver race (insert coding) +#' \item "SmokTotl" +#' \item "caregiv_health" +#' \item "gov_assist" +#' \item "ALI_LE" +#' \item "B18Raw" +#' \item "CORTB" +#' \item "EARS_TJo" +#' \item "fscore" +#' \item "HOMEETA1" +#' \item "IBRAttn" +#' \item "LESMnNeg" +#' \item "MDI" +#' \item "RHAsSO" +#' \item "SAAmylase" +#' \item "WndNbrhood" +#' } +#' @references Vernon-Feagans, L., Cox, M., Willoughby, M., Burchinal, M., Garrett-Peters, P., Mills-Koonce, R., +#' Garrett-Peiers, P., Conger, R. D., & Bauer, P. J. (2013). The Family Life Project: An Epidemiological and +#' Developmental Study of Young Children Living in Poor Rural Communities. +#' Monographs of the Society for Research in Child Development, 78(5), i–150. +#' +#' Burchinal, M., Howes, C., Pianta, R., Bryant, D., Early, D., Clifford, R., & Barbarin, O. (2008). +#' Predicting Child Outcomes at the End of Kindergarten from the Quality of Pre-Kindergarten Teacher–Child Interactions and +#' Instruction. Applied Developmental Science, 12(3), 140–153. 
https://doi.org/10.1080/10888690802199418 +#' +#' @keywords datasets +"sim_data_wide" + + +#' Wide complete data (binary exposure) +#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families +#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural +#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). These data contain economic strain (ESETA1) as a binary variable. +#' +#' @name sim_data_wide_bin.rda +#' @docType data +#' @format A data frame +#' +"sim_data_wide_bin" + + +#' Wide data with missingness (continuous exposure) +#' +#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families +#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural +#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package. +#' These data contain economic strain (ESETA1) as a continuously distributed variable. +#' +#' @name sim_data_wide_miss.rda +#' @docType data +#' @format A data frame +#' +"sim_data_wide_miss" + + +#' Wide data with missingness (binary exposure) +#' +#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families +#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural +#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package. +#' These data contain economic strain (ESETA1) as a binary variable. 
+#' +#' @name sim_data_wide_miss_bin.rda +#' @docType data +#' @format A data frame +#' +"sim_data_wide_miss_bin" + + +#' Long data with missingness (continuous exposure) +#' +#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families +#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural +#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package. +#' These data contain economic strain (ESETA1) as a continuously distributed variable. +#' +#' @name sim_data_long_miss.rda +#' @docType data +#' @format A data frame +#' +"sim_data_long_miss" + + +#' Long data with missingness (binary exposure) +#' +#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families +#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural +#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package. +#' These data contain economic strain (ESETA1) as a binary variable. +#' +#' @name sim_data_long_miss_bin.rda +#' @docType data +#' @format A data frame +#' +"sim_data_long_miss_bin" + + +#' Wide data imputed with mice (continuous exposure) +#' +#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families +#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural +#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package before +#' imputing with the mice package. These data contain economic strain (ESETA1) as a continuously distributed variable. 
+#' +#' @name sim_data_mice.rda +#' @docType data +#' @format A mice object +#' +"sim_data_mice" + + +#' Wide data imputed and read in (continuous exposure) +#' +#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families +#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural +#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package before +#' imputing with the mice package and reading in each imputed dataset. These data contain economic strain (ESEATA1) as a continuously +#' distributed variable. +#' +#' @name sim_data_imp_list.rda +#' @docType data +#' @format A list of data frames +#' +"sim_data_imp_list" + diff --git a/R/getModel.R b/R/getModel.R index e6c8af26..756200c0 100644 --- a/R/getModel.R +++ b/R/getModel.R @@ -127,6 +127,7 @@ getModel <- function(d, exposure, exposure_time_pts, outcome, exp_epochs, #split factors factor_covariates <- names(d)[sapply(d, is.factor)] factor_covariates <- setdiff(factor_covariates, "ID") + if (length(factor_covariates) > 0) { d <- cobalt::splitfactor(d, factor_covariates, drop.first = "if2") @@ -134,76 +135,78 @@ getModel <- function(d, exposure, exposure_time_pts, outcome, exp_epochs, %in% factor_covariates] } - if (!missing(covariates)) { - if (any(grepl("\\:", covariates))) { - ints <- covariates[grepl("\\:", covariates)] - - #making interactions w/ split factors - - for (x in seq_len(length(ints))) { - vars <- as.character(unlist(strsplit(ints[x], "\\:"))) - num_comp <- length(vars) - - f_vars <- NULL - if (any(vars %in% factor_covariates)) { - vars <- do.call(c, lapply(vars, function(y) { - if (y %in% factor_covariates) { - f_vars <- factors_split[sapply(strsplit(factors_split, "\\_"), "[", 1) %in% y] - y <- f_vars } - y - })) - } - - if (any(as.logical(unlist(lapply(vars, function(x) { - any(!x %in% 
names(d))}))))) { - stop("Please only include covariate interactions between variables in your data", - call. = FALSE) - } - - ints2 <- combn(vars, num_comp) - ints2 <- as.data.frame(ints2[, sapply(strsplit(ints2[1, ], "\\_"), "[", 1) != - sapply(strsplit(ints2[2, ], "\\_"), "[", 1)]) - ints2 <- unlist(lapply(1:ncol(ints2), - function(y) {paste(ints2[, y], collapse = ":")} )) - ints2 <- ints2[!duplicated(ints2)] - - prods <- lapply(ints2, function(z) { - v <- as.character(unlist(strsplit(z, "\\:"))) - temp <- as.data.frame(d[, v]) - prod <- apply(as.matrix(temp), 1, prod) - prod - }) - prods <- do.call(rbind.data.frame, prods) - prods <- as.data.frame(t(prods)) - names(prods) <- ints2 - - #make factor class if both components are factors - for (f in seq_len(length(ints2))) { - vars <- as.character(unlist(strsplit(ints2[f], "\\:"))) - if (all(vars %in% factor_covariates)) { - prods[, names(prods)[any(as.logical(unlist(lapply(names(prods), function(k) { - as.character(unlist(strsplit(k, "\\:"))) %in% f_vars}))))]] <- - as.data.frame(lapply(prods[, names(prods)[any(as.logical(unlist(lapply(names(prods),function(l) { - as.character(unlist(strsplit(l, "\\:"))) %in% f_vars}))))]], - as.factor)) - } - } - #adding to dataset - - d <- cbind(d, prods) - } - } - - covariates <- c(covariates[!grepl("\\:", covariates)], - names(d)[grepl("\\:", names(d))]) - } + # if (!missing(covariates)) { + # if (any(grepl("\\:", covariates))) { + # ints <- covariates[grepl("\\:", covariates)] + # + # #making interactions w/ split factors + # + # for (x in seq_len(length(ints))) { + # vars <- as.character(unlist(strsplit(ints[x], "\\:"))) + # num_comp <- length(vars) + # + # f_vars <- NULL + # if (any(vars %in% factor_covariates)) { + # vars <- do.call(c, lapply(vars, function(y) { + # if (y %in% factor_covariates) { + # f_vars <- factors_split[sapply(strsplit(factors_split, "\\_"), "[", 1) %in% y] + # y <- f_vars } + # y + # })) + # } + # + # if (any(as.logical(unlist(lapply(vars, 
function(x) { + # any(!x %in% names(d))}))))) { + # stop("Please only include covariate interactions between variables in your data", + # call. = FALSE) + # } + # + # ints2 <- combn(vars, num_comp) + # ints2 <- as.data.frame(ints2[, sapply(strsplit(ints2[1, ], "\\_"), "[", 1) != + # sapply(strsplit(ints2[2, ], "\\_"), "[", 1)]) + # ints2 <- unlist(lapply(1:ncol(ints2), + # function(y) {paste(ints2[, y], collapse = ":")} )) + # ints2 <- ints2[!duplicated(ints2)] + # + # prods <- lapply(ints2, function(z) { + # v <- as.character(unlist(strsplit(z, "\\:"))) + # temp <- as.data.frame(d[, v]) + # prod <- apply(as.matrix(temp), 1, prod) + # prod + # }) + # prods <- do.call(rbind.data.frame, prods) + # prods <- as.data.frame(t(prods)) + # names(prods) <- ints2 + # + # #make factor class if both components are factors + # for (f in seq_len(length(ints2))) { + # vars <- as.character(unlist(strsplit(ints2[f], "\\:"))) + # if (all(vars %in% factor_covariates)) { + # prods[, names(prods)[any(as.logical(unlist(lapply(names(prods), function(k) { + # as.character(unlist(strsplit(k, "\\:"))) %in% f_vars}))))]] <- + # as.data.frame(lapply(prods[, names(prods)[any(as.logical(unlist(lapply(names(prods),function(l) { + # as.character(unlist(strsplit(l, "\\:"))) %in% f_vars}))))]], + # as.factor)) + # } + # } + # #adding to dataset + # + # d <- cbind(d, prods) + # } + # } + # + # covariates <- c(covariates[!grepl("\\:", covariates)], + # names(d)[grepl("\\:", names(d))]) + # } + # Covariate models checking if (model %in% c("m1", "m3", "covs")) { if (any(grepl("\\.", covariates))) { - tv_cov <- covariates[grepl("\\.", covariates)] + cov <- as.character(unlist(strsplit(covariates, "\\:"))) + tv_cov <- cov[grepl("\\.", cov)] if (any(as.numeric(gsub("_.*", "", sub(".*\\.(.)", "\\1", as.character(unlist(strsplit(tv_cov, "\\:")))))) > exposure_time_pts[1])) { @@ -213,10 +216,10 @@ getModel <- function(d, exposure, exposure_time_pts, outcome, exp_epochs, } } - if 
(!all(covariates[!grepl("\\:", covariates)] %in% colnames(d))) { - stop("Please only include covariates that correspond to variables in the wide dataset.", - call. = FALSE) - } + # if (!all(covariates[!grepl("\\:", covariates)] %in% colnames(d))) { + # stop("Please only include covariates that correspond to variables in the wide dataset.", + # call. = FALSE) + # } covariate_list <- paste(c(as.character(covariates)), sep = "", collapse = " + ") @@ -244,7 +247,7 @@ getModel <- function(d, exposure, exposure_time_pts, outcome, exp_epochs, collapse = " + " ) - #create interactions in data + #create exposure main effect interactions in data for (x in seq_along(unlist(strsplit(interactions, "\\+")))) { name <- gsub(" ", "", unlist(strsplit(interactions, "\\+"))[x]) diff --git a/examplePipelineRevised.Rmd b/examplePipelineRevised.Rmd index f794a4ce..55378bac 100644 --- a/examplePipelineRevised.Rmd +++ b/examplePipelineRevised.Rmd @@ -370,11 +370,11 @@ concur_conf <- "B18Raw.15" #optional custom formulas -custom <- NULL #empirical example #note: below is an example of just two of the entries in the list of a custom formula --you would need to make a list that contains entries for each exposure time point that mimics the output of createFormulas. <--Run createFormulas to see what it should look like. 
custom <- list("full_form-6" = as.formula("ESETA1.6 ~ BioDadInHH2 + DrnkFreq + gov_assist"), "full_form-15" = as.formula("ESETA1.15 ~ BioDadInHH2 + DrnkFreq + gov_assist") -) #add warning about future variables +) +custom <- NULL #empirical example #required @@ -424,19 +424,19 @@ If you specify concurrent confounders to retain and/or confounders to keep in th ```{r} #optional list of concurrent confounder -concur_conf <- NULL #empirical example concur_conf <- "B18Raw.15" +concur_conf <- NULL #empirical example #optional list of tv confounders to always retain (lag t-1) -keep_conf <- NULL #empirical example keep_conf <- "InRatioCor.6" +keep_conf <- NULL #empirical example #optional custom formulas -custom <- NULL #empirical example #note: below is an example of just two of the entries in the list of a custom formula --you would need to make a list that contains entries for each exposure time point that mimics the output of createFormulas. <--Run createFormulas to see what it should look like. custom <- list("short_form-6" = as.formula("ESETA1.6 ~ BioDadInHH2 + DrnkFreq + gov_assist"), "short_form-15" = as.formula("ESETA1.15 ~ BioDadInHH2 + DrnkFreq + gov_assist") ) +custom <- NULL #empirical example #required @@ -455,7 +455,6 @@ short_formulas <- createFormulas(exposure = exposure, exposure_time_pts = exposu ```{r} -#required; say what default estimand is in weightitMSM(); some dont work for all methods --add to supplement formulas <- short_formulas # optional weighting method "glm", "gbm", "bart", "super", "cbps" (default is cbps) @@ -586,19 +585,19 @@ If you specify concurrent confounders to retain and/or confounders to keep in ba ```{r} #optional custom formulas -custom <- NULL #empirical example #note: below is an example of just two of the entries in the list of a custom formula --you would need to make a list that contains entries for each exposure time point that mimics the output of createFormulas. <--Run createFormulas to see what it should look like. 
custom <- list("update_form-6" = as.formula("ESETA1.6 ~ BioDadInHH2 + DrnkFreq + gov_assist"), "update_form-15" = as.formula("ESETA1.15 ~ BioDadInHH2 + DrnkFreq + gov_assist") ) +custom <- NULL #empirical example #optional list of concurrent confounder -concur_conf <- NULL #empirical example concur_conf <- "B18Raw.15" +concur_conf <- NULL #empirical example #optional list of tv confounders to always retain (lag t-1) -keep_conf <- NULL #empirical example keep_conf <- "InRatioCor.6" +keep_conf <- NULL #empirical example #required @@ -755,11 +754,7 @@ int_order <- 2 #covariates (required for covariate models m1, m3) covariates <- NULL -covariates <- imbalanced_covariates -covariates <- c("ESETA1.6", "gov_assist", "B18Raw.6") -covariates <- c("ESETA1.6", "state:SmokTotl", "PmAge2:PmBlac2", "ESETA1.6:B18Raw.6:RHasSO.6") #testing interactions -covariates <- c("ESETA1.6", "InRatioCor.6", "gov_assist","PmEd2") #empirical example -covariates <- c("PmEd2") #empirical example +covariates <- c("ESETA1.6", "gov_assist", "B18Raw.6", "gov_assist:B18Raw.6", "ESETA1.6:B18Raw.6") #optional specification of epochs diff --git a/man/sim_data_imp_list.rda.Rd b/man/sim_data_imp_list.rda.Rd new file mode 100644 index 00000000..96d164e4 --- /dev/null +++ b/man/sim_data_imp_list.rda.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{sim_data_imp_list.rda} +\alias{sim_data_imp_list.rda} +\alias{sim_data_imp_list} +\title{Wide data imputed and read in (continuous exposure)} +\format{ +A list of data frames +} +\usage{ +sim_data_imp_list +} +\description{ +These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families +representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural +child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). 
MAR missingness has been added using the missMethods package before +imputing with the mice package and reading in each imputed dataset. These data contain economic strain (ESEATA1) as a continuously +distributed variable. +} +\keyword{datasets} diff --git a/man/sim_data_long_miss.rda.Rd b/man/sim_data_long_miss.rda.Rd new file mode 100644 index 00000000..34c34646 --- /dev/null +++ b/man/sim_data_long_miss.rda.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{sim_data_long_miss.rda} +\alias{sim_data_long_miss.rda} +\alias{sim_data_long_miss} +\title{Long data with missingness (continuous exposure)} +\format{ +A data frame +} +\usage{ +sim_data_long_miss +} +\description{ +These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families +representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural +child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package. +These data contain economic strain (ESEATA1) as a continuously distributed variable. 
+} +\keyword{datasets} diff --git a/man/sim_data_long_miss_bin.rda.Rd b/man/sim_data_long_miss_bin.rda.Rd new file mode 100644 index 00000000..7451efd6 --- /dev/null +++ b/man/sim_data_long_miss_bin.rda.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{sim_data_long_miss_bin.rda} +\alias{sim_data_long_miss_bin.rda} +\alias{sim_data_long_miss_bin} +\title{Long data with missingness (binary exposure)} +\format{ +A data frame +} +\usage{ +sim_data_long_miss_bin +} +\description{ +These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families +representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural +child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package. +These data contain economic strain (ESEATA1) as a binary variable. +} +\keyword{datasets} diff --git a/man/sim_data_mice.rda.Rd b/man/sim_data_mice.rda.Rd new file mode 100644 index 00000000..448640fe --- /dev/null +++ b/man/sim_data_mice.rda.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{sim_data_mice.rda} +\alias{sim_data_mice.rda} +\alias{sim_data_mice} +\title{Wide data imputed with mice (continuous exposure)} +\format{ +A mice object +} +\usage{ +sim_data_mice +} +\description{ +These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families +representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural +child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package before +imputing with the mice package. 
These data contain economic strain (ESEATA1) as a continuously distributed variable. +} +\keyword{datasets} diff --git a/man/sim_data_wide.rda.Rd b/man/sim_data_wide.rda.Rd new file mode 100644 index 00000000..8f5ce652 --- /dev/null +++ b/man/sim_data_wide.rda.Rd @@ -0,0 +1,63 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{sim_data_wide.rda} +\alias{sim_data_wide.rda} +\alias{sim_data_wide} +\title{Wide complete data (continuous exposure) +These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families +representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural +child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). These data contain economic strain (ESEATA1) as a continuously +distributed variable.} +\format{ +A wide data frame of 1,292 observations +There are 69 measured variables. +\itemize{ +\item "ESETA1" is the continuous exposure of economic strain +\item "StrDif_Tot.58" is the continuous outcome of behavioral problems +\item "InRatioCor" is the income-to-needs ratio +\item "PmEd2" is the parent's education level +\item "state" is the family's state of residence +\item "TcBlac2" is the family's race (1 = x, 0 = y) +\item "bioDadInHH2" is whether the biological father lives with the family (insert coding) +\item "HomeOwnd" indicator of whether family owns home (insert coding) +\item "KFASTScr" +\item "PmBlac2" primary careigver race (insert coding) +\item "SmokTotl" +\item "caregiv_health" +\item "gov_assist" +\item "ALI_LE" +\item "B18Raw" +\item "CORTB" +\item "EARS_TJo" +\item "fscore" +\item "HOMEETA1" +\item "IBRAttn" +\item "LESMnNeg" +\item "MDI" +\item "RHAsSO" +\item "SAAmylase" +\item "WndNbrhood" +} +} +\usage{ +sim_data_wide +} +\description{ +Wide complete data (continuous exposure) +These data are simulated based on data from the Family Life 
Project (FLP), a longitudinal study following 1,292 families +representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural +child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). These data contain economic strain (ESEATA1) as a continuously +distributed variable. +} +\references{ +Vernon-Feagans, L., Cox, M., Willoughby, M., Burchinal, M., Garrett-Peters, P., Mills-Koonce, R., +Garrett-Peiers, P., Conger, R. D., & Bauer, P. J. (2013). The Family Life Project: An Epidemiological and +Developmental Study of Young Children Living in Poor Rural Communities. +Monographs of the Society for Research in Child Development, 78(5), i–150. + +Burchinal, M., Howes, C., Pianta, R., Bryant, D., Early, D., Clifford, R., & Barbarin, O. (2008). +Predicting Child Outcomes at the End of Kindergarten from the Quality of Pre-Kindergarten Teacher–Child Interactions and +Instruction. Applied Developmental Science, 12(3), 140–153. https://doi.org/10.1080/10888690802199418 +} +\keyword{datasets} diff --git a/man/sim_data_wide_bin.rda.Rd b/man/sim_data_wide_bin.rda.Rd new file mode 100644 index 00000000..26275bbf --- /dev/null +++ b/man/sim_data_wide_bin.rda.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{sim_data_wide_bin.rda} +\alias{sim_data_wide_bin.rda} +\alias{sim_data_wide_bin} +\title{Wide complete data (binary exposure) +These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families +representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural +child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). 
These data contain economic strain (ESEATA1) as a binary variable.} +\format{ +A data frame +} +\usage{ +sim_data_wide_bin +} +\description{ +Wide complete data (binary exposure) +These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families +representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural +child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). These data contain economic strain (ESEATA1) as a binary variable. +} +\keyword{datasets} diff --git a/man/sim_data_wide_miss.rda.Rd b/man/sim_data_wide_miss.rda.Rd new file mode 100644 index 00000000..7b3c6613 --- /dev/null +++ b/man/sim_data_wide_miss.rda.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{sim_data_wide_miss.rda} +\alias{sim_data_wide_miss.rda} +\alias{sim_data_wide_miss} +\title{Wide data with missingness (continuous exposure)} +\format{ +A data frame +} +\usage{ +sim_data_wide_miss +} +\description{ +These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families +representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural +child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package. +These data contain economic strain (ESEATA1) as a continuously distributed variable. 
+} +\keyword{datasets} diff --git a/man/sim_data_wide_miss_bin.rda.Rd b/man/sim_data_wide_miss_bin.rda.Rd new file mode 100644 index 00000000..f6d73a71 --- /dev/null +++ b/man/sim_data_wide_miss_bin.rda.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{sim_data_wide_miss_bin.rda} +\alias{sim_data_wide_miss_bin.rda} +\alias{sim_data_wide_miss_bin} +\title{Wide data with missingness (binary exposure)} +\format{ +A data frame +} +\usage{ +sim_data_wide_miss_bin +} +\description{ +These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families +representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural +child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package. +These data contain economic strain (ESEATA1) as a binary variable. +} +\keyword{datasets} diff --git a/vignettes/Preliminary_Steps.Rmd b/vignettes/Preliminary_Steps.Rmd index 4ab1acbe..5eb82f31 100644 --- a/vignettes/Preliminary_Steps.Rmd +++ b/vignettes/Preliminary_Steps.Rmd @@ -9,7 +9,7 @@ output: vignette: > %\VignetteIndexEntry{Preliminary_Steps} - %\VignetteEngine{knitr::rmarkdown} + %\VignetteEngine{knitr::rmarkdown_notangle} %\VignetteEncoding{UTF-8} --- @@ -27,7 +27,7 @@ options(rmarkdown.html_vignette.check_title = FALSE) These four recommended preliminary steps are designed to assist the user in preparing and inspecting their data to ensure appropriate use of the package. Users should first view the Terminology vignette and complete the Data Requirements and Specifying Core Inputs vignettes. 
-The following recommended preliminary steps (using helper functions summarized in Table 1 are not included in the *devMSMs* package that can be found at the following Github are designed to assist the user in preparing and inspecting their data and guide specification of required function inputs to ensure appropriate use of the package. Of note, *devMSMs* must also be installed and loaded to use these helper functions (see Installation). +The helper functions (summarized in Table 1) for the following recommended preliminary steps can be found in the *devMSMsHelpers* GitHub repository (istallworthy/devMSMsHelpers). Of note, *devMSMs* must also be installed and loaded to use these helper functions (see Installation). The user who has already formatted their data in wide format according to the Data Requirements vignette and imputed to accommodate any missing data (P2), can focus only on subsections P3 Identifying Optional Exposure Epochs and P4 Verifying History Distributions prior to using the package. Following completion of this vignette, users should use one of the Workflows vignettes to implement *devMSMs* with their longitudinal data. @@ -37,11 +37,11 @@ The user who has already formatted their data in wide format according to the Da
```{r setup} -install.packages("devtools") -install.packages("stats") +# install.packages("devtools") +# install.packages("stats") -library(devtools) -library(stats) +require(devtools) +# library(stats) devtools::install_github("istallworthy/devMSMs", quiet = TRUE) library(devMSMs) @@ -56,7 +56,7 @@ Choose from the following preliminary steps with the goal of assigning to 'data' - a mids object (output from mice::mice()) of data imputed in wide format - a list of data imputed in wide format as data frames. -Data columns should be either numeric or factor form and the ID column should be numeric. +Data columns should be either numeric, integer, or factor form and the ID column should be numeric. Some helper functions have optional arguments to suppress saving output locally (`save.out = FALSE`) and printing it to the console ( `verbose = FALSE`). The defaults to both arguments are TRUE. Users must supply a path to a home directory if `save.out = TRUE`. @@ -152,6 +152,7 @@ Users with correctly formatted variables in long format have the option of using We then transform our newly formatted long data into wide format. ```{r} +require("stats") v <- sapply(strsplit(tv_confounders[!grepl("\\:", tv_confounders)], "\\."), "[", 1) v <- v[!duplicated(v)] @@ -200,11 +201,11 @@ The user can also indicate if they have already created imputed datasets from th For this example, we create 5 imputed datasets using the default random forest method and 5 iterations, and assign the output to `data` for use with *devMSMs*. This code takes some time to run. ```{r} -m <- 5 +m <- 1 method <- "rf" -maxit <- 5 +maxit <- 0 imputed_data <- imputeData(data = data_wide, exposure = exposure, outcome = outcome, m = m, method = method, maxit = maxit, para_proc = FALSE, @@ -290,9 +291,9 @@ hi_lo_cut <- c(0.6, 0.3) ### P4b. 
Specify hypotheses-relevant exposure histories -We strongly recommend users be selective about which histories, or developmental sequences of high and low exposure (at exposure time points or epochs), are vital for testing their hypotheses. The units of the exposure histories are the exposure time points if no epochs are specified. We recommend that the user estimates and compares only a subset of all possible exposure histories using the `reference` and `comparison` fields (rather than comparing all possible exposure histories). +We strongly recommend users be selective about which histories, or developmental sequences of high and low exposure (at exposure time points or epochs), are vital for testing their hypotheses. We recommend that the user estimates and compares only a subset of all possible exposure histories using the `reference` and `comparison` fields (rather than comparing all possible exposure histories). -The user can specify a custom subset of user-specified exposure histories using the `reference` and `comparison` fields as optional inputs to the `compareHistories()` *devMSMs* function (see *Workflows* vignettes). To conduct these customized comparisons, users must provide at least one unique valid history (e.g., “l-l-l”) as a reference by, in quotations, provide a string (or a list of strings) of lowercase l’s and h’s (each separated by -), each corresponding to each exposure epoch (or time point), that signify the sequence of exposure levels (“low” or “high”, respectively). +The user can specify a custom subset of exposure histories using the `reference` and `comparison` fields as optional inputs to the `compareHistories()` *devMSMs* function (see *Workflows* vignettes). 
To conduct these customized comparisons, users must provide at least one unique valid history (e.g., “l-l-l”) as a reference by providing, in quotations, a string (or a list of strings) of lowercase l’s and h’s (each separated by “-”), with each letter corresponding to an exposure epoch (or time point), that signifies the sequence of exposure levels (“low” or “high”, respectively). If you supply a reference history, in comparisons provide at least one unique and valid history for comparison by, in quotations, providing a string (or list of strings) of l’s and h’s (each separated by “-”), with each corresponding to each exposure epoch, that signify the sequence of exposure levels (“low” or “high”, respectively) that constitutes the comparison exposure history/histories to be compared to the reference in Step 5b of the *Workflows* vignettes. If you supply one or more comparisons, at least one reference must be specified. Each reference exposure history will be compared to each comparison history and all comparisons will be supplied for multiple comparison correction. If no reference or comparison is specified, all histories will be compared to each other in Step 5b of the *Workflows* vignettes. diff --git a/vignettes/Workflow_Continuous_Exposure.Rmd b/vignettes/Workflow_Continuous_Exposure.Rmd index 01698d33..98b0ce0e 100644 --- a/vignettes/Workflow_Continuous_Exposure.Rmd +++ b/vignettes/Workflow_Continuous_Exposure.Rmd @@ -23,8 +23,10 @@ options(rmarkdown.html_vignette.check_title = FALSE) ```{r setup} -install.packages("devtools") -library(devtools) +# install.packages("devtools") +# library(devtools) + +require("devtools") devtools::install_github("istallworthy/devMSMs", quiet = TRUE) library(devMSMs) @@ -32,6 +34,7 @@ library(devMSMs) devtools::install_github("istallworthy/devMSMsHelpers", quiet = TRUE) library(devMSMsHelpers) ``` +
This vignette guides a user through the process of using *devMSMs* to fit marginal structural models (MSMs) with a a continuously distributed exposure variable. The users should first view the Terminology, Data Requirements, Specifying Core Inputs, and Preliminary Steps vignettes. @@ -173,7 +176,6 @@ balance_thresh <- c(0.05, 0.1) imp_conf <- c("InRatioCor.6", "InRatioCor.15", "InRatioCor.24", "InRatioCor.35", "InRatioCor.58", "PmEd2") ``` - The `assessBalance()` function saves out the following .csv and .html files into the ‘balance/prebalance/’ folder: tables of balance statistics for all confounders, tables of balance statistics for covariates that are imbalanced (with respect to their respective balance thresholds), and an overall balance summary table (averaged across any imputed datasets). Within the ‘balance/prebalance/plots/’ folder, the function outputs .jpeg files of summary love plots depicting confounder balance for each exposure time point. The function returns a data frame (or list) of balance statistics, balance thresholds, and binary balanced tag for each confounder relevant to each exposure time point. @@ -191,11 +193,12 @@ prebalance_stats <- assessBalance(data = data, exposure = exposure, exposure_tim The output above shows the initial imbalance between confounders and exposure in tables and plots. 55 confounders are imbalanced (labeled in red font in the love plots) with respect to the economic strain exposure and their respective balance threshold. The love plots depict the standardized associations between confounder and exposure at each exposure time point, with the vertical red dashed lines indicating balance thresholds. - -

+
. + ### Step 2. Create Simplified Balancing Formulas & Determine Optimal Weighting Method The goal of this second step is to create shortened, more parsimonious balancing formulas for determining the optimal IPTW weighting method that most successfully reduces imbalance. +
#### 2a. Create Simplified Balancing Formulas @@ -229,7 +232,6 @@ short_formulas <- createFormulas(exposure = exposure, exposure_time_pts = exposu ``` Above, we inspect the shortened balancing formula at each exposure time point. These formulas are considerably shorter than the full formulas. For instance, at the 58-month exposure time point, the formula contains all time invariant confounders and only time-varying confounders at the 35-month time point. -
#### 2b. Create IPTW Balancing Weights using Multiple Weighting Methods @@ -249,7 +251,7 @@ For `method`, provide one of the following methods for calculating balancing wei method <- "cbps" ``` -The `createWeights()` function can also take any number of additional arguments that will be passed to the `weightitMSM ()` function (e.g., ‘ints’, ‘criterion’, distribution’, ‘SL.library’). For instance, if the user wishes to include first-order interactions of supplied covariates in the weights model, they can include the argument `ints = TRUE`. If the user selects the SuperLearner (“super”) method, the default super learner library (‘SL.library’) is xx but an alternative library can be entered as an input to the `createWeights` function. For binary exposures, the “cbps” method allows you to specify `estimand` as either ATE, ATT, or ATC. With “glm”, “super”, and “bart” you can specify ATE, ATT, ATC, ATO, ATM, or ATOS. With “gbm”, you can specify ATE, ATT, ATC, ATO, or ATM. The default estimand for binary exposures is ATE. We advise the interested user to review the *WeightIt* documentation for more information about the additional optional arguments available for each of the weighting methods. +The `createWeights()` function can also take any number of additional arguments that will be passed to the `weightitMSM()` function (e.g., ‘criterion’, ‘distribution’, ‘SL.library’). If the user selects the SuperLearner (“super”) method, the default super learner library (‘SL.library’) is xx but an alternative library can be entered as an input to the `createWeights` function. For binary exposures, the “cbps” method allows you to specify `estimand` as either ATE, ATT, or ATC. With “glm”, “super”, and “bart” you can specify ATE, ATT, ATC, ATO, ATM, or ATOS. With “gbm”, you can specify ATE, ATT, ATC, ATO, or ATM. The default estimand for binary exposures is ATE. 
We advise the interested user to review the *WeightIt* documentation for more information about the additional optional arguments available for each of the weighting methods. The user can also specify `read_in_from_file = TRUE `if the user has previously created weights for these specific data, formula, and weight type using this function and wishes to read them in from a local file instead of recreating them. The `createWeights()` function automatically conducts some basic checks that the saved weights match the data type, weights method, and number of formulas provided. The user is responsible for making sure these weights were created appropriately.