Merge pull request #171 from istallworthy/more-feedback

data documentation
istallworthy · Nov 28, 2023 · e3c43d5 · e3c43d5
2 parents 38eb795 + eb6e6c2
commit e3c43d5
Show file tree

Hide file tree

Showing 14 changed files with 456 additions and 102 deletions.
diff --git a/R/createFormulas.R b/R/createFormulas.R
@@ -413,7 +413,7 @@ createFormulas <- function(exposure, exposure_time_pts, outcome, type, ti_confou
 
     if (verbose) {
       message("The user-supplied custom balancing formula for each exposure time point are below: ")
-      lapply(formulas, print)
+      lapply(forms, print)
 
     }
   }

diff --git a/R/data.R b/R/data.R
@@ -0,0 +1,146 @@
+#' Wide complete data (continuous exposure)
+#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families 
+#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural 
+#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). These data contain economic strain (ESETA1) as a continuously 
+#' distributed variable. 
+#'
+#' @name sim_data_wide.rda
+#' @docType data
+#' @format A wide data frame of 1,292 observations
+#' There are 69 measured variables. 
+#' \itemize{
+#' \item "ESETA1" is the continuous exposure of economic strain
+#' \item "StrDif_Tot.58" is the continuous outcome of behavioral problems
+#' \item "InRatioCor" is the income-to-needs ratio
+#' \item "PmEd2" is the parent's education level
+#' \item "state" is the family's state of residence
+#' \item "TcBlac2" is the family's race (1 = x, 0 = y)
+#' \item "bioDadInHH2" is whether the biological father lives with the family (insert coding)
+#' \item "HomeOwnd" indicator of whether family owns home (insert coding)
+#' \item "KFASTScr"
+#' \item "PmBlac2" is the primary caregiver's race (insert coding)
+#' \item "SmokTotl"
+#' \item "caregiv_health"
+#' \item "gov_assist"
+#' \item "ALI_LE"
+#' \item "B18Raw"
+#' \item "CORTB"
+#' \item "EARS_TJo"
+#' \item "fscore"
+#' \item "HOMEETA1"
+#' \item "IBRAttn"
+#' \item "LESMnNeg"
+#' \item "MDI"
+#' \item "RHAsSO"
+#' \item "SAAmylase"
+#' \item "WndNbrhood"
+#' }
+#' @references Vernon-Feagans, L., Cox, M., Willoughby, M., Burchinal, M., Garrett-Peters, P., Mills-Koonce, R., 
+#' Garrett-Peiers, P., Conger, R. D., & Bauer, P. J. (2013). The Family Life Project: An Epidemiological and 
+#' Developmental Study of Young Children Living in Poor Rural Communities.
+#'  Monographs of the Society for Research in Child Development, 78(5), i–150.
+#'  
+#'  Burchinal, M., Howes, C., Pianta, R., Bryant, D., Early, D., Clifford, R., & Barbarin, O. (2008). 
+#'  Predicting Child Outcomes at the End of Kindergarten from the Quality of Pre-Kindergarten Teacher–Child Interactions and 
+#'  Instruction. Applied Developmental Science, 12(3), 140–153. https://doi.org/10.1080/10888690802199418
+#'  
+#'@keywords datasets
+"sim_data_wide"
+
+
+#' Wide complete data (binary exposure)
+#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families 
+#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural 
+#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). These data contain economic strain (ESETA1) as a binary variable. 
+#'
+#' @name sim_data_wide_bin.rda
+#' @docType data
+#' @format A data frame
+#'
+"sim_data_wide_bin"
+
+
+#' Wide data with missingness (continuous exposure)
+#'
+#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families 
+#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural 
+#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package. 
+#' These data contain economic strain (ESETA1) as a continuously distributed variable. 
+#'
+#' @name sim_data_wide_miss.rda
+#' @docType data
+#' @format A data frame
+#'
+"sim_data_wide_miss"
+
+
+#' Wide data with missingness (binary exposure)
+#'
+#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families 
+#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural 
+#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package. 
+#' These data contain economic strain (ESETA1) as a binary variable. 
+#'
+#' @name sim_data_wide_miss_bin.rda
+#' @docType data
+#' @format A data frame
+#'
+"sim_data_wide_miss_bin"
+
+
+#' Long data with missingness (continuous exposure)
+#'
+#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families 
+#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural 
+#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package. 
+#' These data contain economic strain (ESETA1) as a continuously distributed variable. 
+#'
+#' @name sim_data_long_miss.rda
+#' @docType data
+#' @format A data frame
+#'
+"sim_data_long_miss"
+
+
+#' Long data with missingness (binary exposure)
+#'
+#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families 
+#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural 
+#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package. 
+#' These data contain economic strain (ESETA1) as a binary variable. 
+#'
+#' @name sim_data_long_miss_bin.rda
+#' @docType data
+#' @format A data frame
+#'
+"sim_data_long_miss_bin"
+
+
+#' Wide data imputed with mice (continuous exposure)
+#'
+#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families 
+#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural 
+#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package before 
+#' imputing with the mice package. These data contain economic strain (ESETA1) as a continuously distributed variable. 
+#'
+#' @name sim_data_mice.rda
+#' @docType data
+#' @format A mice object
+#'
+"sim_data_mice"
+
+
+#' Wide data imputed and read in (continuous exposure)
+#'
+#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families 
+#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural 
+#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package before 
+#' imputing with the mice package and reading in each imputed dataset. These data contain economic strain (ESETA1) as a continuously 
+#' distributed variable. 
+#'
+#' @name sim_data_imp_list.rda
+#' @docType data
+#' @format A list of data frames
+#'
+"sim_data_imp_list"
+
diff --git a/R/getModel.R b/R/getModel.R
@@ -127,83 +127,86 @@ getModel <- function(d, exposure, exposure_time_pts, outcome, exp_epochs,
   #split factors
   factor_covariates <- names(d)[sapply(d, is.factor)]
   factor_covariates <- setdiff(factor_covariates, "ID")
+
   if (length(factor_covariates) > 0) {
     d <- cobalt::splitfactor(d, factor_covariates, drop.first = "if2")
 
     factors_split <- names(d)[sapply(strsplit(names(d), "\\_"), "[", 1) 
                               %in% factor_covariates]
   }
 
-  if (!missing(covariates)) {
-    if (any(grepl("\\:", covariates))) {
-      ints <- covariates[grepl("\\:", covariates)]
-
-      #making interactions w/ split factors 
-
-      for (x in seq_len(length(ints))) {
-        vars <- as.character(unlist(strsplit(ints[x], "\\:")))
-        num_comp <- length(vars)
-
-        f_vars <- NULL
-        if (any(vars %in% factor_covariates)) {
-          vars <- do.call(c, lapply(vars, function(y) {
-            if (y %in% factor_covariates) {
-              f_vars <- factors_split[sapply(strsplit(factors_split, "\\_"), "[", 1) %in% y]
-              y <- f_vars } 
-            y
-          }))
-        }
-
-        if (any(as.logical(unlist(lapply(vars, function(x) {
-          any(!x %in% names(d))}))))) {
-          stop("Please only include covariate interactions between variables in your data",
-               call. = FALSE)
-        }
-
-        ints2 <- combn(vars, num_comp)
-        ints2 <- as.data.frame(ints2[, sapply(strsplit(ints2[1, ], "\\_"), "[", 1) != 
-                                       sapply(strsplit(ints2[2, ], "\\_"), "[", 1)])
-        ints2 <- unlist(lapply(1:ncol(ints2), 
-                               function(y) {paste(ints2[, y], collapse = ":")} ))
-        ints2 <- ints2[!duplicated(ints2)]
-
-        prods <- lapply(ints2, function(z) {
-          v <- as.character(unlist(strsplit(z, "\\:")))
-          temp <- as.data.frame(d[, v])
-          prod <- apply(as.matrix(temp), 1, prod)
-          prod
-        })
-        prods <- do.call(rbind.data.frame, prods)
-        prods <- as.data.frame(t(prods))
-        names(prods) <- ints2
-
-        #make factor class if both components are factors
-        for (f in seq_len(length(ints2))) {
-          vars <- as.character(unlist(strsplit(ints2[f], "\\:")))
-          if (all(vars %in% factor_covariates)) {
-            prods[, names(prods)[any(as.logical(unlist(lapply(names(prods), function(k) { 
-              as.character(unlist(strsplit(k, "\\:"))) %in% f_vars}))))]] <- 
-              as.data.frame(lapply(prods[, names(prods)[any(as.logical(unlist(lapply(names(prods),function(l) {
-                as.character(unlist(strsplit(l, "\\:"))) %in% f_vars}))))]], 
-                as.factor))
-          }
-        }
-        #adding to dataset
-
-        d <- cbind(d, prods)
-      }
-    }
-
-    covariates <- c(covariates[!grepl("\\:", covariates)], 
-                    names(d)[grepl("\\:", names(d))])
-  }
+  # if (!missing(covariates)) {
+  #   if (any(grepl("\\:", covariates))) {
+  #     ints <- covariates[grepl("\\:", covariates)]
+  #     
+  #     #making interactions w/ split factors 
+  #     
+  #     for (x in seq_len(length(ints))) {
+  #       vars <- as.character(unlist(strsplit(ints[x], "\\:")))
+  #       num_comp <- length(vars)
+  #       
+  #       f_vars <- NULL
+  #       if (any(vars %in% factor_covariates)) {
+  #         vars <- do.call(c, lapply(vars, function(y) {
+  #           if (y %in% factor_covariates) {
+  #             f_vars <- factors_split[sapply(strsplit(factors_split, "\\_"), "[", 1) %in% y]
+  #             y <- f_vars } 
+  #           y
+  #         }))
+  #       }
+  #       
+  #       if (any(as.logical(unlist(lapply(vars, function(x) {
+  #         any(!x %in% names(d))}))))) {
+  #         stop("Please only include covariate interactions between variables in your data",
+  #              call. = FALSE)
+  #       }
+  #       
+  #       ints2 <- combn(vars, num_comp)
+  #       ints2 <- as.data.frame(ints2[, sapply(strsplit(ints2[1, ], "\\_"), "[", 1) != 
+  #                                      sapply(strsplit(ints2[2, ], "\\_"), "[", 1)])
+  #       ints2 <- unlist(lapply(1:ncol(ints2), 
+  #                              function(y) {paste(ints2[, y], collapse = ":")} ))
+  #       ints2 <- ints2[!duplicated(ints2)]
+  #       
+  #       prods <- lapply(ints2, function(z) {
+  #         v <- as.character(unlist(strsplit(z, "\\:")))
+  #         temp <- as.data.frame(d[, v])
+  #         prod <- apply(as.matrix(temp), 1, prod)
+  #         prod
+  #       })
+  #       prods <- do.call(rbind.data.frame, prods)
+  #       prods <- as.data.frame(t(prods))
+  #       names(prods) <- ints2
+  #       
+  #       #make factor class if both components are factors
+  #       for (f in seq_len(length(ints2))) {
+  #         vars <- as.character(unlist(strsplit(ints2[f], "\\:")))
+  #         if (all(vars %in% factor_covariates)) {
+  #           prods[, names(prods)[any(as.logical(unlist(lapply(names(prods), function(k) { 
+  #             as.character(unlist(strsplit(k, "\\:"))) %in% f_vars}))))]] <- 
+  #             as.data.frame(lapply(prods[, names(prods)[any(as.logical(unlist(lapply(names(prods),function(l) {
+  #               as.character(unlist(strsplit(l, "\\:"))) %in% f_vars}))))]], 
+  #               as.factor))
+  #         }
+  #       }
+  #       #adding to dataset
+  #       
+  #       d <- cbind(d, prods)
+  #     }
+  #   }
+  #   
+  #   covariates <- c(covariates[!grepl("\\:", covariates)], 
+  #                   names(d)[grepl("\\:", names(d))])
+  # }
+
 
   # Covariate models checking
 
   if (model %in% c("m1", "m3", "covs")) {
 
     if (any(grepl("\\.", covariates))) {
-      tv_cov <- covariates[grepl("\\.", covariates)]
+      cov <- as.character(unlist(strsplit(covariates, "\\:")))
+      tv_cov <- cov[grepl("\\.", cov)]
       if (any(as.numeric(gsub("_.*", "", sub(".*\\.(.)", "\\1", 
                                              as.character(unlist(strsplit(tv_cov, "\\:")))))) > 
               exposure_time_pts[1])) {
@@ -213,10 +216,10 @@ getModel <- function(d, exposure, exposure_time_pts, outcome, exp_epochs,
       }
     }
 
-    if (!all(covariates[!grepl("\\:", covariates)] %in% colnames(d))) {
-      stop("Please only include covariates that correspond to variables in the wide dataset.",
-           call. = FALSE)
-    }
+    # if (!all(covariates[!grepl("\\:", covariates)] %in% colnames(d))) {
+    #   stop("Please only include covariates that correspond to variables in the wide dataset.",
+    #        call. = FALSE)
+    # }
 
     covariate_list <- paste(c(as.character(covariates)), sep = "", 
                             collapse = " + ")
@@ -244,7 +247,7 @@ getModel <- function(d, exposure, exposure_time_pts, outcome, exp_epochs,
       collapse = " + "
     )
 
-    #create interactions in data
+    #create exposure main effect interactions in data
 
     for (x in seq_along(unlist(strsplit(interactions, "\\+")))) {
       name <- gsub(" ", "", unlist(strsplit(interactions, "\\+"))[x])