Skip to content

Commit

Permalink
Merge pull request #171 from istallworthy/more-feedback
Browse files Browse the repository at this point in the history
data documentation
  • Loading branch information
istallworthy authored Nov 28, 2023
2 parents 38eb795 + eb6e6c2 commit e3c43d5
Show file tree
Hide file tree
Showing 14 changed files with 456 additions and 102 deletions.
2 changes: 1 addition & 1 deletion R/createFormulas.R
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ createFormulas <- function(exposure, exposure_time_pts, outcome, type, ti_confou

if (verbose) {
message("The user-supplied custom balancing formula for each exposure time point are below: ")
lapply(formulas, print)
lapply(forms, print)

}
}
Expand Down
146 changes: 146 additions & 0 deletions R/data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
#' Wide complete data (continuous exposure)
#'
#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families
#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural
#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). These data contain economic strain (ESETA1) as a continuously
#' distributed variable.
#'
#' @name sim_data_wide.rda
#' @docType data
#' @format A wide data frame of 1,292 observations.
#' There are 69 measured variables.
#' \itemize{
#' \item "ESETA1" is the continuous exposure of economic strain
#' \item "StrDif_Tot.58" is the continuous outcome of behavioral problems
#' \item "InRatioCor" is the income-to-needs ratio
#' \item "PmEd2" is the parent's education level
#' \item "state" is the family's state of residence
#' \item "TcBlac2" is the family's race (1 = x, 0 = y)
#' \item "bioDadInHH2" is whether the biological father lives with the family (insert coding)
#' \item "HomeOwnd" indicator of whether family owns home (insert coding)
#' \item "KFASTScr"
#' \item "PmBlac2" primary caregiver race (insert coding)
#' \item "SmokTotl"
#' \item "caregiv_health"
#' \item "gov_assist"
#' \item "ALI_LE"
#' \item "B18Raw"
#' \item "CORTB"
#' \item "EARS_TJo"
#' \item "fscore"
#' \item "HOMEETA1"
#' \item "IBRAttn"
#' \item "LESMnNeg"
#' \item "MDI"
#' \item "RHAsSO"
#' \item "SAAmylase"
#' \item "WndNbrhood"
#' }
#' @references Vernon-Feagans, L., Cox, M., Willoughby, M., Burchinal, M., Garrett-Peters, P., Mills-Koonce, R.,
#' Garrett-Peiers, P., Conger, R. D., & Bauer, P. J. (2013). The Family Life Project: An Epidemiological and
#' Developmental Study of Young Children Living in Poor Rural Communities.
#' Monographs of the Society for Research in Child Development, 78(5), i–150.
#'
#' Burchinal, M., Howes, C., Pianta, R., Bryant, D., Early, D., Clifford, R., & Barbarin, O. (2008).
#' Predicting Child Outcomes at the End of Kindergarten from the Quality of Pre-Kindergarten Teacher–Child Interactions and
#' Instruction. Applied Developmental Science, 12(3), 140–153. https://doi.org/10.1080/10888690802199418
#'
#' @keywords datasets
"sim_data_wide"


#' Wide complete data (binary exposure)
#'
#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families
#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural
#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). These data contain economic strain (ESETA1) as a binary variable.
#'
#' @name sim_data_wide_bin.rda
#' @docType data
#' @format A data frame
#'
"sim_data_wide_bin"


#' Wide data with missingness (continuous exposure)
#'
#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families
#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural
#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package.
#' These data contain economic strain (ESETA1) as a continuously distributed variable.
#'
#' @name sim_data_wide_miss.rda
#' @docType data
#' @format A data frame
#'
"sim_data_wide_miss"


#' Wide data with missingness (binary exposure)
#'
#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families
#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural
#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package.
#' These data contain economic strain (ESETA1) as a binary variable.
#'
#' @name sim_data_wide_miss_bin.rda
#' @docType data
#' @format A data frame
#'
"sim_data_wide_miss_bin"


#' Long data with missingness (continuous exposure)
#'
#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families
#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural
#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package.
#' These data contain economic strain (ESETA1) as a continuously distributed variable.
#'
#' @name sim_data_long_miss.rda
#' @docType data
#' @format A data frame
#'
"sim_data_long_miss"


#' Long data with missingness (binary exposure)
#'
#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families
#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural
#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package.
#' These data contain economic strain (ESETA1) as a binary variable.
#'
#' @name sim_data_long_miss_bin.rda
#' @docType data
#' @format A data frame
#'
"sim_data_long_miss_bin"


#' Wide data imputed with mice (continuous exposure)
#'
#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families
#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural
#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package before
#' imputing with the mice package. These data contain economic strain (ESETA1) as a continuously distributed variable.
#'
#' @name sim_data_mice.rda
#' @docType data
#' @format A mice object
#'
"sim_data_mice"


#' Wide data imputed and read in (continuous exposure)
#'
#' These data are simulated based on data from the Family Life Project (FLP), a longitudinal study following 1,292 families
#' representative of two geographic areas (three counties in North Carolina and three counties in Pennsylvania) with high rural
#' child poverty (Vernon-Feagans et al., 2013; Burchinal et al., 2008). MAR missingness has been added using the missMethods package before
#' imputing with the mice package and reading in each imputed dataset. These data contain economic strain (ESETA1) as a continuously
#' distributed variable.
#'
#' @name sim_data_imp_list.rda
#' @docType data
#' @format A list of data frames
#'
"sim_data_imp_list"

141 changes: 72 additions & 69 deletions R/getModel.R
Original file line number Diff line number Diff line change
Expand Up @@ -127,83 +127,86 @@ getModel <- function(d, exposure, exposure_time_pts, outcome, exp_epochs,
#split factors
factor_covariates <- names(d)[sapply(d, is.factor)]
factor_covariates <- setdiff(factor_covariates, "ID")

if (length(factor_covariates) > 0) {
d <- cobalt::splitfactor(d, factor_covariates, drop.first = "if2")

factors_split <- names(d)[sapply(strsplit(names(d), "\\_"), "[", 1)
%in% factor_covariates]
}

if (!missing(covariates)) {
if (any(grepl("\\:", covariates))) {
ints <- covariates[grepl("\\:", covariates)]

#making interactions w/ split factors

for (x in seq_len(length(ints))) {
vars <- as.character(unlist(strsplit(ints[x], "\\:")))
num_comp <- length(vars)

f_vars <- NULL
if (any(vars %in% factor_covariates)) {
vars <- do.call(c, lapply(vars, function(y) {
if (y %in% factor_covariates) {
f_vars <- factors_split[sapply(strsplit(factors_split, "\\_"), "[", 1) %in% y]
y <- f_vars }
y
}))
}

if (any(as.logical(unlist(lapply(vars, function(x) {
any(!x %in% names(d))}))))) {
stop("Please only include covariate interactions between variables in your data",
call. = FALSE)
}

ints2 <- combn(vars, num_comp)
ints2 <- as.data.frame(ints2[, sapply(strsplit(ints2[1, ], "\\_"), "[", 1) !=
sapply(strsplit(ints2[2, ], "\\_"), "[", 1)])
ints2 <- unlist(lapply(1:ncol(ints2),
function(y) {paste(ints2[, y], collapse = ":")} ))
ints2 <- ints2[!duplicated(ints2)]

prods <- lapply(ints2, function(z) {
v <- as.character(unlist(strsplit(z, "\\:")))
temp <- as.data.frame(d[, v])
prod <- apply(as.matrix(temp), 1, prod)
prod
})
prods <- do.call(rbind.data.frame, prods)
prods <- as.data.frame(t(prods))
names(prods) <- ints2

#make factor class if both components are factors
for (f in seq_len(length(ints2))) {
vars <- as.character(unlist(strsplit(ints2[f], "\\:")))
if (all(vars %in% factor_covariates)) {
prods[, names(prods)[any(as.logical(unlist(lapply(names(prods), function(k) {
as.character(unlist(strsplit(k, "\\:"))) %in% f_vars}))))]] <-
as.data.frame(lapply(prods[, names(prods)[any(as.logical(unlist(lapply(names(prods),function(l) {
as.character(unlist(strsplit(l, "\\:"))) %in% f_vars}))))]],
as.factor))
}
}
#adding to dataset

d <- cbind(d, prods)
}
}

covariates <- c(covariates[!grepl("\\:", covariates)],
names(d)[grepl("\\:", names(d))])
}
# if (!missing(covariates)) {
# if (any(grepl("\\:", covariates))) {
# ints <- covariates[grepl("\\:", covariates)]
#
# #making interactions w/ split factors
#
# for (x in seq_len(length(ints))) {
# vars <- as.character(unlist(strsplit(ints[x], "\\:")))
# num_comp <- length(vars)
#
# f_vars <- NULL
# if (any(vars %in% factor_covariates)) {
# vars <- do.call(c, lapply(vars, function(y) {
# if (y %in% factor_covariates) {
# f_vars <- factors_split[sapply(strsplit(factors_split, "\\_"), "[", 1) %in% y]
# y <- f_vars }
# y
# }))
# }
#
# if (any(as.logical(unlist(lapply(vars, function(x) {
# any(!x %in% names(d))}))))) {
# stop("Please only include covariate interactions between variables in your data",
# call. = FALSE)
# }
#
# ints2 <- combn(vars, num_comp)
# ints2 <- as.data.frame(ints2[, sapply(strsplit(ints2[1, ], "\\_"), "[", 1) !=
# sapply(strsplit(ints2[2, ], "\\_"), "[", 1)])
# ints2 <- unlist(lapply(1:ncol(ints2),
# function(y) {paste(ints2[, y], collapse = ":")} ))
# ints2 <- ints2[!duplicated(ints2)]
#
# prods <- lapply(ints2, function(z) {
# v <- as.character(unlist(strsplit(z, "\\:")))
# temp <- as.data.frame(d[, v])
# prod <- apply(as.matrix(temp), 1, prod)
# prod
# })
# prods <- do.call(rbind.data.frame, prods)
# prods <- as.data.frame(t(prods))
# names(prods) <- ints2
#
# #make factor class if both components are factors
# for (f in seq_len(length(ints2))) {
# vars <- as.character(unlist(strsplit(ints2[f], "\\:")))
# if (all(vars %in% factor_covariates)) {
# prods[, names(prods)[any(as.logical(unlist(lapply(names(prods), function(k) {
# as.character(unlist(strsplit(k, "\\:"))) %in% f_vars}))))]] <-
# as.data.frame(lapply(prods[, names(prods)[any(as.logical(unlist(lapply(names(prods),function(l) {
# as.character(unlist(strsplit(l, "\\:"))) %in% f_vars}))))]],
# as.factor))
# }
# }
# #adding to dataset
#
# d <- cbind(d, prods)
# }
# }
#
# covariates <- c(covariates[!grepl("\\:", covariates)],
# names(d)[grepl("\\:", names(d))])
# }


# Covariate models checking

if (model %in% c("m1", "m3", "covs")) {

if (any(grepl("\\.", covariates))) {
tv_cov <- covariates[grepl("\\.", covariates)]
cov <- as.character(unlist(strsplit(covariates, "\\:")))
tv_cov <- cov[grepl("\\.", cov)]
if (any(as.numeric(gsub("_.*", "", sub(".*\\.(.)", "\\1",
as.character(unlist(strsplit(tv_cov, "\\:")))))) >
exposure_time_pts[1])) {
Expand All @@ -213,10 +216,10 @@ getModel <- function(d, exposure, exposure_time_pts, outcome, exp_epochs,
}
}

if (!all(covariates[!grepl("\\:", covariates)] %in% colnames(d))) {
stop("Please only include covariates that correspond to variables in the wide dataset.",
call. = FALSE)
}
# if (!all(covariates[!grepl("\\:", covariates)] %in% colnames(d))) {
# stop("Please only include covariates that correspond to variables in the wide dataset.",
# call. = FALSE)
# }

covariate_list <- paste(c(as.character(covariates)), sep = "",
collapse = " + ")
Expand Down Expand Up @@ -244,7 +247,7 @@ getModel <- function(d, exposure, exposure_time_pts, outcome, exp_epochs,
collapse = " + "
)

#create interactions in data
#create exposure main effect interactions in data

for (x in seq_along(unlist(strsplit(interactions, "\\+")))) {
name <- gsub(" ", "", unlist(strsplit(interactions, "\\+"))[x])
Expand Down
Loading

0 comments on commit e3c43d5

Please sign in to comment.