From ca6e49e248d54c1f82803e6d10ea9b4544b38bf6 Mon Sep 17 00:00:00 2001 From: Keith Goldfeld Date: Fri, 31 May 2024 16:22:21 -0400 Subject: [PATCH 1/6] Updating function addCorGen --- R/add_correlated_data.R | 60 +++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/R/add_correlated_data.R b/R/add_correlated_data.R index ce17362d..a03bedb4 100644 --- a/R/add_correlated_data.R +++ b/R/add_correlated_data.R @@ -403,12 +403,22 @@ addCorGen <- function(dtOld, nvars=NULL, idvar = "id", rho=NULL, corstr=NULL, co # wide(ness) is determined by incoming data structure. maxN <- dtOld[, .N, by = idvar][, max(N)] + if (maxN == 1) { wide <- TRUE - assertNotMissing(nvars = missing(nvars)) - assertAtLeast(nvars = nvars, minVal = 2) + if ((is.null(nvars) | is.null(rho) | is.null(corstr)) & (is.null(corMatrix))) { + stop("Either nvars, rho, and corstr all must be provided or corMatrix must be provided.") + } + + if (is.null(corMatrix)) { # that means that we are using nvars/rho/corstr + assertAtLeast(nvars = nvars, minVal = 2) + } } else if (maxN > 1) { wide <- FALSE + if ((is.null(rho) | is.null(corstr)) & (is.null(corMatrix))) { + stop("Either both rho and corstr must be provided or corMatrix must be provided.") + } + } #### @@ -457,24 +467,41 @@ addCorGen <- function(dtOld, nvars=NULL, idvar = "id", rho=NULL, corstr=NULL, co # check if the dimensions of corr matrix matches (equal) cluster size - dn <- dtTemp[, .N, keyby = .id] - dn[, dim := nrow(corMatrix)] - compare_cluster_size <- dn[, sum(N != dim)] - if (compare_cluster_size != 0) { - stop("Dimensions of corMatrix not equal to cluster sizes!") + if (!wide) { + dn <- dtTemp[, .N, keyby = .id] + dn[, dim := nrow(corMatrix)] + compare_cluster_size <- dn[, sum(N != dim)] + if (compare_cluster_size != 0) { + stop("Dimensions of corMatrix not equal to cluster sizes!") + } } } } - if (wide) { # Convert to long form temporarily + if ( is.null(nvars) ) nvars <- nrow(corMatrix) dtTemp 
<- addPeriods(dtTemp, nPeriods = nvars, idvars = ".id") } dtTemp[, seq_ := 1:.N, keyby = .id] - nvars <- dtTemp[.id == 1, .N] # only permits case where number of records per id is the same - - #### + # nvars <- dtTemp[.id == 1, .N] # only permits case where number of records per id is the same + + counts <- dtTemp[, .N, by = .id][, N] + same_nvar <- all(counts == counts[1]) + + if (!wide) { # multiple record per id + if (is.null(corMatrix)) { + if (same_nvar) { + corMatrix <- genCorMat(nvars = counts[1] , rho = rho, corstr = corstr, nclusters = 1) + } else { + corMatrix <- genCorMat(nvars = counts , rho = rho, corstr = corstr, nclusters = length(counts)) + } + } + } else { # single record per id + if (is.null(corMatrix)) { + corMatrix <- genCorMat(nvars = nvars , rho = rho, corstr = corstr, nclusters = 1) + } + } if (method == "copula") { @@ -483,19 +510,18 @@ addCorGen <- function(dtOld, nvars=NULL, idvar = "id", rho=NULL, corstr=NULL, co dtM <- rbindlist( lapply(ns, function(x) .genQuantU(x$N, 1, rho, corstr, corMatrix[[x$.id]])) ) + dtTemp[, .U := dtM$Unew] } else { - if (is.null(corMatrix)) { - corMatrix <- .buildCorMat(nvars, corMatrix = NULL, rho = rho, corstr = corstr) - } + + nvars <- nrow(corMatrix) + ns <- nrow(dtTemp[, .N, keyby = .id]) Unew <- c(t(mvnfast::rmvn(n = ns, mu = rep(0, nvars), sigma = corMatrix))) dtTemp[, .U := stats::pnorm(Unew)] } - # dtTemp[, seq := dtM$seq] - if (dist == "poisson") { setnames(dtTemp, param1, ".param1") dtTemp[, .XX := stats::qpois(p = .U, lambda = .param1)] @@ -527,7 +553,7 @@ addCorGen <- function(dtOld, nvars=NULL, idvar = "id", rho=NULL, corstr=NULL, co } dX <- dtTemp[, list(.id, seq_, .XX)] - + } else if (method == "ep") { if (is.list(corMatrix)) { From 54629c47e3a1c440422a20ff7380a6eb88ab26d7 Mon Sep 17 00:00:00 2001 From: Keith Goldfeld Date: Fri, 31 May 2024 16:32:53 -0400 Subject: [PATCH 2/6] Updating news. 
--- NEWS.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/NEWS.md b/NEWS.md index 081efd4b..3ffdf03e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,9 +1,15 @@ # simstudy (development version) +## New features +* Added the ability to generate data from an empirical distribution by using new functions `genDataDensity` and `addDataDensity`. + +## Minor fix +* `addCorGen` no longer requires all clusters to have the same size when using the *rho* and *corstr* arguments to define the correlation. + # simstudy 0.8.0 ## New features -* added the option to specify a customized distribution in `defData` and `defDataAdd` by +* Added the option to specify a customized distribution in `defData` and `defDataAdd` by specifying `dist = "custom"`. *`addPeriods` now includes a new argument `periodVec` that allows users to designate specific measurement time periods using vector. @@ -39,7 +45,7 @@ distribution in `defData` and `defDataAdd`. * Improved the random effect variance generation for function `iccRE` under the Poisson distribution. The current approach is based on the 2013 paper by Nakagawa & Schielzeth titled "A general and simple method for obtaining $R^2$ from -generalized linear mixed-effects models" +generalized linear mixed-effects models." ## Minor fix * Modified internal function to speed up beta distribution data generation. @@ -57,13 +63,13 @@ performance has been dramatically improved. ## Minor fixes -* Fixed bug in `genSpline` +* Fixed bug in `genSpline`. # simstudy 0.5.1 ## Minor fixes -* Fixed bug in `trtAssign` +* Fixed bug in `trtAssign`. # simstudy 0.5.0 @@ -77,7 +83,7 @@ performance has been dramatically improved. # simstudy 0.4.0 ## New features -* genOrdCat now supports non-proportional odds +* genOrdCat now supports non-proportional odds. * Added functions defRepeat and defRepeatAdd to facilitate the definition of multiple variables that share identical data definitions.
## Minor improvements and fixes From 40db6597941e6ec3aa7ef6bb8a1bbc18fda840f5 Mon Sep 17 00:00:00 2001 From: Keith Goldfeld Date: Tue, 4 Jun 2024 12:14:14 -0400 Subject: [PATCH 3/6] Updating categorical documenation for logit link --- vignettes/simstudy.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/simstudy.Rmd b/vignettes/simstudy.Rmd index 6b56cb75..7e3ba68b 100644 --- a/vignettes/simstudy.Rmd +++ b/vignettes/simstudy.Rmd @@ -188,7 +188,7 @@ d <- list() d[[1]] <- data.table("beta", "mean", "both", "-", "dispersion", "X", "-", "X") d[[2]] <- data.table("binary", "probability", "both", "-", "-", "X", "-", "X") d[[3]] <- data.table("binomial", "probability", "both", "-", "# of trials", "X", "-", "X") -d[[4]] <- data.table("categorical", "probability", "string", "p_1;p_2;...;p_n", "a;b;c", "X", "-", "-") +d[[4]] <- data.table("categorical", "probability", "string", "p_1;p_2;...;p_n", "a;b;c", "X", "-", "X") d[[5]] <- data.table("clusterSize", "total N", "both", "-", "dispersion", "X", "-", "-") d[[6]] <- data.table("custom", "function", "string", "-", "arguments", "X", "-", "-") d[[7]] <- data.table("exponential", "mean", "both", "-", "-", "X", "X", "-") From 79986cfc3ffb790490ae482db15761f09cef4df4 Mon Sep 17 00:00:00 2001 From: Keith Goldfeld Date: Tue, 4 Jun 2024 17:21:55 -0400 Subject: [PATCH 4/6] Adding details to addCorGen help --- R/add_correlated_data.R | 32 ++++++++++++++++++++++++++++++++ R/simstudy-package.R | 1 + man/addCorGen.Rd | 34 +++++++++++++++++++++++++++++++++- man/distributions.Rd | 1 + 4 files changed, 67 insertions(+), 1 deletion(-) diff --git a/R/add_correlated_data.R b/R/add_correlated_data.R index a03bedb4..6a3eb212 100644 --- a/R/add_correlated_data.R +++ b/R/add_correlated_data.R @@ -298,6 +298,37 @@ addCorFlex <- function(dt, defs, rho = 0, tau = NULL, corstr = "cs", #' Emrich and Piedmonte (1991). #' @param ... May include additional arguments that have been deprecated and are #' no longer used. 
+#' @details The original data table can come in one of two formats: a single row +#' per **idvar** (where data are *ungrouped*) or multiple rows per **idvar** (in which +#' case the data are *grouped* or clustered). The structure of the arguments +#' depends on the format of the data. +#' +#' In the case of *ungrouped* data, there are two ways to specify the number of +#' correlated variables and the covariance matrix. In approach (1), +#' **nvars** needs to be specified along with **rho** and **corstr**. +#' In approach (2), **corMatrix** may be specified by identifying a single square +#' *n* x *n* covariance matrix. The number of new variables generated for each +#' record will be *n*. If **nvars**, **rho**, +#' **corstr**, and **corMatrix** are all specified, the data will be +#' generated based on the information provided in the covariance matrix alone. +#' In both (1) and (2), the data will be returned in a wide format. +#' +#' In the case of *grouped* data, where there are *G* groups, there are also two +#' ways to proceed. In both cases, +#' the number of new variables to be generated may vary by group, and will be determined by the +#' number of records in each group, \eqn{n_i, i \in \{1,...,G\}} (i.e., the number of records that share the same +#' value of *idvar*). **nvars** is not used in grouped data. +#' In approach (1), the arguments **rho** and **corstr** may both be specified +#' to determine the structure of the covariance +#' matrix. In approach (2), the argument **corMatrix** may be specified. +#' **corMatrix** can be a single matrix with dimensions \eqn{n \ \text{x} \ n} if +#' \eqn{n_i = n} for all *i*. However, if the sample sizes of each group vary +#' (i.e., \eqn{n_i \ne n_j} for some groups *i* and *j*), **corMatrix** must be a list +#' of covariance matrices with a length *G*; each +#' covariance matrix in the list will have dimensions +#' \eqn{n_i \ \text{x} \ n_i, \ i \in \{1,...,G\}}. 
In the case of *grouped* data, the +#' new data will be returned in *long* format (i.e., one new column only). +#' #' @return Original data.table with added column(s) of correlated data #' @references Emrich LJ, Piedmonte MR. A Method for Generating High-Dimensional #' Multivariate Binary Variates. The American Statistician 1991;45:302-4. @@ -335,6 +366,7 @@ addCorFlex <- function(dt, defs, rho = 0, tau = NULL, corstr = "cs", #' #' @concept correlated #' @export +#' @md addCorGen <- function(dtOld, nvars=NULL, idvar = "id", rho=NULL, corstr=NULL, corMatrix = NULL, dist, param1, param2 = NULL, cnames = NULL, method = "copula", ...) { diff --git a/R/simstudy-package.R b/R/simstudy-package.R index 4539b0d8..6149e9a3 100644 --- a/R/simstudy-package.R +++ b/R/simstudy-package.R @@ -33,6 +33,7 @@ NULL #' | binary | probability for 1 | String or Number | NA | identity or logit | #' | binomial | probability of success | String or Number | number of trials | identity or logit | #' | categorical | probabilities | `p_1;p_2;..;p_n` | category labels: `a;b;c` , `50;130;20`| identity or logit | +#' | custom | name of function | String | arguments | identity | #' | exponential | mean (lambda) | String or Number | NA | identity or log | #' | gamma | mean | String or Number | dispersion value | identity or log | #' | mixture | formula | `x_1 `\|` p_1 + x_2 `\|` p_2 ... x_n `\|` p_n` | NA | NA | diff --git a/man/addCorGen.Rd b/man/addCorGen.Rd index 512823b6..3cb625c8 100644 --- a/man/addCorGen.Rd +++ b/man/addCorGen.Rd @@ -22,7 +22,7 @@ addCorGen( \arguments{ \item{dtOld}{The data set that will be augmented. If the data set includes a single record per id, the new data table will be created as a "wide" data set. -If the original data set includes multiple records per id, the new data set will +If the original data set includes multiple records per id, the new data set will be in "long" format.} \item{nvars}{The number of new variables to create for each id. 
This is only applicable @@ -70,6 +70,38 @@ Original data.table with added column(s) of correlated data \description{ Create multivariate (correlated) data - for general distributions } +\details{ +The original data table can come in one of two formats: a single row +per \strong{idvar} (where data are \emph{ungrouped}) or multiple rows per \strong{idvar} (in which +case the data are \emph{grouped} or clustered). The structure of the arguments +depends on the format of the data. + +In the case of \emph{ungrouped} data, there are two ways to specify the number of +correlated variables and the covariance matrix. In approach (1), +\strong{nvars} needs to be specified along with \strong{rho} and \strong{corstr}. +In approach (2), \strong{corMatrix} may be specified by identifying a single square +\emph{n} x \emph{n} covariance matrix. The number of new variables generated for each +record will be \emph{n}. If \strong{nvars}, \strong{rho}, +\strong{corstr}, and \strong{corMatrix} are all specified, the data will be +generated based on the information provided in the covariance matrix alone. +In both (1) and (2), the data will be returned in a wide format. + +In the case of \emph{grouped} data, where there are \emph{G} groups, there are also two +ways to proceed. In both cases, +the number of new variables to be generated may vary by group, and will be determined by the +number of records in each group, \eqn{n_i, i \in \{1,...,G\}} (i.e., the number of records that share the same +value of \emph{idvar}). \strong{nvars} is not used in grouped data. +In approach (1), the arguments \strong{rho} and \strong{corstr} may both be specified +to determine the structure of the covariance +matrix. In approach (2), the argument \strong{corMatrix} may be specified. +\strong{corMatrix} can be a single matrix with dimensions \eqn{n \ \text{x} \ n} if +\eqn{n_i = n} for all \emph{i}. 
However, if the sample sizes of each group vary +(i.e., \eqn{n_i \ne n_j} for some groups \emph{i} and \emph{j}), \strong{corMatrix} must be a list +of covariance matrices with a length \emph{G}; each +covariance matrix in the list will have dimensions +\eqn{n_i \ \text{x} \ n_i, \ i \in \{1,...,G\}}. In the case of \emph{grouped} data, the +new data will be returned in \emph{long} format (i.e., one new column only). +} \examples{ # Wide example diff --git a/man/distributions.Rd b/man/distributions.Rd index 82cd473c..495452cd 100644 --- a/man/distributions.Rd +++ b/man/distributions.Rd @@ -41,6 +41,7 @@ distribution can be found in this table:\tabular{lllll}{ binary \tab probability for 1 \tab String or Number \tab NA \tab identity or logit \cr binomial \tab probability of success \tab String or Number \tab number of trials \tab identity or logit \cr categorical \tab probabilities \tab \verb{p_1;p_2;..;p_n} \tab category labels: \verb{a;b;c} , \verb{50;130;20} \tab identity or logit \cr + custom \tab name of function \tab String \tab arguments \tab identity \cr exponential \tab mean (lambda) \tab String or Number \tab NA \tab identity or log \cr gamma \tab mean \tab String or Number \tab dispersion value \tab identity or log \cr mixture \tab formula \tab \code{x_1 }|\code{p_1 + x_2}|\verb{p_2 ... x_n}|\code{ p_n} \tab NA \tab NA \cr From a363865b127d3d0f58eabb6685a1e12ea43dc0d6 Mon Sep 17 00:00:00 2001 From: Keith Goldfeld Date: Wed, 5 Jun 2024 12:54:56 -0400 Subject: [PATCH 5/6] Updating example in help --- R/add_correlated_data.R | 60 ++++++++++++++++++++++++----------------- man/addCorGen.Rd | 44 +++++++++++++++++++----------- 2 files changed, 64 insertions(+), 40 deletions(-) diff --git a/R/add_correlated_data.R b/R/add_correlated_data.R index 6a3eb212..77af5906 100644 --- a/R/add_correlated_data.R +++ b/R/add_correlated_data.R @@ -333,37 +333,49 @@ addCorFlex <- function(dt, defs, rho = 0, tau = NULL, corstr = "cs", #' @references Emrich LJ, Piedmonte MR. 
A Method for Generating High-Dimensional #' Multivariate Binary Variates. The American Statistician 1991;45:302-4. #' @examples -#' # Wide example -#' -#' def <- defData(varname = "xbase", formula = 5, variance = .4, dist = "gamma", id = "cid") -#' def <- defData(def, varname = "lambda", formula = ".5 + .1*xbase", dist = "nonrandom", link = "log") -#' -#' dt <- genData(100, def) -#' +#' # Ungrouped data +#' +#' cMat <- genCorMat(nvars = 4, rho = .2, corstr = "ar1", nclusters = 1) +#' +#' def <- +#' defData(varname = "xbase", formula = 5, variance = .4, dist = "gamma") |> +#' defData(varname = "lambda", formula = ".5 + .1*xbase", dist = "nonrandom", link = "log") |> +#' defData(varname = "n", formula = 3, dist = "noZeroPoisson") +#' +#' dd <- genData(101, def, id = "cid") +#' +#' ## Specify with nvars, rho, and corstr +#' #' addCorGen( -#' dtOld = dt, idvar = "cid", nvars = 3, rho = .7, corstr = "cs", +#' dtOld = dd, idvar = "cid", nvars = 3, rho = .7, corstr = "cs", #' dist = "poisson", param1 = "lambda" #' ) -#' -#' # Long example -#' -#' def <- defData(varname = "xbase", formula = 5, variance = .4, dist = "gamma", id = "cid") -#' -#' def2 <- defDataAdd( -#' varname = "p", formula = "-3+.2*period + .3*xbase", -#' dist = "nonrandom", link = "logit" +#' +#' ## Specify with corMatrix +#' +#' addCorGen( +#' dtOld = dd, idvar = "cid", corMatrix = cMat, +#' dist = "poisson", param1 = "lambda" #' ) -#' -#' dt <- genData(100, def) -#' -#' dtLong <- addPeriods(dt, idvars = "cid", nPeriods = 3) -#' dtLong <- addColumns(def2, dtLong) +#' +#' # Grouped data +#' +#' cMats <- genCorMat(nvars = dd$n, rho = .5, corstr = "cs", nclusters = nrow(dd)) +#' +#' dx <- genCluster(dd, "cid", "n", "id") +#' +#' ## Specify with rho and corstr #' #' addCorGen( -#' dtOld = dtLong, idvar = "cid", nvars = NULL, rho = .7, corstr = "cs", -#' dist = "binary", param1 = "p" +#' dtOld = dx, idvar = "cid", rho = .8, corstr = "ar1", dist = "poisson", param1 = "xbase" #' ) -#' +#' +#' ## Specify
with corMatrix +#' +#' addCorGen( +#' dtOld = dx, idvar = "cid", corMatrix = cMats, dist = "poisson", param1 = "xbase" +#' ) +#' #' @concept correlated #' @export #' @md diff --git a/man/addCorGen.Rd b/man/addCorGen.Rd index 3cb625c8..b692e10e 100644 --- a/man/addCorGen.Rd +++ b/man/addCorGen.Rd @@ -103,35 +103,47 @@ covariance matrix in the list will have dimensions new data will be returned in \emph{long} format (i.e., one new column only). } \examples{ -# Wide example +# Ungrouped data -def <- defData(varname = "xbase", formula = 5, variance = .4, dist = "gamma", id = "cid") -def <- defData(def, varname = "lambda", formula = ".5 + .1*xbase", dist = "nonrandom", link = "log") +cMat <- genCorMat(nvars = 4, rho = .2, corstr = "ar1", nclusters = 1) -dt <- genData(100, def) +def <- + defData(varname = "xbase", formula = 5, variance = .4, dist = "gamma") |> + defData(varname = "lambda", formula = ".5 + .1*xbase", dist = "nonrandom", link = "log") |> + defData(varname = "n", formula = 3, dist = "noZeroPoisson") + +dd <- genData(101, def, id = "cid") + +## Specify with nvars, rho, and corstr addCorGen( - dtOld = dt, idvar = "cid", nvars = 3, rho = .7, corstr = "cs", + dtOld = dd, idvar = "cid", nvars = 3, rho = .7, corstr = "cs", dist = "poisson", param1 = "lambda" ) -# Long example - -def <- defData(varname = "xbase", formula = 5, variance = .4, dist = "gamma", id = "cid") +## Specify with corMatrix -def2 <- defDataAdd( - varname = "p", formula = "-3+.2*period + .3*xbase", - dist = "nonrandom", link = "logit" +addCorGen( + dtOld = dd, idvar = "cid", corMatrix = cMat, + dist = "poisson", param1 = "lambda" ) -dt <- genData(100, def) +# Grouped data + +cMats <- genCorMat(nvars = dd$n, rho = .5, corstr = "cs", nclusters = nrow(dd)) + +dx <- genCluster(dd, "cid", "n", "id") + +## Specify with rho and corstr + +addCorGen( + dtOld = dx, idvar = "cid", rho = .8, corstr = "ar1", dist = "poisson", param1 = "xbase" +) -dtLong <- addPeriods(dt, idvars = "cid", nPeriods =
3) -dtLong <- addColumns(def2, dtLong) +## Specify with corMatrix addCorGen( - dtOld = dtLong, idvar = "cid", nvars = NULL, rho = .7, corstr = "cs", - dist = "binary", param1 = "p" + dtOld = dx, idvar = "cid", corMatrix = cMats, dist = "poisson", param1 = "xbase" ) } From 053b84b3406108aca780c0358237e765e61dc24e Mon Sep 17 00:00:00 2001 From: kgoldfeld Date: Sun, 9 Jun 2024 22:34:14 -0400 Subject: [PATCH 6/6] Update R/add_correlated_data.R Co-authored-by: Jacob Wujciak-Jens --- R/add_correlated_data.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/add_correlated_data.R b/R/add_correlated_data.R index 77af5906..1dcbcc86 100644 --- a/R/add_correlated_data.R +++ b/R/add_correlated_data.R @@ -597,7 +597,6 @@ addCorGen <- function(dtOld, nvars=NULL, idvar = "id", rho=NULL, corstr=NULL, co } dX <- dtTemp[, list(.id, seq_, .XX)] - } else if (method == "ep") { if (is.list(corMatrix)) {