From 1bb2d4db8b6e44ac99db2d6e625a92cc3e44fe6c Mon Sep 17 00:00:00 2001 From: Keith Goldfeld Date: Tue, 28 May 2024 16:22:17 -0400 Subject: [PATCH 1/2] Adding genDataDensity and addDataDensity --- NAMESPACE | 2 ++ R/add_data.R | 57 +++++++++++++++++++++++++++++++++++++++++-- R/generate_data.R | 31 +++++++++++++++++++++++ man/addColumns.Rd | 4 +-- man/addDataDensity.Rd | 38 +++++++++++++++++++++++++++++ man/genDataDensity.Rd | 35 ++++++++++++++++++++++++++ 6 files changed, 163 insertions(+), 4 deletions(-) create mode 100644 man/addDataDensity.Rd create mode 100644 man/genDataDensity.Rd diff --git a/NAMESPACE b/NAMESPACE index 05ecad47..deedabd9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,7 @@ export(addCondition) export(addCorData) export(addCorFlex) export(addCorGen) +export(addDataDensity) export(addMarkov) export(addMultiFac) export(addPeriods) @@ -34,6 +35,7 @@ export(genCorGen) export(genCorMat) export(genCorOrdCat) export(genData) +export(genDataDensity) export(genDummy) export(genFactor) export(genFormula) diff --git a/R/add_data.R b/R/add_data.R index 9b0440a9..39af65b1 100644 --- a/R/add_data.R +++ b/R/add_data.R @@ -1,7 +1,7 @@ #' Add columns to existing data set #' -#' @param dtDefs name of definitions for added columns -#' @param dtOld name of data table that is to be updated +#' @param dtDefs Name of definitions for added columns +#' @param dtOld Name of data table that is to be updated #' @param envir Environment the data definitions are evaluated in. #' Defaults to [base::parent.frame]. #' @return an updated data.table that contains the added simulated data @@ -523,3 +523,56 @@ addSynthetic <- function(dtOld, dtFrom, dS[] } + +#' @title Add data from a density defined by a vector of integers +#' @description Data are generated from an a density defined by a vector of integers. +#' @param dtOld Name of data table that is to be updated. +#' @param dataDist Vector that defines the desired density. +#' @param varname Name of variable name. +#' @param uselimits Indicator to use minimum and maximum of input data vector as +#' limits for sampling. Defaults to FALSE, in which case a smoothed density that +#' extends beyond the limits is used. +#' @param id A string specifying the field that serves as the record id. The +#' default field is "id". +#' @return A data table with the generated data. +#' @examples +#' def <- defData(varname = "x1", formula = 5, dist = "poisson") +#' +#' data_dist <- data_dist <- c(1, 2, 2, 3, 4, 4, 4, 5, 6, 6, 7, 7, 7, 8, 9, 10, 10) +#' +#' dd <- genData(500, def) +#' dd <- addDataDensity(dd, data_dist, varname = "x2") +#' dd <- addDataDensity(dd, data_dist, varname = "x3", uselimits = TRUE) +#' @export +#' @concept generate_data +#' +#' +addDataDensity <- function(dtOld, dataDist, varname, uselimits = FALSE) { + + assertNotMissing(dtOld = missing(dtOld), dataDist = missing(dataDist), varname = missing(varname)) + assertClass(dtOld = dtOld, class = "data.table") + + assertNotInDataTable(varname, dtOld) + + dataDist <- round(dataDist, 0) + + if (uselimits) { + density_est <- density(dataDist, n = 10000, from = min(data_dist), to = max(data_dist)) + } else { + density_est <- density(dataDist, n = 10000) + } + + x <- density_est$x + y <- density_est$y + + # Normalize the density values to create a probability distribution + + probabilities <- y / sum(y) + + # Sample from the x values according to the probabilities + + .x <- sample(x, size = nrow(dtOld), replace = TRUE, prob = probabilities) + + dtOld[, (varname) := .x] + dtOld[] +} diff --git a/R/generate_data.R b/R/generate_data.R index f2b7e5b1..9de4f869 100644 --- a/R/generate_data.R +++ b/R/generate_data.R @@ -1175,3 +1175,34 @@ genSynthetic <- function(dtFrom, n = nrow(dtFrom), } +#' @title Generate data from a density defined by a vector of integers +#' @description Data are generated from an a density defined by a vector of integers +#' @param n Number of samples to draw from the density. +#' @param dataDist Vector that defines the desired density +#' @param varname Name of variable name +#' @param uselimits Indicator to use minimum and maximum of input data vector as +#' limits for sampling. Defaults to FALSE, in which case a smoothed density that +#' extends beyond the limits is used. +#' @param id A string specifying the field that serves as the record id. The +#' default field is "id". +#' @return A data table with the generated data +#' @examples +#' data_dist <- data_dist <- c(1, 2, 2, 3, 4, 4, 4, 5, 6, 6, 7, 7, 7, 8, 9, 10, 10) +#' +#' genDataDensity(500, data_dist, varname = "x1", id = "id") +#' genDataDensity(500, data_dist, varname = "x1", uselimits = TRUE, id = "id") +#' @export +#' @concept generate_data + +genDataDensity <- function(n, dataDist, varname, uselimits = FALSE, id = "id") { + + assertNotMissing(n = missing(n), dataDist = missing(dataDist), varname = missing(varname)) + + dataDist <- round(dataDist, 0) + + .dd <- genData(n, id = id) + addDataDensity(.dd, dataDist, varname, uselimits)[] + +} + + diff --git a/man/addColumns.Rd b/man/addColumns.Rd index b1d41825..4a24d3e4 100644 --- a/man/addColumns.Rd +++ b/man/addColumns.Rd @@ -7,9 +7,9 @@ addColumns(dtDefs, dtOld, envir = parent.frame()) } \arguments{ -\item{dtDefs}{name of definitions for added columns} +\item{dtDefs}{Name of definitions for added columns} -\item{dtOld}{name of data table that is to be updated} +\item{dtOld}{Name of data table that is to be updated} \item{envir}{Environment the data definitions are evaluated in. Defaults to \link[base:sys.parent]{base::parent.frame}.} diff --git a/man/addDataDensity.Rd b/man/addDataDensity.Rd new file mode 100644 index 00000000..e3cde13b --- /dev/null +++ b/man/addDataDensity.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/add_data.R +\name{addDataDensity} +\alias{addDataDensity} +\title{Add data from a density defined by a vector of integers} +\usage{ +addDataDensity(dtOld, dataDist, varname, uselimits = FALSE) +} +\arguments{ +\item{dtOld}{Name of data table that is to be updated.} + +\item{dataDist}{Vector that defines the desired density.} + +\item{varname}{Name of variable name.} + +\item{uselimits}{Indicator to use minimum and maximum of input data vector as +limits for sampling. Defaults to FALSE, in which case a smoothed density that +extends beyond the limits is used.} + +\item{id}{A string specifying the field that serves as the record id. The +default field is "id".} +} +\value{ +A data table with the generated data. +} +\description{ +Data are generated from an a density defined by a vector of integers. +} +\examples{ +def <- defData(varname = "x1", formula = 5, dist = "poisson") + +data_dist <- data_dist <- c(1, 2, 2, 3, 4, 4, 4, 5, 6, 6, 7, 7, 7, 8, 9, 10, 10) + +dd <- genData(500, def) +dd <- addDataDensity(dd, data_dist, varname = "x2") +dd <- addDataDensity(dd, data_dist, varname = "x3", uselimits = TRUE) +} +\concept{generate_data} diff --git a/man/genDataDensity.Rd b/man/genDataDensity.Rd new file mode 100644 index 00000000..36ebbeb4 --- /dev/null +++ b/man/genDataDensity.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/generate_data.R +\name{genDataDensity} +\alias{genDataDensity} +\title{Generate data from a density defined by a vector of integers} +\usage{ +genDataDensity(n, dataDist, varname, uselimits = FALSE, id = "id") +} +\arguments{ +\item{n}{Number of samples to draw from the density.} + +\item{dataDist}{Vector that defines the desired density} + +\item{varname}{Name of variable name} + +\item{uselimits}{Indicator to use minimum and maximum of input data vector as +limits for sampling. Defaults to FALSE, in which case a smoothed density that +extends beyond the limits is used.} + +\item{id}{A string specifying the field that serves as the record id. The +default field is "id".} +} +\value{ +A data table with the generated data +} +\description{ +Data are generated from an a density defined by a vector of integers +} +\examples{ +data_dist <- data_dist <- c(1, 2, 2, 3, 4, 4, 4, 5, 6, 6, 7, 7, 7, 8, 9, 10, 10) + +genDataDensity(500, data_dist, varname = "x1", id = "id") +genDataDensity(500, data_dist, varname = "x1", uselimits = TRUE, id = "id") +} +\concept{generate_data} From d1a2737471a5b252dc5a5637f5bf255e455ec320 Mon Sep 17 00:00:00 2001 From: Keith Goldfeld Date: Wed, 29 May 2024 13:09:48 -0400 Subject: [PATCH 2/2] Adding tests and changes to pass checks --- R/add_data.R | 6 +- man/addDataDensity.Rd | 3 - tests/testthat/test-generate_data.R | 105 ++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 7 deletions(-) diff --git a/R/add_data.R b/R/add_data.R index 39af65b1..2092cd85 100644 --- a/R/add_data.R +++ b/R/add_data.R @@ -532,8 +532,6 @@ addSynthetic <- function(dtOld, dtFrom, #' @param uselimits Indicator to use minimum and maximum of input data vector as #' limits for sampling. Defaults to FALSE, in which case a smoothed density that #' extends beyond the limits is used. -#' @param id A string specifying the field that serves as the record id. The -#' default field is "id". #' @return A data table with the generated data. #' @examples #' def <- defData(varname = "x1", formula = 5, dist = "poisson") @@ -557,9 +555,9 @@ addDataDensity <- function(dtOld, dataDist, varname, uselimits = FALSE) { dataDist <- round(dataDist, 0) if (uselimits) { - density_est <- density(dataDist, n = 10000, from = min(data_dist), to = max(data_dist)) + density_est <- stats::density(dataDist, n = 10000, from = min(dataDist), to = max(dataDist)) } else { - density_est <- density(dataDist, n = 10000) + density_est <- stats::density(dataDist, n = 10000) } x <- density_est$x diff --git a/man/addDataDensity.Rd b/man/addDataDensity.Rd index e3cde13b..1d2beee3 100644 --- a/man/addDataDensity.Rd +++ b/man/addDataDensity.Rd @@ -16,9 +16,6 @@ addDataDensity(dtOld, dataDist, varname, uselimits = FALSE) \item{uselimits}{Indicator to use minimum and maximum of input data vector as limits for sampling. Defaults to FALSE, in which case a smoothed density that extends beyond the limits is used.} - -\item{id}{A string specifying the field that serves as the record id. The -default field is "id".} } \value{ A data table with the generated data. diff --git a/tests/testthat/test-generate_data.R b/tests/testthat/test-generate_data.R index a3fbdafa..49ebe8a0 100644 --- a/tests/testthat/test-generate_data.R +++ b/tests/testthat/test-generate_data.R @@ -886,3 +886,108 @@ test_that("logistiCoefs works", { }) +#addDataDensity and genDataDensity + +test_that("addDataDensity works", { + + skip_on_cran() + + f <- function(data_dist) { + + def <- defData(varname = "x1", formula = 5, dist = "poisson") + + dd <- genData(10000, def) + dd <- addDataDensity(dd, data_dist, varname = "x2", uselimits = TRUE) + + dd[] + + } + + compare <- function() { + + ints <- rpois(50, rpois(1, 8)) + dx <- f(ints) + suppressWarnings(ks.test(dx$x2, ints))$p.value + + } + + kstest <- mean(sapply(1:200, function(x) compare()) < .05) + expect_lt(kstest, 0.05) + + + f2 <- function(data_dist) { + + def <- defData(varname = "x1", formula = 5, dist = "poisson") + + dd <- genData(10000, def) + dd <- genData(10000, def) + dd <- addDataDensity(dd, data_dist, varname = "x2") + + dd[] + + } + + compare2 <- function() { + + ints <- rpois(50, rpois(1, 8)) + dx <- f2(ints) + p.tails <- dx[, mean(x2 <= min(ints) | x2 >= max(ints))] + p.value <- suppressWarnings(ks.test(dx$x2, ints))$p.value + + data.table::data.table(p.tails, p.value) + + } + + dp <- data.table::rbindlist(lapply(1:200, function(x) compare2())) + + expect_lte(dp[, round(mean(p.tails), 2)], 0.05) + expect_lt(dp[, mean(p.value <= .05)], 0.05) + + + +}) + +test_that("genDataDensity works", { + + skip_on_cran() + + f <- function(data_dist) { + + dd <- genDataDensity(10000, data_dist, varname = "x1", uselimits = TRUE) + dd[] + + } + + compare <- function() { + + ints <- rpois(50, rpois(1, 8)) + dx <- f(ints) + suppressWarnings(ks.test(dx$x1, ints))$p.value + + } + + kstest <- mean(sapply(1:200, function(x) compare()) < .05) + expect_lt(kstest, 0.05) + +}) + +test_that("genDataDensity and addDataDensity throws errors", { + skip_on_cran() + + dd <- genData(10) + ddist <- rpois( 50, 5 ) + expect_error(genDataDensity(dtOld = dd, distData = ddist, newvar = "weight")) + expect_error(genDataDensity(5, distData = xdist, newvar = "weight")) + expect_error(genDataDensity(5, distData = ddist)) + + def <- defData(varname = "x1", formula = 5, dist = "poisson") + dd <- genData(10, def) + + expect_error(addDataDensity(dx, ddist, varname = "x2")) + expect_error(addDataDensity(dd, xdist, varname = "x2")) + expect_error(addDataDensity(dd, ddist, varname = "x1")) + expect_error(addDataDensity(dd, ddist)) + expect_error(addDataDensity(5, ddist, varname = "x2")) +}) + +