Skip to content

Commit

Permalink
Merge pull request #227 from kgoldfeld/88-create-a-rejection-sampling…
Browse files Browse the repository at this point in the history
…-function-to-sample-from-non-standard-distributions

88 create a rejection sampling function to sample from non standard distributions
  • Loading branch information
kgoldfeld authored May 30, 2024
2 parents 1bdef31 + d1a2737 commit ff26538
Show file tree
Hide file tree
Showing 7 changed files with 263 additions and 4 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ export(addCondition)
export(addCorData)
export(addCorFlex)
export(addCorGen)
export(addDataDensity)
export(addMarkov)
export(addMultiFac)
export(addPeriods)
Expand Down Expand Up @@ -34,6 +35,7 @@ export(genCorGen)
export(genCorMat)
export(genCorOrdCat)
export(genData)
export(genDataDensity)
export(genDummy)
export(genFactor)
export(genFormula)
Expand Down
55 changes: 53 additions & 2 deletions R/add_data.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#' Add columns to existing data set
#'
#' @param dtDefs name of definitions for added columns
#' @param dtOld name of data table that is to be updated
#' @param dtDefs Name of definitions for added columns
#' @param dtOld Name of data table that is to be updated
#' @param envir Environment the data definitions are evaluated in.
#' Defaults to [base::parent.frame].
#' @return an updated data.table that contains the added simulated data
Expand Down Expand Up @@ -523,3 +523,54 @@ addSynthetic <- function(dtOld, dtFrom,
dS[]

}

#' @title Add data from a density defined by a vector of integers
#' @description Data are generated from an a density defined by a vector of integers.
#' @param dtOld Name of data table that is to be updated.
#' @param dataDist Vector that defines the desired density.
#' @param varname Name of variable name.
#' @param uselimits Indicator to use minimum and maximum of input data vector as
#' limits for sampling. Defaults to FALSE, in which case a smoothed density that
#' extends beyond the limits is used.
#' @return A data table with the generated data.
#' @examples
#' def <- defData(varname = "x1", formula = 5, dist = "poisson")
#'
#' data_dist <- data_dist <- c(1, 2, 2, 3, 4, 4, 4, 5, 6, 6, 7, 7, 7, 8, 9, 10, 10)
#'
#' dd <- genData(500, def)
#' dd <- addDataDensity(dd, data_dist, varname = "x2")
#' dd <- addDataDensity(dd, data_dist, varname = "x3", uselimits = TRUE)
#' @export
#' @concept generate_data
#'
#'
addDataDensity <- function(dtOld, dataDist, varname, uselimits = FALSE) {

assertNotMissing(dtOld = missing(dtOld), dataDist = missing(dataDist), varname = missing(varname))
assertClass(dtOld = dtOld, class = "data.table")

assertNotInDataTable(varname, dtOld)

dataDist <- round(dataDist, 0)

if (uselimits) {
density_est <- stats::density(dataDist, n = 10000, from = min(dataDist), to = max(dataDist))
} else {
density_est <- stats::density(dataDist, n = 10000)
}

x <- density_est$x
y <- density_est$y

# Normalize the density values to create a probability distribution

probabilities <- y / sum(y)

# Sample from the x values according to the probabilities

.x <- sample(x, size = nrow(dtOld), replace = TRUE, prob = probabilities)

dtOld[, (varname) := .x]
dtOld[]
}
31 changes: 31 additions & 0 deletions R/generate_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -1175,3 +1175,34 @@ genSynthetic <- function(dtFrom, n = nrow(dtFrom),

}

#' @title Generate data from a density defined by a vector of integers
#' @description Data are generated from an a density defined by a vector of integers
#' @param n Number of samples to draw from the density.
#' @param dataDist Vector that defines the desired density
#' @param varname Name of variable name
#' @param uselimits Indicator to use minimum and maximum of input data vector as
#' limits for sampling. Defaults to FALSE, in which case a smoothed density that
#' extends beyond the limits is used.
#' @param id A string specifying the field that serves as the record id. The
#' default field is "id".
#' @return A data table with the generated data
#' @examples
#' data_dist <- data_dist <- c(1, 2, 2, 3, 4, 4, 4, 5, 6, 6, 7, 7, 7, 8, 9, 10, 10)
#'
#' genDataDensity(500, data_dist, varname = "x1", id = "id")
#' genDataDensity(500, data_dist, varname = "x1", uselimits = TRUE, id = "id")
#' @export
#' @concept generate_data

genDataDensity <- function(n, dataDist, varname, uselimits = FALSE, id = "id") {

assertNotMissing(n = missing(n), dataDist = missing(dataDist), varname = missing(varname))

dataDist <- round(dataDist, 0)

.dd <- genData(n, id = id)
addDataDensity(.dd, dataDist, varname, uselimits)[]

}


4 changes: 2 additions & 2 deletions man/addColumns.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

35 changes: 35 additions & 0 deletions man/addDataDensity.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

35 changes: 35 additions & 0 deletions man/genDataDensity.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

105 changes: 105 additions & 0 deletions tests/testthat/test-generate_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -886,3 +886,108 @@ test_that("logistiCoefs works", {

})

#addDataDensity and genDataDensity

test_that("addDataDensity works", {

skip_on_cran()

f <- function(data_dist) {

def <- defData(varname = "x1", formula = 5, dist = "poisson")

dd <- genData(10000, def)
dd <- addDataDensity(dd, data_dist, varname = "x2", uselimits = TRUE)

dd[]

}

compare <- function() {

ints <- rpois(50, rpois(1, 8))
dx <- f(ints)
suppressWarnings(ks.test(dx$x2, ints))$p.value

}

kstest <- mean(sapply(1:200, function(x) compare()) < .05)
expect_lt(kstest, 0.05)


f2 <- function(data_dist) {

def <- defData(varname = "x1", formula = 5, dist = "poisson")

dd <- genData(10000, def)
dd <- genData(10000, def)
dd <- addDataDensity(dd, data_dist, varname = "x2")

dd[]

}

compare2 <- function() {

ints <- rpois(50, rpois(1, 8))
dx <- f2(ints)
p.tails <- dx[, mean(x2 <= min(ints) | x2 >= max(ints))]
p.value <- suppressWarnings(ks.test(dx$x2, ints))$p.value

data.table::data.table(p.tails, p.value)

}

dp <- data.table::rbindlist(lapply(1:200, function(x) compare2()))

expect_lte(dp[, round(mean(p.tails), 2)], 0.05)
expect_lt(dp[, mean(p.value <= .05)], 0.05)



})

test_that("genDataDensity works", {

skip_on_cran()

f <- function(data_dist) {

dd <- genDataDensity(10000, data_dist, varname = "x1", uselimits = TRUE)
dd[]

}

compare <- function() {

ints <- rpois(50, rpois(1, 8))
dx <- f(ints)
suppressWarnings(ks.test(dx$x1, ints))$p.value

}

kstest <- mean(sapply(1:200, function(x) compare()) < .05)
expect_lt(kstest, 0.05)

})

test_that("genDataDensity and addDataDensity throws errors", {
skip_on_cran()

dd <- genData(10)
ddist <- rpois( 50, 5 )
expect_error(genDataDensity(dtOld = dd, distData = ddist, newvar = "weight"))
expect_error(genDataDensity(5, distData = xdist, newvar = "weight"))
expect_error(genDataDensity(5, distData = ddist))

def <- defData(varname = "x1", formula = 5, dist = "poisson")
dd <- genData(10, def)

expect_error(addDataDensity(dx, ddist, varname = "x2"))
expect_error(addDataDensity(dd, xdist, varname = "x2"))
expect_error(addDataDensity(dd, ddist, varname = "x1"))
expect_error(addDataDensity(dd, ddist))
expect_error(addDataDensity(5, ddist, varname = "x2"))
})


0 comments on commit ff26538

Please sign in to comment.