insert 'stats::' and correct parameter spelling

AngusMcLure · Mar 28, 2024 · 0e6f7f3 · 0e6f7f3
1 parent a0fce8a
commit 0e6f7f3
Show file tree

Hide file tree

Showing 5 changed files with 172 additions and 17 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -4,9 +4,10 @@
 ^Presentations*
 ^InProgress*
 ^Figures*
-man/hello.Rd
+^R/detection.R
 ^LICENSE\.md$
 ^dev_resources$
 ^\.github$
 ^PoolPoweR\.Rproj$
 ^CHANGELOG.md$
+
diff --git a/InProgress/detection.R → InProgress/detection_dev.R b/InProgress/detection.R → InProgress/detection_dev.R
diff --git a/R/detection.R b/R/detection.R
@@ -0,0 +1,151 @@
+#' Power and sample size calculations for estimating population prevalence from
+#' pooled samples
+#'
+#' `power_pool()` calculates the statistical power of a pooled survey design to
+#' determine whether population prevalence is different from a threshold.
+#' `sample_size_pool()` calculate the sample size required for a pooled survey
+#' to achieve a specified power.
+#'
+#' @param sample_size numeric The total number of units across the whole sample.
+#'   Should be greater than `pool_size * pool_number`
+#' @param pool_size numeric The number of units per pool. Must be a numeric
+#'   value or vector of values greater than 0.
+#' @param pool_number numeric The number of pools per cluster. Must be a integer
+#'   value or a vector of integer values greater than or equal to 1.
+#' @param prevalence_null,prevlanece_alt numeric The proportion of units that
+#'   carry the marker of interest (i.e. true positive). `prevalence_null` is the
+#'   threshold to compare to and `prevlanece_alt` is the design prevalence. Must
+#'   be be a numeric value between 0 and 1, inclusive of both.
+#' @param correlation numeric The correlation between test results within a
+#'   single cluster (units in different clusters are assumed to be
+#'   uncorrelated). Must be a numeric value between 0 and 1, inclusive of both.
+#'   A value of 1 indicates that units within clusters are perfectly correlated
+#'   (there are no differences units within a single cluster). A value of 0
+#'   indicates that units within clusters are no more correlated than units in
+#'   different clusters.
+#' @param sensitivity numeric The probability that the test correctly identifies
+#'   a true positive. Must be a numeric value between 0 and 1, inclusive of
+#'   both. A value of 1 indicates that the test can perfectly identify all true
+#'   positives.
+#' @param specificity numeric The probability that the test correctly identifies
+#'   a true negative. Must be a numeric value between 0 and 1, inclusive of
+#'   both. A value of 1 indicates that the test can perfectly identify all true
+#'   negatives.
+#' @param sig_level numeric Signifigance level for statistical test. Defaults to
+#'   0.05. Must be strictly between 0 and 1.
+#' @param alternative string The kind of comparison to make. If "greater"
+#'   (default) or "less" computes power of tests for one-sided comparisons that
+#'   population prevalence (`prevalence_alt`) is greater than or less than the
+#'   reference on threshold prevalence (`prevalence_null`) respectively. If
+#'   "two.sided" computes power for a two-sided comparison (`power_pool()`
+#'   only).
+#' @param form string The distribution used to model the cluster-level
+#'   prevalence and correlation of units within cluster. Select one of "beta",
+#'   "logitnorm" or "cloglognorm". See details.
+#' @param link string Transformation to be applied to prevalence estimates for
+#'   the purposes of calculating confidence intervals. Options are 'identity'
+#'   (i.e. no transformation), 'logit' (default), 'cloglog' and 'log'.
+#'
+#'
+#' @return The statistical power of the proposed design with regards to
+#'   comparing prevalence to a threshold (`power_pool()`)
+#' @export
+#'
+#' @examples
+#' power_pool(sample_size = 1000, pool_size = 10, pool_number = 2,
+#'            prevalence_null = 0.01, prevalence_alt = 0.02)
+#'
+#' sample_size_pool(pool_size = 10, pool_number = 2,
+#'                  prevalence_null = 0.01, prevalence_alt = 0.02)
+#'
+#' power_pool(sample_size = 1000, pool_size = 10,
+#'            pool_number = 2, prevalence_null = 0.01,
+#'            prevalence_alt = 0.02, correlation = 0.01)
+#'
+#' sample_size_pool(pool_size = 10, pool_number = 2,
+#'                 prevalence_null = 0.01, prevalence_alt = 0.02,
+#'                 correlation = 0.01)
+
+
+
+
+
+detection_errors <- function(prevalence, pool_size,N,M,sensitivity, specificity, correlation,
+                             periods_per_location, periods_total,
+                             catch.mean, catch.dispersion,
+                             form = 'beta', link = NULL){
+  rho <- correlation
+
+  link <- switch(form, logitnorm = qlogis, cloglognorm = cloglog, beta = function(x){x})
+  invlink <- switch(form, logitnorm = plogis, cloglognorm = cloglog_inv, beta = function(x){x})
+
+  if(form %in% c('logitnorm', 'cloglognorm')){
+    pars <- mu_sigma_linknorm(prevalence,prevalence * (1- prevalence) * rho, link, invlink)
+    mu <- pars[1]
+    sigma <- pars[2]
+    density <- function(x){dnorm(x, mean = mu, sd = sigma)}
+  }
+  if(form == 'beta'){
+    Alpha <- prevalence * (rho^-1 -1)
+    Beta <- (1-prevalence) * (rho^-1 -1)
+    density <- function(x){dbeta(x,Alpha, Beta)}
+  }
+
+  if(missing(N) & missing(periods_per_location) & missing(periods_total)){
+    stop('One of the following must be provided:
+             N (the number of groups per location)
+             periods_per_location (the number of sampling periods per location)
+             periods_total (total the number of sampling periods across all locations)')
+  }
+  if(missing(N) & missing(periods_per_location)){
+    periods_per_location <- periods_total/M
+    if(periods_per_location%%1) warning('Inputs imply a fractional number of sampling periods per sampling location')
+  }
+
+  if(rho == 0){
+    if(missing(N)){ #Case where we assume random (negative binomial) catch sizes at each location
+      warning('For correlation = 0, a heirarchical/cluster survey design with M locations and p sampling periods per location is approximately equivalent a simple random survey with p*M sampling periods per location')
+      const <- catch.dispersion/(catch.mean + catch.dispersion)
+      q <- (1 - (1 - sensitivity - specificity) * (1-prevalence)^pool_size - sensitivity) ^ (1/pool_size)
+      typeII <- (const/(1 - q * (1 - const)))^(M * periods_per_location * catch.dispersion)
+    }else{
+      warning('For correlation = 0, a heirarchical/cluster survey design with M locations and N groups per location is equivalent a simple random survey with N*M groups')
+      typeII <- (1 - (1 - sensitivity - specificity) * (1-prevalence)^pool_size - sensitivity)^(N*M)
+    }
+  }else{
+    if(missing(N)){ #Case where we assume random (negative binomial) catch sizes at each location
+      f <- function(x){
+        q <- (1 - (1 - sensitivity - specificity) * (1-invlink(x))^pool_size - sensitivity) ^ (1/pool_size)
+        density(x) *
+          (const/(1 - q * (1 - const)))^(periods_per_location * catch.dispersion)
+      }
+      typeI <- 1 - (const/(1 - specificity^(1/pool_size) * (1 - const)))^(periods_per_location * catch.dispersion * M)
+
+    }else{ #Case with fixed number of pools per site
+      f <- function(x){
+        density(x) *
+          (1 - (1 - sensitivity - specificity) * (1-invlink(x))^pool_size - sensitivity)^N
+      }
+    }
+    #typeI <-   1- specificity^(N * M)
+    typeI <- -expm1(log(specificity)*N * M) # equivalent to the above commented code, but more numerically stable for high specificity and large N*M
+  }
+
+  lb <- switch(form, logitnorm = -Inf, cloglognorm = -Inf, beta = 0)
+  ub <- switch(form, logitnorm = Inf, cloglognorm = Inf, beta = 1)
+
+
+  if(form == 'beta'){
+    if(missing(N)){stop('Have not implemented negative binomial sample size with form = beta. It has a nice closed form solution in terms of hypergeometric functions for the case with a perfec test. See paper notes')}
+    if(sensitivity ==1){
+      typeII <- exp((log(specificity) * N + lbeta(Alpha, Beta + pool_size * N) -  lbeta(Alpha, Beta)) * M)
+    }else{
+      z <- 0:N
+      summand <- (1-sensitivity)^N/beta(Alpha, Beta) * choose(N,z) * ((1-sensitivity - specificity)/(sensitivity - 1))^z * beta(Alpha, Beta + z * pool_size)
+      typeII <- sum(summand) ^ M
+    }
+  }else{
+    typeII <- stats::integrate(f, lb, ub)$value ^ M
+  }
+  list(typeI = typeI, typeII = typeII)
+}
diff --git a/R/power.R b/R/power.R
@@ -12,9 +12,9 @@
 #'   value or vector of values greater than 0.
 #' @param pool_number numeric The number of pools per cluster. Must be a integer
 #'   value or a vector of integer values greater than or equal to 1.
-#' @param prevalence_null,prevlanece_alt numeric The proportion of units that
+#' @param prevalence_null,prevalence_alt numeric The proportion of units that
 #'   carry the marker of interest (i.e. true positive). `prevalence_null` is the
-#'   threshold to compare to and `prevlanece_alt` is the design prevalence. Must
+#'   threshold to compare to and `prevalence_alt` is the design prevalence. Must
 #'   be be a numeric value between 0 and 1, inclusive of both.
 #' @param correlation numeric The correlation between test results within a
 #'   single cluster (units in different clusters are assumed to be
@@ -31,7 +31,8 @@
 #'   a true negative. Must be a numeric value between 0 and 1, inclusive of
 #'   both. A value of 1 indicates that the test can perfectly identify all true
 #'   negatives.
-#' @param sig_level numeric Signifigance level for statistical test. Defaults to
+#' @param power numeric The desired statistical power of the survey.
+#' @param sig_level numeric Significance level for statistical test. Defaults to
 #'   0.05. Must be strictly between 0 and 1.
 #' @param alternative string The kind of comparison to make. If "greater"
 #'   (default) or "less" computes power of tests for one-sided comparisons that
@@ -82,12 +83,12 @@ power_pool <- function(sample_size, pool_size, pool_number, prevalence_null, pre
 
   # if(real.scale & form %in% c('logitnorm', 'cloglognorm')){ g <- function(x){
   # #calculate mu from theta and rho .var <- correlation * x * (1-x)
-  # mu_sigma_linknorm(x,.var, link = switch(form, logitnorm = qlogis,
+  # mu_sigma_linknorm(x,.var, link = switch(form, logitnorm = stats::qlogis,
   # cloglognorm = cloglog), invlink = switch(form, logitnorm = plogis,
   # cloglognorm = cloglog_inv))[1] } gdivinv <- function(x){1}
   # }else{
   g <- switch(link,
-              logit = qlogis,
+              logit = stats::qlogis,
               cloglog = cloglog,
               log = log,
               identity = function(x){x})
@@ -119,10 +120,10 @@ power_pool <- function(sample_size, pool_size, pool_number, prevalence_null, pre
   #print(fi0)
 
   power <- switch(alternative,
-                  less = pnorm(((g(theta0) - g(thetaa))  - qnorm(1-sig_level)/sqrt(fi0)) * sqrt(fia)),
-                  greater = pnorm(((g(thetaa) - g(theta0))  - qnorm(1-sig_level)/sqrt(fi0)) * sqrt(fia)),
-                  two.sided = pnorm(((g(theta0) - g(thetaa))  - qnorm(1-sig_level/2)/sqrt(fi0)) * sqrt(fia)) +
-                    pnorm(((g(thetaa) - g(theta0))  - qnorm(1-sig_level/2)/sqrt(fi0)) * sqrt(fia)),
+                  less = stats::pnorm(((g(theta0) - g(thetaa))  - stats::qnorm(1-sig_level)/sqrt(fi0)) * sqrt(fia)),
+                  greater = stats::pnorm(((g(thetaa) - g(theta0))  - stats::qnorm(1-sig_level)/sqrt(fi0)) * sqrt(fia)),
+                  two.sided = stats::pnorm(((g(theta0) - g(thetaa))  - stats::qnorm(1-sig_level/2)/sqrt(fi0)) * sqrt(fia)) +
+                    stats::pnorm(((g(thetaa) - g(theta0))  - stats::qnorm(1-sig_level/2)/sqrt(fi0)) * sqrt(fia)),
                   stop('invalid alternative. options are less, greater, and two.sided')
   )
   power
@@ -134,7 +135,7 @@ power_pool <- function(sample_size, pool_size, pool_number, prevalence_null, pre
 sample_size_pool <- function(pool_size, pool_number,
                              prevalence_null, prevalence_alt,
                              correlation = 0, sensitivity = 1, specificity = 1,
-                             power = 0.8, sig.level = 0.05,
+                             power = 0.8, sig_level = 0.05,
                              alternative = 'greater',
                              form = 'beta',
                              link = 'logit'){
@@ -154,7 +155,7 @@ sample_size_pool <- function(pool_size, pool_number,
   }
 
   g <- switch(link,
-              logit = qlogis,
+              logit = stats::qlogis,
               cloglog = cloglog,
               log = log,
               identity = function(x){x})
@@ -183,5 +184,5 @@ sample_size_pool <- function(pool_size, pool_number,
                           form = form))[1,1]
 
   # Note that the below is correct for either kind of one-sided test, but not for two sided tests
-  ((qnorm(power)/sqrt(unit_fia) + qnorm(1 - sig.level)/sqrt(unit_fi0))/(g(theta0) - g(thetaa)))^2
+  ((stats::qnorm(power)/sqrt(unit_fia) + stats::qnorm(1 - sig_level)/sqrt(unit_fi0))/(g(theta0) - g(thetaa)))^2
 }
diff --git a/man/power_pool.Rd b/man/power_pool.Rd