stan-dev · jgabry · Feb 15, 2024 · Dec 28, 2023 · Jan 22, 2024 · Jan 24, 2024
diff --git a/R/crps.R b/R/crps.R
@@ -112,7 +112,7 @@ loo_crps.matrix <-
            log_lik,
            ...,
            permutations = 1,
-           r_eff = NULL,
+           r_eff = 1,
            cores = getOption("mc.cores", 1)) {
   validate_crps_input(x, x2, y, log_lik)
   repeats <- replicate(permutations,
@@ -154,7 +154,7 @@ loo_scrps.matrix <-
     log_lik,
     ...,
     permutations = 1,
-    r_eff = NULL,
+    r_eff = 1,
     cores = getOption("mc.cores", 1)) {
   validate_crps_input(x, x2, y, log_lik)
   repeats <- replicate(permutations,
@@ -175,7 +175,7 @@ EXX_compute <- function(x, x2) {
 }
 
 
-EXX_loo_compute <- function(x, x2, log_lik, r_eff = NULL, ...) {
+EXX_loo_compute <- function(x, x2, log_lik, r_eff = 1, ...) {
   S <- nrow(x)
   shuffle <- sample (1:S)
   x2 <- x2[shuffle,]

diff --git a/R/diagnostics.R b/R/diagnostics.R
diff --git a/R/effective_sample_sizes.R b/R/effective_sample_sizes.R
@@ -175,7 +175,6 @@ psis_n_eff <- function(w, ...) {
 psis_n_eff.default <- function(w, r_eff = NULL, ...) {
   ss <- sum(w^2)
   if (is.null(r_eff)) {
-    warning("PSIS n_eff not adjusted based on MCMC n_eff.", call. = FALSE)
     return(1 / ss)
   }
   stopifnot(length(r_eff) == 1)
@@ -186,11 +185,11 @@ psis_n_eff.default <- function(w, r_eff = NULL, ...) {
 psis_n_eff.matrix <- function(w, r_eff = NULL, ...) {
   ss <- colSums(w^2)
   if (is.null(r_eff)) {
-    warning("PSIS n_eff not adjusted based on MCMC n_eff.", call. = FALSE)
     return(1 / ss)
   }
-  if (length(r_eff) != length(ss))
-    stop("r_eff must have length ncol(w).", call. = FALSE)
+  if (length(r_eff) != length(ss) && length(r_eff) != 1) {
+    stop("r_eff must have length 1 or ncol(w).", call. = FALSE)
+  }
   1 / ss * r_eff
 }
 

diff --git a/R/gpdfit.R b/R/gpdfit.R
@@ -81,7 +81,7 @@ adjust_k_wip <- function(k, n) {
 }
 
 
-#' Inverse CDF of generalized pareto distribution
+#' Inverse CDF of generalized Pareto distribution
 #' (assuming location parameter is 0)
 #'
 #' @noRd

diff --git a/R/importance_sampling.R b/R/importance_sampling.R
@@ -19,7 +19,7 @@ importance_sampling <- function(log_ratios, method, ...) {
 importance_sampling.array <-
   function(log_ratios, method,
            ...,
-           r_eff = NULL,
+           r_eff = 1,
            cores = getOption("mc.cores", 1)) {
     cores <- loo_cores(cores)
     stopifnot(length(dim(log_ratios)) == 3)
@@ -36,7 +36,7 @@ importance_sampling.array <-
 importance_sampling.matrix <-
   function(log_ratios, method,
            ...,
-           r_eff = NULL,
+           r_eff = 1,
            cores = getOption("mc.cores", 1)) {
     cores <- loo_cores(cores)
     assert_importance_sampling_method_is_implemented(method)
@@ -49,7 +49,7 @@ importance_sampling.matrix <-
 #' @inheritParams psis
 #' @export
 importance_sampling.default <-
-  function(log_ratios, method, ..., r_eff = NULL) {
+  function(log_ratios, method, ..., r_eff = 1) {
     stopifnot(is.null(dim(log_ratios)) || length(dim(log_ratios)) == 1)
     assert_importance_sampling_method_is_implemented(method)
     dim(log_ratios) <- c(length(log_ratios), 1)
@@ -128,7 +128,7 @@ implemented_is_methods <- function() c("psis", "tis", "sis")
 #'   but unnormalized.
 #' @param pareto_k Vector of GPD k estimates.
 #' @param tail_len Vector of tail lengths used to fit GPD.
-#' @param r_eff Vector of relative MCMC n_eff for `exp(log lik)`
+#' @param r_eff Vector of relative MCMC ESS (n_eff) for `exp(log lik)`
 #' @template is_method
 #' @return A list of class `"psis"` with structure described in the main doc at
 #'   the top of this file.
@@ -153,7 +153,7 @@ importance_sampling_object <-
     out <- structure(
       list(
         log_weights = unnormalized_log_weights,
-        diagnostics = list(pareto_k = pareto_k, n_eff = NULL)
+        diagnostics = list(pareto_k = pareto_k, n_eff = NULL, r_eff = r_eff)
       ),
       # attributes
       norm_const_log = norm_const_log,
@@ -184,6 +184,7 @@ do_importance_sampling <- function(log_ratios, r_eff, cores, method) {
   assert_importance_sampling_method_is_implemented(method)
   N <- ncol(log_ratios)
   S <- nrow(log_ratios)
+  k_threshold <- ps_khat_threshold(S)
   tail_len <- n_pareto(r_eff, S)
 
   if (method == "psis") {
@@ -223,7 +224,7 @@ do_importance_sampling <- function(log_ratios, r_eff, cores, method) {
 
   log_weights <- psis_apply(lw_list, "log_weights", fun_val = numeric(S))
   pareto_k <- psis_apply(lw_list, "pareto_k")
-  throw_pareto_warnings(pareto_k)
+  throw_pareto_warnings(pareto_k, k_threshold)
 
   importance_sampling_object(
     unnormalized_log_weights = log_weights,

diff --git a/R/loo-glossary.R b/R/loo-glossary.R
@@ -3,6 +3,7 @@
 #' @name loo-glossary
 #'
 #' @template loo-and-psis-references
+#' @template loo-uncertainty-reference
 #' @template bayesvis-reference
 #'
 #' @description
@@ -38,7 +39,8 @@
 #' estimate is an accurate estimate for the scale, it ignores the skewness. When
 #' making model comparisons, the SE of the component-wise (pairwise) differences
 #' should be used instead (see the `se_diff` section below and Eq 24 in
-#' VGG2017).
+#' VGG2017). Sivula et al. (2022) discuss the conditions under which the
+#' normal approximation used for SE and `se_diff` is good.
 #'
 #' @section Monte Carlo SE of elpd_loo:
 #'
@@ -62,41 +64,73 @@
 #'
 #' @section Pareto k estimates:
 #'
-#' The Pareto `k` estimate is a diagnostic for Pareto smoothed importance
+#' The Pareto \eqn{k} estimate is a diagnostic for Pareto smoothed importance
 #' sampling (PSIS), which is used to compute components of `elpd_loo`. In
-#' importance-sampling LOO (the full posterior distribution is used as the
-#' proposal distribution). The Pareto k diagnostic estimates how far an
+#' importance-sampling LOO the full posterior distribution is used as the
+#' proposal distribution. The Pareto \eqn{k} diagnostic estimates how far an
 #' individual leave-one-out distribution is from the full distribution. If
 #' leaving out an observation changes the posterior too much then importance
-#' sampling is not able to give reliable estimate. If `k<0.5`, then the
-#' corresponding component of `elpd_loo` is estimated with high accuracy.
-#' If `0.5<k<0.7` the accuracy is lower, but still ok. If `k>0.7`,
-#' then importance sampling is not able to provide useful estimate for that
-#' component/observation. Pareto k is also useful as a measure of influence of
-#' an observation. Highly influential observations have high k values. Very high
-#' k values often indicate model misspecification, outliers or mistakes in data
-#' processing. See Section 6 of Gabry et al. (2019) for an example.
+#' sampling is not able to give a reliable estimate. Pareto smoothing stabilizes
+#' importance sampling and guarantees a finite variance estimate at the
+#' cost of some bias.
+#'
+#' The diagnostic threshold for Pareto \eqn{k} depends on sample size
+#' \eqn{S} (a sample size dependent threshold was introduced by Vehtari
+#' et al., 2022, and before that fixed thresholds of 0.5 and 0.7 were
+#' recommended). For simplicity, the `loo` package uses the nominal sample
+#' size \eqn{S} when computing the sample size specific
+#' threshold. This provides an optimistic threshold if the effective
+#' sample size is less than 2200, but even then if ESS/S > 1/2 the difference
+#' is usually negligible. Thinning of MCMC draws can be used to improve
+#' the ratio ESS/S.
+#'
+#' * If \eqn{k < min(1 - 1 / log10(S), 0.7)}, where \eqn{S} is the
+#'   sample size, the PSIS estimate and the corresponding Monte
+#'   Carlo standard error estimate are reliable.
+#'
+#' * If \eqn{1 - 1 / log10(S) <= k < 0.7}, the PSIS estimate and the
+#'   corresponding Monte Carlo standard error estimate are not
+#'   reliable, but increasing the (effective) sample size \eqn{S} above
+#'   2200 may help (this will increase the sample size specific
+#'   threshold \eqn{1 - 1 / log10(2200) > 0.7} and then the bias specific
+#'   threshold 0.7 dominates).
+#'
+#' * If \eqn{0.7 <= k < 1}, the PSIS estimate and the corresponding Monte
+#'   Carlo standard error have large bias and are not reliable. Increasing
+#'   the sample size may reduce the variability in the \eqn{k} estimate, which
+#'   may also result in a lower \eqn{k} estimate.
+#'
+#' * If \eqn{k \geq 1}{k >= 1}, the target distribution is estimated to
+#'   have non-finite mean. The PSIS estimate and the corresponding Monte
+#'   Carlo standard error are not well defined. Increasing the sample size
+#'   may reduce the variability in the \eqn{k} estimate, which may also
+#'   result in a lower \eqn{k} estimate.
+#'
+#' Pareto \eqn{k} is also useful as a measure of influence of an
+#' observation.  Highly influential observations have high \eqn{k}
+#' values. Very high \eqn{k} values often indicate model
+#' misspecification, outliers or mistakes in data processing. See
+#' Section 6 of Gabry et al. (2019) for an example.
 #'
 #' \subsection{Interpreting `p_loo` when Pareto `k` is large}{
-#' If `k > 0.7` then we can also look at the `p_loo` estimate for
-#' some additional information about the problem:
+#' If \eqn{k > 0.7} then we can also look at
+#' the `p_loo` estimate for some additional information about the problem:
 #'
-#' \itemize{
-#' \item If `p_loo << p` (the total number of parameters in the model),
+#' * If `p_loo << p` (the total number of parameters in the model),
 #' then the model is likely to be misspecified. Posterior predictive checks
 #' (PPCs) are then likely to also detect the problem. Try using an overdispersed
 #' model, or add more structural information (nonlinearity, mixture model,
 #' etc.).
 #'
-#' \item If `p_loo < p` and the number of parameters `p` is relatively
+#' * If `p_loo < p` and the number of parameters `p` is relatively
 #' large compared to the number of observations (e.g., `p>N/5`), it is
 #' likely that the model is so flexible or the population prior so weak that it’s
 #' difficult to predict the left out observation (even for the true model).
 #' This happens, for example, in the simulated 8 schools (in VGG2017), random
 #' effect models with a few observations per random effect, and Gaussian
 #' processes and spatial models with short correlation lengths.
 #'
-#' \item If `p_loo > p`, then the model is likely to be badly misspecified.
+#' * If `p_loo > p`, then the model is likely to be badly misspecified.
 #' If the number of parameters `p<<N`, then PPCs are also likely to detect the
 #' problem. See the case study at
 #' <https://avehtari.github.io/modelselection/roaches.html> for an example.
@@ -106,7 +140,6 @@
 #' may have few observations and other groups many), it is possible that PPCs won't
 #' detect the problem.
 #' }
-#' }
 #'
 #' @section elpd_diff:
 #' `elpd_diff` is the difference in `elpd_loo` for two models. If more

diff --git a/R/loo-package.R b/R/loo-package.R
@@ -13,7 +13,7 @@
 #' *Stan Development Team*
 #'
 #' This package implements the methods described in Vehtari, Gelman, and
-#' Gabry (2017), Vehtari, Simpson, Gelman, Yao, and Gabry (2019), and
+#' Gabry (2017), Vehtari, Simpson, Gelman, Yao, and Gabry (2022), and
 #' Yao et al. (2018). To get started see the **loo** package
 #' [vignettes](https://mc-stan.org/loo/articles/index.html), the
 #' [loo()] function for efficient approximate leave-one-out
@@ -33,7 +33,7 @@
 #'   fast and stable computations for approximate LOO-CV laid out in Vehtari,
 #'   Gelman, and Gabry (2017). From existing posterior simulation draws, we
 #'   compute LOO-CV using Pareto smoothed importance sampling (PSIS; Vehtari,
-#'   Simpson, Gelman, Yao, and Gabry, 2019), a new procedure for stabilizing
+#'   Simpson, Gelman, Yao, and Gabry, 2022), a new procedure for stabilizing
 #'   and diagnosing importance weights. As a byproduct of our calculations,
 #'   we also obtain approximate standard errors for estimated predictive
 #'   errors and for comparing of predictive errors between two models.