Skip to content

Commit

Permalink
moved model comparison with LOO back to lecture 9
Browse files Browse the repository at this point in the history
  • Loading branch information
avehtari committed Nov 4, 2024
1 parent d2e85a7 commit bd61bde
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 324 deletions.
Binary file modified slides/BDA_lecture_8b.pdf
Binary file not shown.
326 changes: 2 additions & 324 deletions slides/BDA_lecture_8b.tex
Original file line number Diff line number Diff line change
Expand Up @@ -319,10 +319,10 @@
\item See also \url{https://users.aalto.fi/~ave/modelselection/CV-FAQ.html}

\end{list2}
\item 7.3 Model comparison based on predictive performance\\
\end{itemize}
Next week
\begin{itemize}
\item 7.3 Model comparison based on predictive performance\\
\item 7.4 Model comparison using Bayes factors\\
\item 7.5 Continuous model expansion / sensitivity analysis
\item {\color{gray}7.5 Example (may be skipped)}
Expand Down Expand Up @@ -1149,330 +1149,7 @@
\end{list2}}
\end{frame}
\begin{frame}{Model comparison and selection}
% Roadmap slide: splits the model-comparison material between this
% lecture (elpd_diff with its SE) and the next one.
Today
\begin{list1}
\item Model comparison and selection (elpd\_diff, se)
\end{list1}
Next lecture
\begin{list1}
\item {\footnotesize Related methods (WAIC, *IC, BF)}
\item Hypothesis testing
\item Potential overfitting
\item Model expansion and averaging
\end{list1}
\end{frame}
\begin{frame}[fragile]{Student retention -- Posterior predictive distributions}
% Posterior predictive distributions for the two competing models;
% figures were produced elsewhere (with tidybayes, per the subtitle).
\framesubtitle{with \texttt{tidybayes}}
\vspace{-0.75\baselineskip}
Latent hierarchical linear model\\
\hspace{-7mm}
% Fixed-height minipages keep the two stacked figures aligned.
\begin{minipage}[t][3.6cm][t]{1.0\linewidth}
\includegraphics[height=3.6cm]{student_retention_lbinom_preds.pdf}
\end{minipage}
\vspace{-0.25\baselineskip}
Latent hierarchical linear model + spline\\
\hspace{-7mm}
\begin{minipage}[t][3.6cm][t]{1.0\linewidth}
\includegraphics[height=3.6cm]{student_retention_sbinom_preds.pdf}
\end{minipage}
\end{frame}
\begin{frame}[fragile]{Student retention -- Marginal PPC}
% Marginal posterior predictive (density overlay) checks for both
% models; the subtitle documents the call that generated the figures.
\framesubtitle{\texttt{pp\_check(fit, ndraws=100)}}
\vspace{-0.75\baselineskip}
Latent hierarchical linear model\\
\begin{minipage}[t][3.6cm][t]{1.0\linewidth}
\includegraphics[height=3.6cm]{student_retention_lbinom_ppc_dens_overlay.pdf}
\end{minipage}
\vspace{-0.5\baselineskip}
Latent hierarchical linear model + spline\\
\begin{minipage}[t][3.6cm][t]{1.0\linewidth}
\includegraphics[height=3.6cm]{student_retention_sbinom_ppc_dens_overlay.pdf}
\end{minipage}
\end{frame}
\begin{frame}[fragile]{Student retention -- LOO intervals}
% Leave-one-out predictive intervals (each interval computed without
% the corresponding observation) for both models.
\vspace{-0.5\baselineskip}
LOO predictive intervals -- latent hierarchical linear\\
\hspace{-7mm}
\begin{minipage}[t][3.6cm][t]{1.0\linewidth}
\includegraphics[height=3.6cm]{student_retention_lbinom_ppc_loo_intervals.pdf}
\end{minipage}
\vspace{-0.5\baselineskip}
LOO predictive intervals -- latent hierarchical linear + spline\\
\hspace{-7mm}
\begin{minipage}[t][3.6cm][t]{1.0\linewidth}
\includegraphics[height=3.6cm]{student_retention_sbinom_ppc_loo_intervals.pdf}
\end{minipage}
\end{frame}
\begin{frame}[fragile]{Student retention -- LOO-PIT checking}
% LOO-PIT quantile-quantile calibration checks for both models;
% subtitle documents the pp_check call used.
\framesubtitle{\texttt{pp\_check(fit, type = "loo\_pit\_qq", ndraws=4000)}}
\vspace{-0.5\baselineskip}
LOO-PIT check -- latent hierarchical linear\\
\hspace{-7mm}
\begin{minipage}[t][3.6cm][t]{1.0\linewidth}
\includegraphics[height=3.6cm]{student_retention_lbinom_ppc_loo_pit_qq.pdf}
\end{minipage}
\vspace{-0.5\baselineskip}
LOO-PIT check -- latent hierarchical linear + spline\\
\hspace{-7mm}
\begin{minipage}[t][3.6cm][t]{1.0\linewidth}
\includegraphics[height=3.6cm]{student_retention_sbinom_ppc_loo_pit_qq.pdf}
\end{minipage}
\end{frame}
\begin{frame}[fragile]{Student retention -- $R^2$}
% LOO-R^2 comparison: fit4 = latent hierarchical linear,
% fit6 = latent hierarchical linear + spline.
Latent hierarchical linear vs. latent hierarchical linear + spline
\begin{minted}[fontsize=\footnotesize]{text}
> loo_R2(fit4) |> round(digits=2)
Estimate Est.Error Q2.5 Q97.5
R2 0.92 0.02 0.88 0.95
> loo_R2(fit6) |> round(digits=2)
Estimate Est.Error Q2.5 Q97.5
R2 0.97 0.01 0.95 0.98
\end{minted}
% Caveat shown on the slide: R^2 assesses only the predictive mean,
% not the whole predictive distribution.
$R^2$ measures the goodness of the mean of the predictive
distribution
\vspace{4\baselineskip}
{\color{gray}\footnotesize \href{https://doi.org/10.1080/00031305.2018.1549100}{Gelman, Goodrich, Gabry, and Vehtari (2019). R-squared for Bayesian regression models. \textit{The American Statistician}, 73(3):307-309.}}
\end{frame}
\begin{frame}[fragile]{Student retention -- log score -- elpd }
\vspace{-\baselineskip}
% Defines elpd_loo and reveals (via overlays) the pointwise log
% predictive densities for the spline model; their sum (-141.7)
% matches the loo(fit6) output shown later.
\begin{itemize}
\item information theoretical goodness of the whole distribution
\item elpd = expected log predictive density (probability)
\item elpd\_loo = estimated with LOO predictive densities / probs\\
% Fix: summation index must match the data index used in the summand
% (was \sum_{n=1}^N over terms indexed by i).
$\sum_{i=1}^N \log p(y_i | x_i, x_{-i}, y_{-i})$
\end{itemize}
% \vspace{-0.5\baselineskip}
% LOO predictive intervals -- latent hierarchical linear\\
% \hspace{-7mm}
% \begin{minipage}[t][3.6cm][t]{1.0\linewidth}
% \includegraphics[height=3.6cm]{student_retention_sbinom_ppc_loo_intervals.pdf}
% \end{minipage}
%\vspace{-0.5\baselineskip}
\only<2->{
LOO predictive intervals -- latent hierarchical linear + spline\\
\begin{minipage}[t][3.6cm][t]{1.0\linewidth}
\hspace{-9mm}
\includegraphics[height=3.6cm]{student_retention_sbinom_ppc_loo_intervals.pdf}
\end{minipage}
}
\vspace{-1.75\baselineskip}
\only<3->{\fontsize{6.95}{9}\selectfont {~~~~~-8.4 -5.6 -2.9 -2.9 -2.8 -3.0 -4.0 -3.2 -3.9 -3.2 -3.4 -3.2 -2.9 -3.9 -3.4 -3.4 -3.2 -2.7 -2.8 -3.1\\
~~~~~~~~~-2.5 -2.8 -2.9 -3.4 -5.4 -3.7 -3.1 -3.3 -3.5 -3.2 -3.5 -3.5 -6.6 -3.8 -3.7 -3.4 -2.5 -2.8 -2.9 -3.3\\
}}
% Fix: keep the negative value inside math mode so it typesets with a
% true minus sign (was "$\sum = $ -141.7", giving a text hyphen).
\uncover<4->{\footnotesize $\sum = -141.7$}
\end{frame}
\begin{frame}[fragile]{Student retention -- elpd\_loo}
% loo() summaries for both models: the spline model (fit6) has higher
% elpd_loo and smaller effective number of parameters (p_loo).
% NOTE(review): highlightlines=6 exceeds the 5 lines of minted content
% shown here -- possibly stale after output was shortened; the
% elpd_loo row is line 4. Confirm the intended highlighted line.
Latent hierarchical linear + spline
\begin{minted}[fontsize=\footnotesize,highlightlines=6]{text}
> loo(fit6)
Computed from 4000 by 40 log-likelihood matrix
Estimate SE
elpd_loo -141.7 7.2
p_loo 10.9 2.5
\end{minted}
\pause
Latent hierarchical linear
\begin{minted}[fontsize=\footnotesize,highlightlines=6]{text}
> loo(fit4)
Computed from 4000 by 40 log-likelihood matrix
Estimate SE
elpd_loo -184.3 17.3
p_loo 24.3 5.8
\end{minted}
\end{frame}
\begin{frame}[fragile]{Student retention -- log score -- elpd }
\vspace{-\baselineskip}
% Side-by-side pointwise log predictive densities for both models;
% the sums reproduce the elpd_loo values reported by loo().
{
{\small LOO predictive intervals -- latent hierarchical linear}\\
\begin{minipage}[t][2.8cm][t]{1.2\linewidth}
\hspace{-9mm}
\includegraphics[height=2.8cm,trim=0 40 0 0,clip]{student_retention_lbinom_ppc_loo_intervals.pdf}
\end{minipage}
}
\vspace{-1.75\baselineskip}
{\fontsize{6.7}{8}\selectfont {~~~~-15.7 -7.6 -3.9 -2.9 -6.7 -4.2 -2.9 -3.1 -12.9 -4.7 -3.3 -3.4 -9.0 -3.0 -3.3 -3.2 -8.2 -2.8 -3.2 -3.0\\
~~~~~~~~~-2.9 -3.3 -3.0 -4.6 -4.3 -3.3 -3.0 -4.0 -3.0 -5.6 -3.6 -5.4 -4.9 -3.6 -3.9 -5.2 -2.7 -3.7 -3.0 -4.1
}}
% Fix: negative value moved inside math mode so it typesets with a
% true minus sign (was "$\sum = $ -184.3", giving a text hyphen).
{\scriptsize $\sum = -184.3$}
{
{\small LOO predictive intervals -- latent hierarchical linear + spline}\\
\begin{minipage}[t][2.8cm][t]{1.2\linewidth}
\hspace{-9mm}
\includegraphics[height=2.8cm,trim=0 40 0 0,clip]{student_retention_sbinom_ppc_loo_intervals.pdf}
\end{minipage}
}
\vspace{-1.75\baselineskip}
{\fontsize{6.7}{8}\selectfont {~~~~~-8.4 -5.6 -2.9 -2.9 -2.8 -3.0 -4.0 -3.2 -3.9 -3.2 -3.4 -3.2 -2.9 -3.9 -3.4 -3.4 -3.2 -2.7 -2.8 -3.1\\
~~~~~~~~~-2.5 -2.8 -2.9 -3.4 -5.4 -3.7 -3.1 -3.3 -3.5 -3.2 -3.5 -3.5 -6.6 -3.8 -3.7 -3.4 -2.5 -2.8 -2.9 -3.3
}}
% Same fix for the spline model's sum.
{\scriptsize $\sum = -141.7$}
\end{frame}
\begin{frame}[fragile]{Student retention -- elpd\_loo}
% Builds up the pointwise elpd_loo comparison of fit4 vs fit6 with
% beamer overlays: <+> shows one figure per step, <+-> keeps the last
% histogram visible while the summary statistics appear one by one.
\vspace{-0.7\baselineskip}
\hspace{-5mm}Latent hierarchical linear (fit4) vs latent hierarchical linear + spline (fit6)
\only<+>{\hspace{-5mm}\includegraphics[height=7cm]{student_retention_loo_pointwise_scatter.pdf}}
\only<+>{\hspace{-5mm}\includegraphics[height=7cm]{student_retention_loo_pointwise_diff_scatter.pdf}}
\only<+>{\hspace{-5mm}\includegraphics[height=7cm]{student_retention_loo_pointwise_diff_histogram_1.pdf}}
\only<+>{\hspace{-5mm}\includegraphics[height=7cm]{student_retention_loo_pointwise_diff_histogram_2.pdf}}
\only<+->{\hspace{-5mm}\includegraphics[height=7cm]{student_retention_loo_pointwise_diff_histogram_3.pdf}}
\only<+->{
\begin{minipage}[t][4cm][t]{3.2cm}
\vspace{-10.5\baselineskip}
% Summary statistics of the 40 pointwise differences: SE of the mean
% is sd/sqrt(40); SE of the sum is sd*sqrt(40) (= se_diff 14.3).
mean $\approx 1.07$\\
\only<+->{sd $\approx 2.26$\\}
\only<+->{SE = sd/$\sqrt{40}\approx 0.36$\\}
\only<+->{\\sum $\approx 42.6$\\}
\only<+->{SE = sd$*\sqrt{40}\approx 14.3$\\}
\end{minipage}
}
\end{frame}
\begin{frame}[fragile]{Student retention -- elpd\_loo}
% Recap of the individual loo() results (grayed out), followed by
% loo_compare(), which reports elpd_diff and se_diff directly,
% relative to the best model (fit6 on the top row with 0.0 0.0).
{\color{gray}
Latent hierarchical linear + spline
\begin{minted}[fontsize=\footnotesize,highlightlines=3]{text}
> loo(fit6)
Estimate SE
elpd_loo -141.7 7.2
p_loo 10.9 2.5
\end{minted}
Latent hierarchical linear
\begin{minted}[fontsize=\footnotesize,highlightlines=3]{text}
> loo(fit4)
Estimate SE
elpd_loo -184.3 17.3
p_loo 23.8 5.7
\end{minted}
}
% NOTE(review): p_loo for fit4 reads 23.8/5.7 here but 24.3/5.8 on the
% earlier loo(fit4) slide -- presumably from different runs; verify
% and make the two slides consistent.
\begin{minted}[fontsize=\footnotesize,highlightlines={2-4}]{text}
> loo_compare(loo(fit4), loo(fit6))
elpd_diff se_diff
fit6 0.0 0.0
fit4 -42.6 14.3
\end{minted}
\end{frame}
\begin{frame}{LOO difference uncertainty estimate (SE) reliability}
\vspace{-0.2\baselineskip}
% Three situations in which the normal approximation for the LOO
% difference SE can fail, following Sivula et al. (2022); details
% revealed stepwise with overlays.
\begin{list1}
\item[1.] The models make very similar predictions
\begin{list2}
% Fix: the <4 / >4 thresholds concern the elpd *difference* between
% models (elpd_diff), not the elpd_loo of a single model, per
% Sivula et al. (2022) and the CV-FAQ cited earlier on this slide set.
\item<2-> if $|\mbox{elpd\_diff}|<4$, SE is not reliable, but the
difference is small anyway
\item<2-> selecting a ``wrong'' model has small cost
\item<2-> in nested case, the skewness favors the simpler model
\end{list2}
\item[2.] The models are misspecified with outliers in the data
\begin{list2}
\item<3-> in nested case, the bias favors the simpler model
\item<3-> model checking and model extension to avoid misspecified
models (Bayesian workflow)
\end{list2}
\item[3.] The number of observations is small
\begin{list2}
\item<4-> in nested case the skewness favors the simpler model
\item<4-> any inference with small $n$ is difficult
\item<4-> if $|\mbox{elpd\_diff}|>4$, model is well specified,
and $n>100$ then the normal approximation is good
\end{list2}
\end{list1}
{\color{gray}\footnotesize Sivula, Magnusson, Matamoros, and Vehtari (2022). Uncertainty in Bayesian leave-one-out cross-validation based model comparison. \textit{\href{https://arxiv.org/abs/2008.10296v3}{arXiv:2008.10296v3}}.}
\end{frame}
\begin{frame}{Log score and elpd\_loo}
% Motivation: why the log score is used for comparison despite being
% hard to interpret on its own.
\begin{itemize}
\item Log score is not easily interpretable
\item but is information theoretically good utility for the goodness
of the whole distribution
\item and thus is useful in model comparison
\end{itemize}
\end{frame}
\begin{frame}{Log score and elpd\_loo}
% Interpreting elpd values: in the discrete case exp(elpd) is a
% predictive probability, which can be benchmarked against guessing
% uniformly over the observed data range [121,310].
\begin{itemize}
\item Interpretation in discrete case
\begin{itemize}
\item log probability
\end{itemize}
\item<2-> For example
\begin{itemize}
\item $\frac{1}{N}\sum_{n=1}^N\exp(\mathrm{elpd}_{\mathrm{loo},n}) \approx 4\%$ probability that we predict the
observed value
\item<3-> compare to guessing uniformly from the data range [121,310] having
$1/(310-121+1) \approx 0.5\%$ probability \only<4->{(log score -210)}
\end{itemize}
\item<5-> Interpretation in continuous case
\begin{itemize}
\item can be compared to a simple reference distribution
\end{itemize}
\end{itemize}
\end{frame}
% \begin{frame}[fragile]{Student retention -- loo computation}
Expand Down Expand Up @@ -1502,6 +1179,7 @@
\frametitle{Next week}
\begin{itemize}
\item Model comparison with LOO-CV
\item When is cross-validation applicable?
\begin{list2}
\item data generating mechanisms and prediction tasks
Expand Down

0 comments on commit bd61bde

Please sign in to comment.