diff --git a/BUILD.bazel b/BUILD.bazel index a7f054c4..422058cc 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -48,6 +48,7 @@ cc_library( "include/albatross/serialize/Ransac", "include/albatross/src/cereal/ThreadPool.hpp", "include/albatross/src/cereal/block_utils.hpp", + "include/albatross/src/cereal/block_diagonal.hpp", "include/albatross/src/cereal/dataset.hpp", "include/albatross/src/cereal/distribution.hpp", "include/albatross/src/cereal/eigen.hpp", @@ -127,8 +128,10 @@ cc_library( "include/albatross/src/models/gp.hpp", "include/albatross/src/models/least_squares.hpp", "include/albatross/src/models/null_model.hpp", + "include/albatross/src/models/pic_gp.hpp", "include/albatross/src/models/ransac.hpp", "include/albatross/src/models/ransac_gp.hpp", + "include/albatross/src/models/sparse_common.hpp", "include/albatross/src/models/sparse_gp.hpp", "include/albatross/src/samplers/callbacks.hpp", "include/albatross/src/samplers/ensemble.hpp", @@ -228,6 +231,7 @@ swift_cc_test( "tests/test_models.cc", "tests/test_models.h", "tests/test_parameter_handling_mixin.cc", + "tests/test_pic_gp.cc", "tests/test_prediction.cc", "tests/test_radial.cc", "tests/test_random_utils.cc", diff --git a/WORKSPACE.bazel b/WORKSPACE.bazel index ab62240f..54325525 100644 --- a/WORKSPACE.bazel +++ b/WORKSPACE.bazel @@ -31,7 +31,7 @@ http_archive( build_file = "@rules_swiftnav//third_party:mkl_headers.BUILD", sha256 = "b24d12a8e18ba23de5c659a33fb184a7ac6019d4b159e78f628d7c8de225f77a", urls = [ - "https://anaconda.org/intel/mkl-include/2023.1.0/download/linux-64/mkl-include-2023.1.0-intel_46342.tar.bz2", + "https://swiftnav-public-mkl-sharedlibs.s3.us-west-2.amazonaws.com/mkl-include-2023.1.0-intel_46342.tar.bz2", ], ) @@ -41,7 +41,7 @@ http_archive( sha256 = "c63adbfdbdc7c4992384a2d89cd62211e4a9f8061e3e841af1a269699531cb02", strip_prefix = "lib", urls = [ - "https://anaconda.org/intel/mkl-static/2023.1.0/download/linux-64/mkl-static-2023.1.0-intel_46342.tar.bz2", + "https://swiftnav-public-mkl-sharedlibs.s3.us-west-2.amazonaws.com/mkl-static-2023.1.0-intel_46342.tar.bz2", ], ) diff --git a/doc/src/crappy-pic.rst b/doc/src/crappy-pic.rst new file mode 100644 index 00000000..0005dc14 --- /dev/null +++ b/doc/src/crappy-pic.rst @@ -0,0 +1,164 @@ +################################################# +PIC math scratchpad +################################################# + +.. _crappy-pic: + +------------------------------------------------------------------- +Relationship between :math:`\sigma_\cancel{B}` and :math:`\sigma_f` +------------------------------------------------------------------- + +It seems like what we need first of all is some way to get the PITC posterior variance based only on :math:`\cancel{B}` using only the full PITC posterior and calculations that scale only with :math:`|B|`. I'm not totally finished working out how :math:`(\sigma^2_{*})^{\cancel{B} \cancel{B}}` fits into the whole posterior :math:`(\sigma^2_*)^{PIC}` + +Observe that since :math:`\mathbf{\Lambda}` is a block-diagonal matrix, its inverse is too, so when it's in the middle of a quadratic form, the cross terms disappear. As far as I can tell, this is the only place we can additively separate :math:`\mathbf{Q}_{\cancel{B} \cancel{B}}` and :math:`\mathbf{K}_{B B}` without explicit cross terms. Start with the usual application of Woodbury's lemma to the PITC posterior covariance: + +.. 
math:: + + \newcommand{Lam}{\mathbf{\Lambda}} + \newcommand{Kuu}{\mathbf{K}_{uu}} + \newcommand{Kuf}{\mathbf{K}_{uf}} + \newcommand{Kfu}{\mathbf{K}_{fu}} + (\sigma_*^2) &= K_* - \mathbf{Q}_{* f} \left(\mathbf{K}_{fu} \mathbf{K}_{uu}^{-1} \mathbf{K}_{uf} + \mathbf{\Lambda} \right)^{-1} \mathbf{Q}_{f *} \\ + &= K_* - \mathbf{K}_{*u} \Kuu^{-1} \Kuf \left(\Kfu \Kuu^{-1} \Kuf + \Lam\right)^{-1} \Kfu \Kuu^{-1} \mathbf{K}_{u*} \\ + &= K_* - \mathbf{K}_{*u} \Kuu^{-1} \mathbf{K}_{u*} + \mathbf{K}_{*u} \left( \Kuu + \Kuf \Lam^{-1} \Kfu \right)^{-1} \mathbf{K}_{u*} + +Now break up :math:`\Kuf` and :math:`\Lam`: + +.. math:: + + \Kuf &= \begin{bmatrix} \mathbf{K}_{u \cancel{B}} & \mathbf{K}_{u B} \end{bmatrix} \\ + \Lam &= \begin{bmatrix} \Lam_{\cancel{B}} & \mathbf{0} \\ \mathbf{0} & \Lam_{B} \end{bmatrix} \\ + +so that + +.. math:: + + \Kuf \Lam^{-1} \Kfu &= \begin{bmatrix} \mathbf{K}_{u \cancel{B}} & \mathbf{K}_{u B} \end{bmatrix} + \begin{bmatrix} \Lam_{\cancel{B}}^{-1} & \mathbf{0} \\ \mathbf{0} & \Lam_{B}^{-1} \end{bmatrix} + \begin{bmatrix} \mathbf{K}_{\cancel{B} u} \\ \mathbf{K}_{B u} \end{bmatrix} \\ + &= \mathbf{K}_{u \cancel{B}} \Lam_{\cancel{B}}^{-1} \mathbf{K}_{\cancel{B} u} + + \mathbf{K}_{u B} \Lam_{B}^{-1} \mathbf{K}_{B u} \\ + \mathbf{K}_{u \cancel{B}} \Lam_{\cancel{B}}^{-1} \mathbf{K}_{\cancel{B} u} &= \Kuf \Lam^{-1} \Kfu - \mathbf{K}_{u B} \Lam_{B}^{-1} \mathbf{K}_{B u} + +This last one is the inverse term in the training-data dependent part of the PITC posterior covariance using only the training points in :math:`\cancel{B}`, which I think is what we want to combine with the dense posterior from the training points in :math:`B`: + +.. math:: + (\sigma_*^2)^{\cancel{B} \cancel{B}} = K_* - \mathbf{K}_{*u} \Kuu^{-1} \mathbf{K}_{u*} + \mathbf{K}_{*u} \left( \Kuu + \mathbf{K}_{u \cancel{B}} \Lam_{\cancel{B}}^{-1} \mathbf{K}_{\cancel{B} u} \right)^{-1} \mathbf{K}_{u*} + +which we can write in terms of only the full PITC prior and the prior for group :math:`B`: + +.. math:: + (\sigma_*^2)^{\cancel{B} \cancel{B}} = K_* - \mathbf{K}_{*u} \Kuu^{-1} \mathbf{K}_{u*} + \mathbf{K}_{*u} \left( \Kuu + \Kuf \Lam^{-1} \Kfu - \mathbf{K}_{u B} \Lam_{B}^{-1} \mathbf{K}_{B u} \right)^{-1} \mathbf{K}_{u*} + +If you set :math:`B = \emptyset`, removing the correction for the training points in block :math:`B` entirely, you have the original PITC posterior covariance based on the full set of training points :math:`f`, as expected. Now we only have to compute :math:`\Kuf \Lam^{-1} \Kfu` (the term that scales with the full training set :math:`f`) once, and nothing that scales with :math:`|\cancel{B}|`. This still involves an :math:`O(M^3)` decomposition for each group :math:`B`. I have been assuming our group size :math:`|B|` is significantly smaller than the number of inducing points :math:`M`, so it's not great, but an interesting thing happens if we use the "downdate" form of Woodbury's lemma in the opposite direction to the usual way: + +.. math:: + \left(A - B C^{-1} B^T \right)^{-1} &= A^{-1} + A^{-1} B \left(C - B^T A^{-1} B\right)^{-1} B^T A^{-1} \\ + +Letting :math:`\mathbf{A} = \Kuu + \Kuf \Lam^{-1} \Kfu` (playing the part of :math:`A` in Woodbury's lemma): + +.. 
math::
+
+   (\sigma_*^2)^{\cancel{B} \cancel{B}} &= K_* - \mathbf{K}_{*u} \Kuu^{-1} \mathbf{K}_{u*} + \mathbf{K}_{*u} \left( \mathbf{A}^{-1} + \mathbf{A}^{-1} \mathbf{K}_{u B} \left(\Lam_{B} - \mathbf{K}_{B u} \mathbf{A}^{-1} \mathbf{K}_{u B}\right)^{-1} \mathbf{K}_{B u} \mathbf{A}^{-1} \right) \mathbf{K}_{u*} \\
+   &= K_* - \mathbf{K}_{*u} \Kuu^{-1} \mathbf{K}_{u*} + \mathbf{K}_{*u} \mathbf{A}^{-1} \mathbf{K}_{u*} + \mathbf{K}_{*u} \mathbf{A}^{-1} \mathbf{K}_{u B} \left(\Lam_{B} - \mathbf{K}_{B u} \mathbf{A}^{-1} \mathbf{K}_{u B}\right)^{-1} \mathbf{K}_{B u} \mathbf{A}^{-1} \mathbf{K}_{u*} \\
+
+I find the use of Woodbury's lemma to downdate a bit uncomfortable -- does this work with real symmetric matrices? But we would only need to decompose :math:`\mathbf{A}` (and also :math:`\Kuu`) at a cost of :math:`O(M^3)` once; the inversion to be calculated per group costs :math:`O(|B|^3)`. I also don't immediately spot a slick QR decomposition way to compute this per-group term; it's pretty symmetrical. The only thing that jumps out is precomputing :math:`\mathbf{K}_{B u} \mathbf{A}^{-\frac{1}{2}}`.
+
+----------------------------------------------
+Derivation of Woodbury's Lemma for Differences
+----------------------------------------------
+
+This is specifically for symmetric matrices, to convince myself the downdate formula is legit. We would like to compute a rank-:math:`k` downdate to the inverse of a matrix :math:`A`.
+
+.. math::
+   \left(A - B C^{-1} B^T \right)^{-1} &= A^{-1} + A^{-1} B \left(C - B^T A^{-1} B\right)^{-1} B^T A^{-1} \\
+
+To be extra clear, let's say :math:`A \in \mathbb{R}^{n \times n}`, :math:`B \in \mathbb{R}^{n \times k}`, :math:`C \in \mathbb{R}^{k \times k}`. We put the relevant matrices into a block matrix :math:`M \in \mathbb{R}^{m \times m}` for :math:`m = n + k`, and label the conforming blocks of its inverse:
+
+.. math::
+   M &= \begin{pmatrix} A & B \\ B^T & C\end{pmatrix} \\
+   M^{-1} &= \begin{pmatrix} W & X \\ X^T & Y\end{pmatrix} \\
+   M M^{-1} &= \begin{pmatrix} \mathbf{I} & \mathbf{0} \\ \mathbf{0} & \mathbf{I} \end{pmatrix} \\
+   M^{-1} M &= \begin{pmatrix} \mathbf{I} & \mathbf{0} \\ \mathbf{0} & \mathbf{I} \end{pmatrix}
+
+The blocks of :math:`M M^{-1}` yield the following equations:
+
+.. math::
+   AW + BX^T &= \mathbf{I} \\
+   AX + BY &= \mathbf{0} \\
+   B^T W + CX^T &= \mathbf{0} \\
+   B^T X + CY &= \mathbf{I} \\
+
+Rearrange the middle two:
+
+.. math::
+   X &= -A^{-1} B Y \\
+   X^T &= -C^{-1} B^T W
+
+If we do the same for :math:`M^{-1} M`:
+
+.. math::
+   X &= -W B C^{-1} \\
+   X^T &= -Y B^T A^{-1}
+
+These blocks are equal:
+
+.. math::
+   C^{-1} B^T W &= Y B^T A^{-1} \\
+   A^{-1} B Y &= W B C^{-1} \\
+
+Now use the middle two equations from :math:`M M^{-1}` and plug them into the first and last equations respectively:
+
+.. math::
+   W - A^{-1} B C^{-1} B^T W &= A^{-1} \\
+   \left(I - A^{-1} B C^{-1} B^T\right) W &= A^{-1} \\
+   -C^{-1} B^T A^{-1} B Y + Y &= C^{-1} \\
+   \left(I - C^{-1} B^T A^{-1} B\right)Y &= C^{-1} \\
+
+Assuming :math:`\left(I - A^{-1} B C^{-1} B^T\right)` and :math:`\left(I - C^{-1} B^T A^{-1} B\right)` are invertible, rearrange:
+
+.. math::
+   W &= \left(I - A^{-1} B C^{-1} B^T\right)^{-1} A^{-1} \\
+   &= \left(A - B C^{-1} B^T\right)^{-1} \\
+   Y &= \left(I - C^{-1} B^T A^{-1} B\right)^{-1} C^{-1} \\
+   &= \left(C - B^T A^{-1} B\right)^{-1}
+
+Now use equality of the off-diagonal blocks from the two ways to multiply :math:`M` and :math:`M^{-1}` above (don't you wish Sphinx would number equations?) and substitute:
+
+.. 
math:: + C^{-1} B^T \left(A - B C^{-1} B^T\right)^{-1} &= \left(C - B^T A^{-1} B\right)^{-1} B^T A^{-1} \\ + \left(A - B C^{-1} B^T\right)^{-1} B C^{-1} &= A^{-1} B \left(C - B^T A^{-1} B\right)^{-1} \\ + +Let's look just at the term :math:`\left(A - B C^{-1} B^T\right)` and right-multiply by :math:`-A^{-1}`: + +.. math:: + \left(A - B C^{-1} B^T\right) \left(-A^{-1}\right) &= -\mathbf{I} + B C^{-1} B^T A^{-1} \\ + \mathbf{I} + \left(A - B C^{-1} B^T\right) \left(-A^{-1}\right) &= B C^{-1} B^T A^{-1} + +Now let's return to the previous equation and right-multiply by :math:`B^T A^{-1}`: + +.. math:: + \left(A - B C^{-1} B^T\right)^{-1} B C^{-1} B^T A^{-1} &= A^{-1} B \left(C - B^T A^{-1} B\right)^{-1} B^T A^{-1} \\ + +Substitute the previous result for :math:`B C^{-1} B^T A^{-1}`: + +.. math:: + \left(A - B C^{-1} B^T\right)^{-1} \left( \mathbf{I} + \left(A - B C^{-1} B^T\right) \left(-A^{-1}\right) \right) &= A^{-1} B \left(C - B^T A^{-1} B\right)^{-1} B^T A^{-1} \\ + \left(A - B C^{-1} B^T\right)^{-1} - A^{-1} &= A^{-1} B \left(C - B^T A^{-1} B\right)^{-1} B^T A^{-1} \\ + \left(A - B C^{-1} B^T \right)^{-1} &= A^{-1} + A^{-1} B \left(C - B^T A^{-1} B\right)^{-1} B^T A^{-1} \blacksquare + +which is the difference form of Woodbury's lemma, or a rank-:math:`k` downdate of :math:`A^{-1}`. The main assumption seems to be that :math:`\left(I - A^{-1} B C^{-1} B^T\right)` and :math:`\left(I - C^{-1} B^T A^{-1} B\right)` are invertible. + +------------------------ +Blockwise inversion of S +------------------------ + +I tried to invert :math:`\mathbf{S}` blockwise; it didn't work out but I'm leaving it in here just to look back at: + +.. math:: + \newcommand{VV}{\mathbf{V}} + \mathbf{U} &= \mathbf{Q}_{* \cancel{B}} \mathbf{S}^{-1}_{\cancel{B} B} \VV_{B *} \\ + &= \mathbf{Q}_{* \cancel{B}} \left( \mathbf{Q}_{\cancel{B} \cancel{B}}^{-1} \mathbf{Q}_{\cancel{B} B} \left(\mathbf{Q}_{B B} - \mathbf{Q}_{B \cancel{B}} \mathbf{Q}_{\cancel{B} \cancel{B}}^{-1} \mathbf{Q}_{\cancel{B} B}\right)^{-1} \right) \VV_{B *} + + +How are we going to avoid the :math:`O(|\cancel{B}|^3)` inversion for :math:`\mathbf{Q}_{\cancel{B} \cancel{B}}`? Also we are going to get cross-terms with both :math:`\mathbf{Q}_{* \cancel{B}}` and :math:`\mathbf{Q}_{B B}` involved, but maybe they are OK because they are only :math:`O(|\cancel{B}| |B|)`? \ No newline at end of file diff --git a/doc/src/sparse-gp-details.rst b/doc/src/sparse-gp-details.rst index ce5bab0a..92f94773 100644 --- a/doc/src/sparse-gp-details.rst +++ b/doc/src/sparse-gp-details.rst @@ -338,3 +338,144 @@ However, we found that while the posterior mean predictions were numerically sta You should be able to find this implementation and details using git history. +-------------------- +PIC +-------------------- + +The PIC approximation is introduced in: + + [5] Local and global sparse Gaussian process approximations + Edward Snelson and Zoubin Ghahramani + https://proceedings.mlr.press/v2/snelson07a/snelson07a.pdf + +without any details of the covariance calculation. + +Mean +^^^^ + +.. 
math::
+   \newcommand{Lam}{\mathbf{\Lambda}}
+   \newcommand{Kuu}{\mathbf{K}_{uu}}
+   \newcommand{Kuf}{\mathbf{K}_{uf}}
+   \newcommand{Kfu}{\mathbf{K}_{fu}}
+   \newcommand{VV}{\mathbf{V}}
+   \mu^{PIC} &= \tilde{\mathbf{K}}^{PIC}_{*f} \left[ \tilde{\mathbf{K}}^{PITC}_{ff} \right]^{-1} y \\
+   &= \begin{bmatrix} \mathbf{Q}_{* \cancel{B}} & \mathbf{Q}_{* B} + \mathbf{V}_{* B}\end{bmatrix} \begin{bmatrix} \mathbf{Q}_{\cancel{B} \cancel{B}} & \mathbf{Q}_{\cancel{B} B} \\ \mathbf{Q}_{B \cancel{B}} & \mathbf{Q}_{B B} \end{bmatrix}^{-1} \begin{bmatrix} y_{\cancel{B}} \\ y_{B} \end{bmatrix}
+
+Breaking down :math:`\mathbf{K}_{* B} = \mathbf{Q}_{* B} + \VV_{* B}`, we can write the PIC mean as a correction to the PITC mean:
+
+.. math::
+   \mu^{PIC} &= \mu^{PITC} + \underbrace{\begin{bmatrix} \mathbf{0} & \VV_{* B} \end{bmatrix} \begin{bmatrix} \mathbf{Q}_{\cancel{B} \cancel{B}} & \mathbf{Q}_{\cancel{B} B} \\ \mathbf{Q}_{B \cancel{B}} & \mathbf{Q}_{B B} \end{bmatrix}^{-1} \begin{bmatrix} y_{\cancel{B}} \\ y_{B} \end{bmatrix} }_{\Delta \mu^{PIC}}\\
+
+The inverse of the PITC term can be obtained via Woodbury's lemma and the QR decomposition:
+
+.. math::
+   \Delta \mu^{PIC} &=\begin{bmatrix} \mathbf{0} & \VV_{* B} \end{bmatrix} \begin{bmatrix} \mathbf{Q}_{\cancel{B} \cancel{B}} & \mathbf{Q}_{\cancel{B} B} \\ \mathbf{Q}_{B \cancel{B}} & \mathbf{Q}_{B B} \end{bmatrix}^{-1} \begin{bmatrix} y_{\cancel{B}} \\ y_{B} \end{bmatrix} \\
+   &= \begin{bmatrix} \mathbf{0} & \VV_{* B} \end{bmatrix} \left(\Kfu \Kuu^{-1} \Kuf + \Lam\right)^{-1} y \\
+   &= \begin{bmatrix} \mathbf{0} & \VV_{* B} \end{bmatrix} \left( \Lam^{-1} - \Lam^{-1} \Kfu \left(\Kuu + \Kuf \Lam^{-1} \Kfu\right)^{-1} \Kuf \Lam^{-1} \right) y
+
+Assume we have already computed the PITC approximation as described above. We already have access to the information vector :math:`v = P R^{-1} Q^T_1 \Lam^{-\frac{1}{2}} y` from the QR decomposition of :math:`B = \begin{bmatrix} \Lam^{-\frac{1}{2}} \Kfu \\ L^T_{uu}\end{bmatrix}`. We can mimic the solution of the PITC approximation, but since we are premultiplying by the block-diagonal matrix :math:`\Lam^{-1}`, we can split this into per-block components:
+
+.. math::
+   \Delta \mu^{PIC} &= \begin{bmatrix} \mathbf{0} & \VV_{* B} \end{bmatrix} \Lam^{-1} \left(y - \Kfu v\right) \\
+   &= \VV_{* B} \Lam^{-1}_{B} \left(y_{B} - \mathbf{K}_{B u} v \right)
+
+Covariance
+^^^^^^^^^^
+
+The way I started partitioning the covariance term into blocks is as follows:
+
+.. math::
+
+   (\sigma_*^2)^{PIC} &= K_* - \tilde{\mathbf{K}}^{PIC}_{*f} \left[ \tilde{\mathbf{K}}^{PITC}_{ff} \right]^{-1} \tilde{\mathbf{K}}^{PIC}_{f*} + \sigma^2 \\
+   &= K_* - \begin{bmatrix} \mathbf{Q}_{* \cancel{B}} & \mathbf{K}_{* B} \end{bmatrix} \left(\mathbf{Q}_{ff} + \mathtt{blkdiag}(\mathbf{K}_{ff} - \mathbf{Q}_{ff})\right)^{-1} \begin{bmatrix} \mathbf{Q}_{\cancel{B} *} \\ \mathbf{K}_{B *} \end{bmatrix} \\
+   &= K_* - \begin{bmatrix} \mathbf{Q}_{* \cancel{B}} & \mathbf{K}_{* B} \end{bmatrix} \left(\mathbf{K}_{fu} \mathbf{K}_{uu}^{-1} \mathbf{K}_{uf} + \Lam \right)^{-1} \begin{bmatrix} \mathbf{Q}_{\cancel{B} *} \\ \mathbf{K}_{B *} \end{bmatrix}
+
+The problem with doing this for the PIC covariance (vs. the PIC mean and PITC) is that we can't left-multiply the whole thing by :math:`\mathbf{K}_{uu}^{-1} \mathbf{K}_{uf}` (which in those cases lets us apply Woodbury's lemma to reduce the inverse to size :math:`M`, the number of inducing points) because :math:`\mathbf{K}_{*B}` is not a low-rank approximation using the inducing points. 
We can instead break up the inverse term into blocks: + +.. math:: + (\sigma_*^2)^{PIC} &= K_* - + \begin{bmatrix} \mathbf{Q}_{* \cancel{B}} & \mathbf{K}_{* B}\end{bmatrix} + \begin{bmatrix} \mathbf{Q}_{\cancel{B} \cancel{B}} & \mathbf{Q}_{\cancel{B} B} \\ \mathbf{Q}_{B \cancel{B}} & \mathbf{Q}_{B B} \end{bmatrix}^{-1} + \begin{bmatrix} \mathbf{Q}_{\cancel{B} *} \\ \mathbf{K}_{B *}\end{bmatrix} + +If we substitute :math:`\mathbf{K}_{* B} = \mathbf{Q}_{* B} + \VV_{* B}` as with the mean, it doesn't work out nicely: + +.. math:: + (\sigma_*^2)^{PIC} &= K_* - + \begin{bmatrix} \mathbf{Q}_{* \cancel{B}} & \mathbf{Q}_{* B} + \VV_{* B} \end{bmatrix} + \underbrace{\begin{bmatrix} \mathbf{Q}_{\cancel{B} \cancel{B}} & \mathbf{Q}_{\cancel{B} B} \\ \mathbf{Q}_{B \cancel{B}} & \mathbf{Q}_{B B} \end{bmatrix}^{-1}}_{\mathbf{S}^{-1}} + \begin{bmatrix} \mathbf{Q}_{\cancel{B} *} \\ \mathbf{Q}_{B *} + \VV_{B *}\end{bmatrix} \\ + &= K_* - \mathbf{Q}_{**}^{PITC} - \underbrace{\mathbf{Q}_{* f} \mathbf{S}^{-1}_{f B} \VV_{B *}}_{\mathbf{U}} - \mathbf{U}^T - \mathbf{V}_{* B} \mathbf{S}^{-1}_{B B} \mathbf{V}_{B *} + +Now we have 3 correction terms to apply to the posterior PITC covariance. The best thing I can think of is to apply Woodbury's lemma, but in the opposite direction to usual: + +.. math:: + \mathbf{S}^{-1} &= \left(\Kfu \Kuu^{-1} \Kuf + \Lam\right)^{-1} \\ + &= \Lam^{-1} - \Lam^{-1} \Kfu \left( \Kuu + \Kuf \Lam^{-1} \Kfu \right)^{-1} \Kuf \Lam^{-1} + +which involves decomposing the block-diagonal matrix :math:`\Lam` with blocks of size :math:`|B|` and a matrix the size :math:`M` of the inducing point set. In practice after we precompute terms, we have a sequence of triangular factors that we can subset as needed to pick out :math:`B` and :math:`\cancel{B}`. (Confusingly, one of these useful decompositions is the QR decomposition of the unrelated matrix :math:`B` in the PITC derivation above.) + +.. math:: + \mathbf{U} &= \mathbf{Q}_{* f} \mathbf{S}^{-1}_{f B} \VV_{B *} \\ + &= \mathbf{Q}_{* f} \left( \Lam^{-1} - \Lam^{-1} \Kfu \left( \Kuu + \Kuf \Lam^{-1} \Kfu \right)^{-1} \Kuf \Lam^{-1} \right)_{f B} \VV_{B *} + +This looks appealingly like we could just keep combining instances of the QR decomposition of :math:`B`, but that would leave out the :math:`\Kuu` part. + +The open question is how to efficiently distill this into various correction terms for each group that don't require operations that scale with :math:`\cancel{B}`, since the paper promises :math:`O((|B| + M)^2)` for predictive covariances after precomputation. In principle, using sparse vectors / matrices for the :math:`*` target components, combined with Eigen's expression templates, should bring the complexity of some of these repeated solves for mostly empty vectors (for :math:`B`) down to :math:`O(|B|)`, and likewise for inducing points. + +At this point, our cross-terms are preceded by :math:`Q_{* f}`, which expands to :math:`K_{*u} \Kuu^{-1} \Kuf`. We can precompute everything except :math:`K_{*u}`, including the :math:`O(M^3)` inducing point decomposition, leaving prediction-time computations to scale with :math:`M`. + +To compute :math:`\mathbf{V}_{* B} \mathbf{S}^{-1}_{B B} \mathbf{V}_{B *}`, we form a (mostly zero) column of :math:`\mathbf{V}` for each feature, break the two terms of :math:`\mathbf{S}^{-1}` into symmetric parts, multiply by :math:`\mathbf{V}` and subtract, here in excruciating notational detail: + +.. 
math:: + \VV_{* B} \mathbf{S}^{-1} \VV_{B *} &= \VV_{* B} \left( \Lam^{-1} - \Lam^{-1} \Kfu \left( \Kuu + \Kuf \Lam^{-1} \Kfu \right)^{-1} \Kuf \Lam^{-1} \right)_{B B} \VV_{B *} \\ + &= \VV_{* B} \left( \left(P_\Lam L_\Lam^{-T} D_\Lam^{-\frac{1}{2}}\right) \underbrace{\left(D_\Lam^{-\frac{1}{2}} L_\Lam^{-1} P_\Lam^T\right)}_{Z_\Lam} - \mathbf{G}^T (B^T B)^{-1} \mathbf{G} \right) \VV_{B *} \\ + &= \VV_{* B} \left( \mathbf{Z}_\Lam^T \mathbf{Z}_\Lam - \mathbf{G}^T (P_u R_u^{-1} R_u^{-T} P_u^T) \mathbf{G} \right) \VV_{B *} \\ + &= \VV_{* B} \left( \mathbf{Z}_\Lam^T \mathbf{Z}_\Lam - \mathbf{G}^T (P_u R_u^{-1}) \underbrace{(R_u^{-T} P_u^T) \mathbf{G}}_{\mathbf{Z}_u} \right) \VV_{B *} \\ + &= \VV_{* B} \left( \mathbf{Z}_\Lam^T \mathbf{Z}_\Lam - \mathbf{Z}_u^T \mathbf{Z}_u \right) \VV_{B *} \\ + &= \VV_{* B} \mathbf{Z}_\Lam^T \underbrace{\mathbf{Z}_\Lam \VV_{B *}}_{\mathbf{\xi}_\Lam} - \VV_{* B} \mathbf{Z}_u^T \underbrace{\mathbf{Z}_u \VV_{B *}}_{\mathbf{\xi}_u} \\ + &= \mathbf{\xi}_\Lam^T \mathbf{\xi}_\Lam - \mathbf{\xi}_u^T \mathbf{\xi}_u \\ + +Note that the left-hand (subscript :math:`\Lam`) term is the decomposition of a block-diagonal matrix, so it will only contain cross-terms between features corresponding to the same local block. This calculation can be done blockwise. The right-hand (subscript :math:`u`) term projects the features through the local block of the training dataset and then through the inducing points, so the cross-terms are not in general sparse, and this calculation involves a lot more careful indexing. + +The same breakdown of :math:`\mathbf{S}^{-1}` can be used to compute :math:`\mathbf{W}` during the fit step. + +The notation :math:`\mathbf{W}_B` indicates that the relevant columns of :math:`\mathbf{W}` are used on the right-hand side. This is mathematically equivalent to making :math:`\VV_{B *}` have dimension :math:`N \times p` and be zero outside block :math:`B`. Computationally the right factor must be a sparse object to preserve the desired asymptotics. + +Summing up +^^^^^^^^^^ + +For the covariance, we must precompute: + + - :math:`P^TLDL^TP = \Lam` + - :math:`\mathbf{G} = \Lam^{-1} \Kfu` + - :math:`L_{uu} L_{uu}^T = \Kuu` + - :math:`QRP^T = B = \begin{bmatrix}\Lam^{-\frac{1}{2}}\Kfu \\ L_{uu} \end{bmatrix}` such that :math:`\mathbf{S}^{-1} = \Lam^{-1} - \mathbf{G} \left(B^T B\right)^{-1} \mathbf{G}^T`, and blocks :math:`\mathbf{S}^{-1}_{a b}` can be got by choosing the right rows / columns with which to do permutations and back-substitutions. + - :math:`\mathbf{W} = \Kuu^{-1} \mathbf{K}_{u f} \mathbf{S}^{-1}` + +For the mean, we compute the information vector :math:`v` as in PITC. + +then for each group :math:`B`: + + - :math:`\mathbf{Y}_B = \Kuu^{-1} \mathbf{K}_{u B}` + - :math:`v_b = \Lam_{B B}^{-1} \left( y_B - \mathbf{K}_{B u} v \right)` + +Given that we have already computed :math:`\Kfu`, we can use :math:`\mathbf{K}_{u B}` and :math:`\mathbf{K}_{u \cancel{B}}` efficiently in Eigen using sparse matrices with a single entry per nonzero row or column to be used. + +Then at prediction time, we must compute: + + - :math:`\mathbf{K}_{* B}`, :math:`O(|B|)` + - :math:`\mathbf{K}_{* u}`, :math:`O(M)` + - :math:`\mathbf{Q}_{* B} = \mathbf{K}_{* u} \mathbf{Y}_B`, :math:`O(M^2)` with the existing decomposition + - :math:`\VV_{* B} = \mathbf{K}_{* B} - \mathbf{Q}_{* B}`, :math:`O(|B|^2)` + - :math:`\mathbf{Q}_{**}^{PITC}` as with PITC + - :math:`\mathbf{U} = \mathbf{K}_{* u} \mathbf{W}_B \VV_{B *}`, :math:`O(M + |B|)`? 
+ - :math:`\VV_{* B} \mathbf{S}^{-1}_{B B} \VV_{B *}`, :math:`O(|B|^2)` + +Further notes +^^^^^^^^^^^^^ + +If asked to predict something that is uncorrelated with all training data (conditional on inducing points), we simply fall back to the PITC approximation. + +On some problems, calculation of the predictive covariance using Woodbury's lemma to avoid an :math:`O(N^3)` decomposition seems to lead to numerical problems. \ No newline at end of file diff --git a/include/albatross/SparseGP b/include/albatross/SparseGP index 2a26aab2..ddcd2ffc 100644 --- a/include/albatross/SparseGP +++ b/include/albatross/SparseGP @@ -23,6 +23,8 @@ #include #include #include +#include #include +#include #endif diff --git a/include/albatross/serialize/GP b/include/albatross/serialize/GP index f54633c5..2ecee09c 100644 --- a/include/albatross/serialize/GP +++ b/include/albatross/serialize/GP @@ -18,6 +18,7 @@ #include "../src/cereal/block_utils.hpp" #include "../src/cereal/serializable_ldlt.hpp" +#include "../src/cereal/block_diagonal.hpp" #include "../src/cereal/gp.hpp" #include "../src/cereal/representations.hpp" #include "../src/cereal/serializable_spqr.hpp" diff --git a/include/albatross/src/cereal/block_diagonal.hpp b/include/albatross/src/cereal/block_diagonal.hpp new file mode 100644 index 00000000..b9253363 --- /dev/null +++ b/include/albatross/src/cereal/block_diagonal.hpp @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2019 Swift Navigation Inc. + * Contact: Swift Navigation + * + * This source is subject to the license found in the file 'LICENSE' which must + * be distributed together with this source. All other rights reserved. + * + * THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, + * EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE. 
+ */ + +#ifndef ALBATROSS_SRC_CEREAL_BLOCK_DIAGONAL_HPP_ +#define ALBATROSS_SRC_CEREAL_BLOCK_DIAGONAL_HPP_ + +namespace cereal { + +template +inline void serialize(Archive &archive, + const albatross::BlockDiagonalLDLT &ldlt, + const std::uint32_t version __attribute__((unused))) { + archive(cereal::make_nvp("blocks", ldlt.blocks)); +} + +} // namespace cereal + +#endif /* ALBATROSS_SRC_CEREAL_BLOCK_DIAGONAL_HPP_ */ diff --git a/include/albatross/src/cereal/gp.hpp b/include/albatross/src/cereal/gp.hpp index b3926397..cd799908 100644 --- a/include/albatross/src/cereal/gp.hpp +++ b/include/albatross/src/cereal/gp.hpp @@ -17,6 +17,8 @@ using albatross::Fit; using albatross::GaussianProcessBase; using albatross::GPFit; using albatross::LinearCombination; + +using albatross::PICGPFit; using albatross::SparseGPFit; #ifndef GP_SERIALIZATION_VERSION @@ -51,6 +53,28 @@ inline void serialize(Archive &archive, Fit> &fit, } } +template +inline void +serialize(Archive &archive, + Fit> &fit, + const std::uint32_t version __attribute__((unused))) { + archive(cereal::make_nvp("train_features", fit.train_features)); + archive(cereal::make_nvp("inducing_features", fit.inducing_features)); + archive(cereal::make_nvp("train_covariance", fit.train_covariance)); + archive(cereal::make_nvp("sigma_R", fit.sigma_R)); + archive(cereal::make_nvp("P", fit.P)); + archive(cereal::make_nvp("mean_w", fit.mean_w)); + archive(cereal::make_nvp("W", fit.W)); + archive(cereal::make_nvp("covariance_Y", fit.covariance_Y)); + archive(cereal::make_nvp("Z", fit.Z)); + archive(cereal::make_nvp("A_ldlt", fit.A_ldlt)); + archive(cereal::make_nvp("measurement_groups", fit.measurement_groups)); + archive(cereal::make_nvp("information", fit.information)); + archive(cereal::make_nvp("numerical_rank", fit.numerical_rank)); + archive(cereal::make_nvp("cols_Bs", fit.cols_Bs)); +} + template inline void save(Archive &archive, diff --git a/include/albatross/src/cereal/suitesparse.hpp b/include/albatross/src/cereal/suitesparse.hpp index 32a89867..ca44c9c6 100644 --- a/include/albatross/src/cereal/suitesparse.hpp +++ b/include/albatross/src/cereal/suitesparse.hpp @@ -439,8 +439,6 @@ inline void load(Archive &ar, cholmod_sparse &m, } detail::suitesparse_cereal_realloc(&m.x, m.nzmax, element_size_bytes); - std::cout << "m.p size: " << p_size_bytes << " bytes" << std::endl; - std::cout << "m.ncol + 1: " << m.ncol + 1 << std::endl; detail::decode_array(ar, "m.p", m.p, (m.ncol + 1) * p_size_bytes); detail::decode_array(ar, "m.i", m.i, m.nzmax * integer_size_bytes); if (!m.packed) { diff --git a/include/albatross/src/linalg/block_diagonal.hpp b/include/albatross/src/linalg/block_diagonal.hpp index 29321879..c57fd1c5 100644 --- a/include/albatross/src/linalg/block_diagonal.hpp +++ b/include/albatross/src/linalg/block_diagonal.hpp @@ -68,6 +68,8 @@ struct BlockDiagonalLDLT { Eigen::Index rows() const; Eigen::Index cols() const; + + bool operator==(const BlockDiagonalLDLT &other) const; }; struct BlockDiagonal { @@ -248,6 +250,11 @@ inline const BlockDiagonalLDLT &BlockDiagonalLDLT::adjoint() const { return *this; } +inline bool +BlockDiagonalLDLT::operator==(const BlockDiagonalLDLT &other) const { + return blocks == other.blocks; +} + /* * Block Diagonal */ diff --git a/include/albatross/src/models/pic_gp.hpp b/include/albatross/src/models/pic_gp.hpp new file mode 100644 index 00000000..8863cbec --- /dev/null +++ b/include/albatross/src/models/pic_gp.hpp @@ -0,0 +1,788 @@ +/* + * Copyright (C) 2024 Swift Navigation Inc. 
+ * Contact: Swift Navigation + * + * This source is subject to the license found in the file 'LICENSE' which must + * be distributed together with this source. All other rights reserved. + * + * THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, + * EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE. + */ + +#ifndef INCLUDE_ALBATROSS_MODELS_PIC_GP_H_ +#define INCLUDE_ALBATROSS_MODELS_PIC_GP_H_ + +namespace albatross { + +/* + * This class directly implements the PIC covariance structure: + * conditional on the inducing points, each group of observations is + * independent. You can use it as a covariance function for a dense + * GP model, and it should give you the same results as the PIC + * approximation, but since it's implemented directly as a covariance + * function, it's much less efficient than modelling using the + * PICGaussianProcessRegression class defined below. It is provided + * for debugging purposes, since it can be tricky to follow what the + * efficient implementation is doing. + * + * See the PIC GP model test suite for an example of how to use this + * class. + */ +template +class BruteForcePIC + : public CovarianceFunction< + BruteForcePIC> { +public: + CovarianceType cov_; + std::vector inducing_points_; + GrouperFunction grouper_; + Eigen::LDLT K_uu_ldlt_; + + BruteForcePIC(const std::vector &inducing_points, + CovarianceType &&cov, GrouperFunction &&grouper) + : cov_{cov}, inducing_points_{inducing_points}, grouper_{grouper}, + K_uu_ldlt_{cov_(inducing_points_, inducing_points_).ldlt()} {} + + template + double _call_impl(const X &x, const Y &y) const { + if (grouper_(x) == grouper_(y)) { + return cov_(x, y); + } + + Eigen::VectorXd K_xu(inducing_points_.size()); + Eigen::VectorXd K_uy(inducing_points_.size()); + for (Eigen::Index i = 0; + i < static_cast(inducing_points_.size()); ++i) { + K_xu[i] = cov_(x, inducing_points_[i]); + K_uy[i] = cov_(inducing_points_[i], y); + } + return K_xu.dot(K_uu_ldlt_.solve(K_uy)); + } +}; + +template +auto make_brute_force_pic_covariance( + const std::vector &inducing_points, + CovarianceType &&cov, GrouperFunction &&grouper) { + return BruteForcePIC( + inducing_points, std::forward(cov), + std::forward(grouper)); +} + +template +class PICGaussianProcessRegression; + +template +struct PICGPFit {}; + +template +using PICGroupIndexType = + typename std::result_of::type; + +template struct PICGroup { + // Row indices are into the original set of features into the data + // used to form S. 
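+  //
+  // initial_row is the first row of this group's block within the
+  // reordered training set, block_size is the number of rows in that
+  // block, and block_index is this group's position among the
+  // per-block precomputed quantities (A_ldlt blocks, mean_w,
+  // covariance_Y).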
+ Eigen::Index initial_row; + Eigen::Index block_size; + Eigen::Index block_index; + albatross::RegressionDataset dataset; +}; + +template +using PICGroupMap = std::map, + PICGroup>; + +template +struct Fit> { + using PermutationIndices = Eigen::Matrix; + using GroupIndexType = PICGroupIndexType; + using GroupMap = PICGroupMap; + + std::vector train_features; + std::vector inducing_features; + Eigen::SerializableLDLT train_covariance; + Eigen::MatrixXd sigma_R; + Eigen::PermutationMatrixX P; + std::vector mean_w; + Eigen::MatrixXd W; + std::vector covariance_Y; + Eigen::MatrixXd Z; + BlockDiagonalLDLT A_ldlt; + GroupMap measurement_groups; + Eigen::VectorXd information; + Eigen::Index numerical_rank; + std::vector> cols_Bs; + GrouperFunction grouper; + + Fit(){}; + + Fit(const std::vector &features_, + const std::vector &inducing_features_, + const Eigen::SerializableLDLT &train_covariance_, + const Eigen::MatrixXd &sigma_R_, const Eigen::PermutationMatrixX &P_, + const std::vector &mean_w_, const Eigen::MatrixXd &W_, + const std::vector &covariance_Y_, + const Eigen::MatrixXd &Z_, const BlockDiagonalLDLT &A_ldlt_, + const GroupMap &measurement_groups_, const Eigen::VectorXd &information_, + Eigen::Index numerical_rank_, + const std::vector> cols_Bs_, + GrouperFunction grouper_) + : train_features(features_), inducing_features(inducing_features_), + train_covariance(train_covariance_), sigma_R(sigma_R_), P(P_), + mean_w(mean_w_), W(W_), covariance_Y(covariance_Y_), Z(Z_), + A_ldlt(A_ldlt_), measurement_groups(measurement_groups_), + information(information_), numerical_rank(numerical_rank_), + cols_Bs(cols_Bs_), grouper(grouper_) {} + + void shift_mean(const Eigen::VectorXd &mean_shift) { + ALBATROSS_ASSERT(mean_shift.size() == information.size()); + information += train_covariance.solve(mean_shift); + } + + bool operator==( + const Fit> + &other) const { + return (train_features == other.train_features && + inducing_features == other.inducing_features && + train_covariance == other.train_covariance && + sigma_R == other.sigma_R && P.indices() == other.P.indices() && + mean_w == other.mean_w && W == W && + covariance_Y == other.covariance_Y && Z == other.Z && + A_ldlt == other.A_ldlt && + measurement_groups == other.measurement_groups && + information == other.information && + numerical_rank == other.numerical_rank && cols_Bs == other.cols_Bs); + } +}; + +/* + * This class implements an approximation technique for Gaussian + * processes which relies on the following: + * + * 1. Observations occur in correlated groups + * + * 2. Conditional on the values of observations within the same group + * and on the values of the inducing points, an observation is + * independent of observations in other groups. + * + * This is known as the PIC (Partially Independent Conditional) + * approximation and was introduced in + * + * [1] Local and global sparse Gaussian process approximations + * Edward Snelson, Zoubin Ghahramani + * https://proceedings.mlr.press/v2/snelson07a/snelson07a.pdf + * + * The upshot of this method is that for enough small enough groups, + * the fit performance scales as O(N M^2) for N observations and M + * inducing points (just like FITC/PITC), but fine-grained local + * information from correlation groups can provide better predictive + * performance. + * + * Inducing point and group definitions are handled as in the + * FITC/PITC approximation class (see sparse_gp.hpp). 
If a prediction + * point is not correlated with any of the training observations (as + * defined by the grouper function), the resulting predicted + * distribution will be computed using only the fitted inducing + * points. + * + * Several operations supported by other Albatross GP model classes, + * like "rebasing" (computing new inducing points from old ones) and + * reduced-complexity updating of a fit with new information, are not + * yet supported by the PIC approximation class. + * + */ +template +class PICGaussianProcessRegression + : public GaussianProcessBase> { +public: + using Base = GaussianProcessBase< + CovFunc, MeanFunc, + PICGaussianProcessRegression>; + + template + using GroupIndexType = PICGroupIndexType; + + template + using GroupMap = PICGroupMap; + + PICGaussianProcessRegression() : Base() { initialize_params(); }; + + PICGaussianProcessRegression(const CovFunc &covariance_function, + const MeanFunc &mean_function) + : Base(covariance_function, mean_function) { + initialize_params(); + }; + PICGaussianProcessRegression(CovFunc &&covariance_function, + MeanFunc &&mean_function) + : Base(std::move(covariance_function), std::move(mean_function)) { + initialize_params(); + }; + + PICGaussianProcessRegression( + const CovFunc &covariance_function, const MeanFunc &mean_function, + const GrouperFunction &independent_group_function, + const InducingPointStrategy &inducing_point_strategy, + const std::string &model_name) + : Base(covariance_function, mean_function, model_name), + inducing_point_strategy_(inducing_point_strategy), + independent_group_function_(independent_group_function) { + initialize_params(); + }; + PICGaussianProcessRegression(CovFunc &&covariance_function, + MeanFunc &&mean_function, + GrouperFunction &&independent_group_function, + InducingPointStrategy &&inducing_point_strategy, + const std::string &model_name) + : Base(std::move(covariance_function), std::move(mean_function), + model_name), + inducing_point_strategy_(std::move(inducing_point_strategy)), + independent_group_function_(std::move(independent_group_function)) { + initialize_params(); + }; + + void initialize_params() { + measurement_nugget_ = { + details::DEFAULT_NUGGET, + LogScaleUniformPrior(PARAMETER_EPSILON, PARAMETER_MAX)}; + inducing_nugget_ = {details::DEFAULT_NUGGET, + LogScaleUniformPrior(PARAMETER_EPSILON, PARAMETER_MAX)}; + } + + ParameterStore get_params() const override { + auto params = map_join(this->mean_function_.get_params(), + this->covariance_function_.get_params()); + params[details::measurement_nugget_name()] = measurement_nugget_; + params[details::inducing_nugget_name()] = inducing_nugget_; + return params; + } + + void set_param(const std::string &name, const Parameter ¶m) override { + if (name == details::measurement_nugget_name()) { + measurement_nugget_ = param; + return; + } else if (name == details::inducing_nugget_name()) { + inducing_nugget_ = param; + return; + } + const bool success = set_param_if_exists_in_any( + name, param, &this->covariance_function_, &this->mean_function_); + ALBATROSS_ASSERT(success); + } + + template ::type, + GrouperFunction>::value>::type> + void update_grouper_function(NewGrouper &&f) { + independent_group_function_ = std::forward(f); + } + + template ::type, + InducingPointStrategy>::value>::type> + void update_inducing_point_strategy(NewStrategy &&f) { + inducing_point_strategy_ = std::forward(f); + } + + // Here we create the QR decomposition of: + // + // B = |A^-1/2 K_fu| = |Q_1| R P^T + // |K_uu^{T/2} | |Q_2| + // + // which 
corresponds to the inverse square root of Sigma + // + // Sigma = (B^T B)^-1 + std::unique_ptr + compute_sigma_qr(const Eigen::SerializableLDLT &K_uu_ldlt, + const BlockDiagonalLDLT &A_ldlt, + const Eigen::MatrixXd &K_fu) const { + Eigen::MatrixXd B(A_ldlt.rows() + K_uu_ldlt.rows(), K_uu_ldlt.rows()); + B.topRows(A_ldlt.rows()) = A_ldlt.sqrt_solve(K_fu); + B.bottomRows(K_uu_ldlt.rows()) = K_uu_ldlt.sqrt_transpose(); + return QRImplementation::compute(B, Base::threads_.get()); + }; + + template < + typename FeatureType, + std::enable_if_t< + has_call_operator::value, int> = 0> + auto _fit_impl(const std::vector &features, + const MarginalDistribution &targets) const { + // Determine the set of inducing points, u. + const auto u = + inducing_point_strategy_(this->covariance_function_, features); + ALBATROSS_ASSERT(u.size() > 0 && "Empty inducing points!"); + + BlockDiagonalLDLT A_ldlt; + Eigen::SerializableLDLT K_uu_ldlt; + Eigen::MatrixXd K_fu; + Eigen::VectorXd y; + GroupMap measurement_groups; + compute_internal_components(u, features, targets, &A_ldlt, &K_uu_ldlt, + &K_fu, &y, &measurement_groups); + auto B_qr = compute_sigma_qr(K_uu_ldlt, A_ldlt, K_fu); + + // To make a prediction, we will need to compute cross-terms with + // sub-blocks of S^-1, where + // + // S^-1 = A^-1 - A^-1 K_fu (K_uu + K_uf A^-1 K_fu)^-1 K_uf A^-1 + // \____________________________________________/ + // Z^T Z + // + // The inverse part of the right-hand term of this is B_qr above, + // and we already have A_ldlt, so we can form Z. + // + // Now to compute sub-blocks of S^-1, we just index into Z^T Z + // appropriately, and if the term is diagonal, also solve using + // the blocks of A^-1. + const Eigen::MatrixXd Z = sqrt_solve(*B_qr, A_ldlt.solve(K_fu).transpose()); + + Eigen::VectorXd y_augmented = Eigen::VectorXd::Zero(B_qr->rows()); + y_augmented.topRows(y.size()) = A_ldlt.sqrt_solve(y, Base::threads_.get()); + const Eigen::VectorXd v = B_qr->solve(y_augmented); + + using InducingPointFeatureType = typename std::decay::type; + + const Eigen::MatrixXd W = K_uu_ldlt.solve( + (A_ldlt.solve(K_fu) - Z.transpose() * Z * K_fu).transpose()); + + // TODO(@peddie): a lot of this can be batched + std::vector covariance_Y(measurement_groups.size()); + std::vector mean_w(measurement_groups.size()); + std::vector> cols_Bs(measurement_groups.size()); + const auto block_start_indices = A_ldlt.block_to_row_map(); + + const auto precompute_block = [&, this](std::size_t block_number, + Eigen::Index start_row) -> void { + const Eigen::Index block_size = A_ldlt.blocks[block_number].rows(); + // K_fu is already computed, so we can form K_uB and K_uA by + // appropriate use of sparse indexing matrices and avoid an O(N + // M) operation. + // + // This nonsense would be a lot more straightforward with Eigen + // 3.4's slicing and indexing API. 
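+      //
+      // cols_B acts as a selection matrix: it has a single 1 per column,
+      // at (start_row + i, i), so K_fu.transpose() * cols_B picks out
+      // the columns of K_uf belonging to this block (i.e. K_uB), and
+      // cols_B.transpose() * y picks out this block's rows of y.  For
+      // example, with 5 training rows and a block of size 2 starting at
+      // row 3, cols_B has ones at (3, 0) and (4, 1).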
+ Eigen::SparseMatrix cols_B(features.size(), block_size); + cols_B.reserve(block_size); + + for (Eigen::Index i = 0; i < block_size; ++i) { + cols_B.insert(start_row + i, i) = 1.; + } + cols_B.makeCompressed(); + + cols_Bs[block_number] = cols_B; + // v_b \in R^b = A_BB^-1 (y_b - K_Bu v) + Eigen::MatrixXd ydiff_b = cols_B.transpose() * (y - K_fu * v); + const Eigen::MatrixXd mean_w_full = + A_ldlt.blocks[block_number].solve(ydiff_b); + mean_w[block_number] = mean_w_full; + // Y \in R^(u x b) = K_uu^-1 K_uB + covariance_Y[block_number] = K_uu_ldlt.solve(K_fu.transpose() * cols_B); + }; + + apply_map(block_start_indices, precompute_block, Base::threads_.get()); + + using FitType = + Fit>; + return FitType(features, u, K_uu_ldlt, get_R(*B_qr), get_P(*B_qr), mean_w, + W, covariance_Y, Z, A_ldlt, measurement_groups, v, + B_qr->rank(), cols_Bs, independent_group_function_); + } + + // This is included to allow the SparseGP + // to be compatible with fits generated + // using a standard GP. + using Base::_predict_impl; + + template + Eigen::VectorXd _predict_impl( + const std::vector &features, + const Fit> + &sparse_gp_fit, + PredictTypeIdentity &&) const { + const auto find_group = [this, &sparse_gp_fit](const auto &feature) { + return sparse_gp_fit.measurement_groups.find( + independent_group_function_(without_measurement(feature))); + }; + const Eigen::MatrixXd K_up = + this->covariance_function_(sparse_gp_fit.inducing_features, features); + Eigen::VectorXd mean_correction = Eigen::VectorXd::Zero(features.size()); + for (Eigen::Index j = 0; j < features.size(); ++j) { + const auto group = find_group(features[j]); + if (group == sparse_gp_fit.measurement_groups.end()) { + continue; + } + const std::vector fvec = {features[j]}; + + const Eigen::VectorXd features_cov = + this->covariance_function_(group->second.dataset.features, fvec); + const Eigen::VectorXd kpuy = + K_up.transpose().row(j) * + sparse_gp_fit.covariance_Y[group->second.block_index]; + const Eigen::VectorXd Vbp = features_cov - kpuy; + mean_correction[j] = + Vbp.dot(sparse_gp_fit.mean_w[group->second.block_index]); + } + + Eigen::VectorXd mean = + K_up.transpose() * sparse_gp_fit.information + mean_correction; + + this->mean_function_.add_to(features, &mean); + return mean; + } + + template + MarginalDistribution _predict_impl( + const std::vector &features, + const Fit> + &sparse_gp_fit, + PredictTypeIdentity &&) const { + const auto K_up = + this->covariance_function_(sparse_gp_fit.inducing_features, features); + Eigen::VectorXd mean = gp_mean_prediction(K_up, sparse_gp_fit.information); + this->mean_function_.add_to(features, &mean); + + // First pass: compute an O(1) mapping from features to groups + std::vector feature_to_block(features.size()); + std::vector groups( + features.size()); + Eigen::VectorXi col_alloc{Eigen::VectorXi::Zero(features.size())}; + bool all_same_group = true; + for (Eigen::Index j = 0; j < features.size(); ++j) { + groups[j] = sparse_gp_fit.measurement_groups.find( + independent_group_function_(without_measurement(features[j]))); + if (groups[j] != sparse_gp_fit.measurement_groups.end()) { + col_alloc(j) = groups[j]->second.block_size; + all_same_group = all_same_group && groups[j] == groups[0]; + } + feature_to_block[j] = + std::distance(sparse_gp_fit.measurement_groups.begin(), groups[j]); + } + + // Second pass: compute mean vector and fill sparse Vp matrix + Eigen::VectorXd mean_correction = Eigen::VectorXd::Zero(features.size()); + Eigen::SparseMatrix Vp(sparse_gp_fit.train_features.size(), + 
features.size()); + Vp.reserve(col_alloc); + for (Eigen::Index j = 0; j < features.size(); ++j) { + if (groups[j] == sparse_gp_fit.measurement_groups.end()) { + continue; + } + + const auto &B = groups[j]->second; + Eigen::VectorXd Vbp = + this->covariance_function_(B.dataset.features, + std::vector{features[j]}) - + K_up.col(j).transpose() * sparse_gp_fit.covariance_Y[B.block_index]; + for (Eigen::Index i = 0; i < Vbp.size(); ++i) { + Vp.insert(B.initial_row + i, j) = Vbp(i); + } + mean_correction[j] = Vbp.dot(sparse_gp_fit.mean_w[B.block_index]); + } + Vp.makeCompressed(); + + Eigen::MatrixXd xi_lambda = sparse_gp_fit.A_ldlt.sqrt_solve(Vp); + Eigen::MatrixXd xi_u = sparse_gp_fit.Z * Vp; + Eigen::VectorXd VSV_diag{ + (xi_lambda.transpose() * xi_lambda - xi_u.transpose() * xi_u) + .diagonal()}; + + const Eigen::VectorXd U_diag = + (K_up.transpose() * sparse_gp_fit.W * Vp).diagonal(); + + Eigen::VectorXd marginal_variance(cast::to_index(features.size())); + for (Eigen::Index i = 0; i < marginal_variance.size(); ++i) { + marginal_variance[i] = this->covariance_function_( + features[cast::to_size(i)], features[cast::to_size(i)]); + } + + const Eigen::MatrixXd Q_sqrt = + sparse_gp_fit.train_covariance.sqrt_solve(K_up); + const Eigen::VectorXd Q_diag = + Q_sqrt.cwiseProduct(Q_sqrt).array().colwise().sum(); + marginal_variance -= Q_diag; + + const Eigen::MatrixXd S_sqrt = + sqrt_solve(sparse_gp_fit.sigma_R, sparse_gp_fit.P, K_up); + const Eigen::VectorXd S_diag = + S_sqrt.cwiseProduct(S_sqrt).array().colwise().sum(); + marginal_variance += S_diag; + + mean += mean_correction; + + return MarginalDistribution(mean, + marginal_variance - (2 * U_diag + VSV_diag)); + } + + template + JointDistribution _predict_impl( + const std::vector &features, + const Fit> + &sparse_gp_fit, + PredictTypeIdentity &&) const { + // K_up + const Eigen::MatrixXd K_up = + this->covariance_function_(sparse_gp_fit.inducing_features, features); + const Eigen::MatrixXd prior_cov = this->covariance_function_(features); + + // First pass: compute an O(1) mapping from features to groups + std::vector feature_to_block(features.size()); + std::vector groups( + features.size()); + Eigen::VectorXi col_alloc{Eigen::VectorXi::Zero(features.size())}; + bool all_same_group = true; + for (Eigen::Index j = 0; j < features.size(); ++j) { + groups[j] = sparse_gp_fit.measurement_groups.find( + independent_group_function_(without_measurement(features[j]))); + if (groups[j] != sparse_gp_fit.measurement_groups.end()) { + col_alloc(j) = groups[j]->second.block_size; + all_same_group = all_same_group && groups[j] == groups[0]; + } + feature_to_block[j] = + std::distance(sparse_gp_fit.measurement_groups.begin(), groups[j]); + } + + // Second pass: compute mean vector and fill sparse Vp matrix + Eigen::VectorXd mean_correction = Eigen::VectorXd::Zero(features.size()); + Eigen::SparseMatrix Vp(sparse_gp_fit.train_features.size(), + features.size()); + Vp.reserve(col_alloc); + for (Eigen::Index j = 0; j < features.size(); ++j) { + if (groups[j] == sparse_gp_fit.measurement_groups.end()) { + // Fall back to PITC + continue; + } + + const auto &B = groups[j]->second; + // TODO(@peddie): there must be some way to assign this + // expression directly to a map of the column memory + Eigen::VectorXd Vbp = + this->covariance_function_(B.dataset.features, + std::vector{features[j]}) - + K_up.col(j).transpose() * sparse_gp_fit.covariance_Y[B.block_index]; + for (Eigen::Index i = 0; i < Vbp.size(); ++i) { + Vp.insert(B.initial_row + i, j) = Vbp(i); + } + 
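+      // The PIC mean correction for this feature is V_pB * w_B, where
+      // w_B = A_BB^-1 (y_B - K_Bu v) was precomputed per block during
+      // the fit (stored in mean_w).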
mean_correction[j] = Vbp.dot(sparse_gp_fit.mean_w[B.block_index]); + } + Vp.makeCompressed(); + + Eigen::MatrixXd xi_lambda = sparse_gp_fit.A_ldlt.sqrt_solve(Vp); + Eigen::MatrixXd xi_u = sparse_gp_fit.Z * Vp; + Eigen::MatrixXd VSV{xi_lambda.transpose() * xi_lambda - + xi_u.transpose() * xi_u}; + + const Eigen::MatrixXd U = K_up.transpose() * sparse_gp_fit.W * Vp; + + const Eigen::MatrixXd S_sqrt = + sqrt_solve(sparse_gp_fit.sigma_R, sparse_gp_fit.P, K_up); + + const Eigen::MatrixXd Q_sqrt = + sparse_gp_fit.train_covariance.sqrt_solve(K_up); + + const Eigen::MatrixXd max_explained = Q_sqrt.transpose() * Q_sqrt; + const Eigen::MatrixXd unexplained = S_sqrt.transpose() * S_sqrt; + + const Eigen::MatrixXd pitc_covariance = + prior_cov - max_explained + unexplained; + + const Eigen::MatrixXd pic_correction = U + U.transpose() + VSV; + JointDistribution pred(K_up.transpose() * sparse_gp_fit.information + + mean_correction, + pitc_covariance - pic_correction); + this->mean_function_.add_to(features, &pred.mean); + + return pred; + } + + InducingPointStrategy get_inducing_point_strategy() const { + return inducing_point_strategy_; + } + +private: + // This method takes care of a lot of the common book keeping required to + // setup the Sparse Gaussian Process problem. Namely, we want to get from + // possibly unordered features to a structured representation + // in the form K_ff = A_ff + Q_ff where Q_ff = K_fu K_uu^-1 K_uf and + // A_ff is block diagonal and is formed by subtracting Q_ff from K_ff. + // + template + void compute_internal_components( + const std::vector &inducing_features, + const std::vector &out_of_order_features, + const MarginalDistribution &out_of_order_targets, + BlockDiagonalLDLT *A_ldlt, Eigen::SerializableLDLT *K_uu_ldlt, + Eigen::MatrixXd *K_fu, Eigen::VectorXd *y, + GroupMap *measurement_groups) const { + + ALBATROSS_ASSERT(A_ldlt != nullptr); + ALBATROSS_ASSERT(K_uu_ldlt != nullptr); + ALBATROSS_ASSERT(K_fu != nullptr); + ALBATROSS_ASSERT(y != nullptr); + + const auto indexer = + group_by(out_of_order_features, independent_group_function_).indexers(); + + const auto out_of_order_measurement_features = + as_measurements(out_of_order_features); + + std::vector reordered_inds; + BlockDiagonal K_ff; + Eigen::Index begin_row_index = 0; + Eigen::Index block_index = 0; + // TODO(@peddie): compute these blocks asynchronously? 
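+    // For each independent group: record where its rows land in the
+    // reordered training set, keep the group's dataset for per-block
+    // corrections at prediction time, and form the dense diagonal
+    // block of K_ff for that group (plus the target noise variance).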
+ for (const auto &pair : indexer) { + reordered_inds.insert(reordered_inds.end(), pair.second.begin(), + pair.second.end()); + measurement_groups->operator[](pair.first) = PICGroup{ + begin_row_index, static_cast(pair.second.size()), + block_index, + albatross::RegressionDataset( + subset(out_of_order_features, pair.second), + out_of_order_targets.subset(pair.second))}; + auto subset_features = + subset(out_of_order_measurement_features, pair.second); + K_ff.blocks.emplace_back(this->covariance_function_(subset_features)); + K_ff.blocks.back().diagonal() += + subset(out_of_order_targets.covariance.diagonal(), pair.second); + begin_row_index += pair.second.size(); + ++block_index; + } + + const auto features = + subset(out_of_order_measurement_features, reordered_inds); + auto targets = subset(out_of_order_targets, reordered_inds); + *y = targets.mean; + + this->mean_function_.remove_from( + subset(out_of_order_features, reordered_inds), &targets.mean); + + *K_fu = this->covariance_function_(features, inducing_features, + Base::threads_.get()); + + auto K_uu = + this->covariance_function_(inducing_features, Base::threads_.get()); + K_uu.diagonal() += + inducing_nugget_.value * Eigen::VectorXd::Ones(K_uu.rows()); + + *K_uu_ldlt = K_uu.ldlt(); + // P is such that: + // Q_ff = K_fu K_uu^-1 K_uf + // = K_fu K_uu^-T/2 K_uu^-1/2 K_uf + // = P^T P + const Eigen::MatrixXd P = K_uu_ldlt->sqrt_solve(K_fu->transpose()); + + // We only need the diagonal blocks of Q_ff to get A + BlockDiagonal Q_ff_diag; + Eigen::Index i = 0; + for (const auto &pair : indexer) { + Eigen::Index cols = cast::to_index(pair.second.size()); + auto P_cols = P.block(0, i, P.rows(), cols); + Q_ff_diag.blocks.emplace_back(P_cols.transpose() * P_cols); + i += cols; + } + auto A = K_ff - Q_ff_diag; + + // It's possible that the inducing points will perfectly describe + // some of the data, in which case we need to add a bit of extra + // noise to make sure lambda is invertible. 
+ for (auto &b : A.blocks) { + b.diagonal() += + measurement_nugget_.value * Eigen::VectorXd::Ones(b.rows()); + } + + *A_ldlt = A.ldlt(); + } + + Parameter measurement_nugget_; + Parameter inducing_nugget_; + InducingPointStrategy inducing_point_strategy_; + GrouperFunction independent_group_function_; +}; + +template +using SparseQRPicGaussianProcessRegression = + PICGaussianProcessRegression; + +template +auto pic_gp_from_covariance_and_mean( + CovFunc &&covariance_function, MeanFunc &&mean_function, + GrouperFunction &&grouper_function, InducingPointStrategy &&strategy, + const std::string &model_name, + QRImplementation qr __attribute__((unused)) = DenseQRImplementation{}) { + return PICGaussianProcessRegression< + typename std::decay::type, typename std::decay::type, + typename std::decay::type, + typename std::decay::type, + typename std::decay::type>( + std::forward(covariance_function), + std::forward(mean_function), + std::forward(grouper_function), + std::forward(strategy), model_name); +}; + +template +auto pic_gp_from_covariance_and_mean( + CovFunc &&covariance_function, MeanFunc &&mean_function, + GrouperFunction &&grouper_function, const std::string &model_name, + QRImplementation qr = DenseQRImplementation{}) { + return pic_gp_from_covariance_and_mean( + std::forward(covariance_function), + std::forward(mean_function), + std::forward(grouper_function), + StateSpaceInducingPointStrategy(), model_name, qr); +}; + +template +auto pic_gp_from_covariance(CovFunc &&covariance_function, + GrouperFunction &&grouper_function, + InducingPointStrategy &&strategy, + const std::string &model_name, + QRImplementation qr = DenseQRImplementation{}) { + return pic_gp_from_covariance_and_mean( + std::forward(covariance_function), ZeroMean(), + std::forward(grouper_function), + std::forward(strategy), model_name, qr); +}; + +template +auto pic_gp_from_covariance(CovFunc covariance_function, + GrouperFunction grouper_function, + const std::string &model_name, + QRImplementation qr = DenseQRImplementation{}) { + return pic_gp_from_covariance_and_mean< + CovFunc, decltype(ZeroMean()), GrouperFunction, + decltype(StateSpaceInducingPointStrategy()), QRImplementation>( + std::forward(covariance_function), ZeroMean(), + std::forward(grouper_function), + StateSpaceInducingPointStrategy(), model_name, qr); +}; + +} // namespace albatross + +#endif /* INCLUDE_ALBATROSS_MODELS_PIC_GP_H_ */ diff --git a/include/albatross/src/models/sparse_common.hpp b/include/albatross/src/models/sparse_common.hpp new file mode 100644 index 00000000..525ecee4 --- /dev/null +++ b/include/albatross/src/models/sparse_common.hpp @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2024 Swift Navigation Inc. + * Contact: Swift Navigation + * + * This source is subject to the license found in the file 'LICENSE' which must + * be distributed together with this source. All other rights reserved. + * + * THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, + * EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE. 
+ */
+
+#ifndef INCLUDE_ALBATROSS_MODELS_SPARSE_COMMON_H_
+#define INCLUDE_ALBATROSS_MODELS_SPARSE_COMMON_H_
+
+namespace albatross {
+
+namespace details {
+
+constexpr double DEFAULT_NUGGET = 1e-8;
+
+inline std::string measurement_nugget_name() { return "measurement_nugget"; }
+
+inline std::string inducing_nugget_name() { return "inducing_nugget"; }
+
+static constexpr double cSparseRNugget = 1.e-10;
+
+} // namespace details
+
+struct UniformlySpacedInducingPoints {
+
+  UniformlySpacedInducingPoints(std::size_t num_points_ = 10)
+      : num_points(num_points_) {}
+
+  template <typename CovarianceFunction>
+  std::vector<double> operator()(const CovarianceFunction &cov ALBATROSS_UNUSED,
+                                 const std::vector<double> &features) const {
+    double min = *std::min_element(features.begin(), features.end());
+    double max = *std::max_element(features.begin(), features.end());
+    return linspace(min, max, num_points);
+  }
+
+  std::size_t num_points;
+};
+
+struct StateSpaceInducingPointStrategy {
+
+  template <typename CovarianceFunction, typename FeatureType,
+            std::enable_if_t<has_valid_state_space_representation<
+                                 CovarianceFunction, FeatureType>::value,
+                             int> = 0>
+  auto operator()(const CovarianceFunction &cov,
+                  const std::vector<FeatureType> &features) const {
+    return cov.state_space_representation(features);
+  }
+
+  template <typename CovarianceFunction, typename FeatureType,
+            std::enable_if_t<!has_valid_state_space_representation<
+                                 CovarianceFunction, FeatureType>::value,
+                             int> = 0>
+  auto
+  operator()(const CovarianceFunction &cov ALBATROSS_UNUSED,
+             const std::vector<FeatureType> &features ALBATROSS_UNUSED) const
+      ALBATROSS_FAIL(
+          CovarianceFunction,
+          "Covariance function is missing state_space_representation method, "
+          "be sure _ssr_impl has been defined for the types concerned");
+};
+
+struct SPQRImplementation {
+  using QRType = Eigen::SPQR<Eigen::SparseMatrix<double>>;
+
+  static std::unique_ptr<QRType> compute(const Eigen::MatrixXd &m,
+                                         ThreadPool *threads) {
+    return SPQR_create(m.sparseView(), threads);
+  }
+};
+
+struct DenseQRImplementation {
+  using QRType = Eigen::ColPivHouseholderQR<Eigen::MatrixXd>;
+
+  static std::unique_ptr<QRType> compute(const Eigen::MatrixXd &m,
+                                         ThreadPool *threads
+                                         __attribute__((unused))) {
+    return std::make_unique<QRType>(m);
+  }
+};
+
+} // namespace albatross
+
+#endif // INCLUDE_ALBATROSS_MODELS_SPARSE_COMMON_H_
diff --git a/include/albatross/src/models/sparse_gp.hpp b/include/albatross/src/models/sparse_gp.hpp
index 36234e4e..90985cb4 100644
--- a/include/albatross/src/models/sparse_gp.hpp
+++ b/include/albatross/src/models/sparse_gp.hpp
@@ -15,81 +15,10 @@ namespace albatross {
 
-namespace details {
-
-constexpr double DEFAULT_NUGGET = 1e-8;
-
-inline std::string measurement_nugget_name() { return "measurement_nugget"; }
-
-inline std::string inducing_nugget_name() { return "inducing_nugget"; }
-
-static constexpr double cSparseRNugget = 1.e-10;
-
-} // namespace details
-
 template <typename CovFunc, typename GrouperFunction,
           typename InducingPointStrategy, typename QRImplementation>
 class SparseGaussianProcessRegression;
 
-struct UniformlySpacedInducingPoints {
-
-  UniformlySpacedInducingPoints(std::size_t num_points_ = 10)
-      : num_points(num_points_) {}
-
-  template <typename CovarianceFunction>
-  std::vector<double> operator()(const CovarianceFunction &cov ALBATROSS_UNUSED,
-                                 const std::vector<double> &features) const {
-    double min = *std::min_element(features.begin(), features.end());
-    double max = *std::max_element(features.begin(), features.end());
-    return linspace(min, max, num_points);
-  }
-
-  std::size_t num_points;
-};
-
-struct StateSpaceInducingPointStrategy {
-
-  template <typename CovarianceFunction, typename FeatureType,
-            std::enable_if_t<has_valid_state_space_representation<
-                                 CovarianceFunction, FeatureType>::value,
-                             int> = 0>
-  auto operator()(const CovarianceFunction &cov,
-                  const std::vector<FeatureType> &features) const {
-    return cov.state_space_representation(features);
-  }
-
-  template <typename CovarianceFunction, typename FeatureType,
-            std::enable_if_t<!has_valid_state_space_representation<
-                                 CovarianceFunction, FeatureType>::value,
-                             int> = 0>
-  auto
-  operator()(const CovarianceFunction &cov ALBATROSS_UNUSED,
-             const std::vector<FeatureType> &features ALBATROSS_UNUSED) const
-      ALBATROSS_FAIL(
-          CovarianceFunction,
-          "Covariance function is missing state_space_representation method, "
-          "be sure _ssr_impl has been defined for the types concerned");
-};
-
-struct SPQRImplementation {
-  using QRType = Eigen::SPQR<Eigen::SparseMatrix<double>>;
-
-  static std::unique_ptr<QRType> compute(const Eigen::MatrixXd &m,
-                                         ThreadPool *threads) {
-    return SPQR_create(m.sparseView(), threads);
-  }
-};
-
-struct DenseQRImplementation {
-  using QRType = Eigen::ColPivHouseholderQR<Eigen::MatrixXd>;
-
-  static std::unique_ptr<QRType> compute(const Eigen::MatrixXd &m,
-                                         ThreadPool *threads
-                                         __attribute__((unused))) {
-    return std::make_unique<QRType>(m);
-  }
-};
-
 template <typename FeatureType> struct SparseGPFit {};
 
 template <typename FeatureType> struct Fit<SparseGPFit<FeatureType>> {
diff --git a/tests/test_pic_gp.cc b/tests/test_pic_gp.cc
new file mode 100644
index 00000000..227497a6
--- /dev/null
+++ b/tests/test_pic_gp.cc
@@ -0,0 +1,404 @@
+/*
+ * Copyright (C) 2024 Swift Navigation Inc.
+ * Contact: Swift Navigation
+ *
+ * This source is subject to the license found in the file 'LICENSE' which must
+ * be distributed together with this source. All other rights reserved.
+ *
+ * THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND,
+ * EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#include
+#include
+
+#include "test_models.h"
+#include
+#include
+
+#include
+#include
+
+namespace albatross {
+
+struct GroupedLeaveOneIntervalOut {
+
+  GroupedLeaveOneIntervalOut(){};
+  explicit GroupedLeaveOneIntervalOut(double group_domain_size_)
+      : group_domain_size(group_domain_size_){};
+
+  int operator()(const double &f) const {
+    return static_cast<int>(floor(f / group_domain_size));
+  }
+
+  int operator()(const Measurement<double> &f) const {
+    return static_cast<int>(floor(f.value / group_domain_size));
+  }
+
+  double group_domain_size = 5.;
+};
+
+TEST(TestPicGP, TestPredictionExists) {
+  const auto dataset = make_toy_linear_data();
+  UniformlySpacedInducingPoints strategy(8);
+  GroupedLeaveOneIntervalOut grouper;
+  const auto pic =
+      pic_gp_from_covariance(make_simple_covariance_function(), grouper,
+                             strategy, "pic", DenseQRImplementation{});
+
+  const auto pic_fit = pic.fit(dataset);
+
+  const auto test_features = linspace(0.1, 9.9, 11);
+
+  const auto pic_pred =
+      pic_fit.predict_with_measurement_noise(test_features).joint();
+
+  EXPECT_GT(pic_pred.mean.size(), 0);
+}
+
+TEST(TestPicGP, ScalarEquivalence) {
+  static constexpr std::size_t kNumTrainPoints = 3;
+  static constexpr std::size_t kNumTestPoints = 1;
+  static constexpr std::size_t kNumInducingPoints = 3;
+
+  UniformlySpacedInducingPoints strategy(kNumInducingPoints);
+  // Set the grouper so that all the simple test data falls within one
+  // group.
+  GroupedLeaveOneIntervalOut grouper(10);
+  auto direct = gp_from_covariance(make_simple_covariance_function(), "direct");
+  auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper,
+                                    strategy, "pic", DenseQRImplementation{});
+
+  auto dataset = make_toy_linear_data(5, 1, 0.1, kNumTrainPoints);
+
+  auto direct_fit = direct.fit(dataset);
+  auto pic_fit = pic.fit(dataset);
+
+  auto test_features = linspace(0.1, 9.9, kNumTestPoints);
+  auto direct_pred =
+      direct_fit.predict_with_measurement_noise(test_features).joint();
+  auto pic_pred = pic_fit.predict_with_measurement_noise(test_features).joint();
+  EXPECT_LT(distance::wasserstein_2(direct_pred, pic_pred), 1e-12);
+}
+
+TEST(TestPicGP, SingleBlockEquivalence) {
+  static constexpr std::size_t kNumTrainPoints = 10;
+  static constexpr std::size_t kNumTestPoints = 10;
+  static constexpr std::size_t kNumInducingPoints = 4;
+
+  UniformlySpacedInducingPoints strategy(kNumInducingPoints);
+  // Set the grouper so that all the simple test data falls within one
+  // group.
+  GroupedLeaveOneIntervalOut grouper(10);
+  auto direct = gp_from_covariance(make_simple_covariance_function(), "direct");
+  auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper,
+                                    strategy, "pic", DenseQRImplementation{});
+
+  auto dataset = make_toy_linear_data(5, 1, 0.1, kNumTrainPoints);
+
+  auto direct_fit = direct.fit(dataset);
+  auto pic_fit = pic.fit(dataset);
+
+  auto test_features = linspace(0.1, 9.9, kNumTestPoints);
+  auto direct_pred =
+      direct_fit.predict_with_measurement_noise(test_features).joint();
+  auto pic_pred = pic_fit.predict_with_measurement_noise(test_features).joint();
+  EXPECT_LT(distance::wasserstein_2(direct_pred, pic_pred), 1e-10);
+}
+
+template <typename CovarianceType, typename InducingStrategy,
+          typename GrouperFunction>
+JointDistribution test_pic(const RegressionDataset<double> &data,
+                           const std::vector<double> &predict,
+                           CovarianceType &&cov, InducingStrategy &&inducing,
+                           GrouperFunction &&grouper) {
+  const auto u = inducing(cov, data.features);
+
+  auto bfp_cov = make_brute_force_pic_covariance(inducing(cov, data.features),
+                                                 cov, grouper);
+
+  BlockDiagonal K_ff;
+  const auto indexers = group_by(data.features, grouper).indexers();
+  std::vector<std::size_t> reordered_inds;
+  for (const auto &pair : indexers) {
+    reordered_inds.insert(reordered_inds.end(), pair.second.begin(),
+                          pair.second.end());
+    K_ff.blocks.emplace_back(cov(subset(data.features, pair.second)));
+    K_ff.blocks.back().diagonal() +=
+        subset(data.targets.covariance.diagonal(), pair.second);
+  }
+  const std::vector<double> ordered_features =
+      subset(data.features, reordered_inds);
+
+  Eigen::MatrixXd K_uu = cov(u, u);
+  K_uu.diagonal() += Eigen::VectorXd::Constant(K_uu.rows(), 1e-8);
+  const Eigen::MatrixXd K_fu = cov(ordered_features, u);
+
+  const Eigen::SerializableLDLT K_uu_ldlt = K_uu.ldlt();
+  const Eigen::MatrixXd P = K_uu_ldlt.sqrt_solve(K_fu.transpose());
+  const Eigen::MatrixXd Q_ff = P.transpose() * P;
+  BlockDiagonal Q_ff_diag;
+  Eigen::Index i = 0;
+  for (const auto &pair : indexers) {
+    const Eigen::Index cols = cast::to_index(pair.second.size());
+    auto P_cols = P.block(0, i, P.rows(), cols);
+    Q_ff_diag.blocks.emplace_back(P_cols.transpose() * P_cols);
+    i += cols;
+  }
+  // auto Lambda = K_ff - Q_ff_diag;
+
+  Eigen::MatrixXd Q_ff_lambda = Q_ff;
+  Eigen::Index k = 0;
+  for (const auto &block : K_ff.blocks) {
+    Q_ff_lambda.block(k, k, block.rows(), block.rows()) = block;
+    Q_ff_lambda.block(k, k, block.rows(), block.rows()).diagonal() +=
+        Eigen::VectorXd::Constant(block.rows(), 1e-2);
+    k += block.rows();
+  }
+
+  const auto print_matrix = [](const Eigen::MatrixXd &m, std::string &&label) {
+    std::cout << label << " (" << m.rows() << "x" << m.cols() << "):\n"
+              << m.format(Eigen::FullPrecision) << std::endl;
+  };
+
+  const auto print_vector = [](const Eigen::VectorXd &v, std::string &&label) {
+    std::cout << label << " (" << v.size()
+              << "): " << v.transpose().format(Eigen::FullPrecision)
+              << std::endl;
+  };
+
+  print_matrix(Q_ff_lambda, "Q_ff + Lambda");
+  // const BlockDiagonalLDLT Lambda_ldlt = Lambda.ldlt();
+
+  print_matrix(bfp_cov(ordered_features), "BFP: Q_ff + Lambda");
+
+  print_matrix(Q_ff_lambda - bfp_cov(ordered_features),
+               "Q_ff_lambda difference");
+
+  const Eigen::SerializableLDLT S = Q_ff_lambda.ldlt();
+
+  print_matrix(S.matrixL(), "PITC L");
+  print_vector(S.vectorD(), "PITC D");
+
+  Eigen::MatrixXd K_PIC(ordered_features.size(), predict.size());
+  Eigen::MatrixXd K_PITC(ordered_features.size(), predict.size());
+
+  Eigen::MatrixXd V_PIC =
+      Eigen::MatrixXd::Zero(ordered_features.size(), predict.size());
+
+  Eigen::MatrixXd Y = K_uu_ldlt.solve(K_fu.transpose());
+  print_matrix(Y, "Y");
+
+  Eigen::MatrixXd W = K_uu_ldlt.solve(S.solve(K_fu).transpose());
+  print_matrix(W, "W");
+
+  for (Eigen::Index f = 0; f < cast::to_index(ordered_features.size()); ++f) {
+    for (Eigen::Index p = 0; p < cast::to_index(predict.size()); ++p) {
+      const std::vector<double> pv{predict[p]};
+      K_PITC(f, p) = K_fu.row(f).dot(K_uu_ldlt.solve(cov(u, pv)).col(0));
+      if (grouper(predict[p]) == grouper(ordered_features[f])) {
+        K_PIC(f, p) = cov(ordered_features[f], predict[p]);
+        V_PIC(f, p) =
+            K_PIC(f, p) - K_fu.row(f).dot(K_uu_ldlt.solve(cov(u, pv)).col(0));
+      } else {
+        K_PIC(f, p) = K_fu.row(f).dot(K_uu_ldlt.solve(cov(u, pv)).col(0));
+      }
+    }
+  }
+
+  const Eigen::MatrixXd K_pu = cov(predict, u);
+  // Eigen::Index j = 0;
+  // Eigen::Index blk = 0;
+  // std::vector WW;
+
+  const Eigen::MatrixXd WBW = K_uu_ldlt.solve(S.solve(K_fu).transpose());
+  print_matrix(K_PIC, "K_PIC");
+  print_matrix(bfp_cov(ordered_features, predict), "BFP: K_PIC");
+
+  print_matrix(K_PIC - bfp_cov(ordered_features, predict), "K_PIC error");
+  print_matrix(K_PITC, "K_PITC");
+
+  print_matrix(V_PIC, "V_PIC");
+
+  print_matrix(WBW, "W");
+  const Eigen::MatrixXd U = K_pu * WBW * V_PIC;
+  print_matrix(U, "U");
+
+  auto SV = S.sqrt_solve(V_PIC);
+  const Eigen::MatrixXd VSV =
+      V_PIC.transpose() * S.solve(V_PIC); // SV.transpose() * SV;
+  print_matrix(VSV, "VSV");
+
+  const Eigen::MatrixXd predict_prior = cov(predict);
+
+  print_matrix(predict_prior, "prior");
+
+  print_matrix(bfp_cov(predict), "BFP: prior");
+
+  print_matrix(predict_prior - bfp_cov(predict), "prior error");
+
+  // auto KK = K_PITC_ldlt.sqrt_solve(K_PIC);
+  // const Eigen::MatrixXd explained_cov = KK.transpose() * KK;
+  const Eigen::MatrixXd explained_cov = K_PIC.transpose() * S.solve(K_PIC);
+  print_matrix(explained_cov, "explained");
+
+  const Eigen::VectorXd PIC_mean =
+      K_PIC.transpose() * S.solve(data.targets.mean);
+  print_vector(PIC_mean, "PIC mean");
+  const Eigen::MatrixXd predict_cov = predict_prior - explained_cov;
+
+  print_matrix(predict_cov, "K_**");
+
+  const Eigen::MatrixXd PITC_cov =
+      predict_prior - K_PITC.transpose() * S.solve(K_PITC);
+
+  const Eigen::MatrixXd PIC_cov = PITC_cov - U - U.transpose() - VSV;
+  print_matrix(PITC_cov, "PITC cov");
+  print_matrix(PIC_cov, "PIC cov");
+
+  print_matrix(predict_cov - PITC_cov, "PIC - PITC");
+  print_matrix(predict_cov - PIC_cov, "PIC - factored");
+
+  return JointDistribution{PIC_mean, PIC_cov};
+}
+
+TEST(TestPicGP, BruteForceEquivalenceOneBlock) {
+  static constexpr std::size_t kNumTrainPoints = 10;
+  static constexpr std::size_t kNumTestPoints = 10;
+  static constexpr std::size_t kNumInducingPoints = 5;
+
+  UniformlySpacedInducingPoints strategy(kNumInducingPoints);
+  GroupedLeaveOneIntervalOut grouper(10);
+  auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper,
+                                    strategy, "pic", DenseQRImplementation{});
+
+  auto dataset = make_toy_linear_data(5, 1, 0.1, kNumTrainPoints);
+  auto bfp_cov = make_brute_force_pic_covariance(
+      strategy(make_simple_covariance_function(), dataset.features),
+      make_simple_covariance_function(), grouper);
+  auto bfp = gp_from_covariance(bfp_cov);
+
+  auto bfp_fit = bfp.fit(dataset);
+  auto pic_fit = pic.fit(dataset);
+
+  auto test_features = linspace(0.1, 9.9, kNumTestPoints);
+  auto bfp_pred = bfp_fit.predict_with_measurement_noise(test_features).joint();
+  auto pic_pred = pic_fit.predict_with_measurement_noise(test_features).joint();
+  EXPECT_LT(distance::wasserstein_2(bfp_pred, pic_pred), 1e-11);
+}
+
+TEST(TestPicGP, BruteForceEquivalenceMultipleBlocks) {
+  static constexpr std::size_t kNumTrainPoints = 10;
+  static constexpr std::size_t kNumTestPoints = 10;
+  static constexpr std::size_t kNumInducingPoints = 5;
+
+  UniformlySpacedInducingPoints strategy(kNumInducingPoints);
+  GroupedLeaveOneIntervalOut grouper(2);
+  auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper,
+                                    strategy, "pic", DenseQRImplementation{});
+
+  auto dataset = make_toy_linear_data(5, 1, 0.1, kNumTrainPoints);
+  auto bfp_cov = make_brute_force_pic_covariance(
+      strategy(make_simple_covariance_function(), dataset.features),
+      make_simple_covariance_function(), grouper);
+  auto bfp = gp_from_covariance(bfp_cov);
+
+  auto bfp_fit = bfp.fit(dataset);
+  auto pic_fit = pic.fit(dataset);
+
+  auto test_features = linspace(0.1, 9.9, kNumTestPoints);
+  auto bfp_pred = bfp_fit.predict_with_measurement_noise(test_features).joint();
+  auto pic_pred = pic_fit.predict_with_measurement_noise(test_features).joint();
+
+  EXPECT_LT(distance::wasserstein_2(bfp_pred, pic_pred), 1e-12);
+}
+
+TEST(TestPicGP, PITCEquivalenceOutOfTraining) {
+  static constexpr std::size_t kNumTrainPoints = 10;
+  static constexpr std::size_t kNumTestPoints = 10;
+  static constexpr std::size_t kNumInducingPoints = 5;
+
+  UniformlySpacedInducingPoints strategy(kNumInducingPoints);
+  GroupedLeaveOneIntervalOut grouper(2);
+  auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper,
+                                    strategy, "pic", DenseQRImplementation{});
+
+  auto dataset = make_toy_linear_data(5, 1, 0.1, kNumTrainPoints);
+  auto pitc =
+      sparse_gp_from_covariance(make_simple_covariance_function(), grouper,
+                                strategy, "pitc", DenseQRImplementation{});
+
+  auto pic_fit = pic.fit(dataset);
+  auto pitc_fit = pitc.fit(dataset);
+
+  auto test_features = linspace(10.1, 19.9, kNumTestPoints);
+  auto pic_pred = pic_fit.predict_with_measurement_noise(test_features).joint();
+  auto pitc_pred =
+      pitc_fit.predict_with_measurement_noise(test_features).joint();
+
+  EXPECT_LT(distance::wasserstein_2(pic_pred, pitc_pred), 1e-12);
+}
+
+TEST(TestPicGP, PredictMeanEquivalent) {
+  static constexpr std::size_t kNumTrainPoints = 10;
+  static constexpr std::size_t kNumTestPoints = 10;
+  static constexpr std::size_t kNumInducingPoints = 5;
+
+  UniformlySpacedInducingPoints strategy(kNumInducingPoints);
+  GroupedLeaveOneIntervalOut grouper(2);
+  auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper,
+                                    strategy, "pic", DenseQRImplementation{});
+
+  auto dataset = make_toy_linear_data(5, 1, 0.1, kNumTrainPoints);
+
+  auto pic_fit = pic.fit(dataset);
+
+  auto test_features = linspace(0.1, 9.9, kNumTestPoints);
+  auto pic_pred = pic_fit.predict_with_measurement_noise(test_features).mean();
+  auto pic_joint_pred =
+      pic_fit.predict_with_measurement_noise(test_features).joint();
+
+  const double pic_mean_error = (pic_pred - pic_joint_pred.mean).norm();
+  EXPECT_LT(pic_mean_error, 1e-12);
+}
+
+TEST(TestPicGP, PredictMarginalEquivalent) {
+  static constexpr std::size_t kNumTrainPoints = 10;
+  static constexpr std::size_t kNumTestPoints = 10;
+  static constexpr std::size_t kNumInducingPoints = 5;
+
+  UniformlySpacedInducingPoints strategy(kNumInducingPoints);
+  GroupedLeaveOneIntervalOut grouper(2);
+  auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper,
+                                    strategy, "pic", DenseQRImplementation{});
+
+  auto dataset = make_toy_linear_data(5, 1, 0.1, kNumTrainPoints);
+
+  auto pic_fit = pic.fit(dataset);
+
+  auto test_features = linspace(0.1, 9.9, kNumTestPoints);
+  auto pic_pred =
+      pic_fit.predict_with_measurement_noise(test_features).marginal();
+  auto pic_joint_pred =
+      pic_fit.predict_with_measurement_noise(test_features).joint();
+
+  const double pic_marginal_error =
+      (pic_pred.mean - pic_joint_pred.mean).norm();
+  EXPECT_LT(pic_marginal_error, 1e-12);
+  const double pic_marginal_cov_error =
+      (Eigen::VectorXd(pic_pred.covariance.diagonal()) -
+       Eigen::VectorXd(pic_joint_pred.covariance.diagonal()))
+          .norm();
+  EXPECT_LT(pic_marginal_cov_error, 1e-12)
+      << "\nmarginal: "
+      << Eigen::VectorXd(pic_pred.covariance)
+             .transpose()
+             .format(Eigen::FullPrecision)
+      << "\njoint diag: "
+      << Eigen::VectorXd(pic_joint_pred.covariance.diagonal())
+             .transpose()
+             .format(Eigen::FullPrecision);
+}
+
+} // namespace albatross
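Reviewer note: the sketch below summarizes how the new factory functions are intended to be driven, mirroring the calls exercised in tests/test_pic_gp.cc above. GroupedLeaveOneIntervalOut, make_simple_covariance_function, and make_toy_linear_data are test fixtures from this test file and test_models.h rather than library API, so treat this as an illustrative sketch of the expected usage, not documentation of a stable interface.

    // Sketch only: assumes the fixtures from tests/test_pic_gp.cc and
    // tests/test_models.h are available in scope.
    using namespace albatross;

    // Inducing points spaced uniformly over the training domain.
    UniformlySpacedInducingPoints strategy(8);
    // Independent prediction blocks: one group per 2-unit interval.
    GroupedLeaveOneIntervalOut grouper(2.);

    // PIC model with a zero mean and a dense QR solve; passing
    // SPQRImplementation{} instead would select the sparse QR path.
    auto pic = pic_gp_from_covariance(make_simple_covariance_function(),
                                      grouper, strategy, "pic",
                                      DenseQRImplementation{});

    // Fit to toy data and predict; mean(), marginal(), and joint() are
    // expected to agree per the PredictMean/MarginalEquivalent tests.
    auto dataset = make_toy_linear_data(5, 1, 0.1, 10);
    auto pic_fit = pic.fit(dataset);
    auto pred =
        pic_fit.predict_with_measurement_noise(linspace(0.1, 9.9, 10)).joint();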