From 8090c4779107af053157e75199c4b5230349f1f7 Mon Sep 17 00:00:00 2001 From: Matt Peddie Date: Thu, 8 Aug 2024 01:04:54 +1000 Subject: [PATCH 01/12] Make BlockDiagonalLDLT serialiseable --- BUILD.bazel | 1 + include/albatross/serialize/GP | 1 + .../albatross/src/cereal/block_diagonal.hpp | 27 +++++++++++++++++++ 3 files changed, 29 insertions(+) create mode 100644 include/albatross/src/cereal/block_diagonal.hpp diff --git a/BUILD.bazel b/BUILD.bazel index a7f054c4..431088f1 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -48,6 +48,7 @@ cc_library( "include/albatross/serialize/Ransac", "include/albatross/src/cereal/ThreadPool.hpp", "include/albatross/src/cereal/block_utils.hpp", + "include/albatross/src/cereal/block_diagonal.hpp", "include/albatross/src/cereal/dataset.hpp", "include/albatross/src/cereal/distribution.hpp", "include/albatross/src/cereal/eigen.hpp", diff --git a/include/albatross/serialize/GP b/include/albatross/serialize/GP index f54633c5..2ecee09c 100644 --- a/include/albatross/serialize/GP +++ b/include/albatross/serialize/GP @@ -18,6 +18,7 @@ #include "../src/cereal/block_utils.hpp" #include "../src/cereal/serializable_ldlt.hpp" +#include "../src/cereal/block_diagonal.hpp" #include "../src/cereal/gp.hpp" #include "../src/cereal/representations.hpp" #include "../src/cereal/serializable_spqr.hpp" diff --git a/include/albatross/src/cereal/block_diagonal.hpp b/include/albatross/src/cereal/block_diagonal.hpp new file mode 100644 index 00000000..964602bd --- /dev/null +++ b/include/albatross/src/cereal/block_diagonal.hpp @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2019 Swift Navigation Inc. + * Contact: Swift Navigation + * + * This source is subject to the license found in the file 'LICENSE' which must + * be distributed together with this source. All other rights reserved. 
+ * + * THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, + * EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE. + */ + +#ifndef ALBATROSS_SRC_CEREAL_BLOCK_DIAGONAL_HPP_ +#define ALBATROSS_SRC_CEREAL_BLOCK_DIAGONAL_HPP_ + +namespace cereal { + +template +inline void serialize(Archive &archive, + const albatross::BlockDiagonalLDLT &ldlt, + const std::uint32_t version) { + archive(cereal::make_nvp("blocks", ldlt.blocks)); +} + +} // namespace cereal + +#endif /* ALBATROSS_SRC_CEREAL_BLOCK_DIAGONAL_HPP_ */ From 31a0b21ad09f5305e352177db2cc4f917c27dd9e Mon Sep 17 00:00:00 2001 From: Matt Peddie Date: Thu, 8 Aug 2024 01:05:35 +1000 Subject: [PATCH 02/12] Factor out common sparse GP stuff --- BUILD.bazel | 1 + include/albatross/SparseGP | 1 + .../albatross/src/models/sparse_common.hpp | 92 +++++++++++++++++++ include/albatross/src/models/sparse_gp.hpp | 71 -------------- 4 files changed, 94 insertions(+), 71 deletions(-) create mode 100644 include/albatross/src/models/sparse_common.hpp diff --git a/BUILD.bazel b/BUILD.bazel index 431088f1..71536037 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -130,6 +130,7 @@ cc_library( "include/albatross/src/models/null_model.hpp", "include/albatross/src/models/ransac.hpp", "include/albatross/src/models/ransac_gp.hpp", + "include/albatross/src/models/sparse_common.hpp", "include/albatross/src/models/sparse_gp.hpp", "include/albatross/src/samplers/callbacks.hpp", "include/albatross/src/samplers/ensemble.hpp", diff --git a/include/albatross/SparseGP b/include/albatross/SparseGP index 2a26aab2..925bb506 100644 --- a/include/albatross/SparseGP +++ b/include/albatross/SparseGP @@ -23,6 +23,7 @@ #include #include #include +#include #include #endif diff --git a/include/albatross/src/models/sparse_common.hpp b/include/albatross/src/models/sparse_common.hpp new file mode 100644 index 00000000..b5267b99 --- /dev/null +++ 
b/include/albatross/src/models/sparse_common.hpp @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2024 Swift Navigation Inc. + * Contact: Swift Navigation + * + * This source is subject to the license found in the file 'LICENSE' which must + * be distributed together with this source. All other rights reserved. + * + * THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, + * EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE. + */ + +#ifndef INCLUDE_ALBATROSS_MODELS_SPARSE_COMMON_H_ +#define INCLUDE_ALBATROSS_MODELS_SPARSE_COMMON_H_ + +namespace albatross { + +namespace details { + +constexpr double DEFAULT_NUGGET = 1e-8; + +inline std::string measurement_nugget_name() { return "measurement_nugget"; } + +inline std::string inducing_nugget_name() { return "inducing_nugget"; } + +static constexpr double cSparseRNugget = 1.e-10; + +} // namespace details + +struct UniformlySpacedInducingPoints { + + UniformlySpacedInducingPoints(std::size_t num_points_ = 10) + : num_points(num_points_) {} + + template + std::vector operator()(const CovarianceFunction &cov ALBATROSS_UNUSED, + const std::vector &features) const { + double min = *std::min_element(features.begin(), features.end()); + double max = *std::max_element(features.begin(), features.end()); + return linspace(min, max, num_points); + } + + std::size_t num_points; +}; + +struct StateSpaceInducingPointStrategy { + + template ::value, + int> = 0> + auto operator()(const CovarianceFunction &cov, + const std::vector &features) const { + return cov.state_space_representation(features); + } + + template ::value, + int> = 0> + auto + operator()(const CovarianceFunction &cov ALBATROSS_UNUSED, + const std::vector &features ALBATROSS_UNUSED) const + ALBATROSS_FAIL( + CovarianceFunction, + "Covariance function is missing state_space_representation method, " + "be sure _ssr_impl has been defined for the types concerned"); 
+}; + +struct SPQRImplementation { + using QRType = Eigen::SPQR>; + + static std::unique_ptr compute(const Eigen::MatrixXd &m, + ThreadPool *threads) { + return SPQR_create(m.sparseView(), threads); + } +}; + +struct DenseQRImplementation { + using QRType = Eigen::ColPivHouseholderQR; + + static std::unique_ptr compute(const Eigen::MatrixXd &m, + ThreadPool *threads + __attribute__((unused))) { + return std::make_unique(m); + } +}; + + +} // namespace albatross + +#endif // INCLUDE_ALBATROSS_MODELS_SPARSE_COMMON_H_ \ No newline at end of file diff --git a/include/albatross/src/models/sparse_gp.hpp b/include/albatross/src/models/sparse_gp.hpp index 36234e4e..90985cb4 100644 --- a/include/albatross/src/models/sparse_gp.hpp +++ b/include/albatross/src/models/sparse_gp.hpp @@ -15,81 +15,10 @@ namespace albatross { -namespace details { - -constexpr double DEFAULT_NUGGET = 1e-8; - -inline std::string measurement_nugget_name() { return "measurement_nugget"; } - -inline std::string inducing_nugget_name() { return "inducing_nugget"; } - -static constexpr double cSparseRNugget = 1.e-10; - -} // namespace details - template class SparseGaussianProcessRegression; -struct UniformlySpacedInducingPoints { - - UniformlySpacedInducingPoints(std::size_t num_points_ = 10) - : num_points(num_points_) {} - - template - std::vector operator()(const CovarianceFunction &cov ALBATROSS_UNUSED, - const std::vector &features) const { - double min = *std::min_element(features.begin(), features.end()); - double max = *std::max_element(features.begin(), features.end()); - return linspace(min, max, num_points); - } - - std::size_t num_points; -}; - -struct StateSpaceInducingPointStrategy { - - template ::value, - int> = 0> - auto operator()(const CovarianceFunction &cov, - const std::vector &features) const { - return cov.state_space_representation(features); - } - - template ::value, - int> = 0> - auto - operator()(const CovarianceFunction &cov ALBATROSS_UNUSED, - const std::vector &features 
ALBATROSS_UNUSED) const - ALBATROSS_FAIL( - CovarianceFunction, - "Covariance function is missing state_space_representation method, " - "be sure _ssr_impl has been defined for the types concerned"); -}; - -struct SPQRImplementation { - using QRType = Eigen::SPQR>; - - static std::unique_ptr compute(const Eigen::MatrixXd &m, - ThreadPool *threads) { - return SPQR_create(m.sparseView(), threads); - } -}; - -struct DenseQRImplementation { - using QRType = Eigen::ColPivHouseholderQR; - - static std::unique_ptr compute(const Eigen::MatrixXd &m, - ThreadPool *threads - __attribute__((unused))) { - return std::make_unique(m); - } -}; - template struct SparseGPFit {}; template struct Fit> { From 0af3b22f687b11513d98af0f1efed17bcf575085 Mon Sep 17 00:00:00 2001 From: Matt Peddie Date: Thu, 8 Aug 2024 01:06:09 +1000 Subject: [PATCH 03/12] Introduce PIC approximation --- BUILD.bazel | 2 + doc/src/crappy-pic.rst | 164 ++++ doc/src/sparse-gp-details.rst | 141 +++ include/albatross/SparseGP | 1 + include/albatross/src/cereal/gp.hpp | 24 + include/albatross/src/models/pic_gp.hpp | 1054 +++++++++++++++++++++++ tests/test_pic_gp.cc | 404 +++++++++ 7 files changed, 1790 insertions(+) create mode 100644 doc/src/crappy-pic.rst create mode 100644 include/albatross/src/models/pic_gp.hpp create mode 100644 tests/test_pic_gp.cc diff --git a/BUILD.bazel b/BUILD.bazel index 71536037..422058cc 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -128,6 +128,7 @@ cc_library( "include/albatross/src/models/gp.hpp", "include/albatross/src/models/least_squares.hpp", "include/albatross/src/models/null_model.hpp", + "include/albatross/src/models/pic_gp.hpp", "include/albatross/src/models/ransac.hpp", "include/albatross/src/models/ransac_gp.hpp", "include/albatross/src/models/sparse_common.hpp", @@ -230,6 +231,7 @@ swift_cc_test( "tests/test_models.cc", "tests/test_models.h", "tests/test_parameter_handling_mixin.cc", + "tests/test_pic_gp.cc", "tests/test_prediction.cc", "tests/test_radial.cc", 
"tests/test_random_utils.cc", diff --git a/doc/src/crappy-pic.rst b/doc/src/crappy-pic.rst new file mode 100644 index 00000000..0005dc14 --- /dev/null +++ b/doc/src/crappy-pic.rst @@ -0,0 +1,164 @@ +################################################# +PIC math scratchpad +################################################# + +.. _crappy-pic: + +------------------------------------------------------------------- +Relationship between :math:`\sigma_\cancel{B}` and :math:`\sigma_f` +------------------------------------------------------------------- + +It seems like what we need first of all is some way to get the PITC posterior variance based only on :math:`\cancel{B}` using only the full PITC posterior and calculations that scale only with :math:`|B|`. I'm not totally finished working out how :math:`(\sigma^2_{*})^{\cancel{B} \cancel{B}}` fits into the whole posterior :math:`(\sigma^2_*)^{PIC}` + +Observe that since :math:`\mathbf{\Lambda}` is a block-diagonal matrix, its inverse is too, so when it's in the middle of a quadratic form, the cross terms disappear. As far as I can tell, this is the only place we can additively separate :math:`\mathbf{Q}_{\cancel{B} \cancel{B}}` and :math:`\mathbf{K}_{B B}` without explicit cross terms. Start with the usual application of Woodbury's lemma to the PITC posterior covariance: + +.. math:: + + \newcommand{Lam}{\mathbf{\Lambda}} + \newcommand{Kuu}{\mathbf{K}_{uu}} + \newcommand{Kuf}{\mathbf{K}_{uf}} + \newcommand{Kfu}{\mathbf{K}_{fu}} + (\sigma_*^2) &= K_* - \mathbf{Q}_{* f} \left(\mathbf{K}_{fu} \mathbf{K}_{uu}^{-1} \mathbf{K}_{uf} + \mathbf{\Lambda} \right)^{-1} \mathbf{Q}_{f *} \\ + &= K_* - \mathbf{K}_{*u} \Kuu^{-1} \Kuf \left(\Kfu \Kuu^{-1} \Kuf + \Lam\right)^{-1} \Kfu \Kuu^{-1} \mathbf{K}_{u*} \\ + &= K_* - \mathbf{K}_{*u} \Kuu^{-1} \mathbf{K}_{u*} + \mathbf{K}_{*u} \left( \Kuu + \Kuf \Lam^{-1} \Kfu \right)^{-1} \mathbf{K}_{u*} + +Now break up :math:`\Kuf` and :math:`\Lam`: + +.. 
math:: + + \Kuf &= \begin{bmatrix} \mathbf{K}_{u \cancel{B}} & \mathbf{K}_{u B} \end{bmatrix} \\ + \Lam &= \begin{bmatrix} \Lam_{\cancel{B}} & \mathbf{0} \\ \mathbf{0} & \Lam_{B} \end{bmatrix} \\ + +so that + +.. math:: + + \Kuf \Lam^{-1} \Kfu &= \begin{bmatrix} \mathbf{K}_{u \cancel{B}} & \mathbf{K}_{u B} \end{bmatrix} + \begin{bmatrix} \Lam_{\cancel{B}}^{-1} & \mathbf{0} \\ \mathbf{0} & \Lam_{B}^{-1} \end{bmatrix} + \begin{bmatrix} \mathbf{K}_{\cancel{B} u} \\ \mathbf{K}_{B u} \end{bmatrix} \\ + &= \mathbf{K}_{u \cancel{B}} \Lam_{\cancel{B}}^{-1} \mathbf{K}_{\cancel{B} u} + + \mathbf{K}_{u B} \Lam_{B}^{-1} \mathbf{K}_{B u} \\ + \mathbf{K}_{u \cancel{B}} \Lam_{\cancel{B}}^{-1} \mathbf{K}_{\cancel{B} u} &= \Kuf \Lam^{-1} \Kfu - \mathbf{K}_{u B} \Lam_{B}^{-1} \mathbf{K}_{B u} + +This last one is the inverse term in the training-data dependent part of the PITC posterior covariance using only the training points in :math:`\cancel{B}`, which I think is what we want to combine with the dense posterior from the training points in :math:`B`: + +.. math:: + (\sigma_*^2)^{\cancel{B} \cancel{B}} = K_* - \mathbf{K}_{*u} \Kuu^{-1} \mathbf{K}_{u*} + \mathbf{K}_{*u} \left( \Kuu + \mathbf{K}_{u \cancel{B}} \Lam_{\cancel{B}}^{-1} \mathbf{K}_{\cancel{B} u} \right)^{-1} \mathbf{K}_{u*} + +which we can write in terms of only the full PITC prior and the prior for group :math:`B`: + +.. math:: + (\sigma_*^2)^{\cancel{B} \cancel{B}} = K_* - \mathbf{K}_{*u} \Kuu^{-1} \mathbf{K}_{u*} + \mathbf{K}_{*u} \left( \Kuu + \Kuf \Lam^{-1} \Kfu - \mathbf{K}_{u B} \Lam_{B}^{-1} \mathbf{K}_{B u} \right)^{-1} \mathbf{K}_{u*} + +If you set :math:`B = \emptyset`, removing the correction for the training points in block :math:`B` entirely, you have the original PITC posterior covariance based on the full set of training points :math:`f`, as expected. 
Now we only have to compute :math:`\Kuf \Lam^{-1} \Kfu` (the term that scales with the full training set :math:`f`) once, and nothing that scales with :math:`|\cancel{B}|`. This still involves an :math:`O(M^3)` decomposition for each group :math:`B`. I have been assuming our group size :math:`|B|` is significantly smaller than the number of inducing points :math:`M`, so it's not great, but an interesting thing happens if we use the "downdate" form of Woodbury's lemma in the opposite direction to the usual way: + +.. math:: + \left(A - B C^{-1} B^T \right)^{-1} &= A^{-1} + A^{-1} B \left(C - B^T A^{-1} B\right)^{-1} B^T A^{-1} \\ + +Letting :math:`\mathbf{A} = \Kuu + \Kuf \Lam^{-1} \Kfu` (playing the part of :math:`A` in Woodbury's lemma): + +.. math:: + + (\sigma_*^2)^{\cancel{B} \cancel{B}} &= K_* - \mathbf{K}_{*u} \Kuu^{-1} \mathbf{K}_{u*} + \mathbf{K}_{*u} \left( \mathbf{A}^{-1} + \mathbf{A}^{-1} \mathbf{K}_{u B} \left(\Lam_{B} - \mathbf{K}_{B u} \mathbf{A}^{-1} \mathbf{K}_{u B}\right)^{-1} \mathbf{K}_{B u} \mathbf{A}^{-1} \right) \mathbf{K}_{u*} \\ + &= K_* - \mathbf{K}_{*u} \Kuu^{-1} \mathbf{K}_{u*} + \mathbf{K}_{*u} \mathbf{A}^{-1} \mathbf{K}_{u*} + \mathbf{K}_{*u} \mathbf{A}^{-1} \mathbf{K}_{u B} \left(\Lam_{B} - \mathbf{K}_{B u} \mathbf{A}^{-1} \mathbf{K}_{u B}\right)^{-1} \mathbf{K}_{B u} \mathbf{A}^{-1} \mathbf{K}_{u*} \\ + +I find the use of Woodbury's lemma to downdate a bit uncomfortable -- does this work with real symmetric matrices? But we would only need to decompose :math:`\mathbf{A}` (and also :math:`\Kuu`) at a cost of :math:`O(M^3)` once; the inversion to be calculated per group costs :math:`O(|B|^3)`. I also don't immediately spot a slick QR decomposition way to do this per-group term; it's pretty symmetrical. The only thing that jumps out is precomputing :math:`\mathbf{K}_{B u} \mathbf{A}^{-\frac{1}{2}}`. 
+ +---------------------------------------------- +Derivation of Woodbury's Lemma for Differences +---------------------------------------------- + +This is specifically for symmetric matrices, to convince myself the downdate formula is legit. We would like to compute a rank-:math:`k` downdate to the inverse of a matrix :math:`A`. + +.. math:: + \left(A - B C^{-1} B^T \right)^{-1} &= A^{-1} + A^{-1} B \left(C - B^T A^{-1} B\right)^{-1} B^T A^{-1} \\ + +To be extra clear, let's say :math:`A \in \mathbb{R}^{n \times n}`, :math:`B \in \mathbb{R}^{n \times k}`, :math:`C \in \mathbb{R}^{k \times k}`. We put the relevant matrices into a block matrix :math:`M \in \mathbb{R}^{m \times m}` for :math:`m = n + k`, and label the conforming blocks of its inverse: + +.. math:: + M &= \begin{pmatrix} A & B \\ B^T & C\end{pmatrix} \\ + M^{-1} &= \begin{pmatrix} W & X \\ X^T & Y\end{pmatrix} \\ + M M^{-1} &= \begin{pmatrix} \mathbf{I} & \mathbf{0} \\ \mathbf{0} & \mathbf{I} \end{pmatrix} \\ + M^{-1} M &= \begin{pmatrix} \mathbf{I} & \mathbf{0} \\ \mathbf{0} & \mathbf{I} \end{pmatrix} + +The blocks of :math:`M M^{-1}` yield the following equations: + +.. math:: + AW + BX^T &= \mathbf{I} \\ + AX + BY &= \mathbf{0} \\ + B^T W + CX^T &= \mathbf{0} \\ + B^T X + CY &= \mathbf{I} \\ + +Rearrange the middle two: + +.. math:: + X &= -A^{-1} B Y \\ + X^T &= -C^{-1} B^T W + +If we do the same for :math:`M^{-1} M`: + +.. math:: + X &= -W B C^{-1} \\ + X^T &= -Y B^T A^{-1} + +These blocks are equal: + +.. math:: + C^{-1} B^T W &= Y B^T A^{-1} \\ + A^{-1} B Y &= W B C^{-1} \\ + +Now use the middle two equations from :math:`M M^{-1}` and plug them into the first and last equations respectively: + +.. 
math:: + W - A^{-1} B C^{-1} B^T W &= A^{-1} \\ + \left(I - A^{-1} B C^{-1} B^T\right) W &= A^{-1} \\ + -C^{-1} B^T A^{-1} B Y + Y &= C^{-1} \\ + \left(I - C^{-1} B^T A^{-1} B\right)Y &= C^{-1} \\ + +Assuming :math:`\left(I - A^{-1} B C^{-1} B^T\right)` and :math:`\left(I - C^{-1} B^T A^{-1} B\right)` are invertible, rearrange: + +.. math:: + W &= \left(I - A^{-1} B C^{-1} B^T\right)^{-1} A^{-1} \\ + &= \left(A - B C^{-1} B^T\right)^{-1} \\ + Y &= \left(I - C^{-1} B^T A^{-1} B\right)^{-1} C^{-1} \\ + &= \left(C - B^T A^{-1} B\right)^{-1} + +Now use equality of the off-diagonal blocks from the two ways to multiply :math:`M` and :math:`M^{-1}` above (don't you wish Sphinx would number equations?) and substitute: + +.. math:: + C^{-1} B^T \left(A - B C^{-1} B^T\right)^{-1} &= \left(C - B^T A^{-1} B\right)^{-1} B^T A^{-1} \\ + \left(A - B C^{-1} B^T\right)^{-1} B C^{-1} &= A^{-1} B \left(C - B^T A^{-1} B\right)^{-1} \\ + +Let's look just at the term :math:`\left(A - B C^{-1} B^T\right)` and right-multiply by :math:`-A^{-1}`: + +.. math:: + \left(A - B C^{-1} B^T\right) \left(-A^{-1}\right) &= -\mathbf{I} + B C^{-1} B^T A^{-1} \\ + \mathbf{I} + \left(A - B C^{-1} B^T\right) \left(-A^{-1}\right) &= B C^{-1} B^T A^{-1} + +Now let's return to the previous equation and right-multiply by :math:`B^T A^{-1}`: + +.. math:: + \left(A - B C^{-1} B^T\right)^{-1} B C^{-1} B^T A^{-1} &= A^{-1} B \left(C - B^T A^{-1} B\right)^{-1} B^T A^{-1} \\ + +Substitute the previous result for :math:`B C^{-1} B^T A^{-1}`: + +.. 
math:: + \left(A - B C^{-1} B^T\right)^{-1} \left( \mathbf{I} + \left(A - B C^{-1} B^T\right) \left(-A^{-1}\right) \right) &= A^{-1} B \left(C - B^T A^{-1} B\right)^{-1} B^T A^{-1} \\ + \left(A - B C^{-1} B^T\right)^{-1} - A^{-1} &= A^{-1} B \left(C - B^T A^{-1} B\right)^{-1} B^T A^{-1} \\ + \left(A - B C^{-1} B^T \right)^{-1} &= A^{-1} + A^{-1} B \left(C - B^T A^{-1} B\right)^{-1} B^T A^{-1} \blacksquare + +which is the difference form of Woodbury's lemma, or a rank-:math:`k` downdate of :math:`A^{-1}`. The main assumption seems to be that :math:`\left(I - A^{-1} B C^{-1} B^T\right)` and :math:`\left(I - C^{-1} B^T A^{-1} B\right)` are invertible. + +------------------------ +Blockwise inversion of S +------------------------ + +I tried to invert :math:`\mathbf{S}` blockwise; it didn't work out but I'm leaving it in here just to look back at: + +.. math:: + \newcommand{VV}{\mathbf{V}} + \mathbf{U} &= \mathbf{Q}_{* \cancel{B}} \mathbf{S}^{-1}_{\cancel{B} B} \VV_{B *} \\ + &= \mathbf{Q}_{* \cancel{B}} \left( \mathbf{Q}_{\cancel{B} \cancel{B}}^{-1} \mathbf{Q}_{\cancel{B} B} \left(\mathbf{Q}_{B B} - \mathbf{Q}_{B \cancel{B}} \mathbf{Q}_{\cancel{B} \cancel{B}}^{-1} \mathbf{Q}_{\cancel{B} B}\right)^{-1} \right) \VV_{B *} + + +How are we going to avoid the :math:`O(|\cancel{B}|^3)` inversion for :math:`\mathbf{Q}_{\cancel{B} \cancel{B}}`? Also we are going to get cross-terms with both :math:`\mathbf{Q}_{* \cancel{B}}` and :math:`\mathbf{Q}_{B B}` involved, but maybe they are OK because they are only :math:`O(|\cancel{B}| |B|)`? \ No newline at end of file diff --git a/doc/src/sparse-gp-details.rst b/doc/src/sparse-gp-details.rst index ce5bab0a..92f94773 100644 --- a/doc/src/sparse-gp-details.rst +++ b/doc/src/sparse-gp-details.rst @@ -338,3 +338,144 @@ However, we found that while the posterior mean predictions were numerically sta You should be able to find this implementation and details using git history. 
+-------------------- +PIC +-------------------- + +The PIC approximation is introduced in: + + [5] Local and global sparse Gaussian process approximations + Edward Snelson and Zoubin Ghahramani + https://proceedings.mlr.press/v2/snelson07a/snelson07a.pdf + +without any details of the covariance calculation. + +Mean +^^^^ + +.. math:: + \newcommand{Lam}{\mathbf{\Lambda}} + \newcommand{Kuu}{\mathbf{K}_{uu}} + \newcommand{Kuf}{\mathbf{K}_{uf}} + \newcommand{Kfu}{\mathbf{K}_{fu}} + \newcommand{VV}{\mathbf{V}} + \mu^{PIC} &= \tilde{\mathbf{K}}^{PIC}_{*f} \left[ \tilde{\mathbf{K}}^{PITC}_{ff} \right]^{-1} y \\ + &= \begin{bmatrix} \mathbf{Q}_{* \cancel{B}} & \mathbf{Q}_{* B} + \mathbf{V}_{* B}\end{bmatrix} \begin{bmatrix} \mathbf{Q}_{\cancel{B} \cancel{B}} & \mathbf{Q}_{\cancel{B} B} \\ \mathbf{Q}_{B \cancel{B}} & \mathbf{Q}_{B B} \end{bmatrix}^{-1} \begin{bmatrix} y_{\cancel{B}} \\ y_{B} \end{bmatrix} + +Breaking down :math:`\mathbf{K}_{* B} = \mathbf{Q}_{* B} + \VV_{* B}` we can write the PIC mean as a correction to the PITC mean: + +.. math:: + \mu^{PIC} &= \mu^{PITC} + \underbrace{\begin{bmatrix} \mathbf{0} & \VV_{* B} \end{bmatrix} \begin{bmatrix} \mathbf{Q}_{\cancel{B} \cancel{B}} & \mathbf{Q}_{\cancel{B} B} \\ \mathbf{Q}_{B \cancel{B}} & \mathbf{Q}_{B B} \end{bmatrix}^{-1} \begin{bmatrix} y_{\cancel{B}} \\ y_{B} \end{bmatrix} }_{\Delta \mu^{PIC}}\\ + +The inverse of the PITC term can be had via Woodbury's lemma and the QR decomposition. + +.. 
math:: + \Delta \mu^{PIC} &=\begin{bmatrix} \mathbf{0} & \VV_{* B} \end{bmatrix} \begin{bmatrix} \mathbf{Q}_{\cancel{B} \cancel{B}} & \mathbf{Q}_{\cancel{B} B} \\ \mathbf{Q}_{B \cancel{B}} & \mathbf{Q}_{B B} \end{bmatrix}^{-1} \begin{bmatrix} y_{\cancel{B}} \\ y_{B} \end{bmatrix} \\ + &= \begin{bmatrix} \mathbf{0} & \VV_{* B} \end{bmatrix} \left(\Kfu \Kuu^{-1} \Kuf + \Lam\right)^{-1} y \\ + &= \begin{bmatrix} \mathbf{0} & \VV_{* B} \end{bmatrix} \left( \Lam^{-1} - \Lam^{-1} \Kfu \left(\Kuu + \Kuf \Lam^{-1} \Kfu\right)^{-1} \Kuf \Lam^{-1} \right) y + +Assume we have already computed the PITC approximation as described above. We already have access to the information vector :math:`v = P R^{-1} Q^T_1 \Lam^{-\frac{1}{2}} y` by the QR decomposition of :math:`B = \begin{bmatrix} \Lam^{-\frac{1}{2}} \Kfu \\ L^T_{uu}\end{bmatrix}`. We can mimic the solution of the PITC approximation, but since we are premultiplying by the block-diagonal matrix :math:`\Lam^{-1}`, we can split this into per-block components: + +.. math:: + \Delta \mu^{PIC} &= \begin{bmatrix} \mathbf{0} & \VV_{* B} \end{bmatrix} \Lam^{-1} \left(y - \Kfu v\right) \\ + &= \VV_{* B} \Lam^{-1}_{B} \left(y_{B} - \mathbf{K}_{B u} v \right) + +Covariance +^^^^^^^^^^ + +The way I started partitioning the covariance term into blocks is as follows: + +.. 
math:: + + (\sigma_*^2)^{PIC} &= K_* - \tilde{\mathbf{K}}^{PIC}_{*f} \left[ \tilde{\mathbf{K}}^{PITC}_{ff} \right]^{-1} \tilde{\mathbf{K}}^{PIC}_{f*} + \sigma^2 \\ + &= K_* - \begin{bmatrix} \mathbf{Q}_{* \cancel{B}} & \mathbf{K}_{* B} \end{bmatrix} \left(\mathbf{Q}_{ff} - \mathtt{blkdiag}(\mathbf{K}_{ff} - \mathbf{Q}_{ff})\right)^{-1} \begin{bmatrix} \mathbf{Q}_{\cancel{B} *} \\ \mathbf{K}_{B *} \end{bmatrix} \\ + &= K_* - \begin{bmatrix} \mathbf{Q}_{* \cancel{B}} & \mathbf{K}_{* B} \end{bmatrix} \left(\mathbf{K}_{fu} \mathbf{K}_{uu}^{-1} \mathbf{K}_{uf} + \Lam \right)^{-1} \begin{bmatrix} \mathbf{Q}_{\cancel{B} *} \\ \mathbf{K}_{B *} \end{bmatrix} + +The problem with doing this for PIC covariance (vs. PIC mean and PITC) is that we can't left-multiply the whole thing by :math:`\mathbf{K}_{uu}^{-1} \mathbf{K}_{uf}` (which in those instances leads to applying Woodbury's lemma to reduce the inverse to the size of the number of inducing points :math:`M`) because :math:`\mathbf{K}_{*B}` is not a low-rank approximation using the inducing points. We can instead break up the inverse term into blocks: + +.. math:: + (\sigma_*^2)^{PIC} &= K_* - + \begin{bmatrix} \mathbf{Q}_{* \cancel{B}} & \mathbf{K}_{* B}\end{bmatrix} + \begin{bmatrix} \mathbf{Q}_{\cancel{B} \cancel{B}} & \mathbf{Q}_{\cancel{B} B} \\ \mathbf{Q}_{B \cancel{B}} & \mathbf{Q}_{B B} \end{bmatrix}^{-1} + \begin{bmatrix} \mathbf{Q}_{\cancel{B} *} \\ \mathbf{K}_{B *}\end{bmatrix} + +If we substitute :math:`\mathbf{K}_{* B} = \mathbf{Q}_{* B} + \VV_{* B}` as with the mean, it doesn't work out nicely: + +.. 
math:: + (\sigma_*^2)^{PIC} &= K_* - + \begin{bmatrix} \mathbf{Q}_{* \cancel{B}} & \mathbf{Q}_{* B} + \VV_{* B} \end{bmatrix} + \underbrace{\begin{bmatrix} \mathbf{Q}_{\cancel{B} \cancel{B}} & \mathbf{Q}_{\cancel{B} B} \\ \mathbf{Q}_{B \cancel{B}} & \mathbf{Q}_{B B} \end{bmatrix}^{-1}}_{\mathbf{S}^{-1}} + \begin{bmatrix} \mathbf{Q}_{\cancel{B} *} \\ \mathbf{Q}_{B *} + \VV_{B *}\end{bmatrix} \\ + &= K_* - \mathbf{Q}_{**}^{PITC} - \underbrace{\mathbf{Q}_{* f} \mathbf{S}^{-1}_{f B} \VV_{B *}}_{\mathbf{U}} - \mathbf{U}^T - \mathbf{V}_{* B} \mathbf{S}^{-1}_{B B} \mathbf{V}_{B *} + +Now we have 3 correction terms to apply to the posterior PITC covariance. The best thing I can think of is to apply Woodbury's lemma, but in the opposite direction to usual: + +.. math:: + \mathbf{S}^{-1} &= \left(\Kfu \Kuu^{-1} \Kuf + \Lam\right)^{-1} \\ + &= \Lam^{-1} - \Lam^{-1} \Kfu \left( \Kuu + \Kuf \Lam^{-1} \Kfu \right)^{-1} \Kuf \Lam^{-1} + +which involves decomposing the block-diagonal matrix :math:`\Lam` with blocks of size :math:`|B|` and a matrix the size :math:`M` of the inducing point set. In practice after we precompute terms, we have a sequence of triangular factors that we can subset as needed to pick out :math:`B` and :math:`\cancel{B}`. (Confusingly, one of these useful decompositions is the QR decomposition of the unrelated matrix :math:`B` in the PITC derivation above.) + +.. math:: + \mathbf{U} &= \mathbf{Q}_{* f} \mathbf{S}^{-1}_{f B} \VV_{B *} \\ + &= \mathbf{Q}_{* f} \left( \Lam^{-1} - \Lam^{-1} \Kfu \left( \Kuu + \Kuf \Lam^{-1} \Kfu \right)^{-1} \Kuf \Lam^{-1} \right)_{f B} \VV_{B *} + +This looks appealingly like we could just keep combining instances of the QR decomposition of :math:`B`, but that would leave out the :math:`\Kuu` part. 
+ +The open question is how to efficiently distill this into various correction terms for each group that don't require operations that scale with :math:`\cancel{B}`, since the paper promises :math:`O((|B| + M)^2)` for predictive covariances after precomputation. In principle, using sparse vectors / matrices for the :math:`*` target components, combined with Eigen's expression templates, should bring the complexity of some of these repeated solves for mostly empty vectors (for :math:`B`) down to :math:`O(|B|)`, and likewise for inducing points. + +At this point, our cross-terms are preceded by :math:`Q_{* f}`, which expands to :math:`K_{*u} \Kuu^{-1} \Kuf`. We can precompute everything except :math:`K_{*u}`, including the :math:`O(M^3)` inducing point decomposition, leaving prediction-time computations to scale with :math:`M`. + +To compute :math:`\mathbf{V}_{* B} \mathbf{S}^{-1}_{B B} \mathbf{V}_{B *}`, we form a (mostly zero) column of :math:`\mathbf{V}` for each feature, break the two terms of :math:`\mathbf{S}^{-1}` into symmetric parts, multiply by :math:`\mathbf{V}` and subtract, here in excruciating notational detail: + +.. 
math:: + \VV_{* B} \mathbf{S}^{-1} \VV_{B *} &= \VV_{* B} \left( \Lam^{-1} - \Lam^{-1} \Kfu \left( \Kuu + \Kuf \Lam^{-1} \Kfu \right)^{-1} \Kuf \Lam^{-1} \right)_{B B} \VV_{B *} \\ + &= \VV_{* B} \left( \left(P_\Lam L_\Lam^{-T} D_\Lam^{-\frac{1}{2}}\right) \underbrace{\left(D_\Lam^{-\frac{1}{2}} L_\Lam^{-1} P_\Lam^T\right)}_{Z_\Lam} - \mathbf{G}^T (B^T B)^{-1} \mathbf{G} \right) \VV_{B *} \\ + &= \VV_{* B} \left( \mathbf{Z}_\Lam^T \mathbf{Z}_\Lam - \mathbf{G}^T (P_u R_u^{-1} R_u^{-T} P_u^T) \mathbf{G} \right) \VV_{B *} \\ + &= \VV_{* B} \left( \mathbf{Z}_\Lam^T \mathbf{Z}_\Lam - \mathbf{G}^T (P_u R_u^{-1}) \underbrace{(R_u^{-T} P_u^T) \mathbf{G}}_{\mathbf{Z}_u} \right) \VV_{B *} \\ + &= \VV_{* B} \left( \mathbf{Z}_\Lam^T \mathbf{Z}_\Lam - \mathbf{Z}_u^T \mathbf{Z}_u \right) \VV_{B *} \\ + &= \VV_{* B} \mathbf{Z}_\Lam^T \underbrace{\mathbf{Z}_\Lam \VV_{B *}}_{\mathbf{\xi}_\Lam} - \VV_{* B} \mathbf{Z}_u^T \underbrace{\mathbf{Z}_u \VV_{B *}}_{\mathbf{\xi}_u} \\ + &= \mathbf{\xi}_\Lam^T \mathbf{\xi}_\Lam - \mathbf{\xi}_u^T \mathbf{\xi}_u \\ + +Note that the left-hand (subscript :math:`\Lam`) term is the decomposition of a block-diagonal matrix, so it will only contain cross-terms between features corresponding to the same local block. This calculation can be done blockwise. The right-hand (subscript :math:`u`) term projects the features through the local block of the training dataset and then through the inducing points, so the cross-terms are not in general sparse, and this calculation involves a lot more careful indexing. + +The same breakdown of :math:`\mathbf{S}^{-1}` can be used to compute :math:`\mathbf{W}` during the fit step. + +The notation :math:`\mathbf{W}_B` indicates that the relevant columns of :math:`\mathbf{W}` are used on the right-hand side. This is mathematically equivalent to making :math:`\VV_{B *}` have dimension :math:`N \times p` and be zero outside block :math:`B`. 
Computationally the right factor must be a sparse object to preserve the desired asymptotics. + +Summing up +^^^^^^^^^^ + +For the covariance, we must precompute: + + - :math:`P^TLDL^TP = \Lam` + - :math:`\mathbf{G} = \Lam^{-1} \Kfu` + - :math:`L_{uu} L_{uu}^T = \Kuu` + - :math:`QRP^T = B = \begin{bmatrix}\Lam^{-\frac{1}{2}}\Kfu \\ L_{uu} \end{bmatrix}` such that :math:`\mathbf{S}^{-1} = \Lam^{-1} - \mathbf{G} \left(B^T B\right)^{-1} \mathbf{G}^T`, and blocks :math:`\mathbf{S}^{-1}_{a b}` can be got by choosing the right rows / columns with which to do permutations and back-substitutions. + - :math:`\mathbf{W} = \Kuu^{-1} \mathbf{K}_{u f} \mathbf{S}^{-1}` + +For the mean, we compute the information vector :math:`v` as in PITC. + +then for each group :math:`B`: + + - :math:`\mathbf{Y}_B = \Kuu^{-1} \mathbf{K}_{u B}` + - :math:`v_b = \Lam_{B B}^{-1} \left( y_B - \mathbf{K}_{B u} v \right)` + +Given that we have already computed :math:`\Kfu`, we can use :math:`\mathbf{K}_{u B}` and :math:`\mathbf{K}_{u \cancel{B}}` efficiently in Eigen using sparse matrices with a single entry per nonzero row or column to be used. + +Then at prediction time, we must compute: + + - :math:`\mathbf{K}_{* B}`, :math:`O(|B|)` + - :math:`\mathbf{K}_{* u}`, :math:`O(M)` + - :math:`\mathbf{Q}_{* B} = \mathbf{K}_{* u} \mathbf{Y}_B`, :math:`O(M^2)` with the existing decomposition + - :math:`\VV_{* B} = \mathbf{K}_{* B} - \mathbf{Q}_{* B}`, :math:`O(|B|^2)` + - :math:`\mathbf{Q}_{**}^{PITC}` as with PITC + - :math:`\mathbf{U} = \mathbf{K}_{* u} \mathbf{W}_B \VV_{B *}`, :math:`O(M + |B|)`? + - :math:`\VV_{* B} \mathbf{S}^{-1}_{B B} \VV_{B *}`, :math:`O(|B|^2)` + +Further notes +^^^^^^^^^^^^^ + +If asked to predict something that is uncorrelated with all training data (conditional on inducing points), we simply fall back to the PITC approximation. 
+ +On some problems, calculation of the predictive covariance using Woodbury's lemma to avoid an :math:`O(N^3)` decomposition seems to lead to numerical problems. \ No newline at end of file diff --git a/include/albatross/SparseGP b/include/albatross/SparseGP index 925bb506..ddcd2ffc 100644 --- a/include/albatross/SparseGP +++ b/include/albatross/SparseGP @@ -25,5 +25,6 @@ #include #include #include +#include #endif diff --git a/include/albatross/src/cereal/gp.hpp b/include/albatross/src/cereal/gp.hpp index b3926397..9b421b83 100644 --- a/include/albatross/src/cereal/gp.hpp +++ b/include/albatross/src/cereal/gp.hpp @@ -17,7 +17,9 @@ using albatross::Fit; using albatross::GaussianProcessBase; using albatross::GPFit; using albatross::LinearCombination; + using albatross::SparseGPFit; +using albatross::PICGPFit; #ifndef GP_SERIALIZATION_VERSION #define GP_SERIALIZATION_VERSION 2 @@ -51,6 +53,28 @@ inline void serialize(Archive &archive, Fit> &fit, } } +template +inline void +serialize(Archive &archive, + Fit> &fit, + const std::uint32_t version) { + archive(cereal::make_nvp("train_features", fit.train_features)); + archive(cereal::make_nvp("inducing_features", fit.inducing_features)); + archive(cereal::make_nvp("train_covariance", fit.train_covariance)); + archive(cereal::make_nvp("sigma_R", fit.sigma_R)); + archive(cereal::make_nvp("P", fit.P)); + archive(cereal::make_nvp("mean_w", fit.mean_w)); + archive(cereal::make_nvp("W", fit.W)); + archive(cereal::make_nvp("covariance_Y", fit.covariance_Y)); + archive(cereal::make_nvp("Z", fit.Z)); + archive(cereal::make_nvp("A_ldlt", fit.A_ldlt)); + archive(cereal::make_nvp("measurement_groups", fit.measurement_groups)); + archive(cereal::make_nvp("information", fit.information)); + archive(cereal::make_nvp("numerical_rank", fit.numerical_rank)); + archive(cereal::make_nvp("cols_Bs", fit.cols_Bs)); +} + template inline void save(Archive &archive, diff --git a/include/albatross/src/models/pic_gp.hpp 
b/include/albatross/src/models/pic_gp.hpp new file mode 100644 index 00000000..cd088341 --- /dev/null +++ b/include/albatross/src/models/pic_gp.hpp @@ -0,0 +1,1054 @@ +/* + * Copyright (C) 2024 Swift Navigation Inc. + * Contact: Swift Navigation + * + * This source is subject to the license found in the file 'LICENSE' which must + * be distributed together with this source. All other rights reserved. + * + * THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, + * EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE. + */ + +#ifndef INCLUDE_ALBATROSS_MODELS_PIC_GP_H_ +#define INCLUDE_ALBATROSS_MODELS_PIC_GP_H_ + +namespace albatross { + +template +class BruteForcePIC + : public CovarianceFunction< + BruteForcePIC> { +public: + CovarianceType cov_; + std::vector inducing_points_; + GrouperFunction grouper_; + Eigen::LDLT K_uu_ldlt_; + + BruteForcePIC(const std::vector &inducing_points, + CovarianceType &&cov, GrouperFunction &&grouper) + : cov_{cov}, inducing_points_{inducing_points}, grouper_{grouper}, + K_uu_ldlt_{cov_(inducing_points_, inducing_points_).ldlt()} {} + + template + double _call_impl(const X &x, const Y &y) const { + if (grouper_(x) == grouper_(y)) { + return cov_(x, y); + } + + Eigen::VectorXd K_xu(inducing_points_.size()); + Eigen::VectorXd K_uy(inducing_points_.size()); + for (Eigen::Index i = 0; + i < static_cast(inducing_points_.size()); ++i) { + K_xu[i] = cov_(x, inducing_points_[i]); + K_uy[i] = cov_(inducing_points_[i], y); + } + // const Eigen::VectorXd K_uy = cov_(inducing_points_, y); + return K_xu.dot(K_uu_ldlt_.solve(K_uy)); + } +}; + +template +auto make_brute_force_pic_covariance( + const std::vector &inducing_points, + CovarianceType &&cov, GrouperFunction &&grouper) { + return BruteForcePIC( + inducing_points, std::forward(cov), + std::forward(grouper)); +} + +template +class PICGaussianProcessRegression; + +template 
+struct PICGPFit {}; + +template +using PICGroupIndexType = + typename std::result_of::type; + +template struct PICGroup { + // Row indices are into the original set of features into the data + // used to form S. + Eigen::Index initial_row; + Eigen::Index block_size; + Eigen::Index block_index; + albatross::RegressionDataset dataset; +}; + +template +using PICGroupMap = std::map, + PICGroup>; + +template +struct Fit> { + using PermutationIndices = Eigen::Matrix; + using GroupIndexType = PICGroupIndexType; + using GroupMap = PICGroupMap; + + std::vector train_features; + std::vector inducing_features; + Eigen::SerializableLDLT train_covariance; + Eigen::MatrixXd sigma_R; + Eigen::PermutationMatrixX P; + std::vector mean_w; + Eigen::MatrixXd W; + std::vector covariance_Y; + Eigen::MatrixXd Z; + BlockDiagonalLDLT A_ldlt; + GroupMap measurement_groups; + Eigen::VectorXd information; + Eigen::Index numerical_rank; + std::vector> cols_Bs; + GrouperFunction grouper; + + Fit(){}; + + Fit(const std::vector &features_, + const std::vector &inducing_features_, + const Eigen::SerializableLDLT &train_covariance_, + const Eigen::MatrixXd &sigma_R_, const Eigen::PermutationMatrixX &P_, + const std::vector &mean_w_, const Eigen::MatrixXd &W_, + const std::vector &covariance_Y_, + const Eigen::MatrixXd &Z_, const BlockDiagonalLDLT &A_ldlt_, + const GroupMap &measurement_groups_, const Eigen::VectorXd &information_, + Eigen::Index numerical_rank_, + const std::vector> cols_Bs_, + GrouperFunction grouper_) + : train_features(features_), inducing_features(inducing_features_), + train_covariance(train_covariance_), sigma_R(sigma_R_), P(P_), + mean_w(mean_w_), W(W_), covariance_Y(covariance_Y_), Z(Z_), + A_ldlt(A_ldlt_), measurement_groups(measurement_groups_), + information(information_), numerical_rank(numerical_rank_), + cols_Bs(cols_Bs_), grouper(grouper_) {} + + void shift_mean(const Eigen::VectorXd &mean_shift) { + ALBATROSS_ASSERT(mean_shift.size() == information.size()); + 
information += train_covariance.solve(mean_shift);
+  }
+
+  bool operator==(
+      const Fit>
+          &other) const {
+    return (train_features == other.train_features &&
+            inducing_features == other.inducing_features &&
+            train_covariance == other.train_covariance &&
+            sigma_R == other.sigma_R && P.indices() == other.P.indices() &&
+            mean_w == other.mean_w && W == other.W &&
+            covariance_Y == other.covariance_Y && Z == other.Z &&
+            A_ldlt == other.A_ldlt &&
+            measurement_groups == other.measurement_groups &&
+            information == other.information &&
+            numerical_rank == other.numerical_rank && cols_Bs == other.cols_Bs);
+  }
+};
+
+/*
+ * This class implements an approximation technique for Gaussian processes
+ * which relies on an assumption that all observations are independent (or
+ * groups of observations are independent) conditional on a set of inducing
+ * points. The method is based off:
+ *
+ *     [1] Sparse Gaussian Processes using Pseudo-inputs
+ *     Edward Snelson, Zoubin Ghahramani
+ *     http://www.gatsby.ucl.ac.uk/~snelson/SPGP_up.pdf
+ *
+ * Though the code uses notation closer to that used in this (excellent)
+ * overview of these methods:
+ *
+ *     [2] A Unifying View of Sparse Approximate Gaussian Process Regression
+ *     Joaquin Quinonero-Candela, Carl Edward Rasmussen
+ *     http://www.jmlr.org/papers/volume6/quinonero-candela05a/quinonero-candela05a.pdf
+ *
+ * Very broadly speaking this method starts with a prior over the observations,
+ *
+ *     [f] ~ N(0, K_ff)
+ *
+ * where K_ff(i, j) = covariance_function(features[i], features[j]) and f
+ * represents the function value.
+ *
+ * It then uses a set of inducing points, u, and makes some assumptions about
+ * the conditional distribution:
+ *
+ *     [f|u] ~ N(K_fu K_uu^-1 u, K_ff - Q_ff)
+ *
+ * Where Q_ff = K_fu K_uu^-1 K_uf represents the variance in f that is
+ * explained by u.
+ *
+ * For FITC (Fully Independent Training Conditional) the assumption is that
+ * K_ff - Qff is diagonal, for PITC (Partially Independent Training Conditional)
+ * that it is block diagonal. These assumptions lead to an efficient way of
+ * inferring the posterior distribution for some new location f*,
+ *
+ *     [f*|f=y] ~ N(K_*u S K_uf A^-1 y, K_** - Q_** + K_*u S K_u*)
+ *
+ * Where S = (K_uu + K_uf A^-1 K_fu)^-1 and A = diag(K_ff - Q_ff) and "diag"
+ * may mean diagonal or block diagonal. Regardless we end up with O(m^2n)
+ * complexity instead of O(n^3) of direct Gaussian processes. (Note that in [2]
+ * S is called sigma and A is lambda.)
+ *
+ * Of course, the implementation details end up somewhat more complex in order
+ * to improve numerical stability. Here we use an approach based off the QR
+ * decomposition which is described in
+ *
+ *     Stable and Efficient Gaussian Process Calculations
+ *     http://www.jmlr.org/papers/volume10/foster09a/foster09a.pdf
+ *
+ * A more detailed (but more likely to be out of date) description of
+ * the details can be found on the albatross documentation. A short
+ * description follows. It starts by setting up the Sparse Gaussian process
+ * covariances
+ *
+ *     [f|u] ~ N(K_fu K_uu^-1 u, K_ff - Q_ff)
+ *
+ * We then set,
+ *
+ *     A = K_ff - Q_ff
+ *       = K_ff - K_fu K_uu^-1 K_uf
+ *
+ * which can be thought of as the covariance in the training data which
+ * is not explained by the inducing points. The fundamental
+ * assumption in these sparse Gaussian processes is that A is sparse, in
+ * this case block diagonal.
+ * + * We then build a matrix B and use its QR decomposition (with pivoting P) + * + * B = |A^-1/2 K_fu| = |Q_1| R P^T + * |K_uu^{T/2} | |Q_2| + * + * After which we can get the information vector (see _fit_impl) + * + * v = (K_uu + K_uf A^-1 K_fu)^-1 K_uf A^-1 y + * = (B^T B) B^T A^-1/2 y + * = P R^-1 Q_1^T A^-1/2 y + * + * and can make predictions for new locations (see _predict_impl), + * + * [f*|f=y] ~ N(K_*u S K_uf A^-1 y, K_** - Q_** + K_*u S K_u*) + * ~ N(m, C) + * + * where we have + * + * m = K_*u S K_uf A^-1 y + * = K_*u v + * + * and + * + * C = K_** - Q_** + K_*u S K_u* + * + * using + * + * Q_** = K_*u K_uu^-1 K_u* + * = (K_uu^{-1/2} K_u*)^T (K_uu^{-1/2} K_u*) + * = Q_sqrt^T Q_sqrt + * and + * + * K_*u S K_u* = K_*u (K_uu + K_uf A^-1 K_fu)^-1 K_u* + * = K_*u (B^T B)^-1 K_u* + * = K_*u (P R R^T P^T)^-1 K_u* + * = (P R^-T K_u*)^T (P R^-T K_u*) + * = S_sqrt^T S_sqrt + */ +template +class PICGaussianProcessRegression + : public GaussianProcessBase> { +public: + using Base = GaussianProcessBase< + CovFunc, MeanFunc, + PICGaussianProcessRegression>; + + template + using GroupIndexType = PICGroupIndexType; + + template + using GroupMap = PICGroupMap; + + PICGaussianProcessRegression() : Base() { initialize_params(); }; + + PICGaussianProcessRegression(const CovFunc &covariance_function, + const MeanFunc &mean_function) + : Base(covariance_function, mean_function) { + initialize_params(); + }; + PICGaussianProcessRegression(CovFunc &&covariance_function, + MeanFunc &&mean_function) + : Base(std::move(covariance_function), std::move(mean_function)) { + initialize_params(); + }; + + PICGaussianProcessRegression( + const CovFunc &covariance_function, const MeanFunc &mean_function, + const GrouperFunction &independent_group_function, + const InducingPointStrategy &inducing_point_strategy, + const std::string &model_name) + : Base(covariance_function, mean_function, model_name), + inducing_point_strategy_(inducing_point_strategy), + 
independent_group_function_(independent_group_function) { + initialize_params(); + }; + PICGaussianProcessRegression(CovFunc &&covariance_function, + MeanFunc &&mean_function, + GrouperFunction &&independent_group_function, + InducingPointStrategy &&inducing_point_strategy, + const std::string &model_name) + : Base(std::move(covariance_function), std::move(mean_function), + model_name), + inducing_point_strategy_(std::move(inducing_point_strategy)), + independent_group_function_(std::move(independent_group_function)) { + initialize_params(); + }; + + void initialize_params() { + measurement_nugget_ = { + details::DEFAULT_NUGGET, + LogScaleUniformPrior(PARAMETER_EPSILON, PARAMETER_MAX)}; + inducing_nugget_ = {details::DEFAULT_NUGGET, + LogScaleUniformPrior(PARAMETER_EPSILON, PARAMETER_MAX)}; + } + + ParameterStore get_params() const override { + auto params = map_join(this->mean_function_.get_params(), + this->covariance_function_.get_params()); + params[details::measurement_nugget_name()] = measurement_nugget_; + params[details::inducing_nugget_name()] = inducing_nugget_; + return params; + } + + void set_param(const std::string &name, const Parameter ¶m) override { + if (name == details::measurement_nugget_name()) { + measurement_nugget_ = param; + return; + } else if (name == details::inducing_nugget_name()) { + inducing_nugget_ = param; + return; + } + const bool success = set_param_if_exists_in_any( + name, param, &this->covariance_function_, &this->mean_function_); + ALBATROSS_ASSERT(success); + } + + template ::type, + GrouperFunction>::value>::type> + void update_grouper_function(NewGrouper &&f) { + independent_group_function_ = std::forward(f); + } + + template ::type, + InducingPointStrategy>::value>::type> + void update_inducing_point_strategy(NewStrategy &&f) { + inducing_point_strategy_ = std::forward(f); + } + + template + auto + _update_impl(const Fit> &old_fit, + const std::vector &features, + const MarginalDistribution &targets) const { + 
assert(false && "Cannot call update() on a PIC GP yet!"); + + BlockDiagonalLDLT A_ldlt; + Eigen::SerializableLDLT K_uu_ldlt; + Eigen::MatrixXd K_fu; + Eigen::VectorXd y; + compute_internal_components(old_fit.train_features, features, targets, + &A_ldlt, &K_uu_ldlt, &K_fu, &y); + + const Eigen::Index n_old = old_fit.sigma_R.rows(); + const Eigen::Index n_new = A_ldlt.rows(); + const Eigen::Index k = old_fit.sigma_R.cols(); + Eigen::MatrixXd B = Eigen::MatrixXd::Zero(n_old + n_new, k); + + ALBATROSS_ASSERT(n_old == k); + + // Form: + // B = |R_old P_old^T| = |Q_1| R P^T + // |A^{-1/2} K_fu| |Q_2| + for (Eigen::Index i = 0; i < old_fit.permutation_indices.size(); ++i) { + const Eigen::Index &pi = old_fit.permutation_indices.coeff(i); + B.col(pi).topRows(i + 1) = old_fit.sigma_R.col(i).topRows(i + 1); + } + B.bottomRows(n_new) = A_ldlt.sqrt_solve(K_fu); + const auto B_qr = QRImplementation::compute(B, Base::threads_.get()); + + // Form: + // y_aug = |R_old P_old^T v_old| + // |A^{-1/2} y | + ALBATROSS_ASSERT(old_fit.information.size() == n_old); + Eigen::VectorXd y_augmented(n_old + n_new); + for (Eigen::Index i = 0; i < old_fit.permutation_indices.size(); ++i) { + y_augmented[i] = + old_fit.information[old_fit.permutation_indices.coeff(i)]; + } + y_augmented.topRows(n_old) = + old_fit.sigma_R.template triangularView() * + y_augmented.topRows(n_old); + + y_augmented.bottomRows(n_new) = A_ldlt.sqrt_solve(y, Base::threads_.get()); + const Eigen::VectorXd v = B_qr->solve(y_augmented); + + Eigen::MatrixXd R = get_R(*B_qr); + if (B_qr->rank() < B_qr->cols()) { + // Inflate the diagonal of R in an attempt to avoid singularity + R.diagonal() += + Eigen::VectorXd::Constant(B_qr->cols(), details::cSparseRNugget); + } + using FitType = + Fit>; + return FitType(old_fit.train_features, old_fit.train_covariance, R, + get_P(*B_qr), v, B_qr->rank()); + } + + // Here we create the QR decomposition of: + // + // B = |A^-1/2 K_fu| = |Q_1| R P^T + // |K_uu^{T/2} | |Q_2| + // + // which 
corresponds to the inverse square root of Sigma + // + // Sigma = (B^T B)^-1 + std::unique_ptr + compute_sigma_qr(const Eigen::SerializableLDLT &K_uu_ldlt, + const BlockDiagonalLDLT &A_ldlt, + const Eigen::MatrixXd &K_fu) const { + Eigen::MatrixXd B(A_ldlt.rows() + K_uu_ldlt.rows(), K_uu_ldlt.rows()); + B.topRows(A_ldlt.rows()) = A_ldlt.sqrt_solve(K_fu); + B.bottomRows(K_uu_ldlt.rows()) = K_uu_ldlt.sqrt_transpose(); + return QRImplementation::compute(B, Base::threads_.get()); + }; + + template < + typename FeatureType, + std::enable_if_t< + has_call_operator::value, int> = 0> + auto _fit_impl(const std::vector &features, + const MarginalDistribution &targets) const { + // Determine the set of inducing points, u. + const auto u = + inducing_point_strategy_(this->covariance_function_, features); + ALBATROSS_ASSERT(u.size() > 0 && "Empty inducing points!"); + + BlockDiagonalLDLT A_ldlt; + Eigen::SerializableLDLT K_uu_ldlt; + Eigen::MatrixXd K_fu; + Eigen::VectorXd y; + GroupMap measurement_groups; + compute_internal_components(u, features, targets, &A_ldlt, &K_uu_ldlt, + &K_fu, &y, &measurement_groups); + auto B_qr = compute_sigma_qr(K_uu_ldlt, A_ldlt, K_fu); + + // To make a prediction, we will need to compute cross-terms with + // sub-blocks of S^-1, where + // + // S^-1 = A^-1 - A^-1 K_fu (K_uu + K_uf A^-1 K_fu)^-1 K_uf A^-1 + // \____________________________________________/ + // Z^T Z + // + // The inverse part of the right-hand term of this is B_qr above, + // and we already have A_ldlt, so we can form Z. + // + // Now to compute sub-blocks of S^-1, we just index into Z^T Z + // appropriately, and if the term is diagonal, also solve using + // the blocks of A^-1. 
+ const Eigen::MatrixXd Z = sqrt_solve(*B_qr, A_ldlt.solve(K_fu).transpose()); + + Eigen::VectorXd y_augmented = Eigen::VectorXd::Zero(B_qr->rows()); + y_augmented.topRows(y.size()) = A_ldlt.sqrt_solve(y, Base::threads_.get()); + const Eigen::VectorXd v = B_qr->solve(y_augmented); + + using InducingPointFeatureType = typename std::decay::type; + + const Eigen::MatrixXd W = K_uu_ldlt.solve( + (A_ldlt.solve(K_fu) - Z.transpose() * Z * K_fu).transpose()); + + // TODO(@peddie): a lot of this can be batched + std::vector covariance_Y(measurement_groups.size()); + std::vector mean_w(measurement_groups.size()); + std::vector> cols_Bs(measurement_groups.size()); + const auto block_start_indices = A_ldlt.block_to_row_map(); + + const auto precompute_block = [&, this](std::size_t block_number, + Eigen::Index start_row) -> void { + const Eigen::Index block_size = A_ldlt.blocks[block_number].rows(); + // K_fu is already computed, so we can form K_uB and K_uA by + // appropriate use of sparse indexing matrices and avoid an O(N + // M) operation. + // + // This nonsense would be a lot more straightforward with Eigen + // 3.4's slicing and indexing API. 
+ Eigen::SparseMatrix cols_B(features.size(), block_size); + cols_B.reserve(block_size); + + for (Eigen::Index i = 0; i < block_size; ++i) { + cols_B.insert(start_row + i, i) = 1.; + } + cols_B.makeCompressed(); + + cols_Bs[block_number] = cols_B; + // v_b \in R^b = A_BB^-1 (y_b - K_Bu v) + Eigen::MatrixXd ydiff_b = cols_B.transpose() * (y - K_fu * v); + const Eigen::MatrixXd mean_w_full = + A_ldlt.blocks[block_number].solve(ydiff_b); + mean_w[block_number] = mean_w_full; + // Y \in R^(u x b) = K_uu^-1 K_uB + covariance_Y[block_number] = K_uu_ldlt.solve(K_fu.transpose() * cols_B); + }; + + apply_map(block_start_indices, precompute_block, Base::threads_.get()); + + using FitType = + Fit>; + return FitType(features, u, K_uu_ldlt, get_R(*B_qr), get_P(*B_qr), mean_w, + W, covariance_Y, Z, A_ldlt, measurement_groups, v, + B_qr->rank(), cols_Bs, independent_group_function_); + } + + template + auto fit_from_prediction(const std::vector &new_inducing_points, + const JointDistribution &prediction_) const { + FitModel>> + output(*this, + Fit>()); + Fit> &new_fit = + output.get_fit(); + + new_fit.train_features = new_inducing_points; + + const Eigen::MatrixXd K_zz = + this->covariance_function_(new_inducing_points, Base::threads_.get()); + new_fit.train_covariance = Eigen::SerializableLDLT(K_zz); + + // We're going to need to take the sqrt of the new covariance which + // could be extremely small, so here we add a small nugget to avoid + // numerical instability + JointDistribution prediction(prediction_); + prediction.covariance.diagonal() += Eigen::VectorXd::Constant( + cast::to_index(prediction.size()), 1, details::DEFAULT_NUGGET); + new_fit.information = new_fit.train_covariance.solve(prediction.mean); + + // Here P is the posterior covariance at the new inducing points. 
If + // we consider the case where we rebase and then use the resulting fit + // to predict the new inducing points then we see that the predictive + // covariance (see documentation above) would be,: + // + // C = K_zz - Q_zz + K_zz Sigma K_zz + // + // We can use this, knowing that at the inducing points K_zz = Q_zz, to + // derive our updated Sigma, + // + // C = K_zz - K_zz + K_zz Sigma K_zz + // C = K_zz Sigma K_zz + // Sigma = K_zz^-1 C K_zz^-1 + // + // And since we need to store Sigma in sqrt form we get, + // + // Sigma = (B_z^T B_z)^-1 + // = K_zz^-1 C K_zz^-1 + // + // So by setting: + // + // B_z = C^{-1/2} K_z + // + // We can then compute and store the QR decomposition of B + // as we do in a normal fit. + const Eigen::SerializableLDLT C_ldlt(prediction.covariance); + const Eigen::MatrixXd sigma_inv_sqrt = C_ldlt.sqrt_solve(K_zz); + const auto B_qr = QRImplementation::compute(sigma_inv_sqrt, nullptr); + + new_fit.P = get_P(*B_qr); + new_fit.sigma_R = get_R(*B_qr); + new_fit.numerical_rank = B_qr->rank(); + + return output; + } + + // This is included to allow the SparseGP + // to be compatible with fits generated + // using a standard GP. 
+ using Base::_predict_impl; + + template + Eigen::VectorXd _predict_impl( + const std::vector &features, + const Fit> + &sparse_gp_fit, + PredictTypeIdentity &&) const { + const auto find_group = [this, &sparse_gp_fit](const auto &feature) { + return sparse_gp_fit.measurement_groups.find( + independent_group_function_(without_measurement(feature))); + }; + const Eigen::MatrixXd K_up = + this->covariance_function_(sparse_gp_fit.inducing_features, features); + Eigen::VectorXd mean_correction = Eigen::VectorXd::Zero(features.size()); + for (Eigen::Index j = 0; j < features.size(); ++j) { + const auto group = find_group(features[j]); + if (group == sparse_gp_fit.measurement_groups.end()) { + continue; + } + const std::vector fvec = {features[j]}; + + const Eigen::VectorXd features_cov = + this->covariance_function_(group->second.dataset.features, fvec); + const Eigen::VectorXd kpuy = + K_up.transpose().row(j) * + sparse_gp_fit.covariance_Y[group->second.block_index]; + const Eigen::VectorXd Vbp = features_cov - kpuy; + mean_correction[j] = + Vbp.dot(sparse_gp_fit.mean_w[group->second.block_index]); + } + + Eigen::VectorXd mean = + K_up.transpose() * sparse_gp_fit.information + mean_correction; + + this->mean_function_.add_to(features, &mean); + return mean; + } + + template + MarginalDistribution _predict_impl( + const std::vector &features, + const Fit> + &sparse_gp_fit, + PredictTypeIdentity &&) const { + const auto K_up = + this->covariance_function_(sparse_gp_fit.inducing_features, features); + Eigen::VectorXd mean = gp_mean_prediction(K_up, sparse_gp_fit.information); + this->mean_function_.add_to(features, &mean); + + // First pass: compute an O(1) mapping from features to groups + std::vector feature_to_block(features.size()); + std::vector groups( + features.size()); + Eigen::VectorXi col_alloc{Eigen::VectorXi::Zero(features.size())}; + bool all_same_group = true; + bool all_in_groups = true; + for (Eigen::Index j = 0; j < features.size(); ++j) { + groups[j] = 
sparse_gp_fit.measurement_groups.find( + independent_group_function_(without_measurement(features[j]))); + if (groups[j] != sparse_gp_fit.measurement_groups.end()) { + col_alloc(j) = groups[j]->second.block_size; + all_same_group = all_same_group && groups[j] == groups[0]; + } else { + all_in_groups = false; + } + feature_to_block[j] = + std::distance(sparse_gp_fit.measurement_groups.begin(), groups[j]); + } + + // Second pass: compute mean vector and fill sparse Vp matrix + Eigen::VectorXd mean_correction = Eigen::VectorXd::Zero(features.size()); + Eigen::SparseMatrix Vp(sparse_gp_fit.train_features.size(), + features.size()); + Vp.reserve(col_alloc); + for (Eigen::Index j = 0; j < features.size(); ++j) { + if (groups[j] == sparse_gp_fit.measurement_groups.end()) { + continue; + } + + const auto &B = groups[j]->second; + Eigen::VectorXd Vbp = + this->covariance_function_(B.dataset.features, + std::vector{features[j]}) - + K_up.col(j).transpose() * sparse_gp_fit.covariance_Y[B.block_index]; + for (Eigen::Index i = 0; i < Vbp.size(); ++i) { + Vp.insert(B.initial_row + i, j) = Vbp(i); + } + mean_correction[j] = Vbp.dot(sparse_gp_fit.mean_w[B.block_index]); + } + Vp.makeCompressed(); + + Eigen::MatrixXd xi_lambda = sparse_gp_fit.A_ldlt.sqrt_solve(Vp); + Eigen::MatrixXd xi_u = sparse_gp_fit.Z * Vp; + Eigen::VectorXd VSV_diag{ + (xi_lambda.transpose() * xi_lambda - xi_u.transpose() * xi_u) + .diagonal()}; + + const Eigen::VectorXd U_diag = (K_up.transpose() * sparse_gp_fit.W * Vp).diagonal(); + + Eigen::VectorXd marginal_variance(cast::to_index(features.size())); + for (Eigen::Index i = 0; i < marginal_variance.size(); ++i) { + marginal_variance[i] = this->covariance_function_( + features[cast::to_size(i)], features[cast::to_size(i)]); + } + + const Eigen::MatrixXd Q_sqrt = + sparse_gp_fit.train_covariance.sqrt_solve(K_up); + const Eigen::VectorXd Q_diag = + Q_sqrt.cwiseProduct(Q_sqrt).array().colwise().sum(); + marginal_variance -= Q_diag; + + const Eigen::MatrixXd 
S_sqrt = + sqrt_solve(sparse_gp_fit.sigma_R, sparse_gp_fit.P, K_up); + const Eigen::VectorXd S_diag = + S_sqrt.cwiseProduct(S_sqrt).array().colwise().sum(); + marginal_variance += S_diag; + + mean += mean_correction; + + return MarginalDistribution(mean, + marginal_variance - (2 * U_diag + VSV_diag)); + } + + template + JointDistribution _predict_impl( + const std::vector &features, + const Fit> + &sparse_gp_fit, + PredictTypeIdentity &&) const { + // K_up + const Eigen::MatrixXd K_up = + this->covariance_function_(sparse_gp_fit.inducing_features, features); + const Eigen::MatrixXd prior_cov = this->covariance_function_(features); + + // First pass: compute an O(1) mapping from features to groups + std::vector feature_to_block(features.size()); + std::vector groups( + features.size()); + Eigen::VectorXi col_alloc{Eigen::VectorXi::Zero(features.size())}; + bool all_same_group = true; + bool all_in_groups = true; + for (Eigen::Index j = 0; j < features.size(); ++j) { + groups[j] = sparse_gp_fit.measurement_groups.find( + independent_group_function_(without_measurement(features[j]))); + if (groups[j] != sparse_gp_fit.measurement_groups.end()) { + col_alloc(j) = groups[j]->second.block_size; + all_same_group = all_same_group && groups[j] == groups[0]; + } else { + all_in_groups = false; + } + feature_to_block[j] = + std::distance(sparse_gp_fit.measurement_groups.begin(), groups[j]); + } + + // Second pass: compute mean vector and fill sparse Vp matrix + Eigen::VectorXd mean_correction = Eigen::VectorXd::Zero(features.size()); + Eigen::SparseMatrix Vp(sparse_gp_fit.train_features.size(), + features.size()); + Vp.reserve(col_alloc); + for (Eigen::Index j = 0; j < features.size(); ++j) { + if (groups[j] == sparse_gp_fit.measurement_groups.end()) { + // Fall back to PITC + continue; + } + + const auto &B = groups[j]->second; + // TODO(@peddie): there must be some way to assign this + // expression directly to a map of the column memory + Eigen::VectorXd Vbp = + 
this->covariance_function_(B.dataset.features, + std::vector{features[j]}) - + K_up.col(j).transpose() * sparse_gp_fit.covariance_Y[B.block_index]; + for (Eigen::Index i = 0; i < Vbp.size(); ++i) { + Vp.insert(B.initial_row + i, j) = Vbp(i); + } + mean_correction[j] = Vbp.dot(sparse_gp_fit.mean_w[B.block_index]); + } + Vp.makeCompressed(); + + + Eigen::MatrixXd xi_lambda = sparse_gp_fit.A_ldlt.sqrt_solve(Vp); + Eigen::MatrixXd xi_u = sparse_gp_fit.Z * Vp; + Eigen::MatrixXd VSV{xi_lambda.transpose() * xi_lambda - + xi_u.transpose() * xi_u}; + + const Eigen::MatrixXd U = K_up.transpose() * sparse_gp_fit.W * Vp; + + const Eigen::MatrixXd S_sqrt = + sqrt_solve(sparse_gp_fit.sigma_R, sparse_gp_fit.P, K_up); + + const Eigen::MatrixXd Q_sqrt = + sparse_gp_fit.train_covariance.sqrt_solve(K_up); + + const Eigen::MatrixXd max_explained = Q_sqrt.transpose() * Q_sqrt; + const Eigen::MatrixXd unexplained = S_sqrt.transpose() * S_sqrt; + + const Eigen::MatrixXd pitc_covariance = + prior_cov - max_explained + unexplained; + + const Eigen::MatrixXd pic_correction = U + U.transpose() + VSV; + JointDistribution pred(K_up.transpose() * sparse_gp_fit.information + + mean_correction, + pitc_covariance - pic_correction); + this->mean_function_.add_to(features, &pred.mean); + + return pred; + } + + template + double log_likelihood(const RegressionDataset &dataset) const { + const auto u = + inducing_point_strategy_(this->covariance_function_, dataset.features); + + BlockDiagonalLDLT A_ldlt; + Eigen::SerializableLDLT K_uu_ldlt; + Eigen::MatrixXd K_fu; + Eigen::VectorXd y; + compute_internal_components(u, dataset.features, dataset.targets, &A_ldlt, + &K_uu_ldlt, &K_fu, &y); + const auto B_qr = compute_sigma_qr(K_uu_ldlt, A_ldlt, K_fu); + // The log likelihood for y ~ N(0, K) is: + // + // L = 1/2 (n log(2 pi) + log(|K|) + y^T K^-1 y) + // + // where in our case we have + // K = A + Q_ff + // and + // Q_ff = K_fu K_uu^-1 K_uf + // + // First we get the determinant, |K|: + // + // |K| = |A + 
Q_ff| + // = |K_uu + K_uf A^-1 K_fu| |K_uu^-1| |A| + // = |P R^T Q^T Q R P^T| |K_uu^-1| |A| + // = |R^T R| |K_uu^-1| |A| + // = |R|^2 |A| / |K_uu| + // + // Where the first equality comes from the matrix determinant lemma. + // https://en.wikipedia.org/wiki/Matrix_determinant_lemma#Generalization + // + // After which we can take the log: + // + // log(|K|) = 2 log(|R|) + log(|A|) - log(|K_uu|) + // + const double log_det_a = A_ldlt.log_determinant(); + + const double log_det_r = + B_qr->matrixR().diagonal().array().cwiseAbs().log().sum(); + const double log_det_K_uu = K_uu_ldlt.log_determinant(); + const double log_det = log_det_a + 2 * log_det_r - log_det_K_uu; + + // q = y^T K^-1 y + // = y^T (A + Q_ff)^-1 y + // = y^T (A^-1 - A^-1 K_fu (K_uu + K_uf A^-1 K_fu)^-1 K_uf A^-1) y + // = y^T A^-1 y - y^T A^-1 K_fu (K_uu + K_uf A^-1 K_fu)^-1 K_uf A^-1) y + // = y^T A^-1 y - y^T A^-1 K_fu (R^T R)^-1 K_uf A^-1) y + // = y^T A^-1 y - (R^-T K_uf A^-1 y)^T (R^-T K_uf A^-1 y) + // = y^T y_a - y_b^T y_b + // + // with y_b = R^-T K_uf y_a + const Eigen::VectorXd y_a = A_ldlt.solve(y); + + Eigen::VectorXd y_b = K_fu.transpose() * y_a; + y_b = sqrt_solve(*B_qr, y_b); + + double log_quadratic = y.transpose() * y_a; + log_quadratic -= y_b.transpose() * y_b; + + const double rank = cast::to_double(y.size()); + const double log_dimension = rank * log(2 * M_PI); + + return -0.5 * (log_det + log_quadratic + log_dimension) + + this->prior_log_likelihood(); + } + + InducingPointStrategy get_inducing_point_strategy() const { + return inducing_point_strategy_; + } + +private: + // This method takes care of a lot of the common book keeping required to + // setup the Sparse Gaussian Process problem. Namely, we want to get from + // possibly unordered features to a structured representation + // in the form K_ff = A_ff + Q_ff where Q_ff = K_fu K_uu^-1 K_uf and + // A_ff is block diagonal and is formed by subtracting Q_ff from K_ff. 
+ // + template + void compute_internal_components( + const std::vector &inducing_features, + const std::vector &out_of_order_features, + const MarginalDistribution &out_of_order_targets, + BlockDiagonalLDLT *A_ldlt, Eigen::SerializableLDLT *K_uu_ldlt, + Eigen::MatrixXd *K_fu, Eigen::VectorXd *y, + GroupMap *measurement_groups) const { + + ALBATROSS_ASSERT(A_ldlt != nullptr); + ALBATROSS_ASSERT(K_uu_ldlt != nullptr); + ALBATROSS_ASSERT(K_fu != nullptr); + ALBATROSS_ASSERT(y != nullptr); + + const auto indexer = + group_by(out_of_order_features, independent_group_function_).indexers(); + + const auto out_of_order_measurement_features = + as_measurements(out_of_order_features); + + std::vector reordered_inds; + BlockDiagonal K_ff; + Eigen::Index begin_row_index = 0; + Eigen::Index block_index = 0; + // TODO(@peddie): compute these blocks asynchronously? + for (const auto &pair : indexer) { + reordered_inds.insert(reordered_inds.end(), pair.second.begin(), + pair.second.end()); + measurement_groups->operator[](pair.first) = PICGroup{ + begin_row_index, static_cast(pair.second.size()), + block_index, + albatross::RegressionDataset( + subset(out_of_order_features, pair.second), + out_of_order_targets.subset(pair.second))}; + auto subset_features = + subset(out_of_order_measurement_features, pair.second); + K_ff.blocks.emplace_back(this->covariance_function_(subset_features)); + K_ff.blocks.back().diagonal() += + subset(out_of_order_targets.covariance.diagonal(), pair.second); + begin_row_index += pair.second.size(); + ++block_index; + } + + const auto features = + subset(out_of_order_measurement_features, reordered_inds); + auto targets = subset(out_of_order_targets, reordered_inds); + *y = targets.mean; + + this->mean_function_.remove_from( + subset(out_of_order_features, reordered_inds), &targets.mean); + + *K_fu = this->covariance_function_(features, inducing_features, + Base::threads_.get()); + + auto K_uu = + this->covariance_function_(inducing_features, 
Base::threads_.get()); + K_uu.diagonal() += + inducing_nugget_.value * Eigen::VectorXd::Ones(K_uu.rows()); + + *K_uu_ldlt = K_uu.ldlt(); + // P is such that: + // Q_ff = K_fu K_uu^-1 K_uf + // = K_fu K_uu^-T/2 K_uu^-1/2 K_uf + // = P^T P + const Eigen::MatrixXd P = K_uu_ldlt->sqrt_solve(K_fu->transpose()); + + // We only need the diagonal blocks of Q_ff to get A + BlockDiagonal Q_ff_diag; + Eigen::Index i = 0; + for (const auto &pair : indexer) { + Eigen::Index cols = cast::to_index(pair.second.size()); + auto P_cols = P.block(0, i, P.rows(), cols); + Q_ff_diag.blocks.emplace_back(P_cols.transpose() * P_cols); + i += cols; + } + auto A = K_ff - Q_ff_diag; + + // It's possible that the inducing points will perfectly describe + // some of the data, in which case we need to add a bit of extra + // noise to make sure lambda is invertible. + for (auto &b : A.blocks) { + b.diagonal() += + measurement_nugget_.value * Eigen::VectorXd::Ones(b.rows()); + } + + *A_ldlt = A.ldlt(); + } + + Parameter measurement_nugget_; + Parameter inducing_nugget_; + InducingPointStrategy inducing_point_strategy_; + GrouperFunction independent_group_function_; +}; + +// rebase_inducing_points takes a Sparse GP which was fit using some set of +// inducing points and creates a new fit relative to new inducing points. +// Note that this will NOT be the equivalent to having fit the model with +// the new inducing points since some information may have been lost in +// the process. 
+template +auto rebase_inducing_points( + const FitModel>> + &fit_model, + const std::vector &new_inducing_points) { + return fit_model.get_model().fit_from_prediction( + new_inducing_points, fit_model.predict(new_inducing_points).joint()); +} + +template +using SparseQRPicGaussianProcessRegression = + PICGaussianProcessRegression; + +template +auto pic_gp_from_covariance_and_mean( + CovFunc &&covariance_function, MeanFunc &&mean_function, + GrouperFunction &&grouper_function, InducingPointStrategy &&strategy, + const std::string &model_name, + QRImplementation qr __attribute__((unused)) = DenseQRImplementation{}) { + return PICGaussianProcessRegression< + typename std::decay::type, typename std::decay::type, + typename std::decay::type, + typename std::decay::type, + typename std::decay::type>( + std::forward(covariance_function), + std::forward(mean_function), + std::forward(grouper_function), + std::forward(strategy), model_name); +}; + +template +auto pic_gp_from_covariance_and_mean( + CovFunc &&covariance_function, MeanFunc &&mean_function, + GrouperFunction &&grouper_function, const std::string &model_name, + QRImplementation qr = DenseQRImplementation{}) { + return pic_gp_from_covariance_and_mean( + std::forward(covariance_function), + std::forward(mean_function), + std::forward(grouper_function), + StateSpaceInducingPointStrategy(), model_name, qr); +}; + +template +auto pic_gp_from_covariance(CovFunc &&covariance_function, + GrouperFunction &&grouper_function, + InducingPointStrategy &&strategy, + const std::string &model_name, + QRImplementation qr = DenseQRImplementation{}) { + return pic_gp_from_covariance_and_mean( + std::forward(covariance_function), ZeroMean(), + std::forward(grouper_function), + std::forward(strategy), model_name, qr); +}; + +template +auto pic_gp_from_covariance(CovFunc covariance_function, + GrouperFunction grouper_function, + const std::string &model_name, + QRImplementation qr = DenseQRImplementation{}) { + return 
pic_gp_from_covariance_and_mean< + CovFunc, decltype(ZeroMean()), GrouperFunction, + decltype(StateSpaceInducingPointStrategy()), QRImplementation>( + std::forward(covariance_function), ZeroMean(), + std::forward(grouper_function), + StateSpaceInducingPointStrategy(), model_name, qr); +}; + +} // namespace albatross + +#endif /* INCLUDE_ALBATROSS_MODELS_PIC_GP_H_ */ diff --git a/tests/test_pic_gp.cc b/tests/test_pic_gp.cc new file mode 100644 index 00000000..96b8c1f9 --- /dev/null +++ b/tests/test_pic_gp.cc @@ -0,0 +1,404 @@ +/* + * Copyright (C) 2024 Swift Navigation Inc. + * Contact: Swift Navigation + * + * This source is subject to the license found in the file 'LICENSE' which must + * be distributed together with this source. All other rights reserved. + * + * THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, + * EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include +#include + +// #include "albatross/src/eigen/serializable_ldlt.hpp" +// #include "albatross/src/linalg/block_diagonal.hpp" +#include "test_models.h" +#include +#include + +#include +#include + +namespace albatross { + +struct LeaveOneIntervalOut { + + LeaveOneIntervalOut(){}; + explicit LeaveOneIntervalOut(double group_domain_size_) + : group_domain_size(group_domain_size_){}; + + int operator()(const double &f) const { + return static_cast(floor(f / group_domain_size)); + } + + int operator()(const Measurement &f) const { + return static_cast(floor(f.value / group_domain_size)); + } + + double group_domain_size = 5.; +}; + +TEST(TestPicGP, TestPredictionExists) { + const auto dataset = make_toy_linear_data(); + UniformlySpacedInducingPoints strategy(8); + LeaveOneIntervalOut grouper; + const auto pic = + pic_gp_from_covariance(make_simple_covariance_function(), grouper, + strategy, "pic", DenseQRImplementation{}); + + const auto pic_fit = pic.fit(dataset); + + 
const auto test_features = linspace(0.1, 9.9, 11); + + const auto pic_pred = + pic_fit.predict_with_measurement_noise(test_features).joint(); + + EXPECT_GT(pic_pred.mean.size(), 0); +} + + +TEST(TestPicGP, ScalarEquivalence) { + static constexpr std::size_t kNumTrainPoints = 3; + static constexpr std::size_t kNumTestPoints = 1; + static constexpr std::size_t kNumInducingPoints = 3; + + UniformlySpacedInducingPoints strategy(kNumInducingPoints); + // Set the grouper so that all the simple test data falls within one + // group. + LeaveOneIntervalOut grouper(10); + auto direct = gp_from_covariance(make_simple_covariance_function(), "direct"); + auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper, + strategy, "pic", DenseQRImplementation{}); + + auto dataset = make_toy_linear_data(5, 1, 0.1, kNumTrainPoints); + + auto direct_fit = direct.fit(dataset); + auto pic_fit = pic.fit(dataset); + + auto test_features = linspace(0.1, 9.9, kNumTestPoints); + auto direct_pred = + direct_fit.predict_with_measurement_noise(test_features).joint(); + auto pic_pred = pic_fit.predict_with_measurement_noise(test_features).joint(); + EXPECT_LT(distance::wasserstein_2(direct_pred, pic_pred), 1e-12); +} + +TEST(TestPicGP, SingleBlockEquivalence) { + static constexpr std::size_t kNumTrainPoints = 10; + static constexpr std::size_t kNumTestPoints = 10; + static constexpr std::size_t kNumInducingPoints = 4; + + UniformlySpacedInducingPoints strategy(kNumInducingPoints); + // Set the grouper so that all the simple test data falls within one + // group. 
+ LeaveOneIntervalOut grouper(10); + auto direct = gp_from_covariance(make_simple_covariance_function(), "direct"); + auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper, + strategy, "pic", DenseQRImplementation{}); + + auto dataset = make_toy_linear_data(5, 1, 0.1, kNumTrainPoints); + + auto direct_fit = direct.fit(dataset); + auto pic_fit = pic.fit(dataset); + + auto test_features = linspace(0.1, 9.9, kNumTestPoints); + auto direct_pred = + direct_fit.predict_with_measurement_noise(test_features).joint(); + auto pic_pred = pic_fit.predict_with_measurement_noise(test_features).joint(); + EXPECT_LT(distance::wasserstein_2(direct_pred, pic_pred), 1e-10); +} + +template +JointDistribution test_pic(const RegressionDataset &data, + const std::vector &predict, + CovarianceType &&cov, InducingStrategy &&inducing, + GrouperFunction &&grouper) { + const auto u = inducing(cov, data.features); + + auto bfp_cov = make_brute_force_pic_covariance(inducing(cov, data.features), + cov, grouper); + + BlockDiagonal K_ff; + const auto indexers = group_by(data.features, grouper).indexers(); + std::vector reordered_inds; + for (const auto &pair : indexers) { + reordered_inds.insert(reordered_inds.end(), pair.second.begin(), + pair.second.end()); + K_ff.blocks.emplace_back(cov(subset(data.features, pair.second))); + K_ff.blocks.back().diagonal() += + subset(data.targets.covariance.diagonal(), pair.second); + } + const std::vector ordered_features = + subset(data.features, reordered_inds); + + Eigen::MatrixXd K_uu = cov(u, u); + K_uu.diagonal() += Eigen::VectorXd::Constant(K_uu.rows(), 1e-8); + const Eigen::MatrixXd K_fu = cov(ordered_features, u); + + const Eigen::SerializableLDLT K_uu_ldlt = K_uu.ldlt(); + const Eigen::MatrixXd P = K_uu_ldlt.sqrt_solve(K_fu.transpose()); + const Eigen::MatrixXd Q_ff = P.transpose() * P; + BlockDiagonal Q_ff_diag; + Eigen::Index i = 0; + for (const auto &pair : indexers) { + const Eigen::Index cols = 
cast::to_index(pair.second.size()); + auto P_cols = P.block(0, i, P.rows(), cols); + Q_ff_diag.blocks.emplace_back(P_cols.transpose() * P_cols); + i += cols; + } + // auto Lambda = K_ff - Q_ff_diag; + + Eigen::MatrixXd Q_ff_lambda = Q_ff; + Eigen::Index k = 0; + for (const auto &block : K_ff.blocks) { + Q_ff_lambda.block(k, k, block.rows(), block.rows()) = block; + Q_ff_lambda.block(k, k, block.rows(), block.rows()).diagonal() += + Eigen::VectorXd::Constant(block.rows(), 1e-2); + k += block.rows(); + } + + const auto print_matrix = [](const Eigen::MatrixXd &m, std::string &&label) { + std::cout << label << " (" << m.rows() << "x" << m.cols() << "):\n" + << m.format(Eigen::FullPrecision) << std::endl; + }; + + const auto print_vector = [](const Eigen::VectorXd &v, std::string &&label) { + std::cout << label << " (" << v.size() + << "): " << v.transpose().format(Eigen::FullPrecision) + << std::endl; + }; + + print_matrix(Q_ff_lambda, "Q_ff + Lambda"); + // const BlockDiagonalLDLT Lambda_ldlt = Lambda.ldlt(); + + print_matrix(bfp_cov(ordered_features), "BFP: Q_ff + Lambda"); + + print_matrix(Q_ff_lambda - bfp_cov(ordered_features), + "Q_ff_lambda difference"); + + const Eigen::SerializableLDLT S = Q_ff_lambda.ldlt(); + + print_matrix(S.matrixL(), "PITC L"); + print_vector(S.vectorD(), "PITC D"); + + Eigen::MatrixXd K_PIC(ordered_features.size(), predict.size()); + Eigen::MatrixXd K_PITC(ordered_features.size(), predict.size()); + + Eigen::MatrixXd V_PIC = + Eigen::MatrixXd::Zero(ordered_features.size(), predict.size()); + + Eigen::MatrixXd Y = K_uu_ldlt.solve(K_fu.transpose()); + print_matrix(Y, "Y"); + + Eigen::MatrixXd W = K_uu_ldlt.solve(S.solve(K_fu).transpose()); + print_matrix(W, "W"); + + for (Eigen::Index f = 0; f < cast::to_index(ordered_features.size()); ++f) { + for (Eigen::Index p = 0; p < cast::to_index(predict.size()); ++p) { + const std::vector pv{predict[p]}; + K_PITC(f, p) = K_fu.row(f).dot(K_uu_ldlt.solve(cov(u, pv)).col(0)); + if 
(grouper(predict[p]) == grouper(ordered_features[f])) { + K_PIC(f, p) = cov(ordered_features[f], predict[p]); + V_PIC(f, p) = + K_PIC(f, p) - K_fu.row(f).dot(K_uu_ldlt.solve(cov(u, pv)).col(0)); + } else { + K_PIC(f, p) = K_fu.row(f).dot(K_uu_ldlt.solve(cov(u, pv)).col(0)); + } + } + } + + const Eigen::MatrixXd K_pu = cov(predict, u); + // Eigen::Index j = 0; + // Eigen::Index blk = 0; + // std::vector WW; + + const Eigen::MatrixXd WBW = K_uu_ldlt.solve(S.solve(K_fu).transpose()); + print_matrix(K_PIC, "K_PIC"); + print_matrix(bfp_cov(ordered_features, predict), "BFP: K_PIC"); + + print_matrix(K_PIC - bfp_cov(ordered_features, predict), "K_PIC error"); + print_matrix(K_PITC, "K_PITC"); + + print_matrix(V_PIC, "V_PIC"); + + print_matrix(WBW, "W"); + const Eigen::MatrixXd U = K_pu * WBW * V_PIC; + print_matrix(U, "U"); + + auto SV = S.sqrt_solve(V_PIC); + const Eigen::MatrixXd VSV = + V_PIC.transpose() * S.solve(V_PIC); // SV.transpose() * SV; + print_matrix(VSV, "VSV"); + + const Eigen::MatrixXd predict_prior = cov(predict); + + print_matrix(predict_prior, "prior"); + + print_matrix(bfp_cov(predict), "BFP: prior"); + + print_matrix(predict_prior - bfp_cov(predict), "prior error"); + + // auto KK = K_PITC_ldlt.sqrt_solve(K_PIC); + // const Eigen::MatrixXd explained_cov = KK.transpose() * KK; + const Eigen::MatrixXd explained_cov = K_PIC.transpose() * S.solve(K_PIC); + print_matrix(explained_cov, "explained"); + + const Eigen::VectorXd PIC_mean = + K_PIC.transpose() * S.solve(data.targets.mean); + print_vector(PIC_mean, "PIC mean"); + const Eigen::MatrixXd predict_cov = predict_prior - explained_cov; + + print_matrix(predict_cov, "K_**"); + + const Eigen::MatrixXd PITC_cov = + predict_prior - K_PITC.transpose() * S.solve(K_PITC); + + const Eigen::MatrixXd PIC_cov = PITC_cov - U - U.transpose() - VSV; + print_matrix(PITC_cov, "PTIC cov"); + print_matrix(PIC_cov, "PIC cov"); + + print_matrix(predict_cov - PITC_cov, "PIC - PITC"); + print_matrix(predict_cov - PIC_cov, 
"PIC - factored"); + + return JointDistribution{PIC_mean, PIC_cov}; +} + +TEST(TestPicGP, BruteForceEquivalenceOneBlock) { + static constexpr std::size_t kNumTrainPoints = 10; + static constexpr std::size_t kNumTestPoints = 10; + static constexpr std::size_t kNumInducingPoints = 5; + + UniformlySpacedInducingPoints strategy(kNumInducingPoints); + LeaveOneIntervalOut grouper(10); + auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper, + strategy, "pic", DenseQRImplementation{}); + + auto dataset = make_toy_linear_data(5, 1, 0.1, kNumTrainPoints); + auto bfp_cov = make_brute_force_pic_covariance( + strategy(make_simple_covariance_function(), dataset.features), + make_simple_covariance_function(), grouper); + auto bfp = gp_from_covariance(bfp_cov); + + auto bfp_fit = bfp.fit(dataset); + auto pic_fit = pic.fit(dataset); + + auto test_features = linspace(0.1, 9.9, kNumTestPoints); + auto bfp_pred = bfp_fit.predict_with_measurement_noise(test_features).joint(); + auto pic_pred = pic_fit.predict_with_measurement_noise(test_features).joint(); + EXPECT_LT(distance::wasserstein_2(bfp_pred, pic_pred), 1e-11); +} + +TEST(TestPicGP, BruteForceEquivalenceMultipleBlocks) { + static constexpr std::size_t kNumTrainPoints = 10; + static constexpr std::size_t kNumTestPoints = 10; + static constexpr std::size_t kNumInducingPoints = 5; + + UniformlySpacedInducingPoints strategy(kNumInducingPoints); + LeaveOneIntervalOut grouper(2); + auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper, + strategy, "pic", DenseQRImplementation{}); + + auto dataset = make_toy_linear_data(5, 1, 0.1, kNumTrainPoints); + auto bfp_cov = make_brute_force_pic_covariance( + strategy(make_simple_covariance_function(), dataset.features), + make_simple_covariance_function(), grouper); + auto bfp = gp_from_covariance(bfp_cov); + + auto bfp_fit = bfp.fit(dataset); + auto pic_fit = pic.fit(dataset); + + auto test_features = linspace(0.1, 9.9, kNumTestPoints); + auto 
bfp_pred = bfp_fit.predict_with_measurement_noise(test_features).joint(); + auto pic_pred = pic_fit.predict_with_measurement_noise(test_features).joint(); + + EXPECT_LT(distance::wasserstein_2(bfp_pred, pic_pred), 1e-12); +} + +TEST(TestPicGP, PITCEquivalenceOutOfTraining) { + static constexpr std::size_t kNumTrainPoints = 10; + static constexpr std::size_t kNumTestPoints = 10; + static constexpr std::size_t kNumInducingPoints = 5; + + UniformlySpacedInducingPoints strategy(kNumInducingPoints); + LeaveOneIntervalOut grouper(2); + auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper, + strategy, "pic", DenseQRImplementation{}); + + auto dataset = make_toy_linear_data(5, 1, 0.1, kNumTrainPoints); + auto pitc = + sparse_gp_from_covariance(make_simple_covariance_function(), grouper, + strategy, "pitc", DenseQRImplementation{}); + + auto pic_fit = pic.fit(dataset); + auto pitc_fit = pitc.fit(dataset); + + auto test_features = linspace(10.1, 19.9, kNumTestPoints); + auto pic_pred = pic_fit.predict_with_measurement_noise(test_features).joint(); + auto pitc_pred = pitc_fit.predict_with_measurement_noise(test_features).joint(); + + EXPECT_LT(distance::wasserstein_2(pic_pred, pitc_pred), 1e-12); +} + +TEST(TestPicGP, PredictMeanEquivalent) { + static constexpr std::size_t kNumTrainPoints = 10; + static constexpr std::size_t kNumTestPoints = 10; + static constexpr std::size_t kNumInducingPoints = 5; + + UniformlySpacedInducingPoints strategy(kNumInducingPoints); + LeaveOneIntervalOut grouper(2); + auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper, + strategy, "pic", DenseQRImplementation{}); + + auto dataset = make_toy_linear_data(5, 1, 0.1, kNumTrainPoints); + + auto pic_fit = pic.fit(dataset); + + auto test_features = linspace(0.1, 9.9, kNumTestPoints); + auto pic_pred = pic_fit.predict_with_measurement_noise(test_features).mean(); + auto pic_joint_pred = pic_fit.predict_with_measurement_noise(test_features).joint(); + + 
const double pic_mean_error = (pic_pred - pic_joint_pred.mean).norm(); + EXPECT_LT(pic_mean_error, 1e-12); +} + +TEST(TestPicGP, PredictMarginalEquivalent) { + static constexpr std::size_t kNumTrainPoints = 10; + static constexpr std::size_t kNumTestPoints = 10; + static constexpr std::size_t kNumInducingPoints = 5; + + UniformlySpacedInducingPoints strategy(kNumInducingPoints); + LeaveOneIntervalOut grouper(2); + auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper, + strategy, "pic", DenseQRImplementation{}); + + auto dataset = make_toy_linear_data(5, 1, 0.1, kNumTrainPoints); + + auto pic_fit = pic.fit(dataset); + + auto test_features = linspace(0.1, 9.9, kNumTestPoints); + auto pic_pred = pic_fit.predict_with_measurement_noise(test_features).marginal(); + auto pic_joint_pred = pic_fit.predict_with_measurement_noise(test_features).joint(); + + const double pic_marginal_error = + (pic_pred.mean - pic_joint_pred.mean).norm(); + EXPECT_LT(pic_marginal_error, 1e-12); + const double pic_marginal_cov_error = + (Eigen::VectorXd(pic_pred.covariance.diagonal()) - + Eigen::VectorXd(pic_joint_pred.covariance.diagonal())) + .norm(); + EXPECT_LT(pic_marginal_cov_error, 1e-12) + << "\nmarginal: " + << Eigen::VectorXd(pic_pred.covariance) + .transpose() + .format(Eigen::FullPrecision) + << "\njoint diag: " + << Eigen::VectorXd(pic_joint_pred.covariance.diagonal()) + .transpose() + .format(Eigen::FullPrecision); +} + + +} // namespace albatross From f2d36a10d5365c45ff7e2fa05a426ddd8efaea76 Mon Sep 17 00:00:00 2001 From: Matt Peddie Date: Tue, 6 Aug 2024 08:42:49 +1000 Subject: [PATCH 04/12] format fixups --- include/albatross/src/cereal/gp.hpp | 2 +- include/albatross/src/models/pic_gp.hpp | 4 ++-- include/albatross/src/models/sparse_common.hpp | 3 +-- tests/test_pic_gp.cc | 14 ++++++++------ 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/albatross/src/cereal/gp.hpp b/include/albatross/src/cereal/gp.hpp index 9b421b83..216d487a 
100644 --- a/include/albatross/src/cereal/gp.hpp +++ b/include/albatross/src/cereal/gp.hpp @@ -18,8 +18,8 @@ using albatross::GaussianProcessBase; using albatross::GPFit; using albatross::LinearCombination; -using albatross::SparseGPFit; using albatross::PICGPFit; +using albatross::SparseGPFit; #ifndef GP_SERIALIZATION_VERSION #define GP_SERIALIZATION_VERSION 2 diff --git a/include/albatross/src/models/pic_gp.hpp b/include/albatross/src/models/pic_gp.hpp index cd088341..8badc0dd 100644 --- a/include/albatross/src/models/pic_gp.hpp +++ b/include/albatross/src/models/pic_gp.hpp @@ -681,7 +681,8 @@ class PICGaussianProcessRegression (xi_lambda.transpose() * xi_lambda - xi_u.transpose() * xi_u) .diagonal()}; - const Eigen::VectorXd U_diag = (K_up.transpose() * sparse_gp_fit.W * Vp).diagonal(); + const Eigen::VectorXd U_diag = + (K_up.transpose() * sparse_gp_fit.W * Vp).diagonal(); Eigen::VectorXd marginal_variance(cast::to_index(features.size())); for (Eigen::Index i = 0; i < marginal_variance.size(); ++i) { @@ -764,7 +765,6 @@ class PICGaussianProcessRegression } Vp.makeCompressed(); - Eigen::MatrixXd xi_lambda = sparse_gp_fit.A_ldlt.sqrt_solve(Vp); Eigen::MatrixXd xi_u = sparse_gp_fit.Z * Vp; Eigen::MatrixXd VSV{xi_lambda.transpose() * xi_lambda - diff --git a/include/albatross/src/models/sparse_common.hpp b/include/albatross/src/models/sparse_common.hpp index b5267b99..525ecee4 100644 --- a/include/albatross/src/models/sparse_common.hpp +++ b/include/albatross/src/models/sparse_common.hpp @@ -86,7 +86,6 @@ struct DenseQRImplementation { } }; - } // namespace albatross -#endif // INCLUDE_ALBATROSS_MODELS_SPARSE_COMMON_H_ \ No newline at end of file +#endif // INCLUDE_ALBATROSS_MODELS_SPARSE_COMMON_H_ diff --git a/tests/test_pic_gp.cc b/tests/test_pic_gp.cc index 96b8c1f9..dc110334 100644 --- a/tests/test_pic_gp.cc +++ b/tests/test_pic_gp.cc @@ -59,7 +59,6 @@ TEST(TestPicGP, TestPredictionExists) { EXPECT_GT(pic_pred.mean.size(), 0); } - TEST(TestPicGP, 
ScalarEquivalence) { static constexpr std::size_t kNumTrainPoints = 3; static constexpr std::size_t kNumTestPoints = 1; @@ -337,7 +336,8 @@ TEST(TestPicGP, PITCEquivalenceOutOfTraining) { auto test_features = linspace(10.1, 19.9, kNumTestPoints); auto pic_pred = pic_fit.predict_with_measurement_noise(test_features).joint(); - auto pitc_pred = pitc_fit.predict_with_measurement_noise(test_features).joint(); + auto pitc_pred = + pitc_fit.predict_with_measurement_noise(test_features).joint(); EXPECT_LT(distance::wasserstein_2(pic_pred, pitc_pred), 1e-12); } @@ -358,7 +358,8 @@ TEST(TestPicGP, PredictMeanEquivalent) { auto test_features = linspace(0.1, 9.9, kNumTestPoints); auto pic_pred = pic_fit.predict_with_measurement_noise(test_features).mean(); - auto pic_joint_pred = pic_fit.predict_with_measurement_noise(test_features).joint(); + auto pic_joint_pred = + pic_fit.predict_with_measurement_noise(test_features).joint(); const double pic_mean_error = (pic_pred - pic_joint_pred.mean).norm(); EXPECT_LT(pic_mean_error, 1e-12); @@ -379,8 +380,10 @@ TEST(TestPicGP, PredictMarginalEquivalent) { auto pic_fit = pic.fit(dataset); auto test_features = linspace(0.1, 9.9, kNumTestPoints); - auto pic_pred = pic_fit.predict_with_measurement_noise(test_features).marginal(); - auto pic_joint_pred = pic_fit.predict_with_measurement_noise(test_features).joint(); + auto pic_pred = + pic_fit.predict_with_measurement_noise(test_features).marginal(); + auto pic_joint_pred = + pic_fit.predict_with_measurement_noise(test_features).joint(); const double pic_marginal_error = (pic_pred.mean - pic_joint_pred.mean).norm(); @@ -400,5 +403,4 @@ TEST(TestPicGP, PredictMarginalEquivalent) { .format(Eigen::FullPrecision); } - } // namespace albatross From 78348f5c52f786623d4d3fe1be42b5b0275087ab Mon Sep 17 00:00:00 2001 From: Matt Peddie Date: Wed, 7 Aug 2024 02:25:03 +1000 Subject: [PATCH 05/12] Remove debug spam --- include/albatross/src/cereal/suitesparse.hpp | 2 -- 1 file changed, 2 deletions(-) 
diff --git a/include/albatross/src/cereal/suitesparse.hpp b/include/albatross/src/cereal/suitesparse.hpp index 32a89867..ca44c9c6 100644 --- a/include/albatross/src/cereal/suitesparse.hpp +++ b/include/albatross/src/cereal/suitesparse.hpp @@ -439,8 +439,6 @@ inline void load(Archive &ar, cholmod_sparse &m, } detail::suitesparse_cereal_realloc(&m.x, m.nzmax, element_size_bytes); - std::cout << "m.p size: " << p_size_bytes << " bytes" << std::endl; - std::cout << "m.ncol + 1: " << m.ncol + 1 << std::endl; detail::decode_array(ar, "m.p", m.p, (m.ncol + 1) * p_size_bytes); detail::decode_array(ar, "m.i", m.i, m.nzmax * integer_size_bytes); if (!m.packed) { From 9ba34bf905b386111cf322ca6d2e678cef6ff2c3 Mon Sep 17 00:00:00 2001 From: Matt Peddie Date: Wed, 7 Aug 2024 02:25:08 +1000 Subject: [PATCH 06/12] Remove unused group tracker --- include/albatross/src/models/pic_gp.hpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/albatross/src/models/pic_gp.hpp b/include/albatross/src/models/pic_gp.hpp index 8badc0dd..b1c30d0e 100644 --- a/include/albatross/src/models/pic_gp.hpp +++ b/include/albatross/src/models/pic_gp.hpp @@ -639,15 +639,12 @@ class PICGaussianProcessRegression features.size()); Eigen::VectorXi col_alloc{Eigen::VectorXi::Zero(features.size())}; bool all_same_group = true; - bool all_in_groups = true; for (Eigen::Index j = 0; j < features.size(); ++j) { groups[j] = sparse_gp_fit.measurement_groups.find( independent_group_function_(without_measurement(features[j]))); if (groups[j] != sparse_gp_fit.measurement_groups.end()) { col_alloc(j) = groups[j]->second.block_size; all_same_group = all_same_group && groups[j] == groups[0]; - } else { - all_in_groups = false; } feature_to_block[j] = std::distance(sparse_gp_fit.measurement_groups.begin(), groups[j]); @@ -726,15 +723,12 @@ class PICGaussianProcessRegression features.size()); Eigen::VectorXi col_alloc{Eigen::VectorXi::Zero(features.size())}; bool all_same_group = true; - bool all_in_groups = true; 
for (Eigen::Index j = 0; j < features.size(); ++j) { groups[j] = sparse_gp_fit.measurement_groups.find( independent_group_function_(without_measurement(features[j]))); if (groups[j] != sparse_gp_fit.measurement_groups.end()) { col_alloc(j) = groups[j]->second.block_size; all_same_group = all_same_group && groups[j] == groups[0]; - } else { - all_in_groups = false; } feature_to_block[j] = std::distance(sparse_gp_fit.measurement_groups.begin(), groups[j]); From 7612fcfaa82113d9957dcff9bd8741aea3e943d8 Mon Sep 17 00:00:00 2001 From: Matt Peddie Date: Wed, 7 Aug 2024 04:13:32 +1000 Subject: [PATCH 07/12] serialization fixups --- include/albatross/src/cereal/block_diagonal.hpp | 2 +- include/albatross/src/cereal/gp.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/albatross/src/cereal/block_diagonal.hpp b/include/albatross/src/cereal/block_diagonal.hpp index 964602bd..b9253363 100644 --- a/include/albatross/src/cereal/block_diagonal.hpp +++ b/include/albatross/src/cereal/block_diagonal.hpp @@ -18,7 +18,7 @@ namespace cereal { template inline void serialize(Archive &archive, const albatross::BlockDiagonalLDLT &ldlt, - const std::uint32_t version) { + const std::uint32_t version __attribute__((unused))) { archive(cereal::make_nvp("blocks", ldlt.blocks)); } diff --git a/include/albatross/src/cereal/gp.hpp b/include/albatross/src/cereal/gp.hpp index 216d487a..cd799908 100644 --- a/include/albatross/src/cereal/gp.hpp +++ b/include/albatross/src/cereal/gp.hpp @@ -58,7 +58,7 @@ template > &fit, - const std::uint32_t version) { + const std::uint32_t version __attribute__((unused))) { archive(cereal::make_nvp("train_features", fit.train_features)); archive(cereal::make_nvp("inducing_features", fit.inducing_features)); archive(cereal::make_nvp("train_covariance", fit.train_covariance)); From 179dd64543087c5ec2eb36ed458afe30597251f4 Mon Sep 17 00:00:00 2001 From: Matt Peddie Date: Wed, 7 Aug 2024 04:13:40 +1000 Subject: [PATCH 08/12] missing 
operator== for block diagonal LDLT --- include/albatross/src/linalg/block_diagonal.hpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/albatross/src/linalg/block_diagonal.hpp b/include/albatross/src/linalg/block_diagonal.hpp index 29321879..c57fd1c5 100644 --- a/include/albatross/src/linalg/block_diagonal.hpp +++ b/include/albatross/src/linalg/block_diagonal.hpp @@ -68,6 +68,8 @@ struct BlockDiagonalLDLT { Eigen::Index rows() const; Eigen::Index cols() const; + + bool operator==(const BlockDiagonalLDLT &other) const; }; struct BlockDiagonal { @@ -248,6 +250,11 @@ inline const BlockDiagonalLDLT &BlockDiagonalLDLT::adjoint() const { return *this; } +inline bool +BlockDiagonalLDLT::operator==(const BlockDiagonalLDLT &other) const { + return blocks == other.blocks; +} + /* * Block Diagonal */ From cdf43376443858063024215ea449b309709504d7 Mon Sep 17 00:00:00 2001 From: Matt Peddie Date: Thu, 8 Aug 2024 01:58:08 +1000 Subject: [PATCH 09/12] Avoid linker collision --- tests/test_pic_gp.cc | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/test_pic_gp.cc b/tests/test_pic_gp.cc index dc110334..0a8c247f 100644 --- a/tests/test_pic_gp.cc +++ b/tests/test_pic_gp.cc @@ -24,10 +24,10 @@ namespace albatross { -struct LeaveOneIntervalOut { +struct GroupedLeaveOneIntervalOut { - LeaveOneIntervalOut(){}; - explicit LeaveOneIntervalOut(double group_domain_size_) + GroupedLeaveOneIntervalOut(){}; + explicit GroupedLeaveOneIntervalOut(double group_domain_size_) : group_domain_size(group_domain_size_){}; int operator()(const double &f) const { @@ -44,7 +44,7 @@ struct LeaveOneIntervalOut { TEST(TestPicGP, TestPredictionExists) { const auto dataset = make_toy_linear_data(); UniformlySpacedInducingPoints strategy(8); - LeaveOneIntervalOut grouper; + GroupedLeaveOneIntervalOut grouper; const auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper, strategy, "pic", DenseQRImplementation{}); @@ -67,7 +67,7 @@ 
TEST(TestPicGP, ScalarEquivalence) { UniformlySpacedInducingPoints strategy(kNumInducingPoints); // Set the grouper so that all the simple test data falls within one // group. - LeaveOneIntervalOut grouper(10); + GroupedLeaveOneIntervalOut grouper(10); auto direct = gp_from_covariance(make_simple_covariance_function(), "direct"); auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper, strategy, "pic", DenseQRImplementation{}); @@ -92,7 +92,7 @@ TEST(TestPicGP, SingleBlockEquivalence) { UniformlySpacedInducingPoints strategy(kNumInducingPoints); // Set the grouper so that all the simple test data falls within one // group. - LeaveOneIntervalOut grouper(10); + GroupedLeaveOneIntervalOut grouper(10); auto direct = gp_from_covariance(make_simple_covariance_function(), "direct"); auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper, strategy, "pic", DenseQRImplementation{}); @@ -271,7 +271,7 @@ TEST(TestPicGP, BruteForceEquivalenceOneBlock) { static constexpr std::size_t kNumInducingPoints = 5; UniformlySpacedInducingPoints strategy(kNumInducingPoints); - LeaveOneIntervalOut grouper(10); + GroupedLeaveOneIntervalOut grouper(10); auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper, strategy, "pic", DenseQRImplementation{}); @@ -296,7 +296,7 @@ TEST(TestPicGP, BruteForceEquivalenceMultipleBlocks) { static constexpr std::size_t kNumInducingPoints = 5; UniformlySpacedInducingPoints strategy(kNumInducingPoints); - LeaveOneIntervalOut grouper(2); + GroupedLeaveOneIntervalOut grouper(2); auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper, strategy, "pic", DenseQRImplementation{}); @@ -322,7 +322,7 @@ TEST(TestPicGP, PITCEquivalenceOutOfTraining) { static constexpr std::size_t kNumInducingPoints = 5; UniformlySpacedInducingPoints strategy(kNumInducingPoints); - LeaveOneIntervalOut grouper(2); + GroupedLeaveOneIntervalOut grouper(2); auto pic = 
pic_gp_from_covariance(make_simple_covariance_function(), grouper, strategy, "pic", DenseQRImplementation{}); @@ -348,7 +348,7 @@ TEST(TestPicGP, PredictMeanEquivalent) { static constexpr std::size_t kNumInducingPoints = 5; UniformlySpacedInducingPoints strategy(kNumInducingPoints); - LeaveOneIntervalOut grouper(2); + GroupedLeaveOneIntervalOut grouper(2); auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper, strategy, "pic", DenseQRImplementation{}); @@ -371,7 +371,7 @@ TEST(TestPicGP, PredictMarginalEquivalent) { static constexpr std::size_t kNumInducingPoints = 5; UniformlySpacedInducingPoints strategy(kNumInducingPoints); - LeaveOneIntervalOut grouper(2); + GroupedLeaveOneIntervalOut grouper(2); auto pic = pic_gp_from_covariance(make_simple_covariance_function(), grouper, strategy, "pic", DenseQRImplementation{}); From ae2f56116521914f53652232d8f3f2bbcd238c15 Mon Sep 17 00:00:00 2001 From: Matt Peddie Date: Thu, 8 Aug 2024 01:59:34 +1000 Subject: [PATCH 10/12] Remove unimplemented PIC functions --- include/albatross/src/models/pic_gp.hpp | 203 ------------------------ 1 file changed, 203 deletions(-) diff --git a/include/albatross/src/models/pic_gp.hpp b/include/albatross/src/models/pic_gp.hpp index b1c30d0e..97f80e6d 100644 --- a/include/albatross/src/models/pic_gp.hpp +++ b/include/albatross/src/models/pic_gp.hpp @@ -354,66 +354,6 @@ class PICGaussianProcessRegression inducing_point_strategy_ = std::forward(f); } - template - auto - _update_impl(const Fit> &old_fit, - const std::vector &features, - const MarginalDistribution &targets) const { - assert(false && "Cannot call update() on a PIC GP yet!"); - - BlockDiagonalLDLT A_ldlt; - Eigen::SerializableLDLT K_uu_ldlt; - Eigen::MatrixXd K_fu; - Eigen::VectorXd y; - compute_internal_components(old_fit.train_features, features, targets, - &A_ldlt, &K_uu_ldlt, &K_fu, &y); - - const Eigen::Index n_old = old_fit.sigma_R.rows(); - const Eigen::Index n_new = A_ldlt.rows(); - const Eigen::Index 
k = old_fit.sigma_R.cols(); - Eigen::MatrixXd B = Eigen::MatrixXd::Zero(n_old + n_new, k); - - ALBATROSS_ASSERT(n_old == k); - - // Form: - // B = |R_old P_old^T| = |Q_1| R P^T - // |A^{-1/2} K_fu| |Q_2| - for (Eigen::Index i = 0; i < old_fit.permutation_indices.size(); ++i) { - const Eigen::Index &pi = old_fit.permutation_indices.coeff(i); - B.col(pi).topRows(i + 1) = old_fit.sigma_R.col(i).topRows(i + 1); - } - B.bottomRows(n_new) = A_ldlt.sqrt_solve(K_fu); - const auto B_qr = QRImplementation::compute(B, Base::threads_.get()); - - // Form: - // y_aug = |R_old P_old^T v_old| - // |A^{-1/2} y | - ALBATROSS_ASSERT(old_fit.information.size() == n_old); - Eigen::VectorXd y_augmented(n_old + n_new); - for (Eigen::Index i = 0; i < old_fit.permutation_indices.size(); ++i) { - y_augmented[i] = - old_fit.information[old_fit.permutation_indices.coeff(i)]; - } - y_augmented.topRows(n_old) = - old_fit.sigma_R.template triangularView() * - y_augmented.topRows(n_old); - - y_augmented.bottomRows(n_new) = A_ldlt.sqrt_solve(y, Base::threads_.get()); - const Eigen::VectorXd v = B_qr->solve(y_augmented); - - Eigen::MatrixXd R = get_R(*B_qr); - if (B_qr->rank() < B_qr->cols()) { - // Inflate the diagonal of R in an attempt to avoid singularity - R.diagonal() += - Eigen::VectorXd::Constant(B_qr->cols(), details::cSparseRNugget); - } - using FitType = - Fit>; - return FitType(old_fit.train_features, old_fit.train_covariance, R, - get_P(*B_qr), v, B_qr->rank()); - } - // Here we create the QR decomposition of: // // B = |A^-1/2 K_fu| = |Q_1| R P^T @@ -518,66 +458,6 @@ class PICGaussianProcessRegression B_qr->rank(), cols_Bs, independent_group_function_); } - template - auto fit_from_prediction(const std::vector &new_inducing_points, - const JointDistribution &prediction_) const { - FitModel>> - output(*this, - Fit>()); - Fit> &new_fit = - output.get_fit(); - - new_fit.train_features = new_inducing_points; - - const Eigen::MatrixXd K_zz = - 
this->covariance_function_(new_inducing_points, Base::threads_.get()); - new_fit.train_covariance = Eigen::SerializableLDLT(K_zz); - - // We're going to need to take the sqrt of the new covariance which - // could be extremely small, so here we add a small nugget to avoid - // numerical instability - JointDistribution prediction(prediction_); - prediction.covariance.diagonal() += Eigen::VectorXd::Constant( - cast::to_index(prediction.size()), 1, details::DEFAULT_NUGGET); - new_fit.information = new_fit.train_covariance.solve(prediction.mean); - - // Here P is the posterior covariance at the new inducing points. If - // we consider the case where we rebase and then use the resulting fit - // to predict the new inducing points then we see that the predictive - // covariance (see documentation above) would be,: - // - // C = K_zz - Q_zz + K_zz Sigma K_zz - // - // We can use this, knowing that at the inducing points K_zz = Q_zz, to - // derive our updated Sigma, - // - // C = K_zz - K_zz + K_zz Sigma K_zz - // C = K_zz Sigma K_zz - // Sigma = K_zz^-1 C K_zz^-1 - // - // And since we need to store Sigma in sqrt form we get, - // - // Sigma = (B_z^T B_z)^-1 - // = K_zz^-1 C K_zz^-1 - // - // So by setting: - // - // B_z = C^{-1/2} K_z - // - // We can then compute and store the QR decomposition of B - // as we do in a normal fit. - const Eigen::SerializableLDLT C_ldlt(prediction.covariance); - const Eigen::MatrixXd sigma_inv_sqrt = C_ldlt.sqrt_solve(K_zz); - const auto B_qr = QRImplementation::compute(sigma_inv_sqrt, nullptr); - - new_fit.P = get_P(*B_qr); - new_fit.sigma_R = get_R(*B_qr); - new_fit.numerical_rank = B_qr->rank(); - - return output; - } - // This is included to allow the SparseGP // to be compatible with fits generated // using a standard GP. 
@@ -787,73 +667,6 @@ class PICGaussianProcessRegression return pred; } - template - double log_likelihood(const RegressionDataset &dataset) const { - const auto u = - inducing_point_strategy_(this->covariance_function_, dataset.features); - - BlockDiagonalLDLT A_ldlt; - Eigen::SerializableLDLT K_uu_ldlt; - Eigen::MatrixXd K_fu; - Eigen::VectorXd y; - compute_internal_components(u, dataset.features, dataset.targets, &A_ldlt, - &K_uu_ldlt, &K_fu, &y); - const auto B_qr = compute_sigma_qr(K_uu_ldlt, A_ldlt, K_fu); - // The log likelihood for y ~ N(0, K) is: - // - // L = 1/2 (n log(2 pi) + log(|K|) + y^T K^-1 y) - // - // where in our case we have - // K = A + Q_ff - // and - // Q_ff = K_fu K_uu^-1 K_uf - // - // First we get the determinant, |K|: - // - // |K| = |A + Q_ff| - // = |K_uu + K_uf A^-1 K_fu| |K_uu^-1| |A| - // = |P R^T Q^T Q R P^T| |K_uu^-1| |A| - // = |R^T R| |K_uu^-1| |A| - // = |R|^2 |A| / |K_uu| - // - // Where the first equality comes from the matrix determinant lemma. - // https://en.wikipedia.org/wiki/Matrix_determinant_lemma#Generalization - // - // After which we can take the log: - // - // log(|K|) = 2 log(|R|) + log(|A|) - log(|K_uu|) - // - const double log_det_a = A_ldlt.log_determinant(); - - const double log_det_r = - B_qr->matrixR().diagonal().array().cwiseAbs().log().sum(); - const double log_det_K_uu = K_uu_ldlt.log_determinant(); - const double log_det = log_det_a + 2 * log_det_r - log_det_K_uu; - - // q = y^T K^-1 y - // = y^T (A + Q_ff)^-1 y - // = y^T (A^-1 - A^-1 K_fu (K_uu + K_uf A^-1 K_fu)^-1 K_uf A^-1) y - // = y^T A^-1 y - y^T A^-1 K_fu (K_uu + K_uf A^-1 K_fu)^-1 K_uf A^-1) y - // = y^T A^-1 y - y^T A^-1 K_fu (R^T R)^-1 K_uf A^-1) y - // = y^T A^-1 y - (R^-T K_uf A^-1 y)^T (R^-T K_uf A^-1 y) - // = y^T y_a - y_b^T y_b - // - // with y_b = R^-T K_uf y_a - const Eigen::VectorXd y_a = A_ldlt.solve(y); - - Eigen::VectorXd y_b = K_fu.transpose() * y_a; - y_b = sqrt_solve(*B_qr, y_b); - - double log_quadratic = y.transpose() * y_a; - 
log_quadratic -= y_b.transpose() * y_b; - - const double rank = cast::to_double(y.size()); - const double log_dimension = rank * log(2 * M_PI); - - return -0.5 * (log_det + log_quadratic + log_dimension) + - this->prior_log_likelihood(); - } - InducingPointStrategy get_inducing_point_strategy() const { return inducing_point_strategy_; } @@ -959,22 +772,6 @@ class PICGaussianProcessRegression GrouperFunction independent_group_function_; }; -// rebase_inducing_points takes a Sparse GP which was fit using some set of -// inducing points and creates a new fit relative to new inducing points. -// Note that this will NOT be the equivalent to having fit the model with -// the new inducing points since some information may have been lost in -// the process. -template -auto rebase_inducing_points( - const FitModel>> - &fit_model, - const std::vector &new_inducing_points) { - return fit_model.get_model().fit_from_prediction( - new_inducing_points, fit_model.predict(new_inducing_points).joint()); -} - template using SparseQRPicGaussianProcessRegression = From 86b38ea5f2be6b29fd9630e7a56ba0c7b8a87c37 Mon Sep 17 00:00:00 2001 From: Matt Peddie Date: Mon, 2 Sep 2024 15:47:15 +1000 Subject: [PATCH 11/12] Cleanups and comments --- include/albatross/src/models/pic_gp.hpp | 143 +++++++----------------- tests/test_pic_gp.cc | 2 - 2 files changed, 43 insertions(+), 102 deletions(-) diff --git a/include/albatross/src/models/pic_gp.hpp b/include/albatross/src/models/pic_gp.hpp index 97f80e6d..8863cbec 100644 --- a/include/albatross/src/models/pic_gp.hpp +++ b/include/albatross/src/models/pic_gp.hpp @@ -15,6 +15,20 @@ namespace albatross { +/* + * This class directly implements the PIC covariance structure: + * conditional on the inducing points, each group of observations is + * independent. 
You can use it as a covariance function for a dense + * GP model, and it should give you the same results as the PIC + * approximation, but since it's implemented directly as a covariance + * function, it's much less efficient than modelling using the + * PICGaussianProcessRegression class defined below. It is provided + * for debugging purposes, since it can be tricky to follow what the + * efficient implementation is doing. + * + * See the PIC GP model test suite for an example of how to use this + * class. + */ template class BruteForcePIC @@ -44,7 +58,6 @@ class BruteForcePIC K_xu[i] = cov_(x, inducing_points_[i]); K_uy[i] = cov_(inducing_points_[i], y); } - // const Eigen::VectorXd K_uy = cov_(inducing_points_, y); return K_xu.dot(K_uu_ldlt_.solve(K_uy)); } }; @@ -149,110 +162,40 @@ struct Fit> { }; /* - * This class implements an approximation technique for Gaussian processes - * which relies on an assumption that all observations are independent (or - * groups of observations are independent) conditional on a set of inducing - * points. The method is based off: - * - * [1] Sparse Gaussian Processes using Pseudo-inputs - * Edward Snelson, Zoubin Ghahramani - * http://www.gatsby.ucl.ac.uk/~snelson/SPGP_up.pdf - * - * Though the code uses notation closer to that used in this (excellent) - * overview of these methods: - * - * [2] A Unifying View of Sparse Approximate Gaussian Process Regression - * Joaquin Quinonero-Candela, Carl Edward Rasmussen - * http://www.jmlr.org/papers/volume6/quinonero-candela05a/quinonero-candela05a.pdf - * - * Very broadly speaking this method starts with a prior over the observations, - * - * [f] ~ N(0, K_ff) - * - * where K_ff(i, j) = covariance_function(features[i], features[j]) and f - * represents the function value. 
- * - * It then uses a set of inducing points, u, and makes some assumptions about - * the conditional distribution: - * - * [f|u] ~ N(K_fu K_uu^-1 u, K_ff - Q_ff) - * - * Where Q_ff = K_fu K_uu^-1 K_uf represents the variance in f that is - * explained by u. - * - * For FITC (Fully Independent Training Contitional) the assumption is that - * K_ff - Qff is diagonal, for PITC (Partially Independent Training Conditional) - * that it is block diagonal. These assumptions lead to an efficient way of - * inferring the posterior distribution for some new location f*, - * - * [f*|f=y] ~ N(K_*u S K_uf A^-1 y, K_** - Q_** + K_*u S K_u*) - * - * Where S = (K_uu + K_uf A^-1 K_fu)^-1 and A = diag(K_ff - Q_ff) and "diag" - * may mean diagonal or block diagonal. Regardless we end up with O(m^2n) - * complexity instead of O(n^3) of direct Gaussian processes. (Note that in [2] - * S is called sigma and A is lambda.) - * - * Of course, the implementation details end up somewhat more complex in order - * to improve numerical stability. Here we use an approach based off the QR - * decomposition which is described in + * This class implements an approximation technique for Gaussian + * processes which relies on the following: * - * Stable and Efficient Gaussian Process Calculations - * http://www.jmlr.org/papers/volume10/foster09a/foster09a.pdf + * 1. Observations occur in correlated groups * - * A more detailed (but more likely to be out of date) description of - * the details can be found on the albatross documentation. A short - * description follows. It starts by setting up the Sparse Gaussian process - * covariances + * 2. Conditional on the values of observations within the same group + * and on the values of the inducing points, an observation is + * independent of observations in other groups. 
* - * [f|u] ~ N(K_fu K_uu^-1 u, K_ff - Q_ff) + * This is known as the PIC (Partially Independent Conditional) + * approximation and was introduced in * - * We then set, - * - * A = K_ff - Q_ff - * = K_ff - K_fu K_uu^-1 K_uf - * - * which can be thought of as the covariance in the training data which - * is not be explained by the inducing points. The fundamental - * assumption in these sparse Gaussian processes is that A is sparse, in - * this case block diagonal. - * - * We then build a matrix B and use its QR decomposition (with pivoting P) - * - * B = |A^-1/2 K_fu| = |Q_1| R P^T - * |K_uu^{T/2} | |Q_2| - * - * After which we can get the information vector (see _fit_impl) - * - * v = (K_uu + K_uf A^-1 K_fu)^-1 K_uf A^-1 y - * = (B^T B) B^T A^-1/2 y - * = P R^-1 Q_1^T A^-1/2 y - * - * and can make predictions for new locations (see _predict_impl), - * - * [f*|f=y] ~ N(K_*u S K_uf A^-1 y, K_** - Q_** + K_*u S K_u*) - * ~ N(m, C) - * - * where we have - * - * m = K_*u S K_uf A^-1 y - * = K_*u v - * - * and - * - * C = K_** - Q_** + K_*u S K_u* - * - * using - * - * Q_** = K_*u K_uu^-1 K_u* - * = (K_uu^{-1/2} K_u*)^T (K_uu^{-1/2} K_u*) - * = Q_sqrt^T Q_sqrt - * and + * [1] Local and global sparse Gaussian process approximations + * Edward Snelson, Zoubin Ghahramani + * https://proceedings.mlr.press/v2/snelson07a/snelson07a.pdf + * + * The upshot of this method is that for enough small enough groups, + * the fit performance scales as O(N M^2) for N observations and M + * inducing points (just like FITC/PITC), but fine-grained local + * information from correlation groups can provide better predictive + * performance. + * + * Inducing point and group definitions are handled as in the + * FITC/PITC approximation class (see sparse_gp.hpp). If a prediction + * point is not correlated with any of the training observations (as + * defined by the grouper function), the resulting predicted + * distribution will be computed using only the fitted inducing + * points. 
+ * + * Several operations supported by other Albatross GP model classes, + * like "rebasing" (computing new inducing points from old ones) and + * reduced-complexity updating of a fit with new information, are not + * yet supported by the PIC approximation class. * - * K_*u S K_u* = K_*u (K_uu + K_uf A^-1 K_fu)^-1 K_u* - * = K_*u (B^T B)^-1 K_u* - * = K_*u (P R R^T P^T)^-1 K_u* - * = (P R^-T K_u*)^T (P R^-T K_u*) - * = S_sqrt^T S_sqrt */ template #include -// #include "albatross/src/eigen/serializable_ldlt.hpp" -// #include "albatross/src/linalg/block_diagonal.hpp" #include "test_models.h" #include #include From 9bb009edb226cf66cac804175abd8af605329e90 Mon Sep 17 00:00:00 2001 From: Matt Peddie Date: Mon, 2 Sep 2024 15:47:33 +1000 Subject: [PATCH 12/12] Update Albatross to use vendored MKL distribution --- WORKSPACE.bazel | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/WORKSPACE.bazel b/WORKSPACE.bazel index ab62240f..54325525 100644 --- a/WORKSPACE.bazel +++ b/WORKSPACE.bazel @@ -31,7 +31,7 @@ http_archive( build_file = "@rules_swiftnav//third_party:mkl_headers.BUILD", sha256 = "b24d12a8e18ba23de5c659a33fb184a7ac6019d4b159e78f628d7c8de225f77a", urls = [ - "https://anaconda.org/intel/mkl-include/2023.1.0/download/linux-64/mkl-include-2023.1.0-intel_46342.tar.bz2", + "https://swiftnav-public-mkl-sharedlibs.s3.us-west-2.amazonaws.com/mkl-include-2023.1.0-intel_46342.tar.bz2", ], ) @@ -41,7 +41,7 @@ http_archive( sha256 = "c63adbfdbdc7c4992384a2d89cd62211e4a9f8061e3e841af1a269699531cb02", strip_prefix = "lib", urls = [ - "https://anaconda.org/intel/mkl-static/2023.1.0/download/linux-64/mkl-static-2023.1.0-intel_46342.tar.bz2", + "https://swiftnav-public-mkl-sharedlibs.s3.us-west-2.amazonaws.com/mkl-static-2023.1.0-intel_46342.tar.bz2", ], )