From e9638eaee2397147761daf655a2c0d21a0fc7a59 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Sun, 16 Oct 2016 23:25:11 -0400 Subject: [PATCH 01/10] Fixes #2082. Adding test for multiple translation units and inlining remaining functions --- make/tests | 17 +++++++++ src/stan/command/stanc_helper.hpp | 12 +++---- src/stan/io/cmd_line.hpp | 36 +++++++++---------- src/stan/lang/rethrow_located.hpp | 2 +- src/test/unit/multiple_translation_units1.cpp | 5 +++ src/test/unit/multiple_translation_units2.cpp | 5 +++ .../unit/multiple_translation_units_test.cpp | 7 ++++ 7 files changed, 59 insertions(+), 25 deletions(-) create mode 100644 src/test/unit/multiple_translation_units1.cpp create mode 100644 src/test/unit/multiple_translation_units2.cpp create mode 100644 src/test/unit/multiple_translation_units_test.cpp diff --git a/make/tests b/make/tests index 91663e22106..097da960599 100644 --- a/make/tests +++ b/make/tests @@ -128,6 +128,23 @@ test/dummy.cpp: .PHONY: test-headers test-headers: $(HEADER_TESTS) +## +# Adding a test for multiple translation units. If this fails, +# a new function is probably missing an inline +## + +test/unit/multiple_translation_units%.o : src/test/unit/multiple_translation_units%.cpp + @mkdir -p $(dir $@) + $(COMPILE.c) -O$O $< $(OUTPUT_OPTION) + +test/unit/libmultiple.so : test/unit/multiple_translation_units1.o test/unit/multiple_translation_units2.o + @mkdir -p $(dir $@) + $(CC) -shared $(OUTPUT_OPTION) $^ + +src/test/unit/multiple_translation_units_test.cpp : test/unit/libmultiple.so + + + ############################################################ ## # Use the stanc compiler to generate C++ from Stan programs diff --git a/src/stan/command/stanc_helper.hpp b/src/stan/command/stanc_helper.hpp index 5745459119e..52aa98edd4e 100644 --- a/src/stan/command/stanc_helper.hpp +++ b/src/stan/command/stanc_helper.hpp @@ -11,7 +11,7 @@ #include -void print_version(std::ostream* out_stream) { +inline void print_version(std::ostream* out_stream) { if (!out_stream) return; *out_stream << "stanc version " << stan::MAJOR_VERSION @@ -25,7 +25,7 @@ void print_version(std::ostream* out_stream) { /** * Prints the Stan compiler (stanc) help. */ -void print_stanc_help(std::ostream* out_stream) { +inline void print_stanc_help(std::ostream* out_stream) { using stan::io::print_help_option; if (!out_stream) return; @@ -56,8 +56,8 @@ void print_stanc_help(std::ostream* out_stream) { "Do not fail if a function is declared but not defined"); } -void delete_file(std::ostream* err_stream, - const std::string& file_name) { +inline void delete_file(std::ostream* err_stream, + const std::string& file_name) { int deleted = std::remove(file_name.c_str()); if (deleted != 0 && file_name.size() > 0) if (err_stream) @@ -66,8 +66,8 @@ void delete_file(std::ostream* err_stream, } -int stanc_helper(int argc, const char* argv[], - std::ostream* out_stream, std::ostream* err_stream) { +inline int stanc_helper(int argc, const char* argv[], + std::ostream* out_stream, std::ostream* err_stream) { static const int SUCCESS_RC = 0; static const int EXCEPTION_RC = -1; static const int PARSE_FAIL_RC = -2; diff --git a/src/stan/io/cmd_line.hpp b/src/stan/io/cmd_line.hpp index 0d526130ea4..1ee8946e40d 100644 --- a/src/stan/io/cmd_line.hpp +++ b/src/stan/io/cmd_line.hpp @@ -24,9 +24,9 @@ namespace stan { * @param width Width of option (defaults to 20). 
* @param o Output stream ptr, default to null */ - void pad_help_option(std::ostream* o, - const std::string& option = "", - unsigned int width = 20) { + inline void pad_help_option(std::ostream* o, + const std::string& option = "", + unsigned int width = 20) { if (!o) return; *o << " " << option; int padding = width - option.size(); @@ -46,10 +46,10 @@ namespace stan { * @param msg * @param note */ - void print_help_helper(std::ostream* o, - const std::string& key_val, - const std::string& msg, - const std::string& note = "") { + inline void print_help_helper(std::ostream* o, + const std::string& key_val, + const std::string& msg, + const std::string& note = "") { if (!o) return; pad_help_option(o, key_val); *o << msg @@ -71,11 +71,11 @@ namespace stan { * @param msg * @param note */ - void print_help_option(std::ostream* o, - const std::string& key, - const std::string& value_type, - const std::string& msg, - const std::string& note = "") { + inline void print_help_option(std::ostream* o, + const std::string& key, + const std::string& value_type, + const std::string& msg, + const std::string& note = "") { std::stringstream ss; ss << "--" << key; if (value_type.size() > 0) @@ -187,7 +187,7 @@ namespace stan { * @tparam Type of value. */ template - bool val(const std::string& key, T& x) const { + inline bool val(const std::string& key, T& x) const { if (!has_key(key)) return false; std::stringstream s(key_val_.find(key)->second); @@ -210,7 +210,7 @@ namespace stan { * * @return Number of bare arguments. */ - size_t bare_size() const { + inline size_t bare_size() const { return bare_.size(); } @@ -228,7 +228,7 @@ namespace stan { * @tparam T Type of value returned. */ template - bool bare(size_t n, T& x) const { + inline bool bare(size_t n, T& x) const { if (n >= bare_.size()) return false; std::stringstream s(bare_[n]); @@ -272,7 +272,7 @@ namespace stan { // explicit instantation for std::string to allow for spaces // in bare_[n] template <> - bool cmd_line::bare(size_t n, std::string& x) const { + inline bool cmd_line::bare(size_t n, std::string& x) const { if (n >= bare_.size()) return false; x = bare_[n]; @@ -282,8 +282,8 @@ namespace stan { // explicit instantation for std::string to allow for spaces // in key_val_ template <> - bool cmd_line::val(const std::string& key, - std::string& x) const { + inline bool cmd_line::val(const std::string& key, + std::string& x) const { if (!has_key(key)) return false; x = key_val_.find(key)->second; diff --git a/src/stan/lang/rethrow_located.hpp b/src/stan/lang/rethrow_located.hpp index f238b489d99..4c35acb7ac5 100644 --- a/src/stan/lang/rethrow_located.hpp +++ b/src/stan/lang/rethrow_located.hpp @@ -84,7 +84,7 @@ namespace stan { * @param[in] line Line number in Stan source program where * exception originated. 
*/ - void rethrow_located(const std::exception& e, int line) { + inline void rethrow_located(const std::exception& e, int line) { using std::bad_alloc; // -> exception using std::bad_cast; // -> exception using std::bad_exception; // -> exception diff --git a/src/test/unit/multiple_translation_units1.cpp b/src/test/unit/multiple_translation_units1.cpp new file mode 100644 index 00000000000..20d5b08375d --- /dev/null +++ b/src/test/unit/multiple_translation_units1.cpp @@ -0,0 +1,5 @@ +#include + +stan::math::var function1() { + return 0; +} diff --git a/src/test/unit/multiple_translation_units2.cpp b/src/test/unit/multiple_translation_units2.cpp new file mode 100644 index 00000000000..fe6bb4c27ef --- /dev/null +++ b/src/test/unit/multiple_translation_units2.cpp @@ -0,0 +1,5 @@ +#include + +stan::math::var function2() { + return 0; +} diff --git a/src/test/unit/multiple_translation_units_test.cpp b/src/test/unit/multiple_translation_units_test.cpp new file mode 100644 index 00000000000..45795f6bb76 --- /dev/null +++ b/src/test/unit/multiple_translation_units_test.cpp @@ -0,0 +1,7 @@ +#include + +TEST(multiple_translation_units, compile) { + SUCCEED() + << "this test compiling indicates that compiling the stan library " + << "with multiple translation units is ok."; +} From 99373aefa142d679598e3e0cc703eb02bd7896ef Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Sat, 22 Oct 2016 22:27:26 -0400 Subject: [PATCH 02/10] Fixing makefile so it respects the command-line option --- make/os_win | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/make/os_win b/make/os_win index eaad45ec15e..baf4b482bf9 100644 --- a/make/os_win +++ b/make/os_win @@ -14,15 +14,15 @@ ## EXE = .exe -O_STANC = 3 +O_STANC ?= 3 CFLAGS_GTEST += -DGTEST_HAS_PTHREAD=0 PATH_SEPARATOR = '\' ## Detecting Process Bitness: ## http://blogs.msdn.com/b/david.wang/archive/2006/03/26/howto-detect-process-bitness.aspx ifeq ($(PROCESSOR_ARCHITECTURE)$(PROCESSOR_ARCHITEW6432),x86) - BIT=32 + BIT?=32 else - BIT=64 + BIT?=64 endif ifeq (g++,$(CC_TYPE)) CFLAGS += -m$(BIT) From c0cbe376f7a6169d80c3648a978123519e4357d3 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Tue, 25 Oct 2016 23:55:25 -0400 Subject: [PATCH 03/10] Updating make file --- make/tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/make/tests b/make/tests index 097da960599..db5b99035e6 100644 --- a/make/tests +++ b/make/tests @@ -139,7 +139,7 @@ test/unit/multiple_translation_units%.o : src/test/unit/multiple_translation_uni test/unit/libmultiple.so : test/unit/multiple_translation_units1.o test/unit/multiple_translation_units2.o @mkdir -p $(dir $@) - $(CC) -shared $(OUTPUT_OPTION) $^ + $(CC) $(CFLAGS) -shared $(OUTPUT_OPTION) $^ src/test/unit/multiple_translation_units_test.cpp : test/unit/libmultiple.so From ee8f23e102a666f3bd1073a89fd9d8b34adf2e24 Mon Sep 17 00:00:00 2001 From: stan-buildbot Date: Thu, 15 Jun 2017 20:30:12 -0400 Subject: [PATCH 04/10] Updates the Math submodule to 8008de4. 
--- lib/stan_math | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/stan_math b/lib/stan_math index 87ec15ad842..8008de4e4d8 160000 --- a/lib/stan_math +++ b/lib/stan_math @@ -1 +1 @@ -Subproject commit 87ec15ad842fbeba2c2b6733b8839f6ea86e38f3 +Subproject commit 8008de4e4d867ae21e6a66d561588a4ebb80e866 From 286fe9567a6e9a3e1df9a457d45149a7b0686b5c Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Fri, 16 Jun 2017 21:43:40 -0400 Subject: [PATCH 05/10] Adding inline to more functions --- src/stan/command/stanc_helper.hpp | 170 +++++++++++++------------- src/stan/services/util/create_rng.hpp | 4 +- 2 files changed, 87 insertions(+), 87 deletions(-) diff --git a/src/stan/command/stanc_helper.hpp b/src/stan/command/stanc_helper.hpp index 3419ee8b2f7..1adf51700c0 100644 --- a/src/stan/command/stanc_helper.hpp +++ b/src/stan/command/stanc_helper.hpp @@ -91,7 +91,7 @@ inline void delete_file(std::ostream* err_stream, * @param[in] in_file_name the name of the input file * @return a valid C++ identifier based on the file name. */ -std::string identifier_from_file_name(const std::string& in_file_name) { +inline std::string identifier_from_file_name(const std::string& in_file_name) { size_t slashInd = in_file_name.rfind('/'); size_t ptInd = in_file_name.rfind('.'); if (ptInd == std::string::npos) @@ -107,7 +107,7 @@ std::string identifier_from_file_name(const std::string& in_file_name) { std::string result = in_file_name.substr(slashInd, ptInd - slashInd); for (std::string::iterator strIt = result.begin(); - strIt != result.end(); strIt++) { + strIt != result.end(); strIt++) { if (!isalnum(*strIt) && *strIt != '_') { *strIt = '_'; } @@ -124,12 +124,12 @@ std::string identifier_from_file_name(const std::string& in_file_name) { * @param[in] extension The extension (WITHOUT dot)- e.g. "stan". 
* @return true if the file has the extension */ -bool has_extension(const std::string& file_name, const std::string& extension) { +inline bool has_extension(const std::string& file_name, const std::string& extension) { if (file_name.length() >= extension.length() + 1) { // +1 for the dot if (0 == file_name.compare (file_name.length() - extension.length(), - extension.length(), extension) - && file_name[file_name.length() - extension.length() - 1] == '.') - return true; + extension.length(), extension) + && file_name[file_name.length() - extension.length() - 1] == '.') + return true; else return false; } else { @@ -144,18 +144,18 @@ bool has_extension(const std::string& file_name, const std::string& extension) { * @param[in] identifier_type the type of the identifier to be reported * in error messages */ -void check_identifier(const std::string& identifier, - const std::string& identifier_type) { +inline void check_identifier(const std::string& identifier, + const std::string& identifier_type) { if (!isalpha(identifier[0]) && identifier[0] != '_') { std::string msg(identifier_type + " must not start with a " "number or symbol other than _"); throw std::invalid_argument(msg); } for (std::string::const_iterator strIt = identifier.begin(); - strIt != identifier.end(); strIt++) { + strIt != identifier.end(); strIt++) { if (!isalnum(*strIt) && *strIt != '_') { std::string msg(identifier_type - + " must contain only letters, numbers and _"); + + " must contain only letters, numbers and _"); throw std::invalid_argument(msg); } } @@ -232,95 +232,95 @@ inline int stanc_helper(int argc, const char* argv[], bool valid_input = false; switch (compilation_type) { - case kModel: { - std::string model_name; - if (cmd.has_key("name")) { - cmd.val("name", model_name); - } else { - model_name = identifier_from_file_name(in_file_name) + "_model"; - } - - // TODO(martincerny) Check that the -namespace flag is not set + case kModel: { + std::string model_name; + if (cmd.has_key("name")) { + cmd.val("name", model_name); + } else { + model_name = identifier_from_file_name(in_file_name) + "_model"; + } - if (cmd.has_key("o")) { - cmd.val("o", out_file_name); - } else { - out_file_name = model_name; - // TODO(carpenter): shouldn't this be .hpp without a main()? - out_file_name += ".cpp"; - } + // TODO(martincerny) Check that the -namespace flag is not set - check_identifier(model_name, "model_name"); + if (cmd.has_key("o")) { + cmd.val("o", out_file_name); + } else { + out_file_name = model_name; + // TODO(carpenter): shouldn't this be .hpp without a main()? 
+ out_file_name += ".cpp"; + } - std::fstream out(out_file_name.c_str(), std::fstream::out); - if (out_stream) { - *out_stream << "Model name=" << model_name << std::endl; - *out_stream << "Input file=" << in_file_name << std::endl; - *out_stream << "Output file=" << out_file_name << std::endl; - } - if (!out.is_open()) { - std::stringstream msg; - msg << "Failed to open output file " - << out_file_name.c_str(); - throw std::invalid_argument(msg.str()); - } + check_identifier(model_name, "model_name"); - valid_input = stan::lang::compile(err_stream, in, out, model_name, - allow_undefined, in_file_name, include_paths); - out.close(); - break; + std::fstream out(out_file_name.c_str(), std::fstream::out); + if (out_stream) { + *out_stream << "Model name=" << model_name << std::endl; + *out_stream << "Input file=" << in_file_name << std::endl; + *out_stream << "Output file=" << out_file_name << std::endl; + } + if (!out.is_open()) { + std::stringstream msg; + msg << "Failed to open output file " + << out_file_name.c_str(); + throw std::invalid_argument(msg.str()); } - case kStandaloneFunctions: { - if (cmd.has_key("o")) { - cmd.val("o", out_file_name); - } else { - out_file_name = identifier_from_file_name(in_file_name); - out_file_name += ".hpp"; - } - // TODO(martincerny) Allow multiple namespaces - // (split namespace argument by "::") - std::vector namespaces; - if (cmd.has_key("namespace")) { - std::string ns; - cmd.val("namespace", ns); - namespaces.push_back(ns); - } else { - namespaces.push_back( - identifier_from_file_name(in_file_name) + "_functions"); - } + valid_input = stan::lang::compile(err_stream, in, out, model_name, + allow_undefined, in_file_name, include_paths); + out.close(); + break; + } + case kStandaloneFunctions: { + if (cmd.has_key("o")) { + cmd.val("o", out_file_name); + } else { + out_file_name = identifier_from_file_name(in_file_name); + out_file_name += ".hpp"; + } - // TODO(martincerny) Check that the -name flag is not set + // TODO(martincerny) Allow multiple namespaces + // (split namespace argument by "::") + std::vector namespaces; + if (cmd.has_key("namespace")) { + std::string ns; + cmd.val("namespace", ns); + namespaces.push_back(ns); + } else { + namespaces.push_back( + identifier_from_file_name(in_file_name) + "_functions"); + } - for (size_t namespace_i = 0; - namespace_i < namespaces.size(); ++namespace_i) { - check_identifier(namespaces[namespace_i], "namespace"); - } + // TODO(martincerny) Check that the -name flag is not set - std::fstream out(out_file_name.c_str(), std::fstream::out); - if (out_stream) { - *out_stream << "Parsing a fuctions-only file" << std::endl; + for (size_t namespace_i = 0; + namespace_i < namespaces.size(); ++namespace_i) { + check_identifier(namespaces[namespace_i], "namespace"); + } - *out_stream << "Target namespace= "; - for (size_t namespace_i = 0; - namespace_i < namespaces.size(); ++namespace_i) { - *out_stream << "::" << namespaces[namespace_i]; - } - *out_stream << std::endl; + std::fstream out(out_file_name.c_str(), std::fstream::out); + if (out_stream) { + *out_stream << "Parsing a fuctions-only file" << std::endl; - *out_stream << "Input file=" << in_file_name << std::endl; - *out_stream << "Output file=" << out_file_name << std::endl; + *out_stream << "Target namespace= "; + for (size_t namespace_i = 0; + namespace_i < namespaces.size(); ++namespace_i) { + *out_stream << "::" << namespaces[namespace_i]; } + *out_stream << std::endl; - valid_input = stan::lang::compile_functions(err_stream, in, out, - namespaces, 
allow_undefined); - - out.close(); - break; - } - default: { - assert(false); + *out_stream << "Input file=" << in_file_name << std::endl; + *out_stream << "Output file=" << out_file_name << std::endl; } + + valid_input = stan::lang::compile_functions(err_stream, in, out, + namespaces, allow_undefined); + + out.close(); + break; + } + default: { + assert(false); + } } if (!valid_input) { diff --git a/src/stan/services/util/create_rng.hpp b/src/stan/services/util/create_rng.hpp index 071327d6a6e..d32df0ee483 100644 --- a/src/stan/services/util/create_rng.hpp +++ b/src/stan/services/util/create_rng.hpp @@ -22,8 +22,8 @@ namespace stan { * @param[in] chain the chain id * @return a boost::ecuyer1988 instance */ - boost::ecuyer1988 create_rng(unsigned int seed, - unsigned int chain) { + inline boost::ecuyer1988 create_rng(unsigned int seed, + unsigned int chain) { using boost::uintmax_t; static uintmax_t DISCARD_STRIDE = static_cast(1) << 50; boost::ecuyer1988 rng(seed); From 89977689b71a65ff5f6eca6834f9f9df4b69960b Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Fri, 16 Jun 2017 23:18:09 -0400 Subject: [PATCH 06/10] Reverting makefile change for windows --- make/os_win | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/make/os_win b/make/os_win index baf4b482bf9..b207a29fc7a 100644 --- a/make/os_win +++ b/make/os_win @@ -5,7 +5,7 @@ # - CFLAGS_GTEST # - OPT # - EXE -# +# # Additionally, set the following variables to build # demo/lang using the Microsoft Visual Studio compiler. # - CC_STANC @@ -14,7 +14,7 @@ ## EXE = .exe -O_STANC ?= 3 +O_STANC = 3 CFLAGS_GTEST += -DGTEST_HAS_PTHREAD=0 PATH_SEPARATOR = '\' ## Detecting Process Bitness: @@ -40,4 +40,3 @@ ifeq (mingw32-g++,$(CC_TYPE)) endif ifeq (clang++,$(CC_TYPE)) endif - From f09a470d1d613068b976e3a81c0ecf50b945de2d Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Sat, 17 Jun 2017 01:32:25 -0400 Subject: [PATCH 07/10] cpplint --- src/stan/command/stanc_helper.hpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/stan/command/stanc_helper.hpp b/src/stan/command/stanc_helper.hpp index 1adf51700c0..6ad8d483f6d 100644 --- a/src/stan/command/stanc_helper.hpp +++ b/src/stan/command/stanc_helper.hpp @@ -124,7 +124,8 @@ inline std::string identifier_from_file_name(const std::string& in_file_name) { * @param[in] extension The extension (WITHOUT dot)- e.g. "stan". 
* @return true if the file has the extension */ -inline bool has_extension(const std::string& file_name, const std::string& extension) { +inline bool has_extension(const std::string& file_name, + const std::string& extension) { if (file_name.length() >= extension.length() + 1) { // +1 for the dot if (0 == file_name.compare (file_name.length() - extension.length(), extension.length(), extension) @@ -266,7 +267,8 @@ inline int stanc_helper(int argc, const char* argv[], } valid_input = stan::lang::compile(err_stream, in, out, model_name, - allow_undefined, in_file_name, include_paths); + allow_undefined, in_file_name, + include_paths); out.close(); break; } @@ -286,8 +288,8 @@ inline int stanc_helper(int argc, const char* argv[], cmd.val("namespace", ns); namespaces.push_back(ns); } else { - namespaces.push_back( - identifier_from_file_name(in_file_name) + "_functions"); + namespaces.push_back(identifier_from_file_name(in_file_name) + + "_functions"); } // TODO(martincerny) Check that the -name flag is not set From 1197f40f2c8836d2b8fdb3c903f980d205c476dc Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Sat, 17 Jun 2017 01:44:23 -0400 Subject: [PATCH 08/10] Fixes #2332. Updates performance test. --- src/test/performance/logistic_test.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/test/performance/logistic_test.cpp b/src/test/performance/logistic_test.cpp index 2c3710f5dca..101dbef8045 100644 --- a/src/test/performance/logistic_test.cpp +++ b/src/test/performance/logistic_test.cpp @@ -105,19 +105,19 @@ TEST_F(performance, run) { TEST_F(performance, values_from_tagged_version) { int N_values = 9; ASSERT_EQ(N_values, last_draws_per_run[0].size()) - << "last tagged version, 2.9.0, had " << N_values << " elements"; + << "last tagged version, 2.16.0, had " << N_values << " elements"; std::vector first_run = last_draws_per_run[0]; - EXPECT_FLOAT_EQ(-65.319603, first_run[0]) + EXPECT_FLOAT_EQ(-65.508499, first_run[0]) << "lp__: index 0"; - EXPECT_FLOAT_EQ(0.96712297, first_run[1]) + EXPECT_FLOAT_EQ(0.78501701, first_run[1]) << "accept_stat__: index 1"; - EXPECT_FLOAT_EQ(1.06545, first_run[2]) + EXPECT_FLOAT_EQ(0.76857001, first_run[2]) << "stepsize__: index 2"; - EXPECT_FLOAT_EQ(2, first_run[3]) + EXPECT_FLOAT_EQ(1, first_run[3]) << "treedepth__: index 3"; EXPECT_FLOAT_EQ(3, first_run[4]) @@ -126,13 +126,13 @@ TEST_F(performance, values_from_tagged_version) { EXPECT_FLOAT_EQ(0, first_run[5]) << "divergent__: index 5"; - EXPECT_FLOAT_EQ(66.090698, first_run[6]) + EXPECT_FLOAT_EQ(68.208702, first_run[6]) << "energy__: index 6"; - EXPECT_FLOAT_EQ(1.4068201, first_run[7]) + EXPECT_FLOAT_EQ(1.4087501, first_run[7]) << "beta.1: index 7"; - EXPECT_FLOAT_EQ(-0.59367198, first_run[8]) + EXPECT_FLOAT_EQ(-0.69222099, first_run[8]) << "beta.2: index 8"; matches_tagged_version = !HasNonfatalFailure(); From 26a7c84e19fd0a38761765b7533bed5cd10702b3 Mon Sep 17 00:00:00 2001 From: Bob Carpenter Date: Sat, 17 Jun 2017 23:54:41 -0400 Subject: [PATCH 09/10] manual for 2.16, fixes #2265 --- src/docs/stan-reference/acknowledgements.tex | 33 +- src/docs/stan-reference/distributions.tex | 70 +- src/docs/stan-reference/examples.tex | 686 +++++++++---------- src/docs/stan-reference/language.tex | 136 ++-- src/docs/stan-reference/programming.tex | 469 +++++++------ src/docs/stan-reference/stan-reference.tex | 2 +- src/docs/stan-reference/title.tex | 21 +- 7 files changed, 724 insertions(+), 693 deletions(-) diff --git a/src/docs/stan-reference/acknowledgements.tex 
b/src/docs/stan-reference/acknowledgements.tex index 1023640d60d..b33b96e5fdc 100644 --- a/src/docs/stan-reference/acknowledgements.tex +++ b/src/docs/stan-reference/acknowledgements.tex @@ -15,7 +15,7 @@ \section*{Grants and Corporate Support} \subsection*{Current Grants} \begin{itemize} -\item +\item U.~S.\ Department of Education Institute of Education Sciences \begin{itemize}\small \item Statistical and Research Methodology: Solving @@ -39,13 +39,13 @@ \subsection*{Previous Grants} % \begin{itemize} \item -U.~S.\ Department of Energy +U.~S.\ Department of Energy \begin{itemize}\small \item DE-SC0002099: Petascale Computing \end{itemize} % \item -U.~S.\ National Science Foundation +U.~S.\ National Science Foundation \begin{itemize}\small \item ATM-0934516: Reconstructing Climate from Tree Ring Data @@ -54,7 +54,7 @@ \subsection*{Previous Grants} \end{itemize} % \item -U.~S.\ Department of Education Institute of Education Sciences +U.~S.\ Department of Education Institute of Education Sciences \begin{itemize}\small \item ED-GRANTS-032309-005: Practical Tools for Multilevel Hierarchical Modeling in Education @@ -92,12 +92,12 @@ \subsection*{Code and Doc Patches} to: Ethan Adams, Avraham Adler, -Jarret Barber, +Jarret Barber, David R.~Blair, -Miguel de Val-Borro, -Ross Boylan, -Eric N.~Brown, -Devin Caughey, +Miguel de Val-Borro, +Ross Boylan, +Eric N.~Brown, +Devin Caughey, Emmanuel Charpentier, Daniel Chen, Jacob Egner, @@ -116,6 +116,7 @@ \subsection*{Code and Doc Patches} Dan Lakeland, Devin Leopold, Nathanael I.~Lichti, +Jussi M\"a\"att\"a, Titus van der Malsburg, P.~D.~Metcalfe, Kyle Meyer, @@ -155,8 +156,9 @@ \subsection*{Code and Doc Patches} Alex Chase, Daniel Chen, Roman Cheplyaka, -Andy Choi, +Andy Choi, David Chudzicki, +Michael Clerx, Andria Dawson, daydreamt (GitHub handle), Conner DiPaolo, @@ -190,15 +192,18 @@ \subsection*{Code and Doc Patches} Tobias Madsen, Stefano Mangiola, David Manheim, +Stephen Martin, Sean Matthews, David Mawdsley, Dieter Menne, Evelyn Mitchell, +Javier Moreno, Robert Myles,xs Sunil Nandihalli, Eric Novik, Julia Palacios, -Tamas Papp, +Tamas Papp, +Anders Gorm Pedersen, Tomi Peltola, Andre Pfeuffer, Sergio Polini, @@ -224,6 +229,7 @@ \subsection*{Code and Doc Patches} Dougal Sutherland, John Sutton, Maciej Swat, +J.~Takoua, Andrew J.~Tanentzap, Shravan Vashisth, Aki Vehtari, @@ -234,8 +240,9 @@ \subsection*{Code and Doc Patches} Sam Weiss, Luke Wiklendt, wrobell (GitHub handle), -Howard Zail, and -Jon Zelner. +Howard Zail, +Jon Zelner, and +Xiubo Zhang Thanks to Kevin van Horn for install instructions for Cygwin and to Kyle Foreman for instructions on using the MKL compiler. diff --git a/src/docs/stan-reference/distributions.tex b/src/docs/stan-reference/distributions.tex index f144db5b120..d725717a94f 100644 --- a/src/docs/stan-reference/distributions.tex +++ b/src/docs/stan-reference/distributions.tex @@ -4,7 +4,7 @@ \chapter{Conventions for Probability Functions} Functions associated with distributions are set up to follow the same naming conventions for both built-in distributions and for user-defined -distributions. +distributions. 
\section{Suffix Marks Type of Function} @@ -18,7 +18,7 @@ \section{Suffix Marks Type of Function} log probability density function & continuous & \code{\_lpdf} \\ \hline log cumulative distribution function & any & \code{\_lcdf} \\ -log complementary cumulative distribution function & any & \code{\_lccdf} \\ +log complementary cumulative distribution function & any & \code{\_lccdf} \\ \hline random number generator & any & \code{\_rng} \end{tabular} @@ -114,14 +114,14 @@ \section{Cumulative Distribution Functions} | \, \theta)$, the cumulative distribution function (CDF) $F_Y$ is defined by \[ -F_Y(y) +F_Y(y) \ = \ -\mbox{Pr}[Y < y] +\mbox{Pr}[Y < y] \ = \ \int_{-\infty}^y p(y \, | \, \theta) \ \mathrm{d}\theta. \] The complementary cumulative distribution function (CCDF) is defined -as +as \[ \mbox{Pr}[Y \geq y] \ = \ @@ -134,7 +134,7 @@ \section{Cumulative Distribution Functions} In Stan, there is a cumulative distribution function for each probability function. For instance, \code{normal\_cdf(y,~mu,~sigma)} -is defined by +is defined by % \[ \int_0^y \distro{Normal}(y \, | \, \mu, \sigma) \ \mathrm{d}y. @@ -182,7 +182,7 @@ \section{Vectorization}\label{vectorization.section} covariance matrix is only done once. Stan also overloads some scalar functions, such as \code{log} and -\code{exp}, to apply to vectors (arrays) and return vectors (arrays). +\code{exp}, to apply to vectors (arrays) and return vectors (arrays). These vectorizations are defined elementwise and unlike the probability functions, provide only minimal efficiency speedups over repeated application and assignment in a loop. @@ -205,7 +205,7 @@ \subsubsection{Vectorized Scalar Arguments} argument, their types can be anything but their size must match. For instance, it is legal to use \code{normal\_lpdf(row\_vector~|~vector,~real)} as long as the vector and -row vector have the same size. +row vector have the same size. \subsubsection{Vectorized Vector and Row Vector Arguments} @@ -447,7 +447,7 @@ \subsubsection{Probability Mass Function} \begin{eqnarray*} \distro{BinomialLogit}(n|N,\alpha) -& = & \distro{BinomialLogit}(n|N,\mbox{logit}^{-1}(\alpha)) +& = & \distro{Binomial}(n|N,\mbox{logit}^{-1}(\alpha)) \\[6pt] & = & \binom{N}{n} \left( \mbox{logit}^{-1}(\alpha) \right)^{n} \left( 1 - \mbox{logit}^{-1}(\alpha) \right)^{N - n}. 
@@ -1005,7 +1005,7 @@ \subsubsection{Stan Functions} \code{N = sum(\farg{y})}} % \fitem{int[]}{multinomial\_rng}{vector \farg{theta}, int \farg{N}}{Generate a - multinomial variate with simplex distribution parameter \farg{theta} and + multinomial variate with simplex distribution parameter \farg{theta} and total count \farg{N}; may only be used in generated quantities block} \end{description} @@ -1072,7 +1072,7 @@ \subsubsection{Stan Functions} \end{description} % \begin{description} -\fitem{real}{normal\_rng}{real \farg{mu} \textbar\ real \farg{sigma}}{Generate a normal +\fitem{real}{normal\_rng}{real \farg{mu}, real \farg{sigma}}{Generate a normal variate with location \farg{mu} and scale \farg{sigma}; may only be used in generated quantities block} \end{description} @@ -1130,51 +1130,51 @@ \section{Skew Normal Distribution} \subsubsection{Probability Density Function} -If $\mu \in \reals$, $\sigma \in \posreals$, and $\alpha \in \reals$, +If $\xi \in \reals$, $\omega \in \posreals$, and $\alpha \in \reals$, then for $y \in \reals$, \[ -\distro{SkewNormal}(y|\mu,\sigma,\alpha) +\distro{SkewNormal}(y \mid \xi, \omega, \alpha) = -\frac{1}{\sigma\sqrt{2\pi}} +\frac{1}{\omega\sqrt{2\pi}} \ \exp\left( - \, \frac{1}{2} - \left( \frac{y - \mu}{\sigma} \right)^2 + \left( \frac{y - \xi}{\omega} \right)^2 \right) \ -\left(1 + \erf\left( \alpha\left(\frac{y - \mu}{\sigma\sqrt{2}}\right)\right)\right) +\left(1 + \erf\left( \alpha\left(\frac{y - \xi}{\omega\sqrt{2}}\right)\right)\right) . \] -\pitem{y}{skew\_normal}{mu, sigma, alpha} +\pitem{y}{skew\_normal}{xi, omega, alpha} \subsubsection{Stan Functions} \begin{description} - \fitem{real}{skew\_normal\_lpdf}{reals \farg{y} \textbar\ reals \farg{mu}, reals - \farg{sigma}, reals \farg{alpha}}{The log of the skew normal density - of \farg{y} given location \farg{mu}, scale \farg{sigma}, and + \fitem{real}{skew\_normal\_lpdf}{reals \farg{y} \textbar\ reals \farg{xi}, reals + \farg{omega}, reals \farg{alpha}}{The log of the skew normal density + of \farg{y} given location \farg{xi}, scale \farg{omega}, and shape \farg{alpha}} % -\fitem{real}{skew\_normal\_cdf}{reals \farg{y}, reals \farg{mu}, reals - \farg{sigma}, reals \farg{alpha}}{The skew normal - distribution function of \farg{y} given location \farg{mu}, scale - \farg{sigma}, and shape \farg{alpha}} +\fitem{real}{skew\_normal\_cdf}{reals \farg{y}, reals \farg{xi}, reals + \farg{omega}, reals \farg{alpha}}{The skew normal + distribution function of \farg{y} given location \farg{xi}, scale + \farg{omega}, and shape \farg{alpha}} % -\fitemtwolines{real}{skew\_normal\_lcdf}{reals \farg{y} \textbar\ reals \farg{mu}, reals - \farg{sigma}}{reals \farg{alpha}}{The log of the skew normal cumulative - distribution function of \farg{y} given location \farg{mu}, scale - \farg{sigma}, and shape \farg{alpha}} +\fitemtwolines{real}{skew\_normal\_lcdf}{reals \farg{y} \textbar\ reals \farg{xi}, reals + \farg{omega}}{reals \farg{alpha}}{The log of the skew normal cumulative + distribution function of \farg{y} given location \farg{xi}, scale + \farg{omega}, and shape \farg{alpha}} % -\fitemtwolines{real}{skew\_normal\_lccdf}{reals \farg{y} \textbar\ reals \farg{mu}, reals - \farg{sigma}}{reals \farg{alpha}}{The log of the skew normal complementary - cumulative distribution function of \farg{y} given location \farg{mu}, scale - \farg{sigma}, and shape \farg{alpha}} +\fitemtwolines{real}{skew\_normal\_lccdf}{reals \farg{y} \textbar\ reals \farg{xi}, reals + \farg{omega}}{reals \farg{alpha}}{The log of the skew normal 
complementary + cumulative distribution function of \farg{y} given location \farg{xi}, scale + \farg{omega}, and shape \farg{alpha}} \end{description} % \begin{description} -\fitem{real}{skew\_normal\_rng}{real \farg{mu}, real \farg{sigma}, +\fitem{real}{skew\_normal\_rng}{real \farg{xi}, real \farg{omega}, real \farg{alpha}}{Generate a skew normal variate with location - \farg{mu}, scale \farg{sigma}, and shape \farg{alpha}; may only be used in + \farg{xi}, scale \farg{omega}, and shape \farg{alpha}; may only be used in generated quantities block} \end{description} @@ -1297,7 +1297,7 @@ \subsubsection{Probability Density Function} terms of inverse scale. The double-exponential distribution can be defined as a compound -exponential-normal distribution. Specifically, if +exponential-normal distribution. Specifically, if \[ \alpha \sim \mathsf{Exponential}\left( \frac{1}{\lambda} \right) \] diff --git a/src/docs/stan-reference/examples.tex b/src/docs/stan-reference/examples.tex index ab7f9952fbf..45301e8c07c 100644 --- a/src/docs/stan-reference/examples.tex +++ b/src/docs/stan-reference/examples.tex @@ -4,7 +4,7 @@ \chapter{Regression Models} \noindent Stan supports regression models from simple linear regressions to -multilevel generalized linear models. +multilevel generalized linear models. \section{Linear Regression} @@ -15,7 +15,7 @@ \section{Linear Regression} % \[ y_n = \alpha + \beta x_n + \epsilon_n -\ \ \ \mbox{where} \ \ \ +\ \ \ \mbox{where} \ \ \ \epsilon_n \sim \distro{Normal}(0,\sigma). \] This is equivalent to the following sampling involving the @@ -122,7 +122,7 @@ \subsection{Matrix Notation and Vectorization} predictions line up with the outcomes in the $N$-vector \code{y}, so the entire model may be written using matrix arithmetic as shown. It would be possible to include a column of 1 values in \code{x} and -remove the \code{alpha} parameter. +remove the \code{alpha} parameter. The sampling statement in the model above is just a more efficient, vector-based approach to coding the model with a loop, as in the @@ -136,7 +136,7 @@ \subsection{Matrix Notation and Vectorization} \end{stancode} % With Stan's matrix indexing scheme, \code{x[n]} picks out row \code{n} -of the matrix \code{x}; because \code{beta} is a column vector, +of the matrix \code{x}; because \code{beta} is a column vector, the product \code{x[n] * beta} is a scalar of type \code{real}. \subsubsection{Intercepts as Inputs} @@ -160,18 +160,18 @@ \subsubsection{Intercepts as Inputs} \section{The QR Reparameterization}\label{QR-reparameterization.section} In the previous example, the linear predictor can be written as -$\eta = x \beta$, where $\eta$ is a $N$-vector of predictions, +$\eta = x \beta$, where $\eta$ is a $N$-vector of predictions, $x$ is a $N \times K$ matrix, and $\beta$ is a $K$-vector of coefficients. Presuming $N \geq K$, we can exploit the fact that any design matrix, $x$ can be decomposed using the thin QR decomposition into an orthogonal matrix $Q$ and an upper-triangular matrix $R$, i.e. $x = Q R$. See \ref{QR-decomposition} -for more information on the QR decomposition but note that \code{qr\_Q} and +for more information on the QR decomposition but note that \code{qr\_Q} and \code{qr\_R} implement the fat QR decomposition so here we thin it by including only $K$ columns in $Q$ and $K$ rows in $R$. Also, in practice, it is best to -write $x = Q^\ast R^\ast$ where $Q^\ast = Q \times \sqrt{n - 1}$ and -$R^\ast = \frac{1}{\sqrt{n - 1}} R$. 
Thus, we can equivalently write -$\eta = x \beta = Q R \beta = Q^\ast R^\ast \beta$. If we let -$\theta = R^\ast \beta$, then we have $\eta = Q^\ast \theta$ and +write $x = Q^\ast R^\ast$ where $Q^\ast = Q \times \sqrt{n - 1}$ and +$R^\ast = \frac{1}{\sqrt{n - 1}} R$. Thus, we can equivalently write +$\eta = x \beta = Q R \beta = Q^\ast R^\ast \beta$. If we let +$\theta = R^\ast \beta$, then we have $\eta = Q^\ast \theta$ and $\beta = R^{\ast^{-1}} \theta$. In that case, the previous Stan program becomes % \begin{stancode} @@ -242,11 +242,11 @@ \section{Priors for Coefficients and Scales}\label{regression-priors.section} \refsection{hierarchical-priors} and multivariate parameters in \refsection{multivariate-hierarchical-priors}. There is also a discussion of priors used to identify models in -\refsection{priors-for-identification}. +\refsection{priors-for-identification}. -However, as described in \refsection{QR-reparameterization}, if you do -not have an informative prior on the \emph{location} of the regression -coefficients, then you are better off reparameterizing your model so +However, as described in \refsection{QR-reparameterization}, if you do +not have an informative prior on the \emph{location} of the regression +coefficients, then you are better off reparameterizing your model so that the regression coefficients are a generated quantity. In that case, it usually does not matter very much what prior is used on on the reparameterized regression coefficients and almost any weakly informative @@ -305,7 +305,7 @@ \subsection{Proper Uniform Priors: Interval Constraints} \end{stancode} % This will implicitly give \code{sigma} a $\distro{Uniform}(0.1, 2.7)$ -prior. +prior. \subsubsection{Matching Support to Constraints} @@ -320,7 +320,7 @@ \subsubsection{Matching Support to Constraints} ... model { // *** bad *** : support narrower than constraint - sigma ~ uniform(0.1, 2.7); + sigma ~ uniform(0.1, 2.7); \end{stancode} % The sampling statement imposes a limited support for \code{sigma} in @@ -346,7 +346,7 @@ \subsection{``Uninformative'' Proper Priors} % If the prior scale, such as 1000, is several orders of magnitude larger than the estimated coefficients, then such a prior is -effectively providing no effect whatsoever. +effectively providing no effect whatsoever. We actively discourage users from using the default scale priors suggested through the BUGS examples \citep{LunnEtAl:2012}, such as @@ -379,7 +379,7 @@ \subsection{Truncated Priors} gives \code{sigma} a half-normal prior, technically % \[ -p(\sigma) +p(\sigma) \ = \ \frac{\distro{Normal}(\sigma | 0, 1000)} {1 - \distro{NormalCDF}(0 | 0, 1000)} @@ -427,7 +427,7 @@ \subsection{Bounded Priors} proper prior is to impose a uniform prior on a bounded scale. For example, we could declare the parameter for mean women's height to have a lower bound of one meter and an upper bound of three meters. -Surely the answer has to lie in that range. +Surely the answer has to lie in that range. Similarly, it is not uncommon to see priors for scale parameters that impose lower bounds of zero and upper bounds of very large numbers, @@ -461,7 +461,7 @@ \subsection{Fat-Tailed Priors and ``Default'' Priors} expected to be, but still leaves a lot of mass in its tails. The usual choice in such a situation is to use a Cauchy distribution for a prior, which can concentrate its mass around its median, but has tails -that are so fat that the variance is infinite. +that are so fat that the variance is infinite. 
Without specific information, the Cauchy prior is a very good default parameter choice for regression coefficients @@ -545,11 +545,11 @@ \section{Logistic and Probit Regression}\label{logistic-probit-regression.sectio } model { y ~ bernoulli_logit(alpha + beta * x); -} +} \end{stancode} % The noise parameter is built into the Bernoulli formulation here -rather than specified directly. +rather than specified directly. Logistic regression is a kind of generalized linear model with binary outcomes and the log odds (logit) link function, defined by @@ -558,7 +558,7 @@ \section{Logistic and Probit Regression}\label{logistic-probit-regression.sectio \mbox{logit}(v) = \log \left( \frac{v}{1-v} \right). \] % -The inverse of the link function appears in the model. +The inverse of the link function appears in the model. % \[ \mbox{logit}^{-1}(u) = \frac{1}{1 + \exp(-u)}. @@ -566,10 +566,10 @@ \section{Logistic and Probit Regression}\label{logistic-probit-regression.sectio % The model formulation above uses the logit-parameterized version of -the Bernoulli distribution, which is defined by +the Bernoulli distribution, which is defined by % \[ -\distro{BernoulliLogit}(y|\alpha) +\distro{BernoulliLogit}(y|\alpha) = \distro{Bernoulli}(y | \mbox{logit}^{-1}(\alpha)). \] @@ -596,7 +596,7 @@ \section{Logistic and Probit Regression}\label{logistic-probit-regression.sectio Other link functions may be used in the same way. For example, probit regression uses the cumulative normal distribution function, which is -typically written as +typically written as \[ \Phi(x) = \int_{-\infty}^x \distro{Normal}(y|0,1) \, dy. \] @@ -611,8 +611,8 @@ \section{Logistic and Probit Regression}\label{logistic-probit-regression.sectio y[n] ~ bernoulli(Phi(alpha + beta * x[n])); \end{stancode} % -A fast approximation to the cumulative unit normal distribution function -$\Phi$ is implemented in Stan as the function \code{Phi\_approx}. The +A fast approximation to the cumulative unit normal distribution function +$\Phi$ is implemented in Stan as the function \code{Phi\_approx}. The approximate probit regression model may be coded with the following. % @@ -661,7 +661,7 @@ \section{Multi-Logit Regression}\label{multi-logit.section} \code{categorical\_logit}). The first loop may be made more efficient by vectorizing the first -loop by converting the matrix \code{beta} to a vector, +loop by converting the matrix \code{beta} to a vector, % \begin{stancode} to_vector(beta) ~ normal(0, 5); @@ -711,7 +711,7 @@ \subsection{Identifiability} suitable prior on the coefficients. An alternative is to use $(K-1)$-vectors by fixing one of them to be -zero. \refsection{partially-known-parameters} discusses how to mix +zero. \refsection{partially-known-parameters} discusses how to mix constants and parameters in a vector. In the multi-logit case, the parameter block would be redefined to use $(K-1)$-vectors % @@ -824,7 +824,7 @@ \subsection{Translated and Scaled Simplex} With this parameterization, a Dirichlet prior can be placed on \code{beta\_raw}, perhaps uniform, and another prior put on -\code{beta\_scale}, typically for ``shrinkage.'' +\code{beta\_scale}, typically for ``shrinkage.'' \subsection{Soft Centering} @@ -847,7 +847,7 @@ \section{Ordered Logistic and Probit Regression}\label{ordered-logistic.section} $k$ if the linear predictor $x_n \beta$ falls between $c_{k-1}$ and $c_k$, assuming $c_0 = -\infty$ and $c_K = \infty$. The noise term is fixed by the form of regression, with examples for ordered logistic -and ordered probit models. 
+and ordered probit models. \subsection{Ordered Logistic Regression} @@ -863,19 +863,19 @@ \subsection{Ordered Logistic Regression} int D; int y[N]; row_vector[D] x[N]; -} +} parameters { vector[D] beta; ordered[K-1] c; -} +} model { for (n in 1:N) y[n] ~ ordered_logistic(x[n] * beta, c); } \end{stancode} -% +% The vector of cutpoints \code{c} is declared as \code{ordered[K-1]}, -which guarantees that \code{c[k]} is less than \code{c[k+1]}. +which guarantees that \code{c[k]} is less than \code{c[k+1]}. If the cutpoints were assigned independent priors, the constraint effectively truncates the joint prior to support over points that @@ -974,7 +974,7 @@ \section{Hierarchical Logistic Regression} for (n in 1:N) y[n] ~ bernoulli(inv_logit(x[n] * beta[ll[n]])); } -\end{stancode} +\end{stancode} % The standard deviation parameter \code{sigma} gets an implicit uniform prior on $(0,\infty)$ because of its declaration with a lower-bound @@ -1017,7 +1017,7 @@ \subsubsection{Optimizing the Model} \begin{stancode} for (n in 1:N) - y[n] ~ bernoulli_logit(x[n] * beta[ll[n]]); + y[n] ~ bernoulli_logit(x[n] * beta[ll[n]]); \end{stancode} % See \refsection{bernoulli-logit-distribution} for a definition of @@ -1042,7 +1042,7 @@ \subsubsection{Optimizing the Model} % The brackets introduce a new scope for the local variable \code{x\_beta\_ll}; alternatively, the variable may be declared at the -top of the model block. +top of the model block. In some cases, such as the above, the local variable assignment leads to models that are less readable. The recommended practice in such @@ -1149,7 +1149,7 @@ \subsection{1PL (Rasch) Model} % \begin{stancode} -parameters { +parameters { real delta; // mean student ability real alpha[J]; // ability of student j - mean ability real beta[K]; // difficulty of question k @@ -1196,7 +1196,7 @@ \subsection{1PL (Rasch) Model} $\beta_k$ results in the same predictions. The use of priors for $\alpha$ and $\beta$ located at 0 identifies the parameters; see \citep{GelmanHill:2007} for a discussion of identifiability issues and -alternative approaches to identification. +alternative approaches to identification. For testing purposes, the IRT 1PL model distributed with Stan uses informative priors that match the actual data generation process used @@ -1220,12 +1220,12 @@ \subsection{Multilevel 2PL Model} % \begin{stancode} -parameters { +parameters { real mu_beta; // mean student ability real alpha[J]; // ability for j - mean real beta[K]; // difficulty for k real gamma[K]; // discrimination of k - real sigma_beta; // scale of difficulties + real sigma_beta; // scale of difficulties real sigma_gamma; // scale of log discrimination } \end{stancode} @@ -1235,14 +1235,14 @@ \subsection{Multilevel 2PL Model} \begin{stancode} model { alpha ~ normal(0, 1); - beta ~ normal(0, sigma_beta); + beta ~ normal(0, sigma_beta); gamma ~ lognormal(0, sigma_gamma); mu_beta ~ cauchy(0, 5); sigma_alpha ~ cauchy(0, 5); sigma_beta ~ cauchy(0, 5); sigma_gamma ~ cauchy(0, 5); for (n in 1:N) - y[n] ~ bernoulli_logit(gamma[kk[n]] + y[n] ~ bernoulli_logit(gamma[kk[n]] * (alpha[jj[n]] - (beta[kk[n]] + mu_beta))); } \end{stancode} @@ -1271,7 +1271,7 @@ \subsection{Multilevel 2PL Model} \end{stancode} % The point is that the \code{alpha} determines the scale and location -and \code{beta} and \code{gamma} are allowed to float. +and \code{beta} and \code{gamma} are allowed to float. 
The \code{beta} parameter is here given a non-centered parameterization, with parameter \code{mu\_beta} serving as the mean @@ -1289,7 +1289,7 @@ \subsection{Multilevel 2PL Model} % Non-centered parameterizations tend to be more efficient in hierarchical models; see \refsection{reparameterization} for more -information on non-centered reparameterizations. +information on non-centered reparameterizations. The intercept term \code{mu\_beta} can't itself be modeled hierarchically, so it is given a weakly informative @@ -1344,7 +1344,7 @@ \subsection{Separability} likelihood estimate for the coefficient for that predictor diverges to infinity. This divergence can be controlled by providing a prior for the coefficient, which will ``shrink'' the estimate back toward zero -and thus identify the model in the posterior. +and thus identify the model in the posterior. Similar problems arise for sampling with improper flat priors. The sampler will try to draw very large values. By providing a prior, @@ -1358,7 +1358,7 @@ \section{Multivariate Priors for Hierarchical Models}\label{multivariate-hierarc In hierarchical regression models (and other situations), several individual-level variables may be assigned hierarchical priors. For example, a model with multiple varying intercepts and slopes within -might assign them a multivariate prior. +might assign them a multivariate prior. As an example, the individuals might be people and the outcome income, with predictors such as education level and age, and the groups might be states @@ -1375,7 +1375,7 @@ \subsection{Multivariate Regression Example} assume that $x_{n,1} = 1$ is a fixed ``intercept'' predictor. To encode group membership, they assume individual $n$ belongs to group $jj[n] \in 1{:}J$. Each individual $n$ also has an observed outcome -$y_n$ taking on real values. +$y_n$ taking on real values. \subsubsection{Likelihood} @@ -1384,7 +1384,7 @@ \subsubsection{Likelihood} group $j$. The likelihood function for individual $n$ is then just % \[ -y_n \sim \distro{Normal}(x_n \, \beta_{jj[n]}, \, \sigma) +y_n \sim \distro{Normal}(x_n \, \beta_{jj[n]}, \, \sigma) \mbox{ for } n \in 1{:}N. \] % @@ -1416,18 +1416,18 @@ \subsubsection{Hyperpriors} \] Of course, if more is known about the expected coefficient values $\beta_{j, k}$, this information can be incorporated into the prior for -$\mu_k$. +$\mu_k$. For the prior on the covariance matrix, Gelman and Hill suggest using a scaled inverse Wishart. That choice was motivated primarily by convenience as it is conjugate to the multivariate likelihood function -and thus simplifies Gibbs sampling. +and thus simplifies Gibbs sampling. In Stan, there is no restriction to conjugacy for multivariate priors, and we in fact recommend a slightly different approach. Like Gelman and Hill, we decompose our prior into a scale and a matrix, but are able to do so in a more natural way based on the actual variable -scales and a correlation matrix. Specifically, we define +scales and a correlation matrix. Specifically, we define \[ \Sigma = \mbox{diag\_matrix}(\tau) \ \Omega \ \mbox{diag\_matrix}(\tau), \] @@ -1446,7 +1446,7 @@ \subsubsection{Hyperpriors} prior for scales, but we recommend something weakly informative like a half-Cauchy distribution with a small scale, such as \[ -\tau_k \sim \distro{Cauchy}(0, 2.5) +\tau_k \sim \distro{Cauchy}(0, 2.5) \mbox{ for } k \in 1{:}K \mbox{ constrained by } \tau_k > 0. 
\] As for the prior means, if there is information about the scale of @@ -1484,7 +1484,7 @@ \subsubsection{Group-Level Predictors for Prior Mean} independent weakly informative priors, such as \[ \gamma_l \sim \distro{Normal}(0,5). -\] +\] As usual, information about the group-level means should be incorporated into this prior. @@ -1529,7 +1529,7 @@ \subsubsection{Coding the Model in Stan} \end{stancode} % The hyperprior covariance matrix is defined implicitly through the -a quadratic form in the code +a quadratic form in the code because the correlation matrix \code{Omega} and scale vector \code{tau} are more natural to inspect in the output; to output \code{Sigma}, define it as a transformed parameter. The function @@ -1599,7 +1599,7 @@ \subsubsection{Optimization through Vectorization} The code in the Stan program above also builds up an array of vectors for the outcomes and for the multivariate normal, which provides a very significant speedup by reducing the number of linear systems that -need to be solved and differentiated. +need to be solved and differentiated. % \begin{stancode} { @@ -1629,7 +1629,7 @@ \subsubsection{Optimization through Cholesky Factorization} % \begin{stancode} data { - matrix[J, L] u; + matrix[J, L] u; ... parameters { matrix[K, J] z; @@ -1640,10 +1640,10 @@ \subsubsection{Optimization through Cholesky Factorization} beta = u * gamma + (diag_pre_multiply(tau,L_Omega) * z)'; } model { - to_vector(z) ~ normal(0, 1); + to_vector(z) ~ normal(0, 1); L_Omega ~ lkj_corr_cholesky(2); ... -\end{stancode} +\end{stancode} % The data variable \code{u} was originally an array of vectors, which is efficient for access; here it is redeclared as a matrix in order to @@ -1660,12 +1660,12 @@ \subsubsection{Optimization through Cholesky Factorization} factor of the final covariance matrix, % \begin{stancode} - Sigma_beta + Sigma_beta = quad_form_diag(Omega, tau) = diag_pre_multiply(tau, L_Omega) * diag_pre_multiply(tau, L_Omega)' \end{stancode} % -where the diagonal pre-multiply compound operation is defined by +where the diagonal pre-multiply compound operation is defined by % \begin{stancode} diag_pre_multiply(a, b) = diag_matrix(a) * b @@ -1685,8 +1685,8 @@ \subsubsection{Optimization through Cholesky Factorization} \begin{stancode} parameters { matrix[K, J] z; - cholesky_factor_corr[K] L_Omega; - vector[K] tau_unif; + cholesky_factor_corr[K] L_Omega; + vector[K] tau_unif; matrix[L, K] gamma; // group coeffs real sigma; // prediction error scale } @@ -1697,7 +1697,7 @@ \subsubsection{Optimization through Cholesky Factorization} beta = u * gamma + (diag_pre_multiply(tau,L_Omega) * z)'; } model { - to_vector(z) ~ normal(0, 1); + to_vector(z) ~ normal(0, 1); L_Omega ~ lkj_corr_cholesky(2); to_vector(gamma) ~ normal(0, 5); y ~ normal(rows_dot_product(beta[jj] , x), sigma); @@ -1716,12 +1716,12 @@ \subsubsection{Optimization through Cholesky Factorization} % ... % transformed parameters { % matrix[M,3] alpha; -% alpha +% alpha % = transpose(rep_matrix(mu, M) % + diag_pre_multiply(sigma_Sigma,L_Sigma) * z); % ... 
% model { -% to_vector(z) ~ normal(0, 1); +% to_vector(z) ~ normal(0, 1); % gamma ~ normal(0, 5); % sigma_Sigma ~ cauchy(0, 2.5); % L_Sigma ~ lkj_corr_cholesky(3); @@ -1729,7 +1729,7 @@ \subsubsection{Optimization through Cholesky Factorization} % \end{stancode} % \end{quote} % % -% Taken together, this Stan program amounts to +% Taken together, this Stan program amounts to % % % \begin{eqnarray*} % \sigma & \sim & \mbox{Cauchy}(0, 2.5) @@ -1771,11 +1771,11 @@ \subsection{Programming Predictions} data { int K; int N; - matrix[N, K] x; - vector[N] y; + matrix[N, K] x; + vector[N] y; int N_new; - matrix[N_new, K] x_new; + matrix[N_new, K] x_new; } parameters { vector[K] beta; @@ -1806,10 +1806,10 @@ \subsection{Predictions as Generated Quantities} real sigma; } model { - y ~ normal(x * beta, sigma); + y ~ normal(x * beta, sigma); } generated quantities { - vector[N_new] y_new; + vector[N_new] y_new; for (n in 1:N_new) y_new[n] = normal_rng(x_new[n] * beta, sigma); } @@ -1832,7 +1832,7 @@ \subsubsection{Overflow in Generated Quantities} be intercepted and dealt with, typically be reparameterizing or reimplementing the random number generator using real values rather than integers, which are upper-bounded by $2^{31} - 1$ in Stan. - + \section{Multivariate Outcomes} @@ -1841,13 +1841,13 @@ \section{Multivariate Outcomes} regressions are just repeated categorical regressions. In contrast, this section discusses regression when each observed value is multivariate. To relate multiple outcomes in a regression setting, -their error terms are provided with covariance structure. +their error terms are provided with covariance structure. This section considers two cases, seemingly unrelated regressions for continuous multivariate quantities and multivariate probit regression for boolean multivariate quantities. -\subsection{Seemingly Unrelated Regressions} +\subsection{Seemingly Unrelated Regressions} The first model considered is the ``seemingly unrelated'' regressions (SUR) of econometrics where several linear regressions share @@ -1932,7 +1932,7 @@ \subsection{Seemingly Unrelated Regressions} If required, the full correlation or covariance matrices may be reconstructed from their Cholesky factors in the generated quantities -block. +block. \subsection{Multivariate Probit Regression} @@ -1944,7 +1944,7 @@ \subsection{Multivariate Probit Regression} The observations $y_n$ are $D$-vectors of boolean values (coded 0 for false, 1 for true). The values for the observations $y_n$ are based on latent values $z_n$ drawn from a seemingly unrelated regression -model (see the previous section), +model (see the previous section), % \begin{eqnarray*} z_n & = & x_n \, \beta + \epsilon_n @@ -2067,7 +2067,7 @@ \subsection{Multivariate Probit Regression} These include the regression coefficients \code{beta} and the Cholesky factor of the correlation matrix, \code{L\_Omega}. This time there is no scaling because the covariance matrix has unit scale (i.e., it is a -correlation matrix; see above). +correlation matrix; see above). 
The critical part of the parameter declaration is that the latent real value $z$ is broken into positive-constrained and negative-constrained @@ -2092,9 +2092,9 @@ \subsection{Multivariate Probit Regression} model { L_Omega ~ lkj_corr_cholesky(4); to_vector(beta) ~ normal(0, 5); - { + { vector[D] beta_x[N]; - for (n in 1:N) + for (n in 1:N) beta_x[n] = beta * x[n]; z ~ multi_normal_cholesky(beta_x, L_Omega); } @@ -2105,12 +2105,12 @@ \subsection{Multivariate Probit Regression} Chib-style constraints on \code{z}. Finally, the correlation matrix itself can be put back together in the -generated quantities block if desired. +generated quantities block if desired. % \begin{stancode} generated quantities { corr_matrix[D] Omega; - Omega = multiply_lower_tri_self_transpose(L_Omega); + Omega = multiply_lower_tri_self_transpose(L_Omega); } \end{stancode} % @@ -2136,9 +2136,9 @@ \subsection{Prediction} and $y$ is % \[ -p(\alpha, \beta, \sigma \, | \, x, y) -\propto -\prod_{n=1}^N +p(\alpha, \beta, \sigma \, | \, x, y) +\propto +\prod_{n=1}^N \distro{Normal}(y_n \, | \, \alpha + \beta x_n, \sigma). \] % @@ -2150,7 +2150,7 @@ \subsection{Prediction} = \int_{(\alpha,\beta,\sigma)} \distro{Normal}(\tilde{y}_n \, | \, \alpha + \beta \tilde{x}_n, \sigma) \times - p(\alpha, \beta, \sigma \, | \, x, y) + p(\alpha, \beta, \sigma \, | \, x, y) \ \mathrm{d}(\alpha,\beta,\sigma). \] @@ -2223,7 +2223,7 @@ \chapter{Time-Series Models}\label{time-series.chapter} \noindent Times series data come arranged in temporal order. This chapter presents two kinds of time series models, regression-like models such -as autoregressive and moving average models, and hidden Markov models. +as autoregressive and moving average models, and hidden Markov models. \refchapter{gaussian-processes} presents Gaussian processes, which may also be used for time-series (and spatial) data. @@ -2279,7 +2279,7 @@ \subsubsection{Slicing for Efficiency} % \begin{stancode} model { - y[2:(N - 1)] ~ normal(alpha + beta * y[1:(N - 1)], sigma); + y[2:N] ~ normal(alpha + beta * y[1:(N - 1)], sigma); } \end{stancode} % @@ -2288,13 +2288,13 @@ \subsubsection{Slicing for Efficiency} -\subsection{Extensions to the AR(1) Model} +\subsection{Extensions to the AR(1) Model} Proper priors of a range of different families may be added for the regression coefficients and noise scale. The normal noise model can be changed to a Student-$t$ distribution or any other distribution with unbounded support. The model could also be made hierarchical if -multiple series of observations are available. +multiple series of observations are available. To enforce the estimation of a stationary AR(1) process, the slope coefficient \code{beta} may be constrained with bounds as follows. @@ -2368,7 +2368,7 @@ \subsection{ARCH(1) Models} and $\mu$, $\alpha_0$, and $\alpha_1$ are unknown regression coefficient parameters. % \begin{eqnarray*} -r_t & = & \mu + a_t +r_t & = & \mu + a_t \\[2pt] a_t & = & \sigma_t \epsilon_t \\[2pt] @@ -2379,7 +2379,7 @@ \subsection{ARCH(1) Models} % In order to ensure the noise terms $\sigma^2_t$ are positive, the scale coefficients are constrained to be positive, $\alpha_0, \alpha_1 -> 0$. To ensure stationarity of the time series, the slope is +> 0$. 
To ensure stationarity of the time series, the slope is constrained to to be less than one, $\alpha_1 < 1$.% % \footnote{In practice, it can be useful to remove the constraint to @@ -2434,21 +2434,21 @@ \subsection{GARCH(1,1) Models} % \begin{stancode} data { - int T; + int T; real r[T]; - real sigma1; + real sigma1; } parameters { - real mu; - real alpha0; - real alpha1; - real beta1; + real mu; + real alpha0; + real alpha1; + real beta1; } transformed parameters { real sigma[T]; sigma[1] = sigma1; for (t in 2:T) - sigma[t] = sqrt(alpha0 + sigma[t] = sqrt(alpha0 + alpha1 * pow(r[t-1] - mu, 2) + beta1 * pow(sigma[t-1], 2)); } @@ -2458,12 +2458,12 @@ \subsection{GARCH(1,1) Models} \end{stancode} % To get the recursive definition of the volatility regression off the -ground, the data declaration includes a non-negative value -\code{sigma1} for the scale of the noise at $t = 1$. +ground, the data declaration includes a non-negative value +\code{sigma1} for the scale of the noise at $t = 1$. The constraints are coded directly on the parameter declarations. This declaration is order-specific in that the constraint on \code{beta1} -depends on the value of \code{alpha1}. +depends on the value of \code{alpha1}. A transformed parameter array of non-negative values \code{sigma} is used to store the scale values at each time point. The definition of @@ -2528,7 +2528,7 @@ \subsection{$\mbox{MA}(2)$ Example} theta ~ cauchy(0, 2.5); sigma ~ cauchy(0, 2.5); for (t in 3:T) - y[t] ~ normal(mu + y[t] ~ normal(mu + theta[1] * epsilon[t - 1] + theta[2] * epsilon[t - 2], sigma); @@ -2545,7 +2545,7 @@ \subsection{$\mbox{MA}(2)$ Example} This model could be improved in terms of speed by vectorizing the sampling statement in the model block. Vectorizing the calculation of the $\epsilon_t$ could also be sped up by using a dot product instead -of a loop. +of a loop. \subsection{Vectorized $\mbox{MA}(Q)$ Model} @@ -2629,11 +2629,11 @@ \section{Autoregressive Moving Average Models} \end{stancode} % The data is declared in the same way as the other time-series -regressions and the parameters are documented in the code. +regressions and the parameters are documented in the code. In the model block, the local vector \code{nu} stores the predictions and \code{err} the errors. These are computed similarly to the -errors in the moving average models described in the previous section. +errors in the moving average models described in the previous section. The priors are weakly informative for stationary processes. The likelihood only involves the error term, which is efficiently @@ -2658,7 +2658,7 @@ \section{Autoregressive Moving Average Models} err = y[1] - mu + phi * mu; err ~ normal(0, sigma); for (t in 2:T) { - err = y[t] - (mu + phi * y[t-1] + theta * err); + err = y[t] - (mu + phi * y[t-1] + theta * err); err ~ normal(0, sigma); } } @@ -2668,7 +2668,7 @@ \section{Autoregressive Moving Average Models} variables, such as \code{err} in this case, can be reused in Stan. Folta's approach could be extended to higher order moving-average models by storing more than one error term as a local variable and -reassigning them in the loop. +reassigning them in the loop. Both encodings are very fast. The original encoding has the advantage of vectorizing the normal distribution, but it uses a bit more memory. 
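As a rough sketch of that extension, an ARMA(1,2)-style error recursion might keep
two lagged errors as local variables. The names \code{err1} and \code{err2}, the
zero initialization of the presample errors, and the declaration of \code{theta} as
a 2-vector are all assumptions made for this sketch rather than part of the program
above; the remaining data and parameters are assumed to be declared as in the
ARMA(1,1) program.
%
\begin{stancode}
model {
  real err1;    // error at time t - 1
  real err2;    // error at time t - 2
  // ... priors as in the ARMA(1,1) program ...
  err1 = 0;     // assumed presample errors
  err2 = 0;
  for (t in 2:T) {
    real err;
    err = y[t] - (mu + phi * y[t - 1]
                  + theta[1] * err1 + theta[2] * err2);
    err ~ normal(0, sigma);
    err2 = err1;
    err1 = err;
  }
}
\end{stancode}
%
Starting the loop at $t = 2$ conditions on the first observation, just as the
single-error encoding conditions on its initial value; the only change is the extra
lagged error to shuffle at the end of each iteration.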
@@ -2702,7 +2702,7 @@ \subsection{Identifiability and Stationarity}% % \begin{stancode} read phi; -\end{stancode} +\end{stancode} @@ -2736,7 +2736,7 @@ \section{Stochastic Volatility Models} % Rearranging the first line, $\epsilon_t = y_t \exp(-h_t / 2)$, allowing the sampling distribution for $y_t$ to be written as -\[ +\[ y_t \sim \distro{Normal}(0,\exp(h_t/2)). \] The recurrence equation for $h_{t+1}$ may be combined with the @@ -2761,7 +2761,7 @@ \section{Stochastic Volatility Models} model { phi ~ uniform(-1, 1); sigma ~ cauchy(0, 5); - mu ~ cauchy(0, 10); + mu ~ cauchy(0, 10); h[1] ~ normal(mu, sigma / sqrt(1 - phi * phi)); for (t in 2:T) h[t] ~ normal(mu + phi * (h[t - 1] - mu), sigma); @@ -2781,11 +2781,11 @@ \section{Stochastic Volatility Models} data points from the above model with $\mu = -1.02$, $\phi = 0.95$, and $\sigma = 0.25$ leads to 95\% posterior intervals for $\mu$ of $(-1.23, -0.54)$, for $\phi$ of $(0.82,0.98 )$ and for $\sigma$ of -$(0.16,0.38)$. +$(0.16,0.38)$. The samples using NUTS show a high degree of autocorrelation among the samples, both for this model and the stochastic volatility model -evaluated in \citep{Hoffman-Gelman:2011, Hoffman-Gelman:2014}. +evaluated in \citep{Hoffman-Gelman:2011, Hoffman-Gelman:2014}. Using a non-diagonal mass matrix provides faster convergence and more effective samples than a diagonal mass matrix, but will not scale to large values of $T$. @@ -2866,7 +2866,7 @@ \section{Hidden Markov Models}\label{hmms.section} z_t \sim \distro{Categorical}(\theta_{z[t-1]}). \] The output $y_t$ at time $t$ is generated conditionally independently -based on the latent state $z_t$. +based on the latent state $z_t$. This section describes HMMs with a simple categorical model for outputs $y_t \in \{1,\ldots,V\}$. The categorical distribution for @@ -2901,7 +2901,7 @@ \subsection{Supervised Parameter Estimation} simplex[V] phi[K]; // emit probs } model { - for (k in 1:K) + for (k in 1:K) theta[k] ~ dirichlet(alpha); for (k in 1:K) phi[k] ~ dirichlet(beta); @@ -2924,7 +2924,7 @@ \subsection{Start-State and End-State Probabilities} the probability of $z_1$ should be set to the stationary state probabilities in the Markov chain. In this case, there is no distinct end to the data, so there is no need to model the probability that the -sequence ends at $z_T$. +sequence ends at $z_T$. An alternative conception of HMMs is as models of finite-length sequences. For example, human language sentences have distinct @@ -2953,7 +2953,7 @@ \subsection{Calculating Sufficient Statistics} transformed data { int trans[K, K]; int emit[K, V]; - for (k1 in 1:K) + for (k1 in 1:K) for (k2 in 1:K) trans[k1, k2] = 0; for (t in 2:T) @@ -3006,7 +3006,7 @@ \subsection{Analytic Posterior} transformed data { vector[K] alpha_post[K]; vector[V] beta_post[K]; - for (k in 1:K) + for (k in 1:K) alpha_post[k] = alpha; for (t in 2:T) alpha_post[z[t-1], z[t]] = alpha_post[z[t-1], z[t]] + 1; @@ -3021,7 +3021,7 @@ \subsection{Analytic Posterior} % \begin{stancode} model { - for (k in 1:K) + for (k in 1:K) theta[k] ~ dirichlet(alpha_post[k]); for (k in 1:K) phi[k] ~ dirichlet(beta_post[k]); @@ -3039,7 +3039,7 @@ \subsection{Semisupervised Estimation} strategy in Stan requires calculating the probability of an output sequence with an unknown state sequence. This is a marginalization problem, and for HMMs, it is computed with the so-called forward -algorithm. +algorithm. 
In Stan, the forward algorithm is coded as follows.% % @@ -3059,12 +3059,12 @@ \subsection{Semisupervised Estimation} % The model for the supervised data does not change; the unsupervised data is handled with the following Stan implementation of the forward -algorithm. +algorithm. % \begin{stancode} model { ... - { + { real acc[K]; real gamma[T_unsup, K]; for (k in 1:K) @@ -3099,7 +3099,7 @@ \subsection{Semisupervised Estimation} The brackets provide the scope for the local variables \code{acc} and \code{gamma}; these could have been declared earlier, but it is -clearer to keep their declaration near their use. +clearer to keep their declaration near their use. \subsection{Predictive Inference} @@ -3108,7 +3108,7 @@ \subsection{Predictive Inference} $\phi_{k,v}$ and an observation sequence $u_1,\ldots,u_T \in \{ 1,\ldots,V \}$, the Viterbi (dynamic programming) algorithm computes the state sequence which is most likely to have generated the -observed output $u$. +observed output $u$. The Viterbi algorithm can be coded in Stan in the generated quantities block as follows. The predictions here is the most likely state @@ -3122,7 +3122,7 @@ \subsection{Predictive Inference} generated quantities { int y_star[T_unsup]; real log_p_y_star; - { + { int back_ptr[T_unsup, K]; real best_logp[T_unsup, K]; real best_total_logp; @@ -3133,7 +3133,7 @@ \subsection{Predictive Inference} best_logp[t, k] = negative_infinity(); for (j in 1:K) { real logp; - logp = best_logp[t-1, j] + logp = best_logp[t-1, j] + log(theta[j, k]) + log(phi[k, u[t]]); if (logp > best_logp[t, k]) { back_ptr[t, k] = j; @@ -3147,7 +3147,7 @@ \subsection{Predictive Inference} if (best_logp[T_unsup, k] == log_p_y_star) y_star[T_unsup] = k; for (t in 1:(T_unsup - 1)) - y_star[T_unsup - t] = back_ptr[T_unsup - t + 1, + y_star[T_unsup - t] = back_ptr[T_unsup - t + 1, y_star[T_unsup - t + 1]]; } } @@ -3261,7 +3261,7 @@ \section{Partially Known Parameters}\label{partially-known-parameters.section} real var1; real var2; } transformed data { - real max_cov = sqrt(var1 * var2); + real max_cov = sqrt(var1 * var2); real min_cov = -max_cov; } parameters { @@ -3272,7 +3272,7 @@ \section{Partially Known Parameters}\label{partially-known-parameters.section} matrix[2, 2] Sigma; Sigma[1, 1] = var1; Sigma[1, 2] = cov; Sigma[2, 1] = cov; Sigma[2, 2] = var2; -} +} model { y ~ multi_normal(mu, Sigma); } @@ -3292,14 +3292,14 @@ \section{Partially Known Parameters}\label{partially-known-parameters.section} The vectorization of the multivariate normal is critical for efficiency here. The transformed parameter \code{Sigma} could be -defined as a local variable within the model block if +defined as a local variable within the model block if \section{Sliced Missing Data} If the missing data is part of some larger data structure, then it can often be effectively reassembled using index arrays and slicing. Here's an example for time-series data, where only some entries in the -series are observed. +series are observed. 
% \begin{stancode} data { @@ -3372,13 +3372,13 @@ \section{Loading matrix for factor analysis} K_choose_2 = (K * (K - 1)) / 2; } parameters { - vector[K_choose_2] L_lower; + vector[K_choose_2] L_lower; } transformed parameters { cholesky_factor_cov[K] L; for (k in 1:K) L[k, k] = 1; - { + { int i; for (m in 2:K) { for (n in 1:(m - 1)) { @@ -3468,7 +3468,7 @@ \section{Missing Multivariate Data} ns1[n1] = n; n1 = n1 + 1; } else if (y_observed[n, 2]) { - ns2[n2] = n; + ns2[n2] = n; n2 = n2 + 1; } } @@ -3518,30 +3518,30 @@ \section{Truncated Data}\label{truncated-data.section} Truncated data is data for which measurements are only reported if they fall above a lower bound, below an upper bound, or between a -lower and upper bound. +lower and upper bound. Truncated data may be modeled in Stan using truncated distributions. For example, suppose the truncated data is $y_n$ with an upper truncation point of $U = 300$ so that $y_n < 300$. In Stan, this data can be modeled as following a truncated normal distribution for -the observations as follows. +the observations as follows. % \begin{stancode} data { int N; real U; real y[N]; -} +} parameters { real mu; real sigma; -} +} model { for (n in 1:N) y[n] ~ normal(mu, sigma) T[,U]; } \end{stancode} -% +% The model declares an upper bound \code{U} as data and constrains the data for \code{y} to respect the constraint; this will be checked when the data is loaded into the model before sampling begins. @@ -3558,7 +3558,7 @@ \subsection{Constraints and Out-of-Bounds Returns} sampled with the statement. % \begin{stancode} -for (n in 1:N) +for (n in 1:N) y[n] ~ normal(mu, sigma) T[L,U]; \end{stancode} % @@ -3582,7 +3582,7 @@ \subsection{Constraints and Out-of-Bounds Returns} \code{y} is data, then \code{L} and \code{U} must be appropriately constrained so that all data is in range and the value of \code{L} is less than that of \code{U} (if they are equal, the parameter range -collapses to a single point and the Hamiltonian dynamics used by +collapses to a single point and the Hamiltonian dynamics used by the sampler break down). The following declarations ensure the bounds are well behaved. % @@ -3614,13 +3614,13 @@ \subsection{Unknown Truncation Points} real y[N]; } parameters { - real L; + real L; real U; real mu; real sigma; } model { - L ~ ...; + L ~ ...; U ~ ...; for (n in 1:N) y[n] ~ normal(mu, sigma) T[L,U]; @@ -3638,7 +3638,7 @@ \subsection{Unknown Truncation Points} The ellipses where the priors for the bounds \code{L} and \code{U} should go should be filled in with a an informative prior in -order for this model to not concentrate \code{L} strongly around +order for this model to not concentrate \code{L} strongly around \code{min(y)} and \code{U} strongly around \code{max(y)}. @@ -3647,7 +3647,7 @@ \section{Censored Data} Censoring hides values from points that are too large, too small, or both. Unlike with truncated data, the number of data points that were censored is known. The textbook example is the household scale which -does not report values above 300 pounds. +does not report values above 300 pounds. \subsection{Estimating Censored Values} @@ -3681,7 +3681,7 @@ \subsection{Estimating Censored Values} declared to have values of type \code{real}, all imputed values for censored data will be greater than \code{U}. The imputed censored data affects the location and scale parameters through the last -sampling statement in the model. +sampling statement in the model. 
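A minimal sketch of the setup just described is shown below. The names
\code{y\_obs}, \code{y\_cens}, \code{N\_obs}, and \code{N\_cens} are illustrative
rather than taken from the program above; the declared lower bound \code{U} is what
keeps every imputed value above the censoring point.
%
\begin{stancode}
data {
  int<lower=0> N_obs;
  int<lower=0> N_cens;
  real U;                         // censoring point
  real<upper=U> y_obs[N_obs];     // uncensored measurements
}
parameters {
  real mu;
  real<lower=0> sigma;
  real<lower=U> y_cens[N_cens];   // imputed censored measurements
}
model {
  y_obs ~ normal(mu, sigma);
  y_cens ~ normal(mu, sigma);     // statement through which the imputed
                                  //   values inform mu and sigma
}
\end{stancode}
%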
\subsection{Integrating out Censored Values} @@ -3691,7 +3691,7 @@ \subsection{Integrating out Censored Values} probability of % \[ -\mbox{Pr}[y > U] +\mbox{Pr}[y > U] = \int_U^{\infty} \distro{Normal}(y|\mu,\sigma) \, dy = 1 - \Phi\left(\frac{y - \mu}{\sigma}\right), \] @@ -3724,7 +3724,7 @@ \subsection{Integrating out Censored Values} real sigma; } model { - y_obs ~ normal(mu, sigma); + y_obs ~ normal(mu, sigma); target += N_cens * normal_lccdf(U | mu, sigma); } \end{stancode} @@ -3780,7 +3780,7 @@ \section{Relation to Clustering}\label{clustering-mixture.section} multivariate form as the statistical basis for the $K$-means algorithm; the latent Dirichlet allocation model, usually applied to clustering problems, can be viewed as a mixed-membership multinomial -mixture model. +mixture model. \section{Latent Discrete Parameterization} @@ -3800,13 +3800,13 @@ \section{Latent Discrete Parameterization} \] % The variable $y_n$ is distributed according to the parameters -of the mixture component $z_n$, +of the mixture component $z_n$, \[ y_n \sim \distro{Normal}(\mu_{z[n]},\sigma_{z[n]}). \] % This model is not directly supported by Stan because it involves -discrete parameters $z_n$, but Stan can sample $\mu$ and $\sigma$ +discrete parameters $z_n$, but Stan can sample $\mu$ and $\sigma$ by summing out the $z$ parameter as described in the next section. @@ -3814,12 +3814,12 @@ \section{Summing out the Responsibility Parameter} To implement the normal mixture model outlined in the previous section in Stan, the discrete parameters can be summed out of the -model. If $Y$ is a mixture of $K$ normal distributions with +model. If $Y$ is a mixture of $K$ normal distributions with locations $\mu_k$ and scales $\sigma_k$ with mixing proportions -$\lambda$ in the unit $K$-simplex, then +$\lambda$ in the unit $K$-simplex, then \[ p_Y(y | \lambda, \mu, \sigma) -\ = \ +\ = \ \sum_{k=1}^K \lambda_k \, \distro{Normal}(y \, | \, \mu_k, \sigma_k). \] @@ -3879,7 +3879,7 @@ \section{Log Sum of Exponentials: Linear Sums on the Log Scale} \\[2pt] & = & \mbox{log\_sum\_exp}(\hspace*{-5pt}\begin{array}[t]{l} \log(0.3) + \log \distro{Normal}(y|-1,2), - \\ + \\ \log(0.7) + \log \distro{Normal}(y|3,1) \ ). \end{array} \end{eqnarray*} @@ -3918,7 +3918,7 @@ \subsubsection{Dropping uniform mixture ratios} % \[ \mbox{log\_sum\_exp}(c + a, c + b) -\ = \ +\ = \ c + \mbox{log\_sum\_exp(a, b)} \] % @@ -3965,7 +3965,7 @@ \subsection{Estimating Parameters of a Mixture} points. The mixing proportion parameter \code{theta} is declared to be a unit $K$-simplex, whereas the component location parameter \code{mu} and scale parameter \code{sigma} are both defined to be -\code{K}-vectors. +\code{K}-vectors. The location parameter \code{mu} is declared to be an ordered vector in order to identify the model. This will not affect inferences that @@ -3976,7 +3976,7 @@ \subsection{Estimating Parameters of a Mixture} The values in the scale array \code{sigma} are constrained to be non-negative, and have a weakly informative prior given in the model -chosen to avoid zero values and thus collapsing components. +chosen to avoid zero values and thus collapsing components. 
The model declares a local array variable \code{lps} to be size \code{K} and uses it to accumulate the log contributions from the @@ -3998,7 +3998,7 @@ \section{Vectorizing Mixtures} % \begin{stancode} for (n in 1:N) { - target += log_sum_exp(log(lambda) + target += log_sum_exp(log(lambda) + normal_lpdf(y[n] | mu[1], sigma[1]), log1m(lambda) + normal_lpdf(y[n] | mu[2], sigma[2])); @@ -4008,7 +4008,7 @@ \section{Vectorizing Mixtures} % \begin{stancode} for (n in 1:N) - target += log_mix(lambda, + target += log_mix(lambda, normal_lpdf(y[n] | mu[1], sigma[1]), normal_lpdf(y[n] | mu[2], sigma[2]))); \end{stancode} @@ -4016,16 +4016,16 @@ \section{Vectorizing Mixtures} This definition assumes that each observation $y_n$ may have arisen from either of the mixture components. The density is \[ -p(y \, | \, \lambda, \mu, \sigma) +p(y \, | \, \lambda, \mu, \sigma) = \prod_{n=1}^N (\lambda \times \distro{Normal}(y_n \, | \, \mu_1, \sigma_1) + (1 - \lambda) \times \distro{Normal}(y_n \, | \, \mu_2, \sigma_2). \] -% +% Contrast the previous model with the following (erroneous) attempt to vectorize the model. % \begin{stancode} -target += log_sum_exp(log(lambda) +target += log_sum_exp(log(lambda) + normal_lpdf(y | mu[1], sigma[1]), log1m(lambda) + normal_lpdf(y | mu[2], sigma[2])); @@ -4056,12 +4056,12 @@ \section{Inferences Supported by exchangeable in the model and thus not identifiable. This arises if the parameters of the mixture components have exchangeable priors and the mixture ratio gets a uniform prior so that the parameters of the -mixture components are also exchangeable in the likelihood. +mixture components are also exchangeable in the likelihood. We have finessed this basic problem by ordering the parameters. This will allow us in some cases to pick out mixture components either ahead of time or after fitting (e.g., male vs. female, or Democrat -vs.\ Republican). +vs.\ Republican). In other cases, we do not care about the actual identities of the mixture components and want to consider inferences that are @@ -4089,8 +4089,8 @@ \subsection{Mixtures with Unidentifiable Components} so that with the likelihood % \[ -p(y_n \, | \, \mu, \sigma) -= \lambda \, \distro{Normal}(y_n \, | \, \mu_1, \sigma_1) +p(y_n \, | \, \mu, \sigma) += \lambda \, \distro{Normal}(y_n \, | \, \mu_1, \sigma_1) + (1 - \lambda) \, \distro{Normal}(y_n \, | \, \mu_2, \sigma_2), \] % @@ -4100,7 +4100,7 @@ \subsection{Mixtures with Unidentifiable Components} % \footnote{Imposing a constraint such as $\theta < 0.5$ will resolve the symmetry, but fundamentally changes the model and its posterior - inferences.} + inferences.} \subsection{Inference under Label Switching} @@ -4119,7 +4119,7 @@ \subsubsection{Predictive likelihood} % \[ p(\tilde{y} \, | \, y) -= += \int_{\theta} p(\tilde{y} \, | \, \theta) \, p(\theta | y) @@ -4195,7 +4195,7 @@ \section{Zero-Inflated and Hurdle Models}\label{zero-inflated.section} modeling the probability of a zero outcome. Zero-inflated models, as defined by \citet{Lambert:1992}, add additional probability mass to the outcome of zero. Hurdle models, on the other hand, are formulated -as pure mixtures of zero and non-zero outcomes. +as pure mixtures of zero and non-zero outcomes. Zero inflation and hurdle models can be formulated for discrete distributions other than the Poisson. Zero inflation does not work @@ -4215,8 +4215,8 @@ \subsection{Zero Inflation} notation for a Poisson mean parameter). 
The probability function is thus \[ -p(y_n|\theta,\lambda) -= +p(y_n|\theta,\lambda) += \left\{ \begin{array}{ll} \theta + (1 - \theta) \times \distro{Poisson}(0|\lambda) & \mbox{ if } y_n = 0, \mbox{ and} @@ -4224,7 +4224,7 @@ \subsection{Zero Inflation} (1-\theta) \times \distro{Poisson}(y_n|\lambda) & \mbox{ if } y_n > 0. \end{array} \right. -\] +\] % The log probability function can be implemented directly in Stan as follows. % @@ -4241,7 +4241,7 @@ \subsection{Zero Inflation} for (n in 1:N) { if (y[n] == 0) target += log_sum_exp(bernoulli_lpmf(1 | theta), - bernoulli_lpmf(0 | theta) + bernoulli_lpmf(0 | theta) + poisson_lpmf(y[n] | lambda)); else target += bernoulli_lpmf(0 | theta) @@ -4265,7 +4265,7 @@ \subsection{Hurdle Models} % \[ p(y|\theta,\lambda) -= += \begin{cases} \ \theta & \mbox{if } y = 0, \mbox{ and} \\ @@ -4279,7 +4279,7 @@ \subsection{Hurdle Models} % where \distro{PoissonCDF} is the cumulative distribution function for the Poisson distribution. The hurdle model is even more straightforward to -program in Stan, as it does not require an explicit mixture. +program in Stan, as it does not require an explicit mixture. % \begin{stancode} if (y[n] == 0) @@ -4308,7 +4308,7 @@ \subsection{Hurdle Models} Julian King pointed out that because \[ \log \left( 1 - \distro{PoissonCDF}(0 | \lambda) \right) -\ = \ \log \left( 1 - \distro{Poisson}(0 | \lambda) \right) +\ = \ \log \left( 1 - \distro{Poisson}(0 | \lambda) \right) \ = \ \log(1 - \exp(-\lambda)) \] the CCDF in the else clause can be replaced with a simpler expression. @@ -4353,7 +4353,7 @@ \subsection{Hurdle Models} N0 = num_zero(y); Ngt0 = N - N0; - { + { int pos; pos = 1; for (n in 1:N) { @@ -4434,7 +4434,7 @@ \chapter{Measurement Error and Meta-Analysis} large relative to the quantity being measured, or when very precise relations can be estimated being measured quantities, it is useful to introduce an explicit model of measurement error. One kind of -measurement error is rounding. +measurement error is rounding. Meta-analysis plays out statistically very much like measurement error models, where the inferences drawn from multiple data sets are @@ -4465,12 +4465,12 @@ \subsection{Regression with Measurement Error} } parameters { real alpha; // intercept - real beta; // slope + real beta; // slope real sigma; // outcome noise } model { y ~ normal(alpha + beta * x, sigma); - alpha ~ normal(0, 10); + alpha ~ normal(0, 10); beta ~ normal(0, 10); sigma ~ cauchy(0, 5); } @@ -4494,7 +4494,7 @@ \subsection{Regression with Measurement Error} real tau; // measurement noise } parameters { - real x[N]; // unknown true value + real x[N]; // unknown true value real mu_x; // prior location real sigma_x; // prior scale ... @@ -4503,7 +4503,7 @@ \subsection{Regression with Measurement Error} x ~ normal(mu_x, sigma_x); // prior x_meas ~ normal(x, tau); // measurement model y ~ normal(alpha + beta * x, sigma); - ... + ... } \end{stancode} % @@ -4535,7 +4535,7 @@ \subsection{Rounding} A common form of measurement error arises from rounding measurements. Rounding may be done in many ways, such as rounding weights to the nearest milligram, or to the nearest pound; rounding may even be done -by rounding down to the nearest integer. +by rounding down to the nearest integer. Exercise 3.5(b) from \citep{GelmanEtAl:2013} provides an example. % @@ -4543,7 +4543,7 @@ \subsection{Rounding} 3.5. \ Suppose we weigh an object five times and measure weights, rounded to the nearest pound, of 10, 10, 12, 11, 9. 
Assume the unrounded measurements are normally distributed with a - noninformative prior distribution on $\mu$ and $\sigma^2$. + noninformative prior distribution on $\mu$ and $\sigma^2$. \\[4pt] (b) \ Give the correct posterior distribution for $(\mu, \sigma^2)$, treating the measurements as rounded. @@ -4561,15 +4561,15 @@ \subsection{Rounding} by marginalizing out the unrounded measurement, producing the likelihood \[ p(y_n \, | \, \mu, \sigma) -= += \int_{y_n - 0.5}^{y_n + 0.5} \ \distro{Normal}(z_n \, | \, \mu, \sigma) \ \mathrm{d}z_n -= += \Phi\!\left(\frac{y_n + 0.5 - \mu}{\sigma}\right) -- +- \Phi\!\left(\frac{y_n - 0.5 - \mu}{\sigma}\right). \] Gelman's answer for this problem took the noninformative prior to be @@ -4589,10 +4589,10 @@ \subsection{Rounding} & \propto & \frac{1}{\sigma^2} \ -\mathlarger{\mathlarger{\prod}}_{n=1}^5 +\mathlarger{\mathlarger{\prod}}_{n=1}^5 \left( \Phi\!\left(\frac{y_n + 0.5 - \mu}{\sigma}\right) -- +- \Phi\!\left(\frac{y_n - 0.5 - \mu}{\sigma}\right) \right). \end{eqnarray*} @@ -4665,7 +4665,7 @@ \section{Meta-Analysis} Meta-analysis aims to pool the data from several studies, such as the application of a tutoring program in several schools or treatment -using a drug in several clinical trials. +using a drug in several clinical trials. The Bayesian framework is particularly convenient for meta-analysis, because each previous study can be treated as providing a noisy @@ -4712,17 +4712,17 @@ \subsubsection{Converting to Log Odds and Standard Error} \[ y_j = \log \left( \frac{r^t_j / (n^t_j - r^t_j)} {r^c_j / (n^c_j - r^c_j)} \right) -\ \ = \ \ +\ \ = \ \ \log \left( \frac{r^t_j}{n^t_j - r^t_j} \right) -- +- \log \left( \frac{r^c_j}{n^c_j - r^c_j} \right) \] and corresponding standard errors \[ \sigma_j = \sqrt{ -\frac{1}{r^T_i} +\frac{1}{r^T_i} + \frac{1}{n^T_i - r^T_i} -+ \frac{1}{r^C_i} ++ \frac{1}{r^C_i} + \frac{1}{n^C_i - r^C_i} }. \] @@ -4735,7 +4735,7 @@ \subsubsection{Converting to Log Odds and Standard Error} transformed data { real y[J]; real sigma[J]; - for (j in 1:J) + for (j in 1:J) y[j] = log(r_t[j]) - log(n_t[j] - r_t[j]) - (log(r_c[j]) - log(n_c[j] - r_c[j]); for (j in 1:J) @@ -4744,7 +4744,7 @@ \subsubsection{Converting to Log Odds and Standard Error} } \end{stancode} % -This definition will be problematic if any of the success counts is +This definition will be problematic if any of the success counts is zero or equal to the number of trials. If that arises, a direct binomial model will be required or other transforms must be used than the unregularized sample log odds. @@ -4783,7 +4783,7 @@ \subsubsection{Hierarchical Model} vary by clinical trial, a hierarchical model can be used. The parameters include per-trial treatment effects and the hierarchical prior parameters, which will be estimated along with other unknown -quantities. +quantities. % \begin{stancode} parameters { @@ -4812,7 +4812,7 @@ \subsubsection{Hierarchical Model} % \footnote{The model provided for this data in \citep[Section~5.5]{GelmanEtAl:2013} is included with the -data in the Stan example model repository, +data in the Stan example model repository, \url{http://mc-stan.org/documentation}.} \subsubsection{Extensions and Alternatives} @@ -4862,7 +4862,7 @@ \section{The Benefits of Marginalization}\label{rao-blackwell.section} (EM), are often provided in applied statistics papers to describe maximum likelihood estimation algorithms. Such derivations provide exactly the marginalization needed for coding the model in Stan. 
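To make that connection concrete before turning to the examples, the generic
pattern for summing a discrete parameter $z \in 1{:}K$ out of a model looks roughly
as follows. Here \code{theta} stands for the distribution of $z$ and
\code{foo\_lpdf} and \code{param} are stand-ins for whatever sampling distribution
and parameters the model at hand uses, so none of these names refers to an actual
model in this chapter.
%
\begin{stancode}
{
  vector[K] lp;    // lp[k] = log p(z = k) + log p(y | z = k, ...)
  for (k in 1:K)
    lp[k] = log(theta[k]) + foo_lpdf(y | param[k]);
  target += log_sum_exp(lp);
}
\end{stancode}
%
The change point model in the next section instantiates exactly this pattern, with
the change point $s$ playing the role of $z$.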
- + \section{Change Point Models} The first example is a model of coal mining disasters in the U.K.\ @@ -4913,13 +4913,13 @@ \subsection{Marginalizing out the Discrete Parameter} To code this model in Stan, the discrete parameter $s$ must be marginalized out to produce a model defining the log of the probability function $p(e,l,D_t)$. The full joint probability factors -as +as % \begin{eqnarray*} p(e,l,s,D) & = & p(e) \, p(l) \, p(s) \, p(D | s, e, l) \\[3pt] -& = & +& = & \begin{array}[t]{l} \distro{Exponential}(e|r_e) \ \distro{Exponential}(l|r_l) \ \distro{Uniform}(s|1, T) @@ -4938,7 +4938,7 @@ \subsection{Marginalizing out the Discrete Parameter} where the likelihood is defined by marginalizing $s$ as % \begin{eqnarray*} -p(D | e,l) +p(D | e,l) & = & \sum_{s=1}^T p(s, D | e,l) \\[3pt] @@ -4946,7 +4946,7 @@ \subsection{Marginalizing out the Discrete Parameter} \sum_{s=1}^T p(s) p(D | s,e,l) \\[3pt] & = & -\sum_{s=1}^T \distro{Uniform}(s | 1,T) +\sum_{s=1}^T \distro{Uniform}(s | 1,T) \, \prod_{t=1}^T \distro{Poisson}(D_t | t < s \ ? \ e \ : \ l) \end{eqnarray*} % @@ -4956,7 +4956,7 @@ \subsection{Marginalizing out the Discrete Parameter} \log p(D | e,l) \\[3pt] \mbox{ } \ \ = \ -\mbox{log\_sum\_exp}_{s=1}^T +\mbox{log\_sum\_exp}_{s=1}^T \begin{array}[t]{l} \big( \log \distro{Uniform}(s \, | \, 1, T) @@ -5013,7 +5013,7 @@ \subsection{Coding the Model in Stan} e ~ exponential(r_e); l ~ exponential(r_l); target += log_sum_exp(lp); -} +} \end{stancode} \vspace*{-6pt} \caption{\small\it A change point model in which disaster rates @@ -5093,7 +5093,7 @@ \subsection{Posterior Distribution of the Discrete Change Point} & \propto & q(s | D) \\[3pt] -& = & +& = & \frac{1}{M} \sum_{m=1}^{M} \exp(\code{lp}[m,s]). \end{eqnarray*} % @@ -5184,7 +5184,7 @@ \subsection{Multiple Change Points} for (s1 in 1:T) for (s2 in 1:T) for (t in 1:T) - lp[s1,s2] = lp[s1,s2] + lp[s1,s2] = lp[s1,s2] + poisson_lpmf(D[t] | t < s1 ? e : (t < s2 ? m : l)); \end{stancode} % @@ -5292,12 +5292,12 @@ \subsection{Cormack-Jolly-Seber with Discrete Parameter} The Cormack-Jolly-Seber (CJS) model \citep{Cormack:1964,Jolly:1965,Seber:1965} is an open-population model in which the population may change over time due to death; the -presentation here draws heavily on \citep{Schofield:2007}. +presentation here draws heavily on \citep{Schofield:2007}. The basic data is % \begin{itemize} -\item $I$ : number of individuals, +\item $I$ : number of individuals, \item $T$ : number of capture periods, and \item $y_{i,t}$ : boolean indicating if individual $i$ was captured at time $t$. @@ -5351,10 +5351,10 @@ \subsection{Collective Cormack-Jolly-Seber Model} alive at time $t$ (if it is dead, the recapture probability is zero). this quantity is defined recursively by \[ -\chi_t -= +\chi_t += \begin{cases} -1 +1 & \mbox{if } t = T \\[3pt] (1 - \phi_t) + \phi_t (1 - p_{t+1}) \chi_{t+1} @@ -5370,7 +5370,7 @@ \subsection{Collective Cormack-Jolly-Seber Model} \phi_t)$, or (2) surviving to the next time period with probability $\phi_t$, not being captured in the next time period with probability $(1 - p_{t+1})$, and not being captured again after being alive in -period $t+1$ with probability $\chi_{t+1}$. +period $t+1$ with probability $\chi_{t+1}$. With three capture times, there are three captured/not-captured profiles an individual may have. 
These may be naturally coded as @@ -5395,7 +5395,7 @@ \subsection{Collective Cormack-Jolly-Seber Model} {5} & + & - & + & $\phi_1 \, (1 - p_2) \, \phi_2 \, p_3$ \\ \hline {6} & + & + & - & $ \phi_1 \, p_2 \, \chi_2$ -\\ +\\ {7} & + & + & + & $\phi_1 \, p_2 \, \phi_2 \, p_3$ \end{tabular} \end{center} @@ -5432,7 +5432,7 @@ \subsection{Collective Cormack-Jolly-Seber Model} real p[3]; } transformed parameters { - real chi[2]; + real chi[2]; chi[2] = (1 - phi[2]) + phi[2] * (1 - p[3]); chi[1] = (1 - phi[1]) + phi[1] * (1 - p[2]) * chi[2]; } @@ -5442,9 +5442,9 @@ \subsection{Collective Cormack-Jolly-Seber Model} target += history[4] * (log(chi[1])); target += history[5] * (log(phi[1]) + log1m(p[2]) + log(phi[2]) + log(p[3])); - target += history[6] * (log(phi[1]) + log(p[2]) + target += history[6] * (log(phi[1]) + log(p[2]) + log(chi[2])); - target += history[7] * (log(phi[1]) + log(p[2]) + target += history[7] * (log(phi[1]) + log(p[2]) + log(phi[2]) + log(p[3])); } generated quantities { @@ -5507,7 +5507,7 @@ \subsubsection{Utility Functions} feed it into the Stan model as preprocessed data. Yet another alternative encoding would be a sparse one recording only the capture events along with their time and identifying the individual -captured.} +captured.} % \begin{stancode} functions { @@ -5517,11 +5517,11 @@ \subsubsection{Utility Functions} return k; return 0; } - int last_capture(int[] y_i) { + int last_capture(int[] y_i) { for (k_rev in 0:(size(y_i) - 1)) { int k; k = size(y_i) - k_rev; - if (y_i[k]) + if (y_i[k]) return k; } return 0; @@ -5553,7 +5553,7 @@ \subsubsection{Utility Functions} n_captured = rep_vector(0, T); for (t in 1:T) for (i in 1:I) - if (y[i, t]) + if (y[i, t]) n_captured[t] = n_captured[t] + 1; } \end{stancode} @@ -5588,16 +5588,16 @@ \subsubsection{Utility Functions} ... vector prob_uncaptured(int T, vector p, vector phi) { vector[T] chi; - chi[T] = 1.0; + chi[T] = 1.0; for (t in 1:(T - 1)) { int t_curr; int t_next; t_curr = T - t; t_next = t_curr + 1; - chi[t_curr] = (1 - phi[t_curr]) - + phi[t_curr] - * (1 - p[t_next]) - * chi[t_next]; + chi[t_curr] = (1 - phi[t_curr]) + + phi[t_curr] + * (1 - p[t_next]) + * chi[t_next]; } return chi; } @@ -5748,7 +5748,7 @@ \subsection{Noisy Categorical Measurement Model} In this section, only categorical ratings are considered, and the challenge in the modeling for Stan is to marginalize out the discrete -parameters. +parameters. \cite{DawidSkene:1979} introduce a noisy-measurement model for data coding and apply in the epidemiological setting of coding what @@ -5854,12 +5854,12 @@ \subsubsection{Marginalizing out the True Category} \[ p(y | \theta, \pi) \ = \ -\prod_{i=1}^I \sum_{k=1}^K +\prod_{i=1}^I \sum_{k=1}^K \left( \distro{Categorical}(z_i | \pi) \prod_{j=1}^J \distro{Categorical}(y_{i, j}|\theta_{j, z[i]}) \right). -\] +\] % In the missing data model, only the observed labels would be used in the inner product. @@ -5879,7 +5879,7 @@ \subsubsection{Marginalizing out the True Category} \log \distro{Categorical}(y_{i, j}|\theta_{j, z[i]}) \right) \right), \end{array} -\] +\] which can be directly coded using Stan's built-in \code{log\_sum\_exp} function. @@ -5911,7 +5911,7 @@ \subsection{Stan Implementation} log_q_z[i] = log(pi); for (j in 1:J) for (k in 1:K) - log_q_z[i, k] = log_q_z[i, k] + log_q_z[i, k] = log_q_z[i, k] + log(theta[j, k, y[i, j]]); } } @@ -5940,7 +5940,7 @@ \subsection{Stan Implementation} $\beta_{k,k'} = 1$ if $k \neq k'$. 
Taking $\alpha$ and $\beta_k$ to be unit vectors and applying optimization will produce the same answer as the expectation maximization (EM) algorithm of -\cite{DawidSkene:1979}. +\cite{DawidSkene:1979}. \subsubsection{Inference for the True Category} @@ -5992,7 +5992,7 @@ \section{Sparse Data Structures} \begin{center} \begin{minipage}[c]{0.45\textwidth} \[ -y +y = \left[ \begin{array}{cccc} @@ -6008,7 +6008,7 @@ \section{Sparse Data Structures} \ \ \ \begin{minipage}[c]{0.45\textwidth} \begin{tabular}{ll|l} -$jj$ & $kk$ & $y$ +$jj$ & $kk$ & $y$ \\ \hline 1 & 1 & 0 \\ @@ -6075,7 +6075,7 @@ \section{Ragged Data Structures}\label{ragged-data-structs.section} Ragged arrays are arrays that are not rectangular, but have different sized entries. This kind of structure crops up when there are -different numbers of observations per entry. +different numbers of observations per entry. A general approach to dealing with ragged structure is to move to a full database-like data structure as discussed in the previous @@ -6107,7 +6107,7 @@ \section{Ragged Data Structures}\label{ragged-data-structs.section} rows of different sizes ($y_1$ is size 3, $y_2$ size 2, and $y_3$ size 4). On the right is an example of how to code the data in Stan, using a single vector $y$ to hold all the values and a separate - array of integers $s$ to hold the group row sizes. In this + array of integers $s$ to hold the group row sizes. In this example, $y_1 = z_{1:3}$, $y_2 = z_{4:5}$, and $y_3 = z_{6:9}$.}\label{ragged-data.figure} \end{figure} @@ -6118,7 +6118,7 @@ \section{Ragged Data Structures}\label{ragged-data-structs.section} \[ \prod_{n=1}^3 \log \distro{Normal}(y_n | \mu_n, \sigma). \] -There's no direct way to encode this in Stan. +There's no direct way to encode this in Stan. A full database type structure could be used, as in the sparse example, but this is inefficient, wasting space for unnecessary @@ -6127,7 +6127,7 @@ \section{Ragged Data Structures}\label{ragged-data-structs.section} data structure indicating the sizes of each subarray. This is indicated on the right of \reffigure{ragged-data}. This coding uses a single array for the values and a separate array for the sizes of each -row. +row. The model can then be coded up using slicing operations as follows. \begin{quote} @@ -6181,7 +6181,7 @@ \section{Soft $K$-Means} $D$-dimensional vectors. Specifically, there will be $N$ items to be clustered, each represented as a vector $y_n \in \reals^D$. In the ``soft'' version of $K$-means, the assignments to clusters will be -probabilistic. +probabilistic. \subsection{Geometric Hard $K$-Means Clustering} @@ -6192,7 +6192,7 @@ \subsection{Geometric Hard $K$-Means Clustering} \begin{enumerate} \item For each $n$ in $1:N$, randomly assign vector $y_n$ to a cluster in $1{:}K$; \item Repeat -\begin{enumerate} +\begin{enumerate} \item For each cluster $k$ in $1{:}K$, compute the cluster centroid $\mu_k$ by averaging the vectors assigned to that cluster; \item For each $n$ in $1:N$, reassign $y_n$ to the cluster $k$ @@ -6272,7 +6272,7 @@ \subsection{Stan Implementation of Soft $K$-Means} real soft_z[N, K]; // log unnormalized clusters for (n in 1:N) for (k in 1:K) - soft_z[n, k] = neg_log_K + soft_z[n, k] = neg_log_K - 0.5 * dot_self(mu[k] - y[n]); } model { @@ -6305,7 +6305,7 @@ \subsection{Generalizing Soft $K$-Means} geometries. For instance, replacing the normal distribution with the double exponential (Laplace) distribution produces a clustering model based on $L_1$ distance (i.e., Manhattan or taxicab -distance). 
+distance). Within the multivariate normal version of $K$-means, replacing the unit covariance matrix with a shared covariance matrix amounts to @@ -6339,7 +6339,7 @@ \subsection{Non-Identifiability} parameter in soft $K$-means is not identified, leading to problems in monitoring convergence. Clusters can even fail to be identified within a single chain, with indices swapping if the chain is long -enough or the data is not cleanly separated. +enough or the data is not cleanly separated. \subsection{Multimodality} @@ -6376,12 +6376,12 @@ \section{Naive Bayes Classification and Clustering} Multinomial mixture models are referred to as ``naive Bayes'' because they are often applied to classification problems where the -multinomial independence assumptions are clearly false. +multinomial independence assumptions are clearly false. Naive Bayes classification and clustering can be applied to any data with multinomial structure. A typical example of this is natural language text classification and clustering, which is used an example -in what follows. +in what follows. The observed data consists of a sequence of $M$ documents made up of bags of words drawn from a vocabulary of $V$ distinct words. A @@ -6398,7 +6398,7 @@ \section{Naive Bayes Classification and Clustering} z_m \sim \distro{Categorical}(\theta). \] The $K$-simplex parameter $\theta$ represents the prevalence of each -category in the data. +category in the data. Next, the words in each document are generated conditionally independently of each other and the words in other documents based on @@ -6443,7 +6443,7 @@ \subsection{Coding Ragged Arrays} % The relevant variables for the program are \code{N}, the total number of words in all the documents, the word array \code{w}, and the -document identity array \code{doc}. +document identity array \code{doc}. \subsection{Estimation with Category-Labeled Training Data} @@ -6475,7 +6475,7 @@ \subsection{Estimation with Category-Labeled Training Data} } model { theta ~ dirichlet(alpha); - for (k in 1:K) + for (k in 1:K) phi[k] ~ dirichlet(beta); for (m in 1:M) z[m] ~ categorical(theta); @@ -6486,12 +6486,12 @@ \subsection{Estimation with Category-Labeled Training Data} % Note that the topic identifiers $z_m$ are declared as data and the latent category assignments are included as part of the likelihood -function. +function. \subsection{Estimation without Category-Labeled Training Data} Naive Bayes models can be used in an unsupervised fashion to cluster -multinomial-structured data into a fixed number $K$ of categories. +multinomial-structured data into a fixed number $K$ of categories. The data declaration includes the same variables as the model in the previous section excluding the topic labels \code{z}. 
Because \code{z} is discrete, it needs to be summed out of the model @@ -6502,13 +6502,13 @@ \subsection{Estimation without Category-Labeled Training Data} \begin{array}{l} \log p(w_{m,1},\ldots,w_{m,N_m}|\theta,\phi) \\[2pt] -\ \ \ = \ -\log \sum_{k=1}^K +\ \ \ = \ +\log \sum_{k=1}^K \left( \distro{Categorical}(k|\theta) \times \prod_{n=1}^{N_m} \distro{Categorical}(w_{m,n}|\phi_k) \right) \\[6pt] -\ \ \ = \ +\ \ \ = \ \log \sum_{k=1}^K \exp \left( \log \distro{Categorical}(k|\theta) + \sum_{n=1}^{N_m} \log \distro{Categorical}(w_{m,n}|\phi_k) @@ -6526,12 +6526,12 @@ \subsection{Estimation without Category-Labeled Training Data} theta ~ dirichlet(alpha); for (k in 1:K) phi[k] ~ dirichlet(beta); - for (m in 1:M) - for (k in 1:K) + for (m in 1:M) + for (k in 1:K) gamma[m, k] = categorical_lpmf(k | theta); for (n in 1:N) for (k in 1:K) - gamma[doc[n], k] = gamma[doc[n], k] + gamma[doc[n], k] = gamma[doc[n], k] + categorical_lpmf(w[n] | phi[k]); for (m in 1:M) target += log_sum_exp(gamma[m]); @@ -6549,7 +6549,7 @@ \subsection{Estimation without Category-Labeled Training Data} \[ \mbox{Pr}[z_m = k|w,\alpha,\beta] = -\exp \left( +\exp \left( \gamma_{m,k} - \log \sum_{k=1}^K \exp \left( \gamma_{m,k} \right) \right). @@ -6644,10 +6644,10 @@ \subsection{Summing out the Discrete Parameters} % \begin{eqnarray*} p(\theta,\phi|w,\alpha,\beta) -& \propto & +& \propto & p(\theta|\alpha) \times p(\phi|\beta) \times p(w|\theta,\phi) \\[4pt] -& = & +& = & \prod_{m=1}^M p(\theta_m|\alpha) \times \prod_{k=1}^K p(\phi_k|\beta) @@ -6672,16 +6672,16 @@ \subsection{Summing out the Discrete Parameters} \begin{array}{l} \log p(\theta,\phi|w,\alpha,\beta) \\[6pt] -{ } \ \ +{ } \ \ \begin{array}{l} { } = \sum_{m=1}^M \log \distro{Dirichlet}(\theta_m|\alpha) \ + \ \sum_{k=1}^K \log \distro{Dirichlet}(\phi_k|\beta) \\[6pt] { } \ \ \ \ \ -+ \sum_{m=1}^M \sum_{n=1}^{N[m]} \log \left( ++ \sum_{m=1}^M \sum_{n=1}^{N[m]} \log \left( \sum_{z=1}^K - \distro{Categorical}(z|\theta_m) + \distro{Categorical}(z|\theta_m) \times \distro{Categorical}(w_{m,n}|\phi_z) \right) \end{array} @@ -6711,28 +6711,28 @@ \subsection{Implementation of LDA} simplex[V] phi[K]; // word dist for topic k } model { - for (m in 1:M) + for (m in 1:M) theta[m] ~ dirichlet(alpha); // prior - for (k in 1:K) + for (k in 1:K) phi[k] ~ dirichlet(beta); // prior for (n in 1:N) { real gamma[K]; - for (k in 1:K) + for (k in 1:K) gamma[k] = log(theta[doc[n], k]) + log(phi[k, w[n]]); - target += log_sum_exp(gamma)); // likelihood; + target += log_sum_exp(gamma); // likelihood; } } \end{stancode} % As in the other mixture models, the log-sum-of-exponents function is -used to stabilize the numerical arithmetic. +used to stabilize the numerical arithmetic. \subsection{Correlated Topic Model} To account for correlations in the distribution of topics for documents, \citep{BleiLafferty:2007} introduced a variant of LDA in which the Dirichlet prior on the per-document topic distribution is -replaced with a multivariate logistic normal distribution. +replaced with a multivariate logistic normal distribution. The authors treat the prior as a fixed hyperparameter. They use an $L_1$-regularized estimate of covariance, which is equivalent to the @@ -6752,7 +6752,7 @@ \subsubsection{Fixed Hyperparameter Correlated Topic Model} data { ... data as before without alpha ... 
vector[K] mu; // topic mean - cov_matrix[K] Sigma; // topic covariance + cov_matrix[K] Sigma; // topic covariance } \end{stancode} % @@ -6781,7 +6781,7 @@ \subsubsection{Full Bayes Correlated Topic Model} By adding a prior for the mean and covariance, Stan supports full Bayesian inference for the correlated topic model. This requires -moving the declarations of topic mean \code{mu} and covariance \code{Sigma} +moving the declarations of topic mean \code{mu} and covariance \code{Sigma} from the data block to the parameters block and providing them with priors in the model. A relatively efficient and interpretable prior for the covariance matrix \code{Sigma} may be encoded as follows. @@ -6843,7 +6843,7 @@ \chapter{Gaussian Processes}\label{gaussian-processes.chapter} the function's value at a finite number of input points is a multivariate normal distribution. This makes it tractable to both fit models from finite amounts of observed data and make predictions for finitely many new data -points. +points. Unlike a simple multivariate normal distribution, which is parameterized by a mean vector and covariance matrix, a Gaussian @@ -6881,7 +6881,7 @@ \section{Gaussian Process Regression} conditioned on their inputs $x$ is Gaussian: \[ y \sim \distro{MultiNormal}(m(x), K(x | \theta)), -\] +\] where $m(x)$ is an $N$-vector and $K(x | \theta)$ is an $N \times N$ covariance matrix. The mean function $m : \reals^{N \times D} \rightarrow \reals^{N}$ can be anything, but the covariance function @@ -6897,7 +6897,7 @@ \section{Gaussian Process Regression} in this chapter, is an exponentiated quadratic function, \[ K(x | \alpha, \rho, \sigma)_{i, j} -= \alpha^2 += \alpha^2 \exp \left( - \dfrac{1}{2 \rho^2} \sum_{d=1}^D (x_{i,d} - x_{j,d})^2 \right) @@ -6908,10 +6908,10 @@ \section{Gaussian Process Regression} function with value 1 if $i = j$ and value 0 otherwise; note that this test is between the indexes $i$ and $j$, not between values $x_i$ and $x_j$. Note that this kernel is obtained through a convolution of two -independent Gaussian processes, $f_1$ and $f_2$, with kernels +independent Gaussian processes, $f_1$ and $f_2$, with kernels \[ K_1(x | \alpha, \rho)_{i, j} -= \alpha^2 += \alpha^2 \exp \left( - \dfrac{1}{2 \rho^2} \sum_{d=1}^D (x_{i,d} - x_{j,d})^2 \right) @@ -6919,14 +6919,14 @@ \section{Gaussian Process Regression} and \[ K_2(x | \sigma)_{i, j} -= += \delta_{i, j} \sigma^2, \] The addition of $\sigma^2$ on the diagonal is important to ensure the positive definiteness of the resulting matrix in the case of two identical inputs $x_i = x_j$. In statistical terms, $\sigma$ is -the scale of the noise term in the regression. +the scale of the noise term in the regression. The hyperparameter $\rho$ is the \emph{length-scale}, and corresponds to the frequency of the functions represented by the Gaussian process prior with @@ -6949,7 +6949,7 @@ \section{Gaussian Process Regression} $x_i$ and $x_j$ (i.e., the $L_2$ norm of their difference, $x_i - x_j$). This results in support for smooth functions in the process. The amount of variation in the function is controlled by the free -hyperparameters $\alpha$, $\rho$, and $\sigma$. +hyperparameters $\alpha$, $\rho$, and $\sigma$. 
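As a sketch of how the formula above translates into code for univariate inputs
\code{x} of type \code{real[N]}, the covariance matrix could be filled in directly
as follows; the specialized \code{cov\_exp\_quad} function used in the programs
later in this chapter computes the $K_1$ portion of this sum.
%
\begin{stancode}
matrix[N, N] K;
real sq_alpha = square(alpha);
real sq_sigma = square(sigma);
for (i in 1:N) {
  for (j in 1:N)
    K[i, j] = sq_alpha * exp(-0.5 * square(x[i] - x[j]) / square(rho));
  K[i, i] = K[i, i] + sq_sigma;   // the delta_{i,j} sigma^2 noise term
}
\end{stancode}
%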
Changing the notion of distance from Euclidean to taxicab distance (i.e., an $L_1$ norm) changes the support to functions which are @@ -7001,7 +7001,7 @@ \section{Simulating from a Gaussian Process} \end{stancode} % The above model can also be written more compactly using the specialized -covariance function that implements the exponentiated quadratic kernel. +covariance function that implements the exponentiated quadratic kernel. % \begin{stancode} data { @@ -7027,7 +7027,7 @@ \section{Simulating from a Gaussian Process} The input data is just the vector of inputs \code{x} and its size \code{N}. Such a model can be used with values of \code{x} evenly spaced over some interval in order to plot sample draws of functions -from a Gaussian process. +from a Gaussian process. \subsection{Multivariate Inputs} @@ -7051,7 +7051,7 @@ \subsection{Multivariate Inputs} \end{stancode} % The data is now declared as an array of vectors instead of an array of -scalars; the dimensionality \code{D} is also declared. +scalars; the dimensionality \code{D} is also declared. In the remainder of the chapter, univariate models will be used for simplicity, but any of the models could be changed to multivariate in the same way as the @@ -7119,7 +7119,7 @@ \section{Fitting a Gaussian Process}\label{fit-gp.section} \subsection{GP with a normal outcome} -The full generative model for a GP with a normal outcome, +The full generative model for a GP with a normal outcome, $y \in \R^N$, with inputs $x \in \R^N$, for a finite $N$: \begin{align*} @@ -7141,7 +7141,7 @@ \subsection{GP with a normal outcome} \left(0, K(x | \alpha, \rho) + \mathbf{I}_N \sigma^2\right) \\ \end{align*} -It can be more computationally efficient when dealing with a normal +It can be more computationally efficient when dealing with a normal outcome to integrate out the Gaussian process, because this yields a lower-dimensional parameter space over which to do inference. We'll fit both models in Stan. The former model will be referred to as the latent @@ -7159,9 +7159,9 @@ \subsection{GP with a normal outcome} The Stan program implementing the marginal likelihood GP is shown below. The program is similar to the Stan programs that implement the simulation GPs -above, but because we are doing inference on the hyperparameters, we need to +above, but because we are doing inference on the hyperparameters, we need to calculate the covariance matrix \code{K} in the model block, rather than -the transformed data block. +the transformed data block. % \footnote{The program code is available in the Stan example model repository; see \url{http://mc-stan.org/documentation}.} @@ -7189,7 +7189,7 @@ \subsection{GP with a normal outcome} // diagonal elements for (n in 1:N) - K[n, n] = K[n, n] + sq_sigma; + K[n, n] = K[n, n] + sq_sigma; L_K = cholesky_decompose(K); @@ -7210,7 +7210,7 @@ \subsection{GP with a normal outcome} consists of the priors for the hyperparameters and the multivariate Cholesky-parameterized normal likelihood, only now the value \code{y} is known and the covariance matrix \code{K} is an unknown dependent on the -hyperparameters, allowing us to learn the hyperparameters. +hyperparameters, allowing us to learn the hyperparameters. 
We have used the Cholesky parameterized \distro{MultiNormal} rather than the standard \distro{MultiNormal} because it allows us to the @@ -7256,7 +7256,7 @@ \subsubsection{Latent variable GP} // diagonal elements for (k in 1:N) - K[k, k] = K[k, k] + delta; + K[k, k] = K[k, k] + delta; L_K = cholesky_decompose(K); f = L_K * eta; @@ -7300,9 +7300,9 @@ \subsubsection{Poisson GP} % \begin{stancode} data { -... +... int y[N]; -... +... } ... parameters { @@ -7343,9 +7343,9 @@ \subsubsection{Logistic Gaussian Process Regression} % \begin{stancode} data { -... +... int z[N]; -... +... } ... model { @@ -7362,7 +7362,7 @@ \subsection{Automatic Relevance Determination} covariance function can be further generalized by fitting a scale parameter $\rho_d$ for each dimension $d$, \[ - k(x | \alpha, \vec{\rho}, \sigma)_{i, j} = \alpha^2 \exp + k(x | \alpha, \vec{\rho}, \sigma)_{i, j} = \alpha^2 \exp \left(-\dfrac{1}{2} \sum_{d=1}^D \dfrac{1}{\rho_d^2} (x_{i,d} - x_{j,d})^2 \right) @@ -7391,7 +7391,7 @@ \subsection{Automatic Relevance Determination} % \begin{stancode} -functions { +functions { matrix L_cov_exp_quad_ARD(real[] x, real alpha, vector[] rho, @@ -7404,7 +7404,7 @@ \subsection{Automatic Relevance Determination} K[i,i] = sq_alpha + delta; for (j in (i + 1):N) { K[i, j] = sq_alpha - * exp(inv_half + * exp(inv_half * dot_self((x[i] - x[j]) ./ rho)); K[j, i] = K[i, j]; } @@ -7450,13 +7450,13 @@ \subsection{Priors for Gaussian Process Parameters}\label{priors-gp.section} Formulating priors for GP hyperparameters requires the analyst to consider the inherent statistical properties of a GP, the GP's purpose in the model, and the -numerical issues that may arise in Stan when estimating a GP. +numerical issues that may arise in Stan when estimating a GP. Perhaps most importantly, the parameters $\rho$ and $\alpha$ are weakly identified \citep{zhang-gp:2004}. The ratio of the two parameters is well-identified, but in practice we put independent priors on the two hyperparameters because these two quantities are more interpretable than -their ratio. +their ratio. \subsubsection{Priors for length-scale} @@ -7470,7 +7470,7 @@ \subsubsection{Priors for length-scale} accurately sample using Euclidean HMC. We may wish to put further soft constraints on the length-scale, but these are -dependent on how the GP is used in our statistical model. +dependent on how the GP is used in our statistical model. If our model consists of only the GP, i.e.: % @@ -7478,14 +7478,14 @@ \subsubsection{Priors for length-scale} f & \sim \distro{MultiNormal}\left(0, K(x | \alpha, \rho)\right) \\ y_i & \sim \distro{Normal}(f_i, \sigma) \, \forall i \in \{1, \dots, N\} \\ & x \in \reals^{N \times D}, \, f \in \reals^N -\end{align*} +\end{align*} % we likely don't need constraints beyond penalizing small length-scales. We'd like to allow the GP prior to represent both high-frequency and low-frequency functions, so our prior should put non-negligible mass on both sets of functions. In this case, an inverse gamma, \code{inv\_gamma\_lpdf} in Stan's language, will work well as it has a sharp left tail that puts negligible mass -on length-scales, but a generous right tail, allowing for large length-scales. +on length-scales, but a generous right tail, allowing for large length-scales. 
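As a concrete instance, such a prior can be coded directly in the model block; the
shape and scale values below are purely illustrative placeholders, to be chosen in
light of the scale of the inputs.
%
\begin{stancode}
rho ~ inv_gamma(5, 5);   // sharp left tail, heavy right tail; values are placeholders
\end{stancode}
%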
If we're using the GP as a component in a larger model that includes an overall mean and fixed effects for the same variables we're using as the domain for the @@ -7496,7 +7496,7 @@ \subsubsection{Priors for length-scale} \sim \distro{Normal}(\beta_0 + x_i \beta_{[1:D]} + f_i, \sigma) \, \forall i \in \{1, \dots, N\} \\ & x_i^T, \beta_{[1:D]} \in \reals^D,\, x \in \reals^{N \times D},\, f \in \reals^N -\end{align*} +\end{align*} % we'll likely want to constrain large length-scales as well. A length scale that is larger than the scale of the data yields a GP posterior that is @@ -7512,14 +7512,14 @@ \subsubsection{Priors for length-scale} f(x | a, b, p) & = \dfrac{(a/b)^{p/2}}{2K_p(\sqrt{ab})} x^{p - 1}\exp(-(ax + b / x)/2) \\ & x, a, b \in \reals^{+}, \, p \in \mathbb{Z} -\end{align*} +\end{align*} % which has an inverse gamma left tail if $p \leq 0$ and a Gaussian right tail. This has not yet been implemented in Stan's math library, but it is possible to implement as a user defined function: \begin{stancode} functions { - real generalized_inverse_gaussian_lpdf(real x, int p, + real generalized_inverse_gaussian_lpdf(real x, int p, real a, real b) { return p * 0.5 * log(a / b) - log(2 * modified_bessel_second_kind(p, sqrt(a * b))) @@ -7555,7 +7555,7 @@ \subsubsection{Priors for marginal standard deviation} A half-$t$ or half-Gaussian prior on alpha also has the benefit of putting nontrivial prior mass around zero. This allows the GP support the zero functions and allows the possibility that the GP won't contribute to the -conditional mean of the total output. +conditional mean of the total output. \subsection{Predictive Inference with a Gaussian Process} @@ -7577,8 +7577,8 @@ \subsection{Predictive Inference with a Gaussian Process} % \begin{stancode} data { - int N1; - real x1[N1]; + int N1; + real x1[N1]; vector[N1] y1; int N2; real x2[N2]; @@ -7621,10 +7621,10 @@ \subsection{Predictive Inference with a Gaussian Process} The input vectors \code{x1} and \code{x2} are declared as data, as is the observed output vector \code{y1}. The unknown output vector \code{y2}, which corresponds to input vector \code{x2}, is declared in the generated quantities -block and will be sampled when the model is executed. +block and will be sampled when the model is executed. A transformed data block is used to combine the input vectors -\code{x1} and \code{x2} into a single vector \code{x}. +\code{x1} and \code{x2} into a single vector \code{x}. The model block declares and defines a local variable for the combined output vector \code{f}, which consists of the concatenation of the conditional mean @@ -7642,7 +7642,7 @@ \subsection{Predictive Inference with a Gaussian Process} \subsubsection{Predictive Inference in non-Gaussian GPs} -We can do predictive inference in non-Gaussian GPs in much the +We can do predictive inference in non-Gaussian GPs in much the same way as we do with Gaussian GPs. Consider the following full model for prediction using logistic Gaussian @@ -7654,8 +7654,8 @@ \subsubsection{Predictive Inference in non-Gaussian GPs} % \begin{stancode} data { - int N1; - real x1[N1]; + int N1; + real x1[N1]; vector[N1] z1; int N2; real x2[N2]; @@ -7701,12 +7701,12 @@ \subsubsection{Analytical Form of Joint Predictive Inference} Bayesian predictive inference for Gaussian processes with Gaussian observations can be sped up by deriving the posterior analytically, then directly sampling -from it. +from it. 
Jumping straight to the result, \[ p(\tilde{y}|\tilde{x},y,x) -= += \distro{Normal}(K^{\top}\Sigma^{-1}y,\ \Omega - K^{\top}\Sigma^{-1}K), \] @@ -7764,7 +7764,7 @@ \subsubsection{Analytical Form of Joint Predictive Inference} K_div_y1 = mdivide_left_tri_low(L_K, y1); K_div_y1 = mdivide_right_tri_low(K_div_y1',L_K)'; k_x1_x2 = cov_exp_quad(x1, x2, alpha, rho); - f2_mu = (k_x1_x2' * K_div_y1); + f2_mu = (k_x1_x2' * K_div_y1); v_pred = mdivide_left_tri_low(L_K, k_x1_x2); cov_f2 = cov_exp_quad(x2, alpha, rho) - v_pred' * v_pred; diag_delta = diag_matrix(rep_vector(delta,N2)); @@ -7775,8 +7775,8 @@ \subsubsection{Analytical Form of Joint Predictive Inference} } } data { - int N1; - real x1[N1]; + int N1; + real x1[N1]; vector[N1] y1; int N2; real x2[N2]; @@ -7790,7 +7790,7 @@ \subsubsection{Analytical Form of Joint Predictive Inference} real sigma; } model { - matrix[N1, N1] L_K; + matrix[N1, N1] L_K; { matrix[N1, N1] K = cov_exp_quad(x, alpha, rho); real sq_sigma = square(sigma); @@ -7820,7 +7820,7 @@ \subsection{Multiple-output Gaussian processes} f(x) & \sim \distro{GP}(m(x), K(x | \theta, \phi)) \\ K(x & | \theta) \in \reals^{M \times M}, \, f(x), \, m(x) \in \reals^M \end{align*} -where the $K(x, x^\prime | \theta, \phi)_{[m, m^\prime]}$ entry defines the +where the $K(x, x^\prime | \theta, \phi)_{[m, m^\prime]}$ entry defines the covariance between $f_m(x)$ and $f_{m^\prime}(x^\prime)(x)$. This construction of Gaussian processes allows us to learn the covariance between the output dimensions of $f(x)$. If we parameterize our kernel $K$: @@ -7837,7 +7837,7 @@ \subsection{Multiple-output Gaussian processes} parameterized by some vector $\phi$. The \distro{MatrixNormal} distribution has two covariance matrices: $K(x | -\alpha, \rho)$ to encode column covariance, and $C(\phi)$ to define row +\alpha, \rho)$ to encode column covariance, and $C(\phi)$ to define row covariance. The salient features of the \distro{MatrixNormal} are that the rows of the matrix $f$ are distributed: \begin{align*} f_{[n,]} \sim \distro{MultiNormal}(m(x)_{[n,]}, K(x | \alpha, @@ -7845,12 +7845,12 @@ \subsection{Multiple-output Gaussian processes} distributed: \begin{align*} f_{[,m]} \sim \distro{MultiNormal}(m(x)_{[,m]}, K(x | \alpha, \rho) C(\phi)_{[m,m]}) \end{align*} This also means means that $\E\left[f^T f\right]$ is equal to -$\text{trace}(K(x | \alpha, \rho)) \times C$, whereas $\E\left[ff^T\right]$ +$\text{trace}(K(x | \alpha, \rho)) \times C$, whereas $\E\left[ff^T\right]$ is $\text{trace}(C) \times K(x | \alpha, \rho)$. We can derive this using properties of expectation and the \distro{MatrixNormal} density. We should set $\alpha$ to $1.0$ because the parameter is not identified unless -we constrain $\text{trace}(C) = 1$. Otherwise, we can multiply $\alpha$ by a scalar $d$ and +we constrain $\text{trace}(C) = 1$. Otherwise, we can multiply $\alpha$ by a scalar $d$ and $C$ by $1/d$ and our likelihood will not change. % We can generate a random variable $f$ from a \distro{MatrixNormal} density in @@ -7894,7 +7894,7 @@ \subsection{Multiple-output Gaussian processes} // diagonal elements for (k in 1:N) - K[k, k] = K[k, k] + delta; + K[k, k] = K[k, k] + delta; L_K = cholesky_decompose(K); f = L_K * eta @@ -7930,7 +7930,7 @@ \chapter{Directions, Rotations, and Hyperspheres} type, the values of which determine points on a hypersphere (circle in two dimensions, sphere in three dimensions). 
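Declaring such a quantity works like any other constrained parameter declaration;
in the sketch below the size and the name \code{u} are arbitrary.
%
\begin{stancode}
parameters {
  unit_vector[3] u;   // a direction in three dimensions, i.e., a point on S^2
}
\end{stancode}
%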
-\section{Unit Vectors} +\section{Unit Vectors} The length of a vector $x \in \reals^K$ is given by \[ @@ -7976,7 +7976,7 @@ \section{Circles, Spheres, and Hyperspheres} Even though $S^{n-1}$ behaves locally like $\reals^{n-1}$, there is no way to smoothly map between them. For example, because latitude and longitude work on a modular basis (wrapping at $2\pi$ -radians in natural units), they do not produce a smooth map. +radians in natural units), they do not produce a smooth map. Like a bounded interval $(a, b)$, in geometric terms, a sphere is compact in that the distance between any two points is bounded. @@ -7998,7 +7998,7 @@ \section{Transforming to Unconstrained Parameters} the surface of the sphere. \cite{Marsaglia:1972} introduced an auxiliary variable interpretation of this mapping that provides the desired properties of uniformity; see \refsection{unit-vector} for -details. +details. \subsubsection{Warning: undefined at zero!} @@ -8025,7 +8025,7 @@ \section{Unit Vectors and Rotations} R_{\theta} = \begin{bmatrix} -\cos \theta & - \sin \theta +\cos \theta & - \sin \theta \\ \sin \theta & \cos \theta \end{bmatrix}. @@ -8075,7 +8075,7 @@ \section{Example: Simple Harmonic Oscillator} As a concrete example of a system of ODEs, consider a harmonic oscillator, which is characterized by an equilibrium position and a -restoring force proportional to the displacement with friction. +restoring force proportional to the displacement with friction. The system state will be a pair $y = (y_1, y_2)$ representing position and momentum (i.e., a point in phase space). The change in the system with respect to time is given by the following differential equations.% @@ -8085,7 +8085,7 @@ \section{Example: Simple Harmonic Oscillator} to implement the \code{rk45} solver.} % \begin{equation}\label{ode-sho.equation} -\frac{d}{dt} y_1 = y_2 +\frac{d}{dt} y_1 = y_2 \hspace*{0.5in} \frac{d}{dt} y_2 = -y_1 - \theta y_2 \end{equation} @@ -8200,7 +8200,7 @@ \section{Solving a System of Linear ODEs using a Matrix Exponential} y = \left[\begin{array}{c} y_1 \\ y_2 \\ - \end{array}\right] \ \ \ \ \ + \end{array}\right] \ \ \ \ \ A = \left[\begin{array}{cc} 0 & 1 \\ -1 & -\theta \\ @@ -8228,15 +8228,15 @@ \section{Solving a System of Linear ODEs using a Matrix Exponential} generated quantities { vector[2] y_hat[T]; matrix[2, 2] A; - + A[1, 1] = 0; A[1, 2] = 1; A[2, 1] = -1; A[2, 2] = -theta[1]; - + for (t in 1:T) y_hat[t] = matrix_exp((t - 1) * A) * y0; - + // add measurement error for (t in 1:T) { y_hat[t, 1] = y_hat[t, 1] + normal_rng(0, 0.1); @@ -8248,13 +8248,13 @@ \section{Solving a System of Linear ODEs using a Matrix Exponential} \caption{\small\it Stan program to simulate noisy measurements from a simple harmonic oscillator. The system of linear differential equations is coded as a matrix. The system parameters \code{theta} and initial - state \code{y0} are read in as data along observation times \code{ts}. + state \code{y0} are read in as data along observation times \code{ts}. The generated quantities block is used to solve the ODE for the specified - times and then add random measurement error, producing observations + times and then add random measurement error, producing observations \code{y\_hat}. Because the ODEs are linear, we can use the \code{matrix\_exp} function to solve the system. 
}\label{sho-sim-me.figure} \end{figure} - + \section{Measurement Error Models} @@ -8293,7 +8293,7 @@ \subsection{Simulating Noisy Measurements} \begin{stancode} functions { real[] sho(real t, - real[] y, + real[] y, real[] theta, real[] x_r, int[] x_i) { @@ -8380,7 +8380,7 @@ \subsection{Estimating System Parameters and Initial State} \begin{stancode} functions { real[] sho(real t, - real[] y, + real[] y, real[] theta, real[] x_r, int[] x_i) { @@ -8506,10 +8506,10 @@ \subsection{Tolerance} The relative and absolute tolerance control the accuracy of the solutions generated by the integrator. Relative tolerances are relative to the solution value, whereas absolute tolerances is the -maximum absolute error allowed in a solution. +maximum absolute error allowed in a solution. Smaller tolerances produce more accurate solutions. Smaller -tolerances also require more computation time. +tolerances also require more computation time. \subsubsection{Sensitivity Analysis} @@ -8523,4 +8523,4 @@ \subsection{Maximum Number of Steps} This can arise in MCMC when a bad jump is taken, particularly during warmup. With the non-stiff solver, this may result in jumping into a stiff region of the parameter space, which would require a very small -step size and very many steps to satisfy even modest tolerances. +step size and very many steps to satisfy even modest tolerances. diff --git a/src/docs/stan-reference/language.tex b/src/docs/stan-reference/language.tex index 6c5da264905..762e1b22161 100644 --- a/src/docs/stan-reference/language.tex +++ b/src/docs/stan-reference/language.tex @@ -247,65 +247,43 @@ \subsection{Constrained Data Types} \end{stancode} There are also special data types for structured vectors and -matrices. -% There are four constrained vector data types, \code{simplex} -% for unit simplexes, \code{unit\_vector} for unit-length vectors, -% \code{ordered} for ordered vectors of scalars and -% \code{positive\_ordered} for vectors of positive ordered -% scalars. There are specialized matrix data types \code{corr\_matrix} -% and \code{cov\_matrix} for correlation matrices (symmetric, positive -% definite, unit diagonal) and covariance matrices (symmetric, positive -% definite). The type \code{cholesky\_factor\_cov} is for Cholesky -% factors of covariance matrices (lower triangular, positive diagonal, -% product with own transpose is a covariance matrix). The type -% \code{cholesky\_factor\_corr} is for Cholesky factors of correlation -% matrices (lower triangular, positive diagonal, unit-length rows). - -Constraints provide error checking for variables defined in the \code{data}, -\code{transformed data}, \code{transformed parameters}, and -\code{generated quantities} blocks. -% -Constraints are critical for variables declared in the -\code{parameters} block, where they determine the transformation from -constrained variables (those satisfying the declared constraint) to -unconstrained variables (those ranging over all of $\mathbb{R}^n$). +matrices. There are four constrained vector data types, \code{simplex} +for unit simplexes, \code{unit\_vector} for unit-length vectors, +\code{ordered} for ordered vectors of scalars and +\code{positive\_ordered} for vectors of positive ordered +scalars. There are specialized matrix data types \code{corr\_matrix} +and \code{cov\_matrix} for correlation matrices (symmetric, positive +definite, unit diagonal) and covariance matrices (symmetric, positive +definite). 
The type \code{cholesky\_factor\_cov} is for Cholesky +factors of covariance matrices (lower triangular, positive diagonal, +product with own transpose is a covariance matrix). The type +\code{cholesky\_factor\_corr} is for Cholesky factors of correlation +matrices (lower triangular, positive diagonal, unit-length rows). + +Constraints provide error checking for variables defined in the +\code{data}, \code{transformed data}, \code{transformed parameters}, +and \code{generated quantities} blocks. Constraints are critical for +variables declared in the \code{parameters} block, where they +determine the transformation from constrained variables (those +satisfying the declared constraint) to unconstrained variables (those +ranging over all of $\mathbb{R}^n$). It is worth calling out the most important aspect of constrained data types: % \begin{quote} -\it -The model must have support (non-zero density) at every value of the -parameters that meets their declared constraints. + \it The model must have support (non-zero density, equivalently + finite log density) at every value of the parameters that meets + their declared constraints. \end{quote} % -If the declared parameter constraints are less strict than the -support, the samplers and optimizers may have any of a number of -pathologies including just getting stuck, failure to initialize, -excessive Metropolis rejection, or biased samples due to inability to -explore the tails of the distribution. - -% Integer or real types may be constrained with lower bounds, upper -% bounds, or both. There are four constrained vector data types, -% \code{simplex} for unit simplexes, \code{unit\_vector} for unit-length -% vectors, \code{ordered} for ordered vectors of scalars and -% \code{positive\_ordered} for vectors of positive ordered scalars. -% There are specialized matrix data types \code{corr\_matrix} and -% \code{cov\_matrix} for correlation matrices (symmetric, positive -% definite, unit diagonal) and covariance matrices (symmetric, positive -% definite). The type \code{cholesky\_factor\_cov} is for Cholesky -% factors of covariance matrices (lower triangular, positive diagonal, -% product with own transpose is a covariance matrix). The type -% \code{cholesky\_factor\_corr} is for Cholesky factors of correlation -% matrices (lower triangular, positive diagonal, unit-length rows). - -% \subsection{Arrays} - -% Stan supports arrays of arbitrary order of any of the basic data -% types or constrained basic data types. This includes -% three-dimensional arrays of integers, one-dimensional arrays of -% positive reals, four-dimensional arrays of simplexes, one-dimensional -% arrays of row vectors, and so on. +If this condition is violated with parameter values that satisfy +declared constraints but do not have finite log density, then the +samplers and optimizers may have any of a number of pathologies +including just getting stuck, failure to initialize, excessive +Metropolis rejection, or biased samples due to inability to explore +the tails of the distribution. + \section{Primitive Numerical Data Types}\label{numerical-data-types.section} @@ -616,6 +594,15 @@ \subsection{Unit Simplexes} threshold $\epsilon$ to account for errors arising from floating-point imprecision. 
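To make the type concrete (a small sketch, not tied to any particular
example elsewhere in the manual), a simplex parameter is declared and
given a prior like any other constrained type.
%
\begin{stancode}
data {
  int<lower=1> K;
}
parameters {
  simplex[K] theta;    // theta[k] >= 0 and sum(theta) == 1
}
model {
  theta ~ dirichlet(rep_vector(2, K));
}
\end{stancode}
%
The symmetric Dirichlet concentration of 2 is only illustrative; any
prior defined on the simplex may be used.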
+In high dimensional problems, simplexes may require smaller step sizes
+in the inference algorithms in order to remain stable.  For sampling,
+this can be achieved with a higher target acceptance rate and a longer
+warmup period; for optimization, with tighter tolerances and more
+iterations.  In either case, less dispersed parameter initialization,
+or custom initialization if there are informative priors for some
+parameters, also helps.
+
+
 \subsection{Unit Vectors}
 
 A unit vector is a vector with a norm of one. For instance,
@@ -2954,28 +2941,8 @@ \subsubsection{Vectorization}
 
 \subsection{Accessing the Log Density}
 
-To increment the log density returned by the model by some value
-\code{u}, use the following statement.%
-%
-\footnote{Originally, Stan provided direct access to the log density
-  through a variable \code{lp\_\_} and then later through the
-  \code{increment\_log\_prob()} statement, but the first has been
-  removed and the latter deprecated.}
-%
-\begin{stancode}
-target += u;
-\end{stancode}
-%
-In general, \code{u} can be any expression; if it is a container, the
-log density will be incremented by the sum of the elements in the
-container.
-
 To access accumulated log density up to the current execution point,
-the function \code{get\_lp()} may be used.%
-%
-\footnote{The value of \code{lp\_\_} will only hold the Jacobian until
-  the end of the execution because it is not used in either
-  \code{increment\_log\_prob(u)} or \code{target += u}.}
+the function \code{target()} may be used.
 
 
 \section{Sampling Statements}\label{sampling-statements.section}
 
@@ -3008,7 +2975,7 @@ \section{Sampling Statements}\label{sampling-statements.section}
 y ~ dist(theta1, ..., thetaN);
 \end{stancode}
 %
-involving subexpressions \code{y1} and \code{theta1} through
+involving subexpressions \code{y} and \code{theta1} through
 \code{thetaN} (including the case where \code{N} is zero) will be
 well formed if and only if the corresponding assignment statement is
 well-formed. For densities allowing real \code{y} values, the log
@@ -3062,20 +3029,25 @@ \subsection{User-Transformed Variables}
 expression. For instance, it is legal syntactically to write
 %
 \begin{stancode}
-data {
-  real y;
+parameters {
+  real beta;
 }
 // ...
 model {
-  log(y) ~ normal(mu, sigma);
+  log(beta) ~ normal(mu, sigma);
 }
 \end{stancode}
 %
-Unfortunately, this is not enough to properly model \code{y} as having
-a lognormal distribution. The log Jacobian of the transform must be
-added to the log probability accumulator to account for the
-differential change in scale (see \refsection{change-of-variables} for
-full definitions). For the case above, the following adjustment will
+Unfortunately, this is not enough to properly model \code{beta} as
+having a lognormal distribution. Whenever a nonlinear transform is
+applied to a parameter, such as the logarithm function being applied
+to \code{beta} here, and then used on the left-hand side of a sampling
+statement or on the left of a vertical bar in a log pdf function, an
+adjustment must be made to account for the differential change in
+scale and ensure \code{beta} gets the correct distribution. The
+correction required is to add the log Jacobian of the transform to the
+target log density (see \refsection{change-of-variables} for full
+definitions).
For the case above, the following adjustment will account for the log transform.% % \footnote{Because $\log | \frac{d}{dy} \log y | = \log | 1/y | = - \log diff --git a/src/docs/stan-reference/programming.tex b/src/docs/stan-reference/programming.tex index 3c46f0452bd..043da3c2492 100644 --- a/src/docs/stan-reference/programming.tex +++ b/src/docs/stan-reference/programming.tex @@ -76,7 +76,7 @@ \subsubsection{Beta Distribution} It is often more natural to specify hyperpriors in terms of transformed parameters. In the case of the Beta, the obvious choice -for reparameterization is in terms of a mean parameter +for reparameterization is in terms of a mean parameter \[ \phi = \alpha / (\alpha + \beta) \] @@ -109,7 +109,7 @@ \subsubsection{Beta Distribution} \end{stancode} % The new parameters, \code{phi} and \code{lambda}, are declared in the -parameters block and the parameters for the Beta distribution, +parameters block and the parameters for the Beta distribution, \code{alpha} and \code{beta}, are declared and defined in the transformed parameters block. And If their values are not of interest, they could instead be defined as local variables in the model as @@ -139,7 +139,7 @@ \subsubsection{Beta Distribution} % If the variables \code{alpha} and \code{beta} are of interest, they can be defined in the transformed parameter block and then used in the -model. +model. \subsubsection{Jacobians not Necessary} @@ -147,7 +147,7 @@ \subsubsection{Jacobians not Necessary} Because the transformed parameters are being used, rather than given a distribution, there is no need to apply a Jacobian adjustment for the transform. For example, in the beta distribution example, -\code{alpha} and \code{beta} have the correct posterior distribution. +\code{alpha} and \code{beta} have the correct posterior distribution. \subsubsection{Dirichlet Priors} @@ -170,7 +170,7 @@ \subsubsection{Dirichlet Priors} % This provides essentially $K$ degrees of freedom, one for each dimension of \code{alpha}, and it is not obvious how to specify a -reasonable prior for \code{alpha}. +reasonable prior for \code{alpha}. An alternative coding is to use the mean, which is a simplex, and a total count. @@ -203,7 +203,7 @@ \subsection{Transforming Unconstrained Priors: Probit and Logit} is because inverse logit is the cumulative distribution function (cdf) for the logistic distribution, so that the logit function itself is the inverse cdf and thus maps a uniform draw in $(0, 1)$ to a -logistically-distributed quantity. +logistically-distributed quantity. Things work the same way for the probit case: if $u$ has a $\distro{Uniform}(0, 1)$ distribution, then $\Phi^{-1}(u)$ has a @@ -305,7 +305,7 @@ \subsection{Transforming Unconstrained Priors: Probit and Logit} theta_raw ~ normal(mu, sigma); \end{stancode} % -where either or both of \code{mu} and \code{sigma} can be vectors. +where either or both of \code{mu} and \code{sigma} can be vectors. \section{Changes of Variables} @@ -313,7 +313,7 @@ \section{Changes of Variables} parameter is characterized by a distribution. The standard textbook example is the lognormal distribution, which is the distribution of a variable $y > 0$ whose logarithm $\log y$ has a normal distribution. -Note that the distribution is being assigned to $\log y$. +Note that the distribution is being assigned to $\log y$. The change of variables requires an adjustment to the probability to account for the distortion caused by the transform. 
For this to work, @@ -328,7 +328,7 @@ \section{Changes of Variables} In the case of log normals, if $y$'s logarithm is normal with mean $\mu$ and deviation $\sigma$, then the distribution of $y$ is given by \[ -p(y) +p(y) \ = \ \distro{Normal}(\log y| \mu, \sigma) \ \left| \frac{d}{dy} \log y \right| \ = \ @@ -336,7 +336,7 @@ \section{Changes of Variables} \] Stan works on the log scale to prevent underflow, where \[ -\log p(y) +\log p(y) = \log \distro{Normal}(\log y| \mu, \sigma) - \log y. @@ -363,7 +363,7 @@ \section{Changes of Variables} \end{stancode} % It is important, as always, to declare appropriate constraints on -parameters; here \code{y} is constrained to be positive. +parameters; here \code{y} is constrained to be positive. It would be slightly more efficient to define a local variable for the logarithm, as follows. @@ -388,7 +388,7 @@ \subsection{Change of Variables vs.\ Transformations} and a simple variable transformation. A transformation samples a parameter, then transforms it, whereas a change of variables transforms a parameter, then samples it. Only the latter requires a -Jacobian adjustment. +Jacobian adjustment. Note that it does not matter whether the probability function is expressed using a sampling statement, such as @@ -408,7 +408,7 @@ \subsubsection{Gamma and Inverse Gamma Distribution}\label{jacobian-adjustment.s Like the log normal, the inverse gamma distribution is a distribution of variables whose inverse has a gamma distribution. This section contrasts two approaches, first with a transform, then with a change -of variables. +of variables. The transform based approach to sampling \code{y\_inv} with an inverse gamma distribution can be coded as follows. @@ -418,7 +418,7 @@ \subsubsection{Gamma and Inverse Gamma Distribution}\label{jacobian-adjustment.s real y; } transformed parameters { - real y_inv; + real y_inv; y_inv = 1 / y; } model { @@ -435,11 +435,11 @@ \subsubsection{Gamma and Inverse Gamma Distribution}\label{jacobian-adjustment.s } transformed parameters { real y; - y = 1 / y_inv; // change - target += -2 * log(y_inv) ); // adjustme; + y = 1 / y_inv; // change variables } model { y ~ gamma(2,4); + target += -2 * log(y_inv); // Jacobian adjustment; } \end{stancode} % @@ -499,7 +499,7 @@ \subsection{Multivariate Changes of Variables} transformed parameters { ... vector[K] J_diag; // diagonals of Jacobian matrix - ... + ... ... compute J[k, k] = d.v[k] / d.u[k] ... target += sum(log(J_diag)); ... @@ -514,10 +514,35 @@ \section{Vectors with Varying Bounds} Jacobians (all of which are described in \refchapter{variable-transforms}) have to be calculated in Stan. +\subsection{Varying Lower Bounds} + For example, suppose there is a vector parameter $\alpha$ with a -vector $L$ of lower bounds. The following program declares raw -unconstrained parameters, then explicitly transforms the raw -parameters to $\alpha$, accounting for the Jacobian along the way. +vector $L$ of lower bounds. The simplest way to deal with this if $L$ +is a constant is to shift a lower-bounded parameter. +% +\begin{stancode} +data { + int N; + vector[N] L; // lower bounds + ... +parameters { + vector[N] alpha_raw; + ... +transformed parameters { + vector[N] alpha = L + alpha_raw; + ... +\end{stancode} +% +The Jacobian for adding a constant is one, so its log drops out of the +log density. 
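Spelled out for concreteness, the Jacobian of the shift is the
identity,
\[
\frac{\partial \, (L + \alpha_{\mathrm{raw}})}
     {\partial \, \alpha_{\mathrm{raw}}}
\ = \ \mbox{I}_N,
\]
so its absolute determinant is 1, its log is $\log 1 = 0$, and no
\code{target +=} statement is required.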
+
+Even if the lower bound is a parameter rather than data, there is no
+Jacobian required, because the transform from $(L, \alpha_{\mathrm{raw}})$
+to $(L + \alpha_{\mathrm{raw}}, \alpha_{\mathrm{raw}})$ produces a
+Jacobian derivative matrix with a unit determinant.
+
+It's also possible to implement the transform by directly transforming
+an unconstrained parameter and accounting for the Jacobian.
 %
 \begin{stancode}
 data {
@@ -546,6 +571,33 @@ \section{Vectors with Varying Bounds}
 $\alpha_{\mathrm{raw}}$ then a revised Jacobian needs to be calculated
 taking into account the dependencies.
 
+\subsection{Varying Upper and Lower Bounds}
+
+Suppose there are lower and upper bounds that vary by parameter.
+These can be applied to shift and rescale a parameter constrained to
+$(0, 1)$.
+%
+\begin{stancode}
+data {
+  int N;
+  vector[N] L;  // lower bounds
+  vector[N] U;  // upper bounds
+  ...
+parameters {
+  vector<lower=0, upper=1>[N] alpha_raw;
+  ...
+transformed parameters {
+  vector[N] alpha = L + (U - L) .* alpha_raw;
+\end{stancode}
+%
+The expression \code{U - L} is multiplied by \code{alpha\_raw}
+elementwise to produce a vector of variables in $(0, U-L)$; adding
+$L$ then results in a variable ranging over $(L, U)$.
+
+In this case, it is important that $L$ and $U$ are constants;
+otherwise, a Jacobian would be required when multiplying by $U - L$.
+
+
 \chapter{Custom Probability Functions}%
 \label{custom-probability-functions.chapter}
 
@@ -568,11 +620,11 @@ \subsection{Triangle Distribution}
 a density defined as follows.
 \[
 \distro{Triangle}(y | \alpha,\beta)
-=
+=
 \frac{2}{\beta - \alpha}
 \
 \left(
-1 -
+1 -
 \left|
     y - \frac{\alpha + \beta}{\beta - \alpha}
 \right|
@@ -623,7 +675,7 @@ \subsection{Triangle Distribution}
 %
 With the constraint on \code{y} in place, this is just a less
 efficient, slower, and less arithmetically stable version of the
-original program. But if the constraint on \code{y} is removed,
+original program. But if the constraint on \code{y} is removed,
 the model will compile and run without arithmetic errors, but will not
 sample properly.%
 %
@@ -651,7 +703,7 @@ \subsection{Exponential Distribution}
 This encoding will work for any \code{lambda} and \code{y}; they can
 be parameters, data, or one of each, or even local variables.
 
-The assignment statement in the previous paragraph generates
+The assignment statement in the previous paragraph generates
 \Cpp code that is very similar to that generated by the following
 sampling statement.
 %
@@ -755,7 +807,7 @@ \subsubsection{Catching errors}
 Rejection is used to flag errors that arise in inputs or in program
 state. It is far better to fail early with a localized informative
 error message than to run into problems much further downstream (as in
-rejecting a state or failing to compute a derivative).
+rejecting a state or failing to compute a derivative).
 
 The most common errors that are coded is to test that all of the
 arguments to a function are legal.
The following function takes a @@ -811,8 +863,8 @@ \subsection{Type Declarations for Functions} {\it Functions:} & {\it Locals Variables:} & {\it Nonlocal Variables:} \\ {\it Undimensioned} & {\it Unconstrained} & {\it Constrained} \\ \hline \hline -\code{int} -& \code{int} +\code{int} +& \code{int} & \code{int} \\ & & \code{int} @@ -821,8 +873,8 @@ \subsection{Type Declarations for Functions} % \\ \hline % -\code{real} -& \code{real} +\code{real} +& \code{real} & \code{real} \\ & & \code{real} @@ -832,7 +884,7 @@ \subsection{Type Declarations for Functions} \\ \hline % \code{vector} -& +& \code{vector[N]} & \code{vector[N]} \\ @@ -860,8 +912,8 @@ \subsection{Type Declarations for Functions} % \\ \hline % -\code{matrix} -& \code{matrix[M,~N]} +\code{matrix} +& \code{matrix[M,~N]} & \code{matrix[M,~N]} \\ & & \code{matrix[M,~N]} @@ -870,7 +922,7 @@ \subsection{Type Declarations for Functions} \\[4pt] & & \code{cov\_matrix[K]} \\ -& & \code{corr\_matrix[K]} +& & \code{corr\_matrix[K]} \\ & & \code{cholesky\_factor\_cov[K]} \\ @@ -919,10 +971,10 @@ \subsection{Type Declarations for Functions} % \footnote{A range of built-in validation routines is coming to Stan soon! Alternatively, the \code{reject} statement can be used to check -constraints on the simplex.} +constraints on the simplex.} % Upper or lower bounds on values or constrained types are not allowed -as return types or argument types in function declarations. +as return types or argument types in function declarations. \subsection{Array Types for Function Declarations} @@ -937,7 +989,7 @@ \subsection{Array Types for Function Declarations} % The notation \code{[\,]} is used for one-dimensional arrays (as in the return above), \code{[\,,\,]} for two-dimensional arrays, -\code{[\,,\,,\,]} for three-dimensional arrays, and so on. +\code{[\,,\,,\,]} for three-dimensional arrays, and so on. Functions support arrays of any type, including matrix and vector types. As with other types, no constraints are allowed. @@ -973,7 +1025,7 @@ \section{Functions as Statements} Void functions applied to appropriately typed arguments may be used on their own as statements. For example, the pretty-print function defined above may be applied to a covariance matrix being defined in -the transformed parameters block. +the transformed parameters block. % \begin{stancode} transformed parameters { @@ -1097,8 +1149,8 @@ \section{User-Defined Probability Functions} % \begin{stancode} functions { - real unit_normal_lpdf(real y) { - return normal_lpdf(y | 0, 1); + real unit_normal_lpdf(real y) { + return normal_lpdf(y | 0, 1); } } ... @@ -1111,7 +1163,7 @@ \section{User-Defined Probability Functions} % The ability to use the \code{unit\_normal} function as a density is keyed off its name ending in \code{\_lpdf} (names ending in -\code{\_lpmf} for probability mass functions work the same way). +\code{\_lpmf} for probability mass functions work the same way). In general, if \code{foo\_lpdf} is defined to consume $N + 1$ arguments, then @@ -1120,7 +1172,7 @@ \section{User-Defined Probability Functions} y ~ foo(theta1, ..., thetaN); \end{stancode} % -can be used as shorthand for +can be used as shorthand for % \begin{stancode} target += foo_lpdf(y | theta1, ..., thetaN); @@ -1128,7 +1180,7 @@ \section{User-Defined Probability Functions} % As with the built-in functions, the suffix \code{\_lpdf} is dropped and the first argument moves to the left of the sampling symbol (\Verb|~|) -in the sampling statement. +in the sampling statement. 
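To make the argument convention concrete, consider a hypothetical
two-parameter density coded by hand purely for illustration (Stan
provides a built-in double exponential distribution; the name and
definition here are only a sketch).
%
\begin{stancode}
functions {
  // hypothetical hand-coded double exponential density
  real my_double_exp_lpdf(real y, real mu, real sigma) {
    return -log(2 * sigma) - fabs(y - mu) / sigma;
  }
}
...
model {
  y ~ my_double_exp(mu, sigma);
  // equivalent to: target += my_double_exp_lpdf(y | mu, sigma);
}
\end{stancode}
%
Here the density is defined with three arguments, so the corresponding
sampling statement has the variate on the left of the tilde and the
remaining two arguments on the right.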
Functions ending in \code{\_lpmf} (for probability mass functions), behave exactly the same way. The difference is that the first @@ -1157,9 +1209,9 @@ \section{Documenting Functions}\label{documenting-functions.section} % \begin{stancode} /** - * Return a data matrix of specified size with rows - * corresponding to items and the first column filled - * with the value 1 to represent the intercept and the + * Return a data matrix of specified size with rows + * corresponding to items and the first column filled + * with the value 1 to represent the intercept and the * remaining columns randomly filled with unit-normal draws. * * @param N Number of rows corresponding to data items @@ -1167,7 +1219,7 @@ \section{Documenting Functions}\label{documenting-functions.section} * item. * @return Simulated predictor matrix. */ -matrix predictors_rng(int N, int K) { +matrix predictors_rng(int N, int K) { ... \end{stancode} % @@ -1190,7 +1242,7 @@ \section{Documenting Functions}\label{documenting-functions.section} % \begin{stancode} ... - * @param theta + * @param theta * @throws If any of the entries of theta is negative. */ real entropy(vector theta) { @@ -1241,7 +1293,7 @@ \section{Recursive Functions} $A^n$, which is defined for a square matrix $A$ and positive integer $n$ by \[ -A^n +A^n = \begin{cases} \ \mbox{I} & \mbox{if } n = 0, \mbox{ and} @@ -1259,7 +1311,7 @@ \section{Recursive Functions} matrix matrix_pow(matrix a, int n) { if (n == 0) return diag_matrix(rep_vector(1, rows(a))); - else + else return a * matrix_pow(a, n - 1); } \end{stancode} @@ -1308,7 +1360,7 @@ \subsubsection{Redundant Intercepts} Gibbs sampling as used in BUGS and JAGS and the Hamiltonian Monte Carlo (HMC) and no-U-turn samplers (NUTS) used by Stan.} % -Suppose there are observations $y_n$ for $n \in 1{:}N$, +Suppose there are observations $y_n$ for $n \in 1{:}N$, two intercept parameters $\lambda_1$ and $\lambda_2$, a scale parameter $\sigma > 0$, and the sampling distribution % @@ -1344,7 +1396,7 @@ \subsubsection{Redundant Intercepts} as illustrated in the left-hand figure of \reffigure{non-identifiable-density}. The ridge for this model is along the line where $\lambda_2 = \lambda_1 + c$ for some constant -$c$. +$c$. Contrast this model with a simple regression with a single intercept parameter $\mu$ and sampling distribution @@ -1374,7 +1426,7 @@ \subsubsection{Ability and Difficulty in IRT Models} % \[ p(y | \alpha, \beta) -= += p(y | \alpha + c, \beta - c). \] % @@ -1449,7 +1501,7 @@ \subsubsection{Softmax with $K$ vs. $K-1$ Parameters} The softmax function is many-to-one, which leads to a lack of identifiability of the unconstrained parameters $\alpha$. In particular, adding or subtracting a constant from each $\alpha_k$ -produces the same simplex $\theta$. +produces the same simplex $\theta$. @@ -1511,7 +1563,7 @@ \subsubsection{Pinning Parameters} \subsubsection{Adding Priors} So far, the models have been discussed as if the priors on the -parameters were improper uniform priors. +parameters were improper uniform priors. A more general Bayesian solution to these invariance problems is to impose proper priors on the parameters. This approach can be used to @@ -1572,7 +1624,7 @@ \subsubsection{Vague, Strongly Informative, and Weakly Informative Priors} Care must be used when adding a prior to resolve invariances. If the prior is taken to be too broad (i.e., too vague), the resolution is in -theory only, and samplers will still struggle. +theory only, and samplers will still struggle. 
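As a concrete sketch, the redundant-intercept example introduced
earlier in this section can be rescued by giving each intercept its
own proper prior; the scale of 10 used here is only illustrative.
%
\begin{stancode}
data {
  int<lower=0> N;
  vector[N] y;
}
parameters {
  real lambda1;
  real lambda2;
  real<lower=0> sigma;
}
model {
  lambda1 ~ normal(0, 10);
  lambda2 ~ normal(0, 10);
  y ~ normal(lambda1 + lambda2, sigma);
}
\end{stancode}
%
The priors do not remove the additive invariance from the likelihood,
but they render the posterior proper and concentrate it enough for the
samplers to explore it.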
Ideally, a realistic prior will be formulated based on substantive knowledge of the problem being modeled. Such a prior can be chosen to @@ -1610,7 +1662,7 @@ \subsection{Mixture Models} and $\mu_2$, a shared scale $\sigma > 0$, a mixture ratio $\theta \in [0,1]$, and likelihood \[ -p(y|\theta,\mu_1,\mu_2,\sigma) +p(y|\theta,\mu_1,\mu_2,\sigma) = \prod_{n=1}^N \big( \theta \, \distro{Normal}(y_n|\mu_1,\sigma) + (1 - \theta) \, \distro{Normal}(y_n|\mu_2,\sigma) \big). \] @@ -1631,7 +1683,7 @@ \subsection{Convergence Monitoring and Effective Sample Size} problem is that the posterior mean, a key ingredient in these computations, is affected by label switching, resulting in a posterior mean for $\mu_1$ that is equal to that of $\mu_2$, and a posterior -mean for $\theta$ that is always 1/2, no matter what the data is. +mean for $\theta$ that is always 1/2, no matter what the data is. \subsection{Some Inferences are Invariant} @@ -1649,7 +1701,7 @@ \subsection{Highly Multimodal Posteriors} Theoretically, this should not present a problem for inference because all of the integrals involved in posterior predictive inference will -be well behaved. The problem in practice is computation. +be well behaved. The problem in practice is computation. Being able to carry out such invariant inferences in practice is an altogether different matter. It is almost always intractable to find @@ -1686,12 +1738,12 @@ \subsubsection{Parameter Ordering Constraints} compromised. For instance, attempting to use $M$ posterior samples to estimate $\mbox{Pr}[\mu_1 > \mu_2]$, will fail, because the estimator \[ -\mbox{Pr}[\mu_1 > \mu_2] -\approx +\mbox{Pr}[\mu_1 > \mu_2] +\approx \sum_{m=1}^M \mbox{I}(\mu_1^{(m)} > \mu_2^{(m)}) \] will result in an estimate of 0 because the posterior respects the -constraint in the model. +constraint in the model. \subsubsection{Initialization around a Single Mode} @@ -1713,13 +1765,13 @@ \section{Component Collapsing in Mixture Models} It is possible for two mixture components in a mixture model to collapse to the same values during sampling or optimization. For example, a mixture of $K$ normals might devolve to have $\mu_i = -\mu_j$ and $\sigma_i = \sigma_j$ for $i \neq j$. +\mu_j$ and $\sigma_i = \sigma_j$ for $i \neq j$. This will typically happen early in sampling due to initialization in MCMC or optimization or arise from random movement during MCMC. Once the parameters match for a given draw $(m)$, it can become hard to escape because there can be a trough of low-density mass between the -current parameter values and the ones without collapsed components. +current parameter values and the ones without collapsed components. It may help to use a smaller step size during warmup, a stronger prior on each mixture component's membership responsibility. A more extreme @@ -1757,7 +1809,7 @@ \subsection{Beta-Binomial Models with Skewed Data and Weak Priors} to a posterior % \begin{eqnarray*} -p(\phi|y) +p(\phi|y) & \propto & \textstyle \distro{Beta}(\phi|0.5,0.5) \times \prod_{n=1}^N \distro{Bernoulli}(y_n|\phi) \\[4pt] @@ -1818,7 +1870,7 @@ \subsection{Separability in Logistic Regression} ``separability.'' In this case, predictive accuracy on the observed data continue to improve as $\beta_k \rightarrow \infty$, because for cases with $y_n = 1$, $x_n \beta \rightarrow \infty$ and hence -$\mbox{logit}^{-1}(x_n \beta) \rightarrow 1$. +$\mbox{logit}^{-1}(x_n \beta) \rightarrow 1$. With separability, there is no maximum to the likelihood and hence no maximum likelihood estimate. 
From the Bayesian perspective, the @@ -1866,13 +1918,13 @@ \subsection{Gibbs Sampling} Gibbs sampling proceeds in iteration $m$ by drawing % \begin{eqnarray*} -\lambda_1^{(m)} +\lambda_1^{(m)} & \sim & p(\lambda_1 \, | \, \lambda_2^{(m-1)}, \, \sigma^{(m-1)}, \, y) \\[6pt] \lambda_2^{(m)} & \sim & p(\lambda_2 \, | \, \lambda_1^{(m)}, \, \sigma^{(m-1)}, \, y) \\[6pt] -\sigma^{(m)} +\sigma^{(m)} & \sim & p(\sigma \, | \, \lambda_1^{(m)}, \, \lambda_2^{(m)}, \, y). \end{eqnarray*} % @@ -1902,7 +1954,7 @@ \subsection{Hamiltonian Monte Carlo Sampling} the correlation and the simulation will run $\lambda_1$ and $\lambda_2$ in opposite directions along the valley corresponding to the ridge in the posterior log density (see -\reffigure{non-identifiable-density}. +\reffigure{non-identifiable-density}. \subsection{No-U-Turn Sampling} @@ -1922,7 +1974,7 @@ \subsection{No-U-Turn Sampling} improper posterior! Thus the behavior of HMC in general and NUTS in particular should be reassuring in that it will clearly fail in cases of improper posteriors, resulting in a clean diagnostic of -sweeping out very large paths in the posterior. +sweeping out very large paths in the posterior. \begin{figure} % @@ -1983,10 +2035,10 @@ \subsection{No-U-Turn Sampling} % On the top is the non-identified model with improper uniform priors and likelihood $y_n \sim \distro{Normal}(\lambda_1 + \lambda_2, - \sigma)$. + \sigma)$. % In the middle is the same likelihood as the middle plus priors - $\lambda_k \sim \distro{Normal}(0,10)$. + $\lambda_k \sim \distro{Normal}(0,10)$. % On the bottom is an identified model with an improper prior, with likelihood $y_n \sim \distro{Normal}(\mu,\sigma)$. All models @@ -2065,12 +2117,12 @@ \subsection{Examples: Fits in Stan} The average number of leapfrog steps is roughly 3 in the identified model, 150 in the model identified by a weak prior, and 1400 in the non-identified model. -\item +\item The number of effective samples per second for $\mu$ is roughly 31,000 in the identified model, 1900 in the model identified with weakly informative priors, and 200 in the - non-identified model; the results are similar for $\sigma$. -\item + non-identified model; the results are similar for $\sigma$. +\item In the non-identified model, the 95\% interval for $\lambda_1$ is (-2300,6000), whereas it is only (-12,12) in the model identified with weakly informative priors. @@ -2117,7 +2169,7 @@ \section{Basic Motivation} column vector, is ambiguous. Similarly, row vectors are separated from column vectors because multiplying a row vector by a column vector produces a scalar, whereas multiplying in the opposite order -produces a matrix. +produces a matrix. The following code fragment shows all four ways to declare a two-dimensional container of size $M \times N$. @@ -2134,7 +2186,7 @@ \section{Basic Motivation} comments to the right of the declarations. Thus the only way to efficiently iterate over row vectors is to use the third declaration, but if you need linear algebra on matrices, but the only way to use -matrix operations is to use the fourth declaration. +matrix operations is to use the fourth declaration. The inefficiencies due to any manual reshaping of containers is usually slight compared to what else is going on in a Stan program @@ -2162,9 +2214,9 @@ \section{Data Type and Indexing Efficiency}\label{indexingefficiency.section} The underlying matrix and linear algebra operations are implemented in terms of data types from the Eigen \Cpp library. 
By having vectors and matrices as basic types, no conversion is necessary when invoking -matrix operations or calling linear algebra functions. +matrix operations or calling linear algebra functions. -Arrays, on the other hand, are implemented as instances of the \Cpp \ +Arrays, on the other hand, are implemented as instances of the \Cpp \ \code{std::vector} class (not to be confused with Eigen's \code{Eigen::Vector} class or Stan vectors). By implementing arrays this way, indexing is very efficient because values can be returned by @@ -2174,11 +2226,11 @@ \subsection{Matrices vs.\ Two-Dimensional Arrays} In Stan models, there are a few minor efficiency considerations in deciding between a two-dimensional array and a matrix, which may seem -interchangeable at first glance. +interchangeable at first glance. First, matrices use a bit less memory than two-dimensional arrays. This is because they don't store a sequence of arrays, but just the -data and the two dimensions. +data and the two dimensions. Second, matrices store their data in column-major order. Furthermore, all of the data in a matrix is guaranteed to be contiguous in memory. @@ -2395,7 +2447,7 @@ \chapter{Multiple Indexing and Range Indexing}\label{multi-indexing.chapter} a local variable. % \begin{stancode} -{ +{ vector[N] mu; for (n in 1:N) mu[n] = alpha[ii[n]] + beta[ii[n]] * x[n]; @@ -2410,9 +2462,9 @@ \section{Multiple Indexing} variables as indicated in the comments. % \begin{stancode} -int c[3]; +int c[3]; ... // define: c == (5, 9, 7) -int idxs[4]; +int idxs[4]; ... // define: idxs == (3, 3, 1, 2) int d[4]; d = c[idxs]; // result: d == (7, 7, 5, 9) @@ -2433,7 +2485,7 @@ \section{Multiple Indexing} example, consider the following. % \begin{stancode} -int c[2, 3]; +int c[2, 3]; ... // define: c = ((1, 3, 5), ((7, 11, 13)) int idxs[4]; ... // define: idxs = (2, 2, 1, 2) @@ -2459,17 +2511,17 @@ \section{Multiple Indexing} % The single index is applied, the one-dimensional result is determined, then the multiple index is applied to the result. That is, -\code{c[2,idxs]} evaluates to the same value as \code{c[2][idxs]}. +\code{c[2,idxs]} evaluates to the same value as \code{c[2][idxs]}. Multiple indexing can apply to more than one position of a multi-dimensional array. For instance, consider the following % \begin{stancode} -int c[2, 3]; +int c[2, 3]; ... // define: c = ((1, 3, 5), (7, 11, 13)) int idxs1[3]; ... // define: idxs1 = (2, 2, 1) -int idxs2[2]; +int idxs2[2]; ... // define: idxs2 = (1, 3) int d[3, 2]; d = c[idxs1, idxs2]; // result: d = ((7, 13), (7, 13), (1, 5)) @@ -2510,7 +2562,7 @@ \subsection{Lower and Upper Bound Indexes} % The range index \code{3:6} behaves semantically just like the multiple index \code{(3, 4, 5, 6)}. In terms of implementation, the sliced -upper and/or lower bounded indices are faster and use less memory +upper and/or lower bounded indices are faster and use less memory because they do not explicitly create a multiple index, but rather use a direct loop. They are also easier to read, so should be preferred over multiple indexes where applicable. @@ -2519,8 +2571,8 @@ \subsection{Lower or Upper Bound Indexes} It is also possible to supply just a lower bound, or just an upper bound. Writing \code{c[3:]} is just shorthand for -\code{c[3:size(c)]}. Writing \code{c[:5]} is just shorthand for -\code{c[1:5]}. +\code{c[3:size(c)]}. Writing \code{c[:5]} is just shorthand for +\code{c[1:5]}. 
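To make the shorthands concrete (following the comment style of the
earlier examples), suppose \code{c} has seven entries.
%
\begin{stancode}
int c[7];
...             // define: c == (1, 2, 3, 4, 5, 6, 7)
int d[5];
int e[5];
d = c[3:];      // result: d == (3, 4, 5, 6, 7)
e = c[:5];      // result: e == (1, 2, 3, 4, 5)
\end{stancode}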
\subsection{Full Range Indexes} @@ -2534,7 +2586,7 @@ \section{Multiple Indexing on the Left of Assignments} Multiple expressions may be used on the left-hand side of an assignment statement, where they work exactly the same way as on the -right-hand side in terms of picking out entries of a container. +right-hand side in terms of picking out entries of a container. For example, consider the following. % \begin{stancode} @@ -2548,7 +2600,7 @@ \section{Multiple Indexing on the Left of Assignments} % The result above can be worked out by noting that the assignment sets \code{a[idxs[1]]} (\code{a[3]}) to \code{c[1]} (\code{5}) and -\code{a[idxs[2]]} (\code{a[2]}) to \code{c[2]} (\code{9}). +\code{a[idxs[2]]} (\code{a[2]}) to \code{c[2]} (\code{9}). The same principle applies when there are many multiple indexes, as in the following example. @@ -2587,7 +2639,7 @@ \subsection{Assign-by-Value and Aliasing} int a[3]; ... // define: a == (5, 6, 7) a[2:3] = a[1:2]; -... // result: a == (5, 5, 6) +... // result: a == (5, 5, 6) \end{stancode} % The reason the value of \code{a} after the assignment is $(5,5,6)$ @@ -2687,8 +2739,8 @@ \subsection{Matrices with One Multiple Index} If matrices receive a single multiple index, the result is a matrix. So if \code{m} is a matrix, so is \code{m[2:4]}. In contrast, supplying a single index, \code{m[3]}, produces a row vector result. -That is, \code{m[3]} produces the same result as \code{m[3,~]} -or \code{m[3,~1:cols(m)]}. +That is, \code{m[3]} produces the same result as \code{m[3,~]} +or \code{m[3,~1:cols(m)]}. \subsection{Arrays of Vectors or Matrices} @@ -2753,16 +2805,11 @@ \section{Matrices with Parameters and Constants} \begin{stancode} transformed data { - int ii[7]; - int jj[7]; - ii[1] = 1; jj[1] = 1; - ii[2] = 2; jj[2] = 1; // skip [1, 2] and [1, 3] - ii[3] = 3; jj[3] = 1; - ii[4] = 2; jj[4] = 2; - ii[5] = 3; jj[5] = 2; - ii[6] = 2; jj[6] = 3; - ii[7] = 3; jj[7] = 3; -} + int idxs[7, + = { {1, 1}, + {2, 1}, {2, 2}, {2, 3}, + {3, 1}, {3, 2}, {3, 3} }; + ... \end{stancode} % The seven remaining parameters are declared as a vector. @@ -2770,7 +2817,7 @@ \section{Matrices with Parameters and Constants} \begin{stancode} parameters { vector[7] A_raw; -} + ... \end{stancode} % Then the full matrix \code{A} is constructed in the model block as a @@ -2779,17 +2826,17 @@ \section{Matrices with Parameters and Constants} \begin{stancode} model { matrix[3, 3] A; - A[ii, jj] = A_raw; + for (i in 1:7) + A[idxs[i, 1], idxs[i, 2]] = A_raw[i]; A[1, 2] = 0; A[1, 3] = 0; -} + ... \end{stancode} - +% This may seem like overkill in this setting, but in more general -settings, the matrix size, vector size, and the \code{ii} and -\code{jj} array and size can be coded up as data along with the values -to fill in for the constant entries. Similar techniques can be used -to build up matrices with ad-hoc constraints, such as a handful of +settings, the matrix size, vector size, and the \code{idxs} array will +be too large to code directly. Similar techniques can be used to +build up matrices with ad-hoc constraints, such as a handful of entries known to be positive. @@ -2860,7 +2907,7 @@ \section{Statistical vs.\ Computational Efficiency} statistical efficiency for Stan programs fit with sampling-based algorithms. Computational efficiency measures the amount of time or memory required for a given step in a calculation, such as an -evaluation of a log posterior or penalized likelihood. +evaluation of a log posterior or penalized likelihood. 
Statistical efficiency typically involves requiring fewer steps in algorithms by making the statistical formulation of a model better @@ -2975,7 +3022,7 @@ \section{Well-Specified Models} intended runs slowly or may even have convergence and mixing issues. While some of the techniques recommended in the remaining sections of this chapter may mitigate the problem somewhat, the best remedy is a -better model specification. +better model specification. Somewhat counterintuitively, more complicated models often run faster than simpler models. One common pattern is with a group of parameters @@ -2995,7 +3042,7 @@ \section{Reparameterization}\label{reparameterization.section} through reparameterization. In some cases, reparameterization can dramatically increase effective sample size for the same number of iterations or even make programs that would not converge well -behaved. +behaved. \subsection{Example: Neal's Funnel} @@ -3021,7 +3068,7 @@ \subsection{Example: Neal's Funnel} Neal's funnel for the upper-level variable $y$ and one lower-level variable $x_1$ (see the text for the formula). The blue region has log density greater than -8, the yellow region density greater than - -16, and the gray background a density less than -16. + -16, and the gray background a density less than -16. (Right) 4000 draws from a run of Stan's sampler with default settings. Both plots are restricted to the shown window of $x_1$ and $y$ values; some draws fell outside of @@ -3052,7 +3099,7 @@ \subsection{Example: Neal's Funnel} The funnel can be implemented directly in Stan as follows. % \begin{stancode} -parameters { +parameters { real y; vector[9] x; } @@ -3074,7 +3121,7 @@ \subsection{Example: Neal's Funnel} the following more efficient form. % \begin{stancode} -parameters { +parameters { real y_raw; vector[9] x_raw; } @@ -3082,12 +3129,12 @@ \subsection{Example: Neal's Funnel} real y; vector[9] x; - y = 3.0 * y_raw; + y = 3.0 * y_raw; x = exp(y/2) * x_raw; } model { - y_raw ~ normal(0, 1); // implies y ~ normal(0, 3) - x_raw ~ normal(0, 1); // implies x ~ normal(0, exp(y/2)) + y_raw ~ normal(0, 1); // implies y ~ normal(0, 3) + x_raw ~ normal(0, 1); // implies x ~ normal(0, exp(y/2)) } \end{stancode} % @@ -3139,7 +3186,7 @@ \subsection{Reparameterizing the Cauchy} Thus if the random variable $Y$ has a unit uniform distribution, $Y \sim \distro{Uniform}(0,1)$, then $F^{-1}_X(Y)$ has a Cauchy distribution with location $\mu$ and scale $\tau$, i.e., $F^{-1}_X(Y) \sim -\distro{Cauchy}(\mu,\tau)$. +\distro{Cauchy}(\mu,\tau)$. Consider a Stan program involving a Cauchy-distributed parameter \code{beta}. @@ -3167,7 +3214,7 @@ \subsection{Reparameterizing the Cauchy} transformed parameters { real beta; beta = mu + tau * tan(beta_unif); // beta ~ cauchy(mu, tau) -} +} model { beta_unif ~ uniform(-pi()/2, pi()/2); // not necessary ... @@ -3189,7 +3236,7 @@ \subsection{Reparameterizing the Cauchy} needs the total log probability up to an additive constant. Stan will spend some time checking that that \code{beta\_unif} is between \code{-pi()/2} and \code{pi()/2}, but this condition is guaranteed by -the constraints in the declaration of \code{beta\_unif}. +the constraints in the declaration of \code{beta\_unif}. \subsection{Reparameterizing a Student-t Distribution} @@ -3217,8 +3264,8 @@ \subsection{Reparameterizing a Student-t Distribution} $\mbox{\sf Student-t}(\nu, 0, 1)$, i.e., \[ \mbox{\sf Student-t}(\beta | \nu,0,1). 
-= -\int_0^{\infty} += +\int_0^{\infty} \, \mbox{\sf Normal}(\beta | 0, 1 / \tau^{-\frac{1}{2}}) \times @@ -3267,7 +3314,7 @@ \subsection{Reparameterizing a Student-t Distribution} parameters { real nu; real tau; - real alpha; + real alpha; ... transformed parameters { real beta; @@ -3279,7 +3326,7 @@ \subsection{Reparameterizing a Student-t Distribution} tau ~ gamma(half_nu, half_nu); alpha ~ normal(0, 1); ... -\end{stancode} +\end{stancode} % Although set to \code{0} here, in most cases, the lower bound for the degrees of freedom parameter \code{nu} can be set to \code{1} or @@ -3293,7 +3340,7 @@ \subsection{Hierarchical Models and the Non-Centered Parameterization} Unfortunately, the usual situation in applied Bayesian modeling involves complex geometries and interactions that are not known analytically. Nevertheless, reparameterization can still be very -effective for separating parameters. +effective for separating parameters. \subsubsection{Centered parameterization} @@ -3303,7 +3350,7 @@ \subsubsection{Centered parameterization} % \begin{stancode} parameters { - real mu_beta; + real mu_beta; real sigma_beta; vector[K] beta; ... @@ -3353,7 +3400,7 @@ \subsection{Non-Centered Parameterization} // implies: beta ~ normal(mu_beta, sigma_beta) beta = mu_beta + sigma_beta * beta_raw; model { - beta_raw ~ normal(0, 1); + beta_raw ~ normal(0, 1); ... \end{stancode} % @@ -3362,19 +3409,19 @@ \subsection{Non-Centered Parameterization} Reparameterization of hierarchical models is not limited to the normal distribution, although the normal distribution is the best candidate -for doing so. In general, any distribution of parameters in the +for doing so. In general, any distribution of parameters in the location-scale family is a good candidate for reparameterization. Let $\beta = l + s\alpha$ where $l$ is a location parameter and $s$ is a scale parameter. Note that $l$ need not be the mean, $s$ need not be the standard deviation, and neither the mean nor the standard deviation need to exist. If $\alpha$ and $\beta$ are from the same -distributional family but $\alpha$ has location zero and unit scale, +distributional family but $\alpha$ has location zero and unit scale, while $\beta$ has location $l$ and scale $s$, then that distribution is a location-scale distribution. Thus, if $\alpha$ were a parameter and $\beta$ were a transformed parameter, then a prior distribution -from the location-scale family on $\alpha$ with location zero and unit +from the location-scale family on $\alpha$ with location zero and unit scale implies a prior distribution on $\beta$ with location $l$ and -scale $s$. Doing so would reduce the dependence between $\alpha$, +scale $s$. Doing so would reduce the dependence between $\alpha$, $l$, and $s$. There are several univariate distributions in the location-scale @@ -3382,9 +3429,9 @@ \subsection{Non-Centered Parameterization} cases of the Cauchy distribution (with one degree of freedom) and the normal distribution (with infinite degrees of freedom). As shown above, if $\alpha$ is distributed standard normal, then $\beta$ is distributed -normal with mean $\mu = l$ and standard deviation $\sigma = s$. The -logistic, the double exponential, the generalized extreme value -distributions, and the stable distribution are also in the +normal with mean $\mu = l$ and standard deviation $\sigma = s$. The +logistic, the double exponential, the generalized extreme value +distributions, and the stable distribution are also in the location-scale family. 
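For example, the same trick applies directly to the double exponential
distribution (a sketch following the pattern of the normal case above,
with illustrative variable names).
%
\begin{stancode}
parameters {
  real mu_beta;
  real<lower=0> sigma_beta;
  vector[K] beta_raw;
  ...
transformed parameters {
  vector[K] beta = mu_beta + sigma_beta * beta_raw;
  ...
model {
  beta_raw ~ double_exponential(0, 1);
  // implies: beta ~ double_exponential(mu_beta, sigma_beta)
  ...
\end{stancode}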
Also, if $z$ is distributed standard normal, then $z^2$ is distributed @@ -3399,7 +3446,7 @@ \subsection{Non-Centered Parameterization} \subsection{Multivariate Reparameterizations} -The benefits of reparameterization are not limited to univariate +The benefits of reparameterization are not limited to univariate distributions. A parameter with a multivariate normal prior distribution is also an excellent candidate for reparameterization. Suppose you intend the prior for $\beta$ to be multivariate normal with mean vector $\mu$ @@ -3424,10 +3471,10 @@ \subsection{Multivariate Reparameterizations} be unknown parameters, in which case their priors would be unaffected by a reparameterization of \Verb|beta|. -If $\alpha$ has the same dimensions as $\beta$ but the elements of -$\alpha$ are independently and identically distributed standard normal -such that $\beta = \mu + L\alpha$, where $LL^\top = \Sigma$, then -$\beta$ is distributed multivariate normal with mean vector $\mu$ and +If $\alpha$ has the same dimensions as $\beta$ but the elements of +$\alpha$ are independently and identically distributed standard normal +such that $\beta = \mu + L\alpha$, where $LL^\top = \Sigma$, then +$\beta$ is distributed multivariate normal with mean vector $\mu$ and covariance matrix $\Sigma$. One choice for $L$ is the Cholesky factor of $\Sigma$. Thus, the model above could be reparameterized as follows. % @@ -3446,10 +3493,10 @@ \subsection{Multivariate Reparameterizations} ... transformed parameters { vector[K] beta; - beta = mu + L * alpha; + beta = mu + L * alpha; } model { - alpha ~ normal(0, 1); + alpha ~ normal(0, 1); // implies: beta ~ multi_normal(mu, Sigma) ... \end{stancode} @@ -3459,7 +3506,7 @@ \subsection{Multivariate Reparameterizations} avoids the need to invert \Verb|Sigma| every time \Verb|multi_normal| is evaluated. -The Cholesky factor is also useful when a covariance matrix is +The Cholesky factor is also useful when a covariance matrix is decomposed into a correlation matrix that is multiplied from both sides by a diagonal matrix of standard deviations, where either the standard deviations or the correlations are unknown parameters. The @@ -3502,7 +3549,7 @@ \subsection{Multivariate Reparameterizations} % This reparameterization of a multivariate normal distribution in terms of standard normal variates can be extended to other multivariate -distributions that can be conceptualized as contaminations of the +distributions that can be conceptualized as contaminations of the multivariate normal, such as the multivariate Student t and the skew multivariate normal distribution. @@ -3556,7 +3603,7 @@ \subsection{Multivariate Reparameterizations} for (i in 1:(K-1)) A[i, K] = 0; A[K, K] = sqrt(c[K]); - + for (i in 1:K) c[i] ~ chi_square(nu - i + 1); @@ -3568,7 +3615,7 @@ \subsection{Multivariate Reparameterizations} % This reparameterization is more efficient for three reasons. First, it reduces dependence among the elements of \Verb|z| and second, it -avoids the need to invert the covariance matrix, $W$ every time +avoids the need to invert the covariance matrix, $W$ every time \Verb|wishart| is evaluated. Third, if $W$ is to be used with a multivariate normal distribution, you can pass $L A$ to the more efficient \Verb|multi_normal_cholesky| function, rather than passing @@ -3578,9 +3625,9 @@ \subsection{Multivariate Reparameterizations} freedom $\nu$, then $W^{-1}$ is distributed inverse Wishart with inverse scale matrix $S^{-1}$ and degrees of freedom $\nu$. 
Thus, the previous result can be used to reparameterize the inverse Wishart distribution. -Since $W = L * A * A^{\top} * L^{\top}$, +Since $W = L * A * A^{\top} * L^{\top}$, $W^{-1} = L^{{\top}^{-1}} A^{{\top}^{-1}} A^{-1} L^{-1}$, where all four -inverses exist, but +inverses exist, but $L^{{-1}^{\top}} = L^{{\top}^{-1}}$ and $A^{{-1}^{\top}} = A^{{\top}^{-1}}$. We can slightly modify the above Stan code for this case: % @@ -3607,7 +3654,7 @@ \subsection{Multivariate Reparameterizations} ... model { matrix[K, K] A; - matrix[K, K] A_inv_L_inv; + matrix[K, K] A_inv_L_inv; int count; count = 1; for (j in 1:(K-1)) { @@ -3628,18 +3675,18 @@ \subsection{Multivariate Reparameterizations} for (i in 1:K) c[i] ~ chi_square(nu - i + 1); - z ~ normal(0, 1); // implies: crossprod(A_inv_L_inv) ~ + z ~ normal(0, 1); // implies: crossprod(A_inv_L_inv) ~ // inv_wishart(nu, L_inv' * L_inv) ... \end{stancode} % Another candidate for reparameterization is the Dirichlet distribution -with all $K$ shape parameters equal. \cite{ZyczkowskiSommers:2001} shows -that if $\theta_i$ is equal to the sum of $\beta$ independent squared -standard normal variates and $\rho_i = \frac{\theta_i}{\sum \theta_i}$, -then the $K$-vector $\rho$ is distributed Dirichlet with all shape -parameters equal to $\frac{\beta}{2}$. In particular, if $\beta = 2$, -then $\rho$ is uniformly distributed on the unit simplex. Thus, we can +with all $K$ shape parameters equal. \cite{ZyczkowskiSommers:2001} shows +that if $\theta_i$ is equal to the sum of $\beta$ independent squared +standard normal variates and $\rho_i = \frac{\theta_i}{\sum \theta_i}$, +then the $K$-vector $\rho$ is distributed Dirichlet with all shape +parameters equal to $\frac{\beta}{2}$. In particular, if $\beta = 2$, +then $\rho$ is uniformly distributed on the unit simplex. Thus, we can make $\rho$ be a transformed parameter to reduce dependence, as in: % \begin{stancode} @@ -3657,7 +3704,7 @@ \subsection{Multivariate Reparameterizations} } model { for (k in 1:K) - z[k] ~ normal(0, 1); + z[k] ~ normal(0, 1); // implies: rho ~ dirichlet(0.5 * beta * ones) ... \end{stancode} @@ -3690,20 +3737,20 @@ \subsection{Vectorizing Summations} some operation that depends on \code{n}. % \begin{stancode} -for (n in 1:N) +for (n in 1:N) total = total + foo(n,...); \end{stancode} % This code has to create intermediate representations for each -of the \code{N} summands. +of the \code{N} summands. A faster alternative is to copy the values into a vector, then apply the \code{sum()} operator, as in the following refactoring. % \begin{stancode} -{ +{ vector[N] summands; - for (n in 1:N) + for (n in 1:N) summands[n] = foo(n,...); total = sum(summands); } @@ -3739,7 +3786,7 @@ \subsection{Vectorization through Matrix Operations} } model { for (n in 1:N) { - real gamma; + real gamma; gamma = 0; for (k in 1:K) gamma = gamma + x[n, k] * beta[k]; @@ -3783,7 +3830,7 @@ \subsection{Vectorization through Matrix Operations} \Cpp matrix library. This will require enhancing Eigen to deal with mixed-type arguments, such as the type \code{double} used for constants and the algorithmic differentiation type -\code{stan::math::var} +\code{stan::math::var} used for variables.}. 
% The inefficiency of transposition could itself be mitigated somewhat by @@ -3840,7 +3887,7 @@ \subsection{Vectorized Probability Functions} } parameters { vector[K] beta; -} +} model { y ~ normal(x * beta, 1); } @@ -3848,7 +3895,7 @@ \subsection{Vectorized Probability Functions} % The variables are all declared as either matrix or vector types. The result of the matrix-vector multiplication \code{x * beta} in the -model block is a vector of the same length as \code{y}. +model block is a vector of the same length as \code{y}. The probability function documentation in \refpart{built-in-functions} indicates which of Stan's probability functions support @@ -3886,7 +3933,7 @@ \subsection{Reshaping Data for Vectorization} \end{stancode} % If there aren't many levels (values \code{ii[n]} can take), then it -behooves us to reorganize the data by group in a case like this. +behooves us to reorganize the data by group in a case like this. Rather than having a single observation vector \code{y}, there are K of them. And because Stan doesn't support ragged arrays, it means K declarations. For instance, with 5 levels, we have @@ -3922,7 +3969,7 @@ \section{Exploiting Sufficient Statistics} } model { theta ~ beta(alpha, beta); - for (n in 1:N) + for (n in 1:N) y[n] ~ bernoulli(theta); } \end{stancode} @@ -3938,7 +3985,7 @@ \section{Exploiting Sufficient Statistics} % Because truth is represented as one and falsehood as zero, the sum \code{sum(y)} of a binary vector \code{y} is equal to the number of -positive outcomes out of a total of \code{N} trials. +positive outcomes out of a total of \code{N} trials. This can be generalized to other discrete cases (one wouldn't expect continuous observations to be duplicated if they are random). Suppose @@ -3969,7 +4016,7 @@ \section{Aggregating Common Subexpressions} If an expression is calculated once, the value should be saved and reused wherever possible. That is, rather than using \code{exp(theta)} in multiple places, declare a local variable to -store its value and reuse the local variable. +store its value and reuse the local variable. Another case that may not be so obvious is with two multilevel parameters, say \code{a[ii[n]] + b[jj[n]]}. If \code{a} and \code{b} @@ -4035,8 +4082,8 @@ \section{Standardizing Predictors and Outputs} \] and the sample standard deviation of $y$ is \[ -\mbox{sd}(y) -= \left( +\mbox{sd}(y) += \left( \frac{1}{N} \sum_{n=1}^N (y_n - \bar{y})^2 \right)^{1/2}. \] @@ -4065,7 +4112,7 @@ \section{Standardizing Predictors and Outputs} } model { // priors - alpha ~ normal(0, 10); + alpha ~ normal(0, 10); beta ~ normal(0, 10); sigma ~ cauchy(0, 5); // likelihood @@ -4077,7 +4124,7 @@ \section{Standardizing Predictors and Outputs} The data block for the standardized model is identical. The standardized predictors and outputs are defined in the transformed -data block. +data block. 
% \begin{stancode} data { @@ -4097,11 +4144,11 @@ \section{Standardizing Predictors and Outputs} real sigma_std; } model { - alpha_std ~ normal(0, 10); + alpha_std ~ normal(0, 10); beta_std ~ normal(0, 10); sigma_std ~ cauchy(0, 5); for (n in 1:N) - y_std[n] ~ normal(alpha_std + beta_std * x_std[n], + y_std[n] ~ normal(alpha_std + beta_std * x_std[n], sigma_std); } \end{stancode} @@ -4118,7 +4165,7 @@ \section{Standardizing Predictors and Outputs} The original regression \[ -y_n +y_n = \alpha + \beta x_n + \epsilon_n \] has been transformed to a regression on the standardized variables, @@ -4131,66 +4178,66 @@ \section{Standardizing Predictors and Outputs} The original parameters can be recovered with a little algebra, % \begin{eqnarray*} -y_n +y_n & = & \mbox{z}_y^{-1}(\mbox{z}_y(y_n)) \\[4pt] -& = & -\mbox{z}_y^{-1} -\left( -\alpha' +& = & +\mbox{z}_y^{-1} +\left( +\alpha' + \beta' \mbox{z}_x(x_n) + \epsilon_n' \right) \\[4pt] -& = & -\mbox{z}_y^{-1} -\left( -\alpha' -+ \beta' - \left( +& = & +\mbox{z}_y^{-1} +\left( +\alpha' ++ \beta' + \left( \frac{x_n - \bar{x}}{\mbox{\small sd}(x)} \right) + \epsilon_n' \right) \\[4pt] -& = & +& = & \mbox{sd}(y) -\left( -\alpha' -+ \beta' - \left( +\left( +\alpha' ++ \beta' + \left( \frac{x_n - \bar{x}}{\mbox{\small sd}(x)} \right) + \epsilon_n' \right) + \bar{y} \\[4pt] -& = & -\left( - \mbox{sd}(y) - \left( - \alpha' +& = & +\left( + \mbox{sd}(y) + \left( + \alpha' - \beta' \frac{\bar{x}}{\mbox{\small sd}(x)} - \right) - + \bar{y} + \right) + + \bar{y} \right) + \left( - \beta' \frac{\mbox{\small sd}(y)}{\mbox{\small sd}(x)} + \beta' \frac{\mbox{\small sd}(y)}{\mbox{\small sd}(x)} \right) x_n + \mbox{sd}(y) \epsilon'_n, \end{eqnarray*} % from which the original scale parameter values can be read off, \[ -\alpha +\alpha = -\mbox{sd}(y) - \left( - \alpha' +\mbox{sd}(y) + \left( + \alpha' - \beta' \frac{\bar{x}}{\mbox{\small sd}(x)} - \right) + \right) + \bar{y}; -\ \ \ \ \ +\ \ \ \ \ \beta = \beta' \frac{\mbox{\small sd}(y)}{\mbox{\small sd}(x)}; \ \ \ \ \ \sigma = \mbox{sd}(y) \sigma'. @@ -4204,7 +4251,7 @@ \section{Standardizing Predictors and Outputs} real alpha; real beta; real sigma; - alpha = sd(y) * (alpha_std - beta_std * mean(x) / sd(x)) + alpha = sd(y) * (alpha_std - beta_std * mean(x) / sd(x)) + mean(y); beta = beta_std * sd(y) / sd(x); sigma = sd(y) * sigma_std; diff --git a/src/docs/stan-reference/stan-reference.tex b/src/docs/stan-reference/stan-reference.tex index 68296118ffa..e2e04cc879e 100644 --- a/src/docs/stan-reference/stan-reference.tex +++ b/src/docs/stan-reference/stan-reference.tex @@ -8,7 +8,7 @@ \pagestyle{plain} -\newcommand{\stanversion}{2.15.0} +\newcommand{\stanversion}{2.16} \frontmatter \include{title} diff --git a/src/docs/stan-reference/title.tex b/src/docs/stan-reference/title.tex index 52157d330b7..6406b1b4e88 100644 --- a/src/docs/stan-reference/title.tex +++ b/src/docs/stan-reference/title.tex @@ -86,12 +86,8 @@ \subsection*{Currently Active Developers} \\ {\footnotesize Stan Math, Stan, RStan} \item Jonah Sol Gabry \ (Columbia University) \\ {\footnotesize RStan, RStanArm, ShinyStan, Loo, BayesPlot} -\item Alp Kucukelbir \ (Columbia University, Fero Labs) -\\ {\footnotesize Stan, CmdStan} \item Robert L. 
Grant \ (Consultant, London) \\ {\footnotesize StataStan} -\item Dustin Tran \ (Columbia University) -\\ {\footnotesize Stan, CmdStan} \item Krzysztof Sakrejda \ (University of Massachusetts, Amherst) \\ {\footnotesize Stan Math, Stan} \item Aki Vehtari \ (Aalto University) \\ {\footnotesize Stan Math, @@ -110,6 +106,11 @@ \subsection*{Currently Active Developers} \\ {\footnotesize RStan, RStanArm} \item Sean Talts \ (Dual Space, LLC) \\ {\footnotesize Stan Math, Stan, dev ops} +\item Ben Bales \ (University of California, Santa Barbara) +\\ {\footnotesize Stan Math} +\item Ari Hartikainen \ (Aalto University) +\\ {\footnotesize PyStan} + \end{itemize} \subsection*{Development Team Alumni} @@ -118,13 +119,17 @@ \subsection*{Development Team Alumni} past, but are no longer contributing actively. \begin{itemize} -\item Matt Hoffman \ (Adobe Creative Technologies Lab) +\item Matt Hoffman \ (while at Columbia University) \\ {\footnotesize Stan Math, Stan, CmdStan} -\item Michael Malecki \ (Crunch.io, YouGov plc) +\item Michael Malecki \ (while at Columbia University) \\ {\footnotesize software and graphical design} -\item Peter Li \ (Columbia University) +\item Peter Li \ (while at Columbia University) \\ {\footnotesize Stan Math, Stan} -\item Yuanjun Guo \ (Columbia University) +\item Yuanjun Guo \ (while at Columbia University) \\ {\footnotesize Stan} +\item Alp Kucukelbir \ (while at Columbia University) +\\ {\footnotesize Stan, CmdStan} +\item Dustin Tran \ (while at Columbia University) +\\ {\footnotesize Stan, CmdStan} \end{itemize} From bcbc55e591c46f30ed80bf70f236ed31b846369c Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Mon, 19 Jun 2017 11:01:35 -0400 Subject: [PATCH 10/10] Fixing spelling errors --- src/docs/stan-reference/algorithms.tex | 4 +- src/docs/stan-reference/appendices.tex | 16 ++-- src/docs/stan-reference/distributions.tex | 8 +- src/docs/stan-reference/examples.tex | 2 +- src/docs/stan-reference/functions.tex | 6 +- src/docs/stan-reference/language.tex | 6 +- src/docs/stan-reference/process.tex | 94 +++++++++++------------ src/docs/stan-reference/programming.tex | 10 +-- 8 files changed, 71 insertions(+), 75 deletions(-) diff --git a/src/docs/stan-reference/algorithms.tex b/src/docs/stan-reference/algorithms.tex index f30b40d58d8..1b7bc6c0700 100644 --- a/src/docs/stan-reference/algorithms.tex +++ b/src/docs/stan-reference/algorithms.tex @@ -169,11 +169,11 @@ \subsection{Leapfrog Integrator} repetitions of the above three steps) will be denoted $(\rho^{*},\theta^{*})$. -The leapgrog integrator's error is on the order of $\epsilon^3$ per +The leapfrog integrator's error is on the order of $\epsilon^3$ per step and $\epsilon^2$ globally, where $\epsilon$ is the time interval (also known as the step size); \cite{LeimkuhlerReich:2004} provide a detailed analysis of numerical integration for Hamiltonian systems, -including a derivation of the error bound for the leapforg +including a derivation of the error bound for the leapfrog integrator. diff --git a/src/docs/stan-reference/appendices.tex b/src/docs/stan-reference/appendices.tex index 37e8655e2f3..b35cd6054f4 100644 --- a/src/docs/stan-reference/appendices.tex +++ b/src/docs/stan-reference/appendices.tex @@ -52,7 +52,7 @@ \section{Boost License} \end{quote} % The copyright for each Boost package is held by its developers or their -assginees. +assignees. \section{Eigen License} @@ -70,7 +70,7 @@ \section{Eigen License} \section{SUNDIALS License} Stan uses the SUNDIALS package for solving stiff differential -equations. 
SUNDIALS is distrubted under the new BSD (3-clause) license. +equations. SUNDIALS is distributed under the new BSD (3-clause) license. % \begin{quote} \url{https://opensource.org/licenses/BSD-3-Clause} @@ -583,7 +583,7 @@ \subsection{Syntactic Conventions} of \code{A}. The notation \code{A \% B}, following the Boost Spirit parser library's notation, is shorthand for \code{?(A (B A)*)}, i.e., any number of \code{A} (including zero), separated by \code{B}. A -postfixed, curly-braced number indicates a fixed number of repetions; +postfixed, curly-braced number indicates a fixed number of repetitions; e.g., \code{A\{6\}} is equivalent to a sequence of six copies of \code{A}. \subsection{Programs} @@ -862,7 +862,7 @@ \section{Warnings vs.\ Errors} errors. Error messages arise when a fatal problem occurs under the hood from which there is no way to recover. Warning messages arise in circumstances from which the underlying program can continue to -operaate. +operate. An example warning message is an informational message about ill-formed input to an underlying function, which results in a @@ -893,7 +893,7 @@ \section{Parsing and Compilation} \section{Initialization} \begin{itemize} -\item {\it vanishing density.} This message arrises when there is a +\item {\it vanishing density.} This message arises when there is a problem with initialization not providing a finite log probability. % \end{itemize} @@ -964,7 +964,7 @@ \section{\code{increment\_log\_prob} Statement} \section{\code{lp\_\_} Variable} \begin{description} -\item[Depcreated] +\item[Deprecated] The variable \code{lp\_\_} is available wherever log density increment statements are allowed (\code{target~+=} and \Verb|~| shorthand statements). @@ -1113,7 +1113,7 @@ \section{\code{if\_else} Function} otherwise. % \item[Replacement] Use the conditional operator which allows more - flexiblity in the types of \code{b} and \code{c} and is much more + flexibility in the types of \code{b} and \code{c} and is much more efficient in that it only evaluates whichever of \code{b} or \code{c} is returned. See \refsection{conditional-operator} for full details of the conditional operator. Replace @@ -1126,7 +1126,7 @@ \section{\code{if\_else} Function} \end{stancode} \end{description} -\section{\code{\#} Commens} +\section{\code{\#} Comments} \begin{description} \item[Deprecated] The use of \code{\#} for line-based comments is diff --git a/src/docs/stan-reference/distributions.tex b/src/docs/stan-reference/distributions.tex index d725717a94f..880b3119944 100644 --- a/src/docs/stan-reference/distributions.tex +++ b/src/docs/stan-reference/distributions.tex @@ -708,7 +708,7 @@ \chapter{Unbounded Discrete Distributions} \section{Negative Binomial Distribution} For the negative binomial distribution Stan uses the parameterization -described in \citet{GelmanEtAl:2013}. For alternative parametrizations, see \refsection{nbalt}. +described in \citet{GelmanEtAl:2013}. For alternative parameterizations, see \refsection{nbalt}. 
\subsubsection{Probability Mass Function} @@ -1480,7 +1480,7 @@ \subsubsection{Stan Functions} given location \farg{mu} and scale \farg{sigma}} % \fitem{real}{lognormal\_lcdf}{reals \farg{y} \textbar\ reals \farg{mu}, reals - \farg{sigma}}{The log of the lognormal cumulative distribution fucntion + \farg{sigma}}{The log of the lognormal cumulative distribution function of \farg{y} given location \farg{mu} and scale \farg{sigma}} % \fitem{real}{lognormal\_lccdf}{reals \farg{y} \textbar\ reals \farg{mu}, reals @@ -1911,7 +1911,7 @@ \subsubsection{Probability Density Function} \subsubsection{Stan Functions} \begin{description} \fitem{real}{rayleigh\_lpdf}{reals \farg{y} \textbar\ reals \farg{sigma}}{The log of the Rayleigh - ensity of \farg{y} given scale \farg{sigma}} + density of \farg{y} given scale \farg{sigma}} % \fitem{real}{rayleigh\_cdf}{real \farg{y}, real \farg{sigma}}{The Rayleigh cumulative distribution of \farg{y} given scale \farg{sigma}} @@ -2490,7 +2490,7 @@ \subsubsection{Probability Density Function} where $y_i$ is the $i$th row of $y$. This is used to efficiently handle Gaussian Processes with multi-variate outputs where only the output dimensions share a kernel function but vary based on their scale. If the model allows -parametrization in terms of Cholesky factor of the kernel matrix, this distribution +parameterization in terms of Cholesky factor of the kernel matrix, this distribution is also more efficient than $\distro{MultiGP}()$. Note that this function does not take into account the mean prediction. diff --git a/src/docs/stan-reference/examples.tex b/src/docs/stan-reference/examples.tex index 45301e8c07c..ecc58c2c25f 100644 --- a/src/docs/stan-reference/examples.tex +++ b/src/docs/stan-reference/examples.tex @@ -1581,7 +1581,7 @@ \subsubsection{Optimization through Vectorization} ... \end{stancode} % -thsi fails because it breaks the vectorization of sampling for +this fails because it breaks the vectorization of sampling for \code{beta},% % \footnote{Thanks to Mike Lawrence for pointing this out in the GitHub diff --git a/src/docs/stan-reference/functions.tex b/src/docs/stan-reference/functions.tex index 66cf89f43bf..6fa9f55d01e 100644 --- a/src/docs/stan-reference/functions.tex +++ b/src/docs/stan-reference/functions.tex @@ -376,7 +376,7 @@ \section{Log Probability Function}\label{get-lp.section} various ways by a Stan program. The variables are first transformed from unconstrained to constrained, and the log Jacobian determinant added to the log probability accumulator. Then the model block is -executed on the constrained parmeters, with each sampling statement +executed on the constrained parameters, with each sampling statement (\Verb|~|) and log probability increment statement (\code{increment\_log\_prob}) adding to the accumulator. At the end of the model block execution, the value of the log probability @@ -2474,7 +2474,7 @@ \section{Diagonal Matrix Functions} \end{description} Although the \code{diag\_matrix} function is available, it is unlikely -to ever show up in an efficient Stan program. For exmaple, rather +to ever show up in an efficient Stan program. For example, rather than converting a diagonal to a full matrix for use as a covariance matrix, % @@ -2493,7 +2493,7 @@ \section{Diagonal Matrix Functions} Rather than writing \code{m~*~diag\_matrix(v)} where \code{m} is a matrix and \code{v} is a vector, it is much more efficient to write \code{diag\_post\_multiply(m,~v)} (and similarly for pre-multiplication). 
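As a minimal sketch, with \code{m} a $K \times K$ matrix and \code{v} a
$K$-vector as in the prose above (the local variable \code{mv} is only
for illustration):
%
\begin{stancode}
{
  matrix[K, K] mv;
  // materializes a full K x K diagonal matrix before multiplying
  mv = m * diag_matrix(v);
  // scales the columns of m directly; no diagonal matrix is built
  mv = diag_post_multiply(m, v);
}
\end{stancode}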
-By the same topken, it is better to use \code{quad\_form\_diag(m,~v)} +By the same token, it is better to use \code{quad\_form\_diag(m,~v)} rather than \code{quad\_form(m,~diag\_matrix(v))}. diff --git a/src/docs/stan-reference/language.tex b/src/docs/stan-reference/language.tex index 762e1b22161..e600fa1421a 100644 --- a/src/docs/stan-reference/language.tex +++ b/src/docs/stan-reference/language.tex @@ -753,7 +753,7 @@ \subsection{Cholesky Factors of Correlation Matrices} lower-triangular matrix with positive diagonal entries and rows that are of length 1 (i.e., $\sum_{n=1}^K L_{m,n}^2 = 1$). If $L$ is a Cholesky factor for a correlation matrix, then $L\,L^{\top}$ is a -correlation matrix (i.e., symmetric postive definite with a unit +correlation matrix (i.e., symmetric positive definite with a unit diagonal). A declaration such as @@ -3039,7 +3039,7 @@ \subsection{User-Transformed Variables} \end{stancode} % Unfortunately, this is not enough to properly model \code{beta} as -having a lognormal distribution. Whenever a nonliner transform is +having a lognormal distribution. Whenever a nonlinear transform is applied to a parameter, such as the logarithm function being applied to \code{beta} here, and then used on the left-hand side of a sampling statement or on the left of a vertical bar in a log pdf function, an @@ -3155,7 +3155,7 @@ \subsubsection{Complementary cumulative distribution functions} Unlike the cdf, the ccdf is exclusive of the bound, hence the event $X > x$ rather than the cdf's event $X \leq x$. -For continous distributions, the ccdf works out to +For continuous distributions, the ccdf works out to \[ F^C_X(x) \ = \ 1 - \int_{-\infty}^x p_X(u) \, du diff --git a/src/docs/stan-reference/process.tex b/src/docs/stan-reference/process.tex index 40afd13ba40..5c9c34df1b6 100644 --- a/src/docs/stan-reference/process.tex +++ b/src/docs/stan-reference/process.tex @@ -45,7 +45,7 @@ \section{Make it Reproducible} through the models and produce whatever posterior analysis you need. Scripts can be written for the shell, R, or Python. Whatever language a script is in, it should be self contained and not depend on global -variables having been set, other data being read in, etc. +variables having been set, other data being read in, etc. See \refchapter{reproducibility} for complete information on reproducibility in Stan and its interfaces. @@ -54,7 +54,7 @@ \subsection{Scripts are Good Documentation} It may seem like overkill if running the project is only a single line of code, but the script provides not only a way to run the code, but -also a form of concrete documentation for what is run. +also a form of concrete documentation for what is run. \subsection{Randomization and Saving Seeds} @@ -87,7 +87,7 @@ \section{Make it Readable} developer will want to read it later. One of the motivations of Stan's design was to make models self-documenting in terms of variable usage (e.g., data versus parameter), types (e.g., covariance matrix -vs. unconstrained matrix) and sizes. +vs. unconstrained matrix) and sizes. A large part of readability is consistency. Particularly in naming and layout. Not only of programs themselves, but the directories and @@ -123,13 +123,13 @@ \section{Design Top-Down, Code Bottom-Up} Software projects are almost always designed top-down from one or more intended use cases. Good software coding, on the other hand, is -typically done bottom-up. +typically done bottom-up. The motivation for top-down design is obvious. 
The motivation for bottom-up development is that it is much easier to develop software using components that have been thoroughly tested. Although Stan has no built-in support for either modularity or testing, many of the same -principles apply. +principles apply. The way the developers of Stan themselves build models is to start as simply as possibly, then build up. This is true even if we have a @@ -147,12 +147,12 @@ \section{Fit Simulated Data} computationally is to generate simulated (i.e., ``fake'') data with known parameter values, then see if the model can recover these parameters from the data. If not, there is very little hope that it -will do the right thing with data from the wild. +will do the right thing with data from the wild. There are fancier ways to do this, where you can do things like run $\chi^2$ tests on marginal statistics or follow the paradigm introduced in \citep{CookGelmanRubin:2006}, which involves interval -tests. +tests. \section{Debug by Print} @@ -162,7 +162,7 @@ \section{Debug by Print} % \footnote{The ``f'' is not a typo --- it's a historical artifact of the name of the \code{printf} function used for formatted printing - in C.} + in C.} Stan supports print statements with one or more string or expression arguments. Because Stan is an imperative language, variables can have @@ -206,7 +206,7 @@ \subsection{Code Never Lies} The machine does what the code says, not what the documentation says. Documentation, on the other hand, might not match the code. Code documentation easily rots as the code evolves if the documentation is -not well maintained. +not well maintained. Thus it is always preferable to write readable code as opposed to documenting unreadable code. Every time you write a piece of @@ -225,8 +225,8 @@ \subsection{Comment Styles in Stan} \subsection{What Not to Comment} -When commenting code, it is usually safe to assume that you are -writing the comments for other programmers who understand the basics +When commenting code, it is usually safe to assume that you are +writing the comments for other programmers who understand the basics of the programming language in use. In other words, don't comment the obvious. For instance, there is no need to have comments such as the following, which add nothing to the code. @@ -392,19 +392,19 @@ \section{Source Code Management} \includegraphics[height=4.5in]{img/git-process.png} \end{center} \caption{\small\it Git branching process for master - and development branches. New features and ordinary (not hot) + and development branches. New features and ordinary (not hot) bugfixes are developed in branches from and merged back into the development branch. These are then collected into releases and merged with the master branch, which tracks the releases. Hotfix branches are like feature or ordinary - bugfix branches, but branch from master and merge back into master. + bugfix branches, but branch from master and merge back into master. Image courtesy of \citep{Driessen:2010}.}\label{git-process.figure} \end{figure} % The basic Git process for branching, releasing, hotfixing, and merging follows standard Git procedure \citep{Driessen:2010}. A diagram outlining the process is presented in \reffigure{git-process}. The key idea is that the master branch is always at the latest release, with -older commits tagged for previous releases. +older commits tagged for previous releases. The development branch always represents the current state of development. 
Feature and bugfix branches branch from the development @@ -459,7 +459,7 @@ \subsection{Unit Testing} \url{http://cran.r-project.org/web/packages/RUnit/index.html}.} The point of unit testing is to test the program at the application -programmer interface (API) level, not the end-to-end functional level. +programmer interface (API) level, not the end-to-end functional level. The tests are run automatically when pull requests are created through a continuous integration process. PyStan uses the Travis continuous @@ -491,7 +491,7 @@ \subsection{Unit Testing} archive (zip file) or may be cloned for development under Git. Cloning in Git provides a complete copy of all version history, including every commit to every branch since the beginning of the -project. +project. User feedback is accommodated through three channels. First, and most formally, there is an issue tracker for each of Stan C++, CmdStan, @@ -517,7 +517,7 @@ \subsection{Functional Testing} undergoes rigorous end-to-end testing of its model fitting functionality. Models with known answers are evaluated for both speed and accuracy. Models without analytic solutions are evaluated in terms -of MCMC error. +of MCMC error. \section{Release Cycles} @@ -538,7 +538,7 @@ \section{Release Cycles} Stan is released exclusively as source code, so nothing needs to be done with respect to binary release management or compatibility. The source is tested so that it can be used under Windows, Mac OS X, and -Linux. +Linux. Instructions for installing Stan C++, CmdStan, RStan, and PyStan are managed separately and distributed with the associated product. @@ -575,7 +575,7 @@ \subsection{Minor Version and Forward Compatibility} feature without breaking backward compatibility, its minor version number is incremented. Whenever the minor version is incremented, the patch level reverts to 0. Most Stan releases increment the minor -version. +version. \subsection{Bug Fixes and Patch Releases} @@ -616,12 +616,12 @@ \section{Availability of Current and Historical Archive Versions} first cloning a repository, it may be kept up to date using Git's pull command (available from the command-line or platform-specific graphical user interfaces). An alternative delivery mechanism is as -a zip archive of a snapshot of the system. +a zip archive of a snapshot of the system. \section{Maintenance, Support, and Retirement} Stan support extends only to the most current release. Specifically, -patches are not backported to older versions. +patches are not backported to older versions. Early fixes of bugs are available to users in the form of updated development branches. Snapshots of the entire code base at every @@ -675,7 +675,7 @@ \section{Qualified Personnel} The members of the Stan development team are drawn from multiple computational, scientific, and statistical disciplines across -academic, not-for-profit, and industrial laboratories. +academic, not-for-profit, and industrial laboratories. Most of Stan's developers have Ph.D. degrees, some have Master's degrees, and some are currently enrolled as undergraduate or graduate @@ -693,7 +693,7 @@ \section{Qualified Personnel} Institutions at which the members of the Stan development team hold appointments as of Stan release 2.15.0 include Columbia University, Adobe Creative Technologies Lab, University of Warwick, University of -Toronto (Scarborough), Dartmouth Colloge, University of Washington, +Toronto (Scarborough), Dartmouth College, University of Washington, Lucidworks, CNRS (Paris), St. 
George's, University of London, University of Massachussetts (Amherst), Aalto University, and Novartis Pharma. @@ -734,7 +734,7 @@ \section{Disaster Recovery} The entire history of the Stan C++, CmdStan, RStan, and PyStan repositories is maintained on the GitHub servers as well as on each developer's individual copy of the repository. Specifically, each -repository can be reconstituted from any of the core +repository can be reconstituted from any of the core developers' machines. @@ -788,7 +788,7 @@ \chapter{Reproducibility}\label{reproducibility.chapter} you are running in RStan, Rcpp handles the conversion between R's floating point numbers and C++ doubles. If Rcpp changes the conversion process or use different types, the results are not guaranteed to be -the same down to the bit level. +the same down to the bit level. The compiler and compiler settings can also be an issue. There is a nice discussion of the issues and how to control reproducibility in @@ -799,7 +799,7 @@ \chapter{Reproducibility}\label{reproducibility.chapter} \chapter{Contributed Modules} \noindent -Stan is an open-source project and welcomes user contributions. +Stan is an open-source project and welcomes user contributions. In order to reduce maintenance on the main trunk of Stan development and to allow developer-specified licenses, contributed Stan modules @@ -812,10 +812,10 @@ \section{Contributing a Stan Module} Stan developers either through one of the following. % \begin{itemize} -\item \code{stan-users} mailing list: +\item \code{stan-users} mailing list: \\ \url{https://groups.google.com/forum/?fromgroups#!forum/stan-users} -\item Stan e-mail: +\item Stan e-mail: \\ \href{mailto:mc.stanislaw@gmail.com}{\tt mc.stanislaw@gmail.com} \end{itemize} @@ -889,7 +889,7 @@ \section{Line Length} \section{File Extensions} -The recommended file extension for Stan model files is \code{.stan}. +The recommended file extension for Stan model files is \code{.stan}. For Stan data dump files, the recommended extension is \code{.R}, or more informatively, \code{.data.R}. @@ -947,7 +947,7 @@ \section{Local Variable Scope} } } \end{stancode} -% +% The local variable can be eliminated altogether, as follows. % \begin{stancode} @@ -974,7 +974,7 @@ \subsubsection{Scope of Compound Structures with Componentwise Assignment} model { vector[K] mu; for (n in 1:N) { - for (k in 1:K) + for (k in 1:K) mu[k] = ...; y[n] ~ multi_normal(mu,Sigma); } @@ -1002,7 +1002,7 @@ \subsection{Optional Parentheses for Single-Statement Blocks} for (n in 1:N) y[n] ~ normal(mu,1); \end{stancode} -% +% Single-statement blocks can also be written on a single line, as in the following example. % @@ -1050,7 +1050,7 @@ \subsection{Parentheses in Nested Operator Expressions} Similarly, comparison operators can usually be written with minimal bracketing, with the form \code{y[n] > 0 || x[n] != 0} preferred to -the bracketed form \code{(y[n] > 0) || (x[n] != 0)}. +the bracketed form \code{(y[n] > 0) || (x[n] != 0)}. \subsection{No Open Brackets on Own Line} @@ -1059,13 +1059,13 @@ \subsection{No Open Brackets on Own Line} section, not as follows. % \begin{stancode} -for (n in 1:N) +for (n in 1:N) { y[n] ~ normal(mu,1); } \end{stancode} % -This also goes for parameters blocks, transformed data blocks, +This also goes for parameters blocks, transformed data blocks, which should look as follows. 
% \begin{stancode} @@ -1140,9 +1140,9 @@ \section{Functions} % \begin{stancode} /** - * Return a data matrix of specified size with rows - * corresponding to items and the first column filled - * with the value 1 to represent the intercept and the + * Return a data matrix of specified size with rows + * corresponding to items and the first column filled + * with the value 1 to represent the intercept and the * remaining columns randomly filled with unit-normal draws. * * @param N Number of rows correspond to data items @@ -1150,7 +1150,7 @@ \section{Functions} * item. * @return Simulated predictor matrix. */ -matrix predictors_rng(int N, int K) { +matrix predictors_rng(int N, int K) { ... \end{stancode} % @@ -1158,7 +1158,7 @@ \section{Functions} asterisk of the open comment, and the end comment \code{*/} is also aligned on the asterisk. The tags \code{@param} and \code{@return} are used to label function arguments (i.e., parameters) and return -values. +values. \section{White Space} @@ -1192,14 +1192,14 @@ \subsection{No Tabs} may be used anywhere other whitespace occurs. Using tabs to layout a program is highly unportable because the number of spaces represented by a single tab character varies depending on which -program is doing the rendering and how it is configured. +program is doing the rendering and how it is configured. \subsection{Two-Character Indents} Stan has standardized on two space characters of indentation, which is the standard convention for C/C++ code. Another sensible choice is four spaces, which is the convention for Java and Python. Just be -consistent. +consistent. \subsection{Space Between \code{if} and Condition} @@ -1236,7 +1236,7 @@ \subsection{Breaking Expressions across Lines} \end{stancode} % Here, the multiplication operator (\code{*}) is aligned to clearly -signal the multiplicands in the product. +signal the multiplicands in the product. For function arguments, break after a comma and line the next argument up underneath as follows. @@ -1257,10 +1257,10 @@ \subsection{Optional Spaces after Commas} \subsection{Unix Newlines} -Wherever possible, Stan programs should use a single line feed -character to separate lines. All of the Stan developers (so far, at -least) work on Unix-like operating systems and using a standard -newline makes the programs easier for us to read and share. +Wherever possible, Stan programs should use a single line feed +character to separate lines. All of the Stan developers (so far, at +least) work on Unix-like operating systems and using a standard +newline makes the programs easier for us to read and share. \subsubsection{Platform Specificity of Newlines} diff --git a/src/docs/stan-reference/programming.tex b/src/docs/stan-reference/programming.tex index 043da3c2492..02c60f2f278 100644 --- a/src/docs/stan-reference/programming.tex +++ b/src/docs/stan-reference/programming.tex @@ -536,12 +536,12 @@ \subsection{Varying Lower Bounds} The Jacobian for adding a constant is one, so its log drops out of the log density. -Even if the lower bound is a paramter rather than data, there is no +Even if the lower bound is a parameter rather than data, there is no Jacobian required, because the transform from $(L, \alpha_{\mathrm raw})$ to $(L + \alpha_{\mathrm raw}, \alpha_{\mathrm raw}$ produces a Jacobian derivative matrix with a unit determinant. -It's also possible implement the transform by directly trnasforming +It's also possible implement the transform by directly transforming an unconstrained parameter and accounting for the Jacobian. 
% \begin{stancode} @@ -573,7 +573,7 @@ \subsection{Varying Lower Bounds} \subsection{Varying Upper and Lower Bounds} -Suppowe there are lower and upper bounds that vary by parameter. +Suppose there are lower and upper bounds that vary by parameter. These can be applied to shift and rescale a parameter constrained to $(0, 1)$. % @@ -4266,7 +4266,3 @@ \section{Standardizing Predictors and Outputs} \begin{stancode} y_std ~ normal(alpha_std + beta_std * x_std, sigma_std); \end{stancode} - - - -