From e9638eaee2397147761daf655a2c0d21a0fc7a59 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Sun, 16 Oct 2016 23:25:11 -0400 Subject: [PATCH 01/10] Fixes #2082. Adding test for multiple translation units and inlining remaining functions --- make/tests | 17 +++++++++ src/stan/command/stanc_helper.hpp | 12 +++---- src/stan/io/cmd_line.hpp | 36 +++++++++---------- src/stan/lang/rethrow_located.hpp | 2 +- src/test/unit/multiple_translation_units1.cpp | 5 +++ src/test/unit/multiple_translation_units2.cpp | 5 +++ .../unit/multiple_translation_units_test.cpp | 7 ++++ 7 files changed, 59 insertions(+), 25 deletions(-) create mode 100644 src/test/unit/multiple_translation_units1.cpp create mode 100644 src/test/unit/multiple_translation_units2.cpp create mode 100644 src/test/unit/multiple_translation_units_test.cpp diff --git a/make/tests b/make/tests index 91663e22106..097da960599 100644 --- a/make/tests +++ b/make/tests @@ -128,6 +128,23 @@ test/dummy.cpp: .PHONY: test-headers test-headers: $(HEADER_TESTS) +## +# Adding a test for multiple translation units. If this fails, +# a new function is probably missing an inline +## + +test/unit/multiple_translation_units%.o : src/test/unit/multiple_translation_units%.cpp + @mkdir -p $(dir $@) + $(COMPILE.c) -O$O $< $(OUTPUT_OPTION) + +test/unit/libmultiple.so : test/unit/multiple_translation_units1.o test/unit/multiple_translation_units2.o + @mkdir -p $(dir $@) + $(CC) -shared $(OUTPUT_OPTION) $^ + +src/test/unit/multiple_translation_units_test.cpp : test/unit/libmultiple.so + + + ############################################################ ## # Use the stanc compiler to generate C++ from Stan programs diff --git a/src/stan/command/stanc_helper.hpp b/src/stan/command/stanc_helper.hpp index 5745459119e..52aa98edd4e 100644 --- a/src/stan/command/stanc_helper.hpp +++ b/src/stan/command/stanc_helper.hpp @@ -11,7 +11,7 @@ #include -void print_version(std::ostream* out_stream) { +inline void print_version(std::ostream* out_stream) { if (!out_stream) return; *out_stream << "stanc version " << stan::MAJOR_VERSION @@ -25,7 +25,7 @@ void print_version(std::ostream* out_stream) { /** * Prints the Stan compiler (stanc) help. */ -void print_stanc_help(std::ostream* out_stream) { +inline void print_stanc_help(std::ostream* out_stream) { using stan::io::print_help_option; if (!out_stream) return; @@ -56,8 +56,8 @@ void print_stanc_help(std::ostream* out_stream) { "Do not fail if a function is declared but not defined"); } -void delete_file(std::ostream* err_stream, - const std::string& file_name) { +inline void delete_file(std::ostream* err_stream, + const std::string& file_name) { int deleted = std::remove(file_name.c_str()); if (deleted != 0 && file_name.size() > 0) if (err_stream) @@ -66,8 +66,8 @@ void delete_file(std::ostream* err_stream, } -int stanc_helper(int argc, const char* argv[], - std::ostream* out_stream, std::ostream* err_stream) { +inline int stanc_helper(int argc, const char* argv[], + std::ostream* out_stream, std::ostream* err_stream) { static const int SUCCESS_RC = 0; static const int EXCEPTION_RC = -1; static const int PARSE_FAIL_RC = -2; diff --git a/src/stan/io/cmd_line.hpp b/src/stan/io/cmd_line.hpp index 0d526130ea4..1ee8946e40d 100644 --- a/src/stan/io/cmd_line.hpp +++ b/src/stan/io/cmd_line.hpp @@ -24,9 +24,9 @@ namespace stan { * @param width Width of option (defaults to 20). 
* @param o Output stream ptr, default to null */ - void pad_help_option(std::ostream* o, - const std::string& option = "", - unsigned int width = 20) { + inline void pad_help_option(std::ostream* o, + const std::string& option = "", + unsigned int width = 20) { if (!o) return; *o << " " << option; int padding = width - option.size(); @@ -46,10 +46,10 @@ namespace stan { * @param msg * @param note */ - void print_help_helper(std::ostream* o, - const std::string& key_val, - const std::string& msg, - const std::string& note = "") { + inline void print_help_helper(std::ostream* o, + const std::string& key_val, + const std::string& msg, + const std::string& note = "") { if (!o) return; pad_help_option(o, key_val); *o << msg @@ -71,11 +71,11 @@ namespace stan { * @param msg * @param note */ - void print_help_option(std::ostream* o, - const std::string& key, - const std::string& value_type, - const std::string& msg, - const std::string& note = "") { + inline void print_help_option(std::ostream* o, + const std::string& key, + const std::string& value_type, + const std::string& msg, + const std::string& note = "") { std::stringstream ss; ss << "--" << key; if (value_type.size() > 0) @@ -187,7 +187,7 @@ namespace stan { * @tparam Type of value. */ template - bool val(const std::string& key, T& x) const { + inline bool val(const std::string& key, T& x) const { if (!has_key(key)) return false; std::stringstream s(key_val_.find(key)->second); @@ -210,7 +210,7 @@ namespace stan { * * @return Number of bare arguments. */ - size_t bare_size() const { + inline size_t bare_size() const { return bare_.size(); } @@ -228,7 +228,7 @@ namespace stan { * @tparam T Type of value returned. */ template - bool bare(size_t n, T& x) const { + inline bool bare(size_t n, T& x) const { if (n >= bare_.size()) return false; std::stringstream s(bare_[n]); @@ -272,7 +272,7 @@ namespace stan { // explicit instantation for std::string to allow for spaces // in bare_[n] template <> - bool cmd_line::bare(size_t n, std::string& x) const { + inline bool cmd_line::bare(size_t n, std::string& x) const { if (n >= bare_.size()) return false; x = bare_[n]; @@ -282,8 +282,8 @@ namespace stan { // explicit instantation for std::string to allow for spaces // in key_val_ template <> - bool cmd_line::val(const std::string& key, - std::string& x) const { + inline bool cmd_line::val(const std::string& key, + std::string& x) const { if (!has_key(key)) return false; x = key_val_.find(key)->second; diff --git a/src/stan/lang/rethrow_located.hpp b/src/stan/lang/rethrow_located.hpp index f238b489d99..4c35acb7ac5 100644 --- a/src/stan/lang/rethrow_located.hpp +++ b/src/stan/lang/rethrow_located.hpp @@ -84,7 +84,7 @@ namespace stan { * @param[in] line Line number in Stan source program where * exception originated. 
*/ - void rethrow_located(const std::exception& e, int line) { + inline void rethrow_located(const std::exception& e, int line) { using std::bad_alloc; // -> exception using std::bad_cast; // -> exception using std::bad_exception; // -> exception diff --git a/src/test/unit/multiple_translation_units1.cpp b/src/test/unit/multiple_translation_units1.cpp new file mode 100644 index 00000000000..20d5b08375d --- /dev/null +++ b/src/test/unit/multiple_translation_units1.cpp @@ -0,0 +1,5 @@ +#include + +stan::math::var function1() { + return 0; +} diff --git a/src/test/unit/multiple_translation_units2.cpp b/src/test/unit/multiple_translation_units2.cpp new file mode 100644 index 00000000000..fe6bb4c27ef --- /dev/null +++ b/src/test/unit/multiple_translation_units2.cpp @@ -0,0 +1,5 @@ +#include + +stan::math::var function2() { + return 0; +} diff --git a/src/test/unit/multiple_translation_units_test.cpp b/src/test/unit/multiple_translation_units_test.cpp new file mode 100644 index 00000000000..45795f6bb76 --- /dev/null +++ b/src/test/unit/multiple_translation_units_test.cpp @@ -0,0 +1,7 @@ +#include + +TEST(multiple_translation_units, compile) { + SUCCEED() + << "this test compiling indicates that compiling the stan library " + << "with multiple translation units is ok."; +} From 99373aefa142d679598e3e0cc703eb02bd7896ef Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Sat, 22 Oct 2016 22:27:26 -0400 Subject: [PATCH 02/10] Fixing makefile so it respects the command-line option --- make/os_win | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/make/os_win b/make/os_win index eaad45ec15e..baf4b482bf9 100644 --- a/make/os_win +++ b/make/os_win @@ -14,15 +14,15 @@ ## EXE = .exe -O_STANC = 3 +O_STANC ?= 3 CFLAGS_GTEST += -DGTEST_HAS_PTHREAD=0 PATH_SEPARATOR = '\' ## Detecting Process Bitness: ## http://blogs.msdn.com/b/david.wang/archive/2006/03/26/howto-detect-process-bitness.aspx ifeq ($(PROCESSOR_ARCHITECTURE)$(PROCESSOR_ARCHITEW6432),x86) - BIT=32 + BIT?=32 else - BIT=64 + BIT?=64 endif ifeq (g++,$(CC_TYPE)) CFLAGS += -m$(BIT) From c0cbe376f7a6169d80c3648a978123519e4357d3 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Tue, 25 Oct 2016 23:55:25 -0400 Subject: [PATCH 03/10] Updating make file --- make/tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/make/tests b/make/tests index 097da960599..db5b99035e6 100644 --- a/make/tests +++ b/make/tests @@ -139,7 +139,7 @@ test/unit/multiple_translation_units%.o : src/test/unit/multiple_translation_uni test/unit/libmultiple.so : test/unit/multiple_translation_units1.o test/unit/multiple_translation_units2.o @mkdir -p $(dir $@) - $(CC) -shared $(OUTPUT_OPTION) $^ + $(CC) $(CFLAGS) -shared $(OUTPUT_OPTION) $^ src/test/unit/multiple_translation_units_test.cpp : test/unit/libmultiple.so From ee8f23e102a666f3bd1073a89fd9d8b34adf2e24 Mon Sep 17 00:00:00 2001 From: stan-buildbot Date: Thu, 15 Jun 2017 20:30:12 -0400 Subject: [PATCH 04/10] Updates the Math submodule to 8008de4. 
--- lib/stan_math | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/stan_math b/lib/stan_math index 87ec15ad842..8008de4e4d8 160000 --- a/lib/stan_math +++ b/lib/stan_math @@ -1 +1 @@ -Subproject commit 87ec15ad842fbeba2c2b6733b8839f6ea86e38f3 +Subproject commit 8008de4e4d867ae21e6a66d561588a4ebb80e866 From 286fe9567a6e9a3e1df9a457d45149a7b0686b5c Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Fri, 16 Jun 2017 21:43:40 -0400 Subject: [PATCH 05/10] Adding inline to more functions --- src/stan/command/stanc_helper.hpp | 170 +++++++++++++------------- src/stan/services/util/create_rng.hpp | 4 +- 2 files changed, 87 insertions(+), 87 deletions(-) diff --git a/src/stan/command/stanc_helper.hpp b/src/stan/command/stanc_helper.hpp index 3419ee8b2f7..1adf51700c0 100644 --- a/src/stan/command/stanc_helper.hpp +++ b/src/stan/command/stanc_helper.hpp @@ -91,7 +91,7 @@ inline void delete_file(std::ostream* err_stream, * @param[in] in_file_name the name of the input file * @return a valid C++ identifier based on the file name. */ -std::string identifier_from_file_name(const std::string& in_file_name) { +inline std::string identifier_from_file_name(const std::string& in_file_name) { size_t slashInd = in_file_name.rfind('/'); size_t ptInd = in_file_name.rfind('.'); if (ptInd == std::string::npos) @@ -107,7 +107,7 @@ std::string identifier_from_file_name(const std::string& in_file_name) { std::string result = in_file_name.substr(slashInd, ptInd - slashInd); for (std::string::iterator strIt = result.begin(); - strIt != result.end(); strIt++) { + strIt != result.end(); strIt++) { if (!isalnum(*strIt) && *strIt != '_') { *strIt = '_'; } @@ -124,12 +124,12 @@ std::string identifier_from_file_name(const std::string& in_file_name) { * @param[in] extension The extension (WITHOUT dot)- e.g. "stan". 
* @return true if the file has the extension */ -bool has_extension(const std::string& file_name, const std::string& extension) { +inline bool has_extension(const std::string& file_name, const std::string& extension) { if (file_name.length() >= extension.length() + 1) { // +1 for the dot if (0 == file_name.compare (file_name.length() - extension.length(), - extension.length(), extension) - && file_name[file_name.length() - extension.length() - 1] == '.') - return true; + extension.length(), extension) + && file_name[file_name.length() - extension.length() - 1] == '.') + return true; else return false; } else { @@ -144,18 +144,18 @@ bool has_extension(const std::string& file_name, const std::string& extension) { * @param[in] identifier_type the type of the identifier to be reported * in error messages */ -void check_identifier(const std::string& identifier, - const std::string& identifier_type) { +inline void check_identifier(const std::string& identifier, + const std::string& identifier_type) { if (!isalpha(identifier[0]) && identifier[0] != '_') { std::string msg(identifier_type + " must not start with a " "number or symbol other than _"); throw std::invalid_argument(msg); } for (std::string::const_iterator strIt = identifier.begin(); - strIt != identifier.end(); strIt++) { + strIt != identifier.end(); strIt++) { if (!isalnum(*strIt) && *strIt != '_') { std::string msg(identifier_type - + " must contain only letters, numbers and _"); + + " must contain only letters, numbers and _"); throw std::invalid_argument(msg); } } @@ -232,95 +232,95 @@ inline int stanc_helper(int argc, const char* argv[], bool valid_input = false; switch (compilation_type) { - case kModel: { - std::string model_name; - if (cmd.has_key("name")) { - cmd.val("name", model_name); - } else { - model_name = identifier_from_file_name(in_file_name) + "_model"; - } - - // TODO(martincerny) Check that the -namespace flag is not set + case kModel: { + std::string model_name; + if (cmd.has_key("name")) { + cmd.val("name", model_name); + } else { + model_name = identifier_from_file_name(in_file_name) + "_model"; + } - if (cmd.has_key("o")) { - cmd.val("o", out_file_name); - } else { - out_file_name = model_name; - // TODO(carpenter): shouldn't this be .hpp without a main()? - out_file_name += ".cpp"; - } + // TODO(martincerny) Check that the -namespace flag is not set - check_identifier(model_name, "model_name"); + if (cmd.has_key("o")) { + cmd.val("o", out_file_name); + } else { + out_file_name = model_name; + // TODO(carpenter): shouldn't this be .hpp without a main()? 
+ out_file_name += ".cpp"; + } - std::fstream out(out_file_name.c_str(), std::fstream::out); - if (out_stream) { - *out_stream << "Model name=" << model_name << std::endl; - *out_stream << "Input file=" << in_file_name << std::endl; - *out_stream << "Output file=" << out_file_name << std::endl; - } - if (!out.is_open()) { - std::stringstream msg; - msg << "Failed to open output file " - << out_file_name.c_str(); - throw std::invalid_argument(msg.str()); - } + check_identifier(model_name, "model_name"); - valid_input = stan::lang::compile(err_stream, in, out, model_name, - allow_undefined, in_file_name, include_paths); - out.close(); - break; + std::fstream out(out_file_name.c_str(), std::fstream::out); + if (out_stream) { + *out_stream << "Model name=" << model_name << std::endl; + *out_stream << "Input file=" << in_file_name << std::endl; + *out_stream << "Output file=" << out_file_name << std::endl; + } + if (!out.is_open()) { + std::stringstream msg; + msg << "Failed to open output file " + << out_file_name.c_str(); + throw std::invalid_argument(msg.str()); } - case kStandaloneFunctions: { - if (cmd.has_key("o")) { - cmd.val("o", out_file_name); - } else { - out_file_name = identifier_from_file_name(in_file_name); - out_file_name += ".hpp"; - } - // TODO(martincerny) Allow multiple namespaces - // (split namespace argument by "::") - std::vector namespaces; - if (cmd.has_key("namespace")) { - std::string ns; - cmd.val("namespace", ns); - namespaces.push_back(ns); - } else { - namespaces.push_back( - identifier_from_file_name(in_file_name) + "_functions"); - } + valid_input = stan::lang::compile(err_stream, in, out, model_name, + allow_undefined, in_file_name, include_paths); + out.close(); + break; + } + case kStandaloneFunctions: { + if (cmd.has_key("o")) { + cmd.val("o", out_file_name); + } else { + out_file_name = identifier_from_file_name(in_file_name); + out_file_name += ".hpp"; + } - // TODO(martincerny) Check that the -name flag is not set + // TODO(martincerny) Allow multiple namespaces + // (split namespace argument by "::") + std::vector namespaces; + if (cmd.has_key("namespace")) { + std::string ns; + cmd.val("namespace", ns); + namespaces.push_back(ns); + } else { + namespaces.push_back( + identifier_from_file_name(in_file_name) + "_functions"); + } - for (size_t namespace_i = 0; - namespace_i < namespaces.size(); ++namespace_i) { - check_identifier(namespaces[namespace_i], "namespace"); - } + // TODO(martincerny) Check that the -name flag is not set - std::fstream out(out_file_name.c_str(), std::fstream::out); - if (out_stream) { - *out_stream << "Parsing a fuctions-only file" << std::endl; + for (size_t namespace_i = 0; + namespace_i < namespaces.size(); ++namespace_i) { + check_identifier(namespaces[namespace_i], "namespace"); + } - *out_stream << "Target namespace= "; - for (size_t namespace_i = 0; - namespace_i < namespaces.size(); ++namespace_i) { - *out_stream << "::" << namespaces[namespace_i]; - } - *out_stream << std::endl; + std::fstream out(out_file_name.c_str(), std::fstream::out); + if (out_stream) { + *out_stream << "Parsing a fuctions-only file" << std::endl; - *out_stream << "Input file=" << in_file_name << std::endl; - *out_stream << "Output file=" << out_file_name << std::endl; + *out_stream << "Target namespace= "; + for (size_t namespace_i = 0; + namespace_i < namespaces.size(); ++namespace_i) { + *out_stream << "::" << namespaces[namespace_i]; } + *out_stream << std::endl; - valid_input = stan::lang::compile_functions(err_stream, in, out, - namespaces, 
allow_undefined); - - out.close(); - break; - } - default: { - assert(false); + *out_stream << "Input file=" << in_file_name << std::endl; + *out_stream << "Output file=" << out_file_name << std::endl; } + + valid_input = stan::lang::compile_functions(err_stream, in, out, + namespaces, allow_undefined); + + out.close(); + break; + } + default: { + assert(false); + } } if (!valid_input) { diff --git a/src/stan/services/util/create_rng.hpp b/src/stan/services/util/create_rng.hpp index 071327d6a6e..d32df0ee483 100644 --- a/src/stan/services/util/create_rng.hpp +++ b/src/stan/services/util/create_rng.hpp @@ -22,8 +22,8 @@ namespace stan { * @param[in] chain the chain id * @return a boost::ecuyer1988 instance */ - boost::ecuyer1988 create_rng(unsigned int seed, - unsigned int chain) { + inline boost::ecuyer1988 create_rng(unsigned int seed, + unsigned int chain) { using boost::uintmax_t; static uintmax_t DISCARD_STRIDE = static_cast(1) << 50; boost::ecuyer1988 rng(seed); From 89977689b71a65ff5f6eca6834f9f9df4b69960b Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Fri, 16 Jun 2017 23:18:09 -0400 Subject: [PATCH 06/10] Reverting makefile change for windows --- make/os_win | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/make/os_win b/make/os_win index baf4b482bf9..b207a29fc7a 100644 --- a/make/os_win +++ b/make/os_win @@ -5,7 +5,7 @@ # - CFLAGS_GTEST # - OPT # - EXE -# +# # Additionally, set the following variables to build # demo/lang using the Microsoft Visual Studio compiler. # - CC_STANC @@ -14,7 +14,7 @@ ## EXE = .exe -O_STANC ?= 3 +O_STANC = 3 CFLAGS_GTEST += -DGTEST_HAS_PTHREAD=0 PATH_SEPARATOR = '\' ## Detecting Process Bitness: @@ -40,4 +40,3 @@ ifeq (mingw32-g++,$(CC_TYPE)) endif ifeq (clang++,$(CC_TYPE)) endif - From f09a470d1d613068b976e3a81c0ecf50b945de2d Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Sat, 17 Jun 2017 01:32:25 -0400 Subject: [PATCH 07/10] cpplint --- src/stan/command/stanc_helper.hpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/stan/command/stanc_helper.hpp b/src/stan/command/stanc_helper.hpp index 1adf51700c0..6ad8d483f6d 100644 --- a/src/stan/command/stanc_helper.hpp +++ b/src/stan/command/stanc_helper.hpp @@ -124,7 +124,8 @@ inline std::string identifier_from_file_name(const std::string& in_file_name) { * @param[in] extension The extension (WITHOUT dot)- e.g. "stan". 
* @return true if the file has the extension */ -inline bool has_extension(const std::string& file_name, const std::string& extension) { +inline bool has_extension(const std::string& file_name, + const std::string& extension) { if (file_name.length() >= extension.length() + 1) { // +1 for the dot if (0 == file_name.compare (file_name.length() - extension.length(), extension.length(), extension) @@ -266,7 +267,8 @@ inline int stanc_helper(int argc, const char* argv[], } valid_input = stan::lang::compile(err_stream, in, out, model_name, - allow_undefined, in_file_name, include_paths); + allow_undefined, in_file_name, + include_paths); out.close(); break; } @@ -286,8 +288,8 @@ inline int stanc_helper(int argc, const char* argv[], cmd.val("namespace", ns); namespaces.push_back(ns); } else { - namespaces.push_back( - identifier_from_file_name(in_file_name) + "_functions"); + namespaces.push_back(identifier_from_file_name(in_file_name) + + "_functions"); } // TODO(martincerny) Check that the -name flag is not set From 1197f40f2c8836d2b8fdb3c903f980d205c476dc Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Sat, 17 Jun 2017 01:44:23 -0400 Subject: [PATCH 08/10] Fixes #2332. Updates performance test. --- src/test/performance/logistic_test.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/test/performance/logistic_test.cpp b/src/test/performance/logistic_test.cpp index 2c3710f5dca..101dbef8045 100644 --- a/src/test/performance/logistic_test.cpp +++ b/src/test/performance/logistic_test.cpp @@ -105,19 +105,19 @@ TEST_F(performance, run) { TEST_F(performance, values_from_tagged_version) { int N_values = 9; ASSERT_EQ(N_values, last_draws_per_run[0].size()) - << "last tagged version, 2.9.0, had " << N_values << " elements"; + << "last tagged version, 2.16.0, had " << N_values << " elements"; std::vector first_run = last_draws_per_run[0]; - EXPECT_FLOAT_EQ(-65.319603, first_run[0]) + EXPECT_FLOAT_EQ(-65.508499, first_run[0]) << "lp__: index 0"; - EXPECT_FLOAT_EQ(0.96712297, first_run[1]) + EXPECT_FLOAT_EQ(0.78501701, first_run[1]) << "accept_stat__: index 1"; - EXPECT_FLOAT_EQ(1.06545, first_run[2]) + EXPECT_FLOAT_EQ(0.76857001, first_run[2]) << "stepsize__: index 2"; - EXPECT_FLOAT_EQ(2, first_run[3]) + EXPECT_FLOAT_EQ(1, first_run[3]) << "treedepth__: index 3"; EXPECT_FLOAT_EQ(3, first_run[4]) @@ -126,13 +126,13 @@ TEST_F(performance, values_from_tagged_version) { EXPECT_FLOAT_EQ(0, first_run[5]) << "divergent__: index 5"; - EXPECT_FLOAT_EQ(66.090698, first_run[6]) + EXPECT_FLOAT_EQ(68.208702, first_run[6]) << "energy__: index 6"; - EXPECT_FLOAT_EQ(1.4068201, first_run[7]) + EXPECT_FLOAT_EQ(1.4087501, first_run[7]) << "beta.1: index 7"; - EXPECT_FLOAT_EQ(-0.59367198, first_run[8]) + EXPECT_FLOAT_EQ(-0.69222099, first_run[8]) << "beta.2: index 8"; matches_tagged_version = !HasNonfatalFailure(); From 26a7c84e19fd0a38761765b7533bed5cd10702b3 Mon Sep 17 00:00:00 2001 From: Bob Carpenter Date: Sat, 17 Jun 2017 23:54:41 -0400 Subject: [PATCH 09/10] manual for 2.16, fixes #2265 --- src/docs/stan-reference/acknowledgements.tex | 33 +- src/docs/stan-reference/distributions.tex | 70 +- src/docs/stan-reference/examples.tex | 686 +++++++++---------- src/docs/stan-reference/language.tex | 136 ++-- src/docs/stan-reference/programming.tex | 469 +++++++------ src/docs/stan-reference/stan-reference.tex | 2 +- src/docs/stan-reference/title.tex | 21 +- 7 files changed, 724 insertions(+), 693 deletions(-) diff --git a/src/docs/stan-reference/acknowledgements.tex 
b/src/docs/stan-reference/acknowledgements.tex index 1023640d60d..b33b96e5fdc 100644 --- a/src/docs/stan-reference/acknowledgements.tex +++ b/src/docs/stan-reference/acknowledgements.tex @@ -15,7 +15,7 @@ \section*{Grants and Corporate Support} \subsection*{Current Grants} \begin{itemize} -\item +\item U.~S.\ Department of Education Institute of Education Sciences \begin{itemize}\small \item Statistical and Research Methodology: Solving @@ -39,13 +39,13 @@ \subsection*{Previous Grants} % \begin{itemize} \item -U.~S.\ Department of Energy +U.~S.\ Department of Energy \begin{itemize}\small \item DE-SC0002099: Petascale Computing \end{itemize} % \item -U.~S.\ National Science Foundation +U.~S.\ National Science Foundation \begin{itemize}\small \item ATM-0934516: Reconstructing Climate from Tree Ring Data @@ -54,7 +54,7 @@ \subsection*{Previous Grants} \end{itemize} % \item -U.~S.\ Department of Education Institute of Education Sciences +U.~S.\ Department of Education Institute of Education Sciences \begin{itemize}\small \item ED-GRANTS-032309-005: Practical Tools for Multilevel Hierarchical Modeling in Education @@ -92,12 +92,12 @@ \subsection*{Code and Doc Patches} to: Ethan Adams, Avraham Adler, -Jarret Barber, +Jarret Barber, David R.~Blair, -Miguel de Val-Borro, -Ross Boylan, -Eric N.~Brown, -Devin Caughey, +Miguel de Val-Borro, +Ross Boylan, +Eric N.~Brown, +Devin Caughey, Emmanuel Charpentier, Daniel Chen, Jacob Egner, @@ -116,6 +116,7 @@ \subsection*{Code and Doc Patches} Dan Lakeland, Devin Leopold, Nathanael I.~Lichti, +Jussi M\"a\"att\"a, Titus van der Malsburg, P.~D.~Metcalfe, Kyle Meyer, @@ -155,8 +156,9 @@ \subsection*{Code and Doc Patches} Alex Chase, Daniel Chen, Roman Cheplyaka, -Andy Choi, +Andy Choi, David Chudzicki, +Michael Clerx, Andria Dawson, daydreamt (GitHub handle), Conner DiPaolo, @@ -190,15 +192,18 @@ \subsection*{Code and Doc Patches} Tobias Madsen, Stefano Mangiola, David Manheim, +Stephen Martin, Sean Matthews, David Mawdsley, Dieter Menne, Evelyn Mitchell, +Javier Moreno, Robert Myles,xs Sunil Nandihalli, Eric Novik, Julia Palacios, -Tamas Papp, +Tamas Papp, +Anders Gorm Pedersen, Tomi Peltola, Andre Pfeuffer, Sergio Polini, @@ -224,6 +229,7 @@ \subsection*{Code and Doc Patches} Dougal Sutherland, John Sutton, Maciej Swat, +J.~Takoua, Andrew J.~Tanentzap, Shravan Vashisth, Aki Vehtari, @@ -234,8 +240,9 @@ \subsection*{Code and Doc Patches} Sam Weiss, Luke Wiklendt, wrobell (GitHub handle), -Howard Zail, and -Jon Zelner. +Howard Zail, +Jon Zelner, and +Xiubo Zhang Thanks to Kevin van Horn for install instructions for Cygwin and to Kyle Foreman for instructions on using the MKL compiler. diff --git a/src/docs/stan-reference/distributions.tex b/src/docs/stan-reference/distributions.tex index f144db5b120..d725717a94f 100644 --- a/src/docs/stan-reference/distributions.tex +++ b/src/docs/stan-reference/distributions.tex @@ -4,7 +4,7 @@ \chapter{Conventions for Probability Functions} Functions associated with distributions are set up to follow the same naming conventions for both built-in distributions and for user-defined -distributions. +distributions. 
\section{Suffix Marks Type of Function} @@ -18,7 +18,7 @@ \section{Suffix Marks Type of Function} log probability density function & continuous & \code{\_lpdf} \\ \hline log cumulative distribution function & any & \code{\_lcdf} \\ -log complementary cumulative distribution function & any & \code{\_lccdf} \\ +log complementary cumulative distribution function & any & \code{\_lccdf} \\ \hline random number generator & any & \code{\_rng} \end{tabular} @@ -114,14 +114,14 @@ \section{Cumulative Distribution Functions} | \, \theta)$, the cumulative distribution function (CDF) $F_Y$ is defined by \[ -F_Y(y) +F_Y(y) \ = \ -\mbox{Pr}[Y < y] +\mbox{Pr}[Y < y] \ = \ \int_{-\infty}^y p(y \, | \, \theta) \ \mathrm{d}\theta. \] The complementary cumulative distribution function (CCDF) is defined -as +as \[ \mbox{Pr}[Y \geq y] \ = \ @@ -134,7 +134,7 @@ \section{Cumulative Distribution Functions} In Stan, there is a cumulative distribution function for each probability function. For instance, \code{normal\_cdf(y,~mu,~sigma)} -is defined by +is defined by % \[ \int_0^y \distro{Normal}(y \, | \, \mu, \sigma) \ \mathrm{d}y. @@ -182,7 +182,7 @@ \section{Vectorization}\label{vectorization.section} covariance matrix is only done once. Stan also overloads some scalar functions, such as \code{log} and -\code{exp}, to apply to vectors (arrays) and return vectors (arrays). +\code{exp}, to apply to vectors (arrays) and return vectors (arrays). These vectorizations are defined elementwise and unlike the probability functions, provide only minimal efficiency speedups over repeated application and assignment in a loop. @@ -205,7 +205,7 @@ \subsubsection{Vectorized Scalar Arguments} argument, their types can be anything but their size must match. For instance, it is legal to use \code{normal\_lpdf(row\_vector~|~vector,~real)} as long as the vector and -row vector have the same size. +row vector have the same size. \subsubsection{Vectorized Vector and Row Vector Arguments} @@ -447,7 +447,7 @@ \subsubsection{Probability Mass Function} \begin{eqnarray*} \distro{BinomialLogit}(n|N,\alpha) -& = & \distro{BinomialLogit}(n|N,\mbox{logit}^{-1}(\alpha)) +& = & \distro{Binomial}(n|N,\mbox{logit}^{-1}(\alpha)) \\[6pt] & = & \binom{N}{n} \left( \mbox{logit}^{-1}(\alpha) \right)^{n} \left( 1 - \mbox{logit}^{-1}(\alpha) \right)^{N - n}. 
@@ -1005,7 +1005,7 @@ \subsubsection{Stan Functions} \code{N = sum(\farg{y})}} % \fitem{int[]}{multinomial\_rng}{vector \farg{theta}, int \farg{N}}{Generate a - multinomial variate with simplex distribution parameter \farg{theta} and + multinomial variate with simplex distribution parameter \farg{theta} and total count \farg{N}; may only be used in generated quantities block} \end{description} @@ -1072,7 +1072,7 @@ \subsubsection{Stan Functions} \end{description} % \begin{description} -\fitem{real}{normal\_rng}{real \farg{mu} \textbar\ real \farg{sigma}}{Generate a normal +\fitem{real}{normal\_rng}{real \farg{mu}, real \farg{sigma}}{Generate a normal variate with location \farg{mu} and scale \farg{sigma}; may only be used in generated quantities block} \end{description} @@ -1130,51 +1130,51 @@ \section{Skew Normal Distribution} \subsubsection{Probability Density Function} -If $\mu \in \reals$, $\sigma \in \posreals$, and $\alpha \in \reals$, +If $\xi \in \reals$, $\omega \in \posreals$, and $\alpha \in \reals$, then for $y \in \reals$, \[ -\distro{SkewNormal}(y|\mu,\sigma,\alpha) +\distro{SkewNormal}(y \mid \xi, \omega, \alpha) = -\frac{1}{\sigma\sqrt{2\pi}} +\frac{1}{\omega\sqrt{2\pi}} \ \exp\left( - \, \frac{1}{2} - \left( \frac{y - \mu}{\sigma} \right)^2 + \left( \frac{y - \xi}{\omega} \right)^2 \right) \ -\left(1 + \erf\left( \alpha\left(\frac{y - \mu}{\sigma\sqrt{2}}\right)\right)\right) +\left(1 + \erf\left( \alpha\left(\frac{y - \xi}{\omega\sqrt{2}}\right)\right)\right) . \] -\pitem{y}{skew\_normal}{mu, sigma, alpha} +\pitem{y}{skew\_normal}{xi, omega, alpha} \subsubsection{Stan Functions} \begin{description} - \fitem{real}{skew\_normal\_lpdf}{reals \farg{y} \textbar\ reals \farg{mu}, reals - \farg{sigma}, reals \farg{alpha}}{The log of the skew normal density - of \farg{y} given location \farg{mu}, scale \farg{sigma}, and + \fitem{real}{skew\_normal\_lpdf}{reals \farg{y} \textbar\ reals \farg{xi}, reals + \farg{omega}, reals \farg{alpha}}{The log of the skew normal density + of \farg{y} given location \farg{xi}, scale \farg{omega}, and shape \farg{alpha}} % -\fitem{real}{skew\_normal\_cdf}{reals \farg{y}, reals \farg{mu}, reals - \farg{sigma}, reals \farg{alpha}}{The skew normal - distribution function of \farg{y} given location \farg{mu}, scale - \farg{sigma}, and shape \farg{alpha}} +\fitem{real}{skew\_normal\_cdf}{reals \farg{y}, reals \farg{xi}, reals + \farg{omega}, reals \farg{alpha}}{The skew normal + distribution function of \farg{y} given location \farg{xi}, scale + \farg{omega}, and shape \farg{alpha}} % -\fitemtwolines{real}{skew\_normal\_lcdf}{reals \farg{y} \textbar\ reals \farg{mu}, reals - \farg{sigma}}{reals \farg{alpha}}{The log of the skew normal cumulative - distribution function of \farg{y} given location \farg{mu}, scale - \farg{sigma}, and shape \farg{alpha}} +\fitemtwolines{real}{skew\_normal\_lcdf}{reals \farg{y} \textbar\ reals \farg{xi}, reals + \farg{omega}}{reals \farg{alpha}}{The log of the skew normal cumulative + distribution function of \farg{y} given location \farg{xi}, scale + \farg{omega}, and shape \farg{alpha}} % -\fitemtwolines{real}{skew\_normal\_lccdf}{reals \farg{y} \textbar\ reals \farg{mu}, reals - \farg{sigma}}{reals \farg{alpha}}{The log of the skew normal complementary - cumulative distribution function of \farg{y} given location \farg{mu}, scale - \farg{sigma}, and shape \farg{alpha}} +\fitemtwolines{real}{skew\_normal\_lccdf}{reals \farg{y} \textbar\ reals \farg{xi}, reals + \farg{omega}}{reals \farg{alpha}}{The log of the skew normal 
complementary + cumulative distribution function of \farg{y} given location \farg{xi}, scale + \farg{omega}, and shape \farg{alpha}} \end{description} % \begin{description} -\fitem{real}{skew\_normal\_rng}{real \farg{mu}, real \farg{sigma}, +\fitem{real}{skew\_normal\_rng}{real \farg{xi}, real \farg{omega}, real \farg{alpha}}{Generate a skew normal variate with location - \farg{mu}, scale \farg{sigma}, and shape \farg{alpha}; may only be used in + \farg{xi}, scale \farg{omega}, and shape \farg{alpha}; may only be used in generated quantities block} \end{description} @@ -1297,7 +1297,7 @@ \subsubsection{Probability Density Function} terms of inverse scale. The double-exponential distribution can be defined as a compound -exponential-normal distribution. Specifically, if +exponential-normal distribution. Specifically, if \[ \alpha \sim \mathsf{Exponential}\left( \frac{1}{\lambda} \right) \] diff --git a/src/docs/stan-reference/examples.tex b/src/docs/stan-reference/examples.tex index ab7f9952fbf..45301e8c07c 100644 --- a/src/docs/stan-reference/examples.tex +++ b/src/docs/stan-reference/examples.tex @@ -4,7 +4,7 @@ \chapter{Regression Models} \noindent Stan supports regression models from simple linear regressions to -multilevel generalized linear models. +multilevel generalized linear models. \section{Linear Regression} @@ -15,7 +15,7 @@ \section{Linear Regression} % \[ y_n = \alpha + \beta x_n + \epsilon_n -\ \ \ \mbox{where} \ \ \ +\ \ \ \mbox{where} \ \ \ \epsilon_n \sim \distro{Normal}(0,\sigma). \] This is equivalent to the following sampling involving the @@ -122,7 +122,7 @@ \subsection{Matrix Notation and Vectorization} predictions line up with the outcomes in the $N$-vector \code{y}, so the entire model may be written using matrix arithmetic as shown. It would be possible to include a column of 1 values in \code{x} and -remove the \code{alpha} parameter. +remove the \code{alpha} parameter. The sampling statement in the model above is just a more efficient, vector-based approach to coding the model with a loop, as in the @@ -136,7 +136,7 @@ \subsection{Matrix Notation and Vectorization} \end{stancode} % With Stan's matrix indexing scheme, \code{x[n]} picks out row \code{n} -of the matrix \code{x}; because \code{beta} is a column vector, +of the matrix \code{x}; because \code{beta} is a column vector, the product \code{x[n] * beta} is a scalar of type \code{real}. \subsubsection{Intercepts as Inputs} @@ -160,18 +160,18 @@ \subsubsection{Intercepts as Inputs} \section{The QR Reparameterization}\label{QR-reparameterization.section} In the previous example, the linear predictor can be written as -$\eta = x \beta$, where $\eta$ is a $N$-vector of predictions, +$\eta = x \beta$, where $\eta$ is a $N$-vector of predictions, $x$ is a $N \times K$ matrix, and $\beta$ is a $K$-vector of coefficients. Presuming $N \geq K$, we can exploit the fact that any design matrix, $x$ can be decomposed using the thin QR decomposition into an orthogonal matrix $Q$ and an upper-triangular matrix $R$, i.e. $x = Q R$. See \ref{QR-decomposition} -for more information on the QR decomposition but note that \code{qr\_Q} and +for more information on the QR decomposition but note that \code{qr\_Q} and \code{qr\_R} implement the fat QR decomposition so here we thin it by including only $K$ columns in $Q$ and $K$ rows in $R$. Also, in practice, it is best to -write $x = Q^\ast R^\ast$ where $Q^\ast = Q \times \sqrt{n - 1}$ and -$R^\ast = \frac{1}{\sqrt{n - 1}} R$. 
Thus, we can equivalently write -$\eta = x \beta = Q R \beta = Q^\ast R^\ast \beta$. If we let -$\theta = R^\ast \beta$, then we have $\eta = Q^\ast \theta$ and +write $x = Q^\ast R^\ast$ where $Q^\ast = Q \times \sqrt{n - 1}$ and +$R^\ast = \frac{1}{\sqrt{n - 1}} R$. Thus, we can equivalently write +$\eta = x \beta = Q R \beta = Q^\ast R^\ast \beta$. If we let +$\theta = R^\ast \beta$, then we have $\eta = Q^\ast \theta$ and $\beta = R^{\ast^{-1}} \theta$. In that case, the previous Stan program becomes % \begin{stancode} @@ -242,11 +242,11 @@ \section{Priors for Coefficients and Scales}\label{regression-priors.section} \refsection{hierarchical-priors} and multivariate parameters in \refsection{multivariate-hierarchical-priors}. There is also a discussion of priors used to identify models in -\refsection{priors-for-identification}. +\refsection{priors-for-identification}. -However, as described in \refsection{QR-reparameterization}, if you do -not have an informative prior on the \emph{location} of the regression -coefficients, then you are better off reparameterizing your model so +However, as described in \refsection{QR-reparameterization}, if you do +not have an informative prior on the \emph{location} of the regression +coefficients, then you are better off reparameterizing your model so that the regression coefficients are a generated quantity. In that case, it usually does not matter very much what prior is used on on the reparameterized regression coefficients and almost any weakly informative @@ -305,7 +305,7 @@ \subsection{Proper Uniform Priors: Interval Constraints} \end{stancode} % This will implicitly give \code{sigma} a $\distro{Uniform}(0.1, 2.7)$ -prior. +prior. \subsubsection{Matching Support to Constraints} @@ -320,7 +320,7 @@ \subsubsection{Matching Support to Constraints} ... model { // *** bad *** : support narrower than constraint - sigma ~ uniform(0.1, 2.7); + sigma ~ uniform(0.1, 2.7); \end{stancode} % The sampling statement imposes a limited support for \code{sigma} in @@ -346,7 +346,7 @@ \subsection{``Uninformative'' Proper Priors} % If the prior scale, such as 1000, is several orders of magnitude larger than the estimated coefficients, then such a prior is -effectively providing no effect whatsoever. +effectively providing no effect whatsoever. We actively discourage users from using the default scale priors suggested through the BUGS examples \citep{LunnEtAl:2012}, such as @@ -379,7 +379,7 @@ \subsection{Truncated Priors} gives \code{sigma} a half-normal prior, technically % \[ -p(\sigma) +p(\sigma) \ = \ \frac{\distro{Normal}(\sigma | 0, 1000)} {1 - \distro{NormalCDF}(0 | 0, 1000)} @@ -427,7 +427,7 @@ \subsection{Bounded Priors} proper prior is to impose a uniform prior on a bounded scale. For example, we could declare the parameter for mean women's height to have a lower bound of one meter and an upper bound of three meters. -Surely the answer has to lie in that range. +Surely the answer has to lie in that range. Similarly, it is not uncommon to see priors for scale parameters that impose lower bounds of zero and upper bounds of very large numbers, @@ -461,7 +461,7 @@ \subsection{Fat-Tailed Priors and ``Default'' Priors} expected to be, but still leaves a lot of mass in its tails. The usual choice in such a situation is to use a Cauchy distribution for a prior, which can concentrate its mass around its median, but has tails -that are so fat that the variance is infinite. +that are so fat that the variance is infinite. 
Without specific information, the Cauchy prior is a very good default parameter choice for regression coefficients @@ -545,11 +545,11 @@ \section{Logistic and Probit Regression}\label{logistic-probit-regression.sectio } model { y ~ bernoulli_logit(alpha + beta * x); -} +} \end{stancode} % The noise parameter is built into the Bernoulli formulation here -rather than specified directly. +rather than specified directly. Logistic regression is a kind of generalized linear model with binary outcomes and the log odds (logit) link function, defined by @@ -558,7 +558,7 @@ \section{Logistic and Probit Regression}\label{logistic-probit-regression.sectio \mbox{logit}(v) = \log \left( \frac{v}{1-v} \right). \] % -The inverse of the link function appears in the model. +The inverse of the link function appears in the model. % \[ \mbox{logit}^{-1}(u) = \frac{1}{1 + \exp(-u)}. @@ -566,10 +566,10 @@ \section{Logistic and Probit Regression}\label{logistic-probit-regression.sectio % The model formulation above uses the logit-parameterized version of -the Bernoulli distribution, which is defined by +the Bernoulli distribution, which is defined by % \[ -\distro{BernoulliLogit}(y|\alpha) +\distro{BernoulliLogit}(y|\alpha) = \distro{Bernoulli}(y | \mbox{logit}^{-1}(\alpha)). \] @@ -596,7 +596,7 @@ \section{Logistic and Probit Regression}\label{logistic-probit-regression.sectio Other link functions may be used in the same way. For example, probit regression uses the cumulative normal distribution function, which is -typically written as +typically written as \[ \Phi(x) = \int_{-\infty}^x \distro{Normal}(y|0,1) \, dy. \] @@ -611,8 +611,8 @@ \section{Logistic and Probit Regression}\label{logistic-probit-regression.sectio y[n] ~ bernoulli(Phi(alpha + beta * x[n])); \end{stancode} % -A fast approximation to the cumulative unit normal distribution function -$\Phi$ is implemented in Stan as the function \code{Phi\_approx}. The +A fast approximation to the cumulative unit normal distribution function +$\Phi$ is implemented in Stan as the function \code{Phi\_approx}. The approximate probit regression model may be coded with the following. % @@ -661,7 +661,7 @@ \section{Multi-Logit Regression}\label{multi-logit.section} \code{categorical\_logit}). The first loop may be made more efficient by vectorizing the first -loop by converting the matrix \code{beta} to a vector, +loop by converting the matrix \code{beta} to a vector, % \begin{stancode} to_vector(beta) ~ normal(0, 5); @@ -711,7 +711,7 @@ \subsection{Identifiability} suitable prior on the coefficients. An alternative is to use $(K-1)$-vectors by fixing one of them to be -zero. \refsection{partially-known-parameters} discusses how to mix +zero. \refsection{partially-known-parameters} discusses how to mix constants and parameters in a vector. In the multi-logit case, the parameter block would be redefined to use $(K-1)$-vectors % @@ -824,7 +824,7 @@ \subsection{Translated and Scaled Simplex} With this parameterization, a Dirichlet prior can be placed on \code{beta\_raw}, perhaps uniform, and another prior put on -\code{beta\_scale}, typically for ``shrinkage.'' +\code{beta\_scale}, typically for ``shrinkage.'' \subsection{Soft Centering} @@ -847,7 +847,7 @@ \section{Ordered Logistic and Probit Regression}\label{ordered-logistic.section} $k$ if the linear predictor $x_n \beta$ falls between $c_{k-1}$ and $c_k$, assuming $c_0 = -\infty$ and $c_K = \infty$. The noise term is fixed by the form of regression, with examples for ordered logistic -and ordered probit models. 
+and ordered probit models. \subsection{Ordered Logistic Regression} @@ -863,19 +863,19 @@ \subsection{Ordered Logistic Regression} int D; int y[N]; row_vector[D] x[N]; -} +} parameters { vector[D] beta; ordered[K-1] c; -} +} model { for (n in 1:N) y[n] ~ ordered_logistic(x[n] * beta, c); } \end{stancode} -% +% The vector of cutpoints \code{c} is declared as \code{ordered[K-1]}, -which guarantees that \code{c[k]} is less than \code{c[k+1]}. +which guarantees that \code{c[k]} is less than \code{c[k+1]}. If the cutpoints were assigned independent priors, the constraint effectively truncates the joint prior to support over points that @@ -974,7 +974,7 @@ \section{Hierarchical Logistic Regression} for (n in 1:N) y[n] ~ bernoulli(inv_logit(x[n] * beta[ll[n]])); } -\end{stancode} +\end{stancode} % The standard deviation parameter \code{sigma} gets an implicit uniform prior on $(0,\infty)$ because of its declaration with a lower-bound @@ -1017,7 +1017,7 @@ \subsubsection{Optimizing the Model} \begin{stancode} for (n in 1:N) - y[n] ~ bernoulli_logit(x[n] * beta[ll[n]]); + y[n] ~ bernoulli_logit(x[n] * beta[ll[n]]); \end{stancode} % See \refsection{bernoulli-logit-distribution} for a definition of @@ -1042,7 +1042,7 @@ \subsubsection{Optimizing the Model} % The brackets introduce a new scope for the local variable \code{x\_beta\_ll}; alternatively, the variable may be declared at the -top of the model block. +top of the model block. In some cases, such as the above, the local variable assignment leads to models that are less readable. The recommended practice in such @@ -1149,7 +1149,7 @@ \subsection{1PL (Rasch) Model} % \begin{stancode} -parameters { +parameters { real delta; // mean student ability real alpha[J]; // ability of student j - mean ability real beta[K]; // difficulty of question k @@ -1196,7 +1196,7 @@ \subsection{1PL (Rasch) Model} $\beta_k$ results in the same predictions. The use of priors for $\alpha$ and $\beta$ located at 0 identifies the parameters; see \citep{GelmanHill:2007} for a discussion of identifiability issues and -alternative approaches to identification. +alternative approaches to identification. For testing purposes, the IRT 1PL model distributed with Stan uses informative priors that match the actual data generation process used @@ -1220,12 +1220,12 @@ \subsection{Multilevel 2PL Model} % \begin{stancode} -parameters { +parameters { real mu_beta; // mean student ability real alpha[J]; // ability for j - mean real beta[K]; // difficulty for k real gamma[K]; // discrimination of k - real sigma_beta; // scale of difficulties + real sigma_beta; // scale of difficulties real sigma_gamma; // scale of log discrimination } \end{stancode} @@ -1235,14 +1235,14 @@ \subsection{Multilevel 2PL Model} \begin{stancode} model { alpha ~ normal(0, 1); - beta ~ normal(0, sigma_beta); + beta ~ normal(0, sigma_beta); gamma ~ lognormal(0, sigma_gamma); mu_beta ~ cauchy(0, 5); sigma_alpha ~ cauchy(0, 5); sigma_beta ~ cauchy(0, 5); sigma_gamma ~ cauchy(0, 5); for (n in 1:N) - y[n] ~ bernoulli_logit(gamma[kk[n]] + y[n] ~ bernoulli_logit(gamma[kk[n]] * (alpha[jj[n]] - (beta[kk[n]] + mu_beta))); } \end{stancode} @@ -1271,7 +1271,7 @@ \subsection{Multilevel 2PL Model} \end{stancode} % The point is that the \code{alpha} determines the scale and location -and \code{beta} and \code{gamma} are allowed to float. +and \code{beta} and \code{gamma} are allowed to float. 
The \code{beta} parameter is here given a non-centered parameterization, with parameter \code{mu\_beta} serving as the mean @@ -1289,7 +1289,7 @@ \subsection{Multilevel 2PL Model} % Non-centered parameterizations tend to be more efficient in hierarchical models; see \refsection{reparameterization} for more -information on non-centered reparameterizations. +information on non-centered reparameterizations. The intercept term \code{mu\_beta} can't itself be modeled hierarchically, so it is given a weakly informative @@ -1344,7 +1344,7 @@ \subsection{Separability} likelihood estimate for the coefficient for that predictor diverges to infinity. This divergence can be controlled by providing a prior for the coefficient, which will ``shrink'' the estimate back toward zero -and thus identify the model in the posterior. +and thus identify the model in the posterior. Similar problems arise for sampling with improper flat priors. The sampler will try to draw very large values. By providing a prior, @@ -1358,7 +1358,7 @@ \section{Multivariate Priors for Hierarchical Models}\label{multivariate-hierarc In hierarchical regression models (and other situations), several individual-level variables may be assigned hierarchical priors. For example, a model with multiple varying intercepts and slopes within -might assign them a multivariate prior. +might assign them a multivariate prior. As an example, the individuals might be people and the outcome income, with predictors such as education level and age, and the groups might be states @@ -1375,7 +1375,7 @@ \subsection{Multivariate Regression Example} assume that $x_{n,1} = 1$ is a fixed ``intercept'' predictor. To encode group membership, they assume individual $n$ belongs to group $jj[n] \in 1{:}J$. Each individual $n$ also has an observed outcome -$y_n$ taking on real values. +$y_n$ taking on real values. \subsubsection{Likelihood} @@ -1384,7 +1384,7 @@ \subsubsection{Likelihood} group $j$. The likelihood function for individual $n$ is then just % \[ -y_n \sim \distro{Normal}(x_n \, \beta_{jj[n]}, \, \sigma) +y_n \sim \distro{Normal}(x_n \, \beta_{jj[n]}, \, \sigma) \mbox{ for } n \in 1{:}N. \] % @@ -1416,18 +1416,18 @@ \subsubsection{Hyperpriors} \] Of course, if more is known about the expected coefficient values $\beta_{j, k}$, this information can be incorporated into the prior for -$\mu_k$. +$\mu_k$. For the prior on the covariance matrix, Gelman and Hill suggest using a scaled inverse Wishart. That choice was motivated primarily by convenience as it is conjugate to the multivariate likelihood function -and thus simplifies Gibbs sampling. +and thus simplifies Gibbs sampling. In Stan, there is no restriction to conjugacy for multivariate priors, and we in fact recommend a slightly different approach. Like Gelman and Hill, we decompose our prior into a scale and a matrix, but are able to do so in a more natural way based on the actual variable -scales and a correlation matrix. Specifically, we define +scales and a correlation matrix. Specifically, we define \[ \Sigma = \mbox{diag\_matrix}(\tau) \ \Omega \ \mbox{diag\_matrix}(\tau), \] @@ -1446,7 +1446,7 @@ \subsubsection{Hyperpriors} prior for scales, but we recommend something weakly informative like a half-Cauchy distribution with a small scale, such as \[ -\tau_k \sim \distro{Cauchy}(0, 2.5) +\tau_k \sim \distro{Cauchy}(0, 2.5) \mbox{ for } k \in 1{:}K \mbox{ constrained by } \tau_k > 0. 
\] As for the prior means, if there is information about the scale of @@ -1484,7 +1484,7 @@ \subsubsection{Group-Level Predictors for Prior Mean} independent weakly informative priors, such as \[ \gamma_l \sim \distro{Normal}(0,5). -\] +\] As usual, information about the group-level means should be incorporated into this prior. @@ -1529,7 +1529,7 @@ \subsubsection{Coding the Model in Stan} \end{stancode} % The hyperprior covariance matrix is defined implicitly through the -a quadratic form in the code +a quadratic form in the code because the correlation matrix \code{Omega} and scale vector \code{tau} are more natural to inspect in the output; to output \code{Sigma}, define it as a transformed parameter. The function @@ -1599,7 +1599,7 @@ \subsubsection{Optimization through Vectorization} The code in the Stan program above also builds up an array of vectors for the outcomes and for the multivariate normal, which provides a very significant speedup by reducing the number of linear systems that -need to be solved and differentiated. +need to be solved and differentiated. % \begin{stancode} { @@ -1629,7 +1629,7 @@ \subsubsection{Optimization through Cholesky Factorization} % \begin{stancode} data { - matrix[J, L] u; + matrix[J, L] u; ... parameters { matrix[K, J] z; @@ -1640,10 +1640,10 @@ \subsubsection{Optimization through Cholesky Factorization} beta = u * gamma + (diag_pre_multiply(tau,L_Omega) * z)'; } model { - to_vector(z) ~ normal(0, 1); + to_vector(z) ~ normal(0, 1); L_Omega ~ lkj_corr_cholesky(2); ... -\end{stancode} +\end{stancode} % The data variable \code{u} was originally an array of vectors, which is efficient for access; here it is redeclared as a matrix in order to @@ -1660,12 +1660,12 @@ \subsubsection{Optimization through Cholesky Factorization} factor of the final covariance matrix, % \begin{stancode} - Sigma_beta + Sigma_beta = quad_form_diag(Omega, tau) = diag_pre_multiply(tau, L_Omega) * diag_pre_multiply(tau, L_Omega)' \end{stancode} % -where the diagonal pre-multiply compound operation is defined by +where the diagonal pre-multiply compound operation is defined by % \begin{stancode} diag_pre_multiply(a, b) = diag_matrix(a) * b @@ -1685,8 +1685,8 @@ \subsubsection{Optimization through Cholesky Factorization} \begin{stancode} parameters { matrix[K, J] z; - cholesky_factor_corr[K] L_Omega; - vector[K] tau_unif; + cholesky_factor_corr[K] L_Omega; + vector[K] tau_unif; matrix[L, K] gamma; // group coeffs real sigma; // prediction error scale } @@ -1697,7 +1697,7 @@ \subsubsection{Optimization through Cholesky Factorization} beta = u * gamma + (diag_pre_multiply(tau,L_Omega) * z)'; } model { - to_vector(z) ~ normal(0, 1); + to_vector(z) ~ normal(0, 1); L_Omega ~ lkj_corr_cholesky(2); to_vector(gamma) ~ normal(0, 5); y ~ normal(rows_dot_product(beta[jj] , x), sigma); @@ -1716,12 +1716,12 @@ \subsubsection{Optimization through Cholesky Factorization} % ... % transformed parameters { % matrix[M,3] alpha; -% alpha +% alpha % = transpose(rep_matrix(mu, M) % + diag_pre_multiply(sigma_Sigma,L_Sigma) * z); % ... 
% model { -% to_vector(z) ~ normal(0, 1); +% to_vector(z) ~ normal(0, 1); % gamma ~ normal(0, 5); % sigma_Sigma ~ cauchy(0, 2.5); % L_Sigma ~ lkj_corr_cholesky(3); @@ -1729,7 +1729,7 @@ \subsubsection{Optimization through Cholesky Factorization} % \end{stancode} % \end{quote} % % -% Taken together, this Stan program amounts to +% Taken together, this Stan program amounts to % % % \begin{eqnarray*} % \sigma & \sim & \mbox{Cauchy}(0, 2.5) @@ -1771,11 +1771,11 @@ \subsection{Programming Predictions} data { int K; int N; - matrix[N, K] x; - vector[N] y; + matrix[N, K] x; + vector[N] y; int N_new; - matrix[N_new, K] x_new; + matrix[N_new, K] x_new; } parameters { vector[K] beta; @@ -1806,10 +1806,10 @@ \subsection{Predictions as Generated Quantities} real sigma; } model { - y ~ normal(x * beta, sigma); + y ~ normal(x * beta, sigma); } generated quantities { - vector[N_new] y_new; + vector[N_new] y_new; for (n in 1:N_new) y_new[n] = normal_rng(x_new[n] * beta, sigma); } @@ -1832,7 +1832,7 @@ \subsubsection{Overflow in Generated Quantities} be intercepted and dealt with, typically be reparameterizing or reimplementing the random number generator using real values rather than integers, which are upper-bounded by $2^{31} - 1$ in Stan. - + \section{Multivariate Outcomes} @@ -1841,13 +1841,13 @@ \section{Multivariate Outcomes} regressions are just repeated categorical regressions. In contrast, this section discusses regression when each observed value is multivariate. To relate multiple outcomes in a regression setting, -their error terms are provided with covariance structure. +their error terms are provided with covariance structure. This section considers two cases, seemingly unrelated regressions for continuous multivariate quantities and multivariate probit regression for boolean multivariate quantities. -\subsection{Seemingly Unrelated Regressions} +\subsection{Seemingly Unrelated Regressions} The first model considered is the ``seemingly unrelated'' regressions (SUR) of econometrics where several linear regressions share @@ -1932,7 +1932,7 @@ \subsection{Seemingly Unrelated Regressions} If required, the full correlation or covariance matrices may be reconstructed from their Cholesky factors in the generated quantities -block. +block. \subsection{Multivariate Probit Regression} @@ -1944,7 +1944,7 @@ \subsection{Multivariate Probit Regression} The observations $y_n$ are $D$-vectors of boolean values (coded 0 for false, 1 for true). The values for the observations $y_n$ are based on latent values $z_n$ drawn from a seemingly unrelated regression -model (see the previous section), +model (see the previous section), % \begin{eqnarray*} z_n & = & x_n \, \beta + \epsilon_n @@ -2067,7 +2067,7 @@ \subsection{Multivariate Probit Regression} These include the regression coefficients \code{beta} and the Cholesky factor of the correlation matrix, \code{L\_Omega}. This time there is no scaling because the covariance matrix has unit scale (i.e., it is a -correlation matrix; see above). +correlation matrix; see above). 
The critical part of the parameter declaration is that the latent real value $z$ is broken into positive-constrained and negative-constrained @@ -2092,9 +2092,9 @@ \subsection{Multivariate Probit Regression} model { L_Omega ~ lkj_corr_cholesky(4); to_vector(beta) ~ normal(0, 5); - { + { vector[D] beta_x[N]; - for (n in 1:N) + for (n in 1:N) beta_x[n] = beta * x[n]; z ~ multi_normal_cholesky(beta_x, L_Omega); } @@ -2105,12 +2105,12 @@ \subsection{Multivariate Probit Regression} Chib-style constraints on \code{z}. Finally, the correlation matrix itself can be put back together in the -generated quantities block if desired. +generated quantities block if desired. % \begin{stancode} generated quantities { corr_matrix[D] Omega; - Omega = multiply_lower_tri_self_transpose(L_Omega); + Omega = multiply_lower_tri_self_transpose(L_Omega); } \end{stancode} % @@ -2136,9 +2136,9 @@ \subsection{Prediction} and $y$ is % \[ -p(\alpha, \beta, \sigma \, | \, x, y) -\propto -\prod_{n=1}^N +p(\alpha, \beta, \sigma \, | \, x, y) +\propto +\prod_{n=1}^N \distro{Normal}(y_n \, | \, \alpha + \beta x_n, \sigma). \] % @@ -2150,7 +2150,7 @@ \subsection{Prediction} = \int_{(\alpha,\beta,\sigma)} \distro{Normal}(\tilde{y}_n \, | \, \alpha + \beta \tilde{x}_n, \sigma) \times - p(\alpha, \beta, \sigma \, | \, x, y) + p(\alpha, \beta, \sigma \, | \, x, y) \ \mathrm{d}(\alpha,\beta,\sigma). \] @@ -2223,7 +2223,7 @@ \chapter{Time-Series Models}\label{time-series.chapter} \noindent Times series data come arranged in temporal order. This chapter presents two kinds of time series models, regression-like models such -as autoregressive and moving average models, and hidden Markov models. +as autoregressive and moving average models, and hidden Markov models. \refchapter{gaussian-processes} presents Gaussian processes, which may also be used for time-series (and spatial) data. @@ -2279,7 +2279,7 @@ \subsubsection{Slicing for Efficiency} % \begin{stancode} model { - y[2:(N - 1)] ~ normal(alpha + beta * y[1:(N - 1)], sigma); + y[2:N] ~ normal(alpha + beta * y[1:(N - 1)], sigma); } \end{stancode} % @@ -2288,13 +2288,13 @@ \subsubsection{Slicing for Efficiency} -\subsection{Extensions to the AR(1) Model} +\subsection{Extensions to the AR(1) Model} Proper priors of a range of different families may be added for the regression coefficients and noise scale. The normal noise model can be changed to a Student-$t$ distribution or any other distribution with unbounded support. The model could also be made hierarchical if -multiple series of observations are available. +multiple series of observations are available. To enforce the estimation of a stationary AR(1) process, the slope coefficient \code{beta} may be constrained with bounds as follows. @@ -2368,7 +2368,7 @@ \subsection{ARCH(1) Models} and $\mu$, $\alpha_0$, and $\alpha_1$ are unknown regression coefficient parameters. % \begin{eqnarray*} -r_t & = & \mu + a_t +r_t & = & \mu + a_t \\[2pt] a_t & = & \sigma_t \epsilon_t \\[2pt] @@ -2379,7 +2379,7 @@ \subsection{ARCH(1) Models} % In order to ensure the noise terms $\sigma^2_t$ are positive, the scale coefficients are constrained to be positive, $\alpha_0, \alpha_1 -> 0$. To ensure stationarity of the time series, the slope is +> 0$. 
To ensure stationarity of the time series, the slope is constrained to to be less than one, $\alpha_1 < 1$.% % \footnote{In practice, it can be useful to remove the constraint to @@ -2434,21 +2434,21 @@ \subsection{GARCH(1,1) Models} % \begin{stancode} data { - int T; + int T; real r[T]; - real sigma1; + real sigma1; } parameters { - real mu; - real alpha0; - real alpha1; - real beta1; + real mu; + real alpha0; + real alpha1; + real beta1; } transformed parameters { real sigma[T]; sigma[1] = sigma1; for (t in 2:T) - sigma[t] = sqrt(alpha0 + sigma[t] = sqrt(alpha0 + alpha1 * pow(r[t-1] - mu, 2) + beta1 * pow(sigma[t-1], 2)); } @@ -2458,12 +2458,12 @@ \subsection{GARCH(1,1) Models} \end{stancode} % To get the recursive definition of the volatility regression off the -ground, the data declaration includes a non-negative value -\code{sigma1} for the scale of the noise at $t = 1$. +ground, the data declaration includes a non-negative value +\code{sigma1} for the scale of the noise at $t = 1$. The constraints are coded directly on the parameter declarations. This declaration is order-specific in that the constraint on \code{beta1} -depends on the value of \code{alpha1}. +depends on the value of \code{alpha1}. A transformed parameter array of non-negative values \code{sigma} is used to store the scale values at each time point. The definition of @@ -2528,7 +2528,7 @@ \subsection{$\mbox{MA}(2)$ Example} theta ~ cauchy(0, 2.5); sigma ~ cauchy(0, 2.5); for (t in 3:T) - y[t] ~ normal(mu + y[t] ~ normal(mu + theta[1] * epsilon[t - 1] + theta[2] * epsilon[t - 2], sigma); @@ -2545,7 +2545,7 @@ \subsection{$\mbox{MA}(2)$ Example} This model could be improved in terms of speed by vectorizing the sampling statement in the model block. Vectorizing the calculation of the $\epsilon_t$ could also be sped up by using a dot product instead -of a loop. +of a loop. \subsection{Vectorized $\mbox{MA}(Q)$ Model} @@ -2629,11 +2629,11 @@ \section{Autoregressive Moving Average Models} \end{stancode} % The data is declared in the same way as the other time-series -regressions and the parameters are documented in the code. +regressions and the parameters are documented in the code. In the model block, the local vector \code{nu} stores the predictions and \code{err} the errors. These are computed similarly to the -errors in the moving average models described in the previous section. +errors in the moving average models described in the previous section. The priors are weakly informative for stationary processes. The likelihood only involves the error term, which is efficiently @@ -2658,7 +2658,7 @@ \section{Autoregressive Moving Average Models} err = y[1] - mu + phi * mu; err ~ normal(0, sigma); for (t in 2:T) { - err = y[t] - (mu + phi * y[t-1] + theta * err); + err = y[t] - (mu + phi * y[t-1] + theta * err); err ~ normal(0, sigma); } } @@ -2668,7 +2668,7 @@ \section{Autoregressive Moving Average Models} variables, such as \code{err} in this case, can be reused in Stan. Folta's approach could be extended to higher order moving-average models by storing more than one error term as a local variable and -reassigning them in the loop. +reassigning them in the loop. Both encodings are very fast. The original encoding has the advantage of vectorizing the normal distribution, but it uses a bit more memory. 
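As a rough sketch of that extension, an ARMA(1,2)-style error recursion might keep
two lagged errors as local variables. The names \code{err1} and \code{err2}, the
zero initialization of the presample errors, and the declaration of \code{theta} as
a 2-vector are all assumptions made for this sketch rather than part of the program
above; the remaining data and parameters are assumed to be declared as in the
ARMA(1,1) program.
%
\begin{stancode}
model {
  real err1;    // error at time t - 1
  real err2;    // error at time t - 2
  // ... priors as in the ARMA(1,1) program ...
  err1 = 0;     // assumed presample errors
  err2 = 0;
  for (t in 2:T) {
    real err;
    err = y[t] - (mu + phi * y[t - 1]
                  + theta[1] * err1 + theta[2] * err2);
    err ~ normal(0, sigma);
    err2 = err1;
    err1 = err;
  }
}
\end{stancode}
%
Starting the loop at $t = 2$ conditions on the first observation, just as the
single-error encoding conditions on its initial value; the only change is the extra
lagged error to shuffle at the end of each iteration.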
@@ -2702,7 +2702,7 @@ \subsection{Identifiability and Stationarity}% % \begin{stancode} read phi; -\end{stancode} +\end{stancode} @@ -2736,7 +2736,7 @@ \section{Stochastic Volatility Models} % Rearranging the first line, $\epsilon_t = y_t \exp(-h_t / 2)$, allowing the sampling distribution for $y_t$ to be written as -\[ +\[ y_t \sim \distro{Normal}(0,\exp(h_t/2)). \] The recurrence equation for $h_{t+1}$ may be combined with the @@ -2761,7 +2761,7 @@ \section{Stochastic Volatility Models} model { phi ~ uniform(-1, 1); sigma ~ cauchy(0, 5); - mu ~ cauchy(0, 10); + mu ~ cauchy(0, 10); h[1] ~ normal(mu, sigma / sqrt(1 - phi * phi)); for (t in 2:T) h[t] ~ normal(mu + phi * (h[t - 1] - mu), sigma); @@ -2781,11 +2781,11 @@ \section{Stochastic Volatility Models} data points from the above model with $\mu = -1.02$, $\phi = 0.95$, and $\sigma = 0.25$ leads to 95\% posterior intervals for $\mu$ of $(-1.23, -0.54)$, for $\phi$ of $(0.82,0.98 )$ and for $\sigma$ of -$(0.16,0.38)$. +$(0.16,0.38)$. The samples using NUTS show a high degree of autocorrelation among the samples, both for this model and the stochastic volatility model -evaluated in \citep{Hoffman-Gelman:2011, Hoffman-Gelman:2014}. +evaluated in \citep{Hoffman-Gelman:2011, Hoffman-Gelman:2014}. Using a non-diagonal mass matrix provides faster convergence and more effective samples than a diagonal mass matrix, but will not scale to large values of $T$. @@ -2866,7 +2866,7 @@ \section{Hidden Markov Models}\label{hmms.section} z_t \sim \distro{Categorical}(\theta_{z[t-1]}). \] The output $y_t$ at time $t$ is generated conditionally independently -based on the latent state $z_t$. +based on the latent state $z_t$. This section describes HMMs with a simple categorical model for outputs $y_t \in \{1,\ldots,V\}$. The categorical distribution for @@ -2901,7 +2901,7 @@ \subsection{Supervised Parameter Estimation} simplex[V] phi[K]; // emit probs } model { - for (k in 1:K) + for (k in 1:K) theta[k] ~ dirichlet(alpha); for (k in 1:K) phi[k] ~ dirichlet(beta); @@ -2924,7 +2924,7 @@ \subsection{Start-State and End-State Probabilities} the probability of $z_1$ should be set to the stationary state probabilities in the Markov chain. In this case, there is no distinct end to the data, so there is no need to model the probability that the -sequence ends at $z_T$. +sequence ends at $z_T$. An alternative conception of HMMs is as models of finite-length sequences. For example, human language sentences have distinct @@ -2953,7 +2953,7 @@ \subsection{Calculating Sufficient Statistics} transformed data { int trans[K, K]; int emit[K, V]; - for (k1 in 1:K) + for (k1 in 1:K) for (k2 in 1:K) trans[k1, k2] = 0; for (t in 2:T) @@ -3006,7 +3006,7 @@ \subsection{Analytic Posterior} transformed data { vector[K] alpha_post[K]; vector[V] beta_post[K]; - for (k in 1:K) + for (k in 1:K) alpha_post[k] = alpha; for (t in 2:T) alpha_post[z[t-1], z[t]] = alpha_post[z[t-1], z[t]] + 1; @@ -3021,7 +3021,7 @@ \subsection{Analytic Posterior} % \begin{stancode} model { - for (k in 1:K) + for (k in 1:K) theta[k] ~ dirichlet(alpha_post[k]); for (k in 1:K) phi[k] ~ dirichlet(beta_post[k]); @@ -3039,7 +3039,7 @@ \subsection{Semisupervised Estimation} strategy in Stan requires calculating the probability of an output sequence with an unknown state sequence. This is a marginalization problem, and for HMMs, it is computed with the so-called forward -algorithm. +algorithm. 
In Stan, the forward algorithm is coded as follows.% % @@ -3059,12 +3059,12 @@ \subsection{Semisupervised Estimation} % The model for the supervised data does not change; the unsupervised data is handled with the following Stan implementation of the forward -algorithm. +algorithm. % \begin{stancode} model { ... - { + { real acc[K]; real gamma[T_unsup, K]; for (k in 1:K) @@ -3099,7 +3099,7 @@ \subsection{Semisupervised Estimation} The brackets provide the scope for the local variables \code{acc} and \code{gamma}; these could have been declared earlier, but it is -clearer to keep their declaration near their use. +clearer to keep their declaration near their use. \subsection{Predictive Inference} @@ -3108,7 +3108,7 @@ \subsection{Predictive Inference} $\phi_{k,v}$ and an observation sequence $u_1,\ldots,u_T \in \{ 1,\ldots,V \}$, the Viterbi (dynamic programming) algorithm computes the state sequence which is most likely to have generated the -observed output $u$. +observed output $u$. The Viterbi algorithm can be coded in Stan in the generated quantities block as follows. The predictions here is the most likely state @@ -3122,7 +3122,7 @@ \subsection{Predictive Inference} generated quantities { int y_star[T_unsup]; real log_p_y_star; - { + { int back_ptr[T_unsup, K]; real best_logp[T_unsup, K]; real best_total_logp; @@ -3133,7 +3133,7 @@ \subsection{Predictive Inference} best_logp[t, k] = negative_infinity(); for (j in 1:K) { real logp; - logp = best_logp[t-1, j] + logp = best_logp[t-1, j] + log(theta[j, k]) + log(phi[k, u[t]]); if (logp > best_logp[t, k]) { back_ptr[t, k] = j; @@ -3147,7 +3147,7 @@ \subsection{Predictive Inference} if (best_logp[T_unsup, k] == log_p_y_star) y_star[T_unsup] = k; for (t in 1:(T_unsup - 1)) - y_star[T_unsup - t] = back_ptr[T_unsup - t + 1, + y_star[T_unsup - t] = back_ptr[T_unsup - t + 1, y_star[T_unsup - t + 1]]; } } @@ -3261,7 +3261,7 @@ \section{Partially Known Parameters}\label{partially-known-parameters.section} real var1; real var2; } transformed data { - real max_cov = sqrt(var1 * var2); + real max_cov = sqrt(var1 * var2); real min_cov = -max_cov; } parameters { @@ -3272,7 +3272,7 @@ \section{Partially Known Parameters}\label{partially-known-parameters.section} matrix[2, 2] Sigma; Sigma[1, 1] = var1; Sigma[1, 2] = cov; Sigma[2, 1] = cov; Sigma[2, 2] = var2; -} +} model { y ~ multi_normal(mu, Sigma); } @@ -3292,14 +3292,14 @@ \section{Partially Known Parameters}\label{partially-known-parameters.section} The vectorization of the multivariate normal is critical for efficiency here. The transformed parameter \code{Sigma} could be -defined as a local variable within the model block if +defined as a local variable within the model block if \section{Sliced Missing Data} If the missing data is part of some larger data structure, then it can often be effectively reassembled using index arrays and slicing. Here's an example for time-series data, where only some entries in the -series are observed. +series are observed. 
% \begin{stancode} data { @@ -3372,13 +3372,13 @@ \section{Loading matrix for factor analysis} K_choose_2 = (K * (K - 1)) / 2; } parameters { - vector[K_choose_2] L_lower; + vector[K_choose_2] L_lower; } transformed parameters { cholesky_factor_cov[K] L; for (k in 1:K) L[k, k] = 1; - { + { int i; for (m in 2:K) { for (n in 1:(m - 1)) { @@ -3468,7 +3468,7 @@ \section{Missing Multivariate Data} ns1[n1] = n; n1 = n1 + 1; } else if (y_observed[n, 2]) { - ns2[n2] = n; + ns2[n2] = n; n2 = n2 + 1; } } @@ -3518,30 +3518,30 @@ \section{Truncated Data}\label{truncated-data.section} Truncated data is data for which measurements are only reported if they fall above a lower bound, below an upper bound, or between a -lower and upper bound. +lower and upper bound. Truncated data may be modeled in Stan using truncated distributions. For example, suppose the truncated data is $y_n$ with an upper truncation point of $U = 300$ so that $y_n < 300$. In Stan, this data can be modeled as following a truncated normal distribution for -the observations as follows. +the observations as follows. % \begin{stancode} data { int N; real U; real y[N]; -} +} parameters { real mu; real sigma; -} +} model { for (n in 1:N) y[n] ~ normal(mu, sigma) T[,U]; } \end{stancode} -% +% The model declares an upper bound \code{U} as data and constrains the data for \code{y} to respect the constraint; this will be checked when the data is loaded into the model before sampling begins. @@ -3558,7 +3558,7 @@ \subsection{Constraints and Out-of-Bounds Returns} sampled with the statement. % \begin{stancode} -for (n in 1:N) +for (n in 1:N) y[n] ~ normal(mu, sigma) T[L,U]; \end{stancode} % @@ -3582,7 +3582,7 @@ \subsection{Constraints and Out-of-Bounds Returns} \code{y} is data, then \code{L} and \code{U} must be appropriately constrained so that all data is in range and the value of \code{L} is less than that of \code{U} (if they are equal, the parameter range -collapses to a single point and the Hamiltonian dynamics used by +collapses to a single point and the Hamiltonian dynamics used by the sampler break down). The following declarations ensure the bounds are well behaved. % @@ -3614,13 +3614,13 @@ \subsection{Unknown Truncation Points} real y[N]; } parameters { - real L; + real L; real U; real mu; real sigma; } model { - L ~ ...; + L ~ ...; U ~ ...; for (n in 1:N) y[n] ~ normal(mu, sigma) T[L,U]; @@ -3638,7 +3638,7 @@ \subsection{Unknown Truncation Points} The ellipses where the priors for the bounds \code{L} and \code{U} should go should be filled in with a an informative prior in -order for this model to not concentrate \code{L} strongly around +order for this model to not concentrate \code{L} strongly around \code{min(y)} and \code{U} strongly around \code{max(y)}. @@ -3647,7 +3647,7 @@ \section{Censored Data} Censoring hides values from points that are too large, too small, or both. Unlike with truncated data, the number of data points that were censored is known. The textbook example is the household scale which -does not report values above 300 pounds. +does not report values above 300 pounds. \subsection{Estimating Censored Values} @@ -3681,7 +3681,7 @@ \subsection{Estimating Censored Values} declared to have values of type \code{real}, all imputed values for censored data will be greater than \code{U}. The imputed censored data affects the location and scale parameters through the last -sampling statement in the model. +sampling statement in the model. 
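A minimal sketch of the setup just described is shown below. The names
\code{y\_obs}, \code{y\_cens}, \code{N\_obs}, and \code{N\_cens} are illustrative
rather than taken from the program above; the declared lower bound \code{U} is what
keeps every imputed value above the censoring point.
%
\begin{stancode}
data {
  int<lower=0> N_obs;
  int<lower=0> N_cens;
  real U;                         // censoring point
  real<upper=U> y_obs[N_obs];     // uncensored measurements
}
parameters {
  real mu;
  real<lower=0> sigma;
  real<lower=U> y_cens[N_cens];   // imputed censored measurements
}
model {
  y_obs ~ normal(mu, sigma);
  y_cens ~ normal(mu, sigma);     // statement through which the imputed
                                  //   values inform mu and sigma
}
\end{stancode}
%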
\subsection{Integrating out Censored Values} @@ -3691,7 +3691,7 @@ \subsection{Integrating out Censored Values} probability of % \[ -\mbox{Pr}[y > U] +\mbox{Pr}[y > U] = \int_U^{\infty} \distro{Normal}(y|\mu,\sigma) \, dy = 1 - \Phi\left(\frac{y - \mu}{\sigma}\right), \] @@ -3724,7 +3724,7 @@ \subsection{Integrating out Censored Values} real sigma; } model { - y_obs ~ normal(mu, sigma); + y_obs ~ normal(mu, sigma); target += N_cens * normal_lccdf(U | mu, sigma); } \end{stancode} @@ -3780,7 +3780,7 @@ \section{Relation to Clustering}\label{clustering-mixture.section} multivariate form as the statistical basis for the $K$-means algorithm; the latent Dirichlet allocation model, usually applied to clustering problems, can be viewed as a mixed-membership multinomial -mixture model. +mixture model. \section{Latent Discrete Parameterization} @@ -3800,13 +3800,13 @@ \section{Latent Discrete Parameterization} \] % The variable $y_n$ is distributed according to the parameters -of the mixture component $z_n$, +of the mixture component $z_n$, \[ y_n \sim \distro{Normal}(\mu_{z[n]},\sigma_{z[n]}). \] % This model is not directly supported by Stan because it involves -discrete parameters $z_n$, but Stan can sample $\mu$ and $\sigma$ +discrete parameters $z_n$, but Stan can sample $\mu$ and $\sigma$ by summing out the $z$ parameter as described in the next section. @@ -3814,12 +3814,12 @@ \section{Summing out the Responsibility Parameter} To implement the normal mixture model outlined in the previous section in Stan, the discrete parameters can be summed out of the -model. If $Y$ is a mixture of $K$ normal distributions with +model. If $Y$ is a mixture of $K$ normal distributions with locations $\mu_k$ and scales $\sigma_k$ with mixing proportions -$\lambda$ in the unit $K$-simplex, then +$\lambda$ in the unit $K$-simplex, then \[ p_Y(y | \lambda, \mu, \sigma) -\ = \ +\ = \ \sum_{k=1}^K \lambda_k \, \distro{Normal}(y \, | \, \mu_k, \sigma_k). \] @@ -3879,7 +3879,7 @@ \section{Log Sum of Exponentials: Linear Sums on the Log Scale} \\[2pt] & = & \mbox{log\_sum\_exp}(\hspace*{-5pt}\begin{array}[t]{l} \log(0.3) + \log \distro{Normal}(y|-1,2), - \\ + \\ \log(0.7) + \log \distro{Normal}(y|3,1) \ ). \end{array} \end{eqnarray*} @@ -3918,7 +3918,7 @@ \subsubsection{Dropping uniform mixture ratios} % \[ \mbox{log\_sum\_exp}(c + a, c + b) -\ = \ +\ = \ c + \mbox{log\_sum\_exp(a, b)} \] % @@ -3965,7 +3965,7 @@ \subsection{Estimating Parameters of a Mixture} points. The mixing proportion parameter \code{theta} is declared to be a unit $K$-simplex, whereas the component location parameter \code{mu} and scale parameter \code{sigma} are both defined to be -\code{K}-vectors. +\code{K}-vectors. The location parameter \code{mu} is declared to be an ordered vector in order to identify the model. This will not affect inferences that @@ -3976,7 +3976,7 @@ \subsection{Estimating Parameters of a Mixture} The values in the scale array \code{sigma} are constrained to be non-negative, and have a weakly informative prior given in the model -chosen to avoid zero values and thus collapsing components. +chosen to avoid zero values and thus collapsing components. 
The model declares a local array variable \code{lps} to be size \code{K} and uses it to accumulate the log contributions from the @@ -3998,7 +3998,7 @@ \section{Vectorizing Mixtures} % \begin{stancode} for (n in 1:N) { - target += log_sum_exp(log(lambda) + target += log_sum_exp(log(lambda) + normal_lpdf(y[n] | mu[1], sigma[1]), log1m(lambda) + normal_lpdf(y[n] | mu[2], sigma[2])); @@ -4008,7 +4008,7 @@ \section{Vectorizing Mixtures} % \begin{stancode} for (n in 1:N) - target += log_mix(lambda, + target += log_mix(lambda, normal_lpdf(y[n] | mu[1], sigma[1]), normal_lpdf(y[n] | mu[2], sigma[2]))); \end{stancode} @@ -4016,16 +4016,16 @@ \section{Vectorizing Mixtures} This definition assumes that each observation $y_n$ may have arisen from either of the mixture components. The density is \[ -p(y \, | \, \lambda, \mu, \sigma) +p(y \, | \, \lambda, \mu, \sigma) = \prod_{n=1}^N (\lambda \times \distro{Normal}(y_n \, | \, \mu_1, \sigma_1) + (1 - \lambda) \times \distro{Normal}(y_n \, | \, \mu_2, \sigma_2). \] -% +% Contrast the previous model with the following (erroneous) attempt to vectorize the model. % \begin{stancode} -target += log_sum_exp(log(lambda) +target += log_sum_exp(log(lambda) + normal_lpdf(y | mu[1], sigma[1]), log1m(lambda) + normal_lpdf(y | mu[2], sigma[2])); @@ -4056,12 +4056,12 @@ \section{Inferences Supported by exchangeable in the model and thus not identifiable. This arises if the parameters of the mixture components have exchangeable priors and the mixture ratio gets a uniform prior so that the parameters of the -mixture components are also exchangeable in the likelihood. +mixture components are also exchangeable in the likelihood. We have finessed this basic problem by ordering the parameters. This will allow us in some cases to pick out mixture components either ahead of time or after fitting (e.g., male vs. female, or Democrat -vs.\ Republican). +vs.\ Republican). In other cases, we do not care about the actual identities of the mixture components and want to consider inferences that are @@ -4089,8 +4089,8 @@ \subsection{Mixtures with Unidentifiable Components} so that with the likelihood % \[ -p(y_n \, | \, \mu, \sigma) -= \lambda \, \distro{Normal}(y_n \, | \, \mu_1, \sigma_1) +p(y_n \, | \, \mu, \sigma) += \lambda \, \distro{Normal}(y_n \, | \, \mu_1, \sigma_1) + (1 - \lambda) \, \distro{Normal}(y_n \, | \, \mu_2, \sigma_2), \] % @@ -4100,7 +4100,7 @@ \subsection{Mixtures with Unidentifiable Components} % \footnote{Imposing a constraint such as $\theta < 0.5$ will resolve the symmetry, but fundamentally changes the model and its posterior - inferences.} + inferences.} \subsection{Inference under Label Switching} @@ -4119,7 +4119,7 @@ \subsubsection{Predictive likelihood} % \[ p(\tilde{y} \, | \, y) -= += \int_{\theta} p(\tilde{y} \, | \, \theta) \, p(\theta | y) @@ -4195,7 +4195,7 @@ \section{Zero-Inflated and Hurdle Models}\label{zero-inflated.section} modeling the probability of a zero outcome. Zero-inflated models, as defined by \citet{Lambert:1992}, add additional probability mass to the outcome of zero. Hurdle models, on the other hand, are formulated -as pure mixtures of zero and non-zero outcomes. +as pure mixtures of zero and non-zero outcomes. Zero inflation and hurdle models can be formulated for discrete distributions other than the Poisson. Zero inflation does not work @@ -4215,8 +4215,8 @@ \subsection{Zero Inflation} notation for a Poisson mean parameter). 
The probability function is thus \[ -p(y_n|\theta,\lambda) -= +p(y_n|\theta,\lambda) += \left\{ \begin{array}{ll} \theta + (1 - \theta) \times \distro{Poisson}(0|\lambda) & \mbox{ if } y_n = 0, \mbox{ and} @@ -4224,7 +4224,7 @@ \subsection{Zero Inflation} (1-\theta) \times \distro{Poisson}(y_n|\lambda) & \mbox{ if } y_n > 0. \end{array} \right. -\] +\] % The log probability function can be implemented directly in Stan as follows. % @@ -4241,7 +4241,7 @@ \subsection{Zero Inflation} for (n in 1:N) { if (y[n] == 0) target += log_sum_exp(bernoulli_lpmf(1 | theta), - bernoulli_lpmf(0 | theta) + bernoulli_lpmf(0 | theta) + poisson_lpmf(y[n] | lambda)); else target += bernoulli_lpmf(0 | theta) @@ -4265,7 +4265,7 @@ \subsection{Hurdle Models} % \[ p(y|\theta,\lambda) -= += \begin{cases} \ \theta & \mbox{if } y = 0, \mbox{ and} \\ @@ -4279,7 +4279,7 @@ \subsection{Hurdle Models} % where \distro{PoissonCDF} is the cumulative distribution function for the Poisson distribution. The hurdle model is even more straightforward to -program in Stan, as it does not require an explicit mixture. +program in Stan, as it does not require an explicit mixture. % \begin{stancode} if (y[n] == 0) @@ -4308,7 +4308,7 @@ \subsection{Hurdle Models} Julian King pointed out that because \[ \log \left( 1 - \distro{PoissonCDF}(0 | \lambda) \right) -\ = \ \log \left( 1 - \distro{Poisson}(0 | \lambda) \right) +\ = \ \log \left( 1 - \distro{Poisson}(0 | \lambda) \right) \ = \ \log(1 - \exp(-\lambda)) \] the CCDF in the else clause can be replaced with a simpler expression. @@ -4353,7 +4353,7 @@ \subsection{Hurdle Models} N0 = num_zero(y); Ngt0 = N - N0; - { + { int pos; pos = 1; for (n in 1:N) { @@ -4434,7 +4434,7 @@ \chapter{Measurement Error and Meta-Analysis} large relative to the quantity being measured, or when very precise relations can be estimated being measured quantities, it is useful to introduce an explicit model of measurement error. One kind of -measurement error is rounding. +measurement error is rounding. Meta-analysis plays out statistically very much like measurement error models, where the inferences drawn from multiple data sets are @@ -4465,12 +4465,12 @@ \subsection{Regression with Measurement Error} } parameters { real alpha; // intercept - real beta; // slope + real beta; // slope real sigma; // outcome noise } model { y ~ normal(alpha + beta * x, sigma); - alpha ~ normal(0, 10); + alpha ~ normal(0, 10); beta ~ normal(0, 10); sigma ~ cauchy(0, 5); } @@ -4494,7 +4494,7 @@ \subsection{Regression with Measurement Error} real tau; // measurement noise } parameters { - real x[N]; // unknown true value + real x[N]; // unknown true value real mu_x; // prior location real sigma_x; // prior scale ... @@ -4503,7 +4503,7 @@ \subsection{Regression with Measurement Error} x ~ normal(mu_x, sigma_x); // prior x_meas ~ normal(x, tau); // measurement model y ~ normal(alpha + beta * x, sigma); - ... + ... } \end{stancode} % @@ -4535,7 +4535,7 @@ \subsection{Rounding} A common form of measurement error arises from rounding measurements. Rounding may be done in many ways, such as rounding weights to the nearest milligram, or to the nearest pound; rounding may even be done -by rounding down to the nearest integer. +by rounding down to the nearest integer. Exercise 3.5(b) from \citep{GelmanEtAl:2013} provides an example. % @@ -4543,7 +4543,7 @@ \subsection{Rounding} 3.5. \ Suppose we weigh an object five times and measure weights, rounded to the nearest pound, of 10, 10, 12, 11, 9. 
Assume the unrounded measurements are normally distributed with a - noninformative prior distribution on $\mu$ and $\sigma^2$. + noninformative prior distribution on $\mu$ and $\sigma^2$. \\[4pt] (b) \ Give the correct posterior distribution for $(\mu, \sigma^2)$, treating the measurements as rounded. @@ -4561,15 +4561,15 @@ \subsection{Rounding} by marginalizing out the unrounded measurement, producing the likelihood \[ p(y_n \, | \, \mu, \sigma) -= += \int_{y_n - 0.5}^{y_n + 0.5} \ \distro{Normal}(z_n \, | \, \mu, \sigma) \ \mathrm{d}z_n -= += \Phi\!\left(\frac{y_n + 0.5 - \mu}{\sigma}\right) -- +- \Phi\!\left(\frac{y_n - 0.5 - \mu}{\sigma}\right). \] Gelman's answer for this problem took the noninformative prior to be @@ -4589,10 +4589,10 @@ \subsection{Rounding} & \propto & \frac{1}{\sigma^2} \ -\mathlarger{\mathlarger{\prod}}_{n=1}^5 +\mathlarger{\mathlarger{\prod}}_{n=1}^5 \left( \Phi\!\left(\frac{y_n + 0.5 - \mu}{\sigma}\right) -- +- \Phi\!\left(\frac{y_n - 0.5 - \mu}{\sigma}\right) \right). \end{eqnarray*} @@ -4665,7 +4665,7 @@ \section{Meta-Analysis} Meta-analysis aims to pool the data from several studies, such as the application of a tutoring program in several schools or treatment -using a drug in several clinical trials. +using a drug in several clinical trials. The Bayesian framework is particularly convenient for meta-analysis, because each previous study can be treated as providing a noisy @@ -4712,17 +4712,17 @@ \subsubsection{Converting to Log Odds and Standard Error} \[ y_j = \log \left( \frac{r^t_j / (n^t_j - r^t_j)} {r^c_j / (n^c_j - r^c_j)} \right) -\ \ = \ \ +\ \ = \ \ \log \left( \frac{r^t_j}{n^t_j - r^t_j} \right) -- +- \log \left( \frac{r^c_j}{n^c_j - r^c_j} \right) \] and corresponding standard errors \[ \sigma_j = \sqrt{ -\frac{1}{r^T_i} +\frac{1}{r^T_i} + \frac{1}{n^T_i - r^T_i} -+ \frac{1}{r^C_i} ++ \frac{1}{r^C_i} + \frac{1}{n^C_i - r^C_i} }. \] @@ -4735,7 +4735,7 @@ \subsubsection{Converting to Log Odds and Standard Error} transformed data { real y[J]; real sigma[J]; - for (j in 1:J) + for (j in 1:J) y[j] = log(r_t[j]) - log(n_t[j] - r_t[j]) - (log(r_c[j]) - log(n_c[j] - r_c[j]); for (j in 1:J) @@ -4744,7 +4744,7 @@ \subsubsection{Converting to Log Odds and Standard Error} } \end{stancode} % -This definition will be problematic if any of the success counts is +This definition will be problematic if any of the success counts is zero or equal to the number of trials. If that arises, a direct binomial model will be required or other transforms must be used than the unregularized sample log odds. @@ -4783,7 +4783,7 @@ \subsubsection{Hierarchical Model} vary by clinical trial, a hierarchical model can be used. The parameters include per-trial treatment effects and the hierarchical prior parameters, which will be estimated along with other unknown -quantities. +quantities. % \begin{stancode} parameters { @@ -4812,7 +4812,7 @@ \subsubsection{Hierarchical Model} % \footnote{The model provided for this data in \citep[Section~5.5]{GelmanEtAl:2013} is included with the -data in the Stan example model repository, +data in the Stan example model repository, \url{http://mc-stan.org/documentation}.} \subsubsection{Extensions and Alternatives} @@ -4862,7 +4862,7 @@ \section{The Benefits of Marginalization}\label{rao-blackwell.section} (EM), are often provided in applied statistics papers to describe maximum likelihood estimation algorithms. Such derivations provide exactly the marginalization needed for coding the model in Stan. 
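To make that connection concrete before turning to the examples, the generic
pattern for summing a discrete parameter $z \in 1{:}K$ out of a model looks roughly
as follows. Here \code{theta} stands for the distribution of $z$ and
\code{foo\_lpdf} and \code{param} are stand-ins for whatever sampling distribution
and parameters the model at hand uses, so none of these names refers to an actual
model in this chapter.
%
\begin{stancode}
{
  vector[K] lp;    // lp[k] = log p(z = k) + log p(y | z = k, ...)
  for (k in 1:K)
    lp[k] = log(theta[k]) + foo_lpdf(y | param[k]);
  target += log_sum_exp(lp);
}
\end{stancode}
%
The change point model in the next section instantiates exactly this pattern, with
the change point $s$ playing the role of $z$.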
- + \section{Change Point Models} The first example is a model of coal mining disasters in the U.K.\ @@ -4913,13 +4913,13 @@ \subsection{Marginalizing out the Discrete Parameter} To code this model in Stan, the discrete parameter $s$ must be marginalized out to produce a model defining the log of the probability function $p(e,l,D_t)$. The full joint probability factors -as +as % \begin{eqnarray*} p(e,l,s,D) & = & p(e) \, p(l) \, p(s) \, p(D | s, e, l) \\[3pt] -& = & +& = & \begin{array}[t]{l} \distro{Exponential}(e|r_e) \ \distro{Exponential}(l|r_l) \ \distro{Uniform}(s|1, T) @@ -4938,7 +4938,7 @@ \subsection{Marginalizing out the Discrete Parameter} where the likelihood is defined by marginalizing $s$ as % \begin{eqnarray*} -p(D | e,l) +p(D | e,l) & = & \sum_{s=1}^T p(s, D | e,l) \\[3pt] @@ -4946,7 +4946,7 @@ \subsection{Marginalizing out the Discrete Parameter} \sum_{s=1}^T p(s) p(D | s,e,l) \\[3pt] & = & -\sum_{s=1}^T \distro{Uniform}(s | 1,T) +\sum_{s=1}^T \distro{Uniform}(s | 1,T) \, \prod_{t=1}^T \distro{Poisson}(D_t | t < s \ ? \ e \ : \ l) \end{eqnarray*} % @@ -4956,7 +4956,7 @@ \subsection{Marginalizing out the Discrete Parameter} \log p(D | e,l) \\[3pt] \mbox{ } \ \ = \ -\mbox{log\_sum\_exp}_{s=1}^T +\mbox{log\_sum\_exp}_{s=1}^T \begin{array}[t]{l} \big( \log \distro{Uniform}(s \, | \, 1, T) @@ -5013,7 +5013,7 @@ \subsection{Coding the Model in Stan} e ~ exponential(r_e); l ~ exponential(r_l); target += log_sum_exp(lp); -} +} \end{stancode} \vspace*{-6pt} \caption{\small\it A change point model in which disaster rates @@ -5093,7 +5093,7 @@ \subsection{Posterior Distribution of the Discrete Change Point} & \propto & q(s | D) \\[3pt] -& = & +& = & \frac{1}{M} \sum_{m=1}^{M} \exp(\code{lp}[m,s]). \end{eqnarray*} % @@ -5184,7 +5184,7 @@ \subsection{Multiple Change Points} for (s1 in 1:T) for (s2 in 1:T) for (t in 1:T) - lp[s1,s2] = lp[s1,s2] + lp[s1,s2] = lp[s1,s2] + poisson_lpmf(D[t] | t < s1 ? e : (t < s2 ? m : l)); \end{stancode} % @@ -5292,12 +5292,12 @@ \subsection{Cormack-Jolly-Seber with Discrete Parameter} The Cormack-Jolly-Seber (CJS) model \citep{Cormack:1964,Jolly:1965,Seber:1965} is an open-population model in which the population may change over time due to death; the -presentation here draws heavily on \citep{Schofield:2007}. +presentation here draws heavily on \citep{Schofield:2007}. The basic data is % \begin{itemize} -\item $I$ : number of individuals, +\item $I$ : number of individuals, \item $T$ : number of capture periods, and \item $y_{i,t}$ : boolean indicating if individual $i$ was captured at time $t$. @@ -5351,10 +5351,10 @@ \subsection{Collective Cormack-Jolly-Seber Model} alive at time $t$ (if it is dead, the recapture probability is zero). this quantity is defined recursively by \[ -\chi_t -= +\chi_t += \begin{cases} -1 +1 & \mbox{if } t = T \\[3pt] (1 - \phi_t) + \phi_t (1 - p_{t+1}) \chi_{t+1} @@ -5370,7 +5370,7 @@ \subsection{Collective Cormack-Jolly-Seber Model} \phi_t)$, or (2) surviving to the next time period with probability $\phi_t$, not being captured in the next time period with probability $(1 - p_{t+1})$, and not being captured again after being alive in -period $t+1$ with probability $\chi_{t+1}$. +period $t+1$ with probability $\chi_{t+1}$. With three capture times, there are three captured/not-captured profiles an individual may have. 
These may be naturally coded as @@ -5395,7 +5395,7 @@ \subsection{Collective Cormack-Jolly-Seber Model} {5} & + & - & + & $\phi_1 \, (1 - p_2) \, \phi_2 \, p_3$ \\ \hline {6} & + & + & - & $ \phi_1 \, p_2 \, \chi_2$ -\\ +\\ {7} & + & + & + & $\phi_1 \, p_2 \, \phi_2 \, p_3$ \end{tabular} \end{center} @@ -5432,7 +5432,7 @@ \subsection{Collective Cormack-Jolly-Seber Model} real p[3]; } transformed parameters { - real chi[2]; + real chi[2]; chi[2] = (1 - phi[2]) + phi[2] * (1 - p[3]); chi[1] = (1 - phi[1]) + phi[1] * (1 - p[2]) * chi[2]; } @@ -5442,9 +5442,9 @@ \subsection{Collective Cormack-Jolly-Seber Model} target += history[4] * (log(chi[1])); target += history[5] * (log(phi[1]) + log1m(p[2]) + log(phi[2]) + log(p[3])); - target += history[6] * (log(phi[1]) + log(p[2]) + target += history[6] * (log(phi[1]) + log(p[2]) + log(chi[2])); - target += history[7] * (log(phi[1]) + log(p[2]) + target += history[7] * (log(phi[1]) + log(p[2]) + log(phi[2]) + log(p[3])); } generated quantities { @@ -5507,7 +5507,7 @@ \subsubsection{Utility Functions} feed it into the Stan model as preprocessed data. Yet another alternative encoding would be a sparse one recording only the capture events along with their time and identifying the individual -captured.} +captured.} % \begin{stancode} functions { @@ -5517,11 +5517,11 @@ \subsubsection{Utility Functions} return k; return 0; } - int last_capture(int[] y_i) { + int last_capture(int[] y_i) { for (k_rev in 0:(size(y_i) - 1)) { int k; k = size(y_i) - k_rev; - if (y_i[k]) + if (y_i[k]) return k; } return 0; @@ -5553,7 +5553,7 @@ \subsubsection{Utility Functions} n_captured = rep_vector(0, T); for (t in 1:T) for (i in 1:I) - if (y[i, t]) + if (y[i, t]) n_captured[t] = n_captured[t] + 1; } \end{stancode} @@ -5588,16 +5588,16 @@ \subsubsection{Utility Functions} ... vector prob_uncaptured(int T, vector p, vector phi) { vector[T] chi; - chi[T] = 1.0; + chi[T] = 1.0; for (t in 1:(T - 1)) { int t_curr; int t_next; t_curr = T - t; t_next = t_curr + 1; - chi[t_curr] = (1 - phi[t_curr]) - + phi[t_curr] - * (1 - p[t_next]) - * chi[t_next]; + chi[t_curr] = (1 - phi[t_curr]) + + phi[t_curr] + * (1 - p[t_next]) + * chi[t_next]; } return chi; } @@ -5748,7 +5748,7 @@ \subsection{Noisy Categorical Measurement Model} In this section, only categorical ratings are considered, and the challenge in the modeling for Stan is to marginalize out the discrete -parameters. +parameters. \cite{DawidSkene:1979} introduce a noisy-measurement model for data coding and apply in the epidemiological setting of coding what @@ -5854,12 +5854,12 @@ \subsubsection{Marginalizing out the True Category} \[ p(y | \theta, \pi) \ = \ -\prod_{i=1}^I \sum_{k=1}^K +\prod_{i=1}^I \sum_{k=1}^K \left( \distro{Categorical}(z_i | \pi) \prod_{j=1}^J \distro{Categorical}(y_{i, j}|\theta_{j, z[i]}) \right). -\] +\] % In the missing data model, only the observed labels would be used in the inner product. @@ -5879,7 +5879,7 @@ \subsubsection{Marginalizing out the True Category} \log \distro{Categorical}(y_{i, j}|\theta_{j, z[i]}) \right) \right), \end{array} -\] +\] which can be directly coded using Stan's built-in \code{log\_sum\_exp} function. @@ -5911,7 +5911,7 @@ \subsection{Stan Implementation} log_q_z[i] = log(pi); for (j in 1:J) for (k in 1:K) - log_q_z[i, k] = log_q_z[i, k] + log_q_z[i, k] = log_q_z[i, k] + log(theta[j, k, y[i, j]]); } } @@ -5940,7 +5940,7 @@ \subsection{Stan Implementation} $\beta_{k,k'} = 1$ if $k \neq k'$. 
Taking $\alpha$ and $\beta_k$ to be unit vectors and applying optimization will produce the same answer as the expectation maximization (EM) algorithm of -\cite{DawidSkene:1979}. +\cite{DawidSkene:1979}. \subsubsection{Inference for the True Category} @@ -5992,7 +5992,7 @@ \section{Sparse Data Structures} \begin{center} \begin{minipage}[c]{0.45\textwidth} \[ -y +y = \left[ \begin{array}{cccc} @@ -6008,7 +6008,7 @@ \section{Sparse Data Structures} \ \ \ \begin{minipage}[c]{0.45\textwidth} \begin{tabular}{ll|l} -$jj$ & $kk$ & $y$ +$jj$ & $kk$ & $y$ \\ \hline 1 & 1 & 0 \\ @@ -6075,7 +6075,7 @@ \section{Ragged Data Structures}\label{ragged-data-structs.section} Ragged arrays are arrays that are not rectangular, but have different sized entries. This kind of structure crops up when there are -different numbers of observations per entry. +different numbers of observations per entry. A general approach to dealing with ragged structure is to move to a full database-like data structure as discussed in the previous @@ -6107,7 +6107,7 @@ \section{Ragged Data Structures}\label{ragged-data-structs.section} rows of different sizes ($y_1$ is size 3, $y_2$ size 2, and $y_3$ size 4). On the right is an example of how to code the data in Stan, using a single vector $y$ to hold all the values and a separate - array of integers $s$ to hold the group row sizes. In this + array of integers $s$ to hold the group row sizes. In this example, $y_1 = z_{1:3}$, $y_2 = z_{4:5}$, and $y_3 = z_{6:9}$.}\label{ragged-data.figure} \end{figure} @@ -6118,7 +6118,7 @@ \section{Ragged Data Structures}\label{ragged-data-structs.section} \[ \prod_{n=1}^3 \log \distro{Normal}(y_n | \mu_n, \sigma). \] -There's no direct way to encode this in Stan. +There's no direct way to encode this in Stan. A full database type structure could be used, as in the sparse example, but this is inefficient, wasting space for unnecessary @@ -6127,7 +6127,7 @@ \section{Ragged Data Structures}\label{ragged-data-structs.section} data structure indicating the sizes of each subarray. This is indicated on the right of \reffigure{ragged-data}. This coding uses a single array for the values and a separate array for the sizes of each -row. +row. The model can then be coded up using slicing operations as follows. \begin{quote} @@ -6181,7 +6181,7 @@ \section{Soft $K$-Means} $D$-dimensional vectors. Specifically, there will be $N$ items to be clustered, each represented as a vector $y_n \in \reals^D$. In the ``soft'' version of $K$-means, the assignments to clusters will be -probabilistic. +probabilistic. \subsection{Geometric Hard $K$-Means Clustering} @@ -6192,7 +6192,7 @@ \subsection{Geometric Hard $K$-Means Clustering} \begin{enumerate} \item For each $n$ in $1:N$, randomly assign vector $y_n$ to a cluster in $1{:}K$; \item Repeat -\begin{enumerate} +\begin{enumerate} \item For each cluster $k$ in $1{:}K$, compute the cluster centroid $\mu_k$ by averaging the vectors assigned to that cluster; \item For each $n$ in $1:N$, reassign $y_n$ to the cluster $k$ @@ -6272,7 +6272,7 @@ \subsection{Stan Implementation of Soft $K$-Means} real soft_z[N, K]; // log unnormalized clusters for (n in 1:N) for (k in 1:K) - soft_z[n, k] = neg_log_K + soft_z[n, k] = neg_log_K - 0.5 * dot_self(mu[k] - y[n]); } model { @@ -6305,7 +6305,7 @@ \subsection{Generalizing Soft $K$-Means} geometries. For instance, replacing the normal distribution with the double exponential (Laplace) distribution produces a clustering model based on $L_1$ distance (i.e., Manhattan or taxicab -distance). 
+distance). Within the multivariate normal version of $K$-means, replacing the unit covariance matrix with a shared covariance matrix amounts to @@ -6339,7 +6339,7 @@ \subsection{Non-Identifiability} parameter in soft $K$-means is not identified, leading to problems in monitoring convergence. Clusters can even fail to be identified within a single chain, with indices swapping if the chain is long -enough or the data is not cleanly separated. +enough or the data is not cleanly separated. \subsection{Multimodality} @@ -6376,12 +6376,12 @@ \section{Naive Bayes Classification and Clustering} Multinomial mixture models are referred to as ``naive Bayes'' because they are often applied to classification problems where the -multinomial independence assumptions are clearly false. +multinomial independence assumptions are clearly false. Naive Bayes classification and clustering can be applied to any data with multinomial structure. A typical example of this is natural language text classification and clustering, which is used an example -in what follows. +in what follows. The observed data consists of a sequence of $M$ documents made up of bags of words drawn from a vocabulary of $V$ distinct words. A @@ -6398,7 +6398,7 @@ \section{Naive Bayes Classification and Clustering} z_m \sim \distro{Categorical}(\theta). \] The $K$-simplex parameter $\theta$ represents the prevalence of each -category in the data. +category in the data. Next, the words in each document are generated conditionally independently of each other and the words in other documents based on @@ -6443,7 +6443,7 @@ \subsection{Coding Ragged Arrays} % The relevant variables for the program are \code{N}, the total number of words in all the documents, the word array \code{w}, and the -document identity array \code{doc}. +document identity array \code{doc}. \subsection{Estimation with Category-Labeled Training Data} @@ -6475,7 +6475,7 @@ \subsection{Estimation with Category-Labeled Training Data} } model { theta ~ dirichlet(alpha); - for (k in 1:K) + for (k in 1:K) phi[k] ~ dirichlet(beta); for (m in 1:M) z[m] ~ categorical(theta); @@ -6486,12 +6486,12 @@ \subsection{Estimation with Category-Labeled Training Data} % Note that the topic identifiers $z_m$ are declared as data and the latent category assignments are included as part of the likelihood -function. +function. \subsection{Estimation without Category-Labeled Training Data} Naive Bayes models can be used in an unsupervised fashion to cluster -multinomial-structured data into a fixed number $K$ of categories. +multinomial-structured data into a fixed number $K$ of categories. The data declaration includes the same variables as the model in the previous section excluding the topic labels \code{z}. 
Because \code{z} is discrete, it needs to be summed out of the model @@ -6502,13 +6502,13 @@ \subsection{Estimation without Category-Labeled Training Data} \begin{array}{l} \log p(w_{m,1},\ldots,w_{m,N_m}|\theta,\phi) \\[2pt] -\ \ \ = \ -\log \sum_{k=1}^K +\ \ \ = \ +\log \sum_{k=1}^K \left( \distro{Categorical}(k|\theta) \times \prod_{n=1}^{N_m} \distro{Categorical}(w_{m,n}|\phi_k) \right) \\[6pt] -\ \ \ = \ +\ \ \ = \ \log \sum_{k=1}^K \exp \left( \log \distro{Categorical}(k|\theta) + \sum_{n=1}^{N_m} \log \distro{Categorical}(w_{m,n}|\phi_k) @@ -6526,12 +6526,12 @@ \subsection{Estimation without Category-Labeled Training Data} theta ~ dirichlet(alpha); for (k in 1:K) phi[k] ~ dirichlet(beta); - for (m in 1:M) - for (k in 1:K) + for (m in 1:M) + for (k in 1:K) gamma[m, k] = categorical_lpmf(k | theta); for (n in 1:N) for (k in 1:K) - gamma[doc[n], k] = gamma[doc[n], k] + gamma[doc[n], k] = gamma[doc[n], k] + categorical_lpmf(w[n] | phi[k]); for (m in 1:M) target += log_sum_exp(gamma[m]); @@ -6549,7 +6549,7 @@ \subsection{Estimation without Category-Labeled Training Data} \[ \mbox{Pr}[z_m = k|w,\alpha,\beta] = -\exp \left( +\exp \left( \gamma_{m,k} - \log \sum_{k=1}^K \exp \left( \gamma_{m,k} \right) \right). @@ -6644,10 +6644,10 @@ \subsection{Summing out the Discrete Parameters} % \begin{eqnarray*} p(\theta,\phi|w,\alpha,\beta) -& \propto & +& \propto & p(\theta|\alpha) \times p(\phi|\beta) \times p(w|\theta,\phi) \\[4pt] -& = & +& = & \prod_{m=1}^M p(\theta_m|\alpha) \times \prod_{k=1}^K p(\phi_k|\beta) @@ -6672,16 +6672,16 @@ \subsection{Summing out the Discrete Parameters} \begin{array}{l} \log p(\theta,\phi|w,\alpha,\beta) \\[6pt] -{ } \ \ +{ } \ \ \begin{array}{l} { } = \sum_{m=1}^M \log \distro{Dirichlet}(\theta_m|\alpha) \ + \ \sum_{k=1}^K \log \distro{Dirichlet}(\phi_k|\beta) \\[6pt] { } \ \ \ \ \ -+ \sum_{m=1}^M \sum_{n=1}^{N[m]} \log \left( ++ \sum_{m=1}^M \sum_{n=1}^{N[m]} \log \left( \sum_{z=1}^K - \distro{Categorical}(z|\theta_m) + \distro{Categorical}(z|\theta_m) \times \distro{Categorical}(w_{m,n}|\phi_z) \right) \end{array} @@ -6711,28 +6711,28 @@ \subsection{Implementation of LDA} simplex[V] phi[K]; // word dist for topic k } model { - for (m in 1:M) + for (m in 1:M) theta[m] ~ dirichlet(alpha); // prior - for (k in 1:K) + for (k in 1:K) phi[k] ~ dirichlet(beta); // prior for (n in 1:N) { real gamma[K]; - for (k in 1:K) + for (k in 1:K) gamma[k] = log(theta[doc[n], k]) + log(phi[k, w[n]]); - target += log_sum_exp(gamma)); // likelihood; + target += log_sum_exp(gamma); // likelihood; } } \end{stancode} % As in the other mixture models, the log-sum-of-exponents function is -used to stabilize the numerical arithmetic. +used to stabilize the numerical arithmetic. \subsection{Correlated Topic Model} To account for correlations in the distribution of topics for documents, \citep{BleiLafferty:2007} introduced a variant of LDA in which the Dirichlet prior on the per-document topic distribution is -replaced with a multivariate logistic normal distribution. +replaced with a multivariate logistic normal distribution. The authors treat the prior as a fixed hyperparameter. They use an $L_1$-regularized estimate of covariance, which is equivalent to the @@ -6752,7 +6752,7 @@ \subsubsection{Fixed Hyperparameter Correlated Topic Model} data { ... data as before without alpha ... 
vector[K] mu; // topic mean - cov_matrix[K] Sigma; // topic covariance + cov_matrix[K] Sigma; // topic covariance } \end{stancode} % @@ -6781,7 +6781,7 @@ \subsubsection{Full Bayes Correlated Topic Model} By adding a prior for the mean and covariance, Stan supports full Bayesian inference for the correlated topic model. This requires -moving the declarations of topic mean \code{mu} and covariance \code{Sigma} +moving the declarations of topic mean \code{mu} and covariance \code{Sigma} from the data block to the parameters block and providing them with priors in the model. A relatively efficient and interpretable prior for the covariance matrix \code{Sigma} may be encoded as follows. @@ -6843,7 +6843,7 @@ \chapter{Gaussian Processes}\label{gaussian-processes.chapter} the function's value at a finite number of input points is a multivariate normal distribution. This makes it tractable to both fit models from finite amounts of observed data and make predictions for finitely many new data -points. +points. Unlike a simple multivariate normal distribution, which is parameterized by a mean vector and covariance matrix, a Gaussian @@ -6881,7 +6881,7 @@ \section{Gaussian Process Regression} conditioned on their inputs $x$ is Gaussian: \[ y \sim \distro{MultiNormal}(m(x), K(x | \theta)), -\] +\] where $m(x)$ is an $N$-vector and $K(x | \theta)$ is an $N \times N$ covariance matrix. The mean function $m : \reals^{N \times D} \rightarrow \reals^{N}$ can be anything, but the covariance function @@ -6897,7 +6897,7 @@ \section{Gaussian Process Regression} in this chapter, is an exponentiated quadratic function, \[ K(x | \alpha, \rho, \sigma)_{i, j} -= \alpha^2 += \alpha^2 \exp \left( - \dfrac{1}{2 \rho^2} \sum_{d=1}^D (x_{i,d} - x_{j,d})^2 \right) @@ -6908,10 +6908,10 @@ \section{Gaussian Process Regression} function with value 1 if $i = j$ and value 0 otherwise; note that this test is between the indexes $i$ and $j$, not between values $x_i$ and $x_j$. Note that this kernel is obtained through a convolution of two -independent Gaussian processes, $f_1$ and $f_2$, with kernels +independent Gaussian processes, $f_1$ and $f_2$, with kernels \[ K_1(x | \alpha, \rho)_{i, j} -= \alpha^2 += \alpha^2 \exp \left( - \dfrac{1}{2 \rho^2} \sum_{d=1}^D (x_{i,d} - x_{j,d})^2 \right) @@ -6919,14 +6919,14 @@ \section{Gaussian Process Regression} and \[ K_2(x | \sigma)_{i, j} -= += \delta_{i, j} \sigma^2, \] The addition of $\sigma^2$ on the diagonal is important to ensure the positive definiteness of the resulting matrix in the case of two identical inputs $x_i = x_j$. In statistical terms, $\sigma$ is -the scale of the noise term in the regression. +the scale of the noise term in the regression. The hyperparameter $\rho$ is the \emph{length-scale}, and corresponds to the frequency of the functions represented by the Gaussian process prior with @@ -6949,7 +6949,7 @@ \section{Gaussian Process Regression} $x_i$ and $x_j$ (i.e., the $L_2$ norm of their difference, $x_i - x_j$). This results in support for smooth functions in the process. The amount of variation in the function is controlled by the free -hyperparameters $\alpha$, $\rho$, and $\sigma$. +hyperparameters $\alpha$, $\rho$, and $\sigma$. 
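As a sketch of how the formula above translates into code for univariate inputs
\code{x} of type \code{real[N]}, the covariance matrix could be filled in directly
as follows; the specialized \code{cov\_exp\_quad} function used in the programs
later in this chapter computes the $K_1$ portion of this sum.
%
\begin{stancode}
matrix[N, N] K;
real sq_alpha = square(alpha);
real sq_sigma = square(sigma);
for (i in 1:N) {
  for (j in 1:N)
    K[i, j] = sq_alpha * exp(-0.5 * square(x[i] - x[j]) / square(rho));
  K[i, i] = K[i, i] + sq_sigma;   // the delta_{i,j} sigma^2 noise term
}
\end{stancode}
%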
Changing the notion of distance from Euclidean to taxicab distance (i.e., an $L_1$ norm) changes the support to functions which are @@ -7001,7 +7001,7 @@ \section{Simulating from a Gaussian Process} \end{stancode} % The above model can also be written more compactly using the specialized -covariance function that implements the exponentiated quadratic kernel. +covariance function that implements the exponentiated quadratic kernel. % \begin{stancode} data { @@ -7027,7 +7027,7 @@ \section{Simulating from a Gaussian Process} The input data is just the vector of inputs \code{x} and its size \code{N}. Such a model can be used with values of \code{x} evenly spaced over some interval in order to plot sample draws of functions -from a Gaussian process. +from a Gaussian process. \subsection{Multivariate Inputs} @@ -7051,7 +7051,7 @@ \subsection{Multivariate Inputs} \end{stancode} % The data is now declared as an array of vectors instead of an array of -scalars; the dimensionality \code{D} is also declared. +scalars; the dimensionality \code{D} is also declared. In the remainder of the chapter, univariate models will be used for simplicity, but any of the models could be changed to multivariate in the same way as the @@ -7119,7 +7119,7 @@ \section{Fitting a Gaussian Process}\label{fit-gp.section} \subsection{GP with a normal outcome} -The full generative model for a GP with a normal outcome, +The full generative model for a GP with a normal outcome, $y \in \R^N$, with inputs $x \in \R^N$, for a finite $N$: \begin{align*} @@ -7141,7 +7141,7 @@ \subsection{GP with a normal outcome} \left(0, K(x | \alpha, \rho) + \mathbf{I}_N \sigma^2\right) \\ \end{align*} -It can be more computationally efficient when dealing with a normal +It can be more computationally efficient when dealing with a normal outcome to integrate out the Gaussian process, because this yields a lower-dimensional parameter space over which to do inference. We'll fit both models in Stan. The former model will be referred to as the latent @@ -7159,9 +7159,9 @@ \subsection{GP with a normal outcome} The Stan program implementing the marginal likelihood GP is shown below. The program is similar to the Stan programs that implement the simulation GPs -above, but because we are doing inference on the hyperparameters, we need to +above, but because we are doing inference on the hyperparameters, we need to calculate the covariance matrix \code{K} in the model block, rather than -the transformed data block. +the transformed data block. % \footnote{The program code is available in the Stan example model repository; see \url{http://mc-stan.org/documentation}.} @@ -7189,7 +7189,7 @@ \subsection{GP with a normal outcome} // diagonal elements for (n in 1:N) - K[n, n] = K[n, n] + sq_sigma; + K[n, n] = K[n, n] + sq_sigma; L_K = cholesky_decompose(K); @@ -7210,7 +7210,7 @@ \subsection{GP with a normal outcome} consists of the priors for the hyperparameters and the multivariate Cholesky-parameterized normal likelihood, only now the value \code{y} is known and the covariance matrix \code{K} is an unknown dependent on the -hyperparameters, allowing us to learn the hyperparameters. +hyperparameters, allowing us to learn the hyperparameters. 
We have used the Cholesky parameterized \distro{MultiNormal} rather than the standard \distro{MultiNormal} because it allows us to the @@ -7256,7 +7256,7 @@ \subsubsection{Latent variable GP} // diagonal elements for (k in 1:N) - K[k, k] = K[k, k] + delta; + K[k, k] = K[k, k] + delta; L_K = cholesky_decompose(K); f = L_K * eta; @@ -7300,9 +7300,9 @@ \subsubsection{Poisson GP} % \begin{stancode} data { -... +... int y[N]; -... +... } ... parameters { @@ -7343,9 +7343,9 @@ \subsubsection{Logistic Gaussian Process Regression} % \begin{stancode} data { -... +... int z[N]; -... +... } ... model { @@ -7362,7 +7362,7 @@ \subsection{Automatic Relevance Determination} covariance function can be further generalized by fitting a scale parameter $\rho_d$ for each dimension $d$, \[ - k(x | \alpha, \vec{\rho}, \sigma)_{i, j} = \alpha^2 \exp + k(x | \alpha, \vec{\rho}, \sigma)_{i, j} = \alpha^2 \exp \left(-\dfrac{1}{2} \sum_{d=1}^D \dfrac{1}{\rho_d^2} (x_{i,d} - x_{j,d})^2 \right) @@ -7391,7 +7391,7 @@ \subsection{Automatic Relevance Determination} % \begin{stancode} -functions { +functions { matrix L_cov_exp_quad_ARD(real[] x, real alpha, vector[] rho, @@ -7404,7 +7404,7 @@ \subsection{Automatic Relevance Determination} K[i,i] = sq_alpha + delta; for (j in (i + 1):N) { K[i, j] = sq_alpha - * exp(inv_half + * exp(inv_half * dot_self((x[i] - x[j]) ./ rho)); K[j, i] = K[i, j]; } @@ -7450,13 +7450,13 @@ \subsection{Priors for Gaussian Process Parameters}\label{priors-gp.section} Formulating priors for GP hyperparameters requires the analyst to consider the inherent statistical properties of a GP, the GP's purpose in the model, and the -numerical issues that may arise in Stan when estimating a GP. +numerical issues that may arise in Stan when estimating a GP. Perhaps most importantly, the parameters $\rho$ and $\alpha$ are weakly identified \citep{zhang-gp:2004}. The ratio of the two parameters is well-identified, but in practice we put independent priors on the two hyperparameters because these two quantities are more interpretable than -their ratio. +their ratio. \subsubsection{Priors for length-scale} @@ -7470,7 +7470,7 @@ \subsubsection{Priors for length-scale} accurately sample using Euclidean HMC. We may wish to put further soft constraints on the length-scale, but these are -dependent on how the GP is used in our statistical model. +dependent on how the GP is used in our statistical model. If our model consists of only the GP, i.e.: % @@ -7478,14 +7478,14 @@ \subsubsection{Priors for length-scale} f & \sim \distro{MultiNormal}\left(0, K(x | \alpha, \rho)\right) \\ y_i & \sim \distro{Normal}(f_i, \sigma) \, \forall i \in \{1, \dots, N\} \\ & x \in \reals^{N \times D}, \, f \in \reals^N -\end{align*} +\end{align*} % we likely don't need constraints beyond penalizing small length-scales. We'd like to allow the GP prior to represent both high-frequency and low-frequency functions, so our prior should put non-negligible mass on both sets of functions. In this case, an inverse gamma, \code{inv\_gamma\_lpdf} in Stan's language, will work well as it has a sharp left tail that puts negligible mass -on length-scales, but a generous right tail, allowing for large length-scales. +on length-scales, but a generous right tail, allowing for large length-scales. 
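As a concrete instance, such a prior can be coded directly in the model block; the
shape and scale values below are purely illustrative placeholders, to be chosen in
light of the scale of the inputs.
%
\begin{stancode}
rho ~ inv_gamma(5, 5);   // sharp left tail, heavy right tail; values are placeholders
\end{stancode}
%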
If we're using the GP as a component in a larger model that includes an overall mean and fixed effects for the same variables we're using as the domain for the @@ -7496,7 +7496,7 @@ \subsubsection{Priors for length-scale} \sim \distro{Normal}(\beta_0 + x_i \beta_{[1:D]} + f_i, \sigma) \, \forall i \in \{1, \dots, N\} \\ & x_i^T, \beta_{[1:D]} \in \reals^D,\, x \in \reals^{N \times D},\, f \in \reals^N -\end{align*} +\end{align*} % we'll likely want to constrain large length-scales as well. A length scale that is larger than the scale of the data yields a GP posterior that is @@ -7512,14 +7512,14 @@ \subsubsection{Priors for length-scale} f(x | a, b, p) & = \dfrac{(a/b)^{p/2}}{2K_p(\sqrt{ab})} x^{p - 1}\exp(-(ax + b / x)/2) \\ & x, a, b \in \reals^{+}, \, p \in \mathbb{Z} -\end{align*} +\end{align*} % which has an inverse gamma left tail if $p \leq 0$ and a Gaussian right tail. This has not yet been implemented in Stan's math library, but it is possible to implement as a user defined function: \begin{stancode} functions { - real generalized_inverse_gaussian_lpdf(real x, int p, + real generalized_inverse_gaussian_lpdf(real x, int p, real a, real b) { return p * 0.5 * log(a / b) - log(2 * modified_bessel_second_kind(p, sqrt(a * b))) @@ -7555,7 +7555,7 @@ \subsubsection{Priors for marginal standard deviation} A half-$t$ or half-Gaussian prior on alpha also has the benefit of putting nontrivial prior mass around zero. This allows the GP support the zero functions and allows the possibility that the GP won't contribute to the -conditional mean of the total output. +conditional mean of the total output. \subsection{Predictive Inference with a Gaussian Process} @@ -7577,8 +7577,8 @@ \subsection{Predictive Inference with a Gaussian Process} % \begin{stancode} data { - int N1; - real x1[N1]; + int N1; + real x1[N1]; vector[N1] y1; int N2; real x2[N2]; @@ -7621,10 +7621,10 @@ \subsection{Predictive Inference with a Gaussian Process} The input vectors \code{x1} and \code{x2} are declared as data, as is the observed output vector \code{y1}. The unknown output vector \code{y2}, which corresponds to input vector \code{x2}, is declared in the generated quantities -block and will be sampled when the model is executed. +block and will be sampled when the model is executed. A transformed data block is used to combine the input vectors -\code{x1} and \code{x2} into a single vector \code{x}. +\code{x1} and \code{x2} into a single vector \code{x}. The model block declares and defines a local variable for the combined output vector \code{f}, which consists of the concatenation of the conditional mean @@ -7642,7 +7642,7 @@ \subsection{Predictive Inference with a Gaussian Process} \subsubsection{Predictive Inference in non-Gaussian GPs} -We can do predictive inference in non-Gaussian GPs in much the +We can do predictive inference in non-Gaussian GPs in much the same way as we do with Gaussian GPs. Consider the following full model for prediction using logistic Gaussian @@ -7654,8 +7654,8 @@ \subsubsection{Predictive Inference in non-Gaussian GPs} % \begin{stancode} data { - int N1; - real x1[N1]; + int N1; + real x1[N1]; vector[N1] z1; int N2; real x2[N2]; @@ -7701,12 +7701,12 @@ \subsubsection{Analytical Form of Joint Predictive Inference} Bayesian predictive inference for Gaussian processes with Gaussian observations can be sped up by deriving the posterior analytically, then directly sampling -from it. +from it. 
Jumping straight to the result, \[ p(\tilde{y}|\tilde{x},y,x) -= += \distro{Normal}(K^{\top}\Sigma^{-1}y,\ \Omega - K^{\top}\Sigma^{-1}K), \] @@ -7764,7 +7764,7 @@ \subsubsection{Analytical Form of Joint Predictive Inference} K_div_y1 = mdivide_left_tri_low(L_K, y1); K_div_y1 = mdivide_right_tri_low(K_div_y1',L_K)'; k_x1_x2 = cov_exp_quad(x1, x2, alpha, rho); - f2_mu = (k_x1_x2' * K_div_y1); + f2_mu = (k_x1_x2' * K_div_y1); v_pred = mdivide_left_tri_low(L_K, k_x1_x2); cov_f2 = cov_exp_quad(x2, alpha, rho) - v_pred' * v_pred; diag_delta = diag_matrix(rep_vector(delta,N2)); @@ -7775,8 +7775,8 @@ \subsubsection{Analytical Form of Joint Predictive Inference} } } data { - int N1; - real x1[N1]; + int N1; + real x1[N1]; vector[N1] y1; int N2; real x2[N2]; @@ -7790,7 +7790,7 @@ \subsubsection{Analytical Form of Joint Predictive Inference} real sigma; } model { - matrix[N1, N1] L_K; + matrix[N1, N1] L_K; { matrix[N1, N1] K = cov_exp_quad(x, alpha, rho); real sq_sigma = square(sigma); @@ -7820,7 +7820,7 @@ \subsection{Multiple-output Gaussian processes} f(x) & \sim \distro{GP}(m(x), K(x | \theta, \phi)) \\ K(x & | \theta) \in \reals^{M \times M}, \, f(x), \, m(x) \in \reals^M \end{align*} -where the $K(x, x^\prime | \theta, \phi)_{[m, m^\prime]}$ entry defines the +where the $K(x, x^\prime | \theta, \phi)_{[m, m^\prime]}$ entry defines the covariance between $f_m(x)$ and $f_{m^\prime}(x^\prime)(x)$. This construction of Gaussian processes allows us to learn the covariance between the output dimensions of $f(x)$. If we parameterize our kernel $K$: @@ -7837,7 +7837,7 @@ \subsection{Multiple-output Gaussian processes} parameterized by some vector $\phi$. The \distro{MatrixNormal} distribution has two covariance matrices: $K(x | -\alpha, \rho)$ to encode column covariance, and $C(\phi)$ to define row +\alpha, \rho)$ to encode column covariance, and $C(\phi)$ to define row covariance. The salient features of the \distro{MatrixNormal} are that the rows of the matrix $f$ are distributed: \begin{align*} f_{[n,]} \sim \distro{MultiNormal}(m(x)_{[n,]}, K(x | \alpha, @@ -7845,12 +7845,12 @@ \subsection{Multiple-output Gaussian processes} distributed: \begin{align*} f_{[,m]} \sim \distro{MultiNormal}(m(x)_{[,m]}, K(x | \alpha, \rho) C(\phi)_{[m,m]}) \end{align*} This also means means that $\E\left[f^T f\right]$ is equal to -$\text{trace}(K(x | \alpha, \rho)) \times C$, whereas $\E\left[ff^T\right]$ +$\text{trace}(K(x | \alpha, \rho)) \times C$, whereas $\E\left[ff^T\right]$ is $\text{trace}(C) \times K(x | \alpha, \rho)$. We can derive this using properties of expectation and the \distro{MatrixNormal} density. We should set $\alpha$ to $1.0$ because the parameter is not identified unless -we constrain $\text{trace}(C) = 1$. Otherwise, we can multiply $\alpha$ by a scalar $d$ and +we constrain $\text{trace}(C) = 1$. Otherwise, we can multiply $\alpha$ by a scalar $d$ and $C$ by $1/d$ and our likelihood will not change. % We can generate a random variable $f$ from a \distro{MatrixNormal} density in @@ -7894,7 +7894,7 @@ \subsection{Multiple-output Gaussian processes} // diagonal elements for (k in 1:N) - K[k, k] = K[k, k] + delta; + K[k, k] = K[k, k] + delta; L_K = cholesky_decompose(K); f = L_K * eta @@ -7930,7 +7930,7 @@ \chapter{Directions, Rotations, and Hyperspheres} type, the values of which determine points on a hypersphere (circle in two dimensions, sphere in three dimensions). 
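Declaring such a quantity works like any other constrained parameter declaration;
in the sketch below the size and the name \code{u} are arbitrary.
%
\begin{stancode}
parameters {
  unit_vector[3] u;   // a direction in three dimensions, i.e., a point on S^2
}
\end{stancode}
%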
-\section{Unit Vectors} +\section{Unit Vectors} The length of a vector $x \in \reals^K$ is given by \[ @@ -7976,7 +7976,7 @@ \section{Circles, Spheres, and Hyperspheres} Even though $S^{n-1}$ behaves locally like $\reals^{n-1}$, there is no way to smoothly map between them. For example, because latitude and longitude work on a modular basis (wrapping at $2\pi$ -radians in natural units), they do not produce a smooth map. +radians in natural units), they do not produce a smooth map. Like a bounded interval $(a, b)$, in geometric terms, a sphere is compact in that the distance between any two points is bounded. @@ -7998,7 +7998,7 @@ \section{Transforming to Unconstrained Parameters} the surface of the sphere. \cite{Marsaglia:1972} introduced an auxiliary variable interpretation of this mapping that provides the desired properties of uniformity; see \refsection{unit-vector} for -details. +details. \subsubsection{Warning: undefined at zero!} @@ -8025,7 +8025,7 @@ \section{Unit Vectors and Rotations} R_{\theta} = \begin{bmatrix} -\cos \theta & - \sin \theta +\cos \theta & - \sin \theta \\ \sin \theta & \cos \theta \end{bmatrix}. @@ -8075,7 +8075,7 @@ \section{Example: Simple Harmonic Oscillator} As a concrete example of a system of ODEs, consider a harmonic oscillator, which is characterized by an equilibrium position and a -restoring force proportional to the displacement with friction. +restoring force proportional to the displacement with friction. The system state will be a pair $y = (y_1, y_2)$ representing position and momentum (i.e., a point in phase space). The change in the system with respect to time is given by the following differential equations.% @@ -8085,7 +8085,7 @@ \section{Example: Simple Harmonic Oscillator} to implement the \code{rk45} solver.} % \begin{equation}\label{ode-sho.equation} -\frac{d}{dt} y_1 = y_2 +\frac{d}{dt} y_1 = y_2 \hspace*{0.5in} \frac{d}{dt} y_2 = -y_1 - \theta y_2 \end{equation} @@ -8200,7 +8200,7 @@ \section{Solving a System of Linear ODEs using a Matrix Exponential} y = \left[\begin{array}{c} y_1 \\ y_2 \\ - \end{array}\right] \ \ \ \ \ + \end{array}\right] \ \ \ \ \ A = \left[\begin{array}{cc} 0 & 1 \\ -1 & -\theta \\ @@ -8228,15 +8228,15 @@ \section{Solving a System of Linear ODEs using a Matrix Exponential} generated quantities { vector[2] y_hat[T]; matrix[2, 2] A; - + A[1, 1] = 0; A[1, 2] = 1; A[2, 1] = -1; A[2, 2] = -theta[1]; - + for (t in 1:T) y_hat[t] = matrix_exp((t - 1) * A) * y0; - + // add measurement error for (t in 1:T) { y_hat[t, 1] = y_hat[t, 1] + normal_rng(0, 0.1); @@ -8248,13 +8248,13 @@ \section{Solving a System of Linear ODEs using a Matrix Exponential} \caption{\small\it Stan program to simulate noisy measurements from a simple harmonic oscillator. The system of linear differential equations is coded as a matrix. The system parameters \code{theta} and initial - state \code{y0} are read in as data along observation times \code{ts}. + state \code{y0} are read in as data along observation times \code{ts}. The generated quantities block is used to solve the ODE for the specified - times and then add random measurement error, producing observations + times and then add random measurement error, producing observations \code{y\_hat}. Because the ODEs are linear, we can use the \code{matrix\_exp} function to solve the system. 
}\label{sho-sim-me.figure} \end{figure} - + \section{Measurement Error Models} @@ -8293,7 +8293,7 @@ \subsection{Simulating Noisy Measurements} \begin{stancode} functions { real[] sho(real t, - real[] y, + real[] y, real[] theta, real[] x_r, int[] x_i) { @@ -8380,7 +8380,7 @@ \subsection{Estimating System Parameters and Initial State} \begin{stancode} functions { real[] sho(real t, - real[] y, + real[] y, real[] theta, real[] x_r, int[] x_i) { @@ -8506,10 +8506,10 @@ \subsection{Tolerance} The relative and absolute tolerance control the accuracy of the solutions generated by the integrator. Relative tolerances are relative to the solution value, whereas absolute tolerances is the -maximum absolute error allowed in a solution. +maximum absolute error allowed in a solution. Smaller tolerances produce more accurate solutions. Smaller -tolerances also require more computation time. +tolerances also require more computation time. \subsubsection{Sensitivity Analysis} @@ -8523,4 +8523,4 @@ \subsection{Maximum Number of Steps} This can arise in MCMC when a bad jump is taken, particularly during warmup. With the non-stiff solver, this may result in jumping into a stiff region of the parameter space, which would require a very small -step size and very many steps to satisfy even modest tolerances. +step size and very many steps to satisfy even modest tolerances. diff --git a/src/docs/stan-reference/language.tex b/src/docs/stan-reference/language.tex index 6c5da264905..762e1b22161 100644 --- a/src/docs/stan-reference/language.tex +++ b/src/docs/stan-reference/language.tex @@ -247,65 +247,43 @@ \subsection{Constrained Data Types} \end{stancode} There are also special data types for structured vectors and -matrices. -% There are four constrained vector data types, \code{simplex} -% for unit simplexes, \code{unit\_vector} for unit-length vectors, -% \code{ordered} for ordered vectors of scalars and -% \code{positive\_ordered} for vectors of positive ordered -% scalars. There are specialized matrix data types \code{corr\_matrix} -% and \code{cov\_matrix} for correlation matrices (symmetric, positive -% definite, unit diagonal) and covariance matrices (symmetric, positive -% definite). The type \code{cholesky\_factor\_cov} is for Cholesky -% factors of covariance matrices (lower triangular, positive diagonal, -% product with own transpose is a covariance matrix). The type -% \code{cholesky\_factor\_corr} is for Cholesky factors of correlation -% matrices (lower triangular, positive diagonal, unit-length rows). - -Constraints provide error checking for variables defined in the \code{data}, -\code{transformed data}, \code{transformed parameters}, and -\code{generated quantities} blocks. -% -Constraints are critical for variables declared in the -\code{parameters} block, where they determine the transformation from -constrained variables (those satisfying the declared constraint) to -unconstrained variables (those ranging over all of $\mathbb{R}^n$). +matrices. There are four constrained vector data types, \code{simplex} +for unit simplexes, \code{unit\_vector} for unit-length vectors, +\code{ordered} for ordered vectors of scalars and +\code{positive\_ordered} for vectors of positive ordered +scalars. There are specialized matrix data types \code{corr\_matrix} +and \code{cov\_matrix} for correlation matrices (symmetric, positive +definite, unit diagonal) and covariance matrices (symmetric, positive +definite). 
The type \code{cholesky\_factor\_cov} is for Cholesky +factors of covariance matrices (lower triangular, positive diagonal, +product with own transpose is a covariance matrix). The type +\code{cholesky\_factor\_corr} is for Cholesky factors of correlation +matrices (lower triangular, positive diagonal, unit-length rows). + +Constraints provide error checking for variables defined in the +\code{data}, \code{transformed data}, \code{transformed parameters}, +and \code{generated quantities} blocks. Constraints are critical for +variables declared in the \code{parameters} block, where they +determine the transformation from constrained variables (those +satisfying the declared constraint) to unconstrained variables (those +ranging over all of $\mathbb{R}^n$). It is worth calling out the most important aspect of constrained data types: % \begin{quote} -\it -The model must have support (non-zero density) at every value of the -parameters that meets their declared constraints. + \it The model must have support (non-zero density, equivalently + finite log density) at every value of the parameters that meets + their declared constraints. \end{quote} % -If the declared parameter constraints are less strict than the -support, the samplers and optimizers may have any of a number of -pathologies including just getting stuck, failure to initialize, -excessive Metropolis rejection, or biased samples due to inability to -explore the tails of the distribution. - -% Integer or real types may be constrained with lower bounds, upper -% bounds, or both. There are four constrained vector data types, -% \code{simplex} for unit simplexes, \code{unit\_vector} for unit-length -% vectors, \code{ordered} for ordered vectors of scalars and -% \code{positive\_ordered} for vectors of positive ordered scalars. -% There are specialized matrix data types \code{corr\_matrix} and -% \code{cov\_matrix} for correlation matrices (symmetric, positive -% definite, unit diagonal) and covariance matrices (symmetric, positive -% definite). The type \code{cholesky\_factor\_cov} is for Cholesky -% factors of covariance matrices (lower triangular, positive diagonal, -% product with own transpose is a covariance matrix). The type -% \code{cholesky\_factor\_corr} is for Cholesky factors of correlation -% matrices (lower triangular, positive diagonal, unit-length rows). - -% \subsection{Arrays} - -% Stan supports arrays of arbitrary order of any of the basic data -% types or constrained basic data types. This includes -% three-dimensional arrays of integers, one-dimensional arrays of -% positive reals, four-dimensional arrays of simplexes, one-dimensional -% arrays of row vectors, and so on. +If this condition is violated with parameter values that satisfy +declared constraints but do not have finite log density, then the +samplers and optimizers may have any of a number of pathologies +including just getting stuck, failure to initialize, excessive +Metropolis rejection, or biased samples due to inability to explore +the tails of the distribution. + \section{Primitive Numerical Data Types}\label{numerical-data-types.section} @@ -616,6 +594,15 @@ \subsection{Unit Simplexes} threshold $\epsilon$ to account for errors arising from floating-point imprecision. 
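To make the type concrete (a small sketch, not tied to any particular
example elsewhere in the manual), a simplex parameter is declared and
given a prior like any other constrained type.
%
\begin{stancode}
data {
  int<lower=1> K;
}
parameters {
  simplex[K] theta;    // theta[k] >= 0 and sum(theta) == 1
}
model {
  theta ~ dirichlet(rep_vector(2, K));
}
\end{stancode}
%
The symmetric Dirichlet concentration of 2 is only illustrative; any
prior defined on the simplex may be used.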
+In high dimensional problems, simplexes may require smaller step sizes
+in the inference algorithms in order to remain stable.  For sampling,
+this can be achieved with a higher target acceptance rate and a longer
+warmup period; for optimization, with tighter tolerances and more
+iterations.  In either case, less dispersed parameter initialization,
+or custom initialization if there are informative priors for some
+parameters, also helps.
+
+
 \subsection{Unit Vectors}
 
 A unit vector is a vector with a norm of one. For instance,
@@ -2954,28 +2941,8 @@ \subsubsection{Vectorization}
 
 \subsection{Accessing the Log Density}
 
-To increment the log density returned by the model by some value
-\code{u}, use the following statement.%
-%
-\footnote{Originally, Stan provided direct access to the log density
-  through a variable \code{lp\_\_} and then later through the
-  \code{increment\_log\_prob()} statement, but the first has been
-  removed and the latter deprecated.}
-%
-\begin{stancode}
-target += u;
-\end{stancode}
-%
-In general, \code{u} can be any expression; if it is a container, the
-log density will be incremented by the sum of the elements in the
-container.
-
 To access accumulated log density up to the current execution point,
-the function \code{get\_lp()} may be used.%
-%
-\footnote{The value of \code{lp\_\_} will only hold the Jacobian until
-  the end of the execution because it is not used in either
-  \code{increment\_log\_prob(u)} or \code{target += u}.}
+the function \code{target()} may be used.
 
 
 \section{Sampling Statements}\label{sampling-statements.section}
 
@@ -3008,7 +2975,7 @@ \section{Sampling Statements}\label{sampling-statements.section}
 y ~ dist(theta1, ..., thetaN);
 \end{stancode}
 %
-involving subexpressions \code{y1} and \code{theta1} through
+involving subexpressions \code{y} and \code{theta1} through
 \code{thetaN} (including the case where \code{N} is zero) will be
 well formed if and only if the corresponding assignment statement is
 well-formed. For densities allowing real \code{y} values, the log
@@ -3062,20 +3029,25 @@ \subsection{User-Transformed Variables}
 expression. For instance, it is legal syntactically to write
 %
 \begin{stancode}
-data {
-  real y;
+parameters {
+  real beta;
 }
 // ...
 model {
-  log(y) ~ normal(mu, sigma);
+  log(beta) ~ normal(mu, sigma);
 }
 \end{stancode}
 %
-Unfortunately, this is not enough to properly model \code{y} as having
-a lognormal distribution. The log Jacobian of the transform must be
-added to the log probability accumulator to account for the
-differential change in scale (see \refsection{change-of-variables} for
-full definitions). For the case above, the following adjustment will
+Unfortunately, this is not enough to properly model \code{beta} as
+having a lognormal distribution. Whenever a nonlinear transform is
+applied to a parameter, such as the logarithm function being applied
+to \code{beta} here, and then used on the left-hand side of a sampling
+statement or on the left of a vertical bar in a log pdf function, an
+adjustment must be made to account for the differential change in
+scale and ensure \code{beta} gets the correct distribution. The
+correction required is to add the log Jacobian of the transform to the
+target log density (see \refsection{change-of-variables} for full
+definitions).
For the case above, the following adjustment will account for the log transform.% % \footnote{Because $\log | \frac{d}{dy} \log y | = \log | 1/y | = - \log diff --git a/src/docs/stan-reference/programming.tex b/src/docs/stan-reference/programming.tex index 3c46f0452bd..043da3c2492 100644 --- a/src/docs/stan-reference/programming.tex +++ b/src/docs/stan-reference/programming.tex @@ -76,7 +76,7 @@ \subsubsection{Beta Distribution} It is often more natural to specify hyperpriors in terms of transformed parameters. In the case of the Beta, the obvious choice -for reparameterization is in terms of a mean parameter +for reparameterization is in terms of a mean parameter \[ \phi = \alpha / (\alpha + \beta) \] @@ -109,7 +109,7 @@ \subsubsection{Beta Distribution} \end{stancode} % The new parameters, \code{phi} and \code{lambda}, are declared in the -parameters block and the parameters for the Beta distribution, +parameters block and the parameters for the Beta distribution, \code{alpha} and \code{beta}, are declared and defined in the transformed parameters block. And If their values are not of interest, they could instead be defined as local variables in the model as @@ -139,7 +139,7 @@ \subsubsection{Beta Distribution} % If the variables \code{alpha} and \code{beta} are of interest, they can be defined in the transformed parameter block and then used in the -model. +model. \subsubsection{Jacobians not Necessary} @@ -147,7 +147,7 @@ \subsubsection{Jacobians not Necessary} Because the transformed parameters are being used, rather than given a distribution, there is no need to apply a Jacobian adjustment for the transform. For example, in the beta distribution example, -\code{alpha} and \code{beta} have the correct posterior distribution. +\code{alpha} and \code{beta} have the correct posterior distribution. \subsubsection{Dirichlet Priors} @@ -170,7 +170,7 @@ \subsubsection{Dirichlet Priors} % This provides essentially $K$ degrees of freedom, one for each dimension of \code{alpha}, and it is not obvious how to specify a -reasonable prior for \code{alpha}. +reasonable prior for \code{alpha}. An alternative coding is to use the mean, which is a simplex, and a total count. @@ -203,7 +203,7 @@ \subsection{Transforming Unconstrained Priors: Probit and Logit} is because inverse logit is the cumulative distribution function (cdf) for the logistic distribution, so that the logit function itself is the inverse cdf and thus maps a uniform draw in $(0, 1)$ to a -logistically-distributed quantity. +logistically-distributed quantity. Things work the same way for the probit case: if $u$ has a $\distro{Uniform}(0, 1)$ distribution, then $\Phi^{-1}(u)$ has a @@ -305,7 +305,7 @@ \subsection{Transforming Unconstrained Priors: Probit and Logit} theta_raw ~ normal(mu, sigma); \end{stancode} % -where either or both of \code{mu} and \code{sigma} can be vectors. +where either or both of \code{mu} and \code{sigma} can be vectors. \section{Changes of Variables} @@ -313,7 +313,7 @@ \section{Changes of Variables} parameter is characterized by a distribution. The standard textbook example is the lognormal distribution, which is the distribution of a variable $y > 0$ whose logarithm $\log y$ has a normal distribution. -Note that the distribution is being assigned to $\log y$. +Note that the distribution is being assigned to $\log y$. The change of variables requires an adjustment to the probability to account for the distortion caused by the transform. 
For this to work, @@ -328,7 +328,7 @@ \section{Changes of Variables} In the case of log normals, if $y$'s logarithm is normal with mean $\mu$ and deviation $\sigma$, then the distribution of $y$ is given by \[ -p(y) +p(y) \ = \ \distro{Normal}(\log y| \mu, \sigma) \ \left| \frac{d}{dy} \log y \right| \ = \ @@ -336,7 +336,7 @@ \section{Changes of Variables} \] Stan works on the log scale to prevent underflow, where \[ -\log p(y) +\log p(y) = \log \distro{Normal}(\log y| \mu, \sigma) - \log y. @@ -363,7 +363,7 @@ \section{Changes of Variables} \end{stancode} % It is important, as always, to declare appropriate constraints on -parameters; here \code{y} is constrained to be positive. +parameters; here \code{y} is constrained to be positive. It would be slightly more efficient to define a local variable for the logarithm, as follows. @@ -388,7 +388,7 @@ \subsection{Change of Variables vs.\ Transformations} and a simple variable transformation. A transformation samples a parameter, then transforms it, whereas a change of variables transforms a parameter, then samples it. Only the latter requires a -Jacobian adjustment. +Jacobian adjustment. Note that it does not matter whether the probability function is expressed using a sampling statement, such as @@ -408,7 +408,7 @@ \subsubsection{Gamma and Inverse Gamma Distribution}\label{jacobian-adjustment.s Like the log normal, the inverse gamma distribution is a distribution of variables whose inverse has a gamma distribution. This section contrasts two approaches, first with a transform, then with a change -of variables. +of variables. The transform based approach to sampling \code{y\_inv} with an inverse gamma distribution can be coded as follows. @@ -418,7 +418,7 @@ \subsubsection{Gamma and Inverse Gamma Distribution}\label{jacobian-adjustment.s real y; } transformed parameters { - real y_inv; + real y_inv; y_inv = 1 / y; } model { @@ -435,11 +435,11 @@ \subsubsection{Gamma and Inverse Gamma Distribution}\label{jacobian-adjustment.s } transformed parameters { real y; - y = 1 / y_inv; // change - target += -2 * log(y_inv) ); // adjustme; + y = 1 / y_inv; // change variables } model { y ~ gamma(2,4); + target += -2 * log(y_inv); // Jacobian adjustment; } \end{stancode} % @@ -499,7 +499,7 @@ \subsection{Multivariate Changes of Variables} transformed parameters { ... vector[K] J_diag; // diagonals of Jacobian matrix - ... + ... ... compute J[k, k] = d.v[k] / d.u[k] ... target += sum(log(J_diag)); ... @@ -514,10 +514,35 @@ \section{Vectors with Varying Bounds} Jacobians (all of which are described in \refchapter{variable-transforms}) have to be calculated in Stan. +\subsection{Varying Lower Bounds} + For example, suppose there is a vector parameter $\alpha$ with a -vector $L$ of lower bounds. The following program declares raw -unconstrained parameters, then explicitly transforms the raw -parameters to $\alpha$, accounting for the Jacobian along the way. +vector $L$ of lower bounds. The simplest way to deal with this if $L$ +is a constant is to shift a lower-bounded parameter. +% +\begin{stancode} +data { + int N; + vector[N] L; // lower bounds + ... +parameters { + vector[N] alpha_raw; + ... +transformed parameters { + vector[N] alpha = L + alpha_raw; + ... +\end{stancode} +% +The Jacobian for adding a constant is one, so its log drops out of the +log density. 
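Spelled out for concreteness, the Jacobian of the shift is the
identity,
\[
\frac{\partial \, (L + \alpha_{\mathrm{raw}})}
     {\partial \, \alpha_{\mathrm{raw}}}
\ = \ \mbox{I}_N,
\]
so its absolute determinant is 1, its log is $\log 1 = 0$, and no
\code{target +=} statement is required.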
+
+Even if the lower bound is a parameter rather than data, there is no
+Jacobian required, because the transform from $(L, \alpha_{\mathrm{raw}})$
+to $(L + \alpha_{\mathrm{raw}}, \alpha_{\mathrm{raw}})$ produces a
+Jacobian derivative matrix with a unit determinant.
+
+It's also possible to implement the transform by directly transforming
+an unconstrained parameter and accounting for the Jacobian.
 %
 \begin{stancode}
 data {
@@ -546,6 +571,33 @@ \section{Vectors with Varying Bounds}
 $\alpha_{\mathrm{raw}}$ then a revised Jacobian needs to be calculated
 taking into account the dependencies.
 
+\subsection{Varying Upper and Lower Bounds}
+
+Suppose there are lower and upper bounds that vary by parameter.
+These can be applied to shift and rescale a parameter constrained to
+$(0, 1)$.
+%
+\begin{stancode}
+data {
+  int N;
+  vector[N] L;  // lower bounds
+  vector[N] U;  // upper bounds
+  ...
+parameters {
+  vector<lower=0, upper=1>[N] alpha_raw;
+  ...
+transformed parameters {
+  vector[N] alpha = L + (U - L) .* alpha_raw;
+\end{stancode}
+%
+The expression \code{U - L} is multiplied by \code{alpha\_raw}
+elementwise to produce a vector of variables in $(0, U-L)$; adding
+$L$ then results in a variable ranging over $(L, U)$.
+
+In this case, it is important that $L$ and $U$ are constants;
+otherwise, a Jacobian would be required when multiplying by $U - L$.
+
+
 \chapter{Custom Probability Functions}%
 \label{custom-probability-functions.chapter}
 
@@ -568,11 +620,11 @@ \subsection{Triangle Distribution}
 a density defined as follows.
 \[
 \distro{Triangle}(y | \alpha,\beta)
-=
+=
 \frac{2}{\beta - \alpha}
 \
 \left(
-1 -
+1 -
 \left|
     y - \frac{\alpha + \beta}{\beta - \alpha}
 \right|
@@ -623,7 +675,7 @@ \subsection{Triangle Distribution}
 %
 With the constraint on \code{y} in place, this is just a less
 efficient, slower, and less arithmetically stable version of the
-original program. But if the constraint on \code{y} is removed,
+original program. But if the constraint on \code{y} is removed,
 the model will compile and run without arithmetic errors, but will not
 sample properly.%
 %
@@ -651,7 +703,7 @@ \subsection{Exponential Distribution}
 This encoding will work for any \code{lambda} and \code{y}; they can
 be parameters, data, or one of each, or even local variables.
 
-The assignment statement in the previous paragraph generates
+The assignment statement in the previous paragraph generates
 \Cpp code that is very similar to that generated by the following
 sampling statement.
 %
@@ -755,7 +807,7 @@ \subsubsection{Catching errors}
 Rejection is used to flag errors that arise in inputs or in program
 state. It is far better to fail early with a localized informative
 error message than to run into problems much further downstream (as in
-rejecting a state or failing to compute a derivative).
+rejecting a state or failing to compute a derivative).
 
 The most common errors that are coded is to test that all of the
 arguments to a function are legal.
The following function takes a @@ -811,8 +863,8 @@ \subsection{Type Declarations for Functions} {\it Functions:} & {\it Locals Variables:} & {\it Nonlocal Variables:} \\ {\it Undimensioned} & {\it Unconstrained} & {\it Constrained} \\ \hline \hline -\code{int} -& \code{int} +\code{int} +& \code{int} & \code{int} \\ & & \code{int} @@ -821,8 +873,8 @@ \subsection{Type Declarations for Functions} % \\ \hline % -\code{real} -& \code{real} +\code{real} +& \code{real} & \code{real} \\ & & \code{real} @@ -832,7 +884,7 @@ \subsection{Type Declarations for Functions} \\ \hline % \code{vector} -& +& \code{vector[N]} & \code{vector[N]} \\ @@ -860,8 +912,8 @@ \subsection{Type Declarations for Functions} % \\ \hline % -\code{matrix} -& \code{matrix[M,~N]} +\code{matrix} +& \code{matrix[M,~N]} & \code{matrix[M,~N]} \\ & & \code{matrix[M,~N]} @@ -870,7 +922,7 @@ \subsection{Type Declarations for Functions} \\[4pt] & & \code{cov\_matrix[K]} \\ -& & \code{corr\_matrix[K]} +& & \code{corr\_matrix[K]} \\ & & \code{cholesky\_factor\_cov[K]} \\ @@ -919,10 +971,10 @@ \subsection{Type Declarations for Functions} % \footnote{A range of built-in validation routines is coming to Stan soon! Alternatively, the \code{reject} statement can be used to check -constraints on the simplex.} +constraints on the simplex.} % Upper or lower bounds on values or constrained types are not allowed -as return types or argument types in function declarations. +as return types or argument types in function declarations. \subsection{Array Types for Function Declarations} @@ -937,7 +989,7 @@ \subsection{Array Types for Function Declarations} % The notation \code{[\,]} is used for one-dimensional arrays (as in the return above), \code{[\,,\,]} for two-dimensional arrays, -\code{[\,,\,,\,]} for three-dimensional arrays, and so on. +\code{[\,,\,,\,]} for three-dimensional arrays, and so on. Functions support arrays of any type, including matrix and vector types. As with other types, no constraints are allowed. @@ -973,7 +1025,7 @@ \section{Functions as Statements} Void functions applied to appropriately typed arguments may be used on their own as statements. For example, the pretty-print function defined above may be applied to a covariance matrix being defined in -the transformed parameters block. +the transformed parameters block. % \begin{stancode} transformed parameters { @@ -1097,8 +1149,8 @@ \section{User-Defined Probability Functions} % \begin{stancode} functions { - real unit_normal_lpdf(real y) { - return normal_lpdf(y | 0, 1); + real unit_normal_lpdf(real y) { + return normal_lpdf(y | 0, 1); } } ... @@ -1111,7 +1163,7 @@ \section{User-Defined Probability Functions} % The ability to use the \code{unit\_normal} function as a density is keyed off its name ending in \code{\_lpdf} (names ending in -\code{\_lpmf} for probability mass functions work the same way). +\code{\_lpmf} for probability mass functions work the same way). In general, if \code{foo\_lpdf} is defined to consume $N + 1$ arguments, then @@ -1120,7 +1172,7 @@ \section{User-Defined Probability Functions} y ~ foo(theta1, ..., thetaN); \end{stancode} % -can be used as shorthand for +can be used as shorthand for % \begin{stancode} target += foo_lpdf(y | theta1, ..., thetaN); @@ -1128,7 +1180,7 @@ \section{User-Defined Probability Functions} % As with the built-in functions, the suffix \code{\_lpdf} is dropped and the first argument moves to the left of the sampling symbol (\Verb|~|) -in the sampling statement. +in the sampling statement. 
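To make the argument convention concrete, consider a hypothetical
two-parameter density coded by hand purely for illustration (Stan
provides a built-in double exponential distribution; the name and
definition here are only a sketch).
%
\begin{stancode}
functions {
  // hypothetical hand-coded double exponential density
  real my_double_exp_lpdf(real y, real mu, real sigma) {
    return -log(2 * sigma) - fabs(y - mu) / sigma;
  }
}
...
model {
  y ~ my_double_exp(mu, sigma);
  // equivalent to: target += my_double_exp_lpdf(y | mu, sigma);
}
\end{stancode}
%
Here the density is defined with three arguments, so the corresponding
sampling statement has the variate on the left of the tilde and the
remaining two arguments on the right.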
Functions ending in \code{\_lpmf} (for probability mass functions), behave exactly the same way. The difference is that the first @@ -1157,9 +1209,9 @@ \section{Documenting Functions}\label{documenting-functions.section} % \begin{stancode} /** - * Return a data matrix of specified size with rows - * corresponding to items and the first column filled - * with the value 1 to represent the intercept and the + * Return a data matrix of specified size with rows + * corresponding to items and the first column filled + * with the value 1 to represent the intercept and the * remaining columns randomly filled with unit-normal draws. * * @param N Number of rows corresponding to data items @@ -1167,7 +1219,7 @@ \section{Documenting Functions}\label{documenting-functions.section} * item. * @return Simulated predictor matrix. */ -matrix predictors_rng(int N, int K) { +matrix predictors_rng(int N, int K) { ... \end{stancode} % @@ -1190,7 +1242,7 @@ \section{Documenting Functions}\label{documenting-functions.section} % \begin{stancode} ... - * @param theta + * @param theta * @throws If any of the entries of theta is negative. */ real entropy(vector theta) { @@ -1241,7 +1293,7 @@ \section{Recursive Functions} $A^n$, which is defined for a square matrix $A$ and positive integer $n$ by \[ -A^n +A^n = \begin{cases} \ \mbox{I} & \mbox{if } n = 0, \mbox{ and} @@ -1259,7 +1311,7 @@ \section{Recursive Functions} matrix matrix_pow(matrix a, int n) { if (n == 0) return diag_matrix(rep_vector(1, rows(a))); - else + else return a * matrix_pow(a, n - 1); } \end{stancode} @@ -1308,7 +1360,7 @@ \subsubsection{Redundant Intercepts} Gibbs sampling as used in BUGS and JAGS and the Hamiltonian Monte Carlo (HMC) and no-U-turn samplers (NUTS) used by Stan.} % -Suppose there are observations $y_n$ for $n \in 1{:}N$, +Suppose there are observations $y_n$ for $n \in 1{:}N$, two intercept parameters $\lambda_1$ and $\lambda_2$, a scale parameter $\sigma > 0$, and the sampling distribution % @@ -1344,7 +1396,7 @@ \subsubsection{Redundant Intercepts} as illustrated in the left-hand figure of \reffigure{non-identifiable-density}. The ridge for this model is along the line where $\lambda_2 = \lambda_1 + c$ for some constant -$c$. +$c$. Contrast this model with a simple regression with a single intercept parameter $\mu$ and sampling distribution @@ -1374,7 +1426,7 @@ \subsubsection{Ability and Difficulty in IRT Models} % \[ p(y | \alpha, \beta) -= += p(y | \alpha + c, \beta - c). \] % @@ -1449,7 +1501,7 @@ \subsubsection{Softmax with $K$ vs. $K-1$ Parameters} The softmax function is many-to-one, which leads to a lack of identifiability of the unconstrained parameters $\alpha$. In particular, adding or subtracting a constant from each $\alpha_k$ -produces the same simplex $\theta$. +produces the same simplex $\theta$. @@ -1511,7 +1563,7 @@ \subsubsection{Pinning Parameters} \subsubsection{Adding Priors} So far, the models have been discussed as if the priors on the -parameters were improper uniform priors. +parameters were improper uniform priors. A more general Bayesian solution to these invariance problems is to impose proper priors on the parameters. This approach can be used to @@ -1572,7 +1624,7 @@ \subsubsection{Vague, Strongly Informative, and Weakly Informative Priors} Care must be used when adding a prior to resolve invariances. If the prior is taken to be too broad (i.e., too vague), the resolution is in -theory only, and samplers will still struggle. +theory only, and samplers will still struggle. 
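As a concrete sketch, the redundant-intercept example introduced
earlier in this section can be rescued by giving each intercept its
own proper prior; the scale of 10 used here is only illustrative.
%
\begin{stancode}
data {
  int<lower=0> N;
  vector[N] y;
}
parameters {
  real lambda1;
  real lambda2;
  real<lower=0> sigma;
}
model {
  lambda1 ~ normal(0, 10);
  lambda2 ~ normal(0, 10);
  y ~ normal(lambda1 + lambda2, sigma);
}
\end{stancode}
%
The priors do not remove the additive invariance from the likelihood,
but they render the posterior proper and concentrate it enough for the
samplers to explore it.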
Ideally, a realistic prior will be formulated based on substantive knowledge of the problem being modeled. Such a prior can be chosen to @@ -1610,7 +1662,7 @@ \subsection{Mixture Models} and $\mu_2$, a shared scale $\sigma > 0$, a mixture ratio $\theta \in [0,1]$, and likelihood \[ -p(y|\theta,\mu_1,\mu_2,\sigma) +p(y|\theta,\mu_1,\mu_2,\sigma) = \prod_{n=1}^N \big( \theta \, \distro{Normal}(y_n|\mu_1,\sigma) + (1 - \theta) \, \distro{Normal}(y_n|\mu_2,\sigma) \big). \] @@ -1631,7 +1683,7 @@ \subsection{Convergence Monitoring and Effective Sample Size} problem is that the posterior mean, a key ingredient in these computations, is affected by label switching, resulting in a posterior mean for $\mu_1$ that is equal to that of $\mu_2$, and a posterior -mean for $\theta$ that is always 1/2, no matter what the data is. +mean for $\theta$ that is always 1/2, no matter what the data is. \subsection{Some Inferences are Invariant} @@ -1649,7 +1701,7 @@ \subsection{Highly Multimodal Posteriors} Theoretically, this should not present a problem for inference because all of the integrals involved in posterior predictive inference will -be well behaved. The problem in practice is computation. +be well behaved. The problem in practice is computation. Being able to carry out such invariant inferences in practice is an altogether different matter. It is almost always intractable to find @@ -1686,12 +1738,12 @@ \subsubsection{Parameter Ordering Constraints} compromised. For instance, attempting to use $M$ posterior samples to estimate $\mbox{Pr}[\mu_1 > \mu_2]$, will fail, because the estimator \[ -\mbox{Pr}[\mu_1 > \mu_2] -\approx +\mbox{Pr}[\mu_1 > \mu_2] +\approx \sum_{m=1}^M \mbox{I}(\mu_1^{(m)} > \mu_2^{(m)}) \] will result in an estimate of 0 because the posterior respects the -constraint in the model. +constraint in the model. \subsubsection{Initialization around a Single Mode} @@ -1713,13 +1765,13 @@ \section{Component Collapsing in Mixture Models} It is possible for two mixture components in a mixture model to collapse to the same values during sampling or optimization. For example, a mixture of $K$ normals might devolve to have $\mu_i = -\mu_j$ and $\sigma_i = \sigma_j$ for $i \neq j$. +\mu_j$ and $\sigma_i = \sigma_j$ for $i \neq j$. This will typically happen early in sampling due to initialization in MCMC or optimization or arise from random movement during MCMC. Once the parameters match for a given draw $(m)$, it can become hard to escape because there can be a trough of low-density mass between the -current parameter values and the ones without collapsed components. +current parameter values and the ones without collapsed components. It may help to use a smaller step size during warmup, a stronger prior on each mixture component's membership responsibility. A more extreme @@ -1757,7 +1809,7 @@ \subsection{Beta-Binomial Models with Skewed Data and Weak Priors} to a posterior % \begin{eqnarray*} -p(\phi|y) +p(\phi|y) & \propto & \textstyle \distro{Beta}(\phi|0.5,0.5) \times \prod_{n=1}^N \distro{Bernoulli}(y_n|\phi) \\[4pt] @@ -1818,7 +1870,7 @@ \subsection{Separability in Logistic Regression} ``separability.'' In this case, predictive accuracy on the observed data continue to improve as $\beta_k \rightarrow \infty$, because for cases with $y_n = 1$, $x_n \beta \rightarrow \infty$ and hence -$\mbox{logit}^{-1}(x_n \beta) \rightarrow 1$. +$\mbox{logit}^{-1}(x_n \beta) \rightarrow 1$. With separability, there is no maximum to the likelihood and hence no maximum likelihood estimate. 
From the Bayesian perspective, the @@ -1866,13 +1918,13 @@ \subsection{Gibbs Sampling} Gibbs sampling proceeds in iteration $m$ by drawing % \begin{eqnarray*} -\lambda_1^{(m)} +\lambda_1^{(m)} & \sim & p(\lambda_1 \, | \, \lambda_2^{(m-1)}, \, \sigma^{(m-1)}, \, y) \\[6pt] \lambda_2^{(m)} & \sim & p(\lambda_2 \, | \, \lambda_1^{(m)}, \, \sigma^{(m-1)}, \, y) \\[6pt] -\sigma^{(m)} +\sigma^{(m)} & \sim & p(\sigma \, | \, \lambda_1^{(m)}, \, \lambda_2^{(m)}, \, y). \end{eqnarray*} % @@ -1902,7 +1954,7 @@ \subsection{Hamiltonian Monte Carlo Sampling} the correlation and the simulation will run $\lambda_1$ and $\lambda_2$ in opposite directions along the valley corresponding to the ridge in the posterior log density (see -\reffigure{non-identifiable-density}. +\reffigure{non-identifiable-density}. \subsection{No-U-Turn Sampling} @@ -1922,7 +1974,7 @@ \subsection{No-U-Turn Sampling} improper posterior! Thus the behavior of HMC in general and NUTS in particular should be reassuring in that it will clearly fail in cases of improper posteriors, resulting in a clean diagnostic of -sweeping out very large paths in the posterior. +sweeping out very large paths in the posterior. \begin{figure} % @@ -1983,10 +2035,10 @@ \subsection{No-U-Turn Sampling} % On the top is the non-identified model with improper uniform priors and likelihood $y_n \sim \distro{Normal}(\lambda_1 + \lambda_2, - \sigma)$. + \sigma)$. % In the middle is the same likelihood as the middle plus priors - $\lambda_k \sim \distro{Normal}(0,10)$. + $\lambda_k \sim \distro{Normal}(0,10)$. % On the bottom is an identified model with an improper prior, with likelihood $y_n \sim \distro{Normal}(\mu,\sigma)$. All models @@ -2065,12 +2117,12 @@ \subsection{Examples: Fits in Stan} The average number of leapfrog steps is roughly 3 in the identified model, 150 in the model identified by a weak prior, and 1400 in the non-identified model. -\item +\item The number of effective samples per second for $\mu$ is roughly 31,000 in the identified model, 1900 in the model identified with weakly informative priors, and 200 in the - non-identified model; the results are similar for $\sigma$. -\item + non-identified model; the results are similar for $\sigma$. +\item In the non-identified model, the 95\% interval for $\lambda_1$ is (-2300,6000), whereas it is only (-12,12) in the model identified with weakly informative priors. @@ -2117,7 +2169,7 @@ \section{Basic Motivation} column vector, is ambiguous. Similarly, row vectors are separated from column vectors because multiplying a row vector by a column vector produces a scalar, whereas multiplying in the opposite order -produces a matrix. +produces a matrix. The following code fragment shows all four ways to declare a two-dimensional container of size $M \times N$. @@ -2134,7 +2186,7 @@ \section{Basic Motivation} comments to the right of the declarations. Thus the only way to efficiently iterate over row vectors is to use the third declaration, but if you need linear algebra on matrices, but the only way to use -matrix operations is to use the fourth declaration. +matrix operations is to use the fourth declaration. The inefficiencies due to any manual reshaping of containers is usually slight compared to what else is going on in a Stan program @@ -2162,9 +2214,9 @@ \section{Data Type and Indexing Efficiency}\label{indexingefficiency.section} The underlying matrix and linear algebra operations are implemented in terms of data types from the Eigen \Cpp library. 
By having vectors and matrices as basic types, no conversion is necessary when invoking -matrix operations or calling linear algebra functions. +matrix operations or calling linear algebra functions. -Arrays, on the other hand, are implemented as instances of the \Cpp \ +Arrays, on the other hand, are implemented as instances of the \Cpp \ \code{std::vector} class (not to be confused with Eigen's \code{Eigen::Vector} class or Stan vectors). By implementing arrays this way, indexing is very efficient because values can be returned by @@ -2174,11 +2226,11 @@ \subsection{Matrices vs.\ Two-Dimensional Arrays} In Stan models, there are a few minor efficiency considerations in deciding between a two-dimensional array and a matrix, which may seem -interchangeable at first glance. +interchangeable at first glance. First, matrices use a bit less memory than two-dimensional arrays. This is because they don't store a sequence of arrays, but just the -data and the two dimensions. +data and the two dimensions. Second, matrices store their data in column-major order. Furthermore, all of the data in a matrix is guaranteed to be contiguous in memory. @@ -2395,7 +2447,7 @@ \chapter{Multiple Indexing and Range Indexing}\label{multi-indexing.chapter} a local variable. % \begin{stancode} -{ +{ vector[N] mu; for (n in 1:N) mu[n] = alpha[ii[n]] + beta[ii[n]] * x[n]; @@ -2410,9 +2462,9 @@ \section{Multiple Indexing} variables as indicated in the comments. % \begin{stancode} -int c[3]; +int c[3]; ... // define: c == (5, 9, 7) -int idxs[4]; +int idxs[4]; ... // define: idxs == (3, 3, 1, 2) int d[4]; d = c[idxs]; // result: d == (7, 7, 5, 9) @@ -2433,7 +2485,7 @@ \section{Multiple Indexing} example, consider the following. % \begin{stancode} -int c[2, 3]; +int c[2, 3]; ... // define: c = ((1, 3, 5), ((7, 11, 13)) int idxs[4]; ... // define: idxs = (2, 2, 1, 2) @@ -2459,17 +2511,17 @@ \section{Multiple Indexing} % The single index is applied, the one-dimensional result is determined, then the multiple index is applied to the result. That is, -\code{c[2,idxs]} evaluates to the same value as \code{c[2][idxs]}. +\code{c[2,idxs]} evaluates to the same value as \code{c[2][idxs]}. Multiple indexing can apply to more than one position of a multi-dimensional array. For instance, consider the following % \begin{stancode} -int c[2, 3]; +int c[2, 3]; ... // define: c = ((1, 3, 5), (7, 11, 13)) int idxs1[3]; ... // define: idxs1 = (2, 2, 1) -int idxs2[2]; +int idxs2[2]; ... // define: idxs2 = (1, 3) int d[3, 2]; d = c[idxs1, idxs2]; // result: d = ((7, 13), (7, 13), (1, 5)) @@ -2510,7 +2562,7 @@ \subsection{Lower and Upper Bound Indexes} % The range index \code{3:6} behaves semantically just like the multiple index \code{(3, 4, 5, 6)}. In terms of implementation, the sliced -upper and/or lower bounded indices are faster and use less memory +upper and/or lower bounded indices are faster and use less memory because they do not explicitly create a multiple index, but rather use a direct loop. They are also easier to read, so should be preferred over multiple indexes where applicable. @@ -2519,8 +2571,8 @@ \subsection{Lower or Upper Bound Indexes} It is also possible to supply just a lower bound, or just an upper bound. Writing \code{c[3:]} is just shorthand for -\code{c[3:size(c)]}. Writing \code{c[:5]} is just shorthand for -\code{c[1:5]}. +\code{c[3:size(c)]}. Writing \code{c[:5]} is just shorthand for +\code{c[1:5]}. 
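To make the shorthands concrete (following the comment style of the
earlier examples), suppose \code{c} has seven entries.
%
\begin{stancode}
int c[7];
...             // define: c == (1, 2, 3, 4, 5, 6, 7)
int d[5];
int e[5];
d = c[3:];      // result: d == (3, 4, 5, 6, 7)
e = c[:5];      // result: e == (1, 2, 3, 4, 5)
\end{stancode}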
\subsection{Full Range Indexes} @@ -2534,7 +2586,7 @@ \section{Multiple Indexing on the Left of Assignments} Multiple expressions may be used on the left-hand side of an assignment statement, where they work exactly the same way as on the -right-hand side in terms of picking out entries of a container. +right-hand side in terms of picking out entries of a container. For example, consider the following. % \begin{stancode} @@ -2548,7 +2600,7 @@ \section{Multiple Indexing on the Left of Assignments} % The result above can be worked out by noting that the assignment sets \code{a[idxs[1]]} (\code{a[3]}) to \code{c[1]} (\code{5}) and -\code{a[idxs[2]]} (\code{a[2]}) to \code{c[2]} (\code{9}). +\code{a[idxs[2]]} (\code{a[2]}) to \code{c[2]} (\code{9}). The same principle applies when there are many multiple indexes, as in the following example. @@ -2587,7 +2639,7 @@ \subsection{Assign-by-Value and Aliasing} int a[3]; ... // define: a == (5, 6, 7) a[2:3] = a[1:2]; -... // result: a == (5, 5, 6) +... // result: a == (5, 5, 6) \end{stancode} % The reason the value of \code{a} after the assignment is $(5,5,6)$ @@ -2687,8 +2739,8 @@ \subsection{Matrices with One Multiple Index} If matrices receive a single multiple index, the result is a matrix. So if \code{m} is a matrix, so is \code{m[2:4]}. In contrast, supplying a single index, \code{m[3]}, produces a row vector result. -That is, \code{m[3]} produces the same result as \code{m[3,~]} -or \code{m[3,~1:cols(m)]}. +That is, \code{m[3]} produces the same result as \code{m[3,~]} +or \code{m[3,~1:cols(m)]}. \subsection{Arrays of Vectors or Matrices} @@ -2753,16 +2805,11 @@ \section{Matrices with Parameters and Constants} \begin{stancode} transformed data { - int ii[7]; - int jj[7]; - ii[1] = 1; jj[1] = 1; - ii[2] = 2; jj[2] = 1; // skip [1, 2] and [1, 3] - ii[3] = 3; jj[3] = 1; - ii[4] = 2; jj[4] = 2; - ii[5] = 3; jj[5] = 2; - ii[6] = 2; jj[6] = 3; - ii[7] = 3; jj[7] = 3; -} + int idxs[7, + = { {1, 1}, + {2, 1}, {2, 2}, {2, 3}, + {3, 1}, {3, 2}, {3, 3} }; + ... \end{stancode} % The seven remaining parameters are declared as a vector. @@ -2770,7 +2817,7 @@ \section{Matrices with Parameters and Constants} \begin{stancode} parameters { vector[7] A_raw; -} + ... \end{stancode} % Then the full matrix \code{A} is constructed in the model block as a @@ -2779,17 +2826,17 @@ \section{Matrices with Parameters and Constants} \begin{stancode} model { matrix[3, 3] A; - A[ii, jj] = A_raw; + for (i in 1:7) + A[idxs[i, 1], idxs[i, 2]] = A_raw[i]; A[1, 2] = 0; A[1, 3] = 0; -} + ... \end{stancode} - +% This may seem like overkill in this setting, but in more general -settings, the matrix size, vector size, and the \code{ii} and -\code{jj} array and size can be coded up as data along with the values -to fill in for the constant entries. Similar techniques can be used -to build up matrices with ad-hoc constraints, such as a handful of +settings, the matrix size, vector size, and the \code{idxs} array will +be too large to code directly. Similar techniques can be used to +build up matrices with ad-hoc constraints, such as a handful of entries known to be positive. @@ -2860,7 +2907,7 @@ \section{Statistical vs.\ Computational Efficiency} statistical efficiency for Stan programs fit with sampling-based algorithms. Computational efficiency measures the amount of time or memory required for a given step in a calculation, such as an -evaluation of a log posterior or penalized likelihood. +evaluation of a log posterior or penalized likelihood. 
Statistical efficiency typically involves requiring fewer steps in algorithms by making the statistical formulation of a model better @@ -2975,7 +3022,7 @@ \section{Well-Specified Models} intended runs slowly or may even have convergence and mixing issues. While some of the techniques recommended in the remaining sections of this chapter may mitigate the problem somewhat, the best remedy is a -better model specification. +better model specification. Somewhat counterintuitively, more complicated models often run faster than simpler models. One common pattern is with a group of parameters @@ -2995,7 +3042,7 @@ \section{Reparameterization}\label{reparameterization.section} through reparameterization. In some cases, reparameterization can dramatically increase effective sample size for the same number of iterations or even make programs that would not converge well -behaved. +behaved. \subsection{Example: Neal's Funnel} @@ -3021,7 +3068,7 @@ \subsection{Example: Neal's Funnel} Neal's funnel for the upper-level variable $y$ and one lower-level variable $x_1$ (see the text for the formula). The blue region has log density greater than -8, the yellow region density greater than - -16, and the gray background a density less than -16. + -16, and the gray background a density less than -16. (Right) 4000 draws from a run of Stan's sampler with default settings. Both plots are restricted to the shown window of $x_1$ and $y$ values; some draws fell outside of @@ -3052,7 +3099,7 @@ \subsection{Example: Neal's Funnel} The funnel can be implemented directly in Stan as follows. % \begin{stancode} -parameters { +parameters { real y; vector[9] x; } @@ -3074,7 +3121,7 @@ \subsection{Example: Neal's Funnel} the following more efficient form. % \begin{stancode} -parameters { +parameters { real y_raw; vector[9] x_raw; } @@ -3082,12 +3129,12 @@ \subsection{Example: Neal's Funnel} real y; vector[9] x; - y = 3.0 * y_raw; + y = 3.0 * y_raw; x = exp(y/2) * x_raw; } model { - y_raw ~ normal(0, 1); // implies y ~ normal(0, 3) - x_raw ~ normal(0, 1); // implies x ~ normal(0, exp(y/2)) + y_raw ~ normal(0, 1); // implies y ~ normal(0, 3) + x_raw ~ normal(0, 1); // implies x ~ normal(0, exp(y/2)) } \end{stancode} % @@ -3139,7 +3186,7 @@ \subsection{Reparameterizing the Cauchy} Thus if the random variable $Y$ has a unit uniform distribution, $Y \sim \distro{Uniform}(0,1)$, then $F^{-1}_X(Y)$ has a Cauchy distribution with location $\mu$ and scale $\tau$, i.e., $F^{-1}_X(Y) \sim -\distro{Cauchy}(\mu,\tau)$. +\distro{Cauchy}(\mu,\tau)$. Consider a Stan program involving a Cauchy-distributed parameter \code{beta}. @@ -3167,7 +3214,7 @@ \subsection{Reparameterizing the Cauchy} transformed parameters { real beta; beta = mu + tau * tan(beta_unif); // beta ~ cauchy(mu, tau) -} +} model { beta_unif ~ uniform(-pi()/2, pi()/2); // not necessary ... @@ -3189,7 +3236,7 @@ \subsection{Reparameterizing the Cauchy} needs the total log probability up to an additive constant. Stan will spend some time checking that that \code{beta\_unif} is between \code{-pi()/2} and \code{pi()/2}, but this condition is guaranteed by -the constraints in the declaration of \code{beta\_unif}. +the constraints in the declaration of \code{beta\_unif}. \subsection{Reparameterizing a Student-t Distribution} @@ -3217,8 +3264,8 @@ \subsection{Reparameterizing a Student-t Distribution} $\mbox{\sf Student-t}(\nu, 0, 1)$, i.e., \[ \mbox{\sf Student-t}(\beta | \nu,0,1). 
-= -\int_0^{\infty} += +\int_0^{\infty} \, \mbox{\sf Normal}(\beta | 0, 1 / \tau^{-\frac{1}{2}}) \times @@ -3267,7 +3314,7 @@ \subsection{Reparameterizing a Student-t Distribution} parameters { real nu; real tau; - real alpha; + real alpha; ... transformed parameters { real beta; @@ -3279,7 +3326,7 @@ \subsection{Reparameterizing a Student-t Distribution} tau ~ gamma(half_nu, half_nu); alpha ~ normal(0, 1); ... -\end{stancode} +\end{stancode} % Although set to \code{0} here, in most cases, the lower bound for the degrees of freedom parameter \code{nu} can be set to \code{1} or @@ -3293,7 +3340,7 @@ \subsection{Hierarchical Models and the Non-Centered Parameterization} Unfortunately, the usual situation in applied Bayesian modeling involves complex geometries and interactions that are not known analytically. Nevertheless, reparameterization can still be very -effective for separating parameters. +effective for separating parameters. \subsubsection{Centered parameterization} @@ -3303,7 +3350,7 @@ \subsubsection{Centered parameterization} % \begin{stancode} parameters { - real mu_beta; + real mu_beta; real sigma_beta; vector[K] beta; ... @@ -3353,7 +3400,7 @@ \subsection{Non-Centered Parameterization} // implies: beta ~ normal(mu_beta, sigma_beta) beta = mu_beta + sigma_beta * beta_raw; model { - beta_raw ~ normal(0, 1); + beta_raw ~ normal(0, 1); ... \end{stancode} % @@ -3362,19 +3409,19 @@ \subsection{Non-Centered Parameterization} Reparameterization of hierarchical models is not limited to the normal distribution, although the normal distribution is the best candidate -for doing so. In general, any distribution of parameters in the +for doing so. In general, any distribution of parameters in the location-scale family is a good candidate for reparameterization. Let $\beta = l + s\alpha$ where $l$ is a location parameter and $s$ is a scale parameter. Note that $l$ need not be the mean, $s$ need not be the standard deviation, and neither the mean nor the standard deviation need to exist. If $\alpha$ and $\beta$ are from the same -distributional family but $\alpha$ has location zero and unit scale, +distributional family but $\alpha$ has location zero and unit scale, while $\beta$ has location $l$ and scale $s$, then that distribution is a location-scale distribution. Thus, if $\alpha$ were a parameter and $\beta$ were a transformed parameter, then a prior distribution -from the location-scale family on $\alpha$ with location zero and unit +from the location-scale family on $\alpha$ with location zero and unit scale implies a prior distribution on $\beta$ with location $l$ and -scale $s$. Doing so would reduce the dependence between $\alpha$, +scale $s$. Doing so would reduce the dependence between $\alpha$, $l$, and $s$. There are several univariate distributions in the location-scale @@ -3382,9 +3429,9 @@ \subsection{Non-Centered Parameterization} cases of the Cauchy distribution (with one degree of freedom) and the normal distribution (with infinite degrees of freedom). As shown above, if $\alpha$ is distributed standard normal, then $\beta$ is distributed -normal with mean $\mu = l$ and standard deviation $\sigma = s$. The -logistic, the double exponential, the generalized extreme value -distributions, and the stable distribution are also in the +normal with mean $\mu = l$ and standard deviation $\sigma = s$. The +logistic, the double exponential, the generalized extreme value +distributions, and the stable distribution are also in the location-scale family. 
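For example, the same trick applies directly to the double exponential
distribution (a sketch following the pattern of the normal case above,
with illustrative variable names).
%
\begin{stancode}
parameters {
  real mu_beta;
  real<lower=0> sigma_beta;
  vector[K] beta_raw;
  ...
transformed parameters {
  vector[K] beta = mu_beta + sigma_beta * beta_raw;
  ...
model {
  beta_raw ~ double_exponential(0, 1);
  // implies: beta ~ double_exponential(mu_beta, sigma_beta)
  ...
\end{stancode}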
Also, if $z$ is distributed standard normal, then $z^2$ is distributed @@ -3399,7 +3446,7 @@ \subsection{Non-Centered Parameterization} \subsection{Multivariate Reparameterizations} -The benefits of reparameterization are not limited to univariate +The benefits of reparameterization are not limited to univariate distributions. A parameter with a multivariate normal prior distribution is also an excellent candidate for reparameterization. Suppose you intend the prior for $\beta$ to be multivariate normal with mean vector $\mu$ @@ -3424,10 +3471,10 @@ \subsection{Multivariate Reparameterizations} be unknown parameters, in which case their priors would be unaffected by a reparameterization of \Verb|beta|. -If $\alpha$ has the same dimensions as $\beta$ but the elements of -$\alpha$ are independently and identically distributed standard normal -such that $\beta = \mu + L\alpha$, where $LL^\top = \Sigma$, then -$\beta$ is distributed multivariate normal with mean vector $\mu$ and +If $\alpha$ has the same dimensions as $\beta$ but the elements of +$\alpha$ are independently and identically distributed standard normal +such that $\beta = \mu + L\alpha$, where $LL^\top = \Sigma$, then +$\beta$ is distributed multivariate normal with mean vector $\mu$ and covariance matrix $\Sigma$. One choice for $L$ is the Cholesky factor of $\Sigma$. Thus, the model above could be reparameterized as follows. % @@ -3446,10 +3493,10 @@ \subsection{Multivariate Reparameterizations} ... transformed parameters { vector[K] beta; - beta = mu + L * alpha; + beta = mu + L * alpha; } model { - alpha ~ normal(0, 1); + alpha ~ normal(0, 1); // implies: beta ~ multi_normal(mu, Sigma) ... \end{stancode} @@ -3459,7 +3506,7 @@ \subsection{Multivariate Reparameterizations} avoids the need to invert \Verb|Sigma| every time \Verb|multi_normal| is evaluated. -The Cholesky factor is also useful when a covariance matrix is +The Cholesky factor is also useful when a covariance matrix is decomposed into a correlation matrix that is multiplied from both sides by a diagonal matrix of standard deviations, where either the standard deviations or the correlations are unknown parameters. The @@ -3502,7 +3549,7 @@ \subsection{Multivariate Reparameterizations} % This reparameterization of a multivariate normal distribution in terms of standard normal variates can be extended to other multivariate -distributions that can be conceptualized as contaminations of the +distributions that can be conceptualized as contaminations of the multivariate normal, such as the multivariate Student t and the skew multivariate normal distribution. @@ -3556,7 +3603,7 @@ \subsection{Multivariate Reparameterizations} for (i in 1:(K-1)) A[i, K] = 0; A[K, K] = sqrt(c[K]); - + for (i in 1:K) c[i] ~ chi_square(nu - i + 1); @@ -3568,7 +3615,7 @@ \subsection{Multivariate Reparameterizations} % This reparameterization is more efficient for three reasons. First, it reduces dependence among the elements of \Verb|z| and second, it -avoids the need to invert the covariance matrix, $W$ every time +avoids the need to invert the covariance matrix, $W$ every time \Verb|wishart| is evaluated. Third, if $W$ is to be used with a multivariate normal distribution, you can pass $L A$ to the more efficient \Verb|multi_normal_cholesky| function, rather than passing @@ -3578,9 +3625,9 @@ \subsection{Multivariate Reparameterizations} freedom $\nu$, then $W^{-1}$ is distributed inverse Wishart with inverse scale matrix $S^{-1}$ and degrees of freedom $\nu$. 
Thus, the previous result can be used to reparameterize the inverse Wishart distribution. -Since $W = L * A * A^{\top} * L^{\top}$, +Since $W = L * A * A^{\top} * L^{\top}$, $W^{-1} = L^{{\top}^{-1}} A^{{\top}^{-1}} A^{-1} L^{-1}$, where all four -inverses exist, but +inverses exist, but $L^{{-1}^{\top}} = L^{{\top}^{-1}}$ and $A^{{-1}^{\top}} = A^{{\top}^{-1}}$. We can slightly modify the above Stan code for this case: % @@ -3607,7 +3654,7 @@ \subsection{Multivariate Reparameterizations} ... model { matrix[K, K] A; - matrix[K, K] A_inv_L_inv; + matrix[K, K] A_inv_L_inv; int count; count = 1; for (j in 1:(K-1)) { @@ -3628,18 +3675,18 @@ \subsection{Multivariate Reparameterizations} for (i in 1:K) c[i] ~ chi_square(nu - i + 1); - z ~ normal(0, 1); // implies: crossprod(A_inv_L_inv) ~ + z ~ normal(0, 1); // implies: crossprod(A_inv_L_inv) ~ // inv_wishart(nu, L_inv' * L_inv) ... \end{stancode} % Another candidate for reparameterization is the Dirichlet distribution -with all $K$ shape parameters equal. \cite{ZyczkowskiSommers:2001} shows -that if $\theta_i$ is equal to the sum of $\beta$ independent squared -standard normal variates and $\rho_i = \frac{\theta_i}{\sum \theta_i}$, -then the $K$-vector $\rho$ is distributed Dirichlet with all shape -parameters equal to $\frac{\beta}{2}$. In particular, if $\beta = 2$, -then $\rho$ is uniformly distributed on the unit simplex. Thus, we can +with all $K$ shape parameters equal. \cite{ZyczkowskiSommers:2001} shows +that if $\theta_i$ is equal to the sum of $\beta$ independent squared +standard normal variates and $\rho_i = \frac{\theta_i}{\sum \theta_i}$, +then the $K$-vector $\rho$ is distributed Dirichlet with all shape +parameters equal to $\frac{\beta}{2}$. In particular, if $\beta = 2$, +then $\rho$ is uniformly distributed on the unit simplex. Thus, we can make $\rho$ be a transformed parameter to reduce dependence, as in: % \begin{stancode} @@ -3657,7 +3704,7 @@ \subsection{Multivariate Reparameterizations} } model { for (k in 1:K) - z[k] ~ normal(0, 1); + z[k] ~ normal(0, 1); // implies: rho ~ dirichlet(0.5 * beta * ones) ... \end{stancode} @@ -3690,20 +3737,20 @@ \subsection{Vectorizing Summations} some operation that depends on \code{n}. % \begin{stancode} -for (n in 1:N) +for (n in 1:N) total = total + foo(n,...); \end{stancode} % This code has to create intermediate representations for each -of the \code{N} summands. +of the \code{N} summands. A faster alternative is to copy the values into a vector, then apply the \code{sum()} operator, as in the following refactoring. % \begin{stancode} -{ +{ vector[N] summands; - for (n in 1:N) + for (n in 1:N) summands[n] = foo(n,...); total = sum(summands); } @@ -3739,7 +3786,7 @@ \subsection{Vectorization through Matrix Operations} } model { for (n in 1:N) { - real gamma; + real gamma; gamma = 0; for (k in 1:K) gamma = gamma + x[n, k] * beta[k]; @@ -3783,7 +3830,7 @@ \subsection{Vectorization through Matrix Operations} \Cpp matrix library. This will require enhancing Eigen to deal with mixed-type arguments, such as the type \code{double} used for constants and the algorithmic differentiation type -\code{stan::math::var} +\code{stan::math::var} used for variables.}. 
% The inefficiency of transposition could itself be mitigated somewhat by @@ -3840,7 +3887,7 @@ \subsection{Vectorized Probability Functions} } parameters { vector[K] beta; -} +} model { y ~ normal(x * beta, 1); } @@ -3848,7 +3895,7 @@ \subsection{Vectorized Probability Functions} % The variables are all declared as either matrix or vector types. The result of the matrix-vector multiplication \code{x * beta} in the -model block is a vector of the same length as \code{y}. +model block is a vector of the same length as \code{y}. The probability function documentation in \refpart{built-in-functions} indicates which of Stan's probability functions support @@ -3886,7 +3933,7 @@ \subsection{Reshaping Data for Vectorization} \end{stancode} % If there aren't many levels (values \code{ii[n]} can take), then it -behooves us to reorganize the data by group in a case like this. +behooves us to reorganize the data by group in a case like this. Rather than having a single observation vector \code{y}, there are K of them. And because Stan doesn't support ragged arrays, it means K declarations. For instance, with 5 levels, we have @@ -3922,7 +3969,7 @@ \section{Exploiting Sufficient Statistics} } model { theta ~ beta(alpha, beta); - for (n in 1:N) + for (n in 1:N) y[n] ~ bernoulli(theta); } \end{stancode} @@ -3938,7 +3985,7 @@ \section{Exploiting Sufficient Statistics} % Because truth is represented as one and falsehood as zero, the sum \code{sum(y)} of a binary vector \code{y} is equal to the number of -positive outcomes out of a total of \code{N} trials. +positive outcomes out of a total of \code{N} trials. This can be generalized to other discrete cases (one wouldn't expect continuous observations to be duplicated if they are random). Suppose @@ -3969,7 +4016,7 @@ \section{Aggregating Common Subexpressions} If an expression is calculated once, the value should be saved and reused wherever possible. That is, rather than using \code{exp(theta)} in multiple places, declare a local variable to -store its value and reuse the local variable. +store its value and reuse the local variable. Another case that may not be so obvious is with two multilevel parameters, say \code{a[ii[n]] + b[jj[n]]}. If \code{a} and \code{b} @@ -4035,8 +4082,8 @@ \section{Standardizing Predictors and Outputs} \] and the sample standard deviation of $y$ is \[ -\mbox{sd}(y) -= \left( +\mbox{sd}(y) += \left( \frac{1}{N} \sum_{n=1}^N (y_n - \bar{y})^2 \right)^{1/2}. \] @@ -4065,7 +4112,7 @@ \section{Standardizing Predictors and Outputs} } model { // priors - alpha ~ normal(0, 10); + alpha ~ normal(0, 10); beta ~ normal(0, 10); sigma ~ cauchy(0, 5); // likelihood @@ -4077,7 +4124,7 @@ \section{Standardizing Predictors and Outputs} The data block for the standardized model is identical. The standardized predictors and outputs are defined in the transformed -data block. +data block. 
% \begin{stancode} data { @@ -4097,11 +4144,11 @@ \section{Standardizing Predictors and Outputs} real sigma_std; } model { - alpha_std ~ normal(0, 10); + alpha_std ~ normal(0, 10); beta_std ~ normal(0, 10); sigma_std ~ cauchy(0, 5); for (n in 1:N) - y_std[n] ~ normal(alpha_std + beta_std * x_std[n], + y_std[n] ~ normal(alpha_std + beta_std * x_std[n], sigma_std); } \end{stancode} @@ -4118,7 +4165,7 @@ \section{Standardizing Predictors and Outputs} The original regression \[ -y_n +y_n = \alpha + \beta x_n + \epsilon_n \] has been transformed to a regression on the standardized variables, @@ -4131,66 +4178,66 @@ \section{Standardizing Predictors and Outputs} The original parameters can be recovered with a little algebra, % \begin{eqnarray*} -y_n +y_n & = & \mbox{z}_y^{-1}(\mbox{z}_y(y_n)) \\[4pt] -& = & -\mbox{z}_y^{-1} -\left( -\alpha' +& = & +\mbox{z}_y^{-1} +\left( +\alpha' + \beta' \mbox{z}_x(x_n) + \epsilon_n' \right) \\[4pt] -& = & -\mbox{z}_y^{-1} -\left( -\alpha' -+ \beta' - \left( +& = & +\mbox{z}_y^{-1} +\left( +\alpha' ++ \beta' + \left( \frac{x_n - \bar{x}}{\mbox{\small sd}(x)} \right) + \epsilon_n' \right) \\[4pt] -& = & +& = & \mbox{sd}(y) -\left( -\alpha' -+ \beta' - \left( +\left( +\alpha' ++ \beta' + \left( \frac{x_n - \bar{x}}{\mbox{\small sd}(x)} \right) + \epsilon_n' \right) + \bar{y} \\[4pt] -& = & -\left( - \mbox{sd}(y) - \left( - \alpha' +& = & +\left( + \mbox{sd}(y) + \left( + \alpha' - \beta' \frac{\bar{x}}{\mbox{\small sd}(x)} - \right) - + \bar{y} + \right) + + \bar{y} \right) + \left( - \beta' \frac{\mbox{\small sd}(y)}{\mbox{\small sd}(x)} + \beta' \frac{\mbox{\small sd}(y)}{\mbox{\small sd}(x)} \right) x_n + \mbox{sd}(y) \epsilon'_n, \end{eqnarray*} % from which the original scale parameter values can be read off, \[ -\alpha +\alpha = -\mbox{sd}(y) - \left( - \alpha' +\mbox{sd}(y) + \left( + \alpha' - \beta' \frac{\bar{x}}{\mbox{\small sd}(x)} - \right) + \right) + \bar{y}; -\ \ \ \ \ +\ \ \ \ \ \beta = \beta' \frac{\mbox{\small sd}(y)}{\mbox{\small sd}(x)}; \ \ \ \ \ \sigma = \mbox{sd}(y) \sigma'. @@ -4204,7 +4251,7 @@ \section{Standardizing Predictors and Outputs} real alpha; real beta; real sigma; - alpha = sd(y) * (alpha_std - beta_std * mean(x) / sd(x)) + alpha = sd(y) * (alpha_std - beta_std * mean(x) / sd(x)) + mean(y); beta = beta_std * sd(y) / sd(x); sigma = sd(y) * sigma_std; diff --git a/src/docs/stan-reference/stan-reference.tex b/src/docs/stan-reference/stan-reference.tex index 68296118ffa..e2e04cc879e 100644 --- a/src/docs/stan-reference/stan-reference.tex +++ b/src/docs/stan-reference/stan-reference.tex @@ -8,7 +8,7 @@ \pagestyle{plain} -\newcommand{\stanversion}{2.15.0} +\newcommand{\stanversion}{2.16} \frontmatter \include{title} diff --git a/src/docs/stan-reference/title.tex b/src/docs/stan-reference/title.tex index 52157d330b7..6406b1b4e88 100644 --- a/src/docs/stan-reference/title.tex +++ b/src/docs/stan-reference/title.tex @@ -86,12 +86,8 @@ \subsection*{Currently Active Developers} \\ {\footnotesize Stan Math, Stan, RStan} \item Jonah Sol Gabry \ (Columbia University) \\ {\footnotesize RStan, RStanArm, ShinyStan, Loo, BayesPlot} -\item Alp Kucukelbir \ (Columbia University, Fero Labs) -\\ {\footnotesize Stan, CmdStan} \item Robert L. 
Grant \ (Consultant, London) \\ {\footnotesize StataStan} -\item Dustin Tran \ (Columbia University) -\\ {\footnotesize Stan, CmdStan} \item Krzysztof Sakrejda \ (University of Massachusetts, Amherst) \\ {\footnotesize Stan Math, Stan} \item Aki Vehtari \ (Aalto University) \\ {\footnotesize Stan Math, @@ -110,6 +106,11 @@ \subsection*{Currently Active Developers} \\ {\footnotesize RStan, RStanArm} \item Sean Talts \ (Dual Space, LLC) \\ {\footnotesize Stan Math, Stan, dev ops} +\item Ben Bales \ (University of California, Santa Barbara) +\\ {\footnotesize Stan Math} +\item Ari Hartikainen \ (Aalto University) +\\ {\footnotesize PyStan} + \end{itemize} \subsection*{Development Team Alumni} @@ -118,13 +119,17 @@ \subsection*{Development Team Alumni} past, but are no longer contributing actively. \begin{itemize} -\item Matt Hoffman \ (Adobe Creative Technologies Lab) +\item Matt Hoffman \ (while at Columbia University) \\ {\footnotesize Stan Math, Stan, CmdStan} -\item Michael Malecki \ (Crunch.io, YouGov plc) +\item Michael Malecki \ (while at Columbia University) \\ {\footnotesize software and graphical design} -\item Peter Li \ (Columbia University) +\item Peter Li \ (while at Columbia University) \\ {\footnotesize Stan Math, Stan} -\item Yuanjun Guo \ (Columbia University) +\item Yuanjun Guo \ (while at Columbia University) \\ {\footnotesize Stan} +\item Alp Kucukelbir \ (while at Columbia University) +\\ {\footnotesize Stan, CmdStan} +\item Dustin Tran \ (while at Columbia University) +\\ {\footnotesize Stan, CmdStan} \end{itemize} From bcbc55e591c46f30ed80bf70f236ed31b846369c Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Mon, 19 Jun 2017 11:01:35 -0400 Subject: [PATCH 10/10] Fixing spelling errors --- src/docs/stan-reference/algorithms.tex | 4 +- src/docs/stan-reference/appendices.tex | 16 ++-- src/docs/stan-reference/distributions.tex | 8 +- src/docs/stan-reference/examples.tex | 2 +- src/docs/stan-reference/functions.tex | 6 +- src/docs/stan-reference/language.tex | 6 +- src/docs/stan-reference/process.tex | 94 +++++++++++------------ src/docs/stan-reference/programming.tex | 10 +-- 8 files changed, 71 insertions(+), 75 deletions(-) diff --git a/src/docs/stan-reference/algorithms.tex b/src/docs/stan-reference/algorithms.tex index f30b40d58d8..1b7bc6c0700 100644 --- a/src/docs/stan-reference/algorithms.tex +++ b/src/docs/stan-reference/algorithms.tex @@ -169,11 +169,11 @@ \subsection{Leapfrog Integrator} repetitions of the above three steps) will be denoted $(\rho^{*},\theta^{*})$. -The leapgrog integrator's error is on the order of $\epsilon^3$ per +The leapfrog integrator's error is on the order of $\epsilon^3$ per step and $\epsilon^2$ globally, where $\epsilon$ is the time interval (also known as the step size); \cite{LeimkuhlerReich:2004} provide a detailed analysis of numerical integration for Hamiltonian systems, -including a derivation of the error bound for the leapforg +including a derivation of the error bound for the leapfrog integrator. diff --git a/src/docs/stan-reference/appendices.tex b/src/docs/stan-reference/appendices.tex index 37e8655e2f3..b35cd6054f4 100644 --- a/src/docs/stan-reference/appendices.tex +++ b/src/docs/stan-reference/appendices.tex @@ -52,7 +52,7 @@ \section{Boost License} \end{quote} % The copyright for each Boost package is held by its developers or their -assginees. +assignees. \section{Eigen License} @@ -70,7 +70,7 @@ \section{Eigen License} \section{SUNDIALS License} Stan uses the SUNDIALS package for solving stiff differential -equations. 
SUNDIALS is distrubted under the new BSD (3-clause) license. +equations. SUNDIALS is distributed under the new BSD (3-clause) license. % \begin{quote} \url{https://opensource.org/licenses/BSD-3-Clause} @@ -583,7 +583,7 @@ \subsection{Syntactic Conventions} of \code{A}. The notation \code{A \% B}, following the Boost Spirit parser library's notation, is shorthand for \code{?(A (B A)*)}, i.e., any number of \code{A} (including zero), separated by \code{B}. A -postfixed, curly-braced number indicates a fixed number of repetions; +postfixed, curly-braced number indicates a fixed number of repetitions; e.g., \code{A\{6\}} is equivalent to a sequence of six copies of \code{A}. \subsection{Programs} @@ -862,7 +862,7 @@ \section{Warnings vs.\ Errors} errors. Error messages arise when a fatal problem occurs under the hood from which there is no way to recover. Warning messages arise in circumstances from which the underlying program can continue to -operaate. +operate. An example warning message is an informational message about ill-formed input to an underlying function, which results in a @@ -893,7 +893,7 @@ \section{Parsing and Compilation} \section{Initialization} \begin{itemize} -\item {\it vanishing density.} This message arrises when there is a +\item {\it vanishing density.} This message arises when there is a problem with initialization not providing a finite log probability. % \end{itemize} @@ -964,7 +964,7 @@ \section{\code{increment\_log\_prob} Statement} \section{\code{lp\_\_} Variable} \begin{description} -\item[Depcreated] +\item[Deprecated] The variable \code{lp\_\_} is available wherever log density increment statements are allowed (\code{target~+=} and \Verb|~| shorthand statements). @@ -1113,7 +1113,7 @@ \section{\code{if\_else} Function} otherwise. % \item[Replacement] Use the conditional operator which allows more - flexiblity in the types of \code{b} and \code{c} and is much more + flexibility in the types of \code{b} and \code{c} and is much more efficient in that it only evaluates whichever of \code{b} or \code{c} is returned. See \refsection{conditional-operator} for full details of the conditional operator. Replace @@ -1126,7 +1126,7 @@ \section{\code{if\_else} Function} \end{stancode} \end{description} -\section{\code{\#} Commens} +\section{\code{\#} Comments} \begin{description} \item[Deprecated] The use of \code{\#} for line-based comments is diff --git a/src/docs/stan-reference/distributions.tex b/src/docs/stan-reference/distributions.tex index d725717a94f..880b3119944 100644 --- a/src/docs/stan-reference/distributions.tex +++ b/src/docs/stan-reference/distributions.tex @@ -708,7 +708,7 @@ \chapter{Unbounded Discrete Distributions} \section{Negative Binomial Distribution} For the negative binomial distribution Stan uses the parameterization -described in \citet{GelmanEtAl:2013}. For alternative parametrizations, see \refsection{nbalt}. +described in \citet{GelmanEtAl:2013}. For alternative parameterizations, see \refsection{nbalt}. 
\subsubsection{Probability Mass Function} @@ -1480,7 +1480,7 @@ \subsubsection{Stan Functions} given location \farg{mu} and scale \farg{sigma}} % \fitem{real}{lognormal\_lcdf}{reals \farg{y} \textbar\ reals \farg{mu}, reals - \farg{sigma}}{The log of the lognormal cumulative distribution fucntion + \farg{sigma}}{The log of the lognormal cumulative distribution function of \farg{y} given location \farg{mu} and scale \farg{sigma}} % \fitem{real}{lognormal\_lccdf}{reals \farg{y} \textbar\ reals \farg{mu}, reals @@ -1911,7 +1911,7 @@ \subsubsection{Probability Density Function} \subsubsection{Stan Functions} \begin{description} \fitem{real}{rayleigh\_lpdf}{reals \farg{y} \textbar\ reals \farg{sigma}}{The log of the Rayleigh - ensity of \farg{y} given scale \farg{sigma}} + density of \farg{y} given scale \farg{sigma}} % \fitem{real}{rayleigh\_cdf}{real \farg{y}, real \farg{sigma}}{The Rayleigh cumulative distribution of \farg{y} given scale \farg{sigma}} @@ -2490,7 +2490,7 @@ \subsubsection{Probability Density Function} where $y_i$ is the $i$th row of $y$. This is used to efficiently handle Gaussian Processes with multi-variate outputs where only the output dimensions share a kernel function but vary based on their scale. If the model allows -parametrization in terms of Cholesky factor of the kernel matrix, this distribution +parameterization in terms of Cholesky factor of the kernel matrix, this distribution is also more efficient than $\distro{MultiGP}()$. Note that this function does not take into account the mean prediction. diff --git a/src/docs/stan-reference/examples.tex b/src/docs/stan-reference/examples.tex index 45301e8c07c..ecc58c2c25f 100644 --- a/src/docs/stan-reference/examples.tex +++ b/src/docs/stan-reference/examples.tex @@ -1581,7 +1581,7 @@ \subsubsection{Optimization through Vectorization} ... \end{stancode} % -thsi fails because it breaks the vectorization of sampling for +this fails because it breaks the vectorization of sampling for \code{beta},% % \footnote{Thanks to Mike Lawrence for pointing this out in the GitHub diff --git a/src/docs/stan-reference/functions.tex b/src/docs/stan-reference/functions.tex index 66cf89f43bf..6fa9f55d01e 100644 --- a/src/docs/stan-reference/functions.tex +++ b/src/docs/stan-reference/functions.tex @@ -376,7 +376,7 @@ \section{Log Probability Function}\label{get-lp.section} various ways by a Stan program. The variables are first transformed from unconstrained to constrained, and the log Jacobian determinant added to the log probability accumulator. Then the model block is -executed on the constrained parmeters, with each sampling statement +executed on the constrained parameters, with each sampling statement (\Verb|~|) and log probability increment statement (\code{increment\_log\_prob}) adding to the accumulator. At the end of the model block execution, the value of the log probability @@ -2474,7 +2474,7 @@ \section{Diagonal Matrix Functions} \end{description} Although the \code{diag\_matrix} function is available, it is unlikely -to ever show up in an efficient Stan program. For exmaple, rather +to ever show up in an efficient Stan program. For example, rather than converting a diagonal to a full matrix for use as a covariance matrix, % @@ -2493,7 +2493,7 @@ \section{Diagonal Matrix Functions} Rather than writing \code{m~*~diag\_matrix(v)} where \code{m} is a matrix and \code{v} is a vector, it is much more efficient to write \code{diag\_post\_multiply(m,~v)} (and similarly for pre-multiplication). 
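As a minimal sketch, with \code{m} a $K \times K$ matrix and \code{v} a
$K$-vector as in the prose above (the local variable \code{mv} is only
for illustration):
%
\begin{stancode}
{
  matrix[K, K] mv;
  // materializes a full K x K diagonal matrix before multiplying
  mv = m * diag_matrix(v);
  // scales the columns of m directly; no diagonal matrix is built
  mv = diag_post_multiply(m, v);
}
\end{stancode}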
-By the same topken, it is better to use \code{quad\_form\_diag(m,~v)} +By the same token, it is better to use \code{quad\_form\_diag(m,~v)} rather than \code{quad\_form(m,~diag\_matrix(v))}. diff --git a/src/docs/stan-reference/language.tex b/src/docs/stan-reference/language.tex index 762e1b22161..e600fa1421a 100644 --- a/src/docs/stan-reference/language.tex +++ b/src/docs/stan-reference/language.tex @@ -753,7 +753,7 @@ \subsection{Cholesky Factors of Correlation Matrices} lower-triangular matrix with positive diagonal entries and rows that are of length 1 (i.e., $\sum_{n=1}^K L_{m,n}^2 = 1$). If $L$ is a Cholesky factor for a correlation matrix, then $L\,L^{\top}$ is a -correlation matrix (i.e., symmetric postive definite with a unit +correlation matrix (i.e., symmetric positive definite with a unit diagonal). A declaration such as @@ -3039,7 +3039,7 @@ \subsection{User-Transformed Variables} \end{stancode} % Unfortunately, this is not enough to properly model \code{beta} as -having a lognormal distribution. Whenever a nonliner transform is +having a lognormal distribution. Whenever a nonlinear transform is applied to a parameter, such as the logarithm function being applied to \code{beta} here, and then used on the left-hand side of a sampling statement or on the left of a vertical bar in a log pdf function, an @@ -3155,7 +3155,7 @@ \subsubsection{Complementary cumulative distribution functions} Unlike the cdf, the ccdf is exclusive of the bound, hence the event $X > x$ rather than the cdf's event $X \leq x$. -For continous distributions, the ccdf works out to +For continuous distributions, the ccdf works out to \[ F^C_X(x) \ = \ 1 - \int_{-\infty}^x p_X(u) \, du diff --git a/src/docs/stan-reference/process.tex b/src/docs/stan-reference/process.tex index 40afd13ba40..5c9c34df1b6 100644 --- a/src/docs/stan-reference/process.tex +++ b/src/docs/stan-reference/process.tex @@ -45,7 +45,7 @@ \section{Make it Reproducible} through the models and produce whatever posterior analysis you need. Scripts can be written for the shell, R, or Python. Whatever language a script is in, it should be self contained and not depend on global -variables having been set, other data being read in, etc. +variables having been set, other data being read in, etc. See \refchapter{reproducibility} for complete information on reproducibility in Stan and its interfaces. @@ -54,7 +54,7 @@ \subsection{Scripts are Good Documentation} It may seem like overkill if running the project is only a single line of code, but the script provides not only a way to run the code, but -also a form of concrete documentation for what is run. +also a form of concrete documentation for what is run. \subsection{Randomization and Saving Seeds} @@ -87,7 +87,7 @@ \section{Make it Readable} developer will want to read it later. One of the motivations of Stan's design was to make models self-documenting in terms of variable usage (e.g., data versus parameter), types (e.g., covariance matrix -vs. unconstrained matrix) and sizes. +vs. unconstrained matrix) and sizes. A large part of readability is consistency. Particularly in naming and layout. Not only of programs themselves, but the directories and @@ -123,13 +123,13 @@ \section{Design Top-Down, Code Bottom-Up} Software projects are almost always designed top-down from one or more intended use cases. Good software coding, on the other hand, is -typically done bottom-up. +typically done bottom-up. The motivation for top-down design is obvious. 
The motivation for bottom-up development is that it is much easier to develop software using components that have been thoroughly tested. Although Stan has no built-in support for either modularity or testing, many of the same -principles apply. +principles apply. The way the developers of Stan themselves build models is to start as simply as possibly, then build up. This is true even if we have a @@ -147,12 +147,12 @@ \section{Fit Simulated Data} computationally is to generate simulated (i.e., ``fake'') data with known parameter values, then see if the model can recover these parameters from the data. If not, there is very little hope that it -will do the right thing with data from the wild. +will do the right thing with data from the wild. There are fancier ways to do this, where you can do things like run $\chi^2$ tests on marginal statistics or follow the paradigm introduced in \citep{CookGelmanRubin:2006}, which involves interval -tests. +tests. \section{Debug by Print} @@ -162,7 +162,7 @@ \section{Debug by Print} % \footnote{The ``f'' is not a typo --- it's a historical artifact of the name of the \code{printf} function used for formatted printing - in C.} + in C.} Stan supports print statements with one or more string or expression arguments. Because Stan is an imperative language, variables can have @@ -206,7 +206,7 @@ \subsection{Code Never Lies} The machine does what the code says, not what the documentation says. Documentation, on the other hand, might not match the code. Code documentation easily rots as the code evolves if the documentation is -not well maintained. +not well maintained. Thus it is always preferable to write readable code as opposed to documenting unreadable code. Every time you write a piece of @@ -225,8 +225,8 @@ \subsection{Comment Styles in Stan} \subsection{What Not to Comment} -When commenting code, it is usually safe to assume that you are -writing the comments for other programmers who understand the basics +When commenting code, it is usually safe to assume that you are +writing the comments for other programmers who understand the basics of the programming language in use. In other words, don't comment the obvious. For instance, there is no need to have comments such as the following, which add nothing to the code. @@ -392,19 +392,19 @@ \section{Source Code Management} \includegraphics[height=4.5in]{img/git-process.png} \end{center} \caption{\small\it Git branching process for master - and development branches. New features and ordinary (not hot) + and development branches. New features and ordinary (not hot) bugfixes are developed in branches from and merged back into the development branch. These are then collected into releases and merged with the master branch, which tracks the releases. Hotfix branches are like feature or ordinary - bugfix branches, but branch from master and merge back into master. + bugfix branches, but branch from master and merge back into master. Image courtesy of \citep{Driessen:2010}.}\label{git-process.figure} \end{figure} % The basic Git process for branching, releasing, hotfixing, and merging follows standard Git procedure \citep{Driessen:2010}. A diagram outlining the process is presented in \reffigure{git-process}. The key idea is that the master branch is always at the latest release, with -older commits tagged for previous releases. +older commits tagged for previous releases. The development branch always represents the current state of development. 
Feature and bugfix branches branch from the development @@ -459,7 +459,7 @@ \subsection{Unit Testing} \url{http://cran.r-project.org/web/packages/RUnit/index.html}.} The point of unit testing is to test the program at the application -programmer interface (API) level, not the end-to-end functional level. +programmer interface (API) level, not the end-to-end functional level. The tests are run automatically when pull requests are created through a continuous integration process. PyStan uses the Travis continuous @@ -491,7 +491,7 @@ \subsection{Unit Testing} archive (zip file) or may be cloned for development under Git. Cloning in Git provides a complete copy of all version history, including every commit to every branch since the beginning of the -project. +project. User feedback is accommodated through three channels. First, and most formally, there is an issue tracker for each of Stan C++, CmdStan, @@ -517,7 +517,7 @@ \subsection{Functional Testing} undergoes rigorous end-to-end testing of its model fitting functionality. Models with known answers are evaluated for both speed and accuracy. Models without analytic solutions are evaluated in terms -of MCMC error. +of MCMC error. \section{Release Cycles} @@ -538,7 +538,7 @@ \section{Release Cycles} Stan is released exclusively as source code, so nothing needs to be done with respect to binary release management or compatibility. The source is tested so that it can be used under Windows, Mac OS X, and -Linux. +Linux. Instructions for installing Stan C++, CmdStan, RStan, and PyStan are managed separately and distributed with the associated product. @@ -575,7 +575,7 @@ \subsection{Minor Version and Forward Compatibility} feature without breaking backward compatibility, its minor version number is incremented. Whenever the minor version is incremented, the patch level reverts to 0. Most Stan releases increment the minor -version. +version. \subsection{Bug Fixes and Patch Releases} @@ -616,12 +616,12 @@ \section{Availability of Current and Historical Archive Versions} first cloning a repository, it may be kept up to date using Git's pull command (available from the command-line or platform-specific graphical user interfaces). An alternative delivery mechanism is as -a zip archive of a snapshot of the system. +a zip archive of a snapshot of the system. \section{Maintenance, Support, and Retirement} Stan support extends only to the most current release. Specifically, -patches are not backported to older versions. +patches are not backported to older versions. Early fixes of bugs are available to users in the form of updated development branches. Snapshots of the entire code base at every @@ -675,7 +675,7 @@ \section{Qualified Personnel} The members of the Stan development team are drawn from multiple computational, scientific, and statistical disciplines across -academic, not-for-profit, and industrial laboratories. +academic, not-for-profit, and industrial laboratories. Most of Stan's developers have Ph.D. degrees, some have Master's degrees, and some are currently enrolled as undergraduate or graduate @@ -693,7 +693,7 @@ \section{Qualified Personnel} Institutions at which the members of the Stan development team hold appointments as of Stan release 2.15.0 include Columbia University, Adobe Creative Technologies Lab, University of Warwick, University of -Toronto (Scarborough), Dartmouth Colloge, University of Washington, +Toronto (Scarborough), Dartmouth College, University of Washington, Lucidworks, CNRS (Paris), St. 
George's, University of London, University of Massachussetts (Amherst), Aalto University, and Novartis Pharma. @@ -734,7 +734,7 @@ \section{Disaster Recovery} The entire history of the Stan C++, CmdStan, RStan, and PyStan repositories is maintained on the GitHub servers as well as on each developer's individual copy of the repository. Specifically, each -repository can be reconstituted from any of the core +repository can be reconstituted from any of the core developers' machines. @@ -788,7 +788,7 @@ \chapter{Reproducibility}\label{reproducibility.chapter} you are running in RStan, Rcpp handles the conversion between R's floating point numbers and C++ doubles. If Rcpp changes the conversion process or use different types, the results are not guaranteed to be -the same down to the bit level. +the same down to the bit level. The compiler and compiler settings can also be an issue. There is a nice discussion of the issues and how to control reproducibility in @@ -799,7 +799,7 @@ \chapter{Reproducibility}\label{reproducibility.chapter} \chapter{Contributed Modules} \noindent -Stan is an open-source project and welcomes user contributions. +Stan is an open-source project and welcomes user contributions. In order to reduce maintenance on the main trunk of Stan development and to allow developer-specified licenses, contributed Stan modules @@ -812,10 +812,10 @@ \section{Contributing a Stan Module} Stan developers either through one of the following. % \begin{itemize} -\item \code{stan-users} mailing list: +\item \code{stan-users} mailing list: \\ \url{https://groups.google.com/forum/?fromgroups#!forum/stan-users} -\item Stan e-mail: +\item Stan e-mail: \\ \href{mailto:mc.stanislaw@gmail.com}{\tt mc.stanislaw@gmail.com} \end{itemize} @@ -889,7 +889,7 @@ \section{Line Length} \section{File Extensions} -The recommended file extension for Stan model files is \code{.stan}. +The recommended file extension for Stan model files is \code{.stan}. For Stan data dump files, the recommended extension is \code{.R}, or more informatively, \code{.data.R}. @@ -947,7 +947,7 @@ \section{Local Variable Scope} } } \end{stancode} -% +% The local variable can be eliminated altogether, as follows. % \begin{stancode} @@ -974,7 +974,7 @@ \subsubsection{Scope of Compound Structures with Componentwise Assignment} model { vector[K] mu; for (n in 1:N) { - for (k in 1:K) + for (k in 1:K) mu[k] = ...; y[n] ~ multi_normal(mu,Sigma); } @@ -1002,7 +1002,7 @@ \subsection{Optional Parentheses for Single-Statement Blocks} for (n in 1:N) y[n] ~ normal(mu,1); \end{stancode} -% +% Single-statement blocks can also be written on a single line, as in the following example. % @@ -1050,7 +1050,7 @@ \subsection{Parentheses in Nested Operator Expressions} Similarly, comparison operators can usually be written with minimal bracketing, with the form \code{y[n] > 0 || x[n] != 0} preferred to -the bracketed form \code{(y[n] > 0) || (x[n] != 0)}. +the bracketed form \code{(y[n] > 0) || (x[n] != 0)}. \subsection{No Open Brackets on Own Line} @@ -1059,13 +1059,13 @@ \subsection{No Open Brackets on Own Line} section, not as follows. % \begin{stancode} -for (n in 1:N) +for (n in 1:N) { y[n] ~ normal(mu,1); } \end{stancode} % -This also goes for parameters blocks, transformed data blocks, +This also goes for parameters blocks, transformed data blocks, which should look as follows. 
% \begin{stancode} @@ -1140,9 +1140,9 @@ \section{Functions} % \begin{stancode} /** - * Return a data matrix of specified size with rows - * corresponding to items and the first column filled - * with the value 1 to represent the intercept and the + * Return a data matrix of specified size with rows + * corresponding to items and the first column filled + * with the value 1 to represent the intercept and the * remaining columns randomly filled with unit-normal draws. * * @param N Number of rows correspond to data items @@ -1150,7 +1150,7 @@ \section{Functions} * item. * @return Simulated predictor matrix. */ -matrix predictors_rng(int N, int K) { +matrix predictors_rng(int N, int K) { ... \end{stancode} % @@ -1158,7 +1158,7 @@ \section{Functions} asterisk of the open comment, and the end comment \code{*/} is also aligned on the asterisk. The tags \code{@param} and \code{@return} are used to label function arguments (i.e., parameters) and return -values. +values. \section{White Space} @@ -1192,14 +1192,14 @@ \subsection{No Tabs} may be used anywhere other whitespace occurs. Using tabs to layout a program is highly unportable because the number of spaces represented by a single tab character varies depending on which -program is doing the rendering and how it is configured. +program is doing the rendering and how it is configured. \subsection{Two-Character Indents} Stan has standardized on two space characters of indentation, which is the standard convention for C/C++ code. Another sensible choice is four spaces, which is the convention for Java and Python. Just be -consistent. +consistent. \subsection{Space Between \code{if} and Condition} @@ -1236,7 +1236,7 @@ \subsection{Breaking Expressions across Lines} \end{stancode} % Here, the multiplication operator (\code{*}) is aligned to clearly -signal the multiplicands in the product. +signal the multiplicands in the product. For function arguments, break after a comma and line the next argument up underneath as follows. @@ -1257,10 +1257,10 @@ \subsection{Optional Spaces after Commas} \subsection{Unix Newlines} -Wherever possible, Stan programs should use a single line feed -character to separate lines. All of the Stan developers (so far, at -least) work on Unix-like operating systems and using a standard -newline makes the programs easier for us to read and share. +Wherever possible, Stan programs should use a single line feed +character to separate lines. All of the Stan developers (so far, at +least) work on Unix-like operating systems and using a standard +newline makes the programs easier for us to read and share. \subsubsection{Platform Specificity of Newlines} diff --git a/src/docs/stan-reference/programming.tex b/src/docs/stan-reference/programming.tex index 043da3c2492..02c60f2f278 100644 --- a/src/docs/stan-reference/programming.tex +++ b/src/docs/stan-reference/programming.tex @@ -536,12 +536,12 @@ \subsection{Varying Lower Bounds} The Jacobian for adding a constant is one, so its log drops out of the log density. -Even if the lower bound is a paramter rather than data, there is no +Even if the lower bound is a parameter rather than data, there is no Jacobian required, because the transform from $(L, \alpha_{\mathrm raw})$ to $(L + \alpha_{\mathrm raw}, \alpha_{\mathrm raw}$ produces a Jacobian derivative matrix with a unit determinant. -It's also possible implement the transform by directly trnasforming +It's also possible implement the transform by directly transforming an unconstrained parameter and accounting for the Jacobian. 
% \begin{stancode} @@ -573,7 +573,7 @@ \subsection{Varying Lower Bounds} \subsection{Varying Upper and Lower Bounds} -Suppowe there are lower and upper bounds that vary by parameter. +Suppose there are lower and upper bounds that vary by parameter. These can be applied to shift and rescale a parameter constrained to $(0, 1)$. % @@ -4266,7 +4266,3 @@ \section{Standardizing Predictors and Outputs} \begin{stancode} y_std ~ normal(alpha_std + beta_std * x_std, sigma_std); \end{stancode} - - - -