diff --git a/.gitignore b/.gitignore
index 9167fd029..73e3a6ff6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,5 @@ data/cranfield
 biicode.conf
 bii/
 bin/
+*.pro
+*.pro.user
\ No newline at end of file
diff --git a/include/meta/index/disk_index.h b/include/meta/index/disk_index.h
index 12b699997..6f1f25ad9 100644
--- a/include/meta/index/disk_index.h
+++ b/include/meta/index/disk_index.h
@@ -87,6 +87,19 @@ class disk_index
      */
     std::vector docs() const;
 
+    /**
+     * @return a vector of term_ids that are contained in this index
+     */
+    std::vector<term_id> terms() const;
+
+    /**
+     * @param d_id The document to look at
+     * @return unique_terms(d_id) consecutive term_ids, starting at 0
+     * NOTE(review): these are not necessarily the ids of the terms that
+     * occur in d_id; confirm whether callers expect the actual term ids.
+     */
+    std::vector<term_id> terms(doc_id d_id) const;
+
     /**
      * @param d_id The document to search for
      * @return the size of the given document (the total number of terms
diff --git a/include/meta/index/ranker/all.h b/include/meta/index/ranker/all.h
index 8a1fe0e04..3b3c1efcf 100644
--- a/include/meta/index/ranker/all.h
+++ b/include/meta/index/ranker/all.h
@@ -1,6 +1,7 @@
 #include "meta/index/ranker/ranker.h"
 #include "meta/index/ranker/absolute_discount.h"
 #include "meta/index/ranker/dirichlet_prior.h"
+#include "meta/index/ranker/dirichlet_prior_opt.h"
 #include "meta/index/ranker/jelinek_mercer.h"
 #include "meta/index/ranker/lm_ranker.h"
 #include "meta/index/ranker/okapi_bm25.h"
diff --git a/include/meta/index/ranker/dirichlet_prior.h b/include/meta/index/ranker/dirichlet_prior.h
index dfb5aef42..6f2456f8a 100644
--- a/include/meta/index/ranker/dirichlet_prior.h
+++ b/include/meta/index/ranker/dirichlet_prior.h
@@ -1,82 +1,91 @@
-/**
- * @file dirichlet_prior.h
- * @author Sean Massung
- *
- * All files in META are released under the MIT license. For more details,
- * consult the file LICENSE in the root of the project.
- */
-
-#ifndef META_DIRICHLET_PRIOR_H_
-#define META_DIRICHLET_PRIOR_H_
-
-#include "meta/index/ranker/lm_ranker.h"
-#include "meta/index/ranker/ranker_factory.h"
-
-namespace meta
-{
-namespace index
-{
-
-/**
- * Implements Bayesian smoothing with a Dirichlet prior.
- *
- * Required config parameters:
- * ~~~toml
- * [ranker]
- * method = "dirichlet-prior"
- * ~~~
- *
- * Optional config parameters:
- * ~~~toml
- * mu = 2000.0
- * ~~~
-
- */
-class dirichlet_prior : public language_model_ranker
-{
-  public:
-    /// Identifier for this ranker.
-    const static util::string_view id;
-
-    /// Default value of mu
-    const static constexpr float default_mu = 2000.0f;
-
-    /**
-     * @param mu
-     */
-    dirichlet_prior(float mu = default_mu);
-
-    /**
-     * Loads a dirichlet_prior ranker from a stream.
-     * @param in The stream to read from
-     */
-    dirichlet_prior(std::istream& in);
-
-    void save(std::ostream& out) const override;
-
-    /**
-     * Calculates the smoothed probability of a term.
-     * @param sd score_data for the current query
-     */
-    float smoothed_prob(const score_data& sd) const override;
-
-    /**
-     * A document-dependent constant.
-     * @param sd score_data for the current query
-     */
-    float doc_constant(const score_data& sd) const override;
-
-  private:
-    /// the Dirichlet prior parameter
-    const float mu_;
-};
-
-/**
- * Specialization of the factory method used to create dirichlet_prior
- * rankers.
- */
-template <>
-std::unique_ptr make_ranker(const cpptoml::table&);
-}
-}
-#endif
+/**
+ * @file dirichlet_prior.h
+ * @author Sean Massung
+ *
+ * All files in META are released under the MIT license. For more details,
+ * consult the file LICENSE in the root of the project.
+ */
+
+#ifndef META_DIRICHLET_PRIOR_H_
+#define META_DIRICHLET_PRIOR_H_
+
+#include "meta/index/ranker/lm_ranker.h"
+#include "meta/index/ranker/ranker_factory.h"
+
+namespace meta
+{
+namespace index
+{
+
+/**
+ * Implements Bayesian smoothing with a Dirichlet prior.
+ *
+ * Required config parameters:
+ * ~~~toml
+ * [ranker]
+ * method = "dirichlet-prior"
+ * ~~~
+ *
+ * Optional config parameters:
+ * ~~~toml
+ * mu = 2000.0
+ * ~~~
+
+ */
+class dirichlet_prior : public language_model_ranker
+{
+  public:
+    /// Identifier for this ranker.
+    const static util::string_view id;
+
+    /// Default value of mu
+    const static constexpr float default_mu = 2000.0f;
+
+    /**
+     * @param mu
+     */
+    dirichlet_prior(float mu = default_mu);
+
+    /**
+     * Loads a dirichlet_prior ranker from a stream.
+     * @param in The stream to read from
+     */
+    dirichlet_prior(std::istream& in);
+
+    void save(std::ostream& out) const override;
+
+    /**
+     * Calculates the smoothed probability of a term.
+     * @param sd score_data for the current query
+     */
+    float smoothed_prob(const score_data& sd) const override;
+
+    /**
+     * A document-dependent constant.
+     * @param sd score_data for the current query
+     */
+    float doc_constant(const score_data& sd) const override;
+
+    /**
+     * @return the current value of the Dirichlet prior parameter mu
+     */
+    float parameter() const
+    {
+        return mu_;
+    }
+
+  protected:
+    /// the Dirichlet prior parameter; protected and non-const so that
+    /// derived rankers can overwrite it with an optimized value
+    float mu_;
+};
+
+/**
+ * Specialization of the factory method used to create dirichlet_prior
+ * rankers.
+ */
+template <>
+std::unique_ptr<ranker> make_ranker<dirichlet_prior>(const cpptoml::table&);
+}
+}
+#endif
diff --git a/include/meta/index/ranker/dirichlet_prior_opt.h b/include/meta/index/ranker/dirichlet_prior_opt.h
new file mode 100644
index 000000000..351b01b2a
--- /dev/null
+++ b/include/meta/index/ranker/dirichlet_prior_opt.h
@@ -0,0 +1,289 @@
+/**
+ * @file dirichlet_prior_opt.h
+ * @author Aleksey Marashov, Kolomiets Maxim
+ *
+ * All files in META are released under the MIT license. For more details,
+ * consult the file LICENSE in the root of the project.
+ */
+
+#ifndef META_DIRICHLET_PRIOR_OPT_H_
+#define META_DIRICHLET_PRIOR_OPT_H_
+
+#include "meta/index/ranker/dirichlet_prior.h"
+
+#include <map>
+#include <utility>
+#include <vector>
+
+namespace meta
+{
+namespace index
+{
+
+/// Integer type used for document sizes, term frequencies and document counts
+using count_d = long;
+
+/**
+ * Bundle of corpus statistics that is handed to the mu optimizers.
+ * Holds a reference to the index, so it must not outlive that index.
+ */
+struct docs_data
+{
+    /// inverted index
+    const inverted_index& idx;
+    /// ids of all documents in the index
+    std::vector<doc_id> doc_ids;
+    /// ids of all terms in the index
+    std::vector<term_id> term_ids;
+    /// total size of all documents
+    count_d ref_size;
+    /// C_.(n): maps a document size n to the number of documents of that size
+    std::map<count_d, count_d> docs_counts;
+    /// C_k(n): for each term k, maps a count n to the number of documents
+    /// in which k occurs n times
+    std::map<term_id, std::map<count_d, count_d>> terms_docs_counts;
+    /// vector alpha_m: starting value [alpha * m_k] for each term k
+    std::map<term_id, double> alpha_m;
+
+    /**
+     * Stores the index reference and takes ownership of all containers.
+     * The containers are taken by value and moved into place, so callers
+     * that pass rvalues pay for no copy at all.
+     *
+     * @param p_idx The index that is being used
+     * @param p_doc_ids ids of all docs
+     * @param p_term_ids ids of all terms
+     * @param p_ref_size total size of all documents
+     * @param p_docs_counts C_.(n)
+     * @param p_terms_docs_counts C_k(n)
+     * @param p_alpha_m starting vector alpha_m
+     */
+    docs_data(const inverted_index& p_idx, std::vector<doc_id> p_doc_ids,
+              std::vector<term_id> p_term_ids, count_d p_ref_size,
+              std::map<count_d, count_d> p_docs_counts,
+              std::map<term_id, std::map<count_d, count_d>> p_terms_docs_counts,
+              std::map<term_id, double> p_alpha_m)
+        : idx(p_idx), // gcc no non-const ref init from brace init list
+          doc_ids(std::move(p_doc_ids)),
+          term_ids(std::move(p_term_ids)),
+          ref_size{p_ref_size},
+          docs_counts(std::move(p_docs_counts)),
+          terms_docs_counts(std::move(p_terms_docs_counts)),
+          alpha_m(std::move(p_alpha_m))
+    {
+        /* nothing */
+    }
+};
+
+
+/**
+ * Abstract class for Dirichlet prior smoothing with optimized constant mu.
+ * Constant mu is optimized at the stage of scoring documents using information about those documents.
+ *
+ * The virtual method optimize_mu(docs_data& dd, float eps, int max_iter) must be overridden by derived classes.
+ */
+class dirichlet_prior_opt : public dirichlet_prior
+{
+  public:
+    /**
+     * @param mu value forwarded to dirichlet_prior
+     */
+    explicit dirichlet_prior_opt(float mu) : dirichlet_prior(mu)
+    {
+    }
+
+    /**
+     * Loads the ranker state from a stream by forwarding to dirichlet_prior.
+     * @param in The stream to read from
+     */
+    explicit dirichlet_prior_opt(std::istream& in) : dirichlet_prior(in)
+    {
+    }
+
+    /**
+     * Optimizes mu on the given index and then delegates to ranker::score.
+     *
+     * NOTE(review): this is a member template, so it cannot override
+     * anything; a call made through a ranker& or dirichlet_prior& binds to
+     * the base score and skips the optimization. Confirm that callers use
+     * the concrete type.
+     *
+     * @param idx The index to score against
+     * @param begin Start of the query range, passed through to ranker::score
+     * @param end End of the query range, passed through to ranker::score
+     * @param num_results Passed through to ranker::score
+     * @return whatever ranker::score returns
+     */
+    template <class ForwardIterator>
+    std::vector<search_result> score(inverted_index& idx,
+                                     ForwardIterator begin,
+                                     ForwardIterator end,
+                                     uint64_t num_results = 10)
+    {
+        // optimize mu before scoring
+        this->optimize_mu(idx);
+
+        return ranker::score(idx, begin, end, num_results);
+    }
+
+    /**
+     * Runs the optimizer on idx; as a side effect mu_ is overwritten.
+     *
+     * @param idx inverted index
+     * @param eps convergence precision
+     * @param max_iter maximal number of iterations (upper bound)
+     * @return optimal value [alpha * m_i] for each term
+     */
+    std::map<term_id, double> get_optimized_mu(const inverted_index& idx,
+                                               float eps, int max_iter)
+    {
+        return optimize_mu(idx, eps, max_iter);
+    }
+
+  protected:
+    /**
+     * @param alpha_m per-term values [alpha * m_k]
+     * @return the sum of all values in alpha_m
+     */
+    double get_alpha(const std::map<term_id, double>& alpha_m)
+    {
+        double alpha = 0;
+
+        // taken by const reference: this runs once per optimizer iteration
+        for (const auto& alpha_m_k : alpha_m)
+        {
+            alpha += alpha_m_k.second;
+        }
+
+        return alpha;
+    }
+
+  private:
+    /**
+     * Extracts the information needed to find the optimal mu, wraps it into
+     * docs_data and calls the class-specific optimize_mu overload.
+     * The optimal value of mu found there is written to mu_.
+     *
+     * The starting vector is built from default_mu, not from the current
+     * mu_. If the index has no documents or all documents are empty,
+     * nothing is optimized, mu_ is left untouched and an empty map is
+     * returned.
+     *
+     * Cost: one term_freq lookup per (document, term) pair.
+     *
+     * @param idx inverted index
+     * @param eps convergence precision
+     * @param max_iter maximal number of iterations (upper bound)
+     *
+     * @return optimal value [alpha * m_i] for each term
+     */
+    std::map<term_id, double> optimize_mu(const inverted_index& idx,
+                                          float eps = 1e-6,
+                                          int max_iter = 10000)
+    {
+        auto docs_ids = idx.docs();
+        auto terms_ids = idx.terms();
+
+        // total size of all documents
+        count_d ref_size = 0;
+        // C_.(n) and C_k(n)
+        std::map<count_d, count_d> docs_counts;
+        std::map<term_id, std::map<count_d, count_d>> terms_docs_counts;
+
+        for (const auto& d_id : docs_ids)
+        {
+            const count_d doc_size = idx.doc_size(d_id);
+            ref_size += doc_size;
+
+            // one more document with this size (C_.(n))
+            docs_counts[doc_size] += 1;
+
+            for (const auto& t_id : terms_ids)
+            {
+                const count_d doc_term_freq = idx.term_freq(t_id, d_id);
+
+                // one more document with this count of t_id (C_k(n))
+                terms_docs_counts[t_id][doc_term_freq] += 1;
+            }
+        }
+
+        // without any term occurrence the starting vector would be 0/0
+        if (ref_size <= 0)
+            return {};
+
+        // fill start vector alpha_m
+        std::map<term_id, double> alpha_m;
+
+        for (const auto& t_id : terms_ids)
+        {
+            alpha_m[t_id] = idx.total_num_occurences(t_id) * default_mu;
+            alpha_m[t_id] /= static_cast<double>(ref_size);
+        }
+
+        // create docs_data
+        docs_data dd{idx,         docs_ids,          terms_ids, ref_size,
+                     docs_counts, terms_docs_counts, alpha_m};
+
+        // call optimizer
+        return optimize_mu(dd, eps, max_iter);
+    }
+
+    /**
+     * Finds the optimal mu using the information in the given docs_data.
+     * Implementations write the optimal mu to mu_.
+     *
+     * @param dd corpus statistics and starting vector alpha_m
+     * @param eps convergence precision
+     * @param max_iter maximal number of iterations (upper bound)
+     *
+     * @return optimal value [alpha * m_i] for each term
+     */
+    virtual std::map<term_id, double> optimize_mu(docs_data& dd, float eps,
+                                                  int max_iter) = 0;
+};
+
+/**
+ * Implements Dirichlet Prior smoothing with optimized constant mu.
+ *
+ * Optimization method is Fixed-Point Iteration with digamma recurrence relation
+ * described at: https://people.cs.umass.edu/~wallach/theses/wallach_phd_thesis.pdf, pp. 27-28.
+ *
+ * Required config parameters:
+ * ~~~toml
+ * [ranker]
+ * method = "dirichlet-digamma-rec"
+ * ~~~
+ */
+class dirichlet_digamma_rec : public dirichlet_prior_opt
+{
+  public:
+    /// Identifier for this ranker.
+    const static util::string_view id;
+
+    /**
+     * @param mu
+     */
+    dirichlet_digamma_rec(float mu = default_mu);
+
+    /**
+     * Loads a dirichlet_digamma_rec ranker from a stream.
+     * @param in The stream to read from
+     */
+    dirichlet_digamma_rec(std::istream& in);
+
+    void save(std::ostream& out) const override;
+
+  private:
+    std::map<term_id, double> optimize_mu(docs_data& dd, float eps,
+                                          int max_iter) override;
+};
+
+/**
+ * Implements Dirichlet Prior smoothing with optimized constant mu.
+ *
+ * Optimization method is Fixed-Point Iteration with digamma differences log approximation
+ * described at: https://people.cs.umass.edu/~wallach/theses/wallach_phd_thesis.pdf, pp. 28-29.
+ *
+ * Required config parameters:
+ * ~~~toml
+ * [ranker]
+ * method = "dirichlet-log-approx"
+ * ~~~
+ *
+ * Optional config parameters:
+ * ~~~toml
+ * mu = 2000.0 # value held until score() replaces it with the optimized one
+ * ~~~
+ */
+class dirichlet_log_approx : public dirichlet_prior_opt
+{
+  public:
+    /// Identifier for this ranker.
+    const static util::string_view id;
+
+    /**
+     * @param mu
+     */
+    dirichlet_log_approx(float mu = default_mu);
+
+    /**
+     * Loads a dirichlet_log_approx ranker from a stream.
+     * @param in The stream to read from
+     */
+    dirichlet_log_approx(std::istream& in);
+
+    void save(std::ostream& out) const override;
+
+  private:
+    std::map<term_id, double> optimize_mu(docs_data& dd, float eps,
+                                          int max_iter) override;
+};
+
+/**
+ * Implements Dirichlet Prior smoothing with optimized constant mu.
+ *
+ * Optimization method is MacKay and Peto's Fixed-Point Iteration with efficiently computing N_fk
+ * described at: https://people.cs.umass.edu/~wallach/theses/wallach_phd_thesis.pdf, p. 30.
+ *
+ * Required config parameters:
+ * ~~~toml
+ * [ranker]
+ * method = "dirichlet-mackay-peto"
+ * ~~~
+ *
+ * Optional config parameters:
+ * ~~~toml
+ * mu = 2000.0 # value held until score() replaces it with the optimized one
+ * ~~~
+ */
+class dirichlet_mackay_peto : public dirichlet_prior_opt
+{
+  public:
+    /// Identifier for this ranker.
+    const static util::string_view id;
+
+    /**
+     * @param mu
+     */
+    dirichlet_mackay_peto(float mu = default_mu);
+
+    /**
+     * Loads a dirichlet_mackay_peto ranker from a stream.
+     * @param in The stream to read from
+     */
+    dirichlet_mackay_peto(std::istream& in);
+
+    void save(std::ostream& out) const override;
+
+  private:
+    std::map<term_id, double> optimize_mu(docs_data& dd, float eps,
+                                          int max_iter) override;
+};
+
+/**
+ * Specialization of the factory method used to create dirichlet_digamma_rec
+ * rankers.
+ */
+template <>
+std::unique_ptr<ranker>
+    make_ranker<dirichlet_digamma_rec>(const cpptoml::table&);
+
+/**
+ * Specialization of the factory method used to create dirichlet_log_approx
+ * rankers.
+ */
+template <>
+std::unique_ptr<ranker>
+    make_ranker<dirichlet_log_approx>(const cpptoml::table&);
+
+/**
+ * Specialization of the factory method used to create dirichlet_mackay_peto
+ * rankers.
+ */
+template <>
+std::unique_ptr<ranker>
+    make_ranker<dirichlet_mackay_peto>(const cpptoml::table&);
+}
+}
+#endif
diff --git a/include/meta/stats/statistics.h b/include/meta/stats/statistics.h
index d6823fed1..ac1fb054e 100644
--- a/include/meta/stats/statistics.h
+++ b/include/meta/stats/statistics.h
@@ -18,7 +18,6 @@ namespace meta
 {
 namespace stats
 {
-
 /**
  * Computation for \f$E_d[f(x)]\f$ where \f$d\f$ is specified by the
  * `dist` parameter and \f$f(x)\f$ is the `fun` parameter.
diff --git a/src/index/disk_index.cpp b/src/index/disk_index.cpp
index c0dcbfd14..441fa0dc7 100644
--- a/src/index/disk_index.cpp
+++ b/src/index/disk_index.cpp
@@ -123,6 +123,24 @@ std::vector disk_index::docs() const
     return ret;
 }
 
+// Every term id in the index: 0, 1, ..., unique_terms() - 1.
+std::vector<term_id> disk_index::terms() const
+{
+    std::vector<term_id> ret(unique_terms());
+    std::iota(ret.begin(), ret.end(), 0_tid);
+    return ret;
+}
+
+// NOTE(review): returns the ids 0 .. unique_terms(d_id) - 1, which are not
+// necessarily the terms that occur in d_id. Listing the real term ids needs
+// per-document term data that is not visible here; confirm the intent.
+std::vector<term_id> disk_index::terms(doc_id d_id) const
+{
+    std::vector<term_id> ret(unique_terms(d_id));
+    std::iota(ret.begin(), ret.end(), 0_tid);
+    return ret;
+}
+
 // disk_index_impl
 
 const std::vector disk_index::disk_index_impl::files
diff --git a/src/index/ranker/CMakeLists.txt b/src/index/ranker/CMakeLists.txt
index 20518f751..84d22701d 100644
--- a/src/index/ranker/CMakeLists.txt
+++ b/src/index/ranker/CMakeLists.txt
@@ -2,6 +2,7 @@ project(meta-ranker)
 
 add_library(meta-ranker absolute_discount.cpp
                         dirichlet_prior.cpp
+                        dirichlet_prior_opt.cpp
                         jelinek_mercer.cpp
                         lm_ranker.cpp
                         okapi_bm25.cpp
diff --git a/src/index/ranker/dirichlet_prior_opt.cpp b/src/index/ranker/dirichlet_prior_opt.cpp
new file mode 100644
index 000000000..4405719c4
--- /dev/null
+++ b/src/index/ranker/dirichlet_prior_opt.cpp
@@ -0,0 +1,291 @@
+/**
+ * @file dirichlet_prior_opt.cpp
+ * @author Aleksey Marashov, Kolomiets Maksim
+ */
+
+#include <cmath>
+
+#include "cpptoml.h"
+#include "meta/index/ranker/dirichlet_prior_opt.h"
+#include "meta/index/score_data.h"
+
+namespace meta
+{
+namespace index
+{
+
+// makers: each reads the optional "mu" key (falling back to default_mu),
+// rejects negative values and builds the matching ranker
+
+const util::string_view dirichlet_digamma_rec::id = "dirichlet-digamma-rec";
+template <>
+std::unique_ptr<ranker>
+    make_ranker<dirichlet_digamma_rec>(const cpptoml::table& config)
+{
+    auto mu = config.get_as<double>("mu").value_or(
+        dirichlet_digamma_rec::default_mu);
+    if (mu < 0)
+        throw ranker_exception{"dirichlet-digamma-rec mu must be >= 0"};
+    return make_unique<dirichlet_digamma_rec>(mu);
+}
+
+const util::string_view dirichlet_log_approx::id = "dirichlet-log-approx";
+template <>
+std::unique_ptr<ranker>
+    make_ranker<dirichlet_log_approx>(const cpptoml::table& config)
+{
+    auto mu = config.get_as<double>("mu").value_or(
+        dirichlet_log_approx::default_mu);
+    if (mu < 0)
+        throw ranker_exception{"dirichlet-log-approx mu must be >= 0"};
+    return make_unique<dirichlet_log_approx>(mu);
+}
+
+const util::string_view dirichlet_mackay_peto::id = "dirichlet-mackay-peto";
+template <>
+std::unique_ptr<ranker>
+    make_ranker<dirichlet_mackay_peto>(const cpptoml::table& config)
+{
+    auto mu = config.get_as<double>("mu").value_or(
+        dirichlet_mackay_peto::default_mu);
+    if (mu < 0)
+        throw ranker_exception{"dirichlet-mackay-peto mu must be >= 0"};
+    return make_unique<dirichlet_mackay_peto>(mu);
+}
+
+// constructors and serialization; save() writes the ranker id followed by
+// the current value of mu_
+
+dirichlet_digamma_rec::dirichlet_digamma_rec(float mu)
+    : dirichlet_prior_opt(mu)
+{
+    // nothing
+}
+
+dirichlet_digamma_rec::dirichlet_digamma_rec(std::istream& in)
+    : dirichlet_prior_opt(in)
+{
+    // nothing
+}
+
+void dirichlet_digamma_rec::save(std::ostream& out) const
+{
+    io::packed::write(out, id);
+
+    io::packed::write(out, mu_);
+}
+
+dirichlet_log_approx::dirichlet_log_approx(float mu) : dirichlet_prior_opt(mu)
+{
+    // nothing
+}
+
+dirichlet_log_approx::dirichlet_log_approx(std::istream& in)
+    : dirichlet_prior_opt(in)
+{
+    // nothing
+}
+
+void dirichlet_log_approx::save(std::ostream& out) const
+{
+    io::packed::write(out, id);
+
+    io::packed::write(out, mu_);
+}
+
+dirichlet_mackay_peto::dirichlet_mackay_peto(float mu)
+    : dirichlet_prior_opt(mu)
+{
+    // nothing
+}
+
+dirichlet_mackay_peto::dirichlet_mackay_peto(std::istream& in)
+    : dirichlet_prior_opt(in)
+{
+    // nothing
+}
+
+void dirichlet_mackay_peto::save(std::ostream& out) const
+{
+    io::packed::write(out, id);
+
+    io::packed::write(out, mu_);
+}
+
+// 
optimization methods
+
+namespace
+{
+/// Histogram: key n -> number of documents whose size / term count equals n
+using count_hist = std::map<count_d, count_d>;
+
+/**
+ * Computes sum over n >= 1 of hist[n] * sum_{f=1..n} 1 / (f - 1 + a).
+ * The inner sum is carried along as a running value. The histogram is only
+ * read; keys that are absent count as zero and are never inserted.
+ *
+ * @param hist histogram to sum over; may be empty
+ * @param a the alpha (or alpha * m_k) value
+ * @return the weighted sum, 0 for an empty histogram
+ */
+double digamma_rec_sum(const count_hist& hist, double a)
+{
+    if (hist.empty())
+        return 0.0;
+
+    const count_d n_max = hist.rbegin()->first;
+    auto it = hist.lower_bound(1);
+    double partial = 0.0;
+    double total = 0.0;
+    for (count_d n = 1; n <= n_max; ++n)
+    {
+        partial += 1.0 / (n - 1 + a);
+        if (it != hist.end() && it->first == n)
+        {
+            total += it->second * partial;
+            ++it;
+        }
+    }
+    return total;
+}
+
+/**
+ * Computes sum over n >= 1 of
+ * hist[n] * (1 / a + log(n + a - 0.5) - log(a + 0.5)),
+ * visiting only the keys that are present with a non-zero count.
+ *
+ * @param hist histogram to sum over; may be empty
+ * @param a the alpha (or alpha * m_k) value
+ * @return the weighted sum, 0 for an empty histogram
+ */
+double log_approx_sum(const count_hist& hist, double a)
+{
+    double total = 0.0;
+    for (auto it = hist.lower_bound(1); it != hist.end(); ++it)
+    {
+        if (it->second != 0)
+        {
+            total += it->second
+                     * (1.0 / a + std::log(it->first + a - 0.5)
+                        - std::log(a + 0.5));
+        }
+    }
+    return total;
+}
+
+/// Per-term quantities of the MacKay-Peto update that do not depend on alpha
+struct mackay_peto_term
+{
+    term_id term;
+    count_d V;
+    double G;
+    double H;
+};
+}
+
+/**
+ * Fixed-point iteration using the digamma recurrence relation.
+ * Stops when no alpha_m[k] moved by more than eps or after max_iter rounds,
+ * then writes the sum of alpha_m to mu_.
+ *
+ * If dd holds no document sizes, the starting vector is returned and mu_ is
+ * left untouched. If the document-level sum is not positive, the iteration
+ * stops instead of dividing by it.
+ *
+ * @param dd corpus statistics and starting vector alpha_m (only read)
+ * @param eps convergence precision
+ * @param max_iter maximal number of iterations (upper bound)
+ * @return optimal value [alpha * m_k] for each term
+ */
+std::map<term_id, double>
+    dirichlet_digamma_rec::optimize_mu(docs_data& dd, float eps, int max_iter)
+{
+    std::map<term_id, double> alpha_m = dd.alpha_m;
+    if (dd.docs_counts.empty())
+        return alpha_m;
+
+    bool all_optimized = false;
+    for (int iter_num = 0; !all_optimized && iter_num < max_iter; ++iter_num)
+    {
+        all_optimized = true;
+
+        const double alpha = get_alpha(alpha_m);
+        const double S = digamma_rec_sum(dd.docs_counts, alpha);
+        if (!(S > 0.0))
+            break;
+
+        for (const auto& kv : dd.terms_docs_counts)
+        {
+            double& alpha_mk = alpha_m[kv.first];
+            const double S_k = digamma_rec_sum(kv.second, alpha_mk);
+            const double alpha_mk_new = alpha_mk * S_k / S;
+
+            if (std::abs(alpha_mk_new - alpha_mk) > eps)
+                all_optimized = false;
+
+            alpha_mk = alpha_mk_new;
+        }
+    }
+
+    mu_ = get_alpha(alpha_m);
+
+    return alpha_m;
+}
+
+/**
+ * Fixed-point iteration using the log approximation of digamma differences.
+ * Stops when no alpha_m[k] moved by more than eps or after max_iter rounds,
+ * then writes the sum of alpha_m to mu_.
+ *
+ * If dd holds no document sizes, the starting vector is returned and mu_ is
+ * left untouched. If the document-level sum is not positive, the iteration
+ * stops instead of dividing by it.
+ *
+ * @param dd corpus statistics and starting vector alpha_m (only read)
+ * @param eps convergence precision
+ * @param max_iter maximal number of iterations (upper bound)
+ * @return optimal value [alpha * m_k] for each term
+ */
+std::map<term_id, double>
+    dirichlet_log_approx::optimize_mu(docs_data& dd, float eps, int max_iter)
+{
+    std::map<term_id, double> alpha_m = dd.alpha_m;
+    if (dd.docs_counts.empty())
+        return alpha_m;
+
+    bool all_optimized = false;
+    for (int iter_num = 0; !all_optimized && iter_num < max_iter; ++iter_num)
+    {
+        all_optimized = true;
+
+        const double alpha = get_alpha(alpha_m);
+        const double S = log_approx_sum(dd.docs_counts, alpha);
+        if (!(S > 0.0))
+            break;
+
+        for (const auto& kv : dd.terms_docs_counts)
+        {
+            double& alpha_mk = alpha_m[kv.first];
+            const double S_k = log_approx_sum(kv.second, alpha_mk);
+            const double alpha_mk_new = alpha_mk * S_k / S;
+
+            if (std::abs(alpha_mk_new - alpha_mk) > eps)
+                all_optimized = false;
+
+            alpha_mk = alpha_mk_new;
+        }
+    }
+
+    mu_ = get_alpha(alpha_m);
+
+    return alpha_m;
+}
+
+/**
+ * MacKay and Peto's fixed-point iteration.
+ * V_k, G_k and H_k depend only on the counts, so they are computed once
+ * before iterating; only K(alpha) is recomputed in every round.
+ * Stops when no alpha_m[k] moved by more than eps or after max_iter rounds,
+ * then writes the sum of alpha_m to mu_.
+ *
+ * @param dd corpus statistics and starting vector alpha_m (only read)
+ * @param eps convergence precision
+ * @param max_iter maximal number of iterations (upper bound)
+ * @return optimal value [alpha * m_k] for each term
+ */
+std::map<term_id, double>
+    dirichlet_mackay_peto::optimize_mu(docs_data& dd, float eps, int max_iter)
+{
+    std::map<term_id, double> alpha_m = dd.alpha_m;
+
+    // document sizes, looked up once instead of once per iteration
+    std::vector<double> doc_sizes;
+    doc_sizes.reserve(dd.doc_ids.size());
+    for (const auto& d_id : dd.doc_ids)
+        doc_sizes.push_back(dd.idx.doc_size(d_id));
+
+    // V_k, G_k and H_k for every term, in the order of terms_docs_counts
+    std::vector<mackay_peto_term> terms;
+    terms.reserve(dd.terms_docs_counts.size());
+    for (const auto& kv : dd.terms_docs_counts)
+    {
+        const count_hist& c_k = kv.second;
+
+        double G_k = 0;
+        double H_k = 0;
+        if (!c_k.empty())
+        {
+            // N_f: number of documents with at least f occurrences
+            count_d N_f = 0;
+            auto it = c_k.rbegin();
+            for (count_d f = it->first; f >= 2; --f)
+            {
+                if (it != c_k.rend() && it->first == f)
+                {
+                    N_f += it->second;
+                    ++it;
+                }
+
+                const double f_minus_1 = f - 1.0;
+                G_k += static_cast<double>(N_f) / f_minus_1;
+                H_k += static_cast<double>(N_f) / (f_minus_1 * f_minus_1);
+            }
+        }
+
+        const count_d V_k = dd.idx.doc_freq(kv.first);
+        terms.push_back({kv.first, V_k, G_k, H_k});
+    }
+
+    bool all_optimized = false;
+    for (int iter_num = 0; !all_optimized && iter_num < max_iter; ++iter_num)
+    {
+        all_optimized = true;
+
+        const double alpha = get_alpha(alpha_m);
+
+        // compute K(alpha)
+        double K_alpha = 0;
+        for (const double n_d : doc_sizes)
+        {
+            K_alpha += std::log((n_d + alpha) / alpha)
+                       + 0.5 * n_d / (alpha * (n_d + alpha));
+        }
+
+        for (const auto& t : terms)
+        {
+            // recompute alpha_mk
+            const double diff = K_alpha - t.G;
+            const double alpha_mk_new
+                = 2 * t.V / (diff + std::sqrt(diff * diff + 4 * t.H * t.V));
+
+            double& alpha_mk = alpha_m[t.term];
+            if (std::abs(alpha_mk_new - alpha_mk) > eps)
+                all_optimized = false;
+
+            alpha_mk = alpha_mk_new;
+        }
+    }
+
+    mu_ = get_alpha(alpha_m);
+
+    return alpha_m;
+}
+
+}
+}
diff --git a/src/index/ranker/ranker_factory.cpp b/src/index/ranker/ranker_factory.cpp
index 86c1069af..0643b0742 100644
--- a/src/index/ranker/ranker_factory.cpp
+++ b/src/index/ranker/ranker_factory.cpp
@@ -31,6 +31,9 @@ ranker_factory::ranker_factory()
     reg();
     reg();
     reg();
+    reg<dirichlet_digamma_rec>();
+    reg<dirichlet_log_approx>();
+    reg<dirichlet_mackay_peto>();
 }
 
 std::unique_ptr make_ranker(const cpptoml::table& config)